0% found this document useful (0 votes)
24 views2 pages

Addition Cuda

This document provides a CUDA program for adding two large vectors. It includes the kernel function for vector addition and the main function that initializes the vectors, allocates memory on both the host and device, and manages data transfer between them. The program also prints the first 10 results of the addition before freeing allocated memory.

Uploaded by

Vedant Rewagad
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
24 views2 pages

Addition Cuda

This document provides a CUDA program for adding two large vectors. It includes the kernel function for vector addition and the main function that initializes the vectors, allocates memory on both the host and device, and manages data transfer between them. The program also prints the first 10 results of the addition before freeing allocated memory.

Uploaded by

Vedant Rewagad
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 2

Cuda

Write a CUDA Program for :


1. Addition of two large vectors

##################################### Addition of two large vectors #####################################

#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, n).
//
// Expects a 1-D launch with at least n total threads; the bounds guard
// handles the grid tail when n is not a multiple of blockDim.x.
// Inputs are marked const __restrict__ so the compiler may route the
// loads through the read-only data cache; adjacent threads access
// adjacent elements, so all global accesses are fully coalesced.
__global__ void addVectors(const int* __restrict__ A,
                           const int* __restrict__ B,
                           int* __restrict__ C, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
{
C[i] = A[i] + B[i];
}
}

// Abort with a readable message when a CUDA runtime call fails.
// Kernel launches do not return an error directly, so they are checked
// separately via cudaGetLastError() / cudaDeviceSynchronize() below.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__   \
                      << ": " << cudaGetErrorString(err_) << std::endl;    \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

// Host driver: allocates pinned host buffers and device buffers, adds two
// n-element vectors on the GPU, prints the first 10 results (expected
// C[i] = 3*i), then releases all memory.
int main()
{
const int n = 1000000;
// Keep the byte count in size_t so a large n cannot overflow a 32-bit int.
const size_t size = static_cast<size_t>(n) * sizeof(int);

int* A = nullptr;
int* B = nullptr;
int* C = nullptr;

// Pinned (page-locked) host memory: higher transfer bandwidth and a
// prerequisite for truly asynchronous copies.
CUDA_CHECK(cudaMallocHost(&A, size));
CUDA_CHECK(cudaMallocHost(&B, size));
CUDA_CHECK(cudaMallocHost(&C, size));

// Initialize input vectors so the expected result is C[i] = 3 * i.
for (int i = 0; i < n; i++)
{
A[i] = i;
B[i] = i * 2;
}

// Allocate memory on the device.
int* dev_A = nullptr;
int* dev_B = nullptr;
int* dev_C = nullptr;
CUDA_CHECK(cudaMalloc(&dev_A, size));
CUDA_CHECK(cudaMalloc(&dev_B, size));
CUDA_CHECK(cudaMalloc(&dev_C, size));

// Copy inputs host -> device.
CUDA_CHECK(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice));

// Launch: ceil-div grid so every element is covered by a thread.
const int blockSize = 256;
const int numBlocks = (n + blockSize - 1) / blockSize;
addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);
CUDA_CHECK(cudaGetLastError());       // catches invalid launch configuration
CUDA_CHECK(cudaDeviceSynchronize());  // surfaces asynchronous kernel faults

// Copy result device -> host (blocking, so results are ready afterwards).
CUDA_CHECK(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost));

// Print first 10 results.
for (int i = 0; i < 10; i++)
{
std::cout << C[i] << " ";
}
std::cout << std::endl;

// Free device memory, then the pinned host buffers.
CUDA_CHECK(cudaFree(dev_A));
CUDA_CHECK(cudaFree(dev_B));
CUDA_CHECK(cudaFree(dev_C));
CUDA_CHECK(cudaFreeHost(A));
CUDA_CHECK(cudaFreeHost(B));
CUDA_CHECK(cudaFreeHost(C));

return 0;
}

You might also like