0% found this document useful (0 votes)
5 views2 pages

Cuda 4.1

The document contains two CUDA programs: one for vector addition (vector_add.cu) and another for matrix multiplication (matrix_mul.cu). The vector addition program adds two arrays of floats using a CUDA kernel, while the matrix multiplication program multiplies two 2x2 matrices using a different CUDA kernel. Both programs allocate memory on the GPU, perform computations, and then copy the results back to the host for output.

Uploaded by

tryhackkme123
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views2 pages

Cuda 4.1

The document contains two CUDA programs: one for vector addition (vector_add.cu) and another for matrix multiplication (matrix_mul.cu). The vector addition program adds two arrays of floats using a CUDA kernel, while the matrix multiplication program multiplies two 2x2 matrices using a different CUDA kernel. Both programs allocate memory on the GPU, perform computations, and then copy the results back to the host for output.

Uploaded by

tryhackkme123
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 2

CUDA 4.1

// vector_add.cu
// NOTE: the include must be on its own line — in the original it shared a
// line with this comment and was therefore commented out entirely.
#include <stdio.h>

// Element-wise vector addition: C[i] = A[i] + B[i] for i in [0, N).
// One thread per element; launch with enough threads to cover N
// (e.g. <<<(N + 255) / 256, 256>>>). The bounds guard makes a
// partially filled last block safe.
// A and B are read-only; __restrict__ promises no aliasing so the
// compiler may use the read-only data cache.
__global__ void vecAdd(const float* __restrict__ A,
                       const float* __restrict__ B,
                       float* __restrict__ C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}
// Includes for printf/fprintf and exit; the original file's stdio include
// was accidentally commented out, and exit() needs <cstdlib>.
#include <cstdio>
#include <cstdlib>

// Abort with a readable message if a CUDA runtime call failed.
// Kernel launches return no status directly, so the launch is followed
// by an explicit cudaGetLastError() query below.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s: %s\n", #call,                  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Host driver: copies two 5-element vectors to the device, launches
// vecAdd with a single block of N threads, and prints the sum vector.
int main() {
    int N = 5;
    size_t size = N * sizeof(float);
    float A[] = {1, 2, 3, 4, 5};
    float B[] = {10, 20, 30, 40, 50};
    float C[5];

    float *d_A, *d_B, *d_C;

    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // One block of N threads suffices for N = 5 (valid while N <= 1024).
    vecAdd<<<1, N>>>(d_A, d_B, d_C, N);
    // Launch-configuration errors surface only via an explicit query.
    CUDA_CHECK(cudaGetLastError());

    // cudaMemcpy is blocking, so it also waits for the kernel to finish
    // and surfaces any asynchronous execution error.
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

    printf("Result Vector C:\n");
    for (int i = 0; i < N; i++)
        printf("%f ", C[i]);
    printf("\n");

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}

__________________________________________________
CUDA 4.2

// matrix_mul.cu
#include <stdio.h>
// Naive dense matrix multiply: C = A * B for row-major N x N matrices.
// Each thread computes one output element C[row][col]; launch with a 2D
// grid covering at least N x N threads — out-of-range threads exit via
// the guard, so the grid need not divide N evenly.
// A and B are read-only; __restrict__ promises no aliasing so the
// compiler may use the read-only data cache.
// The B[k * N + col] access is coalesced across a warp (col varies with
// threadIdx.x); a shared-memory tiled version would be faster for large N.
__global__ void matMul(const float* __restrict__ A,
                       const float* __restrict__ B,
                       float* __restrict__ C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;  // float literal — avoids int->float conversion noise
        for (int k = 0; k < N; k++)
            sum += A[row * N + k] * B[k * N + col];
        C[row * N + col] = sum;
    }
}
// exit() needs <cstdlib>; <stdio.h> is already included above for this program.
#include <cstdlib>

// Abort with a readable message if a CUDA runtime call failed.
// Kernel launches return no status directly, so the launch is followed
// by an explicit cudaGetLastError() query below.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s: %s\n", #call,                  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Host driver: multiplies two 2x2 row-major matrices on the GPU and
// prints the product, one row per output line. N is tiny here, but the
// ceil-divided grid below generalizes to any N.
int main() {
    int N = 2; // 2x2 matrix for simplicity
    size_t size = N * N * sizeof(float);
    float A[] = {1, 2, 3, 4};
    float B[] = {5, 6, 7, 8};
    float C[4];
    float *d_A, *d_B, *d_C;

    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    dim3 threads(16, 16);                       // 256 threads per block
    dim3 blocks((N + 15) / 16, (N + 15) / 16);  // ceil-div: grid covers N x N
    matMul<<<blocks, threads>>>(d_A, d_B, d_C, N);
    // Launch-configuration errors surface only via an explicit query.
    CUDA_CHECK(cudaGetLastError());

    // cudaMemcpy is blocking, so it also waits for the kernel to finish
    // and surfaces any asynchronous execution error.
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

    printf("Result Matrix C:\n");
    for (int i = 0; i < N * N; i++) {
        printf("%f ", C[i]);
        if ((i + 1) % N == 0) printf("\n");  // newline at each row boundary
    }

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}

You might also like