CUDA 4.1
// vector_add.cu
#include <stdio.h>
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks. Each thread handles the single element
// whose index equals its global thread id (blockIdx.x * blockDim.x +
// threadIdx.x). Threads whose id is at or past N return without touching
// memory, so the launch may cover more threads than there are elements.
//
// A, B : input arrays, read at indices [0, N)
// C    : output array, written at indices [0, N)
// N    : number of elements to process
__global__ void vecAdd(float *A, float *B, float *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail: surplus threads have nothing to do.
    if (idx >= N) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}
// Checks the cudaError_t produced by a CUDA runtime call. On failure it prints
// the file, line and error string to stderr and makes the enclosing function
// return 1. It is only meant to be used inside main(): on the failure path
// main returns immediately and leaves releasing device buffers to process exit.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)
#endif

// Host driver for the vector-add example: copies two 5-element vectors to the
// device, runs vecAdd on them, copies the sum back and prints it.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main() {
    const int N = 5;
    const size_t size = N * sizeof(float);

    // Host buffers; all three are sized from N so they cannot drift apart.
    float A[N] = {1, 2, 3, 4, 5};
    float B[N] = {10, 20, 30, 40, 50};
    float C[N];

    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // Ceil-divide N over fixed-size blocks instead of putting all N threads in
    // one block, so the launch stays valid when N exceeds the per-block thread
    // limit. The kernel's bounds check masks the surplus threads.
    const int threadsPerBlock = 256;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    vecAdd<<<blocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns nothing; launch errors are read back here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

    printf("Result Vector C:\n");
    for (int i = 0; i < N; i++)
        printf("%f ", C[i]);
    printf("\n");

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}
__________________________________________________
CUDA 4.2
// matrix_mul.cu
#include <stdio.h>
// Square matrix multiply C = A * B for N x N matrices stored row-major in flat
// arrays (element (r, c) lives at index r * N + c).
//
// Launch layout: 2D grid of 2D blocks. The x dimension selects the output
// column and the y dimension the output row; each thread computes exactly one
// element of C. Threads that fall outside the N x N range return without
// touching memory, so the grid may overshoot the matrix in either dimension.
//
// A, B : input matrices, N * N floats each
// C    : output matrix, N * N floats
// N    : matrix dimension
__global__ void matMul(float *A, float *B, float *C, int N) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard the edges: surplus threads have no output element.
    if (row >= N || col >= N) {
        return;
    }

    const float *aRow = A + row * N;  // start of row `row` in A
    const float *bCol = B + col;      // top of column `col` in B (stride N)

    // Dot product of A's row with B's column, accumulated in k order.
    float acc = 0;
    int k = 0;
    while (k < N) {
        acc += aRow[k] * bCol[k * N];
        ++k;
    }

    C[row * N + col] = acc;
}
// Checks the cudaError_t produced by a CUDA runtime call. On failure it prints
// the file, line and error string to stderr and makes the enclosing function
// return 1. It is only meant to be used inside main(): on the failure path
// main returns immediately and leaves releasing device buffers to process exit.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)
#endif

// Host driver for the matrix-multiply example: copies two 2x2 row-major
// matrices to the device, runs matMul on them, copies the product back and
// prints it one matrix row per line.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main() {
    const int N = 2;  // 2x2 matrix for simplicity
    const size_t size = N * N * sizeof(float);

    // Host buffers; all three are sized from N so they cannot drift apart.
    float A[N * N] = {1, 2, 3, 4};
    float B[N * N] = {5, 6, 7, 8};
    float C[N * N];

    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // 16x16 thread tiles; ceil-divide so the grid covers all N rows and
    // columns. The kernel's bounds check masks the surplus threads.
    const int tile = 16;
    dim3 threads(tile, tile);
    dim3 blocks((N + tile - 1) / tile, (N + tile - 1) / tile);
    matMul<<<blocks, threads>>>(d_A, d_B, d_C, N);
    // A kernel launch returns nothing; launch errors are read back here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

    printf("Result Matrix C:\n");
    for (int i = 0; i < N * N; i++) {
        printf("%f ", C[i]);
        // Break the line after the last element of each matrix row.
        if ((i + 1) % N == 0) printf("\n");
    }

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}