0% found this document useful (0 votes)

23 views4 pages

Cuda

The document contains CUDA programming assignments by Abhishek Kumar Yadav, which include summing an array of 10 numbers using a kernel function, adding three vectors of 10 elements each, multiplying three scalar variables, and swapping two elements without using a third variable. Each assignment includes the necessary CUDA code, memory allocation, and kernel execution. The document demonstrates fundamental CUDA operations and memory management for parallel computing.

Uploaded by

yadavabhi4268

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

23 views4 pages

Cuda

Uploaded by

yadavabhi4268

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 4

CUDA ASSIGNMENT :

Abhishek Kumar Yadav (CSE-AIML/22/62)

1. Take an array of 10 numbers and perform the summation of these 10 numbers. Use a kernel function.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Sums the first n integers of `array` and stores the total in *result.
//
// Launch layout: intended for a single 1-D block (e.g. sumKernel<<<1, 10>>>).
// Indexing uses threadIdx.x only, so if several blocks are launched each one
// computes the same total and writes the same value to *result.
// Shared memory: a static buffer of 10 ints; no dynamic shared memory needed.
//
// Parameters:
//   array  - device pointer to at least n readable ints
//   result - device pointer to one writable int
//   n      - number of elements to sum; n <= 0 yields a total of 0
__global__ void sumKernel(int *array, int *result, int n) {
    const int kSlots = 10;              // capacity of the shared scratch buffer
    __shared__ int partialSum[kSlots];  // one partial sum per participating thread

    const int tid = threadIdx.x;

    // Only as many threads as there are slots take part, so a block larger
    // than the buffer can never write past its end.
    const int lanes = ((int)blockDim.x < kSlots) ? (int)blockDim.x : kSlots;

    if (tid < lanes) {
        // Each participating thread folds every lanes-th element, so all n
        // elements are covered even when n exceeds the number of lanes.
        int local = 0;
        for (int i = tid; i < n; i += lanes) {
            local += array[i];
        }
        partialSum[tid] = local;
    }
    __syncthreads();

    // Tree reduction in shared memory. `lanes` is the same for every thread,
    // so all threads run the same number of iterations and reach each barrier.
    for (int stride = 1; stride < lanes; stride *= 2) {
        // The second condition keeps the partner index inside the populated
        // part of the buffer when `lanes` is not a power of two (e.g. 10).
        if (tid % (2 * stride) == 0 && tid + stride < lanes) {
            partialSum[tid] += partialSum[tid + stride];
        }
        __syncthreads();
    }

    // Thread 0 holds the block total after the last reduction step.
    if (tid == 0) {
        *result = partialSum[0];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Sums a fixed 10-element host array on the device with sumKernel and prints
// the total. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int n = 10;
    int h_array[n] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; // Example array of 10 numbers
    int h_result = 0;                                 // Host variable for result

    // Device buffers start as NULL so the cleanup at the end is valid on
    // every path (cudaFree on a null pointer performs no operation).
    int *d_array = NULL;
    int *d_result = NULL;

    // Allocate device memory and copy the input array to the device.
    // The && chain stops at the first failing call.
    bool ok = cudaOk(cudaMalloc((void **)&d_array, n * sizeof(int)), "cudaMalloc(d_array)") &&
              cudaOk(cudaMalloc((void **)&d_result, sizeof(int)), "cudaMalloc(d_result)") &&
              cudaOk(cudaMemcpy(d_array, h_array, n * sizeof(int), cudaMemcpyHostToDevice),
                     "copy of input array to device");

    if (ok) {
        // Launch kernel with 10 threads in a single block
        sumKernel<<<1, n>>>(d_array, d_result, n);

        // A launch does not return a status itself: cudaGetLastError() reports
        // launch failures, and the blocking copy below reports execution errors.
        ok = cudaOk(cudaGetLastError(), "sumKernel launch") &&
             cudaOk(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost),
                    "copy of result to host");
    }

    // Print the result only when every step succeeded
    if (ok) {
        printf("Sum of array elements: %d\n", h_result);
    }

    // Free device memory
    cudaFree(d_array);
    cudaFree(d_result);

    return ok ? 0 : 1;
}
2. Take three vectors of 10 elements each, add them element-wise, and store the result in a 4th
vector.
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

#define N 10 // Number of elements in each vector; also used as the thread count of the kernel launch

// Kernel function to add vectors

// Element-wise sum of three vectors: D[i] = A[i] + B[i] + C[i] for 0 <= i < n.
//
// Launch layout: any 1-D grid of 1-D blocks. The loop below advances by the
// total number of launched threads, so every index in [0, n) is written
// exactly once whether or not the launch provides at least n threads.
// Shared memory: none.
//
// Parameters:
//   A, B, C - device pointers to at least n readable ints each
//   D       - device pointer to at least n writable ints
//   n       - number of elements; n <= 0 writes nothing
__global__ void vectorAdd(int *A, int *B, int *C, int *D, int n) {
    // 64-bit arithmetic keeps the index from overflowing when it is advanced
    // past n on the final iteration.
    const long long totalThreads = (long long)gridDim.x * blockDim.x;
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (long long i = first; i < n; i += totalThreads) {
        // Perform element-wise addition and store it in vector D
        D[i] = A[i] + B[i] + C[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds three N-element host vectors, adds them on the device with vectorAdd,
// and prints the resulting vector. Returns 0 on success and 1 if any CUDA
// call fails.
int main() {
    const size_t bytes = N * sizeof(int); // Size of one vector in bytes

    int h_A[N], h_B[N], h_C[N], h_D[N]; // Host vectors

    // Device vectors start as NULL so the cleanup at the end is valid on
    // every path (cudaFree on a null pointer performs no operation).
    int *d_A = NULL, *d_B = NULL, *d_C = NULL, *d_D = NULL;

    // Initialize vectors A, B, and C
    for (int i = 0; i < N; i++) {
        h_A[i] = i + 1;       // Vector A: 1, 2, 3, ..., 10
        h_B[i] = (i + 1) * 2; // Vector B: 2, 4, 6, ..., 20
        h_C[i] = (i + 1) * 3; // Vector C: 3, 6, 9, ..., 30
    }

    // Allocate memory on the device; the && chain stops at the first failure.
    bool ok = cudaOk(cudaMalloc((void **)&d_A, bytes), "cudaMalloc(d_A)") &&
              cudaOk(cudaMalloc((void **)&d_B, bytes), "cudaMalloc(d_B)") &&
              cudaOk(cudaMalloc((void **)&d_C, bytes), "cudaMalloc(d_C)") &&
              cudaOk(cudaMalloc((void **)&d_D, bytes), "cudaMalloc(d_D)");

    // Copy data from host to device
    ok = ok &&
         cudaOk(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice), "copy of A to device") &&
         cudaOk(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice), "copy of B to device") &&
         cudaOk(cudaMemcpy(d_C, h_C, bytes, cudaMemcpyHostToDevice), "copy of C to device");

    if (ok) {
        // Launch the kernel with N threads
        vectorAdd<<<1, N>>>(d_A, d_B, d_C, d_D, N);

        // A launch does not return a status itself: cudaGetLastError() reports
        // launch failures, and the blocking copy below reports execution errors.
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch") &&
             cudaOk(cudaMemcpy(h_D, d_D, bytes, cudaMemcpyDeviceToHost), "copy of D to host");
    }

    // Print the result only when every step succeeded
    if (ok) {
        printf("Resulting vector D after adding A, B, and C:\n");
        for (int i = 0; i < N; i++) {
            printf("%d ", h_D[i]);
        }
        printf("\n");
    }

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_D);

    return ok ? 0 : 1;
}
3. Take 3 scalar variables, assign floating-point values to them, then multiply them and store the
product in a 4th variable.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function to multiply three scalars

// Multiplies the three floats that a, b and c point to and writes the product
// to *result.
//
// The kernel does not use thread or block indices: every launched thread
// performs the same reads and the same store.
//
// Parameters:
//   a, b, c - device pointers, each to one readable float
//   result  - device pointer to one writable float
__global__ void scalarMultiply(float *a, float *b, float *c, float *result) {
    // Read all three operands before the store.
    const float first = *a;
    const float second = *b;
    const float third = *c;

    // Multiply left to right: (a * b) first, then by c.
    const float firstTimesSecond = first * second;
    *result = firstTimesSecond * third;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Multiplies three host floats on the device with scalarMultiply and prints
// the product. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    // Declare and initialize the scalar variables
    float h_a = 2.5f, h_b = 3.5f, h_c = 4.0f; // Host variables
    float h_result = 0.0f;                    // Host variable for storing result

    // Device variables start as NULL so the cleanup at the end is valid on
    // every path (cudaFree on a null pointer performs no operation).
    float *d_a = NULL, *d_b = NULL, *d_c = NULL, *d_result = NULL;

    // Allocate memory on the device; the && chain stops at the first failure.
    bool ok = cudaOk(cudaMalloc((void **)&d_a, sizeof(float)), "cudaMalloc(d_a)") &&
              cudaOk(cudaMalloc((void **)&d_b, sizeof(float)), "cudaMalloc(d_b)") &&
              cudaOk(cudaMalloc((void **)&d_c, sizeof(float)), "cudaMalloc(d_c)") &&
              cudaOk(cudaMalloc((void **)&d_result, sizeof(float)), "cudaMalloc(d_result)");

    // Copy data from host to device
    ok = ok &&
         cudaOk(cudaMemcpy(d_a, &h_a, sizeof(float), cudaMemcpyHostToDevice),
                "copy of a to device") &&
         cudaOk(cudaMemcpy(d_b, &h_b, sizeof(float), cudaMemcpyHostToDevice),
                "copy of b to device") &&
         cudaOk(cudaMemcpy(d_c, &h_c, sizeof(float), cudaMemcpyHostToDevice),
                "copy of c to device");

    if (ok) {
        // Launch the kernel (1 block, 1 thread)
        scalarMultiply<<<1, 1>>>(d_a, d_b, d_c, d_result);

        // A launch does not return a status itself: cudaGetLastError() reports
        // launch failures, and the blocking copy below reports execution errors.
        ok = cudaOk(cudaGetLastError(), "scalarMultiply launch") &&
             cudaOk(cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost),
                    "copy of result to host");
    }

    // Print the result only when every step succeeded
    if (ok) {
        printf("The result of multiplying %.2f, %.2f, and %.2f is: %.2f\n", h_a, h_b, h_c, h_result);
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_result);

    return ok ? 0 : 1;
}
4. Write a kernel function to swap two elements without using a 3rd
variable.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function to swap two elements without using a third variable

// Swaps the two ints that a and b point to without using a temporary variable.
//
// Uses the XOR swap instead of add/subtract: XOR cannot overflow, so the swap
// is well-defined for every pair of int values (a + b can overflow for large
// magnitudes).
//
// Launch layout: intended for exactly one thread (swapKernel<<<1, 1>>>). The
// kernel does not use thread indices, so extra threads would all perform the
// same unsynchronized read-modify-write on *a and *b.
//
// Parameters:
//   a, b - device pointers, each to one readable and writable int
__global__ void swapKernel(int *a, int *b) {
    // A value XOR-ed with itself is 0, so swapping a location with itself
    // would erase it; there is nothing to swap in that case anyway.
    if (a == b) {
        return;
    }

    *a ^= *b; // a = a0 ^ b0
    *b ^= *a; // b = b0 ^ (a0 ^ b0) = a0
    *a ^= *b; // a = (a0 ^ b0) ^ a0 = b0
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Swaps two host ints by round-tripping them through the device and
// swapKernel, then prints the swapped values. Returns 0 on success and 1 if
// any CUDA call fails.
int main() {
    int h_a = 5, h_b = 10; // Host variables

    // Device variables start as NULL so the cleanup at the end is valid on
    // every path (cudaFree on a null pointer performs no operation).
    int *d_a = NULL, *d_b = NULL;

    // Allocate memory on the device and copy both values to it; the && chain
    // stops at the first failing call.
    bool ok = cudaOk(cudaMalloc((void **)&d_a, sizeof(int)), "cudaMalloc(d_a)") &&
              cudaOk(cudaMalloc((void **)&d_b, sizeof(int)), "cudaMalloc(d_b)") &&
              cudaOk(cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice),
                     "copy of a to device") &&
              cudaOk(cudaMemcpy(d_b, &h_b, sizeof(int), cudaMemcpyHostToDevice),
                     "copy of b to device");

    if (ok) {
        // Launch the kernel (1 block, 1 thread)
        swapKernel<<<1, 1>>>(d_a, d_b);

        // A launch does not return a status itself: cudaGetLastError() reports
        // launch failures, and the blocking copies below report execution errors.
        ok = cudaOk(cudaGetLastError(), "swapKernel launch") &&
             cudaOk(cudaMemcpy(&h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost),
                    "copy of a to host") &&
             cudaOk(cudaMemcpy(&h_b, d_b, sizeof(int), cudaMemcpyDeviceToHost),
                    "copy of b to host");
    }

    // Print the swapped values only when every step succeeded
    if (ok) {
        printf("After swapping, a = %d and b = %d\n", h_a, h_b);
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    return ok ? 0 : 1;
}

Cuda Add Mult
No ratings yet
Cuda Add Mult
3 pages
LP 1,,1
No ratings yet
LP 1,,1
5 pages
CUDA Additionof2Vector
No ratings yet
CUDA Additionof2Vector
2 pages
Addition Cuda
No ratings yet
Addition Cuda
2 pages
2023 CSC14120 Lecture01 CUDAIntroduction
No ratings yet
2023 CSC14120 Lecture01 CUDAIntroduction
32 pages
PDC Assignment
No ratings yet
PDC Assignment
9 pages
CUDA Programming for Developers
No ratings yet
CUDA Programming for Developers
42 pages
CUDA Exercises
No ratings yet
CUDA Exercises
185 pages
Google Colab Solution Activity
No ratings yet
Google Colab Solution Activity
5 pages
Vector Addition
No ratings yet
Vector Addition
3 pages
Cuda Firstprograms PDF
No ratings yet
Cuda Firstprograms PDF
6 pages
Cuda C/C++ Basics: NVIDIA Corporation
No ratings yet
Cuda C/C++ Basics: NVIDIA Corporation
67 pages
CUDA Practical's
No ratings yet
CUDA Practical's
38 pages
Rishi
No ratings yet
Rishi
30 pages
p4 Multiply
No ratings yet
p4 Multiply
2 pages
217 Lec2
No ratings yet
217 Lec2
24 pages
Introduction To CUDA: CAP 4730 Spring 2012
No ratings yet
Introduction To CUDA: CAP 4730 Spring 2012
35 pages
Group A Assignment 4 (A) : Two Large Vectors
No ratings yet
Group A Assignment 4 (A) : Two Large Vectors
5 pages
Cuda 4.1
No ratings yet
Cuda 4.1
2 pages
CUDA
No ratings yet
CUDA
3 pages
Moving To Parallel - Addition of 2 Matrices
No ratings yet
Moving To Parallel - Addition of 2 Matrices
14 pages
CUDA Programming Invert
No ratings yet
CUDA Programming Invert
36 pages
Introduction To CUDA C 3
No ratings yet
Introduction To CUDA C 3
67 pages
GPU History & CUDA Programming Basics
No ratings yet
GPU History & CUDA Programming Basics
44 pages
Intro To CUDA
No ratings yet
Intro To CUDA
76 pages
3 Cuda
No ratings yet
3 Cuda
5 pages
Allocate The Device Memory Where We Will Copy M
No ratings yet
Allocate The Device Memory Where We Will Copy M
2 pages
CUDA Class Lecture03
No ratings yet
CUDA Class Lecture03
18 pages
PC Cuda Assignment-2
No ratings yet
PC Cuda Assignment-2
29 pages
Source Code
No ratings yet
Source Code
7 pages
01 Cuda C Basics
No ratings yet
01 Cuda C Basics
32 pages
TP1: Converting Vector Addition To CUDA.: Listing 1 An Example of Vector Addition Implemented in C
No ratings yet
TP1: Converting Vector Addition To CUDA.: Listing 1 An Example of Vector Addition Implemented in C
1 page
CUDA Part-1
No ratings yet
CUDA Part-1
52 pages
Lecture 11 Programming On Gpus Part 1 Zxu2acms60212 40212 S15lec 11 Gpupdf
No ratings yet
Lecture 11 Programming On Gpus Part 1 Zxu2acms60212 40212 S15lec 11 Gpupdf
121 pages
Lab 1 Parallel
No ratings yet
Lab 1 Parallel
4 pages
CUDA - Part 1 LMS
No ratings yet
CUDA - Part 1 LMS
51 pages
Department of Computer Engineering BE Laboratory Practice-I A.Y 2021-22 SEM1
No ratings yet
Department of Computer Engineering BE Laboratory Practice-I A.Y 2021-22 SEM1
45 pages
Parallel Scan in C CUda
No ratings yet
Parallel Scan in C CUda
3 pages
CUDA Lab Guide for Students
No ratings yet
CUDA Lab Guide for Students
19 pages
Introduction To CUDA C
No ratings yet
Introduction To CUDA C
67 pages
Mulmatrix Cu
No ratings yet
Mulmatrix Cu
3 pages
BECOA157 Parallel Matrix Multiplication
No ratings yet
BECOA157 Parallel Matrix Multiplication
3 pages
CUDA MatrixMultiplication
No ratings yet
CUDA MatrixMultiplication
2 pages
Ejercicio 2 Práctica 3: CUDA Desempeño en Función de La Homogeneidad para Acceder A Memoria y de La Regularidad Del Código
No ratings yet
Ejercicio 2 Práctica 3: CUDA Desempeño en Función de La Homogeneidad para Acceder A Memoria y de La Regularidad Del Código
8 pages
Introduccion CUDA C
No ratings yet
Introduccion CUDA C
51 pages
GPU Series III CUDA Compilation Host Side 1721302802
No ratings yet
GPU Series III CUDA Compilation Host Side 1721302802
8 pages
3 Computation
No ratings yet
3 Computation
28 pages
5 Functions
No ratings yet
5 Functions
34 pages
Threads
No ratings yet
Threads
54 pages
Lecture2 Cuda Basic 2010
No ratings yet
Lecture2 Cuda Basic 2010
44 pages
CUDA Matrix Multiplication Quiz
No ratings yet
CUDA Matrix Multiplication Quiz
12 pages
3 Some Commonly Used CUDA API: 3.1 Function Type Qualifiers
No ratings yet
3 Some Commonly Used CUDA API: 3.1 Function Type Qualifiers
7 pages
Lab Report 6
No ratings yet
Lab Report 6
12 pages
Input: Output: 1. Sub String Program
No ratings yet
Input: Output: 1. Sub String Program
8 pages
CUDAProg Model
No ratings yet
CUDAProg Model
24 pages
Cuda4 1
No ratings yet
Cuda4 1
4 pages
Aca Lab Manual Final
No ratings yet
Aca Lab Manual Final
28 pages
HP Laserjet Managed E60155, E60165, E60175: Installation Guide
No ratings yet
HP Laserjet Managed E60155, E60165, E60175: Installation Guide
8 pages
AIX Training Slides
No ratings yet
AIX Training Slides
344 pages
Sona College of Technology: Laboratory Manual
No ratings yet
Sona College of Technology: Laboratory Manual
39 pages
BSNL MNP Call Flows
No ratings yet
BSNL MNP Call Flows
34 pages
Robust Module Based Data Management: Under The Guidance Of: MR - SNS Kalyan Bharadwaj B
No ratings yet
Robust Module Based Data Management: Under The Guidance Of: MR - SNS Kalyan Bharadwaj B
30 pages
On Holy Wars and A Plea For Peace - by Danny Cohen
No ratings yet
On Holy Wars and A Plea For Peace - by Danny Cohen
17 pages
Ingame Commands VCMP 0.3
No ratings yet
Ingame Commands VCMP 0.3
3 pages
F5 Programmability and Puppet: - Colin Walker, Sr. Product Management Engineer - September 2014
No ratings yet
F5 Programmability and Puppet: - Colin Walker, Sr. Product Management Engineer - September 2014
27 pages
Chapter 5 Study Questions
No ratings yet
Chapter 5 Study Questions
17 pages
BIM Standards
No ratings yet
BIM Standards
120 pages
Sri Lanka's Fastest Fiber Internet
No ratings yet
Sri Lanka's Fastest Fiber Internet
10 pages
D VXR Dy 23
No ratings yet
D VXR Dy 23
84 pages
Comandos VLSM
No ratings yet
Comandos VLSM
4 pages
6ES75262BF000AB0 - Datasheet - en
No ratings yet
6ES75262BF000AB0 - Datasheet - en
3 pages
Onestream Price Guide v21
No ratings yet
Onestream Price Guide v21
131 pages
Sap WM Sample Resume 2
No ratings yet
Sap WM Sample Resume 2
13 pages
Sophos Firewall Troubleshooting Guide
No ratings yet
Sophos Firewall Troubleshooting Guide
28 pages
DBMS Exp-14
No ratings yet
DBMS Exp-14
3 pages
Grade-7 Latest ICT Textbook 2024-25 - Print
No ratings yet
Grade-7 Latest ICT Textbook 2024-25 - Print
53 pages
Xpeditor UserGuide PDF
No ratings yet
Xpeditor UserGuide PDF
286 pages
Fiber Polarity
No ratings yet
Fiber Polarity
17 pages
Using Oracle SQL Developer Web PDF
No ratings yet
Using Oracle SQL Developer Web PDF
66 pages
ManuLEARN Training Guide
No ratings yet
ManuLEARN Training Guide
9 pages
Veeam Data Platform Feature Comparison
No ratings yet
Veeam Data Platform Feature Comparison
25 pages
A Survey of Spiking Neural Network Accelerator
No ratings yet
A Survey of Spiking Neural Network Accelerator
15 pages
Alfa Awus036h Awus050nh Installing Drivers
No ratings yet
Alfa Awus036h Awus050nh Installing Drivers
6 pages
GSoC Proposal Global Alliance For Genomics and Health
No ratings yet
GSoC Proposal Global Alliance For Genomics and Health
35 pages
CS3000 System Overview
100% (1)
CS3000 System Overview
32 pages
Meu Piano É Divertido - Vol II - Iniciação Ao Piano - Alice G. Botelho
100% (3)
Meu Piano É Divertido - Vol II - Iniciação Ao Piano - Alice G. Botelho
84 pages
CRUD en JDeveloper PDF
No ratings yet
CRUD en JDeveloper PDF
7 pages

Cuda

Uploaded by

Cuda

Uploaded by

CUDA ASSIGNMENT :

Abhishek Kumar Yadav (CSE-AIML/22/62)

__global__ void sumKernel(int *array, int *result, int n) {

int tid = threadIdx.x;

// Perform parallel reduction within the block

// First thread in the block writes the result

// Copy array from host to device

// Launch kernel with 10 threads in a single block

// Copy result back to host

// Print the result

// Free device memory

#define N 10 // Number of elements in each vector

// Kernel function to add vectors

// Initialize vectors A, B, and C

// Allocate memory on the device

// Copy data from host to device

// Launch the kernel with N threads

// Copy the result from device to host

// Print the result

// Free device memory

// Kernel function to multiply three scalars

// Allocate memory on the device

// Copy data from host to device

// Launch the kernel (1 block, 1 thread)

// Copy the result from device to host

// Print the result

// Free device memory

// Kernel function to swap two elements without using a third variable

// Allocate memory on the device

// Copy data from host to device

// Launch the kernel (1 block, 1 thread)

// Copy the result back from device to host

// Print the swapped values

// Free device memory

You might also like

global void sumKernel(int array, int result, int n) {