CUDA Matrix Multiplication Quiz

This document provides code for matrix multiplication on a GPU using CUDA. It includes a CUDA kernel that performs block-wise matrix multiplication using shared memory, along with host code to generate and print matrices, time the GPU computation, and test the multiplication. The document then poses two exercises: writing a CUDA vector addition kernel, and writing kernels that reverse an integer array with and without shared memory.


CS4402/CS9535b: Quiz 3. UWO, March 28, 2013.

Student ID number:
Student Last Name:

Guidelines. The quiz consists of two exercises. All answers should be written
in the answer boxes. No justifications for the answers are needed, unless explicitly
required. You are expected to do this quiz on your own without assistance from
anyone else in the class. If possible, please avoid pencils and use pens with dark
ink. Thank you.

CUDA Cheat Sheet 1: Matrix multiplication

#include <iostream>
#include <string>
#include <cassert>
#include <cstdlib>
#include <ctime>

using namespace std;

struct cuda_exception {
explicit cuda_exception(const char *err) : error_info(err) {}
explicit cuda_exception(const string &err) : error_info(err) {}
string what() const throw() { return error_info; }

private:
string error_info;
};

void checkCudaError(const char *msg) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
string error_info(msg);
error_info += " : ";
error_info += cudaGetErrorString(err);
throw cuda_exception(error_info);
}
}

template<typename T>
void random_matrix(T *M, size_t height, size_t width, int p = 2) {
for(size_t i = 0; i < height; ++i) {
for (size_t j = 0; j < width; ++j) {
M[i * width + j] = rand() % p;
}
}
}

template<typename T>
void print_matrix(const T *M, size_t height, size_t width) {
if (height >= 32 || width >= 32) {
cout << "a matrix of height " << height << ", of width " << width <<
return;
}

for(size_t i = 0; i < height; ++i) {
for (size_t j = 0; j < width; ++j) {
cout << M[i * width + j] << " ";
}
cout << endl;
}
cout << endl;
}

#define BLOCK_SIZE 16

/**
* CUDA kernel for matrix multiplication, block-wise multiplication
*
* @C, the output matrix C = A * B
* @A, the first input matrix
* @B, the second input matrix
* @wa, the width of A
* @wb, the width of B
*
* returns void.
*
*/

template <typename T>
__global__ void matrix_mul_ker(T* C, const T *A, const T *B,
size_t wa, size_t wb)
{
// Block index; WARNING: should be at most 2^16 - 1
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

// Index of the first submatrix of A processed by the block.
// This is the y-coordinate of the NW corner of the working tile
int aBegin = wa * BLOCK_SIZE * by;

// Index of the last submatrix of A processed by the block.
// This is the y-coordinate of the SE corner of the working tile
int aEnd = aBegin + wa - 1;

// Step size used to iterate through the submatrices of A
int aStep = BLOCK_SIZE;

// Index of the first submatrix of B processed by the block.
// This is the x-coordinate of the NW corner of the working tile
int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the submatrices of B
int bStep = BLOCK_SIZE * wb;

// The element of the block submatrix that is computed by the thread.
// WARNING: This is a local variable of the working thread
T Csub = 0;

// Loop over all the submatrices of A and B required to
// compute the block submatrix.
// This loop iterates through the tiles in A and B that
// contribute to the working tile of C
for(int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {

// shared memory for the submatrix of A
__shared__ T As[BLOCK_SIZE][BLOCK_SIZE];

// shared memory for the submatrix of B
__shared__ T Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from global memory to shared memory;
// each thread loads one element of each matrix
As[ty][tx] = A[a + wa * ty + tx];
Bs[ty][tx] = B[b + wb * ty + tx];

// synchronize to make sure the matrices are loaded
__syncthreads();

// Multiply the two matrices together;
// each thread computes one element of the block submatrix
for(int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// synchronize to make sure that the preceding computation is
// done before loading two new submatrices of A and B in the next iteration
__syncthreads();
}
// Write the block submatrix to global memory;
// each thread writes one element
int c = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wb * ty + tx] = Csub;
}
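For concreteness (a worked instance, not part of the original sheet): with BLOCK_SIZE = 16, wa = 64, and wb = 32, the block at (bx, by) = (1, 2) has aBegin = 64 * 16 * 2 = 2048, aEnd = 2048 + 63 = 2111, aStep = 16, bBegin = 16, and bStep = 16 * 32 = 512, so the tile loop runs 64 / 16 = 4 times, accumulating one 16 x 16 tile product per iteration.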

template <typename T>
void matrix_mul_dev(T* C, const T* A, const T* B, int ha, int wa, int wb) {

assert(wa % BLOCK_SIZE == 0);
assert(wb % BLOCK_SIZE == 0);

// load A and B to the device
size_t mem_size = ha * wa * sizeof(T);

T* Ad;
cudaMalloc((void **)&Ad, mem_size);
checkCudaError("allocate GPU memory for the first matrix");
cudaMemcpy(Ad, A, mem_size, cudaMemcpyHostToDevice);

T* Bd;
mem_size = wa * wb * sizeof(T);
cudaMalloc((void **)&Bd, mem_size);
checkCudaError("allocate GPU memory for the second matrix");
cudaMemcpy(Bd, B, mem_size, cudaMemcpyHostToDevice);

// allocate C on the device
T* Cd;
mem_size = ha * wb * sizeof(T);
cudaMalloc((void**)&Cd, mem_size);
checkCudaError("allocate GPU memory for the output matrix");

// compute the execution configuration;
// assume that the matrix dimensions are multiples of BLOCK_SIZE
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
size_t dgx = wb / dimBlock.x;
size_t dgy = ha / dimBlock.y;
dim3 dimGrid(dgx, dgy);

// launch the device computation
matrix_mul_ker<<<dimGrid, dimBlock>>>(Cd, Ad, Bd, wa, wb);
cudaDeviceSynchronize();
checkCudaError("call the matrix multiplication kernel");

// read C from the device
cudaMemcpy(C, Cd, mem_size, cudaMemcpyDeviceToHost);

// Free device memory
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);
}

/**
* Returns the time spent in seconds
*/
template <typename T>
double matrix_mul_gpu(T* C, const T* A, const T* B, int ha, int wa, int wb) {
clock_t t1 = clock();

// do the multiplication
matrix_mul_dev(C, A, B, ha, wa, wb);

clock_t t2 = clock();
return (t2 - t1) / double(CLOCKS_PER_SEC);
}

/**
* ha = 2^eha,
* wa = 2^ewa,
* wb = 2^ewb;
*
* If no parameter is entered, then default values are used.
* If one parameter is entered, it sets eha = ewa = ewb = argv[1].
* If three parameters are entered, they are eha, ewa, and ewb, respectively.
*/
int matrix_mul_test(int argc, char **argv) {
int *A = NULL, *B = NULL, *C = NULL; // NULL so the catch blocks can safely delete[]
size_t eha = 4;
size_t ewa = 4;
size_t ewb = 4;

if (argc == 2) {
eha = ewa = ewb = atoi(argv[1]);
} else if (argc >= 3) {
eha = atoi(argv[1]);
ewa = atoi(argv[2]);
ewb = atoi(argv[3]);
}

size_t ha = (1L << eha);
size_t wa = (1L << ewa);
size_t wb = (1L << ewb);

try {
A = new int[ha * wa];
B = new int[wa * wb];
C = new int[ha * wb];
random_matrix(A, ha, wa);
random_matrix(B, wa, wb);

// timing the GPU-based method
cout << matrix_mul_gpu(C, A, B, ha, wa, wb) << " (seconds)" << endl;

} catch (cuda_exception& err) {
cout << err.what() << endl;
delete [] A;
delete [] B;
delete [] C;
return EXIT_FAILURE;
} catch (...) {
delete [] A;
delete [] B;
delete [] C;
cout << "unknown exeception" << endl;
return EXIT_FAILURE;
}

print_matrix(A, ha, wa);
print_matrix(B, wa, wb);
print_matrix(C, ha, wb);

delete [] A;
delete [] B;
delete [] C;
return 0;
}

int main(int argc, char** argv) {
matrix_mul_test(argc, argv);
return 0;
}
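Usage sketch: assuming the cheat sheet is saved as matrix_mul.cu, it can be compiled with nvcc -o matrix_mul matrix_mul.cu and run with zero, one, or three exponent arguments as documented above; for example, ./matrix_mul 5 6 7 multiplies a 2^5 x 2^6 matrix A by a 2^6 x 2^7 matrix B.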

Exercise 1. The following C function adds two float vectors iA and iB into oC.

void vector_add(float *iA, float *iB, float *oC, int width) {
int i;
for (i = 0; i < width; i++) {
oC[i] = iA[i] + iB[i];
}
}
Write a CUDA kernel with the same specification, for a 1-D grid, with 1-D
thread blocks, assuming that each thread is in charge of computing one element in
oC.
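
A minimal sketch of one possible answer follows; the kernel name vector_add_ker, the block size, and the device pointers dA, dB, dC are illustrative assumptions, not prescribed by the exercise.

__global__ void vector_add_ker(float *iA, float *iB, float *oC, int width) {
    // global index of this thread in the 1-D grid of 1-D blocks
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // guard: width need not be a multiple of the block size
    if (i < width) {
        oC[i] = iA[i] + iB[i];
    }
}

// Example launch, assuming dA, dB, dC were allocated with cudaMalloc
// and the inputs copied over with cudaMemcpy:
//   int threads = 256;
//   int blocks = (width + threads - 1) / threads;
//   vector_add_ker<<<blocks, threads>>>(dA, dB, dC, width);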

Exercise 2. Write a CUDA kernel (and the launching code) implementing the
reversal of an input integer array A of size n. This reversing process will be
out-of-place. You are asked to proceed in two steps.
(1) First write a “naive” kernel which does not use shared memory.
(2) Then, write a kernel using shared memory.
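
Minimal sketches of both kernels and a possible launch follow; the names reverse_naive, reverse_shared, dIn, dOut and the launch parameters are illustrative assumptions, and the shared-memory sketch assumes n is a multiple of the block size.

// (1) Naive out-of-place reversal: reads and writes global memory only.
__global__ void reverse_naive(int *out, const int *in, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[n - 1 - i] = in[i];
    }
}

// (2) Reversal staged through shared memory: each block reverses its tile
// in shared memory, then writes the tile to the mirrored block position.
__global__ void reverse_shared(int *out, const int *in, int n) {
    extern __shared__ int tile[];
    int gi = blockIdx.x * blockDim.x + threadIdx.x;
    // store the block's elements in reversed order within the tile
    tile[blockDim.x - 1 - threadIdx.x] = in[gi];
    __syncthreads();
    // the reversed tile lands at the mirrored block position in out
    int block_start = n - (blockIdx.x + 1) * blockDim.x;
    out[block_start + threadIdx.x] = tile[threadIdx.x];
}

// Example launching code, assuming device arrays dIn and dOut of length n:
//   int threads = 256;
//   int blocks = n / threads; // n assumed to be a multiple of threads
//   reverse_naive<<<blocks, threads>>>(dOut, dIn, n);
//   reverse_shared<<<blocks, threads, threads * sizeof(int)>>>(dOut, dIn, n);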

