-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatrixMultiplicationWithoutSharedMemory.cu
More file actions
198 lines (147 loc) · 5.49 KB
/
matrixMultiplicationWithoutSharedMemory.cu
File metadata and controls
198 lines (147 loc) · 5.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/*
* =====================================================================================
*
* Filename: matrixMultiplicationWithoutSharedMemory.c
*
* Description: based on CUDA C programming
* This code multiplies two matrices A and B and gives matrix C as
* the result, This code runs on GPU when all threads have no shared memory
* This is matrix multiplication on host
* or I can say its host code
* Matrix dimensions are assumed to be multiples of BLOCK_SIZE
*
* USAGE: ./matrixMultiplicationWithoutSharedMemory a1 a2 b2
* Example ./mulNoShare 4 2 5
*
* Version: 1.0
* Created: 04/01/2013 00:40:06
* Revision: none
* Compiler: gcc, nvcc
Compiling: nvcc -o matrixMultiplicationWithoutSharedMemory -g -G matrixMultiplicationWithoutSharedMemory.cu
*
* Author: ABDIMUNA (), abdimuna1@gmail.com
* Organization:
OPINIONS: "Contributions to this code is highly encouraged!, you can contact me thro
my email, or facebook, abdimuna1"
DISCLAIMER: use this code under your own modifications, since I'm still working on it,
it seem the results are not what I expected, thanks
*
* =====================================================================================
*/
#include "matrixMultiplicationWithoutSharedMemory.h"
#include <cuda.h>
#include <string.h>
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Load A and B to device memory
Matrix d_A;
d_A.width = A.width;
d_A.height = A.height;
size_t size = A.width * A.height *sizeof(float);
cudaError_t err = cudaMalloc(&d_A.elements, size);
printf("CUDA malloc A:%s\n", cudaGetErrorString(err));
err = cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
printf("Copy A to device: %s\n", cudaGetErrorString(err));
// Loading Matrix B to device Memory
//
Matrix d_B;
d_B.width = B.width;
d_B.height = B.height;
size = B.width * B.height *sizeof(float);
err = cudaMalloc(&d_B.elements, size);
printf("CUDA malloc B: %s\n", cudaGetErrorString(err));
err = cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
printf("Copy B to device: %s\n", cudaGetErrorString(err));
// Allocate C in device memory
Matrix d_C;
d_C.width = C.width;
d_C.height = C.height;
size = C.width * C.height *sizeof(float);
err = cudaMalloc(&d_C.elements, size);
printf("CUDA malloc C: %s\n", cudaGetErrorString(err));
// Invoke kernel
//
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((B.width + dimBlock.x - 1) / dimBlock.x, (A.height + dimBlock.y -1) / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
err = cudaThreadSynchronize();
printf("Run kernel: %s\n", cudaGetErrorString(err));
// Read C from device memory
err = cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
printf("Copy C off the GPU: %s\n", cudaGetErrorString(err));
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
// cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul():
//
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Each thread computes one element of C
// by accumulating results into Cvalue
float Cvalue = 0.0;
int row = blockIdx.y + blockDim.y + threadIdx.y; // vertically are rows
int col = blockIdx.x + blockDim.x + threadIdx.x; // horrizontally are columns
if(row > A.height || col > B.width) return; // if there are more rows or cols just return
for(int i= 0; i<A.width; ++i)
Cvalue += (A.elements[row * A.width + i]) * (B.elements[i*B.width + col]);
C.elements[row *C.width + col] = Cvalue;
}
// Usage mulNoShare a1 a2 b2
int main(int argc, char **argv)
{
Matrix A, B, C;
int a1, a2, b1, b2;
// Read some values from commandLine
a1 = atoi(argv[1]); /* Height of A i.e total rows of matrix A */
a2 = atoi(argv[2]); /* Width of A i.e total number of columns of Matrix A */
b1 = a2; /* Height of B i.e Number of rows of B */
b2 = atoi(argv[3]); /* Width of B, i.e Number of columns of matrix B */
A.height = a1;
A.width = a2;
A.elements = (float*)malloc(A.width * A.height * sizeof(float));
B.height = b1;
B.width = b2;
B.elements = (float *)malloc(B.width * B.height *sizeof(float));
C.height = A.height;
C.width = B.width;
C.elements = (float *)malloc(C.width *C.height *sizeof(float));
for(int i = 0; i< A.height; i++)
for(int j =0; j<A.width; j++)
A.elements[i*A.width +j] = (float)(arc4random() % 3);
for(int i = 0; i< B.height; i++ )
for(int j =0; j<B.width; j++)
B.elements[i*B.width +j] = (float)(arc4random() % 2);
MatMul(A, B, C);
// printing up to a 10x10 portion of the three matrices
//
// printing ---matrix A----
printf("------------Matrix_A---------\n");
for(int i = 0; i< min(10, A.height); i++)
{
for(int j = 0; j< min(10, A.width); j++)
printf("%.3f ", A.elements[i*A.width +j]);
printf("\n");
}
printf("\n");
// printing matirx ----B-----
printf("------------Matrix_B---------\n");
for(int i = 0; i< min(10, B.height); i++)
{
for(int j = 0; j< min(10, B.width); j++)
printf("%.3f ", B.elements[i*B.width +j]);
printf("\n");
}
printf("\n");
// printing matirx ----C-----
printf("------------Matrix_C---------\n");
for(int i = 0; i< min(10, C.height); i++)
{
for(int j = 0; j< min(10, C.width); j++)
printf("%.3f ", B.elements[i*C.width +j]);
printf("\n");
}
printf("\n");
return 0;
}