-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathChunkTesting.cuh
More file actions
155 lines (125 loc) · 3.93 KB
/
ChunkTesting.cuh
File metadata and controls
155 lines (125 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/*
* ChunkTesting.cuh
*
* Created on: Jan 5, 2016
* Author: matan
*/
#ifndef CHUNKTESTING_CUH_
#define CHUNKTESTING_CUH_
#include "ChunkPOC.cuh"
#include <cstdio>
// Number of rows of `irreducibles` that irreducibleIndex() scans.
#define IRRED_SIZE 32
// Table declared here and defined elsewhere; each row holds 5 unsigned ints.
// In this file, element [4] of a row is compared against a field degree by
// irreducibleIndex(), and a whole row is handed to setPentanomial()
// (presumably the pentanomial's coefficients — confirm against the definition).
extern unsigned int irreducibles[][5];
/*
 * Returns floor(log2(n)): the number of integer halvings needed to bring
 * n down to 1.
 *
 * n == 0 has no logarithm; 0 is returned for it instead of looping forever.
 */
unsigned int calcLog(unsigned int n)
{
	unsigned int i = 0;
	// `n > 1` rather than `n != 1`: for n == 0 the value stays 0 after every
	// halving, so a `!= 1` test would never become false.
	while (n > 1)
	{
		n /= 2;
		++i;
	}
	return i;
}
/*
 * Finds the row of `irreducibles` whose element [4] equals fieldDegree.
 *
 * Scans the first IRRED_SIZE rows in order and returns the index of the
 * first match. If no row matches, a diagnostic is printed and the process
 * is terminated with exit(-1); the function does not return in that case.
 */
unsigned int irreducibleIndex(unsigned int fieldDegree)
{
	for (unsigned int i = 0 ; i < IRRED_SIZE ; ++i)
	{
		if (irreducibles[i][4] == fieldDegree)
		{
			return i;
		}
	}
	// %u, not %d: fieldDegree is unsigned, so %d was a mismatched specifier.
	printf("Field degree %u is not set in irreducibles!\n",fieldDegree);
	exit(-1);
}
/*
 * Times a single launch of finiteFieldMultiplyArrays<N> over `size` rows.
 *
 * Two host arrays of `size` rows, ROUNDED(N) words per row, are filled with
 * zeros except for word N/2 of every row, which is set to 2. Both arrays are
 * uploaded and the kernel is launched with (chunksA, chunksB, chunksA, size),
 * i.e. the first buffer is also passed as the third argument (presumably an
 * in-place product — confirm against the kernel). The result is copied back
 * to the host but not verified here.
 *
 * Returns the elapsed time in milliseconds between the CUDA events recorded
 * immediately before and after the kernel launch.
 *
 * NOTE(review): the irreducible is looked up with the SIZE macro rather than
 * with N — confirm that this is intended.
 */
template <unsigned int N>
float testChunksMultiply(unsigned int size)
{
	unsigned int * pentanomialCoefficients = irreducibles[irreducibleIndex(SIZE)];
	setPentanomial(pentanomialCoefficients);

	// One byte count for every allocation and copy. Rows are ROUNDED(N) words
	// wide on both host and device, so the copy back must use it as well.
	const size_t bytes = sizeof(unsigned int)*ROUNDED(N)*size;

	unsigned int (*chunksAHost)[ROUNDED(N)] = new unsigned int[size][ROUNDED(N)];
	unsigned int (*chunksBHost)[ROUNDED(N)] = new unsigned int[size][ROUNDED(N)];
	for(unsigned int i = 0 ; i < size ; ++i)
	{
		for(unsigned int j = 0 ; j < ROUNDED(N) ; ++j)
		{
			chunksAHost[i][j]=0;
			chunksBHost[i][j]=0;
		}
		chunksAHost[i][N/2]=2;
		chunksBHost[i][N/2]=2;
	}

	unsigned int (*chunksA)[ROUNDED(N)];
	unsigned int (*chunksB)[ROUNDED(N)];
	gpuErrchk(cudaMalloc(&chunksA, bytes));
	gpuErrchk(cudaMalloc(&chunksB, bytes));
	gpuErrchk(cudaMemcpy(chunksA, chunksAHost, bytes, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(chunksB, chunksBHost, bytes, cudaMemcpyHostToDevice));

	// Ceiling division: enough blocks for size*GROUP_SIZE(N) threads.
	unsigned int blocksNum = (size*GROUP_SIZE(N)+THREAD_BLOCK_SIZE(N)-1)/THREAD_BLOCK_SIZE(N);

	cudaEvent_t start, end;
	gpuErrchk(cudaEventCreate(&start));
	gpuErrchk(cudaEventCreate(&end));

	gpuErrchk(cudaEventRecord(start));
	finiteFieldMultiplyArrays<N><<<
			blocksNum,
			THREAD_BLOCK_SIZE(N)>>>
			(chunksA, chunksB, chunksA, size);
	gpuErrchk(cudaEventRecord(end));
	gpuErrchk(cudaPeekAtLastError());

	gpuErrchk(cudaMemcpy(chunksAHost, chunksA, bytes, cudaMemcpyDeviceToHost));

	// The elapsed time is only defined once both events have completed.
	gpuErrchk(cudaEventSynchronize(end));
	float ms = 0.0f;
	gpuErrchk(cudaEventElapsedTime(&ms, start, end));

	// Release the events; they were previously leaked on every call.
	gpuErrchk(cudaEventDestroy(start));
	gpuErrchk(cudaEventDestroy(end));

	delete[] chunksAHost;
	delete[] chunksBHost;
	gpuErrchk(cudaFree(chunksA));
	gpuErrchk(cudaFree(chunksB));
	return ms;
}
/*
 * Times a single launch of performMult64 over `size` rows.
 *
 * Two host input arrays of `size` rows, 64 words per row, are filled with
 * zeros except for word 32 of every row, which is set to 2. Both are
 * uploaded, the kernel is launched with (chunksA, chunksB, chunksC, size),
 * and the 128-words-per-row output buffer chunksC is copied back to the host.
 * The result is not verified here.
 *
 * Returns the elapsed time in milliseconds between the CUDA events recorded
 * immediately before and after the kernel launch.
 */
float testChunksMultiply64Ring(unsigned int size)
{
	unsigned int * pentanomialCoefficients = irreducibles[irreducibleIndex(SIZE)];
	setPentanomial(pentanomialCoefficients);

	// Byte counts of the input (64 words/row) and output (128 words/row) buffers.
	const size_t inBytes = sizeof(unsigned int)*64*size;
	const size_t outBytes = sizeof(unsigned int)*128*size;

	unsigned int (*chunksAHost)[64] = new unsigned int[size][64];
	unsigned int (*chunksBHost)[64] = new unsigned int[size][64];
	// Left uninitialised on purpose: fully overwritten by the copy back below.
	unsigned int (*chunksCHost)[128] = new unsigned int[size][128];
	for(unsigned int i = 0 ; i < size ; ++i)
	{
		for(unsigned int j = 0 ; j < 64 ; ++j)
		{
			chunksAHost[i][j]=0;
			chunksBHost[i][j]=0;
		}
		chunksAHost[i][32]=2;
		chunksBHost[i][32]=2;
	}

	unsigned int (*chunksA)[64];
	unsigned int (*chunksB)[64];
	unsigned int (*chunksC)[128];
	gpuErrchk(cudaMalloc(&chunksA, inBytes));
	gpuErrchk(cudaMalloc(&chunksB, inBytes));
	gpuErrchk(cudaMalloc(&chunksC, outBytes));
	gpuErrchk(cudaMemcpy(chunksA, chunksAHost, inBytes, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(chunksB, chunksBHost, inBytes, cudaMemcpyHostToDevice));

	// Ceiling division: enough blocks for size*GROUP_SIZE_RING threads.
	unsigned int blocksNum = (size*GROUP_SIZE_RING+MAX_THREADBLOCK_SIZE-1)/MAX_THREADBLOCK_SIZE;

	cudaEvent_t start, end;
	gpuErrchk(cudaEventCreate(&start));
	gpuErrchk(cudaEventCreate(&end));

	gpuErrchk(cudaEventRecord(start));
	performMult64<<<
			blocksNum,
			MAX_THREADBLOCK_SIZE>>>
			(chunksA, chunksB, chunksC, size);
	gpuErrchk(cudaEventRecord(end));
	gpuErrchk(cudaPeekAtLastError());

	gpuErrchk(cudaMemcpy(chunksCHost, chunksC, outBytes, cudaMemcpyDeviceToHost));

	// The elapsed time is only defined once both events have completed.
	gpuErrchk(cudaEventSynchronize(end));
	float ms = 0.0f;
	gpuErrchk(cudaEventElapsedTime(&ms, start, end));

	// Release the events; they were previously leaked on every call.
	gpuErrchk(cudaEventDestroy(start));
	gpuErrchk(cudaEventDestroy(end));

	delete[] chunksAHost;
	delete[] chunksBHost;
	delete[] chunksCHost;
	gpuErrchk(cudaFree(chunksA));
	gpuErrchk(cudaFree(chunksB));
	gpuErrchk(cudaFree(chunksC));
	return ms;
}
#endif /* CHUNKTESTING_CUH_ */