-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsolution
More file actions
163 lines (126 loc) · 4.63 KB
/
solution
File metadata and controls
163 lines (126 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "files.h"
#define SOFTENING 1e-9f
#define ASSERT(_err) if(_err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(_err));
/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/
typedef struct { float x, y, z, vx, vy, vz; } Body;
/*
* Calculate the gravitational impact of all bodies in the system
* on all others - device version
*/
__global__
void bodyForce(Body *p, float dt, int n) {
int idxI = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
for (int i = idxI; i < n; i += stride) {
for (int j = 0; j < n; j++) {
float dx = p[j].x - p[i].x;
float dy = p[j].y - p[i].y;
float dz = p[j].z - p[i].z;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}
p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
}
}
__global__
void integratePosition(Body *p, float dt, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < n; i += stride) {
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}
}
int main(const int argc, const char** argv) {
int deviceId;
int numberOfSMs;
cudaError_t cudaErr;
cudaErr = cudaGetDevice(&deviceId);
ASSERT(cudaErr);
cudaErr = cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
ASSERT(cudaErr);
// The assessment will test against both 2<11 and 2<15.
// Feel free to pass the command line argument 15 when you gernate ./nbody report files
int nBodies = 2<<11;
if (argc > 1) nBodies = 2<<atoi(argv[1]);
// The assessment will pass hidden initialized values to check for correctness.
// You should not make changes to these files, or else the assessment will not work.
const char * initialized_values;
const char * solution_values;
if (nBodies == 2<<11) {
initialized_values = "files/initialized_4096";
solution_values = "files/solution_4096";
} else { // nBodies == 2<<15
initialized_values = "files/initialized_65536";
solution_values = "files/solution_65536";
}
if (argc > 2) initialized_values = argv[2];
if (argc > 3) solution_values = argv[3];
const float dt = 0.01f; // Time step
const int nIters = 10; // Simulation iterations
int bytes = nBodies * sizeof(Body);
float *buf;
// buf = (float *)malloc(bytes);
cudaMallocManaged(&buf, bytes);
Body *p = (Body*)buf;
read_values_from_file(initialized_values, buf, bytes);
double totalTime = 0.0;
/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/
size_t threadsPerBlock = 256;
size_t numberOfBlocks = 32 * numberOfSMs;
cudaErr = cudaMemPrefetchAsync(p, bytes, deviceId);
ASSERT(cudaErr);
for (int iter = 0; iter < nIters; iter++) {
StartTimer();
/*
* You will likely wish to refactor the work being done in `bodyForce`,
* and potentially the work to integrate the positions.
*/
bodyForce<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies); // compute interbody forces
cudaErr = cudaGetLastError();
ASSERT(cudaErr);
cudaErr = cudaDeviceSynchronize();
ASSERT(cudaErr);
/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
integratePosition<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);
cudaErr = cudaGetLastError();
ASSERT(cudaErr);
cudaErr = cudaDeviceSynchronize();
ASSERT(cudaErr);
#if 0
for (int i = 0 ; i < nBodies; i++) { // integrate position
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}
#endif
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}
double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
write_values_to_file(solution_values, buf, bytes);
// You will likely enjoy watching this value grow as you accelerate the application,
// but beware that a failure to correctly synchronize the device might result in
// unrealistically high values.
printf("%0.3f Billion Interactions / second", billionsOfOpsPerSecond);
// free(buf);
cudaFree(buf);
}