File tree Expand file tree Collapse file tree 2 files changed +6
-4
lines changed
Expand file tree Collapse file tree 2 files changed +6
-4
lines changed Original file line number Diff line number Diff line change @@ -24,7 +24,7 @@ float4 LoadFromGlobalPTX(float4 *ptr) {
2424__global__ void mem_bw (float * A, float * B, float * C){
2525 // 泛指当前线程在所有block范围内的全局id
2626 int idx = blockIdx .x * blockDim .x + threadIdx .x ;
27- // int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x;
27+ // int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x; // lesson11里面错写为了这行,请参考熊猫-lesson11和lesson37的勘误.mp4
2828 for (int i = idx; i < MEMORY_OFFSET / 4 ; i += blockDim .x * gridDim .x ) {
2929 // 问题1: 删除43-46行,会发现带宽数据为2666g/S
3030 // 尝试: 使用nv ptx load global memory指令,结果数据依然没变
Original file line number Diff line number Diff line change 33#include " cuda_runtime.h"
44#include " cooperative_groups.h"
55// #define THREAD_PER_BLOCK 256
6- // 注意:本节视频将会在后面重录,现有视频有很多没有讲清楚的地方
7- // 这种warp和shared在老的gpu上面会很有成效,但是在turing后的GPU,nvcc编译器优化了很多,所以导致效果不明显
8- // cpu
6+ // 注意:
7+ // 1.本节的文字解析放在了CUDA_lesson.pdf,如有不懂,可以先看看文字解析
8+ // 2.这种warp和shared在老的gpu上面会很有成效,但是在turing后的GPU,nvcc编译器优化了很多,所以导致效果不明显
9+ // 3.我记得在某个CUDA版本之前,atomic是可以保证block或thread严格按照ID串行,但是某个CUDA版本之后,就不行了,至少在现有流行版本不行了,所以会发现CUDA copy if执行后,虽然全都是>0的值,但是顺序和输入不一样
10+ // cpu实现
911int filter (int *dst, int *src, int n) {
1012 int nres = 0 ;
1113 for (int i = 0 ; i < n; i++)
You can’t perform that action at this time.
0 commit comments