From f802c0af84814075d28f78dac0af4c3951db1d1a Mon Sep 17 00:00:00 2001 From: Mwsxy Date: Sun, 6 Feb 2022 21:27:00 +0800 Subject: [PATCH] Ytterbium's solution --- ANSWER.md | 127 +++++++++++++++++++++++++++++++++++++++++++++------ alignalloc.h | 4 ++ main.cpp | 63 +++++++++++++++++++------ 3 files changed, 167 insertions(+), 27 deletions(-) diff --git a/ANSWER.md b/ANSWER.md index 83349d8..08a83ac 100644 --- a/ANSWER.md +++ b/ANSWER.md @@ -1,25 +1,122 @@ +# 运行环境 +## CPU +Name: Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz +Core Count: 2 +Thread Count: 4 +Flags: FPU, SSE, SSE2, ... +L1 Cache: 64KB, Write back, 8-way Set-associative +L2 Cache: 512KB, Write back, 4-way Set-associative +L3 Cache: 4096KB, Write back, 16-way Set-associative + +## Memory +Size: 4096MB x 2 +Data Width: 64 bits +Type: LPDDR3 +Speed: 1867MT/s + +理论极限带宽 = 频率 * 宽度 * 数量 = 29872 MB/s(宽度 64 bits = 8 字节) +$1867 \times 8 \times 2 = 29872$ MB/s # 改进前 ``` -这里贴改进前的运行结果。 -matrix_randomize: 100s +t=0: n=1120 +matrix_randomize: 0.004051s +matrix_randomize: 0.003408s +matrix_transpose: 0.007777s +matrix_multiply: 2.90488s +matrix_multiply: 2.26766s +matrix_RtAR: 5.1804s +matrix_trace: 6.5e-05s +1.75932e+08 +test_func: 5.19764s +t=1: n=928 +matrix_randomize: 0.004649s +matrix_randomize: 0.005066s +matrix_transpose: 0.00422s +matrix_multiply: 0.848782s +matrix_multiply: 0.835605s +matrix_RtAR: 1.6887s +matrix_trace: 0.000164s +1.00156e+08 +test_func: 1.70451s +t=2: n=1024 +matrix_randomize: 0.012848s +matrix_randomize: 0.012852s +matrix_transpose: 0.012015s +matrix_multiply: 6.02873s +matrix_multiply: 6.04324s +matrix_RtAR: 12.0866s +matrix_trace: 0.000372s +1.34324e+08 +test_func: 12.1157s +t=3: n=1056 +matrix_randomize: 0.011363s +matrix_randomize: 0.018065s +matrix_transpose: 0.005886s +matrix_multiply: 2.4236s +matrix_multiply: 2.40505s +matrix_RtAR: 4.84032s +matrix_trace: 0.000177s +1.47405e+08 +test_func: 4.87778s +overall: 23.9097s ``` # 改进后 ``` -这里贴改进后的运行结果。 -matrix_randomize: 0.01s +t=0: n=1120 +matrix_randomize: 0.001592s 
+matrix_randomize: 0.000908s +matrix_transpose: 0.012604s +matrix_multiply: 0.294772s +matrix_multiply: 0.259419s +matrix_RtAR: 0.566971s +matrix_trace: 6.3e-05s +1.75932e+08 +test_func: 0.577756s +t=1: n=928 +matrix_randomize: 0.000772s +matrix_randomize: 0.000502s +matrix_transpose: 0.003806s +matrix_multiply: 0.168204s +matrix_multiply: 0.1403s +matrix_RtAR: 0.312448s +matrix_trace: 4.2e-05s +1.00156e+08 +test_func: 0.32201s +t=2: n=1024 +matrix_randomize: 0.000424s +matrix_randomize: 0.000384s +matrix_transpose: 0.001753s +matrix_multiply: 0.206256s +matrix_multiply: 0.213244s +matrix_RtAR: 0.421353s +matrix_trace: 0.00011s +1.34324e+08 +test_func: 0.423929s +t=3: n=1056 +matrix_randomize: 0.000696s +matrix_randomize: 0.000604s +matrix_transpose: 0.002052s +matrix_multiply: 0.206916s +matrix_multiply: 0.200554s +matrix_RtAR: 0.409645s +matrix_trace: 5.2e-05s +1.47405e+08 +test_func: 0.41374s +overall: 1.74306s ``` # 加速比 -matrix_randomize: 10000x -matrix_transpose: 10000x -matrix_multiply: 10000x -matrix_RtAR: 10000x +matrix_randomize: 12.9x +matrix_transpose: 1.48x +matrix_multiply: 14x +matrix_RtAR: 13.9x > 如果记录了多种优化方法,可以做表格比较 +只是基于OpenMP做了优化,还没有使用其他的框架。 # 优化方法 @@ -27,20 +124,24 @@ matrix_RtAR: 10000x > matrix_randomize -请回答。 +使用正确的YX循环遍历顺序,确保顺序访问。 +使用分块循环遍历,充分利用Cache。 +适当展开定长循环,确保编译器能够充分自动SIMD。 > matrix_transpose -请回答。 +分块循环减少跨步访问。 > matrix_multiply -请回答。 +分块循环,同时可以使用本地临时变量缓存累加的结果,适当展开小循环。 +对写内存做统一集中写入,避免反复读取。 > matrix_RtAR -请回答。 +使用static变量减少变量的反复创建。 # 我的创新点 -如果有,请说明。 +没有:( + diff --git a/alignalloc.h b/alignalloc.h index 8ac1a5b..0e7a09f 100644 --- a/alignalloc.h +++ b/alignalloc.h @@ -10,7 +10,11 @@ // https://stackoverflow.com/questions/12942548/making-stdvector-allocate-aligned-memory namespace detail { void* allocate_aligned_memory(size_t align, size_t size) { +#if defined(__APPLE__) && defined(__MACH__) + return aligned_alloc(align, size); +#else return std::aligned_alloc(align, size); +#endif } void deallocate_aligned_memory(void* ptr) noexcept { 
std::free(ptr); diff --git a/main.cpp b/main.cpp index d5af053..7a536db 100644 --- a/main.cpp +++ b/main.cpp @@ -13,9 +13,13 @@ #include "ndarray.h" #include "wangsrng.h" #include "ticktock.h" +#include "alignalloc.h" +// nru 是用来对数据分块处理的粒度 +constexpr int nru = 16; // Matrix 是 YX 序的二维浮点数组:mat(x, y) = mat.data()[y * mat.shape(0) + x] -using Matrix = ndarray<2, float>; +using Matrix = ndarray<2, float, 0, nru, AlignedAllocator>; + // 注意:默认对齐到 64 字节,如需 4096 字节,请用 ndarray<2, float, AlignedAllocator<4096, float>> static void matrix_randomize(Matrix &out) { @@ -24,11 +28,22 @@ static void matrix_randomize(Matrix &out) { size_t ny = out.shape(1); // 这个循环为什么不够高效?如何优化? 10 分 + // 答: + // 1. YX 序的数组应该优先X维度的连续访问(但此场景编译器应该能处理好,OpenMP也可以帮助处理, collapse)。 + // 2. 各个内存区域赋值实际是彼此独立没有数据依赖的,可以并行处理(openmp parallel for) + // 3. 由于求随机数函数的存在,编译器无法解除数据依赖关系,手动展开可以帮助编译器理解。(unroll) + // 4. 写入粒度过小,unroll后写入粒度达到64字节,满足一个Cache行的写入 #pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - float val = wangsrng(x, y).next_float(); - out(x, y) = val; + for (int y = 0; y < ny; y+=nru) { + for (int x = 0; x < nx; x+=nru) { +#pragma omp SIMD + for (int yof = 0; yof < nru; yof++) { +#pragma omp unroll + for (int xof = 0; xof < nru; xof++){ + float val = wangsrng(x+xof, y+yof).next_float(); + out(x+xof, y+yof) = val; + } + } } } TOCK(matrix_randomize); @@ -41,10 +56,17 @@ static void matrix_transpose(Matrix &out, Matrix const &in) { out.reshape(ny, nx); // 这个循环为什么不够高效?如何优化? 
15 分 + // 答:跨步访问导致不高效,可以使用分块遍历,充分利用Cache #pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - out(y, x) = in(x, y); + for (int x = 0; x < nx; x+=nru) { + for (int y = 0; y < ny; y+=nru) { +#pragma omp SIMD + for (int xof = 0; xof < nru; xof++) { +#pragma omp unroll + for (int yof = 0; yof < nru; yof++) { + out(y+yof, x+xof) = in(x+xof, y+yof); + } + } } } TOCK(matrix_transpose); @@ -62,13 +84,25 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) { out.reshape(nx, ny); // 这个循环为什么不够高效?如何优化? 15 分 + // 答: #pragma omp parallel for collapse(2) - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - out(x, y) = 0; // 有没有必要手动初始化? 5 分 - for (int t = 0; t < nt; t++) { - out(x, y) += lhs(x, t) * rhs(t, y); + for (int y = 0; y < ny; y+=nru) { + for (int x = 0; x < nx; x+=nru) { + // out(x, y) = 0; // 有没有必要手动初始化? 5 分 答:不需要,可以使用临时数组累加再赋值规避 + float sum[nru][nru] = {0}; +#pragma omp SIMD + for (int t = 0; t < nt; t += nru) { +#pragma omp unroll partial(nru*nru*nru/4) + for (int yf = 0; yf < nru; yf++) + for (int xf = 0; xf < nru; xf++) + for (int tf = 0; tf < nru; tf++) + // out(x, y) += lhs(x, t) * rhs(t, y); + sum[yf][xf] += lhs(x + xf, t + tf) * rhs(t + tf, y + yf); } +#pragma omp unroll + for (int yf = 0; yf < nru; yf++) + for (int xf = 0; xf < nru; xf++) + out(x + xf, y + yf) = sum[yf][xf]; } } TOCK(matrix_multiply); @@ -78,7 +112,8 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) { static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) { TICK(matrix_RtAR); // 这两个是临时变量,有什么可以优化的? 5 分 - Matrix Rt, RtA; + // 答:可以声明为static的,避免重复创建 + static Matrix Rt, RtA; matrix_transpose(Rt, R); matrix_multiply(RtA, Rt, A); matrix_multiply(RtAR, RtA, R);