From 1b9fc2552318bb437365c345742a19a0675de4c8 Mon Sep 17 00:00:00 2001 From: GeLee-Q <865038696@qq.com> Date: Sat, 5 Feb 2022 00:35:23 +0800 Subject: [PATCH 1/2] 1 --- ANSWER.md | 193 ++++++++++++++++++++++++++++++++++++++++++++++--- CMakeLists.txt | 6 +- alignalloc.h | 9 ++- main.cpp | 82 ++++++++++++++++++--- ndarray.h | 18 +++-- 5 files changed, 277 insertions(+), 31 deletions(-) diff --git a/ANSWER.md b/ANSWER.md index 83349d8..6fe881d 100644 --- a/ANSWER.md +++ b/ANSWER.md @@ -1,23 +1,181 @@ + +| CPU 核心 | 6 | +| :------- | --------- | +| L1 数据 | 6 x 32 K | +| L1 指令 | 6 x 32 K | +| L2 | 6 x 256 K | +| L3 | 12 M | + + + # 改进前 +### 初始结果:(未开启OpenMP优化) + ``` -这里贴改进前的运行结果。 -matrix_randomize: 100s +t=0: n=1120 +matrix_randomize: 0.0047392s +matrix_randomize: 0.0039861s +matrix_transpose: 0.0025276s +matrix_multiply: 3.3746s +matrix_multiply: 3.31677s +matrix_RtAR: 6.69637s +matrix_trace: 3.15e-05s +1.75932e+08 +test_func: 6.70862s +t=1: n=928 +matrix_randomize: 0.0020972s +matrix_randomize: 0.0021105s +matrix_transpose: 0.0014424s +matrix_multiply: 1.77163s +matrix_multiply: 1.77209s +matrix_RtAR: 3.54777s +matrix_trace: 2.92e-05s +1.00156e+08 +test_func: 3.55457s +t=2: n=1024 +matrix_randomize: 0.0030934s +matrix_randomize: 0.0029501s +matrix_transpose: 0.0026326s +matrix_multiply: 3.043s +matrix_multiply: 3.06595s +matrix_RtAR: 6.11384s +matrix_trace: 2.95e-05s +1.34324e+08 +test_func: 6.12345s +t=3: n=1056 +matrix_randomize: 0.0027326s +matrix_randomize: 0.0026625s +matrix_transpose: 0.0019026s +matrix_multiply: 2.6474s +matrix_multiply: 2.62987s +matrix_RtAR: 5.28161s +matrix_trace: 3.22e-05s +1.47405e+08 +test_func: 5.29054s +overall: 21.6809s ``` +### 开启OpenMP优化 + +``` +t=0: n=1120 +matrix_randomize: 0.0068874s +matrix_randomize: 0.002048s +matrix_transpose: 0.0022631s +matrix_multiply: 0.807792s +matrix_multiply: 0.754302s +matrix_RtAR: 1.5655s +matrix_trace: 4.23e-05s +1.75932e+08 +test_func: 1.57837s +t=1: n=928 +matrix_randomize: 0.0007808s +matrix_randomize: 0.0006188s +matrix_transpose: 0.0010147s +matrix_multiply: 0.363695s +matrix_multiply: 0.395745s +matrix_RtAR: 0.761718s +matrix_trace: 5.13e-05s +1.00156e+08 +test_func: 0.766641s +t=2: n=1024 +matrix_randomize: 0.0007518s +matrix_randomize: 0.0010314s +matrix_transpose: 0.0024472s +matrix_multiply: 0.698181s +matrix_multiply: 0.687765s +matrix_RtAR: 1.38938s +matrix_trace: 9.16e-05s +1.34324e+08 +test_func: 1.39417s +t=3: n=1056 +matrix_randomize: 0.0010823s +matrix_randomize: 0.0010954s +matrix_transpose: 0.0020503s +matrix_multiply: 0.667081s +matrix_multiply: 0.682925s +matrix_RtAR: 1.35404s +matrix_trace: 4.5e-05s +1.47405e+08 +test_func: 1.36134s +overall: 5.10537s +``` + + + # 改进后 ``` -这里贴改进后的运行结果。 -matrix_randomize: 0.01s +t=0: n=1120 +matrix_randomize: 0.0022972s +matrix_randomize: 0.0009744s +matrix_transpose: 0.0018461s +matrix_multiply: 0.439565s +matrix_multiply: 0.37543s +matrix_RtAR: 0.818067s +matrix_trace: 5.74e-05s +1.75932e+08 +test_func: 0.824487s +t=1: n=928 +matrix_randomize: 0.0004208s +matrix_randomize: 0.0004122s +matrix_transpose: 0.0007337s +matrix_multiply: 0.213688s +matrix_multiply: 0.197275s +matrix_RtAR: 0.413488s +matrix_trace: 3.17e-05s +1.00156e+08 +test_func: 0.41781s +t=2: n=1024 +matrix_randomize: 0.0005401s +matrix_randomize: 0.0006231s +matrix_transpose: 0.0007753s +matrix_multiply: 0.280031s +matrix_multiply: 0.27925s +matrix_RtAR: 0.561334s +matrix_trace: 5.72e-05s +1.34324e+08 +test_func: 0.565142s +t=3: n=1056 +matrix_randomize: 0.0006056s +matrix_randomize: 0.0005789s +matrix_transpose: 0.0007156s +matrix_multiply: 0.299905s +matrix_multiply: 0.31011s +matrix_RtAR: 0.612472s +matrix_trace: 0.0004518s +1.47405e+08 +test_func: 0.618392s +overall: 2.43144s ``` + + # 加速比 -matrix_randomize: 10000x -matrix_transpose: 10000x -matrix_multiply: 10000x -matrix_RtAR: 10000x +由于`Clion+MSVC`我还没发现如何开启OpenMP优化; + +切换到 `Visual studio`后则可以开启OpenMP优化,速率大概提升了四倍左右; + + + +**使用各种优化方法的效果,比较的基准为开启OpenMP的情况;** + +`randomize` 和 `transpose`各使用了更改遍历序和TBB的优化方法; + +| | OpenMP改遍历序/开启OpenMP优化 | TBB/开启OpenMP优化 | +| ---------------- | ----------------------------- | ------------------ | +| matrix_randomize | 2.3 | 2.3 | +| matrix_transpose | 1.4 | 2.1 | + + + +| matrix_multiply | 2.13 | +| --------------- | ---- | +| matrix_RtAR | 2.2 | + + > 如果记录了多种优化方法,可以做表格比较 @@ -27,20 +185,31 @@ matrix_RtAR: 10000x > matrix_randomize -请回答。 +**优化** + +- 使用YX序遍历,即x作为内存循环体,这样其在时间上是连续的。 +- 遍历时使用`tbb::parallel_for` > matrix_transpose -请回答。 +**优化** + +- 循环分块,使用`YXyx`序,只要保证`BlockSize^2`小于下缓存容量即可 +- `tbb`自带莫顿序遍历功能 > matrix_multiply -请回答。 +**优化**: + +- 使用寄存器分块 > matrix_RtAR -请回答。 +优化: + +- 使用手动池化 `static thread_local` # 我的创新点 如果有,请说明。 + diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d76276..3b0a97d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,12 +7,14 @@ set(CMAKE_BUILD_TYPE Release) #endif() add_executable(main main.cpp) +#add_executable(main t1.cpp) find_package(OpenMP REQUIRED) target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) -#find_package(TBB REQUIRED) -#target_link_libraries(main PUBLIC TBB::tbb) +find_package(TBB REQUIRED) +target_link_libraries(main PUBLIC TBB::tbb) + if (MSVC) target_compile_options(main PUBLIC /fp:fast /arch:AVX) diff --git a/alignalloc.h b/alignalloc.h index 8ac1a5b..461fa69 100644 --- a/alignalloc.h +++ b/alignalloc.h @@ -7,13 +7,18 @@ #include #include +#define _GLIBCXX_HAVE_ALIGNED_ALLOC // https://stackoverflow.com/questions/12942548/making-stdvector-allocate-aligned-memory namespace detail { void* allocate_aligned_memory(size_t align, size_t size) { - return std::aligned_alloc(align, size); +// return std::aligned_alloc(align, size); + return _aligned_malloc(size, align); } + + void deallocate_aligned_memory(void* ptr) noexcept { - std::free(ptr); +// std::free(ptr); + _aligned_free(ptr); } } diff --git a/main.cpp b/main.cpp index d5af053..c8cb087 100644 --- a/main.cpp +++ b/main.cpp @@ -9,12 +9,16 @@ #include //#include // _mm 系列指令都来自这个头文件 -//#include // 如果上面那个不行,试试这个 +#include // 如果上面那个不行,试试这个 +#include // 如果上面那个不行,试试这个 +#include +#include #include "ndarray.h" #include "wangsrng.h" #include "ticktock.h" // Matrix 是 YX 序的二维浮点数组:mat(x, y) = mat.data()[y * mat.shape(0) + x] +//using Matrix = ndarray<2, float,0,0,AlignedAllocator>; using Matrix = ndarray<2, float>; // 注意:默认对齐到 64 字节,如需 4096 字节,请用 ndarray<2, float, AlignedAllocator<4096, float>> @@ -24,13 +28,33 @@ static void matrix_randomize(Matrix &out) { size_t ny = out.shape(1); // 这个循环为什么不够高效?如何优化? 10 分 + //原因:对于YX序的数组,X是外层循环体,这样的先后的执行时间是不连续的。 + //优化:使用YX序遍历,即x作为内存循环体,这样其在时间上是连续的。 +//#pragma omp parallel for collapse(2) +// for (int x = 0; x < nx; x++) { +// for (int y = 0; y < ny; y++) { +// float val = wangsrng(x, y).next_float(); +// out(x, y) = val; +// } +// } + #pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { float val = wangsrng(x, y).next_float(); out(x, y) = val; } } + +// tbb::parallel_for(tbb::blocked_range2d(0,nx,0,ny), +// [&](tbb::blocked_range2d r){ +// for(int y = r.cols().begin(); y < r.cols().end(); y++){ +// for(int x = r.rows().begin(); x < r.rows().end(); x++){ +// float val = wangsrng(x, y).next_float(); +// out(x, y) = val; +// } +// } +// }); TOCK(matrix_randomize); } @@ -41,12 +65,34 @@ static void matrix_transpose(Matrix &out, Matrix const &in) { out.reshape(ny, nx); // 这个循环为什么不够高效?如何优化? 15 分 + //原因:在内存看来,访存是跳跃的,违背了空间局域性 + //优化:1.循环分块,使用YXyx序,只要保证BlockSize^2小于下缓存容量即可 + // 2.tbb自带莫顿序遍历功能 +//#pragma omp parallel for collapse(2) +// for (int x = 0; x < nx; x++) { +// for (int y = 0; y < ny; y++) { +// out(y, x) = in(x, y); +// } +// } + #pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - out(y, x) = in(x, y); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + out(x, y) = in(y, x); } } + + +// constexpr int blockSize = 64; +// tbb::parallel_for(tbb::blocked_range2d(0,nx,blockSize,0,ny,blockSize), +// [&](tbb::blocked_range2d const & r){ +// for(int y = r.cols().begin(); y < r.cols().end(); y++){ +// for(int x = r.rows().begin(); x < r.rows().end(); x++ ){ +// out(x,y) = in(y,x); +// } +// } +// },tbb::simple_partitioner{}); + TOCK(matrix_transpose); } @@ -62,15 +108,31 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) { out.reshape(nx, ny); // 这个循环为什么不够高效?如何优化? 15 分 + // 原因:存在不连续的lhs(t,y) 和 一直不动的 outs(x,y); + // 优化:寄存器分块 + // 没有必要手动初始化,out的内存已经被分配上了,不会处触发缺页中断; +//#pragma omp parallel for collapse(2) +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// out(x, y) = 0; // 有没有必要手动初始化? 5 分 +// for (int t = 0; t < nt; t++) { +// out(x, y) += lhs(x, t) * rhs(t, y); +// } +// } +// } + #pragma omp parallel for collapse(2) for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - out(x, y) = 0; // 有没有必要手动初始化? 5 分 + for (int iBase = 0; iBase < nx; iBase += 32) { for (int t = 0; t < nt; t++) { - out(x, y) += lhs(x, t) * rhs(t, y); + for (int x = iBase; x < iBase+32; x++) { + out(x, y) += lhs(x, t) * rhs(t, y); + } } + } } + TOCK(matrix_multiply); } @@ -78,7 +140,9 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) { static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) { TICK(matrix_RtAR); // 这两个是临时变量,有什么可以优化的? 5 分 + //优化: 手动池化,声明其为static变量,thread_local表示如果有个多个线程,每个线程保留对象的副本,防止出错; Matrix Rt, RtA; +// static thread_local Matrix Rt, RtA; matrix_transpose(Rt, R); matrix_multiply(RtA, Rt, A); matrix_multiply(RtAR, RtA, R); diff --git a/ndarray.h b/ndarray.h index 5acb728..bd7396e 100644 --- a/ndarray.h +++ b/ndarray.h @@ -46,7 +46,8 @@ class ndarray { { } - template && ...), int> = 0> + // template && ...), int > = 0> + template ...>, int > = 0> explicit ndarray(Ts const &...ts) : ndarray(Shape{ts...}) { @@ -73,7 +74,8 @@ class ndarray { m_arr.shrink_to_fit(); } - template && ...), int> = 0> + // template && ...), int> = 0> + template ...> , int> = 0> void reshape(Ts const &...ts) { this->reshape(Shape{ts...}); @@ -129,13 +131,15 @@ class ndarray { return data()[linearize(dim)]; } - template && ...), int> = 0> + // template && ...), int> = 0> + template ...> , int> = 0> constexpr T &operator()(Ts const &...ts) noexcept { return operator()(Dim{ts...}); } - template && ...), int> = 0> + // template && ...), int> = 0> + template ...> , int> = 0> constexpr T const &operator()(Ts const &...ts) const noexcept { return operator()(Dim{ts...}); @@ -161,13 +165,15 @@ class ndarray { return data()[safe_linearize(dim)]; } - template && ...), int> = 0> + // template && ...), int> = 0> + template ...> , int> = 0> T &at(Ts const &...ts) { return at(Dim{ts...}); } - template && ...), int> = 0> + // template && ...), int> = 0> + template ...> , int> = 0> T const &at(Ts const &...ts) const { return at(Dim{ts...}); From a36f57a665c299ffa5db2e49243647bd988c395e Mon Sep 17 00:00:00 2001 From: GeLee-Q <865038696@qq.com> Date: Sat, 5 Feb 2022 00:39:01 +0800 Subject: [PATCH 2/2] 1 --- ANSWER.md | 2 +- main.cpp | 65 ++++++++++++++++++++++++++++--------------------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/ANSWER.md b/ANSWER.md index 6fe881d..a632871 100644 --- a/ANSWER.md +++ b/ANSWER.md @@ -195,7 +195,7 @@ overall: 2.43144s **优化** - 循环分块,使用`YXyx`序,只要保证`BlockSize^2`小于下缓存容量即可 -- `tbb`自带莫顿序遍历功能 +- `tbb::simple_partitioner`自带莫顿序遍历功能 > matrix_multiply diff --git a/main.cpp b/main.cpp index c8cb087..6d1e0d6 100644 --- a/main.cpp +++ b/main.cpp @@ -38,23 +38,24 @@ static void matrix_randomize(Matrix &out) { // } // } -#pragma omp parallel for collapse(2) - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - float val = wangsrng(x, y).next_float(); - out(x, y) = val; - } - } +//#pragma omp parallel for collapse(2) +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// float val = wangsrng(x, y).next_float(); +// out(x, y) = val; +// } +// } + + tbb::parallel_for(tbb::blocked_range2d(0,nx,0,ny), + [&](tbb::blocked_range2d r){ + for(int y = r.cols().begin(); y < r.cols().end(); y++){ + for(int x = r.rows().begin(); x < r.rows().end(); x++){ + float val = wangsrng(x, y).next_float(); + out(x, y) = val; + } + } + }); -// tbb::parallel_for(tbb::blocked_range2d(0,nx,0,ny), -// [&](tbb::blocked_range2d r){ -// for(int y = r.cols().begin(); y < r.cols().end(); y++){ -// for(int x = r.rows().begin(); x < r.rows().end(); x++){ -// float val = wangsrng(x, y).next_float(); -// out(x, y) = val; -// } -// } -// }); TOCK(matrix_randomize); } @@ -67,7 +68,7 @@ static void matrix_transpose(Matrix &out, Matrix const &in) { // 这个循环为什么不够高效?如何优化? 15 分 //原因:在内存看来,访存是跳跃的,违背了空间局域性 //优化:1.循环分块,使用YXyx序,只要保证BlockSize^2小于下缓存容量即可 - // 2.tbb自带莫顿序遍历功能 + // 2.tbb::simple_partitioner自带莫顿序遍历功能 //#pragma omp parallel for collapse(2) // for (int x = 0; x < nx; x++) { // for (int y = 0; y < ny; y++) { @@ -75,23 +76,23 @@ static void matrix_transpose(Matrix &out, Matrix const &in) { // } // } -#pragma omp parallel for collapse(2) - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - out(x, y) = in(y, x); - } - } +//#pragma omp parallel for collapse(2) +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// out(x, y) = in(y, x); +// } +// } -// constexpr int blockSize = 64; -// tbb::parallel_for(tbb::blocked_range2d(0,nx,blockSize,0,ny,blockSize), -// [&](tbb::blocked_range2d const & r){ -// for(int y = r.cols().begin(); y < r.cols().end(); y++){ -// for(int x = r.rows().begin(); x < r.rows().end(); x++ ){ -// out(x,y) = in(y,x); -// } -// } -// },tbb::simple_partitioner{}); + constexpr int blockSize = 64; + tbb::parallel_for(tbb::blocked_range2d(0,nx,blockSize,0,ny,blockSize), + [&](tbb::blocked_range2d const & r){ + for(int y = r.cols().begin(); y < r.cols().end(); y++){ + for(int x = r.rows().begin(); x < r.rows().end(); x++ ){ + out(x,y) = in(y,x); + } + } + },tbb::simple_partitioner{}); TOCK(matrix_transpose); }