From 1b9fc2552318bb437365c345742a19a0675de4c8 Mon Sep 17 00:00:00 2001
From: GeLee-Q <865038696@qq.com>
Date: Sat, 5 Feb 2022 00:35:23 +0800
Subject: [PATCH 1/2] 1

---
 ANSWER.md      | 193 ++++++++++++++++++++++++++++++++++++++++++++++---
 CMakeLists.txt |   6 +-
 alignalloc.h   |   9 ++-
 main.cpp       |  82 ++++++++++++++++++---
 ndarray.h      |  18 +++--
 5 files changed, 277 insertions(+), 31 deletions(-)

diff --git a/ANSWER.md b/ANSWER.md
index 83349d8..6fe881d 100644
--- a/ANSWER.md
+++ b/ANSWER.md
@@ -1,23 +1,181 @@
+
+| CPU 核心 | 6         |
+| :------- | --------- |
+| L1  数据 | 6  x 32 K |
+| L1 指令  | 6  x 32 K |
+| L2       | 6 x 256 K |
+| L3       | 12 M      |
+
+
+
 # 改进前
 
+### 初始结果：（未开启OpenMP优化）
+
 ```
-这里贴改进前的运行结果。
-matrix_randomize: 100s
+t=0: n=1120
+matrix_randomize: 0.0047392s
+matrix_randomize: 0.0039861s
+matrix_transpose: 0.0025276s
+matrix_multiply: 3.3746s
+matrix_multiply: 3.31677s
+matrix_RtAR: 6.69637s
+matrix_trace: 3.15e-05s
+1.75932e+08
+test_func: 6.70862s
+t=1: n=928
+matrix_randomize: 0.0020972s
+matrix_randomize: 0.0021105s
+matrix_transpose: 0.0014424s
+matrix_multiply: 1.77163s
+matrix_multiply: 1.77209s
+matrix_RtAR: 3.54777s
+matrix_trace: 2.92e-05s
+1.00156e+08
+test_func: 3.55457s
+t=2: n=1024
+matrix_randomize: 0.0030934s
+matrix_randomize: 0.0029501s
+matrix_transpose: 0.0026326s
+matrix_multiply: 3.043s
+matrix_multiply: 3.06595s
+matrix_RtAR: 6.11384s
+matrix_trace: 2.95e-05s
+1.34324e+08
+test_func: 6.12345s
+t=3: n=1056
+matrix_randomize: 0.0027326s
+matrix_randomize: 0.0026625s
+matrix_transpose: 0.0019026s
+matrix_multiply: 2.6474s
+matrix_multiply: 2.62987s
+matrix_RtAR: 5.28161s
+matrix_trace: 3.22e-05s
+1.47405e+08
+test_func: 5.29054s
+overall: 21.6809s
 ```
 
+### 开启OpenMP优化
+
+```
+t=0: n=1120
+matrix_randomize: 0.0068874s
+matrix_randomize: 0.002048s
+matrix_transpose: 0.0022631s
+matrix_multiply: 0.807792s
+matrix_multiply: 0.754302s
+matrix_RtAR: 1.5655s
+matrix_trace: 4.23e-05s
+1.75932e+08
+test_func: 1.57837s
+t=1: n=928
+matrix_randomize: 0.0007808s
+matrix_randomize: 0.0006188s
+matrix_transpose: 0.0010147s
+matrix_multiply: 0.363695s
+matrix_multiply: 0.395745s
+matrix_RtAR: 0.761718s
+matrix_trace: 5.13e-05s
+1.00156e+08
+test_func: 0.766641s
+t=2: n=1024
+matrix_randomize: 0.0007518s
+matrix_randomize: 0.0010314s
+matrix_transpose: 0.0024472s
+matrix_multiply: 0.698181s
+matrix_multiply: 0.687765s
+matrix_RtAR: 1.38938s
+matrix_trace: 9.16e-05s
+1.34324e+08
+test_func: 1.39417s
+t=3: n=1056
+matrix_randomize: 0.0010823s
+matrix_randomize: 0.0010954s
+matrix_transpose: 0.0020503s
+matrix_multiply: 0.667081s
+matrix_multiply: 0.682925s
+matrix_RtAR: 1.35404s
+matrix_trace: 4.5e-05s
+1.47405e+08
+test_func: 1.36134s
+overall: 5.10537s
+```
+
+
+
 # 改进后
 
 ```
-这里贴改进后的运行结果。
-matrix_randomize: 0.01s
+t=0: n=1120
+matrix_randomize: 0.0022972s
+matrix_randomize: 0.0009744s
+matrix_transpose: 0.0018461s
+matrix_multiply: 0.439565s
+matrix_multiply: 0.37543s
+matrix_RtAR: 0.818067s
+matrix_trace: 5.74e-05s
+1.75932e+08
+test_func: 0.824487s
+t=1: n=928
+matrix_randomize: 0.0004208s
+matrix_randomize: 0.0004122s
+matrix_transpose: 0.0007337s
+matrix_multiply: 0.213688s
+matrix_multiply: 0.197275s
+matrix_RtAR: 0.413488s
+matrix_trace: 3.17e-05s
+1.00156e+08
+test_func: 0.41781s
+t=2: n=1024
+matrix_randomize: 0.0005401s
+matrix_randomize: 0.0006231s
+matrix_transpose: 0.0007753s
+matrix_multiply: 0.280031s
+matrix_multiply: 0.27925s
+matrix_RtAR: 0.561334s
+matrix_trace: 5.72e-05s
+1.34324e+08
+test_func: 0.565142s
+t=3: n=1056
+matrix_randomize: 0.0006056s
+matrix_randomize: 0.0005789s
+matrix_transpose: 0.0007156s
+matrix_multiply: 0.299905s
+matrix_multiply: 0.31011s
+matrix_RtAR: 0.612472s
+matrix_trace: 0.0004518s
+1.47405e+08
+test_func: 0.618392s
+overall: 2.43144s
 ```
 
+
+
 # 加速比
 
-matrix_randomize: 10000x
-matrix_transpose: 10000x
-matrix_multiply: 10000x
-matrix_RtAR: 10000x
+我还没发现如何在 `CLion+MSVC` 下开启OpenMP优化；
+
+切换到 `Visual Studio` 后则可以开启OpenMP优化，速度大约提升到原来的四倍；
+
+
+
+**使用各种优化方法的效果，比较的基准为开启OpenMP的情况；**
+
+`randomize` 和 `transpose`各使用了更改遍历序和TBB的优化方法；
+
+|                  | OpenMP改遍历序/开启OpenMP优化 | TBB/开启OpenMP优化 |
+| ---------------- | ----------------------------- | ------------------ |
+| matrix_randomize | 2.3                           | 2.3                |
+| matrix_transpose | 1.4                           | 2.1                |
+
+
+
+| matrix_multiply | 2.13 |
+| --------------- | ---- |
+| matrix_RtAR     | 2.2  |
+
+
 
 > 如果记录了多种优化方法，可以做表格比较
 
@@ -27,20 +185,31 @@ matrix_RtAR: 10000x
 
 > matrix_randomize
 
-请回答。
+**优化** 
+
+- 使用YX序遍历，即x作为内层循环体，这样相邻两次访问的内存地址是连续的。
+- 遍历时使用`tbb::parallel_for`
 
 > matrix_transpose
 
-请回答。
+**优化**
+
+- 循环分块，使用`YXyx`序，只要保证`BlockSize^2`小于下缓存容量即可
+- `tbb`自带莫顿序遍历功能
 
 > matrix_multiply
 
-请回答。
+**优化**：
+
+- 使用寄存器分块
 
 > matrix_RtAR
 
-请回答。
+**优化**：
+
+- 使用手动池化 `static thread_local`
 
 # 我的创新点
 
 如果有，请说明。
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d76276..3b0a97d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,12 +7,14 @@ set(CMAKE_BUILD_TYPE Release)
 #endif()
 
 add_executable(main main.cpp)
+#add_executable(main t1.cpp)
 
 find_package(OpenMP REQUIRED)
 target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
 
-#find_package(TBB REQUIRED)
-#target_link_libraries(main PUBLIC TBB::tbb)
+find_package(TBB REQUIRED)
+target_link_libraries(main PUBLIC TBB::tbb)
+
 
 if (MSVC)
     target_compile_options(main PUBLIC /fp:fast /arch:AVX)
diff --git a/alignalloc.h b/alignalloc.h
index 8ac1a5b..461fa69 100644
--- a/alignalloc.h
+++ b/alignalloc.h
@@ -7,13 +7,18 @@
 #include <type_traits>
 #include <stdexcept>
 
+#define _GLIBCXX_HAVE_ALIGNED_ALLOC
 // https://stackoverflow.com/questions/12942548/making-stdvector-allocate-aligned-memory
 namespace detail {
     void* allocate_aligned_memory(size_t align, size_t size) {
-        return std::aligned_alloc(align, size);
+//        return std::aligned_alloc(align, size);
+        return _aligned_malloc(size, align);
     }
+
+
     void deallocate_aligned_memory(void* ptr) noexcept {
-        std::free(ptr);
+//        std::free(ptr);
+        _aligned_free(ptr);
     }
 }
 
diff --git a/main.cpp b/main.cpp
index d5af053..c8cb087 100644
--- a/main.cpp
+++ b/main.cpp
@@ -9,12 +9,16 @@
 
 #include <iostream>
 //#include <x86intrin.h>  // _mm 系列指令都来自这个头文件
-//#include <xmmintrin.h>  // 如果上面那个不行，试试这个
+#include <xmmintrin.h>  // 如果上面那个不行，试试这个
+#include <xmmintrin.h>  // 如果上面那个不行，试试这个
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range2d.h>
 #include "ndarray.h"
 #include "wangsrng.h"
 #include "ticktock.h"
 
 // Matrix 是 YX 序的二维浮点数组：mat(x, y) = mat.data()[y * mat.shape(0) + x]
+//using Matrix = ndarray<2, float,0,0,AlignedAllocator<float,4096>>;
 using Matrix = ndarray<2, float>;
 // 注意：默认对齐到 64 字节，如需 4096 字节，请用 ndarray<2, float, AlignedAllocator<4096, float>>
 
@@ -24,13 +28,33 @@ static void matrix_randomize(Matrix &out) {
     size_t ny = out.shape(1);
 
     // 这个循环为什么不够高效？如何优化？ 10 分
+    //原因：对于YX序的数组，X是外层循环体，内层循环沿Y方向跨行访问，相邻两次访问的内存地址不连续。
+    //优化：使用YX序遍历，即x作为内层循环体，这样相邻两次访问的内存地址是连续的。
+//#pragma omp parallel for collapse(2)
+//    for (int x = 0; x < nx; x++) {
+//        for (int y = 0; y < ny; y++) {
+//            float val = wangsrng(x, y).next_float();
+//            out(x, y) = val;
+//        }
+//    }
+
 #pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
+    for (int y = 0; y < ny; y++) {
+        for (int x = 0; x < nx; x++) {
             float val = wangsrng(x, y).next_float();
             out(x, y) = val;
         }
     }
+
+//    tbb::parallel_for(tbb::blocked_range2d<size_t>(0,nx,0,ny),
+//            [&](tbb::blocked_range2d<size_t> r){
+//                for(int y = r.cols().begin(); y < r.cols().end(); y++){
+//                    for(int x = r.rows().begin(); x < r.rows().end(); x++){
+//                        float val = wangsrng(x, y).next_float();
+//                            out(x, y) = val;
+//                    }
+//                }
+//    });
     TOCK(matrix_randomize);
 }
 
@@ -41,12 +65,34 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
     out.reshape(ny, nx);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
+    //原因：在内存看来，访存是跳跃的，违背了空间局域性
+    //优化：1.循环分块，使用YXyx序，只要保证BlockSize^2小于下缓存容量即可
+    //      2.tbb自带莫顿序遍历功能
+//#pragma omp parallel for collapse(2)
+//    for (int x = 0; x < nx; x++) {
+//        for (int y = 0; y < ny; y++) {
+//            out(y, x) = in(x, y);
+//        }
+//    }
+
 #pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            out(y, x) = in(x, y);
+    for (int y = 0; y < ny; y++) {
+        for (int x = 0; x < nx; x++) {
+            out(x, y) = in(y, x);
         }
     }
+
+
+//    constexpr int blockSize = 64;
+//    tbb::parallel_for(tbb::blocked_range2d<size_t>(0,nx,blockSize,0,ny,blockSize),
+//            [&](tbb::blocked_range2d<size_t> const & r){
+//        for(int y = r.cols().begin(); y < r.cols().end(); y++){
+//            for(int x = r.rows().begin(); x < r.rows().end(); x++ ){
+//                out(x,y) = in(y,x);
+//            }
+//        }
+//    },tbb::simple_partitioner{});
+
     TOCK(matrix_transpose);
 }
 
@@ -62,15 +108,31 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
     out.reshape(nx, ny);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
+    // 原因：存在跳跃访问的 lhs(x, t) 和一直不动的 out(x, y)；
+    // 优化：寄存器分块
+    // 没有必要手动初始化，out的内存已经被分配上了，不会触发缺页中断；
+//#pragma omp parallel for collapse(2)
+//    for (int y = 0; y < ny; y++) {
+//        for (int x = 0; x < nx; x++) {
+//            out(x, y) = 0;  // 有没有必要手动初始化？ 5 分
+//            for (int t = 0; t < nt; t++) {
+//                out(x, y) += lhs(x, t) * rhs(t, y);
+//            }
+//        }
+//    }
+
 #pragma omp parallel for collapse(2)
     for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            out(x, y) = 0;  // 有没有必要手动初始化？ 5 分
+        for (int iBase = 0; iBase < nx; iBase += 32) {
             for (int t = 0; t < nt; t++) {
-                out(x, y) += lhs(x, t) * rhs(t, y);
+                for (int x = iBase; x < iBase+32; x++) {
+                    out(x, y) += lhs(x, t) * rhs(t, y);
+                }
             }
+
         }
     }
+
     TOCK(matrix_multiply);
 }
 
@@ -78,7 +140,9 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
 static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
     TICK(matrix_RtAR);
     // 这两个是临时变量，有什么可以优化的？ 5 分
+    //优化： 手动池化，声明其为static变量，thread_local表示如果有多个线程，每个线程保留对象的副本，防止出错；
     Matrix Rt, RtA;
+//    static thread_local Matrix Rt, RtA;
     matrix_transpose(Rt, R);
     matrix_multiply(RtA, Rt, A);
     matrix_multiply(RtAR, RtA, R);
diff --git a/ndarray.h b/ndarray.h
index 5acb728..bd7396e 100644
--- a/ndarray.h
+++ b/ndarray.h
@@ -46,7 +46,8 @@ class ndarray {
     {
     }
 
-    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    // template <typename...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int > = 0>
+    template <typename...Ts, std::enable_if_t<sizeof...(Ts) == N && std::conjunction_v<std::is_integral<Ts>...>, int > = 0>
     explicit ndarray(Ts const &...ts)
         : ndarray(Shape{ts...})
     {
@@ -73,7 +74,8 @@ class ndarray {
         m_arr.shrink_to_fit();
     }
 
-    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    // template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && std::conjunction_v<std::is_integral<Ts>...> , int> = 0>
     void reshape(Ts const &...ts)
     {
         this->reshape(Shape{ts...});
@@ -129,13 +131,15 @@ class ndarray {
         return data()[linearize(dim)];
     }
 
-    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    // template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && std::conjunction_v<std::is_integral<Ts>...> , int> = 0>
     constexpr T &operator()(Ts const &...ts) noexcept
     {
         return operator()(Dim{ts...});
     }
 
-    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    // template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && std::conjunction_v<std::is_integral<Ts>...> , int> = 0>
     constexpr T const &operator()(Ts const &...ts) const noexcept
     {
         return operator()(Dim{ts...});
@@ -161,13 +165,15 @@ class ndarray {
         return data()[safe_linearize(dim)];
     }
 
-    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    // template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && std::conjunction_v<std::is_integral<Ts>...> , int> = 0>
     T &at(Ts const &...ts)
     {
         return at(Dim{ts...});
     }
 
-    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    // template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && (std::is_integral_v<Ts> && ...), int> = 0>
+    template <class ...Ts, std::enable_if_t<sizeof...(Ts) == N && std::conjunction_v<std::is_integral<Ts>...> , int> = 0>
     T const &at(Ts const &...ts) const
     {
         return at(Dim{ts...});

From a36f57a665c299ffa5db2e49243647bd988c395e Mon Sep 17 00:00:00 2001
From: GeLee-Q <865038696@qq.com>
Date: Sat, 5 Feb 2022 00:39:01 +0800
Subject: [PATCH 2/2] 1

---
 ANSWER.md |  2 +-
 main.cpp  | 65 ++++++++++++++++++++++++++++---------------------------
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/ANSWER.md b/ANSWER.md
index 6fe881d..a632871 100644
--- a/ANSWER.md
+++ b/ANSWER.md
@@ -195,7 +195,7 @@ overall: 2.43144s
 **优化**
 
 - 循环分块，使用`YXyx`序，只要保证`BlockSize^2`小于下缓存容量即可
-- `tbb`自带莫顿序遍历功能
+- `tbb::simple_partitioner`自带莫顿序遍历功能
 
 > matrix_multiply
 
diff --git a/main.cpp b/main.cpp
index c8cb087..6d1e0d6 100644
--- a/main.cpp
+++ b/main.cpp
@@ -38,23 +38,24 @@ static void matrix_randomize(Matrix &out) {
 //        }
 //    }
 
-#pragma omp parallel for collapse(2)
-    for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            float val = wangsrng(x, y).next_float();
-            out(x, y) = val;
-        }
-    }
+//#pragma omp parallel for collapse(2)
+//    for (int y = 0; y < ny; y++) {
+//        for (int x = 0; x < nx; x++) {
+//            float val = wangsrng(x, y).next_float();
+//            out(x, y) = val;
+//        }
+//    }
+
+    tbb::parallel_for(tbb::blocked_range2d<size_t>(0,nx,0,ny),
+            [&](tbb::blocked_range2d<size_t> r){
+                for(int y = r.cols().begin(); y < r.cols().end(); y++){
+                    for(int x = r.rows().begin(); x < r.rows().end(); x++){
+                        float val = wangsrng(x, y).next_float();
+                            out(x, y) = val;
+                    }
+                }
+    });
 
-//    tbb::parallel_for(tbb::blocked_range2d<size_t>(0,nx,0,ny),
-//            [&](tbb::blocked_range2d<size_t> r){
-//                for(int y = r.cols().begin(); y < r.cols().end(); y++){
-//                    for(int x = r.rows().begin(); x < r.rows().end(); x++){
-//                        float val = wangsrng(x, y).next_float();
-//                            out(x, y) = val;
-//                    }
-//                }
-//    });
     TOCK(matrix_randomize);
 }
 
@@ -67,7 +68,7 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
     // 这个循环为什么不够高效？如何优化？ 15 分
     //原因：在内存看来，访存是跳跃的，违背了空间局域性
     //优化：1.循环分块，使用YXyx序，只要保证BlockSize^2小于下缓存容量即可
-    //      2.tbb自带莫顿序遍历功能
+    //      2.tbb::simple_partitioner自带莫顿序遍历功能
 //#pragma omp parallel for collapse(2)
 //    for (int x = 0; x < nx; x++) {
 //        for (int y = 0; y < ny; y++) {
@@ -75,23 +76,23 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
 //        }
 //    }
 
-#pragma omp parallel for collapse(2)
-    for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            out(x, y) = in(y, x);
-        }
-    }
+//#pragma omp parallel for collapse(2)
+//    for (int y = 0; y < ny; y++) {
+//        for (int x = 0; x < nx; x++) {
+//            out(x, y) = in(y, x);
+//        }
+//    }
 
 
-//    constexpr int blockSize = 64;
-//    tbb::parallel_for(tbb::blocked_range2d<size_t>(0,nx,blockSize,0,ny,blockSize),
-//            [&](tbb::blocked_range2d<size_t> const & r){
-//        for(int y = r.cols().begin(); y < r.cols().end(); y++){
-//            for(int x = r.rows().begin(); x < r.rows().end(); x++ ){
-//                out(x,y) = in(y,x);
-//            }
-//        }
-//    },tbb::simple_partitioner{});
+    constexpr int blockSize = 64;
+    tbb::parallel_for(tbb::blocked_range2d<size_t>(0,nx,blockSize,0,ny,blockSize),
+            [&](tbb::blocked_range2d<size_t> const & r){
+        for(int y = r.cols().begin(); y < r.cols().end(); y++){
+            for(int x = r.rows().begin(); x < r.rows().end(); x++ ){
+                out(x,y) = in(y,x);
+            }
+        }
+    },tbb::simple_partitioner{});
 
     TOCK(matrix_transpose);
 }