From f802c0af84814075d28f78dac0af4c3951db1d1a Mon Sep 17 00:00:00 2001
From: Mwsxy <xs_yb@foxmail.com>
Date: Sun, 6 Feb 2022 21:27:00 +0800
Subject: [PATCH] Ytterbium's solution

---
 ANSWER.md    | 127 +++++++++++++++++++++++++++++++++++++++++++++------
 alignalloc.h |   4 ++
 main.cpp     |  63 +++++++++++++++++++------
 3 files changed, 167 insertions(+), 27 deletions(-)

diff --git a/ANSWER.md b/ANSWER.md
index 83349d8..08a83ac 100644
--- a/ANSWER.md
+++ b/ANSWER.md
@@ -1,25 +1,122 @@
+# 运行环境
+## CPU
+Name: Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz
+Core Count: 2
+Thread Count: 4 
+Flags: FPU, SSE, SSE2, ...
+L1 Cache: 64KB, Write back, 8-way Set-associative
+L2 Cache: 512KB, Write back, 4-way Set-associative
+L3 Cache: 4096KB, Write back, 16-way Set-associative
+
+## Memory
+Size: 4096MB x 2
+Data Width: 64 bits
+Type: LPDDR3
+Speed: 1867MT/s
+
+理论极限带宽 = 频率 × 宽度 × 数量（其中宽度 64 bits = 8 字节）：
+$1867 \times 8 \times 2 = 29872\ \text{MB/s}$
 # 改进前
 
 ```
-这里贴改进前的运行结果。
-matrix_randomize: 100s
+t=0: n=1120
+matrix_randomize: 0.004051s
+matrix_randomize: 0.003408s
+matrix_transpose: 0.007777s
+matrix_multiply: 2.90488s
+matrix_multiply: 2.26766s
+matrix_RtAR: 5.1804s
+matrix_trace: 6.5e-05s
+1.75932e+08
+test_func: 5.19764s
+t=1: n=928
+matrix_randomize: 0.004649s
+matrix_randomize: 0.005066s
+matrix_transpose: 0.00422s
+matrix_multiply: 0.848782s
+matrix_multiply: 0.835605s
+matrix_RtAR: 1.6887s
+matrix_trace: 0.000164s
+1.00156e+08
+test_func: 1.70451s
+t=2: n=1024
+matrix_randomize: 0.012848s
+matrix_randomize: 0.012852s
+matrix_transpose: 0.012015s
+matrix_multiply: 6.02873s
+matrix_multiply: 6.04324s
+matrix_RtAR: 12.0866s
+matrix_trace: 0.000372s
+1.34324e+08
+test_func: 12.1157s
+t=3: n=1056
+matrix_randomize: 0.011363s
+matrix_randomize: 0.018065s
+matrix_transpose: 0.005886s
+matrix_multiply: 2.4236s
+matrix_multiply: 2.40505s
+matrix_RtAR: 4.84032s
+matrix_trace: 0.000177s
+1.47405e+08
+test_func: 4.87778s
+overall: 23.9097s
 ```
 
 # 改进后
 
 ```
-这里贴改进后的运行结果。
-matrix_randomize: 0.01s
+t=0: n=1120
+matrix_randomize: 0.001592s
+matrix_randomize: 0.000908s
+matrix_transpose: 0.012604s
+matrix_multiply: 0.294772s
+matrix_multiply: 0.259419s
+matrix_RtAR: 0.566971s
+matrix_trace: 6.3e-05s
+1.75932e+08
+test_func: 0.577756s
+t=1: n=928
+matrix_randomize: 0.000772s
+matrix_randomize: 0.000502s
+matrix_transpose: 0.003806s
+matrix_multiply: 0.168204s
+matrix_multiply: 0.1403s
+matrix_RtAR: 0.312448s
+matrix_trace: 4.2e-05s
+1.00156e+08
+test_func: 0.32201s
+t=2: n=1024
+matrix_randomize: 0.000424s
+matrix_randomize: 0.000384s
+matrix_transpose: 0.001753s
+matrix_multiply: 0.206256s
+matrix_multiply: 0.213244s
+matrix_RtAR: 0.421353s
+matrix_trace: 0.00011s
+1.34324e+08
+test_func: 0.423929s
+t=3: n=1056
+matrix_randomize: 0.000696s
+matrix_randomize: 0.000604s
+matrix_transpose: 0.002052s
+matrix_multiply: 0.206916s
+matrix_multiply: 0.200554s
+matrix_RtAR: 0.409645s
+matrix_trace: 5.2e-05s
+1.47405e+08
+test_func: 0.41374s
+overall: 1.74306s
 ```
 
 # 加速比
 
-matrix_randomize: 10000x
-matrix_transpose: 10000x
-matrix_multiply: 10000x
-matrix_RtAR: 10000x
+matrix_randomize: 12.3x
+matrix_transpose: 1.48x
+matrix_multiply: 14x
+matrix_RtAR: 13.9x
 
 > 如果记录了多种优化方法，可以做表格比较
+只是基于OpenMP做了优化，还没有使用其他的框架。
 
 # 优化方法
 
@@ -27,20 +124,24 @@ matrix_RtAR: 10000x
 
 > matrix_randomize
 
-请回答。
+使用正确的YX循环遍历顺序，确保顺序访问。
+使用分块循环遍历，充分利用Cache。
+适当展开定长循环，确保编译器能够充分自动SIMD。
 
 > matrix_transpose
 
-请回答。
+分块循环减少跨步访问。
 
 > matrix_multiply
 
-请回答。
+分块循环，同时可以使用本地临时变量缓存累加的结果，适当展开小循环。
+对写内存做统一集中写入，避免反复读取。
 
 > matrix_RtAR
 
-请回答。
+使用static变量减少变量的反复创建。
 
 # 我的创新点
 
-如果有，请说明。
+没有:(
+
diff --git a/alignalloc.h b/alignalloc.h
index 8ac1a5b..0e7a09f 100644
--- a/alignalloc.h
+++ b/alignalloc.h
@@ -10,7 +10,11 @@
 // https://stackoverflow.com/questions/12942548/making-stdvector-allocate-aligned-memory
 namespace detail {
     void* allocate_aligned_memory(size_t align, size_t size) {
+#if defined(__APPLE__) && defined(__MACH__)
+        return aligned_alloc(align, size);
+#else
         return std::aligned_alloc(align, size);
+#endif
     }
     void deallocate_aligned_memory(void* ptr) noexcept {
         std::free(ptr);
diff --git a/main.cpp b/main.cpp
index d5af053..7a536db 100644
--- a/main.cpp
+++ b/main.cpp
@@ -13,9 +13,13 @@
 #include "ndarray.h"
 #include "wangsrng.h"
 #include "ticktock.h"
+#include "alignalloc.h"
 
+// nru 是用来对数据分块处理的粒度
+constexpr int nru = 16;
 // Matrix 是 YX 序的二维浮点数组：mat(x, y) = mat.data()[y * mat.shape(0) + x]
-using Matrix = ndarray<2, float>;
+using Matrix = ndarray<2, float, 0, nru, AlignedAllocator<float, nru*nru*sizeof(float)>>;
+
 // 注意：默认对齐到 64 字节，如需 4096 字节，请用 ndarray<2, float, AlignedAllocator<4096, float>>
 
 static void matrix_randomize(Matrix &out) {
@@ -24,11 +28,22 @@ static void matrix_randomize(Matrix &out) {
     size_t ny = out.shape(1);
 
     // 这个循环为什么不够高效？如何优化？ 10 分
+    // 答：
+    // 1. YX 序的数组应该优先X维度的连续访问(但此场景编译器应该能处理好，OpenMP也可以帮助处理, collapse)。
+    // 2. 各个内存区域赋值实际是彼此独立没有数据依赖的，可以并行处理(openmp parallel for)
+    // 3. 由于求随机数函数的存在，编译器无法解除数据依赖关系，手动展开可以帮助编译器理解。(unroll)
+    // 4. 写入粒度过小，unroll后写入粒度达到64字节，满足一个Cache行的写入
 #pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            float val = wangsrng(x, y).next_float();
-            out(x, y) = val;
+    for (int y = 0; y < ny; y+=nru) {
+        for (int x = 0; x < nx; x+=nru) {
+#pragma omp SIMD
+            for (int yof = 0; yof < nru; yof++) {
+#pragma omp unroll
+                for (int xof = 0; xof < nru; xof++){
+                    float val = wangsrng(x+xof, y+yof).next_float();
+                    out(x+xof, y+yof) = val;
+                }
+            }
         }
     }
     TOCK(matrix_randomize);
@@ -41,10 +56,17 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
     out.reshape(ny, nx);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
+    // 答：跨步访问导致不高效，可以使用分块遍历，充分利用Cache
 #pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            out(y, x) = in(x, y);
+    for (int x = 0; x < nx; x+=nru) {
+        for (int y = 0; y < ny; y+=nru) {
+#pragma omp SIMD
+            for (int xof = 0; xof < nru; xof++) {
+#pragma omp unroll
+                for (int yof = 0; yof < nru; yof++) {
+                    out(y+yof, x+xof) = in(x+xof, y+yof);
+                }
+            }
         }
     }
     TOCK(matrix_transpose);
@@ -62,13 +84,25 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
     out.reshape(nx, ny);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
+    // 答：分块循环，用本地临时数组缓存累加结果并适当展开小循环，最后统一集中写回，避免反复读写 out(x, y)。
 #pragma omp parallel for collapse(2)
-    for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            out(x, y) = 0;  // 有没有必要手动初始化？ 5 分
-            for (int t = 0; t < nt; t++) {
-                out(x, y) += lhs(x, t) * rhs(t, y);
+    for (int y = 0; y < ny; y+=nru) {
+        for (int x = 0; x < nx; x+=nru) {
+            // out(x, y) = 0;  // 有没有必要手动初始化？ 5 分  答：不需要，可以使用临时数组累加再赋值规避
+            float sum[nru][nru] = {0};
+#pragma omp SIMD
+            for (int t = 0; t < nt; t += nru) {
+#pragma omp unroll partial(nru*nru*nru/4)
+                for (int yf = 0; yf < nru; yf++)
+                    for (int xf = 0; xf < nru; xf++) 
+                        for (int tf = 0; tf < nru; tf++)
+                            // out(x, y) += lhs(x, t) * rhs(t, y);
+                            sum[yf][xf] += lhs(x + xf, t + tf) * rhs(t + tf, y + yf);
             }
+#pragma omp unroll
+            for (int yf = 0; yf < nru; yf++)
+                for (int xf = 0; xf < nru; xf++)
+                    out(x + xf, y + yf) = sum[yf][xf];
         }
     }
     TOCK(matrix_multiply);
@@ -78,7 +112,8 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
 static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
     TICK(matrix_RtAR);
     // 这两个是临时变量，有什么可以优化的？ 5 分
-    Matrix Rt, RtA;
+    // 答：可以声明为static的，避免重复创建
+    static Matrix Rt, RtA;
     matrix_transpose(Rt, R);
     matrix_multiply(RtA, Rt, A);
     matrix_multiply(RtAR, RtA, R);