hw07/main.cpp at main · Paul-Laifan/hw07 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
// 这是第07课的回家作业，主题是访存优化
// 录播见：https://www.bilibili.com/video/BV1gu41117bW
// 作业的回答推荐写在 ANSWER.md 方便老师批阅，也可在 PR 描述中
// 请晒出程序被你优化前后的运行结果（ticktock 的用时统计）
// 可以比较采用了不同的优化手段后，加速了多少倍，做成表格
// 如能同时贴出 CPU 核心数量，缓存大小等就最好了（lscpu 命令）
// 作业中有很多个问句，请通过注释回答问题，并改进其代码，以使其更快
// 并行可以用 OpenMP 也可以用 TBB

#include <iostream>
#include <algorithm>
//#include <x86intrin.h>  // _mm 系列指令都来自这个头文件
//#include <xmmintrin.h>  // 如果上面那个不行，试试这个
#include "ndarray.h"
#include "wangsrng.h"
#include "ticktock.h"

// Matrix 是 YX 序的二维浮点数组：mat(x, y) = mat.data()[y * mat.shape(0) + x]
using Matrix = ndarray<2, float>;
// 注意：默认对齐到 64 字节，如需 4096 字节，请用 ndarray<2, float, AlignedAllocator<4096, float>>

static void matrix_randomize(Matrix &out) {
    TICK(matrix_randomize);
    size_t nx = out.shape(0);
    size_t ny = out.shape(1);

    // 【优化】原代码外层 x、内层 y，导致 out(x,y)=data[y*nx+x] 在 y 变化时
    // 步长为 nx 的跨步访问（Cache Miss）。
    // 修复：交换循环顺序，y 在外层，x 在内层。
    // x 在内层变化时，data[y*nx+x] 是连续内存，充分利用 Cache Line。
#pragma omp parallel for collapse(2)
    for (int y = 0; y < (int)ny; y++) {
        for (int x = 0; x < (int)nx; x++) {
            out(x, y) = wangsrng(x, y).next_float();
        }
    }
    TOCK(matrix_randomize);
}

static void matrix_transpose(Matrix &out, Matrix const &in) {
    TICK(matrix_transpose);
    size_t nx = in.shape(0);
    size_t ny = in.shape(1);
    out.reshape(ny, nx);

    // 【优化】矩阵转置无论如何读或写都有一个方向是跨步的，
    // 解决方案是使用"分块/Tiling"：将矩阵划分为小块（TILE×TILE），
    // 每个小块都能完整装入 L1/L2 Cache，从而减少 Cache Miss。
    // 原代码内层 y 变化时，读 in(x,y)=data[y*nx+x] 步长为 nx，非常慢。
    const int TILE = 32;
#pragma omp parallel for collapse(2)
    for (int y = 0; y < (int)ny; y += TILE) {
        for (int x = 0; x < (int)nx; x += TILE) {
            // 处理当前块，防止越界
            int yend = std::min((int)ny, y + TILE);
            int xend = std::min((int)nx, x + TILE);
            for (int yi = y; yi < yend; yi++) {
                for (int xi = x; xi < xend; xi++) {
                    out(yi, xi) = in(xi, yi);
                }
            }
        }
    }
    TOCK(matrix_transpose);
}

static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
    TICK(matrix_multiply);
    size_t nx = lhs.shape(0);
    size_t nt = lhs.shape(1);
    size_t ny = rhs.shape(1);
    if (rhs.shape(0) != nt) {
        std::cerr << "matrix_multiply: shape mismatch" << std::endl;
        throw;
    }
    out.reshape(nx, ny);

    // 【优化】原代码循环顺序 (y, x, t)，内层 t 变化时：
    //   lhs(x,t) = data[t*nx+x]，步长为 nx → 严重 Cache Miss（这是性能瓶颈！）
    //   rhs(t,y) = data[y*nt+t]，t 连续 → 访问良好
    //
    // 修复：改为循环顺序 (y, t, x)：
    //   - rhs(t,y) 提到 t 循环体内，变成一个标量，只读一次
    //   - x 在最内层：lhs(x,t)=data[t*nx+x] 和 out(x,y)=data[y*nx+x]
    //     都是 x 连续变化 → 连续内存访问，充分利用 Cache Line 和 SIMD
    //
    // 【初始化问题】reshape() 内部 clear()+resize() 会对新内存零初始化，
    // 所以不需要手动写 out(x,y)=0，直接用 += 累加即可。
    // 但注意：reshape 的 resize 是否真的零初始化？
    //   std::vector::resize(n) 对 POD 类型会值初始化（置零），所以是安全的。
    //   因此 out(x,y)=0 确实是冗余的，去掉可以节省一次写操作。
#pragma omp parallel for
    for (int y = 0; y < (int)ny; y++) {
        for (int t = 0; t < (int)nt; t++) {
            float rhs_ty = rhs(t, y);  // 标量提取，避免重复访问
            for (int x = 0; x < (int)nx; x++) {
                out(x, y) += lhs(x, t) * rhs_ty;
            }
        }
    }
    TOCK(matrix_multiply);
}

// 求出 R^T A R
static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
    TICK(matrix_RtAR);
    // 【优化】原代码 Rt、RtA 是局部变量，每次调用都要：
    //   构造 → 在 matrix_transpose/multiply 内 reshape（malloc）→ 析构（free）
    // 改成 static 局部变量后，内存只分配一次。
    // 后续 reshape 调用 clear()+resize() 时：
    //   clear() 只将 size 置 0，不释放内存（capacity 保留）
    //   resize() 若新 size <= capacity，不重新 malloc，只是扩充 size
    // 从而避免了反复的堆内存申请/释放。
    static Matrix Rt, RtA;
    matrix_transpose(Rt, R);
    matrix_multiply(RtA, Rt, A);
    matrix_multiply(RtAR, RtA, R);
    TOCK(matrix_RtAR);
}

static float matrix_trace(Matrix const &in) {
    TICK(matrix_trace);
    float res = 0;
    size_t nt = std::min(in.shape(0), in.shape(1));
#pragma omp parallel for reduction(+:res)
    for (int t = 0; t < (int)nt; t++) {
        res += in(t, t);
    }
    TOCK(matrix_trace);
    return res;
}

static void test_func(size_t n) {
    TICK(test_func);
    Matrix R(n, n);
    matrix_randomize(R);
    Matrix A(n, n);
    matrix_randomize(A);

    Matrix RtAR;
    matrix_RtAR(RtAR, R, A);

    std::cout << matrix_trace(RtAR) << std::endl;
    TOCK(test_func);
}

int main() {
    wangsrng rng;
    TICK(overall);
    for (int t = 0; t < 4; t++) {
        size_t n = 32 * (rng.next_uint64() % 16 + 24);
        std::cout << "t=" << t << ": n=" << n << std::endl;
        test_func(n);
    }
    TOCK(overall);
    return 0;
}