From 6bbd44dd4224a995bb34f4f17723df6ca66cea64 Mon Sep 17 00:00:00 2001
From: HarryGoGoGo
Date: Fri, 21 Mar 2025 00:02:27 +0800
Subject: [PATCH] 1

---
 .../src/deepx/tensorfunc/matmul.cu | 72 ++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/matmul.cu

diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul.cu
new file mode 100644
index 00000000..b3fa4404
--- /dev/null
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul.cu
@@ -0,0 +1,72 @@
+#include "deepx/tensorfunc/cuda.hpp"
+
+#include <cuda_runtime.h>
+#include "deepx/tensor.hpp"
+
+#include "deepx/tensorfunc/matmul.hpp"
+#include "deepx/tensorfunc/authors.hpp"
+
+namespace deepx::tensorfunc {
+
+    #define BLOCK_SIZE 32
+
+    __global__ void fp64MatmulKernel(double *C, const double *A, const double *B,
+                                     int M, int N, int K) {
+        // Shared-memory tiles used to cache blocks of A and B
+        __shared__ double tileA[BLOCK_SIZE][BLOCK_SIZE];
+        __shared__ double tileB[BLOCK_SIZE][BLOCK_SIZE];
+
+        // Global matrix position handled by this thread
+        int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+        int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+        double sum = 0.0;
+
+        // Walk over the whole K dimension tile by tile
+        for (int t = 0; t < (K + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) {
+            // Starting offset of the current tile along K
+            int tiledK = t * BLOCK_SIZE;
+
+            // Load a tile of A into shared memory (row-major)
+            int loadA_col = tiledK + threadIdx.x;
+            if (row < M && loadA_col < K) {
+                tileA[threadIdx.y][threadIdx.x] = A[row * K + loadA_col];
+            } else {
+                tileA[threadIdx.y][threadIdx.x] = 0.0; // zero-pad at the boundary
+            }
+
+            // Load a tile of B into shared memory (column-wise access of B)
+            int loadB_row = tiledK + threadIdx.y;
+            if (col < N && loadB_row < K) {
+                tileB[threadIdx.y][threadIdx.x] = B[loadB_row * N + col];
+            } else {
+                tileB[threadIdx.y][threadIdx.x] = 0.0; // zero-pad at the boundary
+            }
+
+            __syncthreads(); // make sure both tiles are fully loaded
+
+            // Accumulate this tile's contribution to the dot product
+            for (int k = 0; k < BLOCK_SIZE; ++k) {
+                sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
+            }
+
+            __syncthreads(); // finish the computation before loading the next tile
+        }
+
+        // Write the result to global memory only within the valid range
+        if (row < M && col < N) {
+            C[row * N + col] = sum;
+        }
+    }
+
+    // Host-side wrapper that configures the launch and calls the kernel
+    void fp64Matmul(double *d_C, const double *d_A, const double *d_B,
+                    int M, int N, int K) {
+        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
+        dim3 dimGrid((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
+                     (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+        fp64MatmulKernel<<<dimGrid, dimBlock>>>(d_C, d_A, d_B, M, N, K);
+    }
+
+} // namespace deepx::tensorfunc
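
Reviewer note (not part of the patch): below is a minimal host-side sketch of how the new fp64Matmul wrapper could be exercised with raw device buffers. It assumes fp64Matmul is declared in deepx/tensorfunc/matmul.hpp, as the include list in the .cu file suggests; the matrix sizes, fill values, and main() driver are made up for illustration, and CUDA error checking is omitted for brevity.

// Illustrative usage sketch, not part of the patch and not the project's own test code.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include "deepx/tensorfunc/matmul.hpp"   // assumed to declare deepx::tensorfunc::fp64Matmul

int main() {
    // Arbitrary example sizes: row-major C[M x N] = A[M x K] * B[K x N]
    const int M = 64, K = 128, N = 32;
    std::vector<double> h_A(M * K, 1.0), h_B(K * N, 2.0), h_C(M * N, 0.0);

    double *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    cudaMalloc((void **)&d_A, h_A.size() * sizeof(double));
    cudaMalloc((void **)&d_B, h_B.size() * sizeof(double));
    cudaMalloc((void **)&d_C, h_C.size() * sizeof(double));
    cudaMemcpy(d_A, h_A.data(), h_A.size() * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B.data(), h_B.size() * sizeof(double), cudaMemcpyHostToDevice);

    // The wrapper launches one 32x32 thread block per output tile
    deepx::tensorfunc::fp64Matmul(d_C, d_A, d_B, M, N, K);
    cudaDeviceSynchronize();   // the kernel launch inside fp64Matmul is asynchronous

    cudaMemcpy(h_C.data(), d_C, h_C.size() * sizeof(double), cudaMemcpyDeviceToHost);
    // With these fill values every element of C should be 1.0 * 2.0 * K = 256.0
    printf("C[0] = %f\n", h_C[0]);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}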