From ea84ac1b5853ad76ef2e30bb31340c4f3fdceaca Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Thu, 3 Apr 2025 15:58:06 +0800 Subject: [PATCH 1/7] =?UTF-8?q?fp16&bf16:cuda=E7=89=88=E6=9C=AC=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=E3=80=8212.1->12.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tensorfunc/elementwise_miaobyte_sin.cu | 24 ++++- .../tensorfunc/elementwise_miaobyte_sin_a.cu | 67 ------------- .../tensorfunc/elementwise_miaobyte_sqrt.cu | 37 +++++++- .../tensorfunc/elementwise_miaobyte_sqrt_a.cu | 95 ------------------- 4 files changed, 51 insertions(+), 172 deletions(-) delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu index 23b78dbf..00bd232c 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -2,7 +2,7 @@ #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU #include - +#include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -35,7 +35,15 @@ namespace deepx::tensorfunc C[idx] = hsin(A[idx]); } } - + template <> + __global__ void sin_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hsin(A[idx]); + } + } + template void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size){ sin_kernel<<>>(a, c, size); @@ -49,7 +57,7 @@ namespace deepx::tensorfunc template void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); template void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); template void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); // cos template __global__ void cos_kernel(const T* A, T* C, const int size); @@ -75,7 +83,13 @@ namespace deepx::tensorfunc C[idx] = hcos(A[idx]); } } - + template <> + __global__ void cos_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hcos(A[idx]); + } + } template void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size){ cos_kernel<<>>(a, c, size); @@ -88,7 +102,7 @@ namespace deepx::tensorfunc template void launch_cos(int numBlocks, int blockSize, const double* a, double* c, const int size); template void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); template void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); // tan template __global__ void tan_kernel(const T* A, T* C, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu deleted file mode 100644 index 0660c3fa..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef 
DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU -#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU - -#include - -#include "deepx/tensorfunc/cuda.hpp" -#include "deepx/tensorfunc/authors.hpp" - -namespace deepx::tensorfunc -{ - // sin - template - __global__ void sin_kernel(const T *A, T *C, const int size); - - template <> - __global__ void sin_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hsin(A[idx]); - } - } - - template - void launch_sin(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - sin_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sin kernel: " + - std::string(cudaGetErrorString(err))); - } - } - - template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - - // cos - template - __global__ void cos_kernel(const T *A, T *C, const int size); - - template <> - __global__ void cos_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hcos(A[idx]); - } - } - template - void launch_cos(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - cos_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch cos kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - -} - -#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index fe5c92ca..95307389 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -2,6 +2,7 @@ #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU #include +#include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" #include @@ -39,7 +40,15 @@ namespace deepx::tensorfunc C[idx] = hsqrt(A[idx]); } } - + template <> + __global__ void sqrt_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hsqrt(A[idx]); + } + } template void launch_sqrt(int numBlocks, int blockSize, const T *a, T *c, const int size) { @@ -54,7 +63,7 @@ namespace deepx::tensorfunc template void launch_sqrt(int numBlocks, int blockSize, const double *a, double *c, const int size); template void launch_sqrt(int numBlocks, int blockSize, const float *a, float *c, const int size); template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); - + template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); // pow template __global__ void pow_kernel(const T *A, const T *B, T *C, const int size); @@ -159,7 +168,16 @@ namespace deepx::tensorfunc C[idx] = hlog(A[idx]); } } - + template <> + __global__ void log_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hlog(A[idx]); + } + } + template void launch_log(int numBlocks, int blockSize, const T *a, T *c, const int size) 
{ @@ -174,7 +192,7 @@ namespace deepx::tensorfunc template void launch_log(int numBlocks, int blockSize, const double *a, double *c, const int size); template void launch_log(int numBlocks, int blockSize, const float *a, float *c, const int size); template void launch_log<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); - + template void launch_log(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); // exp template __global__ void exp_kernel(const T *A, T *C, const int size); @@ -206,6 +224,15 @@ namespace deepx::tensorfunc C[idx] = hexp(A[idx]); } } + template <> + __global__ void exp_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hexp(A[idx]); + } + } template void launch_exp(int numBlocks, int blockSize, const T *a, T *c, const int size) @@ -221,6 +248,6 @@ namespace deepx::tensorfunc template void launch_exp(int numBlocks, int blockSize, const double *a, double *c, const int size); template void launch_exp(int numBlocks, int blockSize, const float *a, float *c, const int size); template void launch_exp<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); + template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); } - #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu deleted file mode 100644 index 4c67fa04..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU -#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU - -#include - -#include "deepx/tensorfunc/cuda.hpp" -#include "deepx/tensorfunc/authors.hpp" -#include - -namespace deepx::tensorfunc -{ - // sqrt - template - __global__ void sqrt_kernel(const T *A, T *C, const int size); - template <> - - template <> - __global__ void sqrt_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hsqrt(A[idx]); - } - } - - template - void launch_sqrt(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - sqrt_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sqrt kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - - // log - template - __global__ void log_kernel(const T *A, T *C, const int size); - - template <> - __global__ void log_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hlog(A[idx]); - } - } - - template - void launch_log(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - log_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch log kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_log(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - - // exp - template - __global__ void exp_kernel(const T *A, T *C, const int 
size); - - template <> - __global__ void exp_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hexp(A[idx]); - } - } - - template - void launch_exp(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - exp_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch exp kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); -} - -#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU From 0830944dcd8540a32efd905ffcf41787830f3b72 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Thu, 3 Apr 2025 16:04:43 +0800 Subject: [PATCH 2/7] =?UTF-8?q?tf:todo=E6=B8=85=E5=8D=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/deepx/tensorfunc/changeshape.hpp | 144 ++++++++++++++---- .../src/deepx/tensorfunc/elementwise.hpp | 26 ++++ .../src/deepx/tensorfunc/matmul.hpp | 13 -- .../src/deepx/tensorfunc/reduce.hpp | 56 +++++++ 4 files changed, 197 insertions(+), 42 deletions(-) create mode 100644 excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 6ae0ba86..dd190dac 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -6,43 +6,129 @@ namespace deepx::tensorfunc { - - // 通用模板声明 + template - struct InitDispatcher + struct reshapeDispatcher { static void reshape(Tensor &tensor, const Shape &new_shape) = delete; }; + // reshape(A,new_shape)=>B template void reshape(Tensor &tensor, const Shape &new_shape) { - InitDispatcher::reshape(tensor, new_shape); - } - - // // 作者特化示例(类型无关实现) - // template - // struct InitDispatcher - // { - // static void reshape(Tensor &tensor, const Shape &new_shape) - // { - // // 统一实现,不依赖T的类型 - // if (tensor.shape.size() != new_shape.size()) - // { - // throw std::invalid_argument("Total elements must match"); - // } - // tensor.shape = new_shape; - // } - // }; - // 特化作者和具体精度 - // template <> - // struct InitDispatcher - // { - // static void reshape(Tensor &tensor, const Shape &new_shape) - // { - // // CUDA实现 - // } - // }; + reshapeDispatcher::reshape(tensor, new_shape); + } + + template + struct transposeDispatcher + { + static void transpose(Tensor &tensor, const std::vector &dim_order) = delete; + }; + + // transpose(A,dim_order)=>B + template + void transpose(Tensor &tensor, const std::vector &dim_order) + { + transposeDispatcher::transpose(tensor, dim_order); + } + + template + struct concatDispatcher + { + static void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) = delete; + }; + // concat(tensors,axis)=>C + template + void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) + { + concatDispatcher::concat(tensors, num_tensors, axis, C); + } + + // https://onnx.ai/onnx/operators/onnx__Split.html + template + struct splitDispatcher + { + static void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) = delete; + }; + // split(tensor,axis,splits)=>tensors + template + void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) + { + splitDispatcher::split(A, axis, splits, B); + + } + 
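Each dispatcher in this header pairs a deleted primary template with author-specific specializations supplied by the executor backends; the free functions above only forward to them. A minimal sketch of how a backend satisfies one of these dispatchers and how operator code selects it (the `miaobyte` author tag and the call site are illustrative assumptions, not lines added by this hunk):

    // Sketch only: an executor backend specializes the dispatcher declared above.
    template <typename T>
    struct transposeDispatcher<miaobyte, T>
    {
        static void transpose(Tensor<T> &tensor, const std::vector<int> &dim_order)
        {
            // backend-specific permutation of the tensor's dimensions
        }
    };

    // Operator code then picks the backend through the free function:
    //   transpose<miaobyte>(A, {0, 2, 1});
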
template + struct splitDispatcher + { + static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; + }; + // split(tensor,axis,num_outputs)=>tensors + template + void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) + { + splitDispatcher::split(A, axis, num_outputs, B); + } + + template + struct expandDispatcher + { + static void expand(const Tensor &A, const Shape &new_shape, Tensor &B) = delete; + }; + + template + void expand(const Tensor &A, const Shape &new_shape, Tensor &B) + { + expandDispatcher::expand(A, new_shape, B); + } + + template + struct squeezeDispatcher + { + static void squeeze(Tensor &tensor) = delete; + }; + + template + void squeeze(Tensor &tensor) + { + squeezeDispatcher::squeeze(tensor); + } + + template + struct unsqueezeDispatcher + { + static void unsqueeze(Tensor &tensor, const int axis) = delete; + }; + + template + void unsqueeze(Tensor &tensor, const int axis) + { + unsqueezeDispatcher::unsqueeze(tensor, axis); + } + + template + struct flattenDispatcher + { + static void flatten(Tensor &tensor) = delete; + }; + + template + void flatten(Tensor &tensor) + { + flattenDispatcher::flatten(tensor); + } + + template + struct paddingDispatcher + { + static void padding(Tensor &tensor, const Shape &new_shape) = delete; + }; + + template + void padding(Tensor &tensor, const Shape &new_shape) + { + paddingDispatcher::padding(tensor, new_shape); + } } #endif diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index 415e4449..4f50c6a2 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -15,6 +15,8 @@ namespace deepx::tensorfunc } }; + + // A+B=>C template void add(const Tensor &A, const Tensor &B, Tensor &C) { @@ -29,6 +31,7 @@ namespace deepx::tensorfunc } }; + // A+scalar=>C template void addscalar(const Tensor &input, const T value, Tensor &output) { @@ -43,6 +46,7 @@ namespace deepx::tensorfunc } }; + // A-B=>C template void sub(const Tensor &A, const Tensor &B, Tensor &C) { @@ -57,6 +61,7 @@ namespace deepx::tensorfunc } }; + // A-scalar=>C template void subscalar(const Tensor &input, const T value, Tensor &output) { @@ -69,6 +74,7 @@ namespace deepx::tensorfunc static void mul(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // A*B=>C template void mul(const Tensor &A, const Tensor &B, Tensor &C) { @@ -81,6 +87,7 @@ namespace deepx::tensorfunc static void mulscalar(const Tensor &input, const T value, Tensor &output) = delete; }; + // A*scalar=>C template void mulscalar(const Tensor &input, const T value, Tensor &output) { @@ -95,6 +102,7 @@ namespace deepx::tensorfunc static void div(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // A/B=>C template void div(const Tensor &A, const Tensor &B, Tensor &C) { @@ -107,6 +115,7 @@ namespace deepx::tensorfunc static void divscalar(const Tensor &input, const T value, Tensor &output) = delete; }; + // A/scalar=>C template void divscalar(const Tensor &input, const T value, Tensor &output) { @@ -119,6 +128,7 @@ namespace deepx::tensorfunc static void rdivscalar(const T value, const Tensor &input, Tensor &output) = delete; }; + // scalar/A=>C template void rdivscalar(const T value, const Tensor &input, Tensor &output) { @@ -132,6 +142,7 @@ namespace deepx::tensorfunc static void sqrt(const Tensor &input, Tensor &output) = delete; }; + // sqrt(A)=>C template void sqrt(const Tensor &input, 
Tensor &output) { @@ -144,6 +155,7 @@ namespace deepx::tensorfunc static void pow(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // A^B=>C template void pow(const Tensor &A, const Tensor &B, Tensor &C) { @@ -156,6 +168,7 @@ namespace deepx::tensorfunc static void powscalar(const Tensor &input, const T value, Tensor &output) = delete; }; + // A^scalar=>C template void powscalar(const Tensor &input, const T value, Tensor &output) { @@ -168,6 +181,7 @@ namespace deepx::tensorfunc static void log(const Tensor &input, Tensor &output) = delete; }; + // log(A)=>C template void log(const Tensor &input, Tensor &output) { @@ -180,6 +194,7 @@ namespace deepx::tensorfunc static void exp(const Tensor &input, Tensor &output) = delete; }; + // exp(A)=>C template void exp(const Tensor &input, Tensor &output) { @@ -192,6 +207,7 @@ namespace deepx::tensorfunc static void sin(const Tensor &input, Tensor &output) = delete; }; + // sin(A)=>C template void sin(const Tensor &input, Tensor &output) { @@ -204,6 +220,7 @@ namespace deepx::tensorfunc static void cos(const Tensor &input, Tensor &output) = delete; }; + // cos(A)=>C template void cos(const Tensor &input, Tensor &output) { @@ -216,6 +233,7 @@ namespace deepx::tensorfunc static void tan(const Tensor &input, Tensor &output) = delete; }; + // tan(A)=>C template void tan(const Tensor &input, Tensor &output) { @@ -228,6 +246,7 @@ namespace deepx::tensorfunc static void max(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // max(A,B)=>C template void max(const Tensor &A, const Tensor &B, Tensor &C) { @@ -242,6 +261,7 @@ namespace deepx::tensorfunc static void maxscalar(const Tensor &A, T b, Tensor &C) = delete; }; + // max(A,scalar)=>C template void maxscalar(const Tensor &A, T b, Tensor &C) { @@ -256,6 +276,7 @@ namespace deepx::tensorfunc static void min(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // min(A,B)=>C template void min(const Tensor &A, const Tensor &B, Tensor &C) { @@ -268,6 +289,7 @@ namespace deepx::tensorfunc static void minscalar(const Tensor &A, T b, Tensor &C) = delete; }; + // min(A,scalar)=>C template void minscalar(const Tensor &A, T b, Tensor &C) { @@ -280,6 +302,10 @@ namespace deepx::tensorfunc static void compare(const Tensor &A, const Tensor &B, Tensor &mask) = delete; }; + // compare(A,B)=>mask + // if A[i]==B[i], mask[i]=0.5 + // if A[i]>B[i], mask[i]=0 + // if A[i] void compare(const Tensor &A, const Tensor &B,Tensor &mask) { diff --git a/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp b/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp index ca844f4b..2e099aba 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp @@ -40,19 +40,6 @@ namespace deepx::tensorfunc { matmulDispatcher::matmul(A, B, C); } - - template - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &A, const Tensor &B, const T &alpha, const T &beta, Tensor &C) = delete; - }; - - template - void matmuladd(const Tensor &A, const Tensor &B, const T &alpha, const T &beta, Tensor &C) - { - matmuladdDispatcher::matmuladd(A, B, alpha, beta, C); - } - } #endif diff --git a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp new file mode 100644 index 00000000..c9f3b2a7 --- /dev/null +++ b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp @@ -0,0 +1,56 @@ +#ifndef DEEPX_TENSORFUNC_REDUCE_HPP +#define DEEPX_TENSORFUNC_REDUCE_HPP + + #include "deepx/tensor.hpp" +#include 
"deepx/tensorfunc/authors.hpp" +#include "stdutil/error.hpp" + +namespace deepx::tensorfunc +{ + template + struct reducesumDispatcher + { + static void reducesum(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + template + void reducesum(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reducesumDispatcher::reducesum(A, axis, keepdims, B); + } + + template + struct reduceprodDispatcher + { + static void reduceprod(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + + template + void reduceprod(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reduceprodDispatcher::reduceprod(A, axis, keepdims, B); + } + + template + struct reducemaxDispatcher + { + static void reducemax(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + template + void reducemax(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reducemaxDispatcher::reducemax(A, axis, keepdims, B); + } + + template + struct reduceminDispatcher + { + static void reducemin(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + template + void reducemin(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reduceminDispatcher::reducemin(A, axis, keepdims, B); + } + +} +#endif // DEEPX_TENSORFUNC_REDUCE_HPP From b2fce2584c10bc28fa71f3d6e71781a9d4372408 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 01:46:38 +0800 Subject: [PATCH 3/7] excuter(cpu/cuda):reshape,transpose,concat --- doc/excuter/op-mem-cuda/list.md | 1 + doc/excuter/op-mem-ompsimd/list.md | 3 +- excuter/cpp-common/src/deepx/mem/mem.hpp | 1 + excuter/cpp-common/src/deepx/shape_concat.hpp | 31 ++- .../src/deepx/tensorfunc/changeshape.hpp | 26 +- .../deepx/tensorfunc/changeshape_miaobyte.cu | 231 ++++++++++++++++++ .../deepx/tensorfunc/changeshape_miaobyte.cuh | 82 +++++++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 81 ++++++ .../src/deepx/tensorfunc/concat.hpp | 13 - .../op-mem-cuda/src/deepx/tensorfunc/cuda.hpp | 24 +- .../src/deepx/tensorfunc/matmul_cublas.hpp | 119 +-------- .../src/deepx/tensorfunc/tensor_cuda.cuh | 39 +++ .../src/deepx/tensorfunc/vector_cuda.cuh | 100 ++++++++ .../test/tensorfunc/2_changeshape.cpp | 46 ++++ .../test/tensorfunc/CMakeLists.txt | 5 +- ...angeshape.hpp => changeshape_miaobyte.hpp} | 96 ++++---- .../src/deepx/tensorfunc/matmul_cblas.hpp | 145 ----------- .../test/tensorfunc/7_tensor_transpose.cpp | 6 +- .../test/tensorfunc/8_tensor_concat.cpp | 7 +- front/py/deepx/nn/functional/__init__.py | 2 +- front/py/deepx/nn/functional/init.py | 2 +- src/deepx/tensorfunc/changeshape_miaobyte.cu | 1 + 22 files changed, 715 insertions(+), 346 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh create mode 100644 excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp rename excuter/op-mem-ompsimd/src/deepx/tensorfunc/{changeshape.hpp => changeshape_miaobyte.hpp} (71%) create mode 100644 src/deepx/tensorfunc/changeshape_miaobyte.cu diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md 
index 4982da4c..5967d738 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -5,6 +5,7 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 84f46f87..b4ab6cd7 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -7,7 +7,7 @@ | concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() | | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | +| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | @@ -32,6 +32,7 @@ | subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | | log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | +| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | diff --git a/excuter/cpp-common/src/deepx/mem/mem.hpp b/excuter/cpp-common/src/deepx/mem/mem.hpp index a4db9d9d..504db01f 100644 --- a/excuter/cpp-common/src/deepx/mem/mem.hpp +++ b/excuter/cpp-common/src/deepx/mem/mem.hpp @@ -150,6 +150,7 @@ namespace deepx::mem return tensors; } + void delete_tensor(const string &name) { diff --git a/excuter/cpp-common/src/deepx/shape_concat.hpp b/excuter/cpp-common/src/deepx/shape_concat.hpp index 885a7678..91884e5e 100644 --- a/excuter/cpp-common/src/deepx/shape_concat.hpp +++ b/excuter/cpp-common/src/deepx/shape_concat.hpp @@ -3,6 +3,7 @@ #include "deepx/shape.hpp" 
#include "deepx/tensor.hpp" +#include "stdutil/error.hpp" namespace deepx { @@ -18,6 +19,32 @@ namespace deepx } return concatShape(shapes,axis); } -} -#endif \ No newline at end of file + template + bool checkShapeConcat(const std::vector*> &tensors,const int axis,const Tensor &output){ + int axisDim=0; + for (int i = 0; i < tensors.size(); i++) + { + if (tensors[i]->shape.dim != output.shape.dim) + { + throw TensorShapeError("All input tensors must have the same dimension size for concat"); + } + for (int j = 0; j < tensors[i]->shape.dim; j++) + { + if (j != axis) + { + if (tensors[i]->shape[j] != output.shape[j]) + { + throw TensorShapeError("All input tensors must have the same dimension size for concat"); + } + } + else + { + axisDim += tensors[i]->shape[j]; + } + } + } + return axisDim == output.shape[axis]; + } +}; +#endif // DEEPX_SHAPE_CONCAT_HPP \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index dd190dac..5acd644e 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -1,21 +1,22 @@ #ifndef DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP #define DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP +#include #include "deepx/tensor.hpp" #include "stdutil/error.hpp" namespace deepx::tensorfunc { - + using namespace std; template struct reshapeDispatcher { - static void reshape(Tensor &tensor, const Shape &new_shape) = delete; + static void reshape(Tensor &tensor, const std::vector &new_shape) = delete; }; // reshape(A,new_shape)=>B template - void reshape(Tensor &tensor, const Shape &new_shape) + void reshape(Tensor &tensor, const std::vector &new_shape) { reshapeDispatcher::reshape(tensor, new_shape); } @@ -23,26 +24,26 @@ namespace deepx::tensorfunc template struct transposeDispatcher { - static void transpose(Tensor &tensor, const std::vector &dim_order) = delete; + static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) = delete; }; // transpose(A,dim_order)=>B template - void transpose(Tensor &tensor, const std::vector &dim_order) + void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) { - transposeDispatcher::transpose(tensor, dim_order); + transposeDispatcher::transpose(tensor, dim_order, output); } template struct concatDispatcher { - static void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) = delete; + static void concat(const vector*> tensors, const int axis, Tensor &C) = delete; }; // concat(tensors,axis)=>C template - void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) + void concat(const vector*> tensors, const int axis, Tensor &C) { - concatDispatcher::concat(tensors, num_tensors, axis, C); + concatDispatcher::concat(tensors, axis, C); } // https://onnx.ai/onnx/operators/onnx__Split.html @@ -50,6 +51,7 @@ namespace deepx::tensorfunc struct splitDispatcher { static void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) = delete; + static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; }; // split(tensor,axis,splits)=>tensors template @@ -58,11 +60,7 @@ namespace deepx::tensorfunc splitDispatcher::split(A, axis, splits, B); } - template - struct splitDispatcher - { - static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; - }; + // split(tensor,axis,num_outputs)=>tensors template void 
split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu new file mode 100644 index 00000000..82c893c0 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -0,0 +1,231 @@ +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CU +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CU + +#include +#include +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/tensor_cuda.cuh" +#include "deepx/tensorfunc/vector_cuda.cuh" +namespace deepx::tensorfunc +{ + // transpose + // DIM=2^n + template + __global__ void transpose_kernel(const T *inputData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int len, + const int *dimOrder) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < len; thread_id += grid_stride) + { + int input_indices[DIM]; + + // 计算当前线程需要处理的索引 + linearTo(inputStrides, dim, input_indices, thread_id); + + int output_indices[DIM]; + + // 根据 dim_order 和输入输出的形状计算新索引 + reorder(input_indices, dimOrder, dim, output_indices); + int inputIdx = linearAt(inputStrides, dim, input_indices); + int outputIdx = linearAt(outputStrides, dim, output_indices); + outputData[outputIdx] = inputData[inputIdx]; + } + } + + inline int nextPowerOf2(int n) + { + if (n <= 0) + return 1; + if ((n & (n - 1)) == 0) + return n; // 如果n已经是2的幂 + + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + return n + 1; + } + + template + void launch_transpose(const int numBlocks, const int blockSize, + const T *input, + const int *inputStrides, + T *output, + const int *outputStrides, + const int dim, + const int len, + const int *dimOrder) + { + cudaVector strides_d(inputStrides, dim); + cudaVector newStrides_d(outputStrides, dim); + cudaVector dimOrder_d(dimOrder, dim); + + int powDim = nextPowerOf2(dim); + + // 根据计算出的2的幂次选择对应的模板实例 + switch (powDim) + { + case 1: + transpose_kernel<1, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 2: + transpose_kernel<2, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 4: + transpose_kernel<4, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 8: + transpose_kernel<8, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 16: + transpose_kernel<16, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 32: + transpose_kernel<32, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 64: + transpose_kernel<64, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 128: + transpose_kernel<128, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + } + + template void launch_transpose(const int numBlocks, const int blockSize, const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const float 
*input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + + // concat + template + __global__ void concat_kernel(const T **tensorsData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int outputLen, + const int axis, + const int numTensors, + const int *shapeAtAxis) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + cudaVector outputIndices(DIM); + cudaVector currentTensorIndices(DIM); + for (; thread_id < outputLen; thread_id += grid_stride) + { + linearTo(outputStrides, dim, outputIndices.data, thread_id); + int concatIdxResult = outputIndices[axis]; + int concatIdxCurrentTensor = concatIdxResult; + int tensorIdx = 0; + while (tensorIdx < numTensors) + { + if (concatIdxCurrentTensor < shapeAtAxis[tensorIdx]) + { + break; + } + else + { + concatIdxCurrentTensor -= shapeAtAxis[tensorIdx]; + tensorIdx++; + } + } + currentTensorIndices.copyFromDevice(outputIndices.data, dim); + currentTensorIndices[axis] = concatIdxCurrentTensor; + + int idxCurrentTensor = linearAt(inputStrides+tensorIdx*dim, dim, currentTensorIndices.data); + + int idx = linearAt(outputStrides, dim, outputIndices.data); + outputData[idx] = tensorsData[tensorIdx][idxCurrentTensor]; + } + } + + + template + void launch_concat( + const T **tensorsData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int outputLen, + const int axis, + const int numTensors, + const int *shapeAtAxis) + { + auto [numBlocks, blockSize] = BestDims(outputLen); + + //output + cudaVector outputStrides_d(outputStrides, dim, cudaMemcpyHostToDevice); + + //input + //datas + cudaVector tensorsDataList(tensorsData, numTensors, cudaMemcpyHostToDevice); + //strides + cudaVector inputStrides_d(inputStrides, numTensors*dim, cudaMemcpyHostToDevice); + + + //shapeAtAxis + cudaVector shapeAtAxis_d(shapeAtAxis, numTensors, cudaMemcpyHostToDevice); + + int powDim = nextPowerOf2(dim); + + // 根据计算出的2的幂次选择对应的模板实例 + switch (powDim) + { + case 1: + concat_kernel<1, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, 
outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 2: + concat_kernel<2, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 4: + concat_kernel<4, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 8: + concat_kernel<8, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 16: + concat_kernel<16, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 32: + concat_kernel<32, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 64: + concat_kernel<64, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 128: + concat_kernel<128, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + } + template void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + +} +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh new file mode 100644 index 00000000..9e9a8629 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ 
-0,0 +1,82 @@ +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH + +#include +#include +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + //transpose + template + __global__ void transpose_kernel(const T* input, const int* inputStrides, T* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template + void launch_transpose(const int numBlocks, const int blockSize, const T* input, const int* inputStrides, T* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const double* input, const int* inputStrides, double* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const float* input, const int* inputStrides, float* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const nv_bfloat16* input, const int* inputStrides, nv_bfloat16* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half* input, const int* inputStrides, __half* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int64_t* input, const int* inputStrides, int64_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int32_t* input, const int* inputStrides, int32_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int16_t* input, const int* inputStrides, int16_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int8_t* input, const int* inputStrides, int8_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template + __global__ void concat_kernel(const T **tensorsData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int len, + const int axis, + const int numTensors, + const int *shapeAtAxis); + + template + void launch_concat(const T **tensorsData, const int *inputStrides, T *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, 
const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + +} +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp new file mode 100644 index 00000000..4c9f5c4d --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -0,0 +1,81 @@ +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP + +#include +#include +#include "deepx/tensor.hpp" +#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.cuh" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/shape_concat.hpp" +namespace deepx::tensorfunc +{ + template + struct reshapeDispatcher + { + static void reshape(Tensor &tensor, const std::vector &new_shape) + { + if (tensor.shape.dim != new_shape.size()) + { + throw std::runtime_error("Tensor shapes must match for reshape"); + } + tensor.shape = Shape(new_shape); + } + }; + + template + struct transposeDispatcher + { + static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) + { + if (dim_order.size() != tensor.shape.dim) + { + throw std::runtime_error("Dimension order size must match tensor dimension size for transpose"); + } + auto [actual_blocks, optimal_block_size] = BestDims(tensor.shape.size); + launch_transpose(actual_blocks, optimal_block_size, + tensor.data, tensor.shape.strides.data(), + output.data, output.shape.strides.data(), + tensor.shape.dim, tensor.shape.size, dim_order.data()); + } + }; + + template + struct concatDispatcher + { + static void concat(const vector*> tensors, const int axis, Tensor &C) + { + //checkshape + if (!checkShapeConcat(tensors, axis, C)) + { + throw TensorShapeError("Output tensor shape size must match the sum of input tensor shape sizes for concat"); + } + + vector tensorsData(tensors.size()); + for (int i = 0; i < tensors.size(); i++) + { + tensorsData[i] = tensors[i]->data; + } + + vector< int> inputStrides; + for (int i = 0; i < tensors.size(); i++) + { + std::copy(tensors[i]->shape.strides.data(), tensors[i]->shape.strides.data() + tensors[i]->shape.dim, std::back_inserter(inputStrides)); + } + + vector shapeAtAxis(tensors.size()); + for (int i = 0; i < 
tensors.size(); i++) + { + shapeAtAxis[i] = tensors[i]->shape[axis]; + } + + launch_concat(tensorsData.data(), inputStrides.data(), + C.data, C.shape.strides.data(), + C.shape.dim, + C.shape.size, + axis, tensors.size(), shapeAtAxis.data()); + }; + }; +} +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp deleted file mode 100644 index eb009b75..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_CONCAT_HPP -#define DEEPX_TENSORFUNC_CONCAT_HPP - -#include -#include -#include "deepx/tensor.hpp" -#include "deepx/shape_concat.hpp" -#include "deepx/tensorfunc/new.hpp" -namespace deepx::tensorfunc -{ - -} -#endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index bfee6cae..a9b6886f 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -1,7 +1,7 @@ #ifndef DEEPX_TENSORFUNC_CUDA_HPP #define DEEPX_TENSORFUNC_CUDA_HPP -#include +#include #include #include @@ -29,7 +29,27 @@ namespace deepx::tensorfunc private: cublasHandle_t handle_; }; - + + inline std::pair BestDims(int total_elements) + { + // 默认块大小 + int optimal_block_size = 256; // 一般256或512是较好的选择 + // 计算设备属性以确定最佳配置 + int device_id; + cudaGetDevice(&device_id); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_id); + + // 根据SM数量和每个SM的最大线程数决定块数 + int sm_count = props.multiProcessorCount; + int optimal_blocks = sm_count * 8; // 每个SM分配多个块以增加并行度 + + // 确保至少启动足够的线程来处理所有数据 + int min_blocks = (total_elements + optimal_block_size - 1) / optimal_block_size; + int actual_blocks = std::min(optimal_blocks, min_blocks); + + return {actual_blocks, optimal_block_size}; + }; } #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp index 067a94c1..5931c054 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp @@ -144,123 +144,6 @@ namespace deepx::tensorfunc } }; - template <> - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &A, const Tensor &B, const float &alpha, const float &beta, Tensor &C) - { - if (!check_matmul_shape(A.shape, B.shape)) - { - throw std::invalid_argument("A.shape could not matmul with B.shape"); - } - - static CublasHandle handle; - int64_t batch_size = A.shape.size / (A.shape[-2] * A.shape[-1]); - - int m = A.shape[-2]; - int k = A.shape[-1]; - int n = B.shape[-1]; - - // 计算步长 - int64_t stride_a = m * k; - int64_t stride_b = k * n; - int64_t stride_c = m * n; - - if (batch_size > 1) - { - auto status = cublasSgemmStridedBatched(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, m, k, // 交换m,n - &alpha, - B.data, n, stride_b, // B在前 - A.data, k, stride_a, // A在后 - &beta, - C.data, n, stride_c, // 调整leading dimension - batch_size); // 添加缺失的batch_size参数 - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasSgemmStridedBatched failed"); - } - } - else - { - auto status = cublasSgemm(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, m, k, // 交换m,n - &alpha, - B.data, n, // B在前 - A.data, k, // A在后 - &beta, - C.data, n); // 调整leading dimension - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasSgemm failed"); - } - } 
- } - }; - template <> - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &A, const Tensor &B, const double &alpha, const double &beta, Tensor &C) - { - if (!check_matmul_shape(A.shape, B.shape)) - { - throw std::invalid_argument("A.shape could not matmul with B.shape"); - } - - static CublasHandle handle; - int m = A.shape[-2]; - int k = A.shape[-1]; - int n = B.shape[-1]; - - int64_t batch_size = A.shape.size / (A.shape[-2] * A.shape[-1]); - - if (batch_size > 1) - { - // 计算步长 - int64_t stride_a = m * k; - int64_t stride_b = k * n; - int64_t stride_c = m * n; - - auto status = cublasDgemmStridedBatched(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, m, k, // 交换m,n处理行主序 - &alpha, - B.data, n, stride_b, // B在前 - A.data, k, stride_a, // A在后 - &beta, - C.data, n, stride_c, // 输出维度对应调整 - batch_size); - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasDgemmStridedBatched failed"); - } - } - else - { - auto status = cublasDgemm(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - m, n, k, - &alpha, - A.data, m, - B.data, k, - &beta, - C.data, m); - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasDgemm failed"); - } - } - }; - }; + }; #endif // DEEPX_TENSORFUNC_MATMUL_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh new file mode 100644 index 00000000..a042d6d1 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh @@ -0,0 +1,39 @@ +#ifndef DEEPX_TENSORFUNC_TENSOR_CUDA_CUH +#define DEEPX_TENSORFUNC_TENSOR_CUDA_CUH + +#include "deepx/tensor.hpp" + +namespace deepx::tensorfunc +{ + __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id) + { + int linearIndex = id; + for (int i = 0; i < dim; i++) + { + indices[i] = linearIndex / strides[i]; + linearIndex %= strides[i]; + } + } + + __host__ __device__ int linearAt(const int *strides, const int dim, int *indices) + { + int idx = 0; + for (int i = 0; i < dim; i++) + { + idx += indices[i] * strides[i]; + } + return idx; + } + + template + __device__ __host__ void reorder(const T *order, const int *dimOrder, int dim, T *neworder) + { + for (int i = 0; i < dim; i++) + { + neworder[i] = order[dimOrder[i]]; + } + } + +} + +#endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh new file mode 100644 index 00000000..4fe17030 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh @@ -0,0 +1,100 @@ +#ifndef DEEPX_TENSORFUNC_VECTOR_CUDA_CUH +#define DEEPX_TENSORFUNC_VECTOR_CUDA_CUH + +namespace deepx::tensorfunc +{ + template + __device__ void GridStrideLoopCopy(const T* src, T* dst, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i = idx; i < size; i += stride) { + dst[i] = src[i]; + } + } + + // 全局复制函数,可从主机调用 + template + __global__ void GridStrideLoopCopyKernel(const T* src, T* dst, int size) { + GridStrideLoopCopy(src, dst, size); + } + + //cudaVector + template + struct cudaVector + { + T *data; + int size; + __device__ __host__ cudaVector(int size) : size(size) + { + cudaMalloc(&data, size * sizeof(T)); + } + __host__ cudaVector(const T *src, int size, cudaMemcpyKind kind = cudaMemcpyHostToDevice) : size(size) + { + cudaMalloc(&data, size * sizeof(T)); + cudaMemcpy(data, src, size * sizeof(T), kind); + } + 
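    // Copy constructor (below): allocates a fresh device buffer of the same size
    // and duplicates the contents with a device-to-device cudaMemcpy, so each
    // cudaVector owns its own storage.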
__host__ cudaVector(const cudaVector &other) : size(other.size) + { + cudaMalloc(&data, size * sizeof(T)); + cudaMemcpy(data, other.data, size * sizeof(T), cudaMemcpyDeviceToDevice); + } + __device__ __host__ cudaVector(cudaVector &&other) noexcept : data(other.data), size(other.size) + { + other.data = nullptr; + other.size = 0; + } + __device__ __host__ cudaVector &operator=(const cudaVector &other) + { + if (this != &other) + { + cudaFree(data); + data = other.data; + size = other.size; + } + return *this; + } + __device__ __host__ cudaVector &operator=(cudaVector &&other) noexcept + { + if (this != &other) + { + cudaFree(data); + data = other.data; + size = other.size; + other.data = nullptr; + other.size = 0; + } + return *this; + } + __device__ __host__ ~cudaVector() + { + cudaFree(data); + } + __device__ __host__ void copyFromHost(const T *hostData, int size,int offset=0) + { + cudaMemcpy(data+offset, hostData, size * sizeof(T), cudaMemcpyHostToDevice); + } + __device__ __host__ void copyToHost(T *hostData, int size,int offset=0) + { + cudaMemcpy(hostData, data+offset, size * sizeof(T), cudaMemcpyDeviceToHost); + } + __device__ __host__ void copyFromDevice(const T *deviceData, int size,int offset=0) + { + for (int i = 0; i < size; i++) + { + data[offset+i] = deviceData[i]; + } + } + __device__ __host__ T &operator[](int idx) + { + return data[idx]; + } + __device__ __host__ const T &operator[](int idx) const + { + return data[idx]; + } + + }; +} + +#endif // DEEPX_TENSORFUNC_VECTOR_CUDA_CUH diff --git a/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp b/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp new file mode 100644 index 00000000..66f5dc39 --- /dev/null +++ b/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp @@ -0,0 +1,46 @@ +#include "deepx/tensorfunc/init_miaobyte.hpp" +#include "deepx/tensor.hpp" +#include "deepx/tensorfunc/new.hpp" +#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" +using namespace deepx::tensorfunc; +using namespace deepx; +void test_transpose() +{ + Tensor a=New({3,4,6}); + arange(a, 1.0f, 1.0f); + print(a,"%.0f"); + Tensor b=New({3,6,4}); + transpose(a, {0,2,1}, b); + print(b,"%.0f"); +} + +void test_concat() +{ + Tensor a=New({3,2,6}); + arange(a, 1.0f, 1.0f); + print(a,"%.0f"); + Tensor b=New({3,4,6}); + constant(b, 2.0f); + print(b,"%.0f"); + Tensor c=New({3,6,6}); + constant(c, 3.0f); + print(c,"%.0f"); + Tensor d=New({3,12,6}); + concat({&a,&b,&c},1,d); + print(d,"%.0f"); +} +int main(int argc, char **argv) +{ + int casearg=atoi(argv[1]); + switch (casearg) + { + case 0: + test_transpose(); + break; + case 1: + test_concat(); + break; + } + return 0; +} \ No newline at end of file diff --git a/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt b/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt index cbea6433..91fbb357 100644 --- a/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt +++ b/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt @@ -5,4 +5,7 @@ add_executable(1_cublas_add 1_cublas_add.cpp) target_link_libraries(1_cublas_add deepx CUDA::cudart) add_executable(1_cublas_matmul 1_cublas_matmul.cpp) -target_link_libraries(1_cublas_matmul deepx CUDA::cudart) \ No newline at end of file +target_link_libraries(1_cublas_matmul deepx CUDA::cudart) + +add_executable(2_changeshape 2_changeshape.cpp) +target_link_libraries(2_changeshape deepx CUDA::cudart) \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape.hpp 
b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp similarity index 71% rename from excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape.hpp rename to excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index fe562099..4ca0f747 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -6,59 +6,72 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" +#include "deepx/tensorfunc/changeshape.hpp" #include "deepx/shape_broadcast.hpp" - +#include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc { template - void reshape(Tensor &tensor, Tensor &output, const std::vector &shape) - { // 参数改为单个tensor引用 + struct reshapeDispatcher + { + void reshape(Tensor &tensor, Tensor &output, const std::vector &shape) + { // 参数改为单个tensor引用 - int new_prod = 1; - for (int dim : shape) - { - new_prod *= dim; - } + int new_prod = 1; + for (int dim : shape) + { + new_prod *= dim; + } - if (tensor.shape.size != new_prod) - { - throw std::invalid_argument("Shape size mismatch"); - } - if (tensor.data != output.data) - { - tensorfunc::copytensor(tensor, output); + if (tensor.shape.size != new_prod) + { + throw std::invalid_argument("Shape size mismatch"); + } + if (tensor.data != output.data) + { + tensorfunc::copytensor(tensor, output); + } + output.shape = Shape(shape); // 直接修改原tensor的shape } - output.shape = Shape(shape); // 直接修改原tensor的shape - } + }; template - void transpose(const Tensor &tensor, Tensor &result, const std::vector &dimOrder) + struct transposeDispatcher { - if (dimOrder.size() != tensor.shape.dim) - { - throw std::invalid_argument("dimOrder size does not match the number of dimensions in the TensorCPU."); - } - if (result.shape.size != tensor.shape.size) + static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) { - throw std::runtime_error("transpose error!shape"); - } - result.shape.rangeParallel(dimOrder.size(), [&tensor, &result, &dimOrder](int idx_linear, const std::vector &indices, std::vector &newIndices) - { - - for (size_t i = 0; i < dimOrder.size(); ++i) { - newIndices[dimOrder[i]] = indices[i]; + + if (dim_order.size() != tensor.shape.dim) + { + throw std::invalid_argument("dimOrder size does not match the number of dimensions in the TensorCPU."); + } + if (output.shape.size != tensor.shape.size) + { + throw std::runtime_error("transpose error!shape"); + } + output.shape.rangeParallel(dim_order.size(), [&tensor, &output, &dim_order](int idx_linear, const std::vector &indices, std::vector &newIndices) + { + + for (size_t i = 0; i < dim_order.size(); ++i) { + newIndices[dim_order[i]] = indices[i]; } - result.data[idx_linear]= tensor.data[tensor.shape.linearat(newIndices)]; }, tensor.shape.dim); - } + output.data[idx_linear]= tensor.data[tensor.shape.linearat(newIndices)]; }, tensor.shape.dim); + } + }; template - void concat(const std::vector *> &tensors, const int axis, Tensor &result) + struct concatDispatcher { - // Shape shape=concatShape(tensors,axis); - // result=New(shape.shape); - int dimC = axis + 1; - result.shape.rangeParallel(dimC, [&](const int idx, const std::vector &indices) - { + static void concat(const vector *> tensors, const int axis, Tensor &result) + { + //checkshape + if (!checkShapeConcat(tensors, axis, result)) + { + throw TensorShapeError("Output tensor shape size must match the sum of input tensor shape sizes for concat"); + } + int dimC = axis + 1; + 
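// ---- Editor's note (illustrative sketch, not part of the original patch) ----
// The loop below copies whole contiguous runs rather than single elements:
// rangeParallel walks every index prefix up to `axis`, finds the input tensor
// whose slice of the concat axis contains indices[axis], rebuilds that
// tensor's local indices, and copies strides[axis] elements with a single
// std::copy. Roughly, per output prefix (variable names here are hypothetical):
//
//     int j = indices[axis];
//     int t = 0;
//     while (j >= tensors[t]->shape[axis]) { j -= tensors[t]->shape[axis]; ++t; }
//     int from = tensors[t]->shape.linearat(localIndices);   // axis index = j
//     std::copy(tensors[t]->data + from,
//               tensors[t]->data + from + tensors[t]->shape.strides[axis],
//               result.data + idx);
// -----------------------------------------------------------------------------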
result.shape.rangeParallel(dimC, [&](const int idx, const std::vector &indices) + { int concatIdxCurrentTensor=indices[axis];; int tensorIdx=0; while (tensorIdx < tensors.size() ) { @@ -76,7 +89,8 @@ namespace deepx::tensorfunc int idxCurrentTensor=tensors[tensorIdx]->shape.linearat(currentTensorIndices); int copylen=tensors[tensorIdx]->shape.strides[axis]; std::copy(tensors[tensorIdx]->data+idxCurrentTensor,tensors[tensorIdx]->data+idxCurrentTensor+copylen,result.data+idx); }); - } + } + }; template void split(const Tensor &tensor, const int axis, std::vector *> &results) @@ -111,7 +125,7 @@ namespace deepx::tensorfunc throw std::invalid_argument("expand维度不匹配: 输入维度 " + std::to_string(input.shape.dim) + ", 目标维度 " + - std::to_string(output.shape.dim)+ + std::to_string(output.shape.dim) + "请先前dim补1的方式reshape"); } @@ -162,7 +176,5 @@ namespace deepx::tensorfunc output.data[idx_linear] = input.data[idx_old]; }, input.shape.dim); } } - - } #endif \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp index 1e1371f6..8656191b 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp @@ -115,150 +115,5 @@ namespace deepx::tensorfunc } } }; - - template - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &a, const Tensor &b, const T &alpha, const T &beta, Tensor &c) - { - if (!check_shape(a.shape, b.shape)) - { - throw std::invalid_argument("a.shape could matmul with b.shape"); - } - c.shape.rangeParallel(c.shape.dim - 2, [&](const std::vector &indices) - { - int aIdx=a.shape.linearat(indices); - int bIdx=b.shape.linearat(indices); - int cIdx=c.shape.linearat(indices); - int m=a.shape[-2]; - int k=a.shape[-1]; - int n=b.shape[-1]; - for(int i=0;i - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &a, const Tensor &b, const float &alpha, const float &beta, Tensor &c) - { - if (!check_matmul_shape(a.shape, b.shape)) - { - throw std::invalid_argument("a.shape could matmul with b.shape"); - } - // 计算batch size (将除最后两维外的所有维度展平) - // 计算batch size (将除最后两维外的所有维度展平) - int64_t batch_size = 1; - for (int i = 0; i < a.shape.dim - 2; ++i) - { - batch_size *= a.shape[i]; - } - - // 获取矩阵维度 - int64_t m = a.shape[-2]; // 倒数第二维 - int64_t k = a.shape[-1]; // 最后一维 - int64_t n = b.shape[-1]; // B的最后一维 - - // 设置每个矩阵的步长 - int64_t lda = k; - int64_t ldb = n; - int64_t ldc = n; - - // 计算每个batch的指针偏移 - std::vector a_array(batch_size); - std::vector b_array(batch_size); - std::vector c_array(batch_size); - - for (int64_t i = 0; i < batch_size; ++i) - { - a_array[i] = a.data + i * m * k; - b_array[i] = b.data + i * k * n; - c_array[i] = c.data + i * m * n; - } - - for (int64_t i = 0; i < batch_size; ++i) - { - // C = α * op(A) * op(B) + β * C - cblas_sgemm(CblasRowMajor, // 存储顺序 - CblasNoTrans, // op(A) = A - CblasNoTrans, // op(B) = B - m, n, k, // A[m×k], B[k×n], C[m×n] - alpha, // α = 1.0 - a_array[i], // A矩阵指针 - lda, // A的leading dimension(行主序时为列数k) - b_array[i], // B矩阵指针 - ldb, // B的leading dimension(行主序时为列数n) - beta, // β = 0.0 - c_array[i], // C矩阵指针 - ldc); // C的leading dimension(行主序时为列数n) - } - } - }; - - template <> - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &a, const Tensor &b, const double &alpha, const double &beta, Tensor &c) - { - if (!check_matmul_shape(a.shape, b.shape)) - { - throw std::invalid_argument("a.shape could matmul with b.shape"); - } - // 
计算batch size (将除最后两维外的所有维度展平) - // 计算batch size (将除最后两维外的所有维度展平) - int64_t batch_size = 1; - for (int i = 0; i < a.shape.dim - 2; ++i) - { - batch_size *= a.shape[i]; - } - - // 获取矩阵维度 - int64_t m = a.shape[-2]; // 倒数第二维 - int64_t k = a.shape[-1]; // 最后一维 - int64_t n = b.shape[-1]; // B的最后一维 - - // 设置每个矩阵的步长 - int64_t lda = k; - int64_t ldb = n; - int64_t ldc = n; - - // 计算每个batch的指针偏移 - std::vector a_array(batch_size); - std::vector b_array(batch_size); - std::vector c_array(batch_size); - - for (int64_t i = 0; i < batch_size; ++i) - { - a_array[i] = a.data + i * m * k; - b_array[i] = b.data + i * k * n; - c_array[i] = c.data + i * m * n; - } - - for (int64_t i = 0; i < batch_size; ++i) - { - // C = α * op(A) * op(B) + β * C - cblas_dgemm(CblasRowMajor, // 存储顺序 - CblasNoTrans, // op(A) = A - CblasNoTrans, // op(B) = B - m, n, k, // A[m×k], B[k×n], C[m×n] - alpha, // α = 1.0 - a_array[i], // A矩阵指针 - lda, // A的leading dimension(行主序时为列数k) - b_array[i], // B矩阵指针 - ldb, // B的leading dimension(行主序时为列数n) - beta, // β = 0.0 - c_array[i], // C矩阵指针 - ldc); // C的leading dimension(行主序时为列数n) - } - } - }; } #endif // DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp index f6e55be4..90188489 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp @@ -3,13 +3,13 @@ #include #include "deepx/tensor.hpp" -#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" +#include "deepx/tensorfunc/authors.hpp" #include "deepx/tensorfunc/print_miaobyte.hpp" #include "stdutil/vector.hpp" #include "tensorutil.hpp" #include "deepx/shape_transpose.hpp" -#include "deepx/tensorfunc/authors.hpp" using namespace deepx::tensorfunc; using namespace deepx; @@ -25,7 +25,7 @@ void test_transpose() std::vector resultshape = transposeShape(tensor.shape.shape, dimOrder); Tensor result = New(resultshape); - transpose(tensor, result, dimOrder); + transpose(tensor, dimOrder, result); print(result); } diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp index 59d66dd1..3a6bafdc 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp @@ -3,12 +3,12 @@ #include -#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" #include "deepx/tensor.hpp" #include "deepx/shape.hpp" #include "deepx/shape_concat.hpp" #include "deepx/tensorfunc/new.hpp" -#include "deepx/tensorfunc/init.hpp" +#include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensorfunc/print_miaobyte.hpp" #include "stdutil/vector.hpp" #include "deepx/mem/mem.hpp" @@ -22,6 +22,7 @@ shared_ptr makeMem(int cnt,std::vector shape){ for (int j=0; j(shape); + arange(ptr,0.0f,1.0f); mem->addtensor("tensor"+std::to_string(j), ptr); } return mem; @@ -39,7 +40,7 @@ void test_concat(){ for (int i=0;ishape.dim;i++){ Shape shape=concatShape(tensors,i); Tensor result=New(shape.shape); - concat(tensors,i,result); + concat(tensors,i,result); print(result); } std::cout<<"================"< Date: Sun, 6 Apr 2025 17:51:18 +0800 Subject: [PATCH 4/7] excuter(cpu/cuda):reshape,transpose --- .../src/deepx/tensorfunc/changeshape.hpp | 6 +- excuter/cpp-common/src/deepx/tf/tf.hpp | 2 +- 
excuter/op-mem-cuda/src/client/tfs.cpp | 128 ++++++----- .../op-mem-cuda/src/deepx/tf/changeshape.hpp | 143 ++++++++++++ excuter/op-mem-ompsimd/src/client/tfs.cpp | 78 ++++--- .../deepx/tensorfunc/changeshape_miaobyte.hpp | 14 +- .../src/deepx/tf/changeshape.hpp | 213 ++++++++++++------ 7 files changed, 416 insertions(+), 168 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 5acd644e..c2eff6b1 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -1,5 +1,5 @@ -#ifndef DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP -#define DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_HPP +#define DEEPX_TENSORFUNC_CHANGESHAPE_HPP #include #include "deepx/tensor.hpp" @@ -14,7 +14,7 @@ namespace deepx::tensorfunc static void reshape(Tensor &tensor, const std::vector &new_shape) = delete; }; - // reshape(A,new_shape)=>B + // A.reshape(new_shape) template void reshape(Tensor &tensor, const std::vector &new_shape) { diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp index 2508a121..3425e9a8 100644 --- a/excuter/cpp-common/src/deepx/tf/tf.hpp +++ b/excuter/cpp-common/src/deepx/tf/tf.hpp @@ -77,7 +77,7 @@ namespace deepx::tf } template - vector argvector( int from=0, int to=0,bool arg=true){ + vector getvector( int from=0, int to=0,bool arg=true){ vector &vars=arg?args:returns; if(from<0){ from = vars.size()+from; diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index f8d33bd5..109847c9 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -8,6 +8,7 @@ #include "deepx/tf/elementwise_sin.hpp" #include "deepx/tf/elementwise_compare.hpp" #include "deepx/tf/matmul.hpp" +#include "deepx/tf/changeshape.hpp" #include "deepx/dtype.hpp" #include "deepx/tf/tffactory.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -194,80 +195,80 @@ namespace deepx::tf tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32 ), - Param("B", DataCategory::Tensor, Precision::Float64|Precision::Float32 ), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("B", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), - Param("scalar", DataCategory::Var, Precision::Float64|Precision::Float32), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Float32), }), 
vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -276,16 +277,16 @@ namespace deepx::tf vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", 
DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -294,26 +295,26 @@ namespace deepx::tf vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Int8), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Int8), + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Int8), - }))); - } + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Int8), + }))); + } // matmul void register_matmul(TfFactory &tffactory) { @@ -330,10 +331,23 @@ namespace deepx::tf // // changeshape void register_changeshape(TfFactory &tffactory) { - // opfactory.add_op(Transpose()); - // opfactory.add_op(Reshape()); - // opfactory.add_op(Expand()); - // tffactory.add_tf(std::make_shared()); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("shape", DataCategory::Vector, Precision::Int32), + }), + vector())); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dim_order", DataCategory::Vector, Precision::Int32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp new file mode 100644 index 00000000..551f7d51 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp @@ -0,0 +1,143 @@ +#ifndef DEEPX_TF_CHANGESHAPE_HPP +#define DEEPX_TF_CHANGESHAPE_HPP + +#include +#include +#include + + +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" + +namespace deepx::tf +{ + using namespace deepx::tensorfunc; + using namespace std; + template + class Reshape : public TF + { + public: + Reshape(const vector &args, const vector &returns) + { + this->name = "reshape"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T2=T1.reshape(shape)"; + } + + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector shape = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Float32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; 
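// ---- Editor's note (illustrative, not part of the original patch) ----------
// Every branch of this switch only changes the concrete element type; the
// work is always forwarded to the author-tagged dispatcher. For example, an
// IR call along the lines of (exact textual syntax aside)
//
//     reshape(tensor A, vector shape=[3,20]) -> ()
//
// resolves A from mem, reads the target shape from args[1] via getvector,
// and for a float32 tensor dispatches to the miaobyte reshape implementation
// for float, which checks that the element count is unchanged and rewrites
// A.shape in place -- no output tensor is produced.
// -----------------------------------------------------------------------------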
+ case Precision::Int64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int16: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int8: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + + template + class Transpose : public TF + { + public: + Transpose(const vector &args, const vector &returns) + { + this->name = "transpose"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T2 = T1.transpose(dimorder=[1,0])"; + } + + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dim_order = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + +} +#endif // DEEPX_TF_CHANGESHAPE_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 2670deae..cdf7ac6c 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -250,15 +250,15 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -272,31 
+272,31 @@ namespace deepx::tf { Param("A", DataCategory::Tensor, Precision::Any), Param("scalar", DataCategory::Var, Precision::Any), - }), + }), vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Float32), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Float32), + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Float32), - }))); - } + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Float32), + }))); + } // matmul void register_matmul(TfFactory &tffactory) { @@ -318,15 +318,27 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); - } // // changeshape void register_changeshape(TfFactory &tffactory) { - // opfactory.add_op(Transpose()); - // opfactory.add_op(Reshape()); - // opfactory.add_op(Expand()); - tffactory.add_tf(std::make_shared()); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("shape", DataCategory::Vector, Precision::Int32), + }), + vector())); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dim_order", DataCategory::Vector, Precision::Int32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 4ca0f747..de1f277a 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -1,5 +1,5 @@ -#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_HPP -#define DEEPX_TENSORFUNC_CHANGESHAPE_HPP +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP #include #include @@ -14,7 +14,7 @@ namespace deepx::tensorfunc template struct reshapeDispatcher { - void reshape(Tensor &tensor, Tensor &output, const std::vector &shape) + static void reshape(Tensor &tensor, const std::vector &shape) { // 参数改为单个tensor引用 int new_prod = 1; @@ -27,11 +27,7 @@ namespace deepx::tensorfunc { throw std::invalid_argument("Shape size mismatch"); } - if (tensor.data != output.data) - { - tensorfunc::copytensor(tensor, output); - } - output.shape = Shape(shape); // 直接修改原tensor的shape + tensor.shape = Shape(shape); } }; @@ -177,4 +173,4 @@ namespace deepx::tensorfunc } } } -#endif \ No newline at end of file +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 838f2879..2c9707db 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -1,45 +1,171 @@ #ifndef DEEPX_TF_CHANGESHAPE_HPP 
#define DEEPX_TF_CHANGESHAPE_HPP +#include #include "deepx/tf/tf.hpp" -#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" #include "deepx/dtype.hpp" namespace deepx::tf { - class Concat : public TF + using namespace deepx::tensorfunc; + using namespace std; + + template + class Reshape : public TF { - private: - const string _name="concat"; public: - Concat() + Reshape(const vector &args, const vector &returns) + { + this->name = "reshape"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override { - this->name=_name; + return "T2=T1.reshape(shape)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); } - Concat(string text) + + int run(shared_ptr mem, string &error) override { - this->parse(text); - if (this->name!=_name){ - throw std::runtime_error("Invalid name: "+this->name); + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector shape = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Float32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int16: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int8: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; } + return 0; + } + }; + + template + class Transpose : public TF + { + public: + Transpose(const vector &args, const vector &returns) + { + this->name = "transpose"; + this->author = Author::name(); + this->args = args; + this->returns = returns; } string math_formula() const override { - return "Tresult = concat([T1, T2...], axis=3)"; + return "T2 = T1.transpose(dimorder=[1,0])"; } + + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override { - //TODO,去掉T - // std::vector *> input; - // for (int i = 0; i < this->args.size() - 1; i++) - // { - // input.push_back(mem.gettensor(this->args[i].name).get()); - // } - // auto output = mem.gettensor(this->returns[0].name).get(); - // int axis = this->getvar(-1,mem,false); - // tensorfunc::concat(input, axis, *output); + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dim_order = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + + switch (input_type) + { + case Precision::Float64: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + 
transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + + + + + + + +template + class Concat : public TF + { + public: + Concat(const vector &args, const vector &returns) + { + this->name = _name; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + + string math_formula() const override + { + return "Tresult = concat([T1, T2...], axis=3)"; + } + int run(shared_ptr mem, string &error) override + { + // TODO,去掉T + // std::vector *> input; + // for (int i = 0; i < this->args.size() - 1; i++) + // { + // input.push_back(mem.gettensor(this->args[i].name).get()); + // } + // auto output = mem.gettensor(this->returns[0].name).get(); + // int axis = this->getvar(-1,mem,false); + // tensorfunc::concat(input, axis, *output); return 0; }; shared_ptr clone() const override @@ -65,8 +191,8 @@ namespace deepx::tf // void funcdef() override // { // this->parse("split(float32 T1,int32 3)->(float32 T2,T3)"); - // } - // string math_formula() const override + // } + // string math_formula() const override // { // return "T2,T3 = split(T1, axis=3)"; // } @@ -82,49 +208,6 @@ namespace deepx::tf // tensorfunc::split(*output, axis, input); // } // }; - // template - // class Reshape : public TF - // { - // public: - // Reshape() - // { - // this->init("reshape", "any", {}, {}, false, {}, {}); - // } - // void forward(mem::Mem &mem) override - // { - // auto input = mem.gettensor(this->args[0]).get(); - // auto output = mem.gettensor(this->returns[0]).get(); - // vector shape; - // if (this->args.size() == 2 && !is_integer(this->args[1])) - // { - // shape = mem.getvector(this->args[1]); - // } - // else - // { - // for (int i = 1; i < this->args.size(); i++) - // { - // shape.push_back(atoi(this->args[i].c_str())); - // } - // } - // tensorfunc::reshape(*input, *output, shape); - // } - // void backward(mem::Mem &mem) override - // { - // auto return_grad = mem.gettensor(this->returns_grad[0]).get(); - // auto input_grad = mem.gettensor(this->args_grad[0]).get(); - // auto input = mem.gettensor(this->args[0]).get(); - // vector shape = input->shape.shape; - // tensorfunc::reshape(*return_grad, *input_grad, shape); - // } - // void funcdef() override - // { - // this->init("reshape", "float32", {"T1", "2", "3", "4"}, {"T2"}, false, {}, {}); - // } - // string math_formula() const override - // { - // return "T2 = reshape(T1, [2,3,4])"; - // } - // }; // template // class Transpose : public Op From 731b833015bd0a7f49c28c17a3fd788a6dbf5e65 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 18:28:23 +0800 Subject: [PATCH 5/7] excuter(cpu/cuda):reshape,transpose,concat --- excuter/cpp-common/src/deepx/dtype.hpp | 25 +++- excuter/cpp-common/src/deepx/tf/tf.hpp | 99 +++++++++----- excuter/op-mem-cuda/src/client/tfs.cpp | 10 ++ .../op-mem-cuda/src/deepx/tf/changeshape.hpp | 124 ++++++++++++++++++ excuter/op-mem-ompsimd/src/client/tfs.cpp | 10 ++ 
.../deepx/tensorfunc/changeshape_miaobyte.hpp | 1 + .../src/deepx/tf/changeshape.hpp | 110 ++++++++++++---- 7 files changed, 316 insertions(+), 63 deletions(-) diff --git a/excuter/cpp-common/src/deepx/dtype.hpp b/excuter/cpp-common/src/deepx/dtype.hpp index b93a2a7a..e1ca3627 100644 --- a/excuter/cpp-common/src/deepx/dtype.hpp +++ b/excuter/cpp-common/src/deepx/dtype.hpp @@ -3,9 +3,30 @@ #include #include +#include namespace deepx { + template + T to(const std::string &textvalue) + { + if constexpr (std::is_same_v) + { + return textvalue; + } + else if constexpr (std::is_arithmetic_v) + { + return static_cast(std::stof(textvalue)); + } + else + { + // 对于其他类型,尝试从字符串转换 + T value; + std::istringstream iss(textvalue); + iss >> value; + return value; + } + } enum class DataCategory : uint8_t { @@ -112,7 +133,7 @@ namespace deepx // 布尔类型 (13位) Bool = 1 << 13, // 0010 0000 0000 0000 String = 1 << 15, // 0100 0000 0000 0000 - // 常用组合 + // 常用组合 Any = 0xFFFF, // 1111 1111 1111 1111 Float = Float64 | Float32 | Float16 | BFloat16 | Float8E5M2 | Float8E4M3 | Float4E2M1, Float8 = Float8E5M2 | Float8E4M3, // 所有FP8格式 @@ -230,8 +251,6 @@ namespace deepx return TypeDef(category, precision); } - - // 修改precision_str函数以使用标准命名格式 inline std::string precision_str(Precision p) { diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp index 3425e9a8..90c0d3cb 100644 --- a/excuter/cpp-common/src/deepx/tf/tf.hpp +++ b/excuter/cpp-common/src/deepx/tf/tf.hpp @@ -20,16 +20,17 @@ namespace deepx::tf using mem::MemBase; using namespace std; using namespace std::chrono; - - struct Param { + + struct Param + { TypeDef dtype; string textvalue; any value; - Param(const string& textvalue = "", const DataCategory& dt = DataCategory::Unknown, const Precision& prec = Precision::Any) + Param(const string &textvalue = "", const DataCategory &dt = DataCategory::Unknown, const Precision &prec = Precision::Any) : textvalue(textvalue), dtype(make_dtype(dt, prec)) {} }; - //TF:Tensor Function的缩写 + // TF:Tensor Function的缩写 class TF { public: @@ -42,55 +43,76 @@ namespace deepx::tf system_clock::time_point created_at; system_clock::time_point sent_at; system_clock::time_point recv_at; + public: TF() = default; TF(const TF &) = default; TF(const string text); TF &operator=(const TF &) = default; - + string op_name(); - virtual int run(shared_ptr mem,string &error){ + virtual int run(shared_ptr mem, string &error) + { throw NotImplementError(name); } virtual string math_formula() const; void parse(const string &str); - std::string to_string(bool show_extra=false, bool show_name=true) const; + std::string to_string(bool show_extra = false, bool show_name = true) const; void init(const string &opname, const vector &args, const vector &returns); - template - T getvar(int idx, shared_ptr mem,bool arg=true){ - vector &vars=arg?args:returns; - if(idx<0){ - idx = vars.size()+idx; + template + T getvar(int idx, shared_ptr mem, bool arg = true) + { + vector &vars = arg ? 
args : returns; + if (idx < 0) + { + idx = vars.size() + idx; } - if(idx<0 || idx>=vars.size()){ + if (idx < 0 || idx >= vars.size()) + { throw std::invalid_argument("Invalid argument index"); } - if (is_float(vars[idx].textvalue)){ - T value=T(std::stof(vars[idx].textvalue)); + if (is_float(vars[idx].textvalue)) + { + T value = T(std::stof(vars[idx].textvalue)); return value; } return mem->getarg(vars[idx].textvalue); } - template - vector getvector( int from=0, int to=0,bool arg=true){ - vector &vars=arg?args:returns; - if(from<0){ - from = vars.size()+from; - } - if(to<0){ - to = vars.size()+to; + + + template + vector getvector(int idx,bool arg = true) + { + vector &vars = arg ? args : returns; + if (idx < 0) + { + idx = vars.size() + idx; } - if(from>to){ + if (idx < 0 || idx >= vars.size()) + { throw std::invalid_argument("Invalid argument index"); } + if (idx < 0 || idx >= vars.size()) + { + throw std::invalid_argument("Invalid argument index"); + } + vector result; - for(int i=from;i<=to;i++){ - result.push_back(T(std::stof(vars[i].textvalue))); + string textvalue = vars[idx].textvalue; + if (textvalue.empty()) + { + throw std::invalid_argument("Invalid argument index"); + } + std::stringstream ss(textvalue); + std::string item; + while (std::getline(ss, item, ',')) + { + result.push_back(to(item)); } return result; } @@ -99,7 +121,8 @@ namespace deepx::tf bool check_dtype(const TF &other) const; // 添加虚拟克隆方法 - virtual shared_ptr clone() const { + virtual shared_ptr clone() const + { return make_shared(*this); } }; @@ -113,35 +136,41 @@ namespace deepx::tf system_clock::time_point start_at; system_clock::time_point finish_at; string message; + public: OpResp() = default; OpResp(const OpResp &) = default; OpResp &operator=(const OpResp &) = default; - - std::string to_string() const{ + + std::string to_string() const + { std::stringstream stream; stream << id << " " << result; stream << "// recv_at="; stream << duration_cast(recv_at.time_since_epoch()).count(); stream << " start_at="; stream << duration_cast(start_at.time_since_epoch()).count(); - stream << " finish_at="; + stream << " finish_at="; stream << duration_cast(finish_at.time_since_epoch()).count(); - if (message.size()>0){ - stream << " "<< message; + if (message.size() > 0) + { + stream << " " << message; } return stream.str(); } - void init(int id,system_clock::time_point recv_at){ + void init(int id, system_clock::time_point recv_at) + { this->id = id; this->recv_at = recv_at; } - void finish(const string &message){ + void finish(const string &message) + { this->result = "ok"; this->finish_at = system_clock::now(); this->message = message; } - void error(const string &message){ + void error(const string &message) + { this->result = "error"; this->finish_at = system_clock::now(); this->message = message; diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 109847c9..c0d9bd9c 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -348,6 +348,16 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("tensors", DataCategory::ListTensor, Precision::Any), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("result", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp 
b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp index 551f7d51..68d03767 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp @@ -139,5 +139,129 @@ namespace deepx::tf } }; + template + class Concat : public TF + { + public: + Concat(const vector &args, const vector &returns) + { + this->name = "concat"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "Tresult = concat([T1, T2...], axis=3)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + vector tensor_names = this->getvector(0, true); + Precision input_type = mem->gettensor(tensor_names[0]).get()->shape.dtype; + int axis = this->getvar(1, mem, false); + switch (input_type) + { + case Precision::Float64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Float32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Float16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::BFloat16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int8: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + + return 0; + }; + }; + } #endif // DEEPX_TF_CHANGESHAPE_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index cdf7ac6c..57e2c206 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -339,6 +339,16 @@ namespace deepx::tf { Param("C", 
DataCategory::Tensor, Precision::Any), }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("tensors", DataCategory::ListTensor, Precision::Any), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("result", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index de1f277a..c03f3c0d 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -7,6 +7,7 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/shape_concat.hpp" #include "deepx/shape_broadcast.hpp" #include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 2c9707db..fa125766 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -76,7 +76,7 @@ namespace deepx::tf { public: Transpose(const vector &args, const vector &returns) - { + { this->name = "transpose"; this->author = Author::name(); this->args = args; @@ -103,7 +103,7 @@ namespace deepx::tf error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); return 1; } - + switch (input_type) { case Precision::Float64: @@ -132,46 +132,106 @@ namespace deepx::tf } }; - - - - - - -template + template class Concat : public TF { public: Concat(const vector &args, const vector &returns) { - this->name = _name; - this->author = Author::name(); + this->name = "concat"; + this->author = Author::name(); this->args = args; this->returns = returns; } - string math_formula() const override { return "Tresult = concat([T1, T2...], axis=3)"; } - int run(shared_ptr mem, string &error) override - { - // TODO,去掉T - // std::vector *> input; - // for (int i = 0; i < this->args.size() - 1; i++) - // { - // input.push_back(mem.gettensor(this->args[i].name).get()); - // } - // auto output = mem.gettensor(this->returns[0].name).get(); - // int axis = this->getvar(-1,mem,false); - // tensorfunc::concat(input, axis, *output); - return 0; - }; shared_ptr clone() const override { return make_shared(*this); } + int run(shared_ptr mem, string &error) override + { + vector tensor_names = this->getvector(0, true); + Precision input_type = mem->gettensor(tensor_names[0]).get()->shape.dtype; + int axis = this->getvar(1, mem, false); + switch (input_type) + { + case Precision::Float64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Float32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, 
*output); + break; + } + case Precision::Int32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int8: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + + return 0; + }; }; // class Split : public TF From 72e5334d5c7524fb06972ac2045fe69fe1af5acb Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 19:29:28 +0800 Subject: [PATCH 6/7] front:Apply fix,requires_grad=requires_grad --- doc/excuter/op-mem-cuda/list.md | 3 + doc/excuter/op-mem-ompsimd/list.md | 60 +++---- front/py/deepx/nn/functional/elementwise.py | 32 ++-- front/py/examples/2_ir/1_init_zeroones.dot | 76 ++++----- .../py/examples/2_ir/1_init_zeroones.dot.svg | 152 +++++++++--------- 5 files changed, 164 insertions(+), 159 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 5967d738..09937bd1 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -4,6 +4,9 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| concat | miaobyte | concat(listtensor tensors, var axis)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var axis)->(tensor result) | +| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | reshape(tensor A, vector shape)->() | T2=T1.reshape(shape) | reshape(tensor A, vector shape)->() | | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index b4ab6cd7..32aadf9d 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -4,36 +4,38 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| -| concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() | -| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | -| min | miaobyte | min(tensor A, tensor 
B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | -| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | -| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | -| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | -| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | -| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | -| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | -| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | +| concat | miaobyte | concat(listtensor tensors, var axis)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var axis)->(tensor result) | +| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| comparescalar | miaobyte | comparescalar(tensor A, var 
scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | +| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| reshape | miaobyte | reshape(tensor A, vector shape)->() | T2=T1.reshape(shape) | reshape(tensor A, vector shape)->() | +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | +| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | +| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | +| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | +| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | +| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | +| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | +| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | +| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | +| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | +| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| matmul | miaobyte | 
matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index e6f8d326..358c88d0 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -125,9 +125,9 @@ def add( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Add.apply(a,b,out,author,requires_grad) + return Add.apply(a,b,out,author,requires_grad=requires_grad) else: - return AddScalar.apply(a,b,out,author,requires_grad) + return AddScalar.apply(a,b,out,author,requires_grad=requires_grad) #sub @@ -157,9 +157,9 @@ def sub( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Sub.apply(a,b,out,author,requires_grad) + return Sub.apply(a,b,out,author,requires_grad=requires_grad) else: - return SubScalar.apply(a,b,out,author,requires_grad) + return SubScalar.apply(a,b,out,author,requires_grad=requires_grad) #mul OpNode.register("mul") @@ -195,7 +195,7 @@ def mul( if isinstance(b,Tensor): return Mul.apply(a,b,out,author,requires_grad) else: - return MulScalar.apply(a,b,out,author,requires_grad) + return MulScalar.apply(a,b,out,author,requires_grad=requires_grad) #div @@ -248,10 +248,10 @@ def div( else: if isinstance(a,Tensor): #C=A/b - return DivScalar.apply(a,b,"divscalar",out,author,requires_grad) + return DivScalar.apply(a,b,"divscalar",out,author,requires_grad=requires_grad) else: #C=a/B - return RDivScalar.apply(a,b,"rdivscalar",out,author,requires_grad) + return RDivScalar.apply(a,b,"rdivscalar",out,author,requires_grad=requires_grad) OpNode.register("compare") class Compare(Function): @@ -301,7 +301,7 @@ def max( if isinstance(b,int) or isinstance(b,float): return MaxScalar.apply(a,b,"maxscalar",out,author,requires_grad) else: - return Max.apply(a,b,"max",out,author,requires_grad) + return Max.apply(a,b,"max",out,author,requires_grad=requires_grad) OpNode.register("min") @@ -337,9 +337,9 @@ def min( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return MinScalar.apply(a,b,"minscalar",out,author,requires_grad) + return MinScalar.apply(a,b,"minscalar",out,author,requires_grad=requires_grad) else: - return Min.apply(a,b,"min",out,author,requires_grad) + return Min.apply(a,b,"min",out,author,requires_grad=requires_grad) #clamp,TODO @@ -362,7 +362,7 @@ def sqrt( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Sqrt.apply(input,out,author,requires_grad) + return Sqrt.apply(input,out,author,requires_grad=requires_grad) OpNode.register("pow") class Pow(Function): @@ -397,9 +397,9 @@ def pow( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return PowScalar.apply(a,b,out,author,requires_grad) + return PowScalar.apply(a,b,out,author,requires_grad=requires_grad) else: - return Pow.apply(a,b,out,author,requires_grad) + return Pow.apply(a,b,out,author,requires_grad=requires_grad) #exp OpNode.register("exp") @@ -420,7 +420,7 @@ def exp( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Exp.apply(a,out,author,requires_grad) + return Exp.apply(a,out,author,requires_grad=requires_grad) #log OpNode.register("log") class Log(Function): @@ -440,7 +440,7 @@ def log( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Log.apply(a,out,author,requires_grad) + return 
Log.apply(a,out,author,requires_grad=requires_grad) OpNode.register("rsqrt") class Rsqrt(Function): @@ -460,7 +460,7 @@ def rsqrt( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Rsqrt.apply(input,out,author,requires_grad) + return Rsqrt.apply(input,out,author,requires_grad=requires_grad) diff --git a/front/py/examples/2_ir/1_init_zeroones.dot b/front/py/examples/2_ir/1_init_zeroones.dot index 33d54af8..55722399 100644 --- a/front/py/examples/2_ir/1_init_zeroones.dot +++ b/front/py/examples/2_ir/1_init_zeroones.dot @@ -2,55 +2,55 @@ digraph { rankdir=TB node [shape=record] - 132815942520016 [label="t1 + 129052233125168 [label="t1 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813646230768 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132814271881056 [label="var_1 + 129049935907280 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049975249392 [label="var_1 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298272 [label="t2 + 129049975250448 [label="t2 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298464 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645298080 [label="var_2 + 129049935907424 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935907568 [label="var_2 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298704 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645298512 [label="t3 + 129049935907472 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935907760 [label="t3 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298800 [label="t4 + 129049935907616 [label="t4 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645299136 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645299088 [label="var_3 + 129049935907952 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935908240 [label="var_3 0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298944 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645299424 [label="t5 + 129049935908432 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935908144 [label="t5 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645299664 [label="tensor_6 + 129049935908576 [label="tensor_6 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293616 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645293664 
[label="var_4 + 129049935908624 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935908912 [label="var_4 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293280 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645299616 [label="var_5 + 129049935909056 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935909008 [label="var_5 -0.5477225575051661" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293568 [label="var_6 + 129049935909200 [label="var_6 0.5477225575051661" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293424 [label="var_7 + 129049935909152 [label="var_7 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813646230768 -> 132815942520016 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132814271881056 -> 132813646230768 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298464 -> 132813645298272 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298080 -> 132813645298464 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132815942520016 -> 132813645298704 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298272 -> 132813645298704 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298704 -> 132813645298512 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645299136 -> 132813645298800 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645299088 -> 132813645299136 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298800 -> 132813645298944 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298512 -> 132813645298944 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298944 -> 132813645299424 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293616 -> 132813645299664 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293280 -> 132813645299664 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293664 -> 132813645293616 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645299616 -> 132813645293280 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293568 -> 132813645293280 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293424 -> 132813645293280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907280 -> 129052233125168 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049975249392 -> 129049935907280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907424 -> 129049975250448 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907568 -> 129049935907424 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129052233125168 -> 129049935907472 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049975250448 -> 129049935907472 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907472 -> 129049935907760 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907952 -> 129049935907616 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935908240 -> 129049935907952 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907616 -> 129049935908432 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907760 -> 129049935908432 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935908432 -> 129049935908144 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935908624 -> 129049935908576 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909056 -> 129049935908576 [arrowsize=0.8 
color=gray40 penwidth=1.2] + 129049935908912 -> 129049935908624 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909008 -> 129049935909056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909200 -> 129049935909056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909152 -> 129049935909056 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/1_init_zeroones.dot.svg b/front/py/examples/2_ir/1_init_zeroones.dot.svg index 474bbbd8..b5bf6542 100644 --- a/front/py/examples/2_ir/1_init_zeroones.dot.svg +++ b/front/py/examples/2_ir/1_init_zeroones.dot.svg @@ -9,244 +9,244 @@ %3 - + -132815942520016 +129052233125168 t1 (3, 4, 5) - + -132813645298704 +129049935907472 add - + -132815942520016->132813645298704 +129052233125168->129049935907472 - + -132813646230768 +129049935907280 constant - + -132813646230768->132815942520016 +129049935907280->129052233125168 - + -132814271881056 +129049975249392 var_1 0 - + -132814271881056->132813646230768 +129049975249392->129049935907280 - + -132813645298272 +129049975250448 t2 (3, 4, 5) - + -132813645298272->132813645298704 +129049975250448->129049935907472 - + -132813645298464 +129049935907424 constant - + -132813645298464->132813645298272 +129049935907424->129049975250448 - + -132813645298080 +129049935907568 var_2 1 - + -132813645298080->132813645298464 +129049935907568->129049935907424 - + -132813645298512 +129049935907760 t3 (3, 4, 5) - + -132813645298704->132813645298512 +129049935907472->129049935907760 - + -132813645298944 +129049935908432 add - + -132813645298512->132813645298944 +129049935907760->129049935908432 - + -132813645298800 +129049935907616 t4 (3, 4, 5) - + -132813645298800->132813645298944 +129049935907616->129049935908432 - + -132813645299136 +129049935907952 constant - + -132813645299136->132813645298800 +129049935907952->129049935907616 - + -132813645299088 +129049935908240 var_3 0.5 - + -132813645299088->132813645299136 +129049935908240->129049935907952 - + -132813645299424 +129049935908144 t5 (3, 4, 5) - + -132813645298944->132813645299424 +129049935908432->129049935908144 - + -132813645299664 +129049935908576 tensor_6 (3, 4, 5) - + -132813645293616 +129049935908624 constant - + -132813645293616->132813645299664 +129049935908624->129049935908576 - + -132813645293664 +129049935908912 var_4 0 - + -132813645293664->132813645293616 +129049935908912->129049935908624 - + -132813645293280 +129049935909056 uniform - + -132813645293280->132813645299664 +129049935909056->129049935908576 - + -132813645299616 +129049935909008 var_5 -0.5477225575051661 - + -132813645299616->132813645293280 +129049935909008->129049935909056 - + -132813645293568 +129049935909200 var_6 0.5477225575051661 - + -132813645293568->132813645293280 +129049935909200->129049935909056 - + -132813645293424 +129049935909152 var_7 0 - + -132813645293424->132813645293280 +129049935909152->129049935909056 From fd2ed6aa9e1baf4b0575980e24370cf62d3e66b6 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 19:41:28 +0800 Subject: [PATCH 7/7] front:Apply --- front/py/deepx/autograd/function.py | 4 + front/py/deepx/nn/functional/elementwise.py | 20 +- front/py/examples/2_ir/2_elementwise_add.dot | 40 ++-- .../examples/2_ir/2_elementwise_add.dot.svg | 80 ++++---- .../examples/2_ir/2_elementwise_operator.dot | 92 ++++----- .../2_ir/2_elementwise_operator.dot.svg | 184 +++++++++--------- .../examples/2_ir/2_elementwise_sqrtlog.dot | 46 ++--- .../2_ir/2_elementwise_sqrtlog.dot.svg | 92 ++++----- front/py/examples/2_ir/3_matmul.dot 
| 30 +-- front/py/examples/2_ir/3_matmul.dot.svg | 60 +++--- 10 files changed, 326 insertions(+), 322 deletions(-) diff --git a/front/py/deepx/autograd/function.py b/front/py/deepx/autograd/function.py index e9f5ff06..98f63def 100644 --- a/front/py/deepx/autograd/function.py +++ b/front/py/deepx/autograd/function.py @@ -18,6 +18,10 @@ def save_data(self, key, value): def get_data(self, key): return self._non_tensor_data.get(key) + @property + def requires_grad(self): + return self._requires_grad + class Function: @staticmethod def forward(ctx:Context, *args, **kwargs): diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index 358c88d0..7b524afc 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -193,7 +193,7 @@ def mul( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Mul.apply(a,b,out,author,requires_grad) + return Mul.apply(a,b,out,author,requires_grad=requires_grad) else: return MulScalar.apply(a,b,out,author,requires_grad=requires_grad) @@ -228,10 +228,10 @@ def backward(ctx:Context, out_grad): OpNode.register("rdivscalar") class RDivScalar(Function): @staticmethod - def forward(ctx:Context, a, b,out,author='miaobyte'): + def forward(ctx:Context, a,b,out,author='miaobyte'): if ctx.requires_grad: ctx.save_data('b',b) - return _A_b_elementwiseop_C(a, b, "rdivscalar", out,author) + return _a_B_elementwiseop_C(a, b, "rdivscalar", out,author) @staticmethod def backward(ctx:Context, out_grad): @@ -244,14 +244,14 @@ def div( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor) and isinstance(a,Tensor): - return Div.apply(a,b,out,author,requires_grad) + return Div.apply(a,b,out,author,requires_grad=requires_grad) else: if isinstance(a,Tensor): #C=A/b - return DivScalar.apply(a,b,"divscalar",out,author,requires_grad=requires_grad) + return DivScalar.apply(a,b,out,author,requires_grad=requires_grad) else: #C=a/B - return RDivScalar.apply(a,b,"rdivscalar",out,author,requires_grad=requires_grad) + return RDivScalar.apply(a,b,out,author,requires_grad=requires_grad) OpNode.register("compare") class Compare(Function): @@ -299,9 +299,9 @@ def max( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return MaxScalar.apply(a,b,"maxscalar",out,author,requires_grad) + return MaxScalar.apply(a,b,out,author,requires_grad) else: - return Max.apply(a,b,"max",out,author,requires_grad=requires_grad) + return Max.apply(a,b,out,author,requires_grad=requires_grad) OpNode.register("min") @@ -337,9 +337,9 @@ def min( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return MinScalar.apply(a,b,"minscalar",out,author,requires_grad=requires_grad) + return MinScalar.apply(a,b,out,author,requires_grad=requires_grad) else: - return Min.apply(a,b,"min",out,author,requires_grad=requires_grad) + return Min.apply(a,b,out,author,requires_grad=requires_grad) #clamp,TODO diff --git a/front/py/examples/2_ir/2_elementwise_add.dot b/front/py/examples/2_ir/2_elementwise_add.dot index 02ebe50a..1c2ae8c9 100644 --- a/front/py/examples/2_ir/2_elementwise_add.dot +++ b/front/py/examples/2_ir/2_elementwise_add.dot @@ -2,30 +2,30 @@ digraph { rankdir=TB node [shape=record] - 124025483544560 [label="t1 + 125643920431552 [label="t1 (2, 3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089136 [label=constant 
color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124025483558240 [label="var_1 + 125643920431744 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643920432608 [label="var_1 10" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224086304 [label="t2 + 125645612091328 [label="t2 (2, 3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089184 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124023224085296 [label="var_2 + 125643918940608 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643918940416 [label="var_2 10" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224088848 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124023224089376 [label="t3 + 125643918940464 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643918940272 [label="t3 (2, 3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089568 [label=addscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124023224089520 [label="var_3 + 125643918940128 [label=addscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643918939936 [label="var_3 0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089136 -> 124025483544560 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124025483558240 -> 124023224089136 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089184 -> 124023224086304 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224085296 -> 124023224089184 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124025483544560 -> 124023224088848 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224086304 -> 124023224088848 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224088848 -> 124023224089376 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089568 -> 124023224089376 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089376 -> 124023224089568 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089520 -> 124023224089568 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643920431744 -> 125643920431552 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643920432608 -> 125643920431744 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940608 -> 125645612091328 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940416 -> 125643918940608 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643920431552 -> 125643918940464 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125645612091328 -> 125643918940464 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940464 -> 125643918940272 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940128 -> 125643918940272 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940272 -> 125643918940128 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918939936 -> 125643918940128 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/2_elementwise_add.dot.svg b/front/py/examples/2_ir/2_elementwise_add.dot.svg index 16598880..86968ef0 100644 
--- a/front/py/examples/2_ir/2_elementwise_add.dot.svg +++ b/front/py/examples/2_ir/2_elementwise_add.dot.svg @@ -9,129 +9,129 @@ %3 - + -124025483544560 +125643920431552 t1 (2, 3, 4) - + -124023224088848 +125643918940464 add - + -124025483544560->124023224088848 +125643920431552->125643918940464 - + -124023224089136 +125643920431744 constant - + -124023224089136->124025483544560 +125643920431744->125643920431552 - + -124025483558240 +125643920432608 var_1 10 - + -124025483558240->124023224089136 +125643920432608->125643920431744 - + -124023224086304 +125645612091328 t2 (2, 3, 4) - + -124023224086304->124023224088848 +125645612091328->125643918940464 - + -124023224089184 +125643918940608 constant - + -124023224089184->124023224086304 +125643918940608->125645612091328 - + -124023224085296 +125643918940416 var_2 10 - + -124023224085296->124023224089184 +125643918940416->125643918940608 - + -124023224089376 +125643918940272 t3 (2, 3, 4) - + -124023224088848->124023224089376 +125643918940464->125643918940272 - + -124023224089568 +125643918940128 addscalar - + -124023224089376->124023224089568 +125643918940272->125643918940128 - + -124023224089568->124023224089376 +125643918940128->125643918940272 - + -124023224089520 +125643918939936 var_3 0.5 - + -124023224089520->124023224089568 +125643918939936->125643918940128 diff --git a/front/py/examples/2_ir/2_elementwise_operator.dot b/front/py/examples/2_ir/2_elementwise_operator.dot index b39fa214..46fb2a94 100644 --- a/front/py/examples/2_ir/2_elementwise_operator.dot +++ b/front/py/examples/2_ir/2_elementwise_operator.dot @@ -2,63 +2,63 @@ digraph { rankdir=TB node [shape=record] - 134854829346096 [label="t1 + 130352998993200 [label="t1 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854521156512 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854521844832 [label="var_1 + 130350739524192 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350741117520 [label="var_1 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462386816 [label="t2 + 130350741118576 [label="t2 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387008 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462386624 [label="var_2 + 130350739524336 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739524480 [label="var_2 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387248 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462387056 [label="t3 + 130350739524384 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739524672 [label="t3 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387344 [label="t4 + 130350739524528 [label="t4 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387680 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462387632 
[label="var_3 + 130350739524864 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525152 [label="var_3 0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387488 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462387776 [label="t5 + 130350739525104 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525296 [label="t5 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388016 [label="t6 + 130350739525440 [label="t6 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388400 [label=div color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462388256 [label=rdivscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462388352 [label="var_4 + 130350739525632 [label=div color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525776 [label=rdivscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525968 [label="var_4 0.05" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388688 [label="t7 + 130350739526016 [label="t7 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388832 [label=mulscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462388880 [label="var_5 + 130350739526304 [label=mulscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739526352 [label="var_5 2.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388736 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462389168 [label="t8 + 130350739526544 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739526256 [label="t8 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854521156512 -> 134854829346096 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854521844832 -> 134854521156512 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387008 -> 134854462386816 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386624 -> 134854462387008 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854829346096 -> 134854462387248 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462387248 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387248 -> 134854462387056 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387680 -> 134854462387344 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387632 -> 134854462387680 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387344 -> 134854462387488 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387056 -> 134854462387488 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387488 -> 134854462387776 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388400 -> 134854462388016 [arrowsize=0.8 
color=gray40 penwidth=1.2] - 134854829346096 -> 134854462388400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462388400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388352 -> 134854462388256 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462388256 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388256 -> 134854462388688 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388832 -> 134854462388688 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388688 -> 134854462388832 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388880 -> 134854462388832 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388688 -> 134854462388736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462388736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388736 -> 134854462389168 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524192 -> 130352998993200 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741117520 -> 130350739524192 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524336 -> 130350741118576 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524480 -> 130350739524336 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130352998993200 -> 130350739524384 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739524384 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524384 -> 130350739524672 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524864 -> 130350739524528 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525152 -> 130350739524864 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524528 -> 130350739525104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524672 -> 130350739525104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525104 -> 130350739525296 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525632 -> 130350739525440 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130352998993200 -> 130350739525632 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739525632 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525968 -> 130350739525776 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739525776 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525776 -> 130350739526016 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526304 -> 130350739526016 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526016 -> 130350739526304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526352 -> 130350739526304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526016 -> 130350739526544 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739526544 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526544 -> 130350739526256 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/2_elementwise_operator.dot.svg b/front/py/examples/2_ir/2_elementwise_operator.dot.svg index 1c50be16..bc9389b5 100644 --- a/front/py/examples/2_ir/2_elementwise_operator.dot.svg +++ b/front/py/examples/2_ir/2_elementwise_operator.dot.svg @@ -9,292 +9,292 @@ %3 - + -134854829346096 +130352998993200 t1 (3, 4, 5) - + -134854462387248 +130350739524384 add - + -134854829346096->134854462387248 +130352998993200->130350739524384 - + -134854462388400 +130350739525632 div - + -134854829346096->134854462388400 +130352998993200->130350739525632 - + -134854521156512 +130350739524192 constant - + -134854521156512->134854829346096 +130350739524192->130352998993200 - + -134854521844832 +130350741117520 var_1 0 - + 
-134854521844832->134854521156512 +130350741117520->130350739524192 - + -134854462386816 +130350741118576 t2 (3, 4, 5) - + -134854462386816->134854462387248 +130350741118576->130350739524384 - + -134854462386816->134854462388400 +130350741118576->130350739525632 - + -134854462388256 +130350739525776 rdivscalar - + -134854462386816->134854462388256 +130350741118576->130350739525776 - + -134854462388736 +130350739526544 mul - + -134854462386816->134854462388736 +130350741118576->130350739526544 - + -134854462387008 +130350739524336 constant - + -134854462387008->134854462386816 +130350739524336->130350741118576 - + -134854462386624 +130350739524480 var_2 1 - + -134854462386624->134854462387008 +130350739524480->130350739524336 - + -134854462387056 +130350739524672 t3 (3, 4, 5) - + -134854462387248->134854462387056 +130350739524384->130350739524672 - + -134854462387488 +130350739525104 add - + -134854462387056->134854462387488 +130350739524672->130350739525104 - + -134854462387344 +130350739524528 t4 (3, 4, 5) - + -134854462387344->134854462387488 +130350739524528->130350739525104 - + -134854462387680 +130350739524864 constant - + -134854462387680->134854462387344 +130350739524864->130350739524528 - + -134854462387632 +130350739525152 var_3 0.5 - + -134854462387632->134854462387680 +130350739525152->130350739524864 - + -134854462387776 +130350739525296 t5 (3, 4, 5) - + -134854462387488->134854462387776 +130350739525104->130350739525296 - + -134854462388016 +130350739525440 t6 (3, 4, 5) - + -134854462388400->134854462388016 +130350739525632->130350739525440 - + -134854462388688 +130350739526016 t7 (3, 4, 5) - + -134854462388256->134854462388688 +130350739525776->130350739526016 - + -134854462388352 +130350739525968 var_4 0.05 - + -134854462388352->134854462388256 +130350739525968->130350739525776 - + -134854462388832 +130350739526304 mulscalar - + -134854462388688->134854462388832 +130350739526016->130350739526304 - + -134854462388688->134854462388736 +130350739526016->130350739526544 - + -134854462388832->134854462388688 +130350739526304->130350739526016 - + -134854462388880 +130350739526352 var_5 2.5 - + -134854462388880->134854462388832 +130350739526352->130350739526304 - + -134854462389168 +130350739526256 t8 (3, 4, 5) - + -134854462388736->134854462389168 +130350739526544->130350739526256 diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot index 4b3d20f4..fa272de2 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot @@ -2,34 +2,34 @@ digraph { rankdir=TB node [shape=record] - 136548958820992 [label="t1 + 127569128262912 [label="t1 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136551216711568 [label="t2 + 127569128261520 [label="t2 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919477104 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477152 [label="var_1 + 127566868400736 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868400688 [label="var_1 2" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919476960 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477248 [label="t3 + 
127566868400880 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868401024 [label="t3 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919477728 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477632 [label="t4 + 127566868401456 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868401360 [label="t4 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919478064 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477968 [label="t5 + 127566868401792 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868401696 [label="t5 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919478400 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919478304 [label="t6 + 127566868402128 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868402032 [label="t6 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919477104 -> 136551216711568 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477152 -> 136548919477104 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548958820992 -> 136548919476960 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919476960 -> 136548919477248 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136551216711568 -> 136548919477728 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477728 -> 136548919477632 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477632 -> 136548919478064 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919478064 -> 136548919477968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477968 -> 136548919478400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477248 -> 136548919478400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919478400 -> 136548919478304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868400736 -> 127569128261520 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868400688 -> 127566868400736 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127569128262912 -> 127566868400880 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868400880 -> 127566868401024 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127569128261520 -> 127566868401456 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401456 -> 127566868401360 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401360 -> 127566868401792 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401792 -> 127566868401696 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401696 -> 127566868402128 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401024 -> 127566868402128 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868402128 -> 127566868402032 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg index a688a2a1..3ca8df83 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg @@ -9,148 +9,148 @@ %3 - + 
-136548958820992 +127569128262912 t1 (60,) - + -136548919476960 +127566868400880 sqrt - + -136548958820992->136548919476960 +127569128262912->127566868400880 - + -136551216711568 +127569128261520 t2 (60,) - + -136548919477728 +127566868401456 log - + -136551216711568->136548919477728 +127569128261520->127566868401456 - + -136548919477104 +127566868400736 constant - + -136548919477104->136551216711568 +127566868400736->127569128261520 - + -136548919477152 +127566868400688 var_1 2 - + -136548919477152->136548919477104 +127566868400688->127566868400736 - + -136548919477248 +127566868401024 t3 (60,) - + -136548919476960->136548919477248 +127566868400880->127566868401024 - + -136548919478400 +127566868402128 pow - + -136548919477248->136548919478400 +127566868401024->127566868402128 - + -136548919477632 +127566868401360 t4 (60,) - + -136548919477728->136548919477632 +127566868401456->127566868401360 - + -136548919478064 +127566868401792 exp - + -136548919477632->136548919478064 +127566868401360->127566868401792 - + -136548919477968 +127566868401696 t5 (60,) - + -136548919478064->136548919477968 +127566868401792->127566868401696 - + -136548919477968->136548919478400 +127566868401696->127566868402128 - + -136548919478304 +127566868402032 t6 (60,) - + -136548919478400->136548919478304 +127566868402128->127566868402032 diff --git a/front/py/examples/2_ir/3_matmul.dot b/front/py/examples/2_ir/3_matmul.dot index f44682c8..232ef59c 100644 --- a/front/py/examples/2_ir/3_matmul.dot +++ b/front/py/examples/2_ir/3_matmul.dot @@ -2,24 +2,24 @@ digraph { rankdir=TB node [shape=record] - 135996949875968 [label="t1 + 137036194614528 [label="t1 (3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994975499600 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135994976203776 [label="var_1 + 137033935129088 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137033936738800 [label="var_1 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994974384672 [label="t2 + 137033936739856 [label="t2 (4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994974384864 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135994974384480 [label="var_2 + 137033935129232 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137033935129376 [label="var_2 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994974385104 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135994974385152 [label="tensor_3 + 137033935129280 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137033935129472 [label="tensor_3 (3, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994975499600 -> 135996949875968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994976203776 -> 135994975499600 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974384864 -> 135994974384672 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974384480 -> 135994974384864 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135996949875968 -> 135994974385104 
[arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974384672 -> 135994974385104 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974385104 -> 135994974385152 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129088 -> 137036194614528 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033936738800 -> 137033935129088 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129232 -> 137033936739856 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129376 -> 137033935129232 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137036194614528 -> 137033935129280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033936739856 -> 137033935129280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129280 -> 137033935129472 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/3_matmul.dot.svg b/front/py/examples/2_ir/3_matmul.dot.svg index 3e1d97eb..d14a0efd 100644 --- a/front/py/examples/2_ir/3_matmul.dot.svg +++ b/front/py/examples/2_ir/3_matmul.dot.svg @@ -9,98 +9,98 @@ %3 - + -135996949875968 +137036194614528 t1 (3, 4) - + -135994974385104 +137033935129280 matmul - + -135996949875968->135994974385104 +137036194614528->137033935129280 - + -135994975499600 +137033935129088 constant - + -135994975499600->135996949875968 +137033935129088->137036194614528 - + -135994976203776 +137033936738800 var_1 1 - + -135994976203776->135994975499600 +137033936738800->137033935129088 - + -135994974384672 +137033936739856 t2 (4, 5) - + -135994974384672->135994974385104 +137033936739856->137033935129280 - + -135994974384864 +137033935129232 constant - + -135994974384864->135994974384672 +137033935129232->137033936739856 - + -135994974384480 +137033935129376 var_2 1 - + -135994974384480->135994974384864 +137033935129376->137033935129232 - + -135994974385152 +137033935129472 tensor_3 (3, 5) - + -135994974385104->135994974385152 +137033935129280->137033935129472
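
Editor's note on the autograd change in this series: PATCH 7/7 adds a read-only requires_grad property to Context in front/py/deepx/autograd/function.py, and the elementwise.py hunks rewrite call sites from positional requires_grad to requires_grad=requires_grad. Below is a minimal sketch of that pattern, not the real deepx implementation: the Context constructor signature and the keyword-only apply() shown here are assumptions made purely to illustrate why the keyword form is safer (a trailing positional flag can no longer be absorbed by a later positional parameter such as author).

class Context:
    def __init__(self, requires_grad=False):
        self._non_tensor_data = {}
        self._requires_grad = requires_grad

    def save_data(self, key, value):
        self._non_tensor_data[key] = value

    def get_data(self, key):
        return self._non_tensor_data.get(key)

    @property
    def requires_grad(self):
        # mirrors the property added in PATCH 7/7
        return self._requires_grad

class Function:
    @staticmethod
    def forward(ctx, *args, **kwargs):
        raise NotImplementedError

    @staticmethod
    def backward(ctx, out_grad):
        raise NotImplementedError

    @classmethod
    def apply(cls, *args, requires_grad=False, **kwargs):
        # requires_grad is keyword-only here (assumption): a call like
        # Add.apply(a, b, out, author, flag) cannot silently bind flag to a
        # positional parameter of forward(), which is the failure mode the
        # requires_grad=requires_grad rewrites in this patch guard against.
        ctx = Context(requires_grad=requires_grad)
        return cls.forward(ctx, *args, **kwargs)

class SubScalar(Function):
    @staticmethod
    def forward(ctx, a, scalar, out='', author='miaobyte'):
        if ctx.requires_grad:
            ctx.save_data('scalar', scalar)  # stash the non-tensor operand for backward
        return a - scalar                    # placeholder for the real elementwise kernel

    @staticmethod
    def backward(ctx, out_grad):
        return out_grad                      # d(a - scalar)/da = 1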
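
The functional wrappers touched in front/py/deepx/nn/functional/elementwise.py dispatch on operand types: Tensor-Tensor pairs go to Add/Sub/Mul/Div/Max/Min/Pow, a Tensor paired with a Python scalar goes to the *Scalar variants, and a scalar divided by a Tensor goes to RDivScalar (PATCH 7/7 also reroutes it through _a_B_elementwiseop_C instead of _A_b_elementwiseop_C). A hedged usage sketch follows; the import path and the Tensor(shape=...) constructor are assumptions based on the operator table above, not verified API.

from deepx import Tensor                  # assumed import path
import deepx.nn.functional as F           # assumed re-export of elementwise ops

a = Tensor(shape=[3, 4, 5])
b = Tensor(shape=[3, 4, 5])

t3 = F.add(a, b, requires_grad=True)      # Tensor + Tensor -> Add
t4 = F.add(t3, 0.5, requires_grad=True)   # Tensor + scalar -> AddScalar
t5 = F.div(a, b)                          # Tensor / Tensor -> Div
t6 = F.div(a, 0.05)                       # Tensor / scalar -> DivScalar
t7 = F.div(0.05, a)                       # scalar / Tensor -> RDivScalar
t8 = F.mul(t7, b)                         # Tensor * Tensor -> Mul

This mirrors the graph built by examples/2_ir/2_elementwise_operator.dot, whose node-id-only changes make up most of the .dot and .svg hunks in this series.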