From ab95117a1b69ae803814206c5a756a8fdb9b4cb1 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Mon, 31 Mar 2025 18:53:51 +0800 Subject: [PATCH] front&excuter:sqrt,pow,powscalar,log,exp --- doc/excuter/op-mem-cuda/list.md | 5 + doc/excuter/op-mem-ompsimd/list.md | 5 + .../src/deepx/tensorfunc/elementwise.hpp | 2 +- excuter/op-mem-cuda/src/client/tfs.cpp | 118 ++++-- .../tensorfunc/elementwise_miaobyte_sin.hpp.a | 61 +++ .../tensorfunc/elementwise_miaobyte_sqrt.cu | 188 +++++++++ .../tensorfunc/elementwise_miaobyte_sqrt.cuh | 169 ++++++++ .../tensorfunc/elementwise_miaobyte_sqrt.hpp | 88 ++++ .../src/deepx/tf/elementwise_sqrt.hpp | 378 ++++++++++++++++++ excuter/op-mem-ompsimd/src/client/tfs.cpp | 173 ++++---- .../deepx/tensorfunc/elementwise_miaobyte.hpp | 153 ++++--- .../src/deepx/tf/elementwise.hpp | 303 +++++++++++++- front/py/deepx/nn/functional/__init__.py | 2 +- front/py/deepx/nn/functional/elementwise.py | 30 +- .../examples/2_ir/2_elementwise_sqrtlog.dot | 35 ++ .../2_ir/2_elementwise_sqrtlog.dot.svg | 158 ++++++++ .../py/examples/2_ir/2_elementwise_sqrtlog.py | 38 ++ 17 files changed, 1709 insertions(+), 197 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp create mode 100644 excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp create mode 100644 front/py/examples/2_ir/2_elementwise_sqrtlog.dot create mode 100644 front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg create mode 100644 front/py/examples/2_ir/2_elementwise_sqrtlog.py diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 27bdd297..a314b05d 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -5,11 +5,15 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var 
shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | @@ -19,6 +23,7 @@ | constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | | arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 6e878c3a..47325905 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -7,11 +7,15 @@ | concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() | | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | | sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | @@ -21,6 +25,7 @@ | constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | | arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | 
add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index 4ee525c3..4e0edc6e 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -195,7 +195,7 @@ namespace deepx::tensorfunc divaddbetaDispatcher::divaddbeta(A, B, alpha, C, beta, D); } - template + template struct sqrtDispatcher { static void sqrt(const Tensor &input, Tensor &output) = delete; diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 27361136..473e55d3 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -4,6 +4,7 @@ #include "deepx/tf/print.hpp" #include "deepx/tf/init.hpp" #include "deepx/tf/elementwise_basic.hpp" +#include "deepx/tf/elementwise_sqrt.hpp" #include "deepx/tf/matmul.hpp" #include "deepx/dtype.hpp" #include "deepx/tf/tffactory.hpp" @@ -107,14 +108,14 @@ namespace deepx::tf Param("c", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("b", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); + { + Param("a", DataCategory::Tensor, Precision::Any), + Param("b", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -126,14 +127,14 @@ namespace deepx::tf }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -148,31 +149,31 @@ namespace deepx::tf Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), }), - vector( + vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), Param("b", DataCategory::Var, Precision::Any), }), vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), }), - vector( + vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), Param("scalar", DataCategory::Var, Precision::Any), }), vector( @@ -180,41 +181,72 @@ namespace deepx::tf Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( + { + Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + 
}))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( { - Param("scalar", DataCategory::Var, Precision::Any), Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), }), - vector( + vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); - - - // opfactory.add_op(Sqrt_miaobyte()); - // opfactory.add_op(Sqrt_miaobyte()); - - // opfactory.add_op(Exp_miaobyte()); - // opfactory.add_op(Exp_miaobyte()); - - // opfactory.add_op(Pow_miaobyte()); - // opfactory.add_op(Pow_miaobyte()); - - // opfactory.add_op(Powscalar_miaobyte()); - // opfactory.add_op(Powscalar_miaobyte()); - } - // matmul - void register_matmul(TfFactory &tffactory) - { - tffactory.add_tf(std::make_shared>(vector( + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), }), vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); } + // matmul + void register_matmul(TfFactory &tffactory) + { + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + } // // changeshape void register_changeshape(TfFactory &tffactory) { diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a new file mode 100644 index 00000000..f31973f3 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a @@ -0,0 +1,61 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/elementwise_miaobyte_basic.cuh" + +#include "stdutil/error.hpp" + +namespace deepx::tensorfunc +{ + // CUDA kernel函数声明 + + + template + struct sinDispatcher + { + static void sin(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("sin"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_sin(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct cosDispatcher + { + static void cos(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("cos"); + } + const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_cos(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct tanDispatcher + { + static void tan(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("tan"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_tan(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu new file mode 100644 index 00000000..a808d5bc --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -0,0 +1,188 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH + +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include + +namespace deepx::tensorfunc +{ + // sqrt + template + __global__ void sqrt_kernel(const T* A, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = sqrtf(A[idx]); + } + } + template __global__ void sqrt_kernel(const double* A, double* C,const int size); + template __global__ void sqrt_kernel(const float* A, float* C,const int size); + // template __global__ void sqrt_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); + // template __global__ void sqrt_kernel<__half>(const __half* A, __half* C,const int size); + template __global__ void sqrt_kernel(const int64_t* A, int64_t* C,const int size); + template __global__ void sqrt_kernel(const int32_t* A, int32_t* C,const int size); + template __global__ void sqrt_kernel(const int16_t* A, int16_t* C,const int size); + template __global__ void sqrt_kernel(const int8_t* A, int8_t* C,const int size); + + template + void launch_sqrt(int numBlocks, int blockSize, const T* a, T* c,const int size){ + sqrt_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch sqrt kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_sqrt(int numBlocks, int blockSize, const double* a, double* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const float* a, float* c,const int size); + // template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + // template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + + // pow + template + __global__ void pow_kernel(const T* A, const T* B, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = powf(A[idx], B[idx]); + } + } + template __global__ void pow_kernel(const double* A, const double* B, double* C,const int size); + template __global__ void pow_kernel(const float* A, const float* B, 
float* C,const int size); + // template __global__ void pow_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); + // template __global__ void pow_kernel<__half>(const __half* A, const __half* B, __half* C,const int size); + template __global__ void pow_kernel(const int64_t* A, const int64_t* B, int64_t* C,const int size); + template __global__ void pow_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); + template __global__ void pow_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); + template __global__ void pow_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); + + template + void launch_pow(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size){ + pow_kernel<<>>(a, b, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch pow kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_pow(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const float* a, const float* b, float* c,const int size); + // template void launch_pow(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); + // template void launch_pow<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + + // powscalar + template + __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = powf(A[idx], scalar); + } + } + template __global__ void powscalar_kernel(const double* A, const double scalar, double* C,const int size); + template __global__ void powscalar_kernel(const float* A, const float scalar, float* C,const int size); + // template __global__ void powscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); + // template __global__ void powscalar_kernel<__half>(const __half* A, const __half scalar, __half* C,const int size); + template __global__ void powscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); + template __global__ void powscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); + template __global__ void powscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); + template __global__ void powscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); + + template + void launch_powscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size){ + powscalar_kernel<<>>(a, scalar, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch powscalar kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_powscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int 
size); + template void launch_powscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); + // template void launch_powscalar(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); + // template void launch_powscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); + + // log + template + __global__ void log_kernel(const T* A, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = logf(A[idx]); + } + } + template __global__ void log_kernel(const double* A, double* C,const int size); + template __global__ void log_kernel(const float* A, float* C,const int size); + // template __global__ void log_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); + // template __global__ void log_kernel<__half>(const __half* A, __half* C,const int size); + template __global__ void log_kernel(const int64_t* A, int64_t* C,const int size); + template __global__ void log_kernel(const int32_t* A, int32_t* C,const int size); + template __global__ void log_kernel(const int16_t* A, int16_t* C,const int size); + template __global__ void log_kernel(const int8_t* A, int8_t* C,const int size); + + template + void launch_log(int numBlocks, int blockSize, const T* a, T* c,const int size){ + log_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch log kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_log(int numBlocks, int blockSize, const double* a, double* c,const int size); + template void launch_log(int numBlocks, int blockSize, const float* a, float* c,const int size); + // template void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + // template void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + // exp + template + __global__ void exp_kernel(const T* A, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = expf(A[idx]); + } + } + template __global__ void exp_kernel(const double* A, double* C,const int size); + template __global__ void exp_kernel(const float* A, float* C,const int size); + // template __global__ void exp_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); + // template __global__ void exp_kernel<__half>(const __half* A, __half* C,const int size); + template __global__ void exp_kernel(const int64_t* A, int64_t* 
C,const int size); + template __global__ void exp_kernel(const int32_t* A, int32_t* C,const int size); + template __global__ void exp_kernel(const int16_t* A, int16_t* C,const int size); + template __global__ void exp_kernel(const int8_t* A, int8_t* C,const int size); + + template + void launch_exp(int numBlocks, int blockSize, const T* a, T* c,const int size){ + exp_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch exp kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_exp(int numBlocks, int blockSize, const double* a, double* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const float* a, float* c,const int size); + // template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + // template void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh new file mode 100644 index 00000000..dd428cbd --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh @@ -0,0 +1,169 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#include +#include + + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + // sqrt + template + __global__ void sqrt_kernel(const T* A, T* C,const int size); + + template + void launch_sqrt(int numBlocks, int blockSize, const T* a, T* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const double* a, double* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const float* a, float* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + + template <> + void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + // pow + template + __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); + + template + void launch_pow(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, 
const float* a, const float* b, float* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); + + template <> + void launch_pow<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + + // powscalar + template + __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); + + template + void launch_powscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); + + template <> + void launch_powscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); + + // log + template + __global__ void log_kernel(const T* A, T* C,const int size); + + template + void launch_log(int numBlocks, int blockSize, const T* a, T* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const double* a, double* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const float* a, float* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + + template <> + void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + // exp + template + __global__ void exp_kernel(const T* A, T* C,const int size); + + template + void launch_exp(int numBlocks, int blockSize, const T* a, T* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const double* a, double* c,const int size); + + template <> + 
void launch_exp(int numBlocks, int blockSize, const float* a, float* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + + template <> + void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp new file mode 100644 index 00000000..38afe270 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp @@ -0,0 +1,88 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_HPP +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_HPP + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh" +#include "stdutil/error.hpp" + +namespace deepx::tensorfunc +{ + // CUDA kernel函数声明 + + + template + struct sqrtDispatcher + { + static void sqrt(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("sqrt"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_sqrt(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct powDispatcher + { + static void pow(const Tensor &A, const Tensor &B, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("pow"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_pow(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + } + }; + + template + struct powscalarDispatcher + { + static void powscalar(const Tensor &A, const T scalar, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("powscalar"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_powscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + } + }; + + template + struct logDispatcher + { + static void log(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("log"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_log(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct expDispatcher + { + static void exp(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("exp"); + } + const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_exp(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp new file mode 100644 index 00000000..3865c03b --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp @@ -0,0 +1,378 @@ +#ifndef DEEPX_TF_ELEMENTWISE_SQRT_HPP +#define DEEPX_TF_ELEMENTWISE_SQRT_HPP + +#include +#include +#include "deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp" + +namespace deepx::tf +{ + + template + class Sqrt : public TF + { + public: + Sqrt(const vector &args, const vector &returns) + { + this->name = "sqrt"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Sqrt(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "sqrt") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=sqrt(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::Float16: + // tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::Float16: + // tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Pow : public TF + { + public: + Pow(const vector &args, const vector &returns) + { + this->name = "pow"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Pow(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "pow") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=pow(T1, T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = 
mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type || b_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::BFloat16: + // tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::Float16: + // tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class PowScalar : public TF + { + public: + PowScalar(const vector &args, const vector &returns) + { + this->name = "powscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + PowScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "powscalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=pow(T1, scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type || b_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::BFloat16: + // tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), 
*mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::Float16: + // tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Log : public TF + { + public: + Log(const vector &args, const vector &returns) + { + this->name = "log"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Log(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "log") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=log(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::Float16: + // tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::BFloat16: + // tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Exp : public TF + { + public: + Exp(const vector &args, const vector &returns) + { + this->name = "exp"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Exp(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "exp") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } 
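+        // (Annotation added for clarity; not part of the original patch.) The text
+        // constructor parses an IR line of the form "exp(tensor A)->(tensor C)" and
+        // rejects any op whose parsed name is not "exp". run() below then switches on
+        // the input tensor's Precision and forwards to the author-specific
+        // tensorfunc::exp dispatcher; the Float16/BFloat16 cases are still commented out.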
+ string math_formula() const override + { + return "T3=exp(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::Float16: + // tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::BFloat16: + // tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; +}; +#endif // DEEPX_TF_ELEMENTWISE_SQRT_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 59dfba65..afd1ee0a 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -112,44 +112,44 @@ namespace deepx::tf Param("c", DataCategory::Tensor, Precision::Any), }))); - tffactory.add_tf(std::make_shared>(vector( + tffactory.add_tf(std::make_shared>(vector( + { + Param("a", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("b", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("a", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( { - Param("a", DataCategory::Tensor, Precision::Float64|Precision::Float32), - Param("b", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("a", DataCategory::Tensor, Precision::Any), + Param("b", DataCategory::Tensor, Precision::Any), }), vector( { - Param("c", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("c", DataCategory::Tensor, Precision::Any), }))); - - tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); - tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("b", DataCategory::Tensor, 
Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); + { + Param("a", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { @@ -159,7 +159,7 @@ namespace deepx::tf vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -169,7 +169,7 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -179,44 +179,35 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); - - + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("scalar", DataCategory::Var, Precision::Any), - Param("A", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); - - // opfactory.add_op(Sqrt_miaobyte()); - // opfactory.add_op(Sqrt_miaobyte()); - - // opfactory.add_op(Exp_miaobyte()); - // opfactory.add_op(Exp_miaobyte()); - - // opfactory.add_op(Pow_miaobyte()); - // opfactory.add_op(Pow_miaobyte()); + { + Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); - // opfactory.add_op(Powscalar_miaobyte()); - // opfactory.add_op(Powscalar_miaobyte()); - } - // matmul - void register_matmul(TfFactory &tffactory) - { - tffactory.add_tf(std::make_shared>(vector( + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), @@ -225,15 +216,53 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + } + // matmul + void register_matmul(TfFactory &tffactory) + { + tffactory.add_tf(std::make_shared>(vector( + { + 
Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), - Param("B", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("B", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), - }))); + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + }))); } // // changeshape void register_changeshape(TfFactory &tffactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index e2e85677..f7bacc0b 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -101,7 +101,7 @@ namespace deepx::tensorfunc // 通用实现 template - struct addDispatcher + struct addDispatcher { static void add(const Tensor &A, const Tensor &B, Tensor &C) { @@ -122,7 +122,7 @@ namespace deepx::tensorfunc }; template - struct addscalarDispatcher + struct addscalarDispatcher { static void addscalar(const Tensor &A, const T value, Tensor &C) { @@ -143,7 +143,7 @@ namespace deepx::tensorfunc // 添加 sub 的模板特化实现 template - struct subDispatcher + struct subDispatcher { static void sub(const Tensor &A, const Tensor &B, Tensor &C) { @@ -163,7 +163,7 @@ namespace deepx::tensorfunc }; template - struct subscalarDispatcher + struct subscalarDispatcher { static void subscalar(const Tensor &A, const T value, Tensor &C) { @@ -184,7 +184,7 @@ namespace deepx::tensorfunc // 添加 mul 的模板特化实现 template - struct mulDispatcher + struct mulDispatcher { static void mul(const Tensor &A, const Tensor &B, Tensor &C) { @@ -204,7 +204,7 @@ namespace deepx::tensorfunc }; template - struct mulscalarDispatcher + struct mulscalarDispatcher { static void mulscalar(const Tensor &A, const T value, Tensor &C) { @@ -224,10 +224,10 @@ namespace deepx::tensorfunc }; template - struct muladdDispatcher + struct muladdDispatcher { // A*B+C=D - static void muladd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) + static void muladd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) @@ -270,10 +270,10 @@ namespace deepx::tensorfunc }; template - struct muladdscalarDispatcher + struct muladdscalarDispatcher { // A*B*alpha+C*beta=D - static void muladdscalar(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) + static void muladdscalar(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) { @@ -329,10 +329,10 @@ namespace deepx::tensorfunc }; template - struct mulscalaraddDispatcher + struct mulscalaraddDispatcher { // A*alpha+B*beta=C - static void mulscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) + static void mulscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) { @@ -379,7 +379,7 @@ namespace deepx::tensorfunc // 添加 div 的模板特化实现 template - struct divDispatcher + struct 
divDispatcher { static void div(const Tensor &A, const Tensor &B, Tensor &C) { @@ -399,7 +399,7 @@ namespace deepx::tensorfunc }; template - struct divscalarDispatcher + struct divscalarDispatcher { static void divscalar(const Tensor &A, const T value, Tensor &C) { @@ -419,7 +419,7 @@ namespace deepx::tensorfunc }; template - struct rdivscalarDispatcher + struct rdivscalarDispatcher { static void rdivscalar(const T value, const Tensor &In, Tensor &Out) { @@ -439,10 +439,10 @@ namespace deepx::tensorfunc }; template - struct divaddDispatcher + struct divaddDispatcher { // D= A/B+ C - static void divadd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) + static void divadd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) { @@ -481,13 +481,13 @@ namespace deepx::tensorfunc throw std::invalid_argument("shape mismatch"); } } - }; + }; template - struct divscalaraddDispatcher + struct divscalaraddDispatcher { // C= A/alpha+ B/beta - static void divscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) + static void divscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) { @@ -533,10 +533,10 @@ namespace deepx::tensorfunc }; template - struct divaddbetaDispatcher + struct divaddbetaDispatcher { // D= A/B*alpha+ C*beta - static void divaddbeta(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) + static void divaddbeta(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) { @@ -584,7 +584,7 @@ namespace deepx::tensorfunc }; template - struct sqrtDispatcher + struct sqrtDispatcher>> { static void sqrt(const Tensor &input, Tensor &output) { @@ -624,31 +624,66 @@ namespace deepx::tensorfunc } } }; + template + struct sqrtDispatcher>> + { + static void sqrt(const Tensor &input, Tensor &output) + { + if (input.shape == output.shape) + { + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i) + { + int shape_last = output.shape[-1]; + + size_t j = 0; + + while (j < shape_last) + { + output.data[i + j] = std::sqrt(input.data[i + j]); + ++j; + } + }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; template - struct powDispatcher + struct powDispatcher { // C=A^B - static void pow(const Tensor &A, Tensor &B, Tensor &C) + static void pow(const Tensor &A, const Tensor &B, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim, [&A, &B, &C](int i) - { C.data[i] = std::pow(A.data[i], B.data[i]); }); + C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C](int i) + { + for (int j = 0; j < C.shape[-1]; j++) + C.data[i+j] = std::pow(A.data[i+j], B.data[i+j]); }); } else { throw std::invalid_argument("shape mismatch"); } } + }; + + template + struct powscalarDispatcher + { // C=A^value // highway 不支持POW static void powscalar(const Tensor &input, const T value, Tensor &output) { if (input.shape == output.shape) { - output.shape.rangeParallel(output.shape.dim, [&input, &output, &value](int i) - { output.data[i] = std::pow(input.data[i], value); }); + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output, &value](int i) + { + for (int j = 0; j < output.shape[-1]; j++) + output.data[i+j] = std::pow(input.data[i+j], value); }); } else { @@ -658,15 +693,16 @@ namespace 
deepx::tensorfunc }; template - struct logDispatcher - { + struct logDispatcher + { // hwy库没有log函数,所以只能用std::log static void log(const Tensor &input, Tensor &output) { if (input.shape == output.shape) { - output.shape.rangeParallel(output.shape.dim, [&input, &output](int i) - { output.data[i] = std::log(input.data[i]); }); + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i) + { for (int j = 0; j < output.shape[-1]; j++) + output.data[i+j] = std::log(input.data[i+j]); }); } else { @@ -676,15 +712,16 @@ namespace deepx::tensorfunc }; template - struct expDispatcher - { + struct expDispatcher + { // 发现hwy库没有exp函数,所以只能用std::exp static void exp(const Tensor &input, Tensor &output) { if (input.shape == output.shape) { - output.shape.rangeParallel(output.shape.dim, [&input, &output](int i) - { output.data[i] = std::exp(input.data[i]); }); + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i) + { for (int j = 0; j < output.shape[-1]; j++) + output.data[i+j] = std::exp(input.data[i+j]); }); } else { @@ -694,9 +731,9 @@ namespace deepx::tensorfunc }; template - struct sinDispatcher - { - + struct sinDispatcher + { + static void sin(const Tensor &input, Tensor &output) { if (input.shape == output.shape) @@ -737,9 +774,9 @@ namespace deepx::tensorfunc }; template - struct cosDispatcher - { - + struct cosDispatcher + { + static void cos(const Tensor &input, Tensor &output) { if (input.shape == output.shape) @@ -780,9 +817,9 @@ namespace deepx::tensorfunc }; template - struct tanDispatcher - { - + struct tanDispatcher + { + static void tan(const Tensor &input, Tensor &output) { if (input.shape == output.shape) @@ -823,8 +860,8 @@ namespace deepx::tensorfunc }; template - struct maxDispatcher - { + struct maxDispatcher + { static void max(const Tensor &A, const Tensor &B, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) @@ -866,7 +903,7 @@ namespace deepx::tensorfunc }; template - struct maxgradDispatcher + struct maxgradDispatcher { static void maxgrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) { @@ -893,9 +930,9 @@ namespace deepx::tensorfunc }; template - struct maxscalarDispatcher + struct maxscalarDispatcher { - static void maxscalar(const Tensor &A,const T b, Tensor &C) + static void maxscalar(const Tensor &A, const T b, Tensor &C) { if (A.shape == C.shape) { @@ -936,7 +973,7 @@ namespace deepx::tensorfunc }; template - struct maxscalargradDispatcher + struct maxscalargradDispatcher { static void maxscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) { @@ -960,7 +997,7 @@ namespace deepx::tensorfunc }; template - struct minDispatcher + struct minDispatcher { static void min(const Tensor &A, const Tensor &B, Tensor &C) { @@ -1003,7 +1040,7 @@ namespace deepx::tensorfunc }; template - struct mingradDispatcher + struct mingradDispatcher { static void mingrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) { @@ -1030,9 +1067,9 @@ namespace deepx::tensorfunc }; template - struct minscalarDispatcher + struct minscalarDispatcher { - static void minscalar(const Tensor &A,const T b, Tensor &C) + static void minscalar(const Tensor &A, const T b, Tensor &C) { if (A.shape == C.shape) { @@ -1069,10 +1106,10 @@ namespace deepx::tensorfunc throw std::invalid_argument("shape mismatch"); } } - }; + }; template - struct minscalargradDispatcher + struct minscalargradDispatcher { static void minscalargrad(const Tensor &A, const T b, Tensor 
&A_grad, const Tensor &output_grad) { diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 622463d5..26dde852 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -49,7 +49,7 @@ namespace deepx::tf tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -105,7 +105,7 @@ namespace deepx::tf tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -161,7 +161,7 @@ namespace deepx::tf tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -217,7 +217,7 @@ namespace deepx::tf tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -274,7 +274,7 @@ namespace deepx::tf tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -330,7 +330,7 @@ namespace deepx::tf tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -387,7 +387,7 @@ namespace deepx::tf tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -443,7 +443,7 @@ namespace deepx::tf tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -499,7 +499,7 @@ namespace deepx::tf tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -517,7 +517,288 @@ namespace deepx::tf return 0; } }; - -} + template + class Sqrt : public TF + { + public: + Sqrt(vector args, vector returns) + { + this->name = "sqrt"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=sqrt(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Pow : public TF + { + public: + Pow(vector args, vector returns) + { + this->name = "pow"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=T1^T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class PowScalar : public TF + { + public: + PowScalar(vector args, vector returns) + { + this->name = "powscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=T1^scalar"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + 
precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Log : public TF + { + public: + Log(vector args, vector returns) + { + this->name = "log"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=log(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Exp : public TF + { + public: + Exp(vector args, vector returns) + { + this->name = "exp"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=exp(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 
1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + +}; + #endif diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index f724b698..07610c30 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -10,7 +10,7 @@ "newtensor", "printtensor", "constant","full","zeros","ones","uniform","arange","rand","randn","eye","kaiming_uniform_","calculate_fan_in_and_fan_out", - "add","sub","mul","div","clamp","exp","sqrt","rsqrt", + "add","sub","mul","div","clamp","sqrt","pow","exp","log","rsqrt", "matmul", "max","min","sum","prod","mean", "transpose","reshape","broadcast_shape","broadcast_to","unsqueeze", diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index ecf3c0c3..56acc975 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -208,14 +208,14 @@ def clamp( varir=DeepxIR("clamp", a.dtype, [a.node.name,min,max], [outtensor.node.name]) send(str(varir)) return outtensor -#exp -OpNode.register("exp") -def exp( - a:Tensor, + +#sqrt +OpNode.register("sqrt") +def sqrt( + input:Tensor, out:Union[Tensor,str]='')->Tensor: - return _A_elementwiseop_C(a,"exp",out) -#pow -# todo + return _A_elementwiseop_C(input,"sqrt",out) + OpNode.register("pow") OpNode.register("powscalar") def pow( @@ -226,12 +226,20 @@ def pow( return _A_b_elementwiseop_C(a,b,"powscalar",out) else: return _A_B_elementwiseop_C(a,b,"pow",out) -#sqrt -OpNode.register("sqrt") -def sqrt( + +#exp +OpNode.register("exp") +def exp( + a:Tensor, + out:Union[Tensor,str]='')->Tensor: + return _A_elementwiseop_C(a,"exp",out) +#log +OpNode.register("log") +def log( input:Tensor, out:Union[Tensor,str]='')->Tensor: - return _A_elementwiseop_C(input,"sqrt",out) + return _A_elementwiseop_C(input,"log",out) + def rsqrt( input:Tensor, diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot new file mode 100644 index 00000000..4e476571 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot @@ -0,0 +1,35 @@ +// Computational Graph +digraph { + rankdir=TB + node [shape=record] + 140074505155728 [label="t1 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140076479891344 [label="t2 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481968 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503482016 
[label="var_1 +2" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481920 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503481824 [label="t3 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481728 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503482304 [label="t4 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503482544 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503482640 [label="t5 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503487056 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503486960 [label="t6 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481968 -> 140076479891344 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482016 -> 140074503481968 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074505155728 -> 140074503481920 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503481920 -> 140074503481824 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140076479891344 -> 140074503481728 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503481728 -> 140074503482304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482304 -> 140074503482544 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482544 -> 140074503482640 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482640 -> 140074503487056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503481824 -> 140074503487056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503487056 -> 140074503486960 [arrowsize=0.8 color=gray40 penwidth=1.2] +} diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg new file mode 100644 index 00000000..a517b63b --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg @@ -0,0 +1,158 @@ + + + + + + +%3 + + + +140074505155728 + +t1 +(60,) + + + +140074503481920 + +sqrt + + + +140074505155728->140074503481920 + + + + + +140076479891344 + +t2 +(60,) + + + +140074503481728 + +log + + + +140076479891344->140074503481728 + + + + + +140074503481968 + +constant + + + +140074503481968->140076479891344 + + + + + +140074503482016 + +var_1 +2 + + + +140074503482016->140074503481968 + + + + + +140074503481824 + +t3 +(60,) + + + +140074503481920->140074503481824 + + + + + +140074503487056 + +pow + + + +140074503481824->140074503487056 + + + + + +140074503482304 + +t4 +(60,) + + + +140074503481728->140074503482304 + + + + + +140074503482544 + +exp + + + +140074503482304->140074503482544 + + + + + +140074503482640 + +t5 +(60,) + + + +140074503482544->140074503482640 + + + + + +140074503482640->140074503487056 + + + + + +140074503486960 + +t6 +(60,) + + + +140074503487056->140074503486960 + + + + + diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.py b/front/py/examples/2_ir/2_elementwise_sqrtlog.py new file mode 100644 index 00000000..908efd69 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.py @@ -0,0 +1,38 @@ + +############-------PyTorch-------################ + +import torch +torch_t1 = torch.arange(3*4*5, dtype=torch.float32) 
+torch_t2 = torch.full((3*4*5,), 2, dtype=torch.float32)
+
+torch_t3 = torch.sqrt(torch_t1)
+print(torch_t3)
+torch_t4 = torch.log(torch_t2)
+print(torch_t4)
+torch_t5 = torch.exp(torch_t4)
+print(torch_t5)
+torch_t6 = torch.pow(torch_t5, torch_t3)
+print(torch_t6)
+
+############-------DEEPX-------################
+
+import deepx
+print()
+
+t1 = deepx.arange(end=3*4*5, dtype='float32', name="t1")
+t2 = deepx.full([3*4*5], value=2, dtype='float32', name="t2")
+t3 = deepx.sqrt(t1, out='t3')
+print(t3)
+t4 = deepx.log(t2, out='t4')
+print(t4)
+t5 = deepx.exp(t4, out='t5')
+print(t5)
+t6 = deepx.pow(t5, t3, out='t6')
+print(t6)
+
+import os
+script_name = os.path.splitext(os.path.basename(os.path.abspath(__file__)))[0]  # script name without extension
+dot = t3.graph.to_dot()  # dot source of t3's computational graph ("dot" avoids shadowing the builtin str)
+dot.render(script_name + ".dot", format='svg')
+
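
Note on the example above: as a quick sanity check of what the new sqrt/log/exp/pow front-end ops are expected to produce, here is a minimal NumPy sketch. It is not part of the patch; NumPy serves only as a reference implementation, and the t1..t6 names simply mirror the example. Since t5 = exp(log(2)) = 2 up to float32 rounding, the whole chain reduces to t6 = 2 ** sqrt(t1), and both the PyTorch and DeepX halves of 2_elementwise_sqrtlog.py should print values matching this reference.

    # reference-only sketch: expected values for the sqrt/log/exp/pow example
    import numpy as np

    t1 = np.arange(3 * 4 * 5, dtype=np.float32)      # 0, 1, ..., 59
    t2 = np.full((3 * 4 * 5,), 2, dtype=np.float32)  # all twos

    t3 = np.sqrt(t1)       # sqrt(T1)
    t4 = np.log(t2)        # log(2) ~= 0.6931
    t5 = np.exp(t4)        # exp(log(2)) == 2, up to rounding
    t6 = np.power(t5, t3)  # pow: 2 ** sqrt(t1)

    print(t3, t4, t5, t6, sep="\n")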