From 3bb450145eb3fadba025b229339d8195aa120914 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Fri, 28 Mar 2025 18:25:41 +0800 Subject: [PATCH] =?UTF-8?q?front&excuter:=E8=81=94=E5=90=88=E8=B0=83?= =?UTF-8?q?=E8=AF=95mul,mulscalar,div,divscalar,rdivscalar?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/excuter/op-mem-cuda/list.md | 29 +- doc/excuter/op-mem-ompsimd/list.md | 29 +- .../src/deepx/tensorfunc/elementwise.hpp | 6 +- excuter/op-mem-cuda/src/client/tfs.cpp | 64 +++- .../tensorfunc/elementwise_miaobyte_basic.cu | 148 ++++++- .../tensorfunc/elementwise_miaobyte_basic.cuh | 159 +++++++- .../tensorfunc/elementwise_miaobyte_basic.hpp | 71 ++++ .../src/deepx/tf/elementwise_basic.hpp | 362 +++++++++++++++++- excuter/op-mem-ompsimd/src/client/tfs.cpp | 65 +++- .../src/deepx/tf/elementwise.hpp | 169 ++++++++ .../examples/2_ir/2_elementwise_operator.dot | 64 ++++ .../2_ir/2_elementwise_operator.dot.svg | 302 +++++++++++++++ .../examples/2_ir/2_elementwise_operator.py | 14 +- 13 files changed, 1414 insertions(+), 68 deletions(-) create mode 100644 front/py/examples/2_ir/2_elementwise_operator.dot create mode 100644 front/py/examples/2_ir/2_elementwise_operator.dot.svg diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 1d63988d..27bdd297 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -4,18 +4,23 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| -| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | -| add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() 
| uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | -| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | -| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | -| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = 
zeros(shape) | newtensor(var shape)->(tensor tensor1) |
+| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() |
+| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() |
+| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) |
+| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() |
+| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() |
+| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) |
+| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() |
+| add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) |
+| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) |
+| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) |
+| mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) |
diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
index 0b7d18d3..6e878c3a 100644
--- a/doc/excuter/op-mem-ompsimd/list.md
+++ b/doc/excuter/op-mem-ompsimd/list.md
@@ -5,19 +5,24 @@
| Operation | Author | Func Def | Math Formula | IR Instruction |
|-----------|--------|------------|--------------|----------------|
| concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() |
-| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) |
-| add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | 
-| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | -| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | -| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | -| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | | sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor 
C) | +| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | +| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | +| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | +| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | +| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | +| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | +| add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | +| mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index e05506f7..4ee525c3 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -150,13 +150,13 @@ namespace 
deepx::tensorfunc template struct rdivscalarDispatcher { - static void rdivscalar(const Tensor &input, const T value, Tensor &output) = delete; + static void rdivscalar(const T value, const Tensor &input, Tensor &output) = delete; }; template - void rdivscalar(const Tensor &input, const T value, Tensor &output) + void rdivscalar(const T value, const Tensor &input, Tensor &output) { - rdivscalarDispatcher::rdivscalar(input, value, output); + rdivscalarDispatcher::rdivscalar(value, input, output); } template diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index fa83af24..27361136 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -143,24 +143,52 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - - // opfactory.add_op(Sub_cblas()); - // opfactory.add_op(Sub_cblas()); - - // opfactory.add_op(Mul_miaobyte()); - // opfactory.add_op(Mul_miaobyte()); - - // opfactory.add_op(Mulscalar_miaobyte()); - // opfactory.add_op(Mulscalar_miaobyte()); - - // opfactory.add_op(Div_miaobyte()); - // opfactory.add_op(Div_miaobyte()); - - // opfactory.add_op(Divscalar_miaobyte()); - // opfactory.add_op(Divscalar_miaobyte()); - - // opfactory.add_op(RDivscalar_miaobyte()); - // opfactory.add_op(RDivscalar_miaobyte()); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("b", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", 
DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + // opfactory.add_op(Sqrt_miaobyte()); // opfactory.add_op(Sqrt_miaobyte()); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index f4836cd6..6d8e73ae 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -22,7 +22,6 @@ namespace deepx::tensorfunc template __global__ void add_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); template __global__ void add_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); - template void launch_add(int numBlocks, int blockSize,const T* a, const T* b, T* c,const int size) { @@ -133,6 +132,153 @@ namespace deepx::tensorfunc template void launch_subscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size); template void launch_subscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size); template void launch_subscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size); + + template + __global__ void mul_kernel(const T* A, const T* B, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] * B[idx]; + } + } + template __global__ void mul_kernel(const 
double* A, const double* B, double* C,const int size); + template __global__ void mul_kernel(const float* A, const float* B, float* C,const int size); + template __global__ void mul_kernel(const half* A, const half* B, half* C,const int size); + template __global__ void mul_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); + template __global__ void mul_kernel(const int64_t* A, const int64_t* B, int64_t* C,const int size); + template __global__ void mul_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); + template __global__ void mul_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); + template __global__ void mul_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); + + template + void launch_mul(const int numBlocks, const int blockSize, const T* a, const T* b, T* c, const int size) { + mul_kernel<<>>(a, b, c, size); + } + template void launch_mul(const int numBlocks, const int blockSize, const double* a, const double* b, double* c, const int size); + template void launch_mul(const int numBlocks, const int blockSize, const float* a, const float* b, float* c, const int size); + template void launch_mul(const int numBlocks, const int blockSize, const half* a, const half* b, half* c, const int size); + template void launch_mul(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, const int size); + template void launch_mul(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c, const int size); + template void launch_mul(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c, const int size); + template void launch_mul(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size); + template void launch_mul(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size); + + template + 
__global__ void mulscalar_kernel(const T* A, const T scalar, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] * scalar; + } + } + template __global__ void mulscalar_kernel(const double* A, const double scalar, double* C,const int size); + template __global__ void mulscalar_kernel(const float* A, const float scalar, float* C,const int size); + template __global__ void mulscalar_kernel(const half* A, const half scalar, half* C,const int size); + template __global__ void mulscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); + template __global__ void mulscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); + template __global__ void mulscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); + template __global__ void mulscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); + template __global__ void mulscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); + + template + void launch_mulscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { + mulscalar_kernel<<>>(a, scalar, c, size); + } + template void launch_mulscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size); + template void launch_mulscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size); + template void launch_mulscalar(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size); + template void launch_mulscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size); + template void launch_mulscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size); + template void 
launch_mulscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size); + template void launch_mulscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size); + template void launch_mulscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size); + + template + __global__ void div_kernel(const T* A, const T* B, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] / B[idx]; + } + } + template __global__ void div_kernel(const double* A, const double* B, double* C,const int size); + template __global__ void div_kernel(const float* A, const float* B, float* C,const int size); + template __global__ void div_kernel(const half* A, const half* B, half* C,const int size); + template __global__ void div_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); + template __global__ void div_kernel(const int64_t* A, const int64_t* B, int64_t* C,const int size); + template __global__ void div_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); + template __global__ void div_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); + template __global__ void div_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); + + template + void launch_div(const int numBlocks, const int blockSize, const T* a, const T* b, T* c, const int size) { + div_kernel<<>>(a, b, c, size); + } + template void launch_div(const int numBlocks, const int blockSize, const double* a, const double* b, double* c, const int size); + template void launch_div(const int numBlocks, const int blockSize, const float* a, const float* b, float* c, const int size); + template void launch_div(const int numBlocks, const int blockSize, const half* a, const half* b, half* c, const int size); + template void launch_div(const int 
numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, const int size); + template void launch_div(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c, const int size); + template void launch_div(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c, const int size); + template void launch_div(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size); + template void launch_div(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size); + + template + __global__ void divscalar_kernel(const T* A, const T scalar, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] / scalar; + } + } + template __global__ void divscalar_kernel(const double* A, const double scalar, double* C,const int size); + template __global__ void divscalar_kernel(const float* A, const float scalar, float* C,const int size); + template __global__ void divscalar_kernel(const half* A, const half scalar, half* C,const int size); + template __global__ void divscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); + template __global__ void divscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); + template __global__ void divscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); + template __global__ void divscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); + template __global__ void divscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); + + template + void launch_divscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { + divscalar_kernel<<>>(a, scalar, c, size); + } + template void launch_divscalar(const int numBlocks, const int blockSize, const 
double* a, const double scalar, double* c, const int size); + template void launch_divscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size); + template void launch_divscalar(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size); + template void launch_divscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size); + template void launch_divscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size); + template void launch_divscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size); + template void launch_divscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size); + template void launch_divscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size); + + template + __global__ void rdivscalar_kernel(const T scalar, const T* A, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = scalar / A[idx]; + } + } + template __global__ void rdivscalar_kernel(const double scalar, const double* A, double* C,const int size); + template __global__ void rdivscalar_kernel(const float scalar, const float* A, float* C,const int size); + template __global__ void rdivscalar_kernel(const half scalar, const half* A, half* C,const int size); + template __global__ void rdivscalar_kernel(const nv_bfloat16 scalar, const nv_bfloat16* A, nv_bfloat16* C,const int size); + template __global__ void rdivscalar_kernel(const int64_t scalar, const int64_t* A, int64_t* C,const int size); + template __global__ void rdivscalar_kernel(const int32_t scalar, const int32_t* A, int32_t* C,const int size); + template __global__ void rdivscalar_kernel(const int16_t 
scalar, const int16_t* A, int16_t* C,const int size); + template __global__ void rdivscalar_kernel(const int8_t scalar, const int8_t* A, int8_t* C,const int size); + + template + void launch_rdivscalar(const int numBlocks, const int blockSize, const T scalar, const T* a, T* c, const int size) { + rdivscalar_kernel<<>>(scalar, a, c, size); + } + template void launch_rdivscalar(const int numBlocks, const int blockSize, const double scalar, const double* a, double* c, const int size); + template void launch_rdivscalar(const int numBlocks, const int blockSize, const float scalar, const float* a, float* c, const int size); + template void launch_rdivscalar(const int numBlocks, const int blockSize, const half scalar, const half* a, half* c, const int size); + template void launch_rdivscalar(const int numBlocks, const int blockSize, const nv_bfloat16 scalar, const nv_bfloat16* a, nv_bfloat16* c, const int size); + template void launch_rdivscalar(const int numBlocks, const int blockSize, const int64_t scalar, const int64_t* a, int64_t* c, const int size); + template void launch_rdivscalar(const int numBlocks, const int blockSize, const int32_t scalar, const int32_t* a, int32_t* c, const int size); + template void launch_rdivscalar(const int numBlocks, const int blockSize, const int16_t scalar, const int16_t* a, int16_t* c, const int size); + template void launch_rdivscalar(const int numBlocks, const int blockSize, const int8_t scalar, const int8_t* a, int8_t* c, const int size); + + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh index 966cfa1c..2457a510 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh @@ -134,7 +134,164 @@ namespace deepx::tensorfunc template <> void launch_subscalar(const 
int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); - + + // mul + template + __global__ void mul_kernel(const T* A, const T* B, T* C,const int size); + + template + void launch_mul(const int numBlocks, const int blockSize, const T* a, const T* b, T* c,const int size); + + template <> + void launch_mul(const int numBlocks, const int blockSize, const double* a, const double* b, double* c,const int size); + + template <> + void launch_mul(const int numBlocks, const int blockSize, const float* a, const float* b, float* c,const int size); + + template <> + void launch_mul(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); + + template <> + void launch_mul<__half>(const int numBlocks, const int blockSize, const __half* a, const __half* b, __half* c,const int size); + + template <> + void launch_mul(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); + + template <> + void launch_mul(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); + + template <> + void launch_mul(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); + + template <> + void launch_mul(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + + // mulscalar + template + __global__ void mulscalar_kernel(const T* A, const T scalar, T* C,const int size); + + template + void launch_mulscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size); + + template <> + void launch_mulscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c,const int size); + + template <> + void launch_mulscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size); + + template <> + void 
launch_mulscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); + + template <> + void launch_mulscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size); + + template <> + void launch_mulscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); + + template <> + void launch_mulscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); + + template <> + void launch_mulscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); + + template <> + void launch_mulscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); + + // div + template + __global__ void div_kernel(const T* A, const T* B, T* C,const int size); + + template + void launch_div(const int numBlocks, const int blockSize, const T* a, const T* b, T* c,const int size); + + template <> + void launch_div(const int numBlocks, const int blockSize, const double* a, const double* b, double* c,const int size); + + template <> + void launch_div(const int numBlocks, const int blockSize, const float* a, const float* b, float* c,const int size); + + template <> + void launch_div(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); + + template <> + void launch_div<__half>(const int numBlocks, const int blockSize, const __half* a, const __half* b, __half* c,const int size); + + template <> + void launch_div(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); + + template <> + void launch_div(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); + + template <> + void launch_div(const int 
numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); + + template <> + void launch_div(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + + // divscalar + template + __global__ void divscalar_kernel(const T* A, const T scalar, T* C,const int size); + + template + void launch_divscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size); + + template <> + void launch_divscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c,const int size); + + template <> + void launch_divscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size); + + template <> + void launch_divscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); + + template <> + void launch_divscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size); + + template <> + void launch_divscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); + + template <> + void launch_divscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); + + template <> + void launch_divscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); + + template <> + void launch_divscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); + + // rdivscalar + template + __global__ void rdivscalar_kernel(const T scalar, const T* A, T* C,const int size); + + template + void launch_rdivscalar(const int numBlocks, const int blockSize, const T scalar, const T* a, T* c,const int size); + + template <> + void launch_rdivscalar(const int numBlocks, const int 
blockSize, const double scalar, const double* a, double* c,const int size); + + template <> + void launch_rdivscalar(const int numBlocks, const int blockSize, const float scalar, const float* a, float* c,const int size); + + template <> + void launch_rdivscalar(const int numBlocks, const int blockSize, const nv_bfloat16 scalar, const nv_bfloat16* a, nv_bfloat16* c,const int size); + + template <> + void launch_rdivscalar<__half>(const int numBlocks, const int blockSize, const __half scalar, const __half* a, __half* c,const int size); + + template <> + void launch_rdivscalar(const int numBlocks, const int blockSize, const int64_t scalar, const int64_t* a, int64_t* c,const int size); + + template <> + void launch_rdivscalar(const int numBlocks, const int blockSize, const int32_t scalar, const int32_t* a, int32_t* c,const int size); + + template <> + void launch_rdivscalar(const int numBlocks, const int blockSize, const int16_t scalar, const int16_t* a, int16_t* c,const int size); + + template <> + void launch_rdivscalar(const int numBlocks, const int blockSize, const int8_t scalar, const int8_t* a, int8_t* c,const int size); + + + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp index 0500dd60..72d0c32b 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp @@ -69,6 +69,77 @@ namespace deepx::tensorfunc launch_subscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); } }; + + template + struct mulDispatcher + { + static void mul(const Tensor &A, const Tensor &B, Tensor &C) + { + if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { + throw TensorShapeError("mul"); + } + const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_mul(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + } + }; + + template + struct mulscalarDispatcher + { + static void mulscalar(const Tensor &A, const T scalar, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("mulscalar"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_mulscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + } + }; + + template + struct divDispatcher + { + static void div(const Tensor &A, const Tensor &B, Tensor &C) + { + if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { + throw TensorShapeError("div"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_div(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + } + }; + + template + struct divscalarDispatcher + { + static void divscalar(const Tensor &A, const T scalar, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("divscalar"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_divscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + } + }; + + template + struct rdivscalarDispatcher + { + static void rdivscalar(const T scalar, const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("rdivscalar"); + } + const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_rdivscalar(numBlocks, blockSize, scalar, A.data, C.data, A.shape.size); + } + }; + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp index c0910a99..91fa6326 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp @@ -299,7 +299,367 @@ namespace deepx::tf } }; - + template + class Mul : public TF + { + public: + Mul(const vector &args, const vector &returns) + { + this->name = "mul"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Mul(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "mul") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=T1*T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + 
tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class MulScalar : public TF + { + public: + MulScalar(const vector &args, const vector &returns) + { + this->name = "mulscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + MulScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "mulscalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=T1*scalar"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type 
mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Div : public TF + { + public: + Div(const vector &args, const vector &returns) + { + this->name = "div"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Div(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "div") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + 
return "T3=T1/T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class DivScalar : public TF + { + public: + DivScalar(const vector &args, const vector &returns) + { + this->name = "divscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + DivScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "divscalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=scalar/T1"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case 
Precision::Int32: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class RDivScalar : public TF + { + public: + RDivScalar(const vector &args, const vector &returns) + { + this->name = "rdivscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + RDivScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "rdivscalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=scalar/T1"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; }; #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 4eab0c4d..59dfba65 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -150,29 +150,56 @@ namespace deepx::tf { Param("c", DataCategory::Tensor, Precision::Any), }))); - // opfactory.add_op(Addscalar_miaobyte()); - // opfactory.add_op(Addscalar_miaobyte()); - // opfactory.add_op(Sub_miaobyte()); - // opfactory.add_op(Sub_miaobyte()); - - // opfactory.add_op(Sub_cblas()); - // opfactory.add_op(Sub_cblas()); - - // opfactory.add_op(Mul_miaobyte()); - // opfactory.add_op(Mul_miaobyte()); - - // opfactory.add_op(Mulscalar_miaobyte()); - // opfactory.add_op(Mulscalar_miaobyte()); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, 
Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("b", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); - // opfactory.add_op(Div_miaobyte()); - // opfactory.add_op(Div_miaobyte()); - // opfactory.add_op(Divscalar_miaobyte()); - // opfactory.add_op(Divscalar_miaobyte()); - // opfactory.add_op(RDivscalar_miaobyte()); - // opfactory.add_op(RDivscalar_miaobyte()); + tffactory.add_tf(std::make_shared>(vector( + { + Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); // opfactory.add_op(Sqrt_miaobyte()); // opfactory.add_op(Sqrt_miaobyte()); diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 7a69c776..622463d5 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -349,6 +349,175 @@ namespace deepx::tf } }; + template + class Div : public TF + { + public: + Div(vector args, vector returns) + { + this->name = "div"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=T1/T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + 
{ + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class DivScalar : public TF + { + public: + DivScalar(vector args, vector returns) + { + this->name = "divscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=T1/scalar"; + } + shared_ptr clone() const override + { + return 
make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class RDivScalar : public TF + { + public: + RDivScalar(vector args, vector returns) + { + this->name = "rdivscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=scalar/T1"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = 
mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::rdivscalar( this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + } #endif diff --git a/front/py/examples/2_ir/2_elementwise_operator.dot b/front/py/examples/2_ir/2_elementwise_operator.dot new file mode 100644 index 00000000..b39fa214 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_operator.dot @@ -0,0 +1,64 @@ +// Computational Graph +digraph { + rankdir=TB + node [shape=record] + 134854829346096 [label="t1 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854521156512 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box 
style=filled] + 134854521844832 [label="var_1 +0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462386816 [label="t2 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462387008 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462386624 [label="var_2 +1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462387248 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462387056 [label="t3 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462387344 [label="t4 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462387680 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462387632 [label="var_3 +0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462387488 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462387776 [label="t5 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462388016 [label="t6 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462388400 [label=div color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462388256 [label=rdivscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462388352 [label="var_4 +0.05" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462388688 
[label="t7 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462388832 [label=mulscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462388880 [label="var_5 +2.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854462388736 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134854462389168 [label="t8 +(3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134854521156512 -> 134854829346096 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854521844832 -> 134854521156512 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462387008 -> 134854462386816 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462386624 -> 134854462387008 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854829346096 -> 134854462387248 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462386816 -> 134854462387248 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462387248 -> 134854462387056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462387680 -> 134854462387344 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462387632 -> 134854462387680 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462387344 -> 134854462387488 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462387056 -> 134854462387488 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462387488 -> 134854462387776 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462388400 -> 134854462388016 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854829346096 -> 134854462388400 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462386816 -> 134854462388400 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462388352 -> 134854462388256 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462386816 -> 134854462388256 [arrowsize=0.8 color=gray40 penwidth=1.2] + 
134854462388256 -> 134854462388688 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462388832 -> 134854462388688 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462388688 -> 134854462388832 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462388880 -> 134854462388832 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462388688 -> 134854462388736 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462386816 -> 134854462388736 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134854462388736 -> 134854462389168 [arrowsize=0.8 color=gray40 penwidth=1.2] +} diff --git a/front/py/examples/2_ir/2_elementwise_operator.dot.svg b/front/py/examples/2_ir/2_elementwise_operator.dot.svg new file mode 100644 index 00000000..1c50be16 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_operator.dot.svg @@ -0,0 +1,302 @@ + + + + + + +%3 + + + +134854829346096 + +t1 +(3, 4, 5) + + + +134854462387248 + +add + + + +134854829346096->134854462387248 + + + + + +134854462388400 + +div + + + +134854829346096->134854462388400 + + + + + +134854521156512 + +constant + + + +134854521156512->134854829346096 + + + + + +134854521844832 + +var_1 +0 + + + +134854521844832->134854521156512 + + + + + +134854462386816 + +t2 +(3, 4, 5) + + + +134854462386816->134854462387248 + + + + + +134854462386816->134854462388400 + + + + + +134854462388256 + +rdivscalar + + + +134854462386816->134854462388256 + + + + + +134854462388736 + +mul + + + +134854462386816->134854462388736 + + + + + +134854462387008 + +constant + + + +134854462387008->134854462386816 + + + + + +134854462386624 + +var_2 +1 + + + +134854462386624->134854462387008 + + + + + +134854462387056 + +t3 +(3, 4, 5) + + + +134854462387248->134854462387056 + + + + + +134854462387488 + +add + + + +134854462387056->134854462387488 + + + + + +134854462387344 + +t4 +(3, 4, 5) + + + +134854462387344->134854462387488 + + + + + +134854462387680 + +constant + + + +134854462387680->134854462387344 + + + + + +134854462387632 + +var_3 +0.5 + + + 
+134854462387632->134854462387680 + + + + + +134854462387776 + +t5 +(3, 4, 5) + + + +134854462387488->134854462387776 + + + + + +134854462388016 + +t6 +(3, 4, 5) + + + +134854462388400->134854462388016 + + + + + +134854462388688 + +t7 +(3, 4, 5) + + + +134854462388256->134854462388688 + + + + + +134854462388352 + +var_4 +0.05 + + + +134854462388352->134854462388256 + + + + + +134854462388832 + +mulscalar + + + +134854462388688->134854462388832 + + + + + +134854462388688->134854462388736 + + + + + +134854462388832->134854462388688 + + + + + +134854462388880 + +var_5 +2.5 + + + +134854462388880->134854462388832 + + + + + +134854462389168 + +t8 +(3, 4, 5) + + + +134854462388736->134854462389168 + + + + + diff --git a/front/py/examples/2_ir/2_elementwise_operator.py b/front/py/examples/2_ir/2_elementwise_operator.py index e1e4502e..80ecd6f6 100644 --- a/front/py/examples/2_ir/2_elementwise_operator.py +++ b/front/py/examples/2_ir/2_elementwise_operator.py @@ -8,7 +8,13 @@ torch_t4 = torch.full((3, 4, 5), 0.5) torch_t5 = torch_t4 + torch_t3 print(torch_t5) +torch_t6 = torch_t1 / torch_t2 +print(torch_t6) +torch_t7=0.05/torch_t2*2.5 +print(torch_t7) +torch_t8=torch_t7.mul(torch_t2) +print(torch_t8) ############-------DEEPX-------################ import deepx @@ -20,7 +26,13 @@ t4=deepx.full([3,4,5],value=0.5,name='t4') t5=t4.add(t3,out='t5') print(t5) - +t6=t1.div(t2,out='t6') +print(t6) +t7=t2.rdiv(0.05,out='t7') +t7.mul_(2.5) +print(t7) +t8=t7.mul(t2,out='t8') +print(t8) import os script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名 str=t3.graph.to_dot()