From 62f267248a9b32345699cc552fffb4a99c44346a Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Wed, 2 Apr 2025 00:38:20 +0800 Subject: [PATCH 1/7] excuter(cpu/cuda):max,min,compare; sin,cos,tan; --- .../src/deepx/tensorfunc/elementwise.hpp | 121 +--- excuter/op-mem-cuda/src/client/tfs.cpp | 97 ++- .../tensorfunc/elementwise_miaobyte_basic.cu | 6 +- .../tensorfunc/elementwise_miaobyte_basic.cuh | 3 - .../tensorfunc/elementwise_miaobyte_basic.hpp | 2 +- .../elementwise_miaobyte_compare.cu | 207 ++++++ .../elementwise_miaobyte_compare.cuh | 160 +++++ .../elementwise_miaobyte_compare.hpp | 85 +++ .../tensorfunc/elementwise_miaobyte_sin.cu | 140 ++++ .../tensorfunc/elementwise_miaobyte_sin.cuh | 70 ++ ...sin.hpp.a => elementwise_miaobyte_sin.hpp} | 6 +- .../tensorfunc/elementwise_miaobyte_sqrt.cu | 193 +++--- .../tensorfunc/elementwise_miaobyte_sqrt.cuh | 67 +- .../src/deepx/tf/elementwise_compare.hpp | 378 +++++++++++ .../src/deepx/tf/elementwise_sin.hpp | 191 ++++++ .../src/deepx/tf/elementwise_sqrt.hpp | 98 +-- excuter/op-mem-ompsimd/src/client/tfs.cpp | 48 +- .../deepx/tensorfunc/elementwise_miaobyte.hpp | 412 +---------- excuter/op-mem-ompsimd/src/deepx/tf/a.zip | Bin 11358 -> 0 bytes .../src/deepx/tf/elementwise.hpp | 465 +++++++++++-- .../src/deepx/tf/elementwise.hpp.a | 427 ------------ .../src/deepx/tf/elementwise_cblas.hpp.a | 82 --- .../src/deepx/tf/elementwise_miaobyte.hpp.a | 637 ------------------ .../op-mem-ompsimd/src/deepx/tf/matmul.hpp.a | 69 -- .../test/tensorfunc/4_tensor_mul.cpp | 20 +- front/py/deepx/autograd/__init__.py | 4 +- front/py/deepx/autograd/function.py | 34 + front/py/deepx/nn/functional/elementwise.py | 211 +++++- front/py/examples/2_ir/3_matmul.dot | 30 +- front/py/examples/2_ir/3_matmul.dot.svg | 60 +- 30 files changed, 2200 insertions(+), 2123 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu create mode 100644 
excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh rename excuter/op-mem-cuda/src/deepx/tensorfunc/{elementwise_miaobyte_sin.hpp.a => elementwise_miaobyte_sin.hpp} (95%) create mode 100644 excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp create mode 100644 excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp delete mode 100644 excuter/op-mem-ompsimd/src/deepx/tf/a.zip delete mode 100644 excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp.a delete mode 100644 excuter/op-mem-ompsimd/src/deepx/tf/elementwise_cblas.hpp.a delete mode 100644 excuter/op-mem-ompsimd/src/deepx/tf/elementwise_miaobyte.hpp.a delete mode 100644 excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp.a create mode 100644 front/py/deepx/autograd/function.py diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index 4e0edc6e..c92b15ae 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -87,42 +87,8 @@ namespace deepx::tensorfunc mulscalarDispatcher::mulscalar(input, value, output); } - template - struct muladdDispatcher - { - static void muladd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) = delete; - }; - - template - void muladd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) - { - muladdDispatcher::muladd(A, B, C, D); - } - - template - struct muladdscalarDispatcher - { - static void muladdscalar(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) = delete; - }; - - template - void muladdscalar(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) - 
{ - muladdscalarDispatcher::muladdscalar(A, B, alpha, C, beta, D); - } - - template - struct mulscalaraddDispatcher - { - static void mulscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) = delete; - }; - - template - void mulscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) - { - mulscalaraddDispatcher::mulscalaradd(A, alpha, B, beta, C); - } - + + template struct divDispatcher { @@ -159,42 +125,7 @@ namespace deepx::tensorfunc rdivscalarDispatcher::rdivscalar(value, input, output); } - template - struct divaddDispatcher - { - static void divadd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) = delete; - }; - - template - void divadd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) - { - divaddDispatcher::divadd(A, B, C, D); - } - - template - struct divscalaraddDispatcher - { - static void divscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) = delete; - }; - - template - void divscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) - { - divscalaraddDispatcher::divscalaradd(A, alpha, B, beta, C); - } - - template - struct divaddbetaDispatcher - { - static void divaddbeta(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) = delete; - }; - - template - void divaddbeta(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) - { - divaddbetaDispatcher::divaddbeta(A, B, alpha, C, beta, D); - } - + template struct sqrtDispatcher { @@ -303,17 +234,7 @@ namespace deepx::tensorfunc maxDispatcher::max(A, B, C); } - template - struct maxgradDispatcher - { - static void maxgrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) = delete; - }; - - template - void maxgrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) - { - maxgradDispatcher::maxgrad(A, B, 
A_grad, B_grad, output_grad); - } + template struct maxscalarDispatcher @@ -327,17 +248,7 @@ namespace deepx::tensorfunc maxscalarDispatcher::maxscalar(A, b, C); } - template - struct maxscalargradDispatcher - { - static void maxscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) = delete; - }; - - template - void maxscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) - { - maxscalargradDispatcher::maxscalargrad(A, b, A_grad, output_grad); - } + template struct minDispatcher @@ -351,18 +262,6 @@ namespace deepx::tensorfunc minDispatcher::min(A, B, C); } - template - struct mingradDispatcher - { - static void mingrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) = delete; - }; - - template - void mingrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) - { - mingradDispatcher::mingrad(A, B, A_grad, B_grad, output_grad); - } - template struct minscalarDispatcher { @@ -374,17 +273,17 @@ namespace deepx::tensorfunc { minscalarDispatcher::minscalar(A, b, C); } - + template - struct minscalargradDispatcher + struct compareDispatcher { - static void minscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) = delete; + static void compare(const Tensor &A, const Tensor &B, Tensor &mask) = delete; }; template - void minscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) + void compare(const Tensor &A, const Tensor &B,Tensor &mask) { - minscalargradDispatcher::minscalargrad(A, b, A_grad, output_grad); + compareDispatcher::compare(A, B, mask); } } // namespace deepx::tensorfunc diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 473e55d3..2d8b0c5d 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -5,6 +5,8 @@ #include "deepx/tf/init.hpp" #include 
"deepx/tf/elementwise_basic.hpp" #include "deepx/tf/elementwise_sqrt.hpp" +#include "deepx/tf/elementwise_sin.hpp" +#include "deepx/tf/elementwise_compare.hpp" #include "deepx/tf/matmul.hpp" #include "deepx/dtype.hpp" #include "deepx/tf/tffactory.hpp" @@ -192,48 +194,117 @@ namespace deepx::tf tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Any), + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32 ), + Param("B", DataCategory::Tensor, Precision::Float64|Precision::Float32 ), }), vector( { - Param("C", DataCategory::Tensor, Precision::Any), + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("scalar", DataCategory::Var, Precision::Float64|Precision::Float32), }), vector( { - Param("C", DataCategory::Tensor, Precision::Any), + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Any), + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), }))); 
tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Any), + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), }))); - } + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + 
Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Int8), + }))); + } // matmul void register_matmul(TfFactory &tffactory) { diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index 6d8e73ae..a5371005 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -1,5 +1,5 @@ -#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH -#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -281,4 +281,4 @@ namespace deepx::tensorfunc } -#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh index 2457a510..0f4da083 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh @@ -3,9 +3,6 @@ #include #include - - -#include "deepx/tensorfunc/elementwise.hpp" #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp index 
72d0c32b..e263b65b 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp @@ -139,7 +139,7 @@ namespace deepx::tensorfunc launch_rdivscalar(numBlocks, blockSize, scalar, A.data, C.data, A.shape.size); } }; - + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu new file mode 100644 index 00000000..141a4889 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu @@ -0,0 +1,207 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU + +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + template + __global__ void max_kernel(const T* A, const T* B, T* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] > B[idx] ? 
A[idx] : B[idx]; + } + } + + template __global__ void max_kernel(const double* A, const double* B, double* C, const int size); + template __global__ void max_kernel(const float* A, const float* B, float* C, const int size); + template __global__ void max_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); + template __global__ void max_kernel<__half>(const __half* A, const __half* B, __half* C, const int size); + template __global__ void max_kernel(const int64_t* A, const int64_t* B, int64_t* C, const int size); + template __global__ void max_kernel(const int32_t* A, const int32_t* B, int32_t* C, const int size); + template __global__ void max_kernel(const int16_t* A, const int16_t* B, int16_t* C, const int size); + template __global__ void max_kernel(const int8_t* A, const int8_t* B, int8_t* C, const int size); + + template + void launch_max(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size) + { + max_kernel<<>>(A, B, C, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_max(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size); + template void launch_max(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size); + template void launch_max(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); + template void launch_max<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size); + template void launch_max(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size); + template void launch_max(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size); + template void launch_max(int numBlocks, int blockSize, const int16_t* A, 
const int16_t* B, int16_t* C, const int size); + template void launch_max(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); + + template + __global__ void maxscalar_kernel(const T* A, const T scalar, T* C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] > scalar ? A[idx] : scalar; + } + } + + template __global__ void maxscalar_kernel(const double* A, const double scalar, double* C, const int size); + template __global__ void maxscalar_kernel(const float* A, const float scalar, float* C, const int size); + template __global__ void maxscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size); + template __global__ void maxscalar_kernel<__half>(const __half* A, const __half scalar, __half* C, const int size); + template __global__ void maxscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C, const int size); + template __global__ void maxscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C, const int size); + template __global__ void maxscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C, const int size); + template __global__ void maxscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C, const int size); + + template + void launch_maxscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size) + { + maxscalar_kernel<<>>(A, scalar, C, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_maxscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size); + template void launch_maxscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size); + template void launch_maxscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const 
nv_bfloat16 scalar, nv_bfloat16* C, const int size); + template void launch_maxscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size); + template void launch_maxscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size); + template void launch_maxscalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size); + template void launch_maxscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size); + template void launch_maxscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); + + template + __global__ void min_kernel(const T* A, const T* B, T* C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] < B[idx] ? A[idx] : B[idx]; + } + } + + template __global__ void min_kernel(const double* A, const double* B, double* C, const int size); + template __global__ void min_kernel(const float* A, const float* B, float* C, const int size); + template __global__ void min_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); + template __global__ void min_kernel<__half>(const __half* A, const __half* B, __half* C, const int size); + template __global__ void min_kernel(const int64_t* A, const int64_t* B, int64_t* C, const int size); + template __global__ void min_kernel(const int32_t* A, const int32_t* B, int32_t* C, const int size); + template __global__ void min_kernel(const int16_t* A, const int16_t* B, int16_t* C, const int size); + template __global__ void min_kernel(const int8_t* A, const int8_t* B, int8_t* C, const int size); + + template + void launch_min(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size) + { + min_kernel<<>>(A, B, C, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw 
std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_min(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size); + template void launch_min(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size); + template void launch_min(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); + template void launch_min<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size); + template void launch_min(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size); + template void launch_min(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size); + template void launch_min(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int16_t* C, const int size); + template void launch_min(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); + + template + __global__ void minscalar_kernel(const T* A, const T scalar, T* C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = A[idx] < scalar ? 
A[idx] : scalar; + } + } + + template __global__ void minscalar_kernel(const double* A, const double scalar, double* C, const int size); + template __global__ void minscalar_kernel(const float* A, const float scalar, float* C, const int size); + template __global__ void minscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size); + template __global__ void minscalar_kernel<__half>(const __half* A, const __half scalar, __half* C, const int size); + template __global__ void minscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C, const int size); + template __global__ void minscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C, const int size); + template __global__ void minscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C, const int size); + template __global__ void minscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C, const int size); + + template + void launch_minscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size) + { + minscalar_kernel<<>>(A, scalar, C, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_minscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size); + template void launch_minscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size); + template void launch_minscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size); + template void launch_minscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size); + template void launch_minscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size); + template void launch_minscalar(int numBlocks, 
int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size); + template void launch_minscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size); + template void launch_minscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); + + template + __global__ void compare_kernel(const T* A, const T* B, int8_t* mask, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + if (A[idx] == B[idx]) { + mask[idx] = 2; + } else if (A[idx] > B[idx]) { + mask[idx] = 1; + } else { + mask[idx] = 0; + } + } + } + + template __global__ void compare_kernel(const double* A, const double* B, int8_t* mask, const int size); + template __global__ void compare_kernel(const float* A, const float* B, int8_t* mask, const int size); + template __global__ void compare_kernel(const nv_bfloat16* A, const nv_bfloat16* B, int8_t* mask, const int size); + template __global__ void compare_kernel<__half>(const __half* A, const __half* B, int8_t* mask, const int size); + template __global__ void compare_kernel(const int64_t* A, const int64_t* B, int8_t* mask, const int size); + template __global__ void compare_kernel(const int32_t* A, const int32_t* B, int8_t* mask, const int size); + template __global__ void compare_kernel(const int16_t* A, const int16_t* B, int8_t* mask, const int size); + template __global__ void compare_kernel(const int8_t* A, const int8_t* B, int8_t* mask, const int size); + + template + void launch_compare(int numBlocks, int blockSize, const T* A, const T* B, int8_t* mask, const int size) + { + compare_kernel<<>>(A, B, mask, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, int8_t* mask, const int 
size); + template void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, int8_t* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, int8_t* mask, const int size); + template void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, int8_t* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int8_t* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int8_t* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int8_t* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* mask, const int size); + + +}; + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh new file mode 100644 index 00000000..d3976947 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh @@ -0,0 +1,160 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CUH +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CUH + +#include +#include +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +namespace deepx::tensorfunc +{ + template + __global__ void max_kernel(const T* A, const T* B, T* C, const int size); + + template + void launch_max(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size); + + template <> + void launch_max(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size); + + template <> + void launch_max(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size); 
+ + template <> + void launch_max(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); + + template <> + void launch_max<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size); + + template <> + void launch_max(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size); + + template <> + void launch_max(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size); + + template <> + void launch_max(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int16_t* C, const int size); + + template <> + void launch_max(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); + + template + __global__ void maxscalar_kernel(const T* A, const T scalar, T* C, const int size); + + template + void launch_maxscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size); + + template <> + void launch_maxscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size); + + template <> + void launch_maxscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size); + + template <> + void launch_maxscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size); + + template <> + void launch_maxscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size); + + template <> + void launch_maxscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size); + + template <> + void launch_maxscalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size); + + template <> + void launch_maxscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size); + + template 
<> + void launch_maxscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); + + template + __global__ void min_kernel(const T* A, const T* B, T* C, const int size); + + template + void launch_min(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size); + + template <> + void launch_min(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size); + + template <> + void launch_min(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size); + + template <> + void launch_min(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); + + template <> + void launch_min<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size); + + template <> + void launch_min(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size); + + template <> + void launch_min(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size); + + template <> + void launch_min(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int16_t* C, const int size); + + template <> + void launch_min(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); + + template + __global__ void minscalar_kernel(const T* A, const T scalar, T* C, const int size); + + template + void launch_minscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size); + + template <> + void launch_minscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size); + + template <> + void launch_minscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size); + + template <> + void launch_minscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size); + + 
template <> + void launch_minscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size); + + template <> + void launch_minscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size); + + template <> + void launch_minscalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size); + + template <> + void launch_minscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size); + + template <> + void launch_minscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); + + template + __global__ void compare_kernel(const T* A, const T* B, int8_t* mask, const int size); + + template + void launch_compare(int numBlocks, int blockSize, const T* A, const T* B, int8_t* mask, const int size); + + template <> + void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, int8_t* mask, const int size); + + template <> + void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, int8_t* mask, const int size); + + template <> + void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, int8_t* mask, const int size); + + template <> + void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, int8_t* mask, const int size); + + template <> + void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int8_t* mask, const int size); + + template <> + void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int8_t* mask, const int size); + + template <> + void launch_compare(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int8_t* mask, const int size); + + template <> + void launch_compare(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* mask, const int size); 
+} +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp new file mode 100644 index 00000000..3b1b16b9 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp @@ -0,0 +1,85 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_HPP +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_HPP + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/elementwise_miaobyte_compare.cuh" +#include "stdutil/error.hpp" + +namespace deepx::tensorfunc +{ + // Dispatchers forwarding to the launch_* wrappers declared in elementwise_miaobyte_compare.cuh; one element count (A.shape.size) is passed for all operands, so every tensor operand must match it. + + + template + struct maxDispatcher + { + static void max(const Tensor &A, const Tensor &B, Tensor &C) + { + if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { + throw TensorShapeError("max"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_max(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + } + }; + + template + struct maxscalarDispatcher + { + static void maxscalar(const Tensor &A, const T scalar, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("maxscalar"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_maxscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + } + }; + + template + struct minDispatcher + { + static void min(const Tensor &A, const Tensor &B, Tensor &C) + { + if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { + throw TensorShapeError("min"); + } + const int blockSize = A.shape.size > 256 ?
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_min(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + } + }; + + template + struct minscalarDispatcher + { + static void minscalar(const Tensor &A, const T scalar, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("minscalar"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_minscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + } + }; + template + struct compareDispatcher + { + static void compare(const Tensor &A, const Tensor &B, Tensor &mask) + { + if (A.shape.size != B.shape.size || A.shape.size != mask.shape.size) { + throw TensorShapeError("compare"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_compare(numBlocks, blockSize, A.data, B.data, mask.data, A.shape.size); + } + }; +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu new file mode 100644 index 00000000..b45ff9a3 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -0,0 +1,140 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU + +#include +#include + + +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + // sin + template + __global__ void sin_kernel(const T* A, T* C, const int size); + + template <> + __global__ void sin_kernel(const double* A, double* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = sin(A[idx]); + } + } + template <> + __global__ void sin_kernel(const float* A, float* C, const 
int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = sinf(A[idx]); + } + } + template <> + __global__ void sin_kernel(const nv_bfloat16* A, nv_bfloat16* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hsin(A[idx]); + } + } + template <> + __global__ void sin_kernel<__half>(const __half* A, __half* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hsin(A[idx]); + } + } + + template + void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size){ + sin_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch sin kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); + template void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); + template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + template void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); + + // cos + template + __global__ void cos_kernel(const T* A, T* C, const int size); + template <> + __global__ void cos_kernel(const double* A, double* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = cos(A[idx]); + } + } + template <> + __global__ void cos_kernel(const float* A, float* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = cosf(A[idx]); + } + } + template <> + __global__ void cos_kernel(const nv_bfloat16* A, nv_bfloat16* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hcos(A[idx]); + } + } + template <> + __global__ void cos_kernel<__half>(const __half* A, __half* C, 
const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hcos(A[idx]); + } + } + + template + void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size){ + cos_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch cos kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_cos(int numBlocks, int blockSize, const double* a, double* c, const int size); + template void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); + template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + template void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); + + // tan + template + __global__ void tan_kernel(const T* A, T* C, const int size); + template <> + __global__ void tan_kernel(const double* A, double* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = tan(A[idx]); + } + } + template <> + __global__ void tan_kernel(const float* A, float* C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = tanf(A[idx]); + } + } + + + template + void launch_tan(int numBlocks, int blockSize, const T* a, T* c, const int size){ + tan_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch tan kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_tan(int numBlocks, int blockSize, const double* a, double* c, const int size); + template void launch_tan(int numBlocks, int blockSize, const float* a, float* c, const int size); + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh 
b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh new file mode 100644 index 00000000..a66e996b --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh @@ -0,0 +1,70 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CUH +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CUH + + #include +#include + + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + template + __global__ void sin_kernel(const T* A, T* C, const int size); + + template + void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size); + + template <> + void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); + + template <> + void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); + + template <> + void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + + template <> + void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); + + template + __global__ void cos_kernel(const T* A, T* C, const int size); + + template + void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size); + + template <> + void launch_cos(int numBlocks, int blockSize, const double* a, double* c, const int size); + + template <> + void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); + + template <> + void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + + template <> + void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); + + template + __global__ void tan_kernel(const T* A, T* C, const int size); + + template + void launch_tan(int numBlocks, int blockSize, const T* a, T* c, const int size); + + template <> + void launch_tan(int numBlocks, int 
blockSize, const double* a, double* c, const int size); + + template <> + void launch_tan(int numBlocks, int blockSize, const float* a, float* c, const int size); + + template <> + void launch_tan(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + + template <> + void launch_tan<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); + +} + +#endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp similarity index 95% rename from excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a rename to excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp index f31973f3..4a71c664 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp @@ -4,15 +4,13 @@ #include "deepx/tensorfunc/elementwise.hpp" #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" -#include "deepx/tensorfunc/elementwise_miaobyte_basic.cuh" +#include "deepx/tensorfunc/elementwise_miaobyte_sin.cuh" #include "stdutil/error.hpp" namespace deepx::tensorfunc { - // CUDA kernel函数声明 - - + template struct sinDispatcher { diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index a808d5bc..cc94ac00 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -1,5 +1,5 @@ -#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH -#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU #include "deepx/tensorfunc/cuda.hpp" #include 
"deepx/tensorfunc/authors.hpp" @@ -9,21 +9,37 @@ namespace deepx::tensorfunc { // sqrt template - __global__ void sqrt_kernel(const T* A, T* C,const int size){ + __global__ void sqrt_kernel(const T* A, T* C,const int size); + template <> + __global__ void sqrt_kernel(const double* A, double* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = sqrt(A[idx]); + } + } + template <> + __global__ void sqrt_kernel(const float* A, float* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { C[idx] = sqrtf(A[idx]); } } - template __global__ void sqrt_kernel(const double* A, double* C,const int size); - template __global__ void sqrt_kernel(const float* A, float* C,const int size); - // template __global__ void sqrt_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); - // template __global__ void sqrt_kernel<__half>(const __half* A, __half* C,const int size); - template __global__ void sqrt_kernel(const int64_t* A, int64_t* C,const int size); - template __global__ void sqrt_kernel(const int32_t* A, int32_t* C,const int size); - template __global__ void sqrt_kernel(const int16_t* A, int16_t* C,const int size); - template __global__ void sqrt_kernel(const int8_t* A, int8_t* C,const int size); + template <> + __global__ void sqrt_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hsqrt(A[idx]); + } + } + template <> + __global__ void sqrt_kernel<__half>(const __half* A, __half* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hsqrt(A[idx]); + } + } + template void launch_sqrt(int numBlocks, int blockSize, const T* a, T* c,const int size){ sqrt_kernel<<>>(a, c, size); @@ -35,30 +51,27 @@ namespace deepx::tensorfunc } template void launch_sqrt(int numBlocks, int blockSize, const double* a, double* c,const int size); template void launch_sqrt(int 
numBlocks, int blockSize, const float* a, float* c,const int size); - // template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - // template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - template void launch_sqrt(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); - template void launch_sqrt(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); - template void launch_sqrt(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); - template void launch_sqrt(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); - + template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + // pow template - __global__ void pow_kernel(const T* A, const T* B, T* C,const int size){ + __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); + template <> + __global__ void pow_kernel(const double* A, const double* B, double* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = pow(A[idx], B[idx]); + } + } + template <> + __global__ void pow_kernel(const float* A, const float* B, float* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { C[idx] = powf(A[idx], B[idx]); } } - template __global__ void pow_kernel(const double* A, const double* B, double* C,const int size); - template __global__ void pow_kernel(const float* A, const float* B, float* C,const int size); - // template __global__ void pow_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); - // template __global__ void pow_kernel<__half>(const __half* A, const __half* B, __half* C,const int size); - template __global__ void pow_kernel(const int64_t* A, 
const int64_t* B, int64_t* C,const int size); - template __global__ void pow_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); - template __global__ void pow_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); - template __global__ void pow_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); template void launch_pow(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size){ @@ -71,16 +84,19 @@ namespace deepx::tensorfunc } template void launch_pow(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); template void launch_pow(int numBlocks, int blockSize, const float* a, const float* b, float* c,const int size); - // template void launch_pow(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); - // template void launch_pow<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); - template void launch_pow(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); - template void launch_pow(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); - template void launch_pow(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); - template void launch_pow(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); - + // powscalar template - __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size){ + __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); + template <> + __global__ void powscalar_kernel(const double* A, const double scalar, double* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = pow(A[idx], scalar); + } + } + template <> + __global__ void powscalar_kernel(const float* A, const float scalar, float* C,const int 
size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { C[idx] = powf(A[idx], scalar); @@ -88,13 +104,7 @@ namespace deepx::tensorfunc } template __global__ void powscalar_kernel(const double* A, const double scalar, double* C,const int size); template __global__ void powscalar_kernel(const float* A, const float scalar, float* C,const int size); - // template __global__ void powscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); - // template __global__ void powscalar_kernel<__half>(const __half* A, const __half scalar, __half* C,const int size); - template __global__ void powscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); - template __global__ void powscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); - template __global__ void powscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); - template __global__ void powscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); - + template void launch_powscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size){ powscalar_kernel<<>>(a, scalar, c, size); @@ -106,30 +116,39 @@ namespace deepx::tensorfunc } template void launch_powscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int size); template void launch_powscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); - // template void launch_powscalar(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); - // template void launch_powscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); - template void launch_powscalar(int numBlocks, int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); - template void launch_powscalar(int numBlocks, int blockSize, 
const int32_t* a, const int32_t scalar, int32_t* c,const int size); - template void launch_powscalar(int numBlocks, int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); - template void launch_powscalar(int numBlocks, int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); - + // log template - __global__ void log_kernel(const T* A, T* C,const int size){ + __global__ void log_kernel(const T* A, T* C,const int size); + template <> + __global__ void log_kernel(const double* A, double* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = logf(A[idx]); + C[idx] = log(A[idx]); } } - template __global__ void log_kernel(const double* A, double* C,const int size); - template __global__ void log_kernel(const float* A, float* C,const int size); - // template __global__ void log_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); - // template __global__ void log_kernel<__half>(const __half* A, __half* C,const int size); - template __global__ void log_kernel(const int64_t* A, int64_t* C,const int size); - template __global__ void log_kernel(const int32_t* A, int32_t* C,const int size); - template __global__ void log_kernel(const int16_t* A, int16_t* C,const int size); - template __global__ void log_kernel(const int8_t* A, int8_t* C,const int size); - + template <> + __global__ void log_kernel(const float* A, float* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = logf(A[idx]); + } + } + template <> + __global__ void log_kernel<__half>(const __half* A, __half* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hlog(A[idx]); + } + } + template <> + __global__ void log_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hlog(A[idx]); + } + } + template void launch_log(int numBlocks, int blockSize, const T* a, 
T* c,const int size){ log_kernel<<>>(a, c, size); @@ -141,30 +160,41 @@ namespace deepx::tensorfunc } template void launch_log(int numBlocks, int blockSize, const double* a, double* c,const int size); template void launch_log(int numBlocks, int blockSize, const float* a, float* c,const int size); - // template void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - // template void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - template void launch_log(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); - template void launch_log(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); - template void launch_log(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); - template void launch_log(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); - + template void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + template void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + // exp template - __global__ void exp_kernel(const T* A, T* C,const int size){ + __global__ void exp_kernel(const T* A, T* C,const int size); + template <> + __global__ void exp_kernel(const double* A, double* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = exp(A[idx]); + } + } + template <> + __global__ void exp_kernel(const float* A, float* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { C[idx] = expf(A[idx]); } } - template __global__ void exp_kernel(const double* A, double* C,const int size); - template __global__ void exp_kernel(const float* A, float* C,const int size); - // template __global__ void exp_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); - // template __global__ void exp_kernel<__half>(const __half* A, __half* 
C,const int size); - template __global__ void exp_kernel(const int64_t* A, int64_t* C,const int size); - template __global__ void exp_kernel(const int32_t* A, int32_t* C,const int size); - template __global__ void exp_kernel(const int16_t* A, int16_t* C,const int size); - template __global__ void exp_kernel(const int8_t* A, int8_t* C,const int size); - + template <> + __global__ void exp_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hexp(A[idx]); + } + } + template <> + __global__ void exp_kernel<__half>(const __half* A, __half* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hexp(A[idx]); + } + } + template void launch_exp(int numBlocks, int blockSize, const T* a, T* c,const int size){ exp_kernel<<>>(a, c, size); @@ -176,13 +206,8 @@ namespace deepx::tensorfunc } template void launch_exp(int numBlocks, int blockSize, const double* a, double* c,const int size); template void launch_exp(int numBlocks, int blockSize, const float* a, float* c,const int size); - // template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - // template void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - template void launch_exp(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); - template void launch_exp(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); - template void launch_exp(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); - template void launch_exp(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); - + template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + template void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); } -#endif // 
DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh index dd428cbd..341a0295 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh @@ -3,8 +3,6 @@ #include #include - -#include "deepx/tensorfunc/elementwise.hpp" #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -29,18 +27,7 @@ namespace deepx::tensorfunc template <> void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - template <> - void launch_sqrt(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); - - template <> - void launch_sqrt(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); - - template <> - void launch_sqrt(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); - - template <> - void launch_sqrt(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); - + // pow template __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); @@ -60,18 +47,7 @@ namespace deepx::tensorfunc template <> void launch_pow<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); - template <> - void launch_pow(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); - - template <> - void launch_pow(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); - - template <> - void launch_pow(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); - - template <> - void launch_pow(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); - + // powscalar 
template __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); @@ -91,18 +67,7 @@ namespace deepx::tensorfunc template <> void launch_powscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); - template <> - void launch_powscalar(int numBlocks, int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); - - template <> - void launch_powscalar(int numBlocks, int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); - - template <> - void launch_powscalar(int numBlocks, int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); - - template <> - void launch_powscalar(int numBlocks, int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); - + // log template __global__ void log_kernel(const T* A, T* C,const int size); @@ -121,19 +86,7 @@ namespace deepx::tensorfunc template <> void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - - template <> - void launch_log(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); - - template <> - void launch_log(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); - - template <> - void launch_log(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); - - template <> - void launch_log(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); - + // exp template __global__ void exp_kernel(const T* A, T* C,const int size); @@ -153,17 +106,7 @@ namespace deepx::tensorfunc template <> void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - template <> - void launch_exp(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); - - template <> - void launch_exp(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); - - template <> - void launch_exp(int 
numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); - - template <> - void launch_exp(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp new file mode 100644 index 00000000..c29f3cfb --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp @@ -0,0 +1,378 @@ +#ifndef DEEPX_TF_ELEMENTWISE_COMPARE_HPP +#define DEEPX_TF_ELEMENTWISE_COMPARE_HPP + +#include +#include +#include "deepx/tensorfunc/elementwise_miaobyte_compare.hpp" + +namespace deepx::tf +{ + + template + class Max : public TF + { + public: + Max(const vector &args, const vector &returns) + { + this->name = "max"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Max(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "max") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=max(T1, T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case 
Precision::Float16: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class MaxScalar : public TF + { + public: + MaxScalar(const vector &args, const vector &returns) + { + this->name = "maxscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + MaxScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "maxscalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=max(T1, scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != 
c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Min : public TF + { + public: + Min(const vector &args, const vector &returns) + { + this->name = "min"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Min(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "min") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string 
math_formula() const override + { + return "T3=min(T1, T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type || b_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + 
tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class MinScalar : public TF + { + public: + MinScalar(const vector &args, const vector &returns) + { + this->name = "minscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + MinScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "minscalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=min(T1, scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::minscalar (*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Compare : public TF + { + public: + Compare(const vector &args, const vector &returns) + { + this->name = "compare"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Compare(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "compare") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "mask=compare(T1, T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; +}; +#endif // DEEPX_TF_ELEMENTWISE_COMPARE_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp new file mode 100644 index 00000000..bea0a9f4 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp @@ -0,0 +1,191 @@ +#ifndef DEEPX_TF_ELEMENTWISE_SIN_HPP +#define DEEPX_TF_ELEMENTWISE_SIN_HPP + +#include +#include +#include "deepx/tensorfunc/elementwise_miaobyte_sin.hpp" + +namespace deepx::tf +{ + + template + class Sin : public TF + { + public: + Sin(const vector &args, const vector &returns) + { + this->name = "sin"; + this->author = Author::name(); + this->args = args; + 
this->returns = returns; + } + + Sin(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "sin") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=sin(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::sin(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::sin(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::sin(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::sin(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Cos : public TF + { + public: + Cos(const vector &args, const vector &returns) + { + this->name = "cos"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Cos(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "cos") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=cos(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision 
a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + // cos is unary (T3=cos(T1)): the only operands are args[0] and returns[0], so args[1] must not be read. + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::cos(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::cos(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::cos(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::cos(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Tan : public TF + { + public: + Tan(const vector &args, const vector &returns) + { + this->name = "tan"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Tan(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "tan") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=tan(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + // tan is unary (T3=tan(T1)): the only operands are args[0] and returns[0], so args[1] must not be read. + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type !=
c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::tan(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::tan(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; +}; +#endif // DEEPX_TF_ELEMENTWISE_SIN_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp index 3865c03b..204fae9e 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp @@ -55,23 +55,11 @@ namespace deepx::tf case Precision::Float32: tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - // case Precision::Float16: - // tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - // case Precision::Float16: - // tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - case Precision::Int64: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + case Precision::Float16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int32: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: -
tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + case Precision::BFloat16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(a_type); @@ -129,24 +117,7 @@ namespace deepx::tf case Precision::Float32: tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - // case Precision::BFloat16: - // tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - // case Precision::Float16: - // tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - case Precision::Int64: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -203,24 +174,7 @@ namespace deepx::tf case Precision::Float32: tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; - // case Precision::BFloat16: - // 
tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - // break; - // case Precision::Float16: - // tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - // break; - case Precision::Int64: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; + default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -276,23 +230,11 @@ namespace deepx::tf case Precision::Float32: tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - // case Precision::Float16: - // tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - // case Precision::BFloat16: - // tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - case Precision::Int64: - tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + case Precision::Float16: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int16: - 
tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + case Precision::BFloat16: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(a_type); @@ -349,23 +291,11 @@ namespace deepx::tf case Precision::Float32: tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - // case Precision::Float16: - // tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - // case Precision::BFloat16: - // tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - // break; - case Precision::Int64: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + case Precision::Float16: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int8: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + case Precision::BFloat16: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(a_type); diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index afd1ee0a..add0e9e6 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ 
b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -241,7 +241,53 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - } + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Int8), + }))); + } // matmul void register_matmul(TfFactory &tffactory) { diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index f7bacc0b..a410ddd4 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -223,159 +223,8 @@ namespace deepx::tensorfunc } }; - template - struct muladdDispatcher - { - // A*B+C=D - static void muladd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) - { - - if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) - { - 
D.shape.rangeParallel(D.shape.dim - 1, [&A, &B, &C, &D](int i) - { - int shape_last=D.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j=0; - - // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + i + j)) { - D.data[i+j] = A.data[i+j] * B.data[i+j] + C.data[i+j]; - ++j; - } - - // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); - for (; j+lanes<=aligned_end; j += lanes ) - { - auto vec1 = Load(tag, A.data + i + j); - auto vec2 = Load(tag, B.data + i + j); - auto vec3 = Load(tag, C.data + i + j); - auto vec_result = MulAdd(vec1, vec2, vec3); - Store(vec_result, tag, D.data + i + j); - } - - // 3. 处理尾部剩余元素 - for (;j - struct muladdscalarDispatcher - { - // A*B*alpha+C*beta=D - static void muladdscalar(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) - { - if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) - { - D.shape.rangeParallel(D.shape.dim - 1, [&A, &B, &alpha, &C, &beta, &D](int i) - { - int shape_last=D.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j=0; - - // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + i + j)) { - D.data[i+j] = alpha * A.data[i+j] * B.data[i+j] + beta * C.data[i+j]; - ++j; - } - - // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); - for (; j+lanes<=aligned_end; j += lanes ) - { - auto vec1 = Load(tag, A.data + i + j); - auto vec2 = Load(tag, B.data + i + j); - auto alpha_vec = Set(tag, alpha); - auto beta_vec = Set(tag, beta); - if (alpha != 1.0) - { - vec1 = Mul(vec1, alpha_vec); - } - if (beta != 0.0) - { - auto vec3 = Load(tag, C.data + i + j); - vec3 = Mul(vec3, beta_vec); - auto vec_result = MulAdd(vec1, vec2, vec3); - Store(vec_result, tag, D.data + i + j); - }else{ - auto vec_result = Mul(vec1, vec2); - Store(vec_result, tag, D.data + i + j); - } - - } - - // 3. 
处理尾部剩余元素 - for (;j - struct mulscalaraddDispatcher - { - // A*alpha+B*beta=C - static void mulscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) - { - if (A.shape == B.shape && A.shape == C.shape) - { - C.shape.rangeParallel(C.shape.dim - 1, [&A, &alpha, &B, &beta, &C](int i) - { - int shape_last=C.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j=0; - - // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + i + j)) { - C.data[i+j] = alpha * A.data[i+j] + beta * B.data[i+j]; - ++j; - } - - // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); - for (; j+lanes<=aligned_end; j += lanes ) - { - auto vec_a = Load(tag, A.data + i + j); - auto alpha_vec = Set(tag, alpha); - vec_a=Mul(vec_a,alpha_vec); - auto vec_b = Load(tag, B.data + i + j); - auto beta_vec = Set(tag, beta); - vec_b=Mul(vec_b,beta_vec); - auto vec_c = Load(tag, C.data + i + j); - auto vec_result = Add(vec_a, vec_b); - Store(vec_result, tag, C.data + i + j); - } - - // 3. 处理尾部剩余元素 - for (;j @@ -438,151 +287,7 @@ namespace deepx::tensorfunc } }; - template - struct divaddDispatcher - { - // D= A/B+ C - static void divadd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) - { - if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) - { - D.shape.rangeParallel(D.shape.dim - 1, [&A, &B, &C, &D](int i) - { - int shape_last=D.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j=0; - - // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + i + j)) { - D.data[i+j] = A.data[i+j] / B.data[i+j] + C.data[i+j]; - ++j; - } - - // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); - for (; j+lanes<=aligned_end; j += lanes ) - { - auto vec1 = Load(tag, A.data + i + j); - auto vec2 = Load(tag, B.data + i + j); - auto vec3 = Load(tag, C.data + i + j); - auto vec_result = Add(Div(vec1, vec2), vec3); - Store(vec_result, tag, D.data + i + j); - } - - // 3. 
处理尾部剩余元素 - for (;j - struct divscalaraddDispatcher - { - // C= A/alpha+ B/beta - static void divscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) - { - if (A.shape == B.shape && A.shape == C.shape) - { - C.shape.rangeParallel(C.shape.dim - 1, [&A, &alpha, &B, &beta, &C](int i) - { - int shape_last=C.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j=0; - - // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + i + j)) { - C.data[i+j] = A.data[i+j] / alpha + B.data[i+j] / beta; - ++j; - } - - // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); - for (; j+lanes<=aligned_end; j += lanes ) - { - auto vec_a = Load(tag, A.data + i + j); - auto alpha_vec = Set(tag, alpha); - vec_a=Div(vec_a,alpha_vec); - auto vec_b = Load(tag, B.data + i + j); - auto beta_vec = Set(tag, beta); - vec_b=Div(vec_b,beta_vec); - auto vec_c = Load(tag, C.data + i + j); - auto vec_result = Add(vec_a, vec_b); - Store(vec_result, tag, C.data + i + j); - } - - // 3. 处理尾部剩余元素 - for (;j - struct divaddbetaDispatcher - { - // D= A/B*alpha+ C*beta - static void divaddbeta(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) - { - if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) - { - D.shape.rangeParallel(D.shape.dim - 1, [&A, &alpha, &B, &beta, &C, &D](int i) - { - int shape_last=D.shape[-1]; - const ScalableTag tag; - const size_t lanes = Lanes(tag); - size_t j=0; - - // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + i + j)) { - D.data[i+j] = A.data[i+j] / B.data[i+j] * alpha + C.data[i+j] * beta; - ++j; - } - - // 2. 
处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); - for (; j+lanes<=aligned_end; j += lanes ) - { - auto vec_a = Load(tag, A.data + i + j); - auto vec_b = Load(tag, B.data + i + j); - auto vec_c = Load(tag, C.data + i + j); - auto vec_d = Load(tag, D.data + i + j); - auto alpha_vec = Set(tag, alpha); - vec_a=Div(vec_a,vec_b); - vec_a=Mul(vec_a,alpha_vec); - auto beta_vec = Set(tag, beta); - vec_c=Mul(vec_c,beta_vec); - auto vec_result = Add(vec_a, vec_c); - Store(vec_result, tag, D.data + i + j); - } - - // 3. 处理尾部剩余元素 - for (;j struct sqrtDispatcher>> { @@ -863,10 +568,10 @@ namespace deepx::tensorfunc struct maxDispatcher { static void max(const Tensor &A, const Tensor &B, Tensor &C) - { + { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C](int idx) + C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C ](int idx) { int shape_last=C.shape[-1]; const ScalableTag tag; @@ -901,33 +606,7 @@ namespace deepx::tensorfunc } } }; - - template - struct maxgradDispatcher - { - static void maxgrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) - { - if (A.shape == B.shape && A.shape == output_grad.shape && A.shape == A_grad.shape && A.shape == B_grad.shape) - { - A_grad.shape.rangeParallel(A_grad.shape.dim, [&A, &B, &output_grad, &A_grad, &B_grad](int idx) - { - if (A.data[idx]>B.data[idx]){ - A_grad.data[idx]=output_grad.data[idx]; - B_grad.data[idx]=0; - }else if (A.data[idx] struct maxscalarDispatcher @@ -972,29 +651,7 @@ namespace deepx::tensorfunc } }; - template - struct maxscalargradDispatcher - { - static void maxscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) - { - if (A.shape == A_grad.shape && A.shape == output_grad.shape) - { - A_grad.shape.rangeParallel(A_grad.shape.dim, [&A, &b, &A_grad, &output_grad](int idx) - { - if (A.data[idx]>b){ - A_grad.data[idx]=output_grad.data[idx]; - }else if (A.data[idx] struct minDispatcher 
@@ -1039,33 +696,7 @@ namespace deepx::tensorfunc } }; - template - struct mingradDispatcher - { - static void mingrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) - { - if (A.shape == B.shape && A.shape == output_grad.shape && A.shape == A_grad.shape && A.shape == B_grad.shape) - { - A_grad.shape.rangeParallel(A_grad.shape.dim, [&A, &B, &output_grad, &A_grad, &B_grad](int idx) - { - if (A.data[idx]B.data[idx]){ - A_grad.data[idx]=0; - B_grad.data[idx]=output_grad.data[idx]; - }else{ - A_grad.data[idx]=output_grad.data[idx]/2; - B_grad.data[idx]=output_grad.data[idx]/2; - } }); - } - else - { - throw std::invalid_argument("shape mismatch"); - } - } - }; - + template struct minscalarDispatcher { @@ -1107,23 +738,24 @@ namespace deepx::tensorfunc } } }; - + template - struct minscalargradDispatcher + struct compareDispatcher { - static void minscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) + static void compare(const Tensor &A, const Tensor &B,const Tensor &mask) { - if (A.shape == A_grad.shape && A.shape == output_grad.shape) + if (A.shape == B.shape && mask.shape == A.shape) { - A_grad.shape.rangeParallel(A_grad.shape.dim, [&A, &b, &A_grad, &output_grad](int idx) - { - if (A.data[idx]b){ - A_grad.data[idx]=0; - }else{ - A_grad.data[idx]=output_grad.data[idx]/2; - } }); + A.shape.rangeParallel(A.shape.dim, [&A, &B, &mask](int idx) + { + if(A.data[idx]==B.data[idx]){ + mask.data[idx]=2; + }else if(A.data[idx]>B.data[idx]){ + mask.data[idx]=1; + }else{ + mask.data[idx]=0; + } + }); } else { diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/a.zip b/excuter/op-mem-ompsimd/src/deepx/tf/a.zip deleted file mode 100644 index 136f350d02c386da88bf2c7924d496bc2c206e34..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11358 zcmb7q1yEdR)AeA%HMj?N9X!E;yGvkjx53>#xD$fAySux)OK{g9fdF6fzTeiryS0DS zemym}=GIj8otmdl_vt>ZAPohL34r_cS_!lI1o&V3N&x_%0!RUdX8Ja!#tvrsw#H0m 
zwzjG&$N*R?rD@YY-cGKF00`(qC;;F;m$Oe=@&w|T-uIPX@M&w}_Y4BA^_Y|ra`y8f z>b;8sW9q91tHRn_*FlAOLm2t_q0quB33NC!YUaZ6D*c}^DN(leCr?-Qi+!>kv@X^y z3FD<_v2Z>1Pg5?^ZqnIJ7rw*0vBKVL?FZ5Z98T^#)pO3kImqIj9G%kTECg{D zl9A~KmW-LcBXuZFpb*C<2Y~PyMqqRT^pL_0d15k1*_RL@%>Ln}@hOf#TI+EbDGJ2QNUmC{G5=7z-{{mwOs zqhjVA5#0bxS|8Q%*OXNnP?1INYliQlcc6pS4nUZb3JV=zmQaH5daR+KVPP-4Ig#x^BruoKl@i^i8jtO5im~f5j3&n4(jwbvlOJ*&TpQD zI#8lIsgt{u`5JVWaOj`J=6h@vrKekZhsH5Atf3iIn)-fL1&YH{+ovk1w z_H%^eFpEdmNOp=nlk!vZ!#neQ_V(@cKxZHD977lM+)Bf~CfzrFc;J@se@opZ?*CQD zrH7|z=^K^Fj{aJqpOs&O0unVlN`ry%84aqUr|^Qkku@2qr5D?Rt`=fDql23)M%prO z%y`y1l5q-R+oG?%9s;YiEkJ*G-k7$ac5-YP=mT& zp}jNV*8+msd`CE+D+(CW=GO0wqdxSkro<*^H_j$#W^`NQ+4}{8Soq_&f^~BFQnh2; zSQqsz#e(>bM@iA20rKn4Y|7M4;WeG}ceygfp(G`JG~%c1-?@9-s4r(pi;NG=DZBWb z-O4s^1Hravb3VEyua8|t>nsbL5i9Kji3ejQCF+$rmTSx9uMJWM_>G*?;d%+K@d0}JJMsZbuA=1@*~)_!$+IPVftdL{W=w5=LV z&cKFFlQyvRU7|37MAbr@n~RoO3}lJP0~_)k7S#bDDqn8`MBdo7EXS%L2T3mXt(q0?1*na?(9mB|XqRMY(4fI2oZCE7o3T zMb1XvCWp`y_pF*Hx_m(_se(gzMJO8d2>2l_y- z5{x4?9&oZdJn7%eGbYH+VR46m)i7v6-OoHcif2j*to#Q!D2g7AS*jkcQ7c^PK#NB8 zp$&0d-hGcfaMicFwobKng%;Yn;@Gu4UOTKM4k}sw);lZn1Ks59Ey>&qE!*!t>)DZ~ z;dj>^^4_n&^!T7QzlOM=vU1XogDoQzETba|%i8);l5Gg)>D;Y5K5?jJoID9Ef~^kg zt@D!4AD03w)-*LV1;n{@nd@A3xEEG8_=_aFJw7OII~)o#z^nCiKpT3&pCmB!63Kh> zh-(3l_W>=brInTQi|W%RUDqR8Ktf%w#IOnn_~*w_XWpadls=36cGx)dpY~q~W#hmH z8a?jXo>cEP`zIr83!6n|UL};Gz6-I2SioF&+-{00A53OQ?)We~;wVVN!r{`EuRpv2 z0DhNFg?}un@^t?#s>W8v*2XrDF6IvZs;JI?-Lza)!yMdvR@cVFbxu}k=po*(^P}J> zo2o0|aFj=`aZ1;w(lvjKb%pMI)L1}y6OhOsg*Bo_?^s1*Rpmovw_5m&$fsVjOER5U zVKl5Jib1hSo}T;koaT^60FL3GwS+^vqKd(d4a^XV-3z!jkE!e1Y>LnkOw}iQ4BCNiOEYD^YOrwN^gU-%Ve#!Z+D#^Vf5*7LJ~FB%E0PO!y@Y?%bI!!|#5u z8%kN7D8iO>o}EzM!9v)P3-Hcx)T9xNPYUH1v%z<2ENEgPUlO7ps8Kt|X$`$5G`L*;_!9R{ZE-mI<&Re&$({ z?>||C-?r*=nz)Tm_0Y{!qzN-HM)rEfS4uiOX&fph?bgE^tPqovUR%_3EH%a)&r@!g zVuLsyR)~Y7AH>qoe3SK4yXD~|^(wxS`NiAq>Y#QsGo$_aMOA}QDER7T@glPm@AcZy z<>%R`%d59K$?GMb=X=H?n|Tl#0Qj8=|2aoqAOHYZzZ#D};wr`D zH7w}Np7oSK1XW|WoTy@Daks4)w!#KbJx1t8m@U59BMcj_vucN 
z{A#mGVU6{EC|K@G$GykJ^M3m`aj8B>G|p~DRcnE%^Lmc@e47%N#`DdiqLp!i^2jLX z)^%(t%#Va^a8wDQh8pNSf+Dyc@)i~1K8x-d)F#bULDr{%d=y=*CMcK&)V5yi_7UNF z<5P^dMuno^l$n`RBdr@SDxs_iU8lNyJ+S(313?f|)Ci%oFmHX(a>NSA;q!P1AYm`n zL#-$M-@UPnK>Qd`?sU#da|W;?#mp3h(8<&4(@Uy{#$XRUL)fgPMvmQGpBu8gl~4dt zqrg+5lToN)NVOpY3v;R-565QTZ+CuttNqq}@@>_{v`{?vey~#x)yMml#q%X)MLbh3 zIDq+Jka_YA_=}xS48EzuvupKY$uv%@&Q3eTh!xmq`J^=B3rXO(Yujf=`+#IQDu?=% z@08JBGd@}`bRckeMSb;-{hA@p*pdA#RsxbfQm3BT=^@2pv7p1Q0v1Sg`Ac+SR{7BC z3GpUMLp&iutO^WaR`IM2bCu|M`lG4%y^w?_tCWw@Yj^B@Fa4UYYY6l9Q5c>t^a}&& z4SS+j*2|mUK~BX)732>I0Q?T+e-82@5CDMtw;;DR*9RN8IsPj~aF(V8UKPXk-F&9k zfYDH>FxEu*lqE$ZsDxzYYh+jl*%IC_wl&UKjggO4+$XBnMcZIIBYT{;X!RAg(JU-y zy-&2p%y7}#_az~cVpaYb(Ic2#>7#`N3UdzUmfAr*U>yF$c3NSIqr1=|^6tEUrCvSG?ra^Yxw6~V5UQ^{uPu%!8(WW2K;tRBFT zAMEYk{8d5udoU?&T1x8VBEj25D(;6mM^&xu#~&^4GP6y-(+Q`VQjI|dD_N?*eEhZ{ zVvyyp{i7^}Tond%fKX#KI=R;@CS6v7A>~!8j)dqtd$Z5}7&HG+m+m?4&Gw|Db~~o2 zs*KD}Ib?4!f-$KA?-(;l?)UH=)5RL=l>tE&RQ)28nGK|_i`a9I|5Y*7UW>h}=i#5O zq(F(y{q8pMNqjm57%`?NT-8Z*g^93 zWNdSWCfWtwd;tsUc;E9_YQrftY}&#i19YFTs+;$tO;Bz&kELHW?2OjZPNeghTt*yJGlz^dmvF8>#V?eI z5Z0pg^TI|44)JRIRW&0UgQ2<=FPyc!d-&AH=omY6ra){J+`L9pQpWEv^(}A*osbD1 z3{Cl%V@?ze4!)|MQ8*J%FEw*=SCgajJ8gymk(KNM_qxhn|?{Ea@{AwbE@~%msP$ydtUFq3RLd{S(yG)12&N1Zt zaAX%48Js)1t?ta<5_G>6rBE-S@RnNhEfs;*StiN3*VX2E2&&L!a_`lLhx^6sZ>f=I zch}8jxJ~2$#8kOuN!F#?vm@BFvC9f(OBjBrl7}Rp`gB7G&4K(mpg-Ivmcpo}-x$qrTS*$yrweLu)F%uXZg8eiwz zo8NqgY|Os7wa!W=8V4+Oec)}n&wDzgM;~f<{dq2Pe?#(anaQ9dQo-S3wJ?3~zI*^t z)2^cLSl9K9uCq>7B|7CpT)72x_hRT9-RrGBjBcD63eKi8-HhIn{skKN#KGkpKCx<5 z;Egw2ELLg1JX^~M3T2D?D5?*b^*VY6?kylyT%-$EBS4d89DbSX1L2M5g~|KPR)YxJ zcZv%fv{fIkG*}@|N0KK%BM$4ZoBF%$>~&36KMVS`6{(tNEK9yf)T#1H(aa8rp*2aw zGz7wH=j7*_&Z}uaRiO39o`7O>n;59och}5x~FAl7l+i8Qk{UCOB>NKwb&$-4MqXJ zWy@a^xX&R$O5|3KxAM~H9wr64(zWg#3fM$+TNC}EzA3uq3J;LQFRt%OS*Jxc%_AZg zc|SQSN~lJ&YwM93b|syc-<}r=I4=13ZDwBz;}K)}s?3cnFacp9IO*!*&iNlEUncI& z8Yk9q9)G z?sz8lyb68004em~dbes(-`TFCSGPGgA9GlI2GhfNZ;h1qsD)F59}j{c(Q9KE89KF7 zY%A;85Fb3E 
zL^Z*=vGDGJ;+-E?XzoTYr8?>!F;j;hAKw~56X{#s3E4!`*^FdcJeq^&b`g5!7}wa2 zWHVL1{qOV+IqX89-3N7*c7qDiMUS)Z4C(8+sNWF23$qC@mGQnP2?imJC%bhQG4rmb zC(%`z{BT@p4Cp0oP>19HwB$X!&s|Z>161}?AI<7Ull1szO7*3>*aE0m@(Doq*}^JD zrQR@VP|*%$IEBX!1=gKN_5iJ|h$5{Q4cy*XYqST0K^j+U^y`-ZU5Am(d!YI3lEiGI z#OE|?)TEJ2;k~x|=b+~aA)Fv6cMD8yH9&-)R6x)$rTC~)z7xelx5s_7qVf9uIpD^b z7dxl#;F4F8-;^;C2|zdw4*>iohW|_%p)dfyBpOnHxsAEwziEZm)wC?%3SoLrR&rYj za@LIZcUY(TnsxJ=J)sGq!OyNsvNiG5PUM#-`I0V$!DP-A@;~K0n7mHWy&#&_Bx|== zeA>NfRaU*&aAO+)p zK()d0#mct7)sgq_S~^@yOQ-UZU^*7@N}=S zg+h0$Od(4FMvOLG0P&c zvtpL5kPnRo)oex@ZDQ&*_tcLdrjc-mQjIt~dFQ?|0-8#GNW)T{pHw8vqn=Z`dZu$+eQuZG2UEF3si*6tSxa2+g zQ{n9*IFO@JX2Y(?&|1u3Srq);D~N|9hSJTx!El1Nrgm=Dj_yI@(0Mj3CUWI$+3-5^ zf@E7)z}x2qkHqW4$waPHeps>mW1`P*+y>K})*1W~KYye1pK)(ty)E3+7M-4d$a6ZlG^rE5f3hWu|;`xyQJC!>O~8qNVDxWgy1pq zJ3AWFKr{)s&*DN@L5+GYXWR|d@haL~HR^X0r(;|5l`lY*jE7lFj^3*E! z`Yrp19*bF-9mJJgep$7c=V~NT1SagxWsdA1nnAqi)+|fb!#OunqCkRw%;r29s>gOQ zHrweeDgHpm`kk*^b*suaaezP%hDy#!K!`aUrOBksq9CO&OY-}UKbR^+G_B=o@Qw^* z4&9?40*j?Ea&z(}h`u?l9v-z^mT__bQ>Z4P}CQRvAJY+^o=18$w9& z6_FE7$;Y;b4j}2_v|ny1sL(mZ(YcHGZd8rU3&?EemeNXp%C5nmBAF0pbBgC+r7=l` z+e&v)PTCr^1lHL5AMcgsSybP8&Llnl;F{s$L)qp<3=Ietazu&Ey$&cn@jA`s`&s_k zpPf*&uRFyaoxP7iuqzk+z$;MG-ik+wFrzjO zP62#+G7d=IzPv&3BV)LL^B!lsWKEa%PZHx|$Uduk#wsdR>|V{~5#h%qjbZTm`!xT2 zX96#utDjZlAs`!zJ8u%Nr_ejONPnA8lcAID@1c7W81pU5@8dSe2Fc{yQL?`l6PuQx zTrKcXxv3ZrZ05nH`N{!RFy?wvU@`OTnuw5 zEF;X>Lq?(f$OULJRMz=i@bdug2?dYS z8|pVG1-}|OfA|H;-_ZMKP|~9T0I8PKmy9Sd0trnoD3dW^EZq6RAXzq`6jA*HtUE`OAk!=JZ~%GW zM35gKb91O|l;t0QI`|JDQ0;hjM}JYW4Y-j+fnlOp z{@CZ$#Q0%c@x8tJJ-x$fafJOBJav*mZCh(Ck)|y0trVLd1c}^0g(dEi9I6g>zkpF} z&t5sZM!&X|p9=4u0RlsMNv8{RalP$msDy#cP0Y%7iBV(wtDm z6S5(9Ha+Yvc>yh|ZM4-_X|aeh&)N>OI!IV?fJd+63GmW0PJXG^}^>HcAO&#e*Z zyc~-6IfFq1v;~$8m?Oo;?@Yrxl8$0d2eV8Vx?{~BX^$XdJ0I_S4=C{GzKnV7R9s}m^@VO(lW_{&8n(amtXQuN^ z1rCK93g5@ejJLt=zt*HAd49nA*>T~aBD8APU*&bVyGN54+iTr+;8~Bdca6sT$c>!j@yA)=Qqxj4W{_=eOp~_ z)eu;m%$XsI5U&)l%5Yqj--uq!{h%W^@|$K@PU8E{{IZ3kX|YWqZ>j!Tpik@$lj1dYo$ 
zYsh+KMU>-*khO05jLoofh0wO>J-~%W_?+9G;HDhj7s)$cI^cih?jx$C6n$IWkmRMo z*`*YG!6?*21!JSUkuxibepz+%pMk$c+PDeVFD(8>+&^P6aJtv9_aPVg5o_z4UgnWQ1yinS$73gi3h9dJ-K@d@j0B z>WO33LKY>1P^wgBcJxt&;2lWVIL^t8qZFMiLMcC!{c^u*meBz&-(v<>uj>`r=2F&X?nv2nzRR5nJ377U}Z!ep3@3Ew$2U_4!b`{KbE_zC*?(qNs}7meZ41+8F348A8tpb1P{XB?FMWb zg20+y_ip;@3q`QZ%!PB`XFj7W67Q>$ALY^^YiCE^F6meruQ5UKJZSg64~LQU9L^bj z`A_z(H%p#Kg#ZA4pX)!Lco5uwDUrRgk(1$H=iUFGBOg}j+M%x5cNlPgLxCA3UnT9wLen6}BUNRx8%@cU23+9_dt?Wi8y`Q zY18yvI4t+rY3Owv&TW&H>Y;|xL8CY8Uf@=+W)fB+crZZ`>p|(jyJ=R4ZeiM!P!!8) z3VFK0u@lo2!+HJumbk)GtSMN;jx|n7p@EQ`A&E^yJODBQdn|&Pds612wE%KEzYBW( z)NEqwl#LGN038+YRc~GHQO6Qw;yU$~f5)aS-yn}Ww^OhWFY$ONYv7VntvwXPRUWzb zS_dAd6{5uvs3H|Yal>2}irV<1hI7RMCj<{1>i=X1WTj^tR+m<=X~a+Q(~_Xv`L=Et z+2B5A6PZ&8nJW{Cp56T3caF^IOE+r-<-sScPaO2FpPq|Dd%H122adA1;LD@XZV63l z)e+M?pR{TT?<&xAu&rz568UDeNbg{__PLmdIAKgE4(A>5V5u1A$%MM!q&m#07_8RF zORvbYN6kNFo7+79$ilDo`Pf-Uez)$8-!dL#U#3<~yQyuuR3ynAUm_dNj*CaPqj!6I z0Cj&0CVunnvigMNCUHP`_3%D+J5bj#e(js=>3IVpb3$)m_<5pd=1a-R-eWzr(9!eq z3yX5}vb1_mhTk!O9C-U*(oi;<5T*M|wD}#J3epgem{9+3De}*o9lw_%|FAm0-=F@E zE0TYA{%1+_ug+Cr0e_Z3{|!_5yYoLwQNLByKdjEr4e2k=|Gmcg_ecDBbn#o$`orq{ zbWs2Li2sYi_4lXzxzYMP2KmG4{9^F{|9U9$cdvhDioct*KdjDAk_hmBXxaYm_s>=D zH_!aT>ijl--TX_|`FE#(POQHR<{wt)XYp&}=wFx5zq|c&;Q#eM(53e<^H#{zwiDBL5X#L diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 26dde852..23419f37 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -554,18 +554,6 @@ namespace deepx::tf case Precision::Float32: tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int64: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - 
case Precision::Int16: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; default: error = "Unsupported dtype: " + precision_str(a_type); return 1; @@ -611,18 +599,6 @@ namespace deepx::tf case Precision::Float32: tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int64: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; default: error = "Unsupported dtype: " + precision_str(a_type); return 1; @@ -667,18 +643,6 @@ namespace deepx::tf case Precision::Float32: tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int64: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - 
tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; default: error = "Unsupported dtype: " + precision_str(a_type); return 1; @@ -723,18 +687,6 @@ namespace deepx::tf case Precision::Float32: tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int64: - tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; default: error = "Unsupported dtype: " + precision_str(a_type); return 1; @@ -779,17 +731,365 @@ namespace deepx::tf case Precision::Float32: tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Sin : public TF + { + public: + Sin(vector args, vector returns) + { + this->name = "sin"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=sin(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = 
mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::sin(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::sin(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Cos : public TF + { + public: + Cos(vector args, vector returns) + { + this->name = "cos"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=cos(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::cos(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::cos(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Tan : public TF + { + public: + Tan(vector args, vector returns) + { + this->name = "tan"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=tan(T1)"; + } + shared_ptr clone() const 
override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::tan(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::tan(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Max : public TF + { + public: + Max(vector args, vector returns) + { + this->name = "max"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=max(T1,T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; case Precision::Int64: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class MaxScalar : public TF + { + public: + MaxScalar(vector args, vector returns) + { + this->name = "maxscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=max(T1,scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + 
tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Min : public TF + { + public: + Min(vector args, vector returns) + { + this->name = "min"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=min(T1,T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + 
tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class MinScalar : public TF + { + public: + MinScalar(vector args, vector returns) + { + this->name = "minscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=min(T1,scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + 
precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); break; - case Precision::Int8: - tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + case Precision::Int8: + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported dtype: " + precision_str(a_type); @@ -798,7 +1098,64 @@ namespace deepx::tf return 0; } }; + + template + class Compare : public TF + { + public: + Compare(vector args, vector returns) + { + this->name = "compare"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "mask=compare(T1,T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = 
mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != mask_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; }; - + #endif diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp.a b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp.a deleted file mode 100644 index af7d9c0c..00000000 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp.a +++ /dev/null @@ -1,427 +0,0 @@ -#ifndef DEEPX_OP_ELEMENTWISE_HPP -#define DEEPX_OP_ELEMENTWISE_HPP - -#include "deepx/op/op.hpp" -#include 
"deepx/tensorfunc/elementwise.hpp" -#include "deepx/dtype.hpp" - -#include "deepx/mem/mem.hpp" -#include "stdutil/num.hpp" - -namespace deepx::op -{ - using namespace std; - using namespace deepx::mem; - - - template - class Add : public Op - { - public: - Add(){ - this->init("add",deepx::dtype::name(), {}, {}, false, {}, {}); - } - void setexample() override { - this->init("add", "int32", {"T1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override { - return "T3 = T1 + T2"; - } - void forward(mem::Mem &mem) override - { - throw NotImplementError("add"); - } - //已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - throw NotImplementError("add"); - } - }; - - //Addscalar - template - class Addscalar : public Op - { - public: - Addscalar(){ - this->init("addscalar",deepx::dtype::name(), {}, {}, false, {}, {}); - } - void forward(mem::Mem &mem) override - { - throw NotImplementError("addscalar"); - } - void backward(mem::Mem &mem) override - { - throw NotImplementError("addscalar"); - } - void setexample() override { - this->init("addscalar", "float32", {"T1", "1.0"}, {"T2"}, false, {}, {}); - } - string math_formula() const override { - return "T2 = T1 + 1.0"; - } - }; - - template - class Sub : public Op - { - public: - Sub(){ - this->init("sub",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem &mem) override - { - throw NotImplementError("sub"); - } - - void backward(mem::Mem &mem) override - { - throw NotImplementError("sub"); - } - void setexample() override { - this->init("sub", "int32", {"T1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override { - return "T3 = T1 - T2"; - } - }; - template - class Mul : public Op - { - public: - Mul(){ - this->init("mul",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem &mem) override - { - throw NotImplementError("mul"); - } - //已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - throw 
NotImplementError("mul"); - } - void setexample() override { - this->init("mul", "float32", {"T1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override { - return "T3 = T1 * T2"; - } - }; - - template - class Mulscalar : public Op - { - public: - Mulscalar(){ - this->init("mulscalar",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - - void forward(mem::Mem &mem) override - { - throw NotImplementError("mulscalar"); - } - //已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - throw NotImplementError("mulscalar"); - } - void setexample() override { - this->init("mulscalar", "float32", {"T1", "2.0"}, {"T2"}, false, {}, {}); - } - string math_formula() const override { - return "T2 = T1 * 2.0"; - } - }; - - template - class Div : public Op - { - public: - Div(){ - this->init("div",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem &mem) override - { - throw NotImplementError("div"); - } - //已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - throw NotImplementError("div"); - } - void setexample() override { - this->init("div", "float32", {"T1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override { - return "T3 = T1 / T2"; - } - }; - - //Divscalar之所以不复用Mulscalar,是防止b接近0时,Mulscalar(1/b)不稳定 - //A/b=C - template - class Divscalar : public Op - { - public: - Divscalar(){ - this->init("divscalar",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - - void forward(mem::Mem &mem) override - { - throw NotImplementError("divscalar"); - } - - - void backward(mem::Mem &mem) override - { - throw NotImplementError("divscalar"); - } - void setexample() override { - this->init("divscalar", "float32", {"T1", "2.0"}, {"T2"}, false, {}, {}); - } - string math_formula() const override { - return "T2 = T1 / 2.0"; - } - }; - - - template - class RDivscalar : public Op - { - public: - RDivscalar(){ - this->init("rdivscalar",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void 
forward(mem::Mem &mem) override - { - throw NotImplementError("rdivscalar"); - } - - - void backward(mem::Mem &mem) override - { - throw NotImplementError("rdivscalar"); - } - void setexample() override { - this->init("rdivscalar", "float32", {"1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override { - return "T3 =1 / T2"; - } - }; - - template - class Sqrt : public Op - { - public: - Sqrt(){ - this->init("sqrt",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem &mem) override - { - throw NotImplementError("sqrt"); - } - //已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - throw NotImplementError("sqrt"); - } - void setexample() override { - this->init("sqrt", "float32", {"T1"}, {"T2"}, false, {}, {}); - } - string math_formula() const override { - return "T2 = sqrt(T1)"; - } - }; - - template - class Exp : public Op - { - public: - Exp(){ - this->init("exp",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem &mem) override - { - throw NotImplementError("exp"); - } - //已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - throw NotImplementError("exp"); - } - void setexample() override { - this->init("exp", "float32", {"T1"}, {"T2"}, false, {}, {}); - } - string math_formula() const override { - return "T2 = exp(T1)"; - } - }; - - template - class Pow : public Op - { - public: - Pow(){ - this->init("pow",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem &mem) override - { - throw NotImplementError("pow"); - } - void backward(mem::Mem &mem) override - { - throw NotImplementError("pow"); - } - void setexample() override { - this->init("pow", "float32", {"T1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override { - return "T3 = T1 ^ T2"; - } - }; - - - template - class Powscalar : public Op - { - public: - Powscalar(){ - this->init("powscalar",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem 
&mem) override - { - throw NotImplementError("powscalar"); - - } - void backward(mem::Mem &mem) override - { - throw NotImplementError("powscalar"); - } - void setexample() override { - this->init("powscalar", "float32", {"T1", "2.0"}, {"T2"}, false, {}, {}); - } - string math_formula() const override { - return "T2 = T1 ^ 2.0"; - } - }; - - - template - class Log : public Op - { - public: - Log(){ - this->init("log",deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void forward(mem::Mem &mem) override - { - throw NotImplementError("log"); - } - void backward(mem::Mem &mem) override - { - throw NotImplementError("log"); - } - void setexample() override { - this->init("log", "float32", {"T1"}, {"T2"}, false, {}, {}); - } - string math_formula() const override { - return "T2 = log(T1)"; - } - }; - - - template - class Max : public Op - { - public: - Max() - { - this->init("max", deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void setexample() override - { - this->init("max", "float32", {"T1"}, {"T2"}, false, {}, {}); - } - string math_formula() const override - { - return "T3 = max(T1,T2)"; - } - }; - - template - class Maxscalar : public Op - { - public: - Maxscalar() - { - this->init("maxscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - } - - - void setexample() override - { - this->init("maxscalar", "float32", {"T1", "0.0"}, {"T2"}, false, {}, {}); - } - string math_formula() const override - { - return "T2 = max(T1, 0.0)"; - } - }; - - template - class Min : public Op - { - public: - Min() - { - this->init("min", deepx::dtype::name(), {}, {}, false, {}, {}); - } - - void setexample() override - { - this->init("min", "float32", {"A", "B"}, {"C"}, false, {}, {}); - } - string math_formula() const override - { - return "C = min(A,B)"; - } - }; - - template - class Minscalar : public Op - { - public: - Minscalar() - { - this->init("minscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - } - - - void setexample() override - { - 
this->init("minscalar", "float32", {"A", "1.0"}, {"B"}, false, {}, {}); - } - string math_formula() const override - { - return "B= min(A, 1.0)"; - } - }; - - -} -#endif // DEEPX_OP_ELEMENTWISE_HPP diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise_cblas.hpp.a b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise_cblas.hpp.a deleted file mode 100644 index 356afc2c..00000000 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise_cblas.hpp.a +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef DEEPX_OP_ELEMENTWISE_CBLAS_HPP -#define DEEPX_OP_ELEMENTWISE_CBLAS_HPP - -#include "deepx/op/op.hpp" -#include "deepx/op/elementwise.hpp" - -#include "deepx/tensorfunc/elementwise_cblas.hpp" -#include "deepx/dtype.hpp" - -#include "deepx/mem/mem.hpp" - -namespace deepx::op -{ - using namespace std; - using namespace deepx::mem; - - - template - class Add_cblas: public Add - { - public: - Add_cblas(){ - this->init("add",deepx::dtype::name(), {}, {}, false, {}, {}); - this->author="cblas"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::add_cblas(*a, *b, *c); - } - - void backward(mem::Mem &mem) override - { - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - // 加法的反向传播:输入的梯度等于输出的梯度 - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * 1 - deepx::tensorfunc::add_cblas(*a_grad, *c_grad, *a_grad); // a_grad += c_grad - // ∂L/∂b = ∂L/∂c * ∂c/∂b = ∂L/∂c * 1 - deepx::tensorfunc::add_cblas(*b_grad, *c_grad, *b_grad); // b_grad += c_grad - } - }; - - - template - class Sub_cblas : public Sub - { - public: - Sub_cblas(){ - this->init("sub",deepx::dtype::name(), {}, {}, false, {}, {}); - this->author="cblas"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = 
mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::sub_cblas(*a, *b, *c); - } - - void backward(mem::Mem &mem) override - { - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - // 减法的反向传播: - // 对于 c = a - b - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * 1 - deepx::tensorfunc::add_cblas(*a_grad, *c_grad, *a_grad); // a_grad += c_grad - // ∂L/∂b = ∂L/∂c * ∂c/∂b = ∂L/∂c * (-1) - deepx::tensorfunc::sub_cblas(*b_grad, *c_grad, *b_grad); // b_grad -= c_grad - } - - }; - -} -#endif // DEEPX_OP_ELEMENTWISE_HPP diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise_miaobyte.hpp.a b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise_miaobyte.hpp.a deleted file mode 100644 index 368620c2..00000000 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise_miaobyte.hpp.a +++ /dev/null @@ -1,637 +0,0 @@ -#ifndef DEEPX_OP_ELEMENTWISE_MIAOBYTE_HPP -#define DEEPX_OP_ELEMENTWISE_MIAOBYTE_HPP - -#include "deepx/op/op.hpp" -#include "deepx/op/elementwise.hpp" - -#include "deepx/tensorfunc/elementwise_miaobyte.hpp" -#include "deepx/dtype.hpp" - -#include "deepx/mem/mem.hpp" - -namespace deepx::op -{ - using namespace std; - using namespace deepx::mem; - - template - class Add_miaobyte : public Add - { - public: - Add_miaobyte() - { - this->init("add", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::add_miaobyte(*a, *b, *c); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - // 
加法的反向传播:输入的梯度等于输出的梯度 - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * 1 - deepx::tensorfunc::add_miaobyte(*a_grad, *c_grad, *a_grad); // a_grad += c_grad - // ∂L/∂b = ∂L/∂c * ∂c/∂b = ∂L/∂c * 1 - deepx::tensorfunc::add_miaobyte(*b_grad, *c_grad, *b_grad); // b_grad += c_grad - } - }; - - // Addscalar - template - class Addscalar_miaobyte : public Addscalar - { - public: - Addscalar_miaobyte() - { - this->init("addscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - // 已验证,2025-02-19,lipeng - void forward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]).get(); - auto b = this->template getarg(1, mem); - auto C = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::addscalar_miaobyte(*A, b, *C); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - auto a_grad = mem.gettensor(this->args_grad[0]); - auto c_grad = mem.gettensor(this->returns_grad[0]); - // 标量加法的反向传播:张量的梯度等于输出的梯度 - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * 1 - deepx::tensorfunc::add_miaobyte(*a_grad, *c_grad, *a_grad); // a_grad += c_grad - // 标量b不需要计算梯度 - } - }; - - template - class Sub_miaobyte : public Sub - { - public: - Sub_miaobyte() - { - this->init("sub", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::sub_miaobyte(*a, *b, *c); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - // 减法的反向传播: - // 对于 c = a - b - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * 1 - deepx::tensorfunc::add_miaobyte(*a_grad, *c_grad, *a_grad); // a_grad += c_grad - // ∂L/∂b = ∂L/∂c * ∂c/∂b = ∂L/∂c * (-1) - 
deepx::tensorfunc::sub_miaobyte(*b_grad, *c_grad, *b_grad); // b_grad -= c_grad - } - }; - template - class Mul_miaobyte : public Mul - { - public: - Mul_miaobyte() - { - this->init("mul", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::mul_miaobyte(*a, *b, *c); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); // 需要用到前向传播的输入 - auto b = mem.gettensor(this->args[1]).get(); // 需要用到前向传播的输入 - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 乘法的反向传播: - // 对于 c = a * b - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * b - deepx::tensorfunc::muladd_miaobyte(*b, *c_grad, *a_grad, *a_grad); // a_grad += b * c_grad - - // ∂L/∂b = ∂L/∂c * ∂c/∂b = ∂L/∂c * a - deepx::tensorfunc::muladd_miaobyte(*a, *c_grad, *b_grad, *b_grad); // b_grad += a * c_grad - } - }; - - template - class Mulscalar_miaobyte : public Mulscalar - { - public: - Mulscalar_miaobyte() - { - this->init("mulscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - // 已验证,2025-02-19,lipeng - void forward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]).get(); - auto b = this->template getarg(1, mem); - auto C = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::mulscalar_miaobyte(*A, b, *C); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - // 需要用到前向传播的标量输入b - auto b = this->template getarg(1, mem); - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 标量乘法的反向传播: - // 对于 c = a * b,其中b是标量 - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * b - 
deepx::tensorfunc::mulscalaradd_miaobyte(*c_grad, b, *a_grad, T(1), *a_grad); // a_grad += c_grad * b - // 标量b不需要计算梯度 - } - }; - - template - class Div_miaobyte : public Div - { - public: - Div_miaobyte() - { - this->init("div", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::div_miaobyte(*a, *b, *c); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - // 需要用到前向传播的输入和输出 - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); // c = a/b,可以直接用 - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 除法的反向传播: - // 对于 c = a/b - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * (1/b) - deepx::tensorfunc::divadd_miaobyte(*c_grad, *b, *a_grad, *a_grad); // a_grad += c_grad / b - - // ∂L/∂b = ∂L/∂c * ∂c/∂b - // ∂L/∂b = ∂L/∂c * (-a/b²) - // 或 ∂L/∂b = -c_grad * (c/b) - auto temp_tensor = mem.temptensor(b->shape.shape).get(); - deepx::tensorfunc::div_miaobyte(*c, *b, *temp_tensor); // temp = c/b - deepx::tensorfunc::muladd_miaobyte(*c_grad, *temp_tensor, T(-1), *b_grad, T(1), *b_grad); // b_grad -= c_grad * temp - } - void setexample() override - { - this->init("div_miaobyte", "float32", {"T1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override - { - return "T3 = T1 / T2"; - } - }; - - // Divscalar之所以不复用Mulscalar,是防止b接近0时,Mulscalar(1/b)不稳定 - // A/b=C - template - class Divscalar_miaobyte : public Divscalar - { - public: - Divscalar_miaobyte() - { - this->init("divscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - // 已验证,2025-02-19,lipeng - void forward(mem::Mem &mem) override - { - auto A = 
mem.gettensor(this->args[0]).get(); - auto b = this->template getarg(1, mem); - auto C = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::divscalar_miaobyte(*A, b, *C); // 直接使用除法 - } - - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - auto b = this->template getarg(1, mem); - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 标量除法的反向传播: - // 对于 c = a/b,其中b是标量 - // ∂L/∂a = ∂L/∂c * ∂c/∂a = ∂L/∂c * (1/b) - deepx::tensorfunc::divscalaradd_miaobyte(*c_grad, b, *a_grad, T(1), *a_grad); // a_grad += c_grad / b - // 标量b不需要计算梯度 - } - }; - - template - class RDivscalar_miaobyte : public RDivscalar - { - public: - RDivscalar_miaobyte() - { - this->init("rdivscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - // C=a/B - auto a = this->template getarg(0, mem); - auto B = mem.gettensor(this->args[1]).get(); - auto C = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::rdivscalar_miaobyte(a, *B, *C); // 直接使用除法 - } - - // TODO: 未验证W - void backward(mem::Mem &mem) override - { - // 需要用到前向传播的输入 - auto a = this->template getarg(0, mem); - auto B = mem.gettensor(this->args[1]).get(); - auto C = mem.gettensor(this->returns[0]).get(); // C = a/B - auto B_grad = mem.gettensor(this->args_grad[1]).get(); - auto C_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 标量除法的反向传播: - // 对于 C = a/B - // ∂L/∂B = ∂L/∂C * ∂C/∂B = ∂L/∂C * (-a/B²) - // = -C_grad * (a/B²) = -C_grad * (C/B) - auto temp = mem.temptensor(B->shape.shape).get(); - deepx::tensorfunc::div_miaobyte(*C, *B, *temp); // temp = C/B - deepx::tensorfunc::muladd_miaobyte(*C_grad, *temp, T(-1), *B_grad, T(1), *B_grad); // B_grad -= C_grad * temp - - // 标量a不需要计算梯度 - } - }; - - template - class Sqrt_miaobyte : public Sqrt - { - public: - Sqrt_miaobyte() - { - this->init("sqrt", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author 
= "miaobyte"; - } - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::sqrt_miaobyte(*a, *b); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - auto b = mem.gettensor(this->returns[0]).get(); // b = sqrt(a) - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 平方根的反向传播: - // 对于 b = sqrt(a) - // ∂L/∂a = ∂L/∂b * ∂b/∂a = ∂L/∂b * (1/(2*sqrt(a))) = b_grad/(2*b) - deepx::tensorfunc::divadd_miaobyte(*b_grad, *b, T(0.5), *a_grad, T(1), *a_grad); // a_grad += 0.5 * b_grad/b - } - }; - - template - class Exp_miaobyte : public Exp - { - public: - Exp_miaobyte() - { - this->init("exp", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::exp_miaobyte(*a, *b); - } - // 已验证,2025-02-19,lipeng - void backward(mem::Mem &mem) override - { - auto b = mem.gettensor(this->returns[0]).get(); // b = exp(a) - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 指数函数的反向传播: - // 对于 b = exp(a) - // exp的导数是exp(x)本身,所以 - // ∂L/∂a = ∂L/∂b * ∂b/∂a = ∂L/∂b * exp(a) = b_grad * b - deepx::tensorfunc::muladd_miaobyte(*b_grad, *b, *a_grad, *a_grad); // a_grad += b_grad * b - } - }; - - template - class Pow_miaobyte : public Pow - { - public: - Pow_miaobyte() - { - this->init("pow", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - // 已验证,2025-03-06,lipeng - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::pow_miaobyte(*a, *b, *c); - } - void backward(mem::Mem 
&mem) override - { - // 需要用到前向传播的输入和输出 - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]).get(); // c = a^b - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 幂运算的反向传播: - // 对于 c = a^b - - // 对a的偏导: - // ∂L/∂a = ∂L/∂c * ∂c/∂a = c_grad * b * a^(b-1) - // = c_grad * b * (c/a) 【因为c=a^b,所以a^(b-1)=c/a】 - deepx::tensorfunc::div_miaobyte(*c, *a, *a_grad); // temp = c/a - deepx::tensorfunc::mul_miaobyte(*a_grad, *b, *a_grad); // temp = b * (c/a) - deepx::tensorfunc::mul_miaobyte(*a_grad, *c_grad, *a_grad); // a_grad = c_grad * b * (c/a) - - // 对b的偏导: - // ∂L/∂b = ∂L/∂c * ∂c/∂b = c_grad * c * ln(a) - deepx::tensorfunc::log_miaobyte(*a, *b_grad); // temp = ln(a) - deepx::tensorfunc::mul_miaobyte(*b_grad, *c, *b_grad); // temp = c * ln(a) - deepx::tensorfunc::mul_miaobyte(*b_grad, *c_grad, *b_grad); // b_grad = c_grad * c * ln(a) - } - }; - - template - class Powscalar_miaobyte : public Powscalar - { - public: - Powscalar_miaobyte() - { - this->init("powscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]).get(); - auto b = this->template getarg(1, mem); - auto C = mem.gettensor(this->returns[0]); - deepx::tensorfunc::powscalar_miaobyte(*A, b, *C); - } - void backward(mem::Mem &mem) override - { - // 需要用到前向传播的输入、输出和标量指数 - auto A = mem.gettensor(this->args[0]).get(); - auto b = this->template getarg(1, mem); // 标量指数 - auto C = mem.gettensor(this->returns[0]).get(); // c = a^b - auto A_grad = mem.gettensor(this->args_grad[0]).get(); - auto C_grad = mem.gettensor(this->returns_grad[0]).get(); - - // 标量幂运算的反向传播: - // 对于 c = a^b,其中b是标量 - // ∂L/∂a = ∂L/∂c * ∂c/∂a = c_grad * b * a^(b-1) - // = c_grad * b * (c/a) 【因为c=a^b,所以a^(b-1)=c/a】 - 
deepx::tensorfunc::div_miaobyte(*C, *A, *A_grad); // temp = c/a - deepx::tensorfunc::mulscalar_miaobyte(*A_grad, b, *A_grad); // temp = b * (c/a) - deepx::tensorfunc::mul_miaobyte(*A_grad, *C_grad, *A_grad); // a_grad = c_grad * b * (c/a) - // 标量b不需要计算梯度 - } - }; - - template - class Log_miaobyte : public Log - { - public: - Log_miaobyte() - { - this->init("log", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->returns[0]).get(); - deepx::tensorfunc::log_miaobyte(*a, *b); - } - void backward(mem::Mem &mem) override - { - auto b = mem.gettensor(this->args[1]).get(); - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->returns_grad[0]).get(); - deepx::tensorfunc::div_miaobyte(*a_grad, *b, *a_grad); - deepx::tensorfunc::div_miaobyte(*b_grad, *b, *b_grad); - } - }; - - template - class Max_miaobyte : public Max - { - public: - Max_miaobyte() - { - this->init("max", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - auto B = mem.gettensor(this->args[1]); - auto output = mem.gettensor(this->returns[0]); - deepx::tensorfunc::max_miaobyte(*A, *B, *output); - } - - void backward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - auto B = mem.gettensor(this->args[1]); - auto A_grad = mem.gettensor(this->args_grad[0]); - auto B_grad = mem.gettensor(this->args_grad[1]); - auto output_grad = mem.gettensor(this->returns_grad[0]); - deepx::tensorfunc::maxgrad_miaobyte(*A, *B, *A_grad, *B_grad, *output_grad); - } - void setexample() override - { - this->init("max_miaobyte", "float32", {"T1"}, {"T2"}, false, {}, {}); - } - string math_formula() const override - { - return "T3 = max(T1,T2)"; - } - }; - - template - class Maxscalar_miaobyte : public Maxscalar - 
{ - public: - Maxscalar_miaobyte() - { - this->init("maxscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - T b; - if (!is_float(this->args[1])) - { - b = mem.getarg(this->args[1]); - } - else - { - b = T(atof(this->args[1].c_str())); - } - auto output = mem.gettensor(this->returns[0]); - deepx::tensorfunc::maxscalar_miaobyte(*A, b, *output); - } - - void backward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - T b; - if (!is_float(this->args[1])) - { - b = mem.getarg(this->args[1]); - } - else - { - b = T(atof(this->args[1].c_str())); - } - auto A_grad = mem.gettensor(this->args_grad[0]); - auto output_grad = mem.gettensor(this->returns_grad[0]); - deepx::tensorfunc::maxscalargrad_miaobyte(*A, b, *A_grad, *output_grad); - } - }; - - template - class Min_miaobyte : public Min - { - public: - Min_miaobyte() - { - this->init("min", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - auto B = mem.gettensor(this->args[1]); - auto output = mem.gettensor(this->returns[0]); - deepx::tensorfunc::min_miaobyte(*A, *B, *output); - } - - void backward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - auto B = mem.gettensor(this->args[1]); - auto A_grad = mem.gettensor(this->args_grad[0]); - auto B_grad = mem.gettensor(this->args_grad[1]); - auto output_grad = mem.gettensor(this->returns_grad[0]); - deepx::tensorfunc::mingrad_miaobyte(*A, *B, *A_grad, *B_grad, *output_grad); - } - }; - - template - class Minscalar_miaobyte : public Minscalar - { - public: - Minscalar_miaobyte() - { - this->init("minscalar", deepx::dtype::name(), {}, {}, false, {}, {}); - this->author = "miaobyte"; - } - - void forward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - T b; - if 
(!is_float(this->args[1])) - { - b = mem.getarg(this->args[1]); - } - else - { - b = T(atof(this->args[1].c_str())); - } - auto output = mem.gettensor(this->returns[0]); - deepx::tensorfunc::minscalar_miaobyte(*A, b, *output); - } - - void backward(mem::Mem &mem) override - { - auto A = mem.gettensor(this->args[0]); - T b; - if (!is_float(this->args[1])) - { - b = mem.getarg(this->args[1]); - } - else - { - b = T(atof(this->args[1].c_str())); - } - auto A_grad = mem.gettensor(this->args_grad[0]); - auto output_grad = mem.gettensor(this->returns_grad[0]); - deepx::tensorfunc::minscalargrad_miaobyte(*A, b, *A_grad, *output_grad); - } - void setexample() override - { - this->init("minscalar", "float32", {"A", "1.0"}, {"B"}, false, {}, {}); - } - string math_formula() const override - { - return "B= min(A, 1.0)"; - } - }; -} -#endif // DEEPX_OP_ELEMENTWISE_HPP diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp.a b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp.a deleted file mode 100644 index 51ef13f3..00000000 --- a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp.a +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef DEEPX_OP_MATMUL_HPP -#define DEEPX_OP_MATMUL_HPP - -#include "deepx/shape_transpose.hpp" -#include "deepx/op/op.hpp" -#include "deepx/mem/mem.hpp" -#include "deepx/tensorfunc/matmul.hpp" -#include "deepx/tensorfunc/changeshape.hpp" -namespace deepx::op -{ - using namespace std; - - - template - class MatMul : public Op - { - public: - MatMul(){ - this->init("matmul",deepx::dtype::name(), {}, {}, false, {}, {}); - } - MatMul(vector< string> args, vector< string> returns, bool require_grad = false, vector< string> args_grad = {}, vector< string> returns_grad = {}){ - this->init("matmul",deepx::dtype::name(), args, returns, require_grad, args_grad, returns_grad); - } - MatMul(initializer_list< string> args, initializer_list< string> returns, bool require_grad = false, initializer_list< string> args_grad = {}, initializer_list< string> returns_grad = {}){ - 
this->init("matmul",deepx::dtype::name(), args, returns, require_grad, args_grad, returns_grad); - } - void forward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - auto c = mem.gettensor(this->returns[0]); - deepx::tensorfunc::matmul(*a, *b, *c); - } - void backward(mem::Mem &mem) override - { - auto a = mem.gettensor(this->args[0]).get(); - auto b = mem.gettensor(this->args[1]).get(); - - auto a_grad = mem.gettensor(this->args_grad[0]).get(); - auto b_grad = mem.gettensor(this->args_grad[1]).get(); - auto c_grad = mem.gettensor(this->returns_grad[0]).get(); - - // ∂L/∂A = ∂L/∂C · B^T - vector b_T_shape=b->shape.shape; - swap(b_T_shape[b->shape.dim-1], b_T_shape[b->shape.dim-2]); - auto b_T=mem.temptensor(b_T_shape).get(); - vector dimOrder_b=deepx::swaplastTwoDimOrder(b->shape.shape); - - deepx::tensorfunc::transpose(*b, *b_T, dimOrder_b); - deepx::tensorfunc::matmuladd(*c_grad, *b_T, T(1), T(1), *a_grad); - // ∂L/∂B = A^T · ∂L/∂C - vector a_T_shape=a->shape.shape; - swap(a_T_shape[a->shape.dim-1], a_T_shape[a->shape.dim-2]); - auto a_T=mem.temptensor(a_T_shape).get(); - vector dimOrder_a=deepx::swaplastTwoDimOrder(a->shape.shape); - deepx::tensorfunc::transpose(*a, *a_T, dimOrder_a); - deepx::tensorfunc::matmuladd(*a_T, *c_grad, T(1), T(1), *b_grad); - } - void setexample() override { - this->init("matmul", "float32", {"T1", "T2"}, {"T3"}, false, {}, {}); - } - string math_formula() const override { - return "T3 = T1 @ T2"; // 使用@表示矩阵乘法 - } - }; - -} - -#endif diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp index a0eef177..4e469d58 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_mul.cpp @@ -44,21 +44,7 @@ void test_mul_1(){ mul(a, b,a); print(a); } -void test_muladd(){ - // std::vector shape=randomshape(1,4,1,8); - std::vector 
shape={3,7}; - Tensor a=New(shape); - Tensor b=New(shape); - Tensor c=New(shape); - arange(a,1.0f,1); - arange(b,101.0f,1); - print(a); - print(b); - mulscalaradd(a, 2.0f, b, -1.0f,c); - print(c); - mulscalaradd(a, 2.0f, b, -1.0f,a); - print(a); -} + void test_mul_scalar(){ std::vector shape=randomshape(1,1,1,100); Tensor a=New(shape); @@ -83,10 +69,6 @@ int main(int argc, char** argv){ test_mul_1(); break; case 3: - std::cout<<"test_muladd"<Tensor: + out:Union[Tensor,str]='', + author='miaobyte')->Tensor: if isinstance(b,Tensor): - return _A_B_elementwiseop_C(a,b,"add",out) + return Add.apply(a,b,out,author) else: - return _A_b_elementwiseop_C(a,b,"addscalar",out) + return AddScalar.apply(a,b,out,author) #sub OpNode.register("sub") +class Sub(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + return _A_B_elementwiseop_C(a, b, "sub", out,author) + + @staticmethod + def backward(ctx:Context, grad_output): + return grad_output, -grad_output + OpNode.register("subscalar") - +class SubScalar(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + return _A_b_elementwiseop_C(a, b, "subscalar", out,author) + + @staticmethod + def backward(ctx:Context, grad_output): + return grad_output, None def sub( a:Tensor, b: Optional[Union[Tensor, float, int]] = None, out:Union[Tensor,str]='',author='miaobyte')->Tensor: if isinstance(b,Tensor): - return _A_B_elementwiseop_C(a,b,"sub",out) + return Sub.apply(a,b,out,author) else: - return _A_b_elementwiseop_C(a,b*-1,"addscalar",out) + return SubScalar.apply(a,b,out,author) #mul OpNode.register("mul") +class Mul(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + ctx.save_tensors(a,b) + return _A_B_elementwiseop_C(a, b, "mul", out,author) + + @staticmethod + def backward(ctx:Context, out_grad): + a,b=ctx.get_tensor + return out_grad * b, out_grad * a + OpNode.register("mulscalar") - +class MulScalar(Function): + @staticmethod + def 
forward(ctx:Context, a, b,out,author='miaobyte'): + ctx.save_data('b',b) + return _A_b_elementwiseop_C(a, b, "mulscalar", out,author) + @staticmethod + def backward(ctx:Context, out_grad): + b=ctx.get_data('b') + return out_grad * b, None def mul( a:Tensor, b: Optional[Union[Tensor, float, int]] = None, out:Union[Tensor,str]='',author='miaobyte')->Tensor: if isinstance(b,Tensor): - return _A_B_elementwiseop_C(a,b,"mul",out) + return Mul.apply(a,b,out,author) else: - return _A_b_elementwiseop_C(a,b,"mulscalar",out) + return MulScalar.apply(a,b,out,author) #div OpNode.register("div") +class Div(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + ctx.save_tensors(a,b) + return _A_B_elementwiseop_C(a, b, "div", out,author) + + @staticmethod + def backward(ctx:Context, out_grad): + a,b=ctx.get_tensor + return out_grad / b, -out_grad * a / b / b + OpNode.register("divscalar") +class DivScalar(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + ctx.save_data('b',b) + return _A_b_elementwiseop_C(a, b, "divscalar", out,author) + + @staticmethod + def backward(ctx:Context, out_grad): + b=ctx.get_data('b') + return out_grad / b, None + +OpNode.register("rdivscalar") +class RDivScalar(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + ctx.save_data('ab',(a,b)) # a is the scalar numerator, b is the tensor denominator + return _a_B_elementwiseop_C(a, b, "rdivscalar", out,author) # NOTE(review): assumes _a_B_elementwiseop_C accepts author like _A_b_elementwiseop_C; confirm + + @staticmethod + def backward(ctx:Context, out_grad): + a,b=ctx.get_data('ab') + return None, -out_grad * a / b / b # C=a/B: scalar a gets no grad; B gets out_grad times the negated a/B**2 + OpNode.register("rdivscalar") def div( a: Optional[Union[Tensor, float, int]] = None, b: Optional[Union[Tensor, float, int]] = None, out:Union[Tensor,str]='',author='miaobyte')->Tensor: if isinstance(b,Tensor) and isinstance(a,Tensor): - return _A_B_elementwiseop_C(a,b,"div",out) + return Div.apply(a,b,out,author) else: if isinstance(a,Tensor): #C=A/b - return _A_b_elementwiseop_C(a,b,"divscalar",out) + return DivScalar.apply(a,b,out,author) else: 
#C=a/B - return _a_B_elementwiseop_C(a,b,"rdivscalar",out) + return RDivScalar.apply(a,b,out,author) OpNode.register("max") @@ -211,47 +296,107 @@ def clamp( #sqrt OpNode.register("sqrt") +class Sqrt(Function): + @staticmethod + def forward(ctx:Context, a,out,author='miaobyte'): + ctx.save_tensor(a) + return _A_elementwiseop_C(a,"sqrt",out,author) + + @staticmethod + def backward(ctx:Context, out_grad): + a=ctx.get_tensor + return out_grad / (2 * sqrt(a)), None + def sqrt( input:Tensor, - out:Union[Tensor,str]='')->Tensor: - return _A_elementwiseop_C(input,"sqrt",out) + out:Union[Tensor,str]='',author='miaobyte')->Tensor: + return Sqrt.apply(input,out,author) OpNode.register("pow") +class Pow(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + ctx.save_tensors(a,b) + return _A_B_elementwiseop_C(a, b, "pow", out,author) + + @staticmethod + def backward(ctx:Context, out_grad): + a,b=ctx.get_tensor + return out_grad * b * pow(a,b-1), out_grad * pow(a,b) * log(a) + OpNode.register("powscalar") +class PowScalar(Function): + @staticmethod + def forward(ctx:Context, a, b,out,author='miaobyte'): + ctx.save_data('ab',(a,b)) # backward needs the base tensor a as well as the scalar exponent b + return _A_b_elementwiseop_C(a, b, "powscalar", out,author) + + @staticmethod + def backward(ctx:Context, out_grad): + a,b=ctx.get_data('ab') + return out_grad * b * pow(a,b-1), None # the scalar exponent gets no grad, matching MulScalar and DivScalar + def pow( a:Tensor, b:Union[int,float,Tensor,]=0, - out:Union[Tensor,str]='')->Tensor: + out:Union[Tensor,str]='',author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return _A_b_elementwiseop_C(a,b,"powscalar",out) + return PowScalar.apply(a,b,out,author) else: - return _A_B_elementwiseop_C(a,b,"pow",out) + return Pow.apply(a,b,out,author) #exp OpNode.register("exp") +class Exp(Function): + @staticmethod + def forward(ctx:Context, a,out,author='miaobyte'): + ctx.save_tensor(a) + return _A_elementwiseop_C(a,"exp",out,author) + + @staticmethod + def backward(ctx:Context, 
out_grad): + a=ctx.get_tensor + return out_grad * exp(a), None + def exp( a:Tensor, - out:Union[Tensor,str]='')->Tensor: - return _A_elementwiseop_C(a,"exp",out) + out:Union[Tensor,str]='',author='miaobyte')->Tensor: + return Exp.apply(a,out,author) #log OpNode.register("log") +class Log(Function): + @staticmethod + def forward(ctx:Context, a,out,author='miaobyte'): + ctx.save_tensor(a) + return _A_elementwiseop_C(a,"log",out,author) + + @staticmethod + def backward(ctx:Context, out_grad): + a=ctx.get_tensor + return out_grad / a, None + def log( - input:Tensor, - out:Union[Tensor,str]='')->Tensor: - return _A_elementwiseop_C(input,"log",out) + a:Tensor, + out:Union[Tensor,str]='',author='miaobyte')->Tensor: + return Log.apply(a,out,author) +OpNode.register("rsqrt") +class Rsqrt(Function): + @staticmethod + def forward(ctx:Context, a,out,author='miaobyte'): + ctx.save_tensor(a) + return _A_elementwiseop_C(a,"rsqrt",out,author) + @staticmethod + def backward(ctx:Context, out_grad): + a=ctx.get_tensor + return -out_grad / (2 * a * sqrt(a)), None + def rsqrt( input:Tensor, - out:Union[Tensor,str]='')->Tensor: - outtensor=None - if isinstance(out,str): - outtensor=Tensor(shape=input.shape, dtype=input.dtype, device=input.device) - outtensor.addtograph(out) - else: - outtensor=out - outtensor=1/sqrt(input,outtensor) - return outtensor + out:Union[Tensor,str]='',author='miaobyte')->Tensor: + return Rsqrt.apply(input,out,author) + diff --git a/front/py/examples/2_ir/3_matmul.dot b/front/py/examples/2_ir/3_matmul.dot index c8e3c65f..f44682c8 100644 --- a/front/py/examples/2_ir/3_matmul.dot +++ b/front/py/examples/2_ir/3_matmul.dot @@ -2,24 +2,24 @@ digraph { rankdir=TB node [shape=record] - 135175655853216 [label="t1 + 135996949875968 [label="t1 (3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135173962560752 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box 
style=filled] - 135173963166624 [label="var_1 + 135994975499600 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 135994976203776 [label="var_1 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135173961432896 [label="t2 + 135994974384672 [label="t2 (4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135173961432704 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135173961432224 [label="var_2 + 135994974384864 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 135994974384480 [label="var_2 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135173961432464 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135173961432416 [label="tensor_3 + 135994974385104 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 135994974385152 [label="tensor_3 (3, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135173962560752 -> 135175655853216 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135173963166624 -> 135173962560752 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135173961432704 -> 135173961432896 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135173961432224 -> 135173961432704 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135175655853216 -> 135173961432464 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135173961432896 -> 135173961432464 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135173961432464 -> 135173961432416 [arrowsize=0.8 color=gray40 penwidth=1.2] + 135994975499600 -> 135996949875968 [arrowsize=0.8 color=gray40 penwidth=1.2] + 135994976203776 -> 135994975499600 
[arrowsize=0.8 color=gray40 penwidth=1.2] + 135994974384864 -> 135994974384672 [arrowsize=0.8 color=gray40 penwidth=1.2] + 135994974384480 -> 135994974384864 [arrowsize=0.8 color=gray40 penwidth=1.2] + 135996949875968 -> 135994974385104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 135994974384672 -> 135994974385104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 135994974385104 -> 135994974385152 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/3_matmul.dot.svg b/front/py/examples/2_ir/3_matmul.dot.svg index c3cffc0b..3e1d97eb 100644 --- a/front/py/examples/2_ir/3_matmul.dot.svg +++ b/front/py/examples/2_ir/3_matmul.dot.svg @@ -9,98 +9,98 @@ %3 - + -135175655853216 +135996949875968 t1 (3, 4) - + -135173961432464 +135994974385104 matmul - + -135175655853216->135173961432464 +135996949875968->135994974385104 - + -135173962560752 +135994975499600 constant - + -135173962560752->135175655853216 +135994975499600->135996949875968 - + -135173963166624 +135994976203776 var_1 1 - + -135173963166624->135173962560752 +135994976203776->135994975499600 - + -135173961432896 +135994974384672 t2 (4, 5) - + -135173961432896->135173961432464 +135994974384672->135994974385104 - + -135173961432704 +135994974384864 constant - + -135173961432704->135173961432896 +135994974384864->135994974384672 - + -135173961432224 +135994974384480 var_2 1 - + -135173961432224->135173961432704 +135994974384480->135994974384864 - + -135173961432416 +135994974385152 tensor_3 (3, 5) - + -135173961432464->135173961432416 +135994974385104->135994974385152 From 701ee0f6bc409c9de7919d771e07f61a77de67dc Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Wed, 2 Apr 2025 19:48:10 +0800 Subject: [PATCH 2/7] excuter(cpu/cuda):comparescalar --- doc/excuter/op-mem-cuda/list.md | 48 +++-- doc/excuter/op-mem-ompsimd/list.md | 5 + .../src/deepx/tensorfunc/elementwise.hpp | 16 +- excuter/op-mem-cuda/src/client/tfs.cpp | 11 +- .../elementwise_miaobyte_compare.cu | 84 ++++++-- 
.../elementwise_miaobyte_compare.cuh | 57 ++++- .../elementwise_miaobyte_compare.hpp | 16 +- .../src/deepx/tf/elementwise_compare.hpp | 90 +++++++- excuter/op-mem-ompsimd/src/client/tfs.cpp | 12 +- .../deepx/tensorfunc/elementwise_miaobyte.hpp | 54 +++-- .../src/deepx/tf/elementwise.hpp | 70 +++++- front/py/deepx/autograd/function.py | 6 +- front/py/deepx/nn/functional/elementwise.py | 201 ++++++++++++------ front/py/deepx/nn/functional/init.py | 106 +++++---- front/py/deepx/nn/functional/matmul.py | 53 +++-- .../examples/2_ir/2_elementwise_sqrtlog.dot | 46 ++-- .../2_ir/2_elementwise_sqrtlog.dot.svg | 92 ++++---- 17 files changed, 677 insertions(+), 290 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index a314b05d..4982da4c 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -5,27 +5,35 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | -| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | -| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | 
T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | -| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | -| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | -| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor 
A)->(tensor C) | +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | +| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | +| sin | miaobyte | sin(tensor A)->(tensor C) | T3=sin(T1) | sin(tensor A)->(tensor C) | +| tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor A)->(tensor C) | +| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | +| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | +| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | +| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | +| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | +| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1, T2) | max(tensor A, 
tensor B)->(tensor C) | +| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | +| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | +| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | +| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | compare(tensor A, tensor B)->(tensor mask) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 47325905..84f46f87 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -7,7 +7,11 @@ | concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() | | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | +| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var 
scalar)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | | powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | @@ -21,6 +25,7 @@ | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | | print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | | print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | +| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | | divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | | constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | | arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index c92b15ae..415e4449 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -277,14 +277,26 @@ namespace deepx::tensorfunc template struct compareDispatcher { - static void compare(const Tensor &A, const Tensor &B, Tensor &mask) = delete; + static void compare(const Tensor &A, const Tensor &B, Tensor &mask) = delete; }; template - void compare(const Tensor &A, 
const Tensor &B,Tensor &mask) + void compare(const Tensor &A, const Tensor &B,Tensor &mask) { compareDispatcher::compare(A, B, mask); } + + template + struct comparescalarDispatcher + { + static void comparescalar(const Tensor &A, const T scalar, Tensor &mask) = delete; + }; + + template + void comparescalar(const Tensor &A, const T scalar, Tensor &mask) + { + comparescalarDispatcher::comparescalar(A, scalar, mask); + } } // namespace deepx::tensorfunc #endif // DEEPX_TENSORFUNC_ELEMENTWISE_HPP diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 2d8b0c5d..f8d33bd5 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -303,7 +303,16 @@ namespace deepx::tf vector( { Param("mask", DataCategory::Tensor, Precision::Int8), - }))); + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Int8), + }))); } // matmul void register_matmul(TfFactory &tffactory) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu index 141a4889..cb117037 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu @@ -158,12 +158,12 @@ namespace deepx::tensorfunc template void launch_minscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); template - __global__ void compare_kernel(const T* A, const T* B, int8_t* mask, const int size) + __global__ void compare_kernel(const T* A, const T* B, float* mask, const int size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { if (A[idx] == B[idx]) { - mask[idx] = 2; + mask[idx] = 0.5; } else if (A[idx] > B[idx]) { 
mask[idx] = 1; } else { @@ -172,17 +172,17 @@ namespace deepx::tensorfunc } } - template __global__ void compare_kernel(const double* A, const double* B, int8_t* mask, const int size); - template __global__ void compare_kernel(const float* A, const float* B, int8_t* mask, const int size); - template __global__ void compare_kernel(const nv_bfloat16* A, const nv_bfloat16* B, int8_t* mask, const int size); - template __global__ void compare_kernel<__half>(const __half* A, const __half* B, int8_t* mask, const int size); - template __global__ void compare_kernel(const int64_t* A, const int64_t* B, int8_t* mask, const int size); - template __global__ void compare_kernel(const int32_t* A, const int32_t* B, int8_t* mask, const int size); - template __global__ void compare_kernel(const int16_t* A, const int16_t* B, int8_t* mask, const int size); - template __global__ void compare_kernel(const int8_t* A, const int8_t* B, int8_t* mask, const int size); + template __global__ void compare_kernel(const double* A, const double* B, float* mask, const int size); + template __global__ void compare_kernel(const float* A, const float* B, float* mask, const int size); + template __global__ void compare_kernel(const nv_bfloat16* A, const nv_bfloat16* B, float* mask, const int size); + template __global__ void compare_kernel<__half>(const __half* A, const __half* B, float* mask, const int size); + template __global__ void compare_kernel(const int64_t* A, const int64_t* B, float* mask, const int size); + template __global__ void compare_kernel(const int32_t* A, const int32_t* B, float* mask, const int size); + template __global__ void compare_kernel(const int16_t* A, const int16_t* B, float* mask, const int size); + template __global__ void compare_kernel(const int8_t* A, const int8_t* B, float* mask, const int size); template - void launch_compare(int numBlocks, int blockSize, const T* A, const T* B, int8_t* mask, const int size) + void launch_compare(int numBlocks, int blockSize, const 
T* A, const T* B, float* mask, const int size) { compare_kernel<<>>(A, B, mask, size); cudaError_t err = cudaGetLastError(); @@ -192,16 +192,60 @@ namespace deepx::tensorfunc } } - template void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, int8_t* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, int8_t* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, int8_t* mask, const int size); - template void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, int8_t* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int8_t* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int8_t* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int8_t* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, float* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, float* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, float* mask, const int size); + template void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, float* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, float* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, float* mask, const int size); + template void 
launch_compare(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, float* mask, const int size); + template void launch_compare(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, float* mask, const int size); - + //comparescalar + template + __global__ void comparescalar_kernel(const T* A, const T scalar, float* mask, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + if (A[idx] == scalar) { + mask[idx] = 0.5; + } else if (A[idx] > scalar) { + mask[idx] = 1; + } else { + mask[idx] = 0; + } + } + } + + template __global__ void comparescalar_kernel(const double* A, const double scalar, float* mask, const int size); + template __global__ void comparescalar_kernel(const float* A, const float scalar, float* mask, const int size); + template __global__ void comparescalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, float* mask, const int size); + template __global__ void comparescalar_kernel<__half>(const __half* A, const __half scalar, float* mask, const int size); + template __global__ void comparescalar_kernel(const int64_t* A, const int64_t scalar, float* mask, const int size); + template __global__ void comparescalar_kernel(const int32_t* A, const int32_t scalar, float* mask, const int size); + template __global__ void comparescalar_kernel(const int16_t* A, const int16_t scalar, float* mask, const int size); + template __global__ void comparescalar_kernel(const int8_t* A, const int8_t scalar, float* mask, const int size); + + template + void launch_comparescalar(int numBlocks, int blockSize, const T* A, const T scalar, float* mask, const int size) + { + comparescalar_kernel<<>>(A, scalar, mask, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_comparescalar(int numBlocks, int blockSize, const double* A, const double scalar, float* 
mask, const int size); + template void launch_comparescalar(int numBlocks, int blockSize, const float* A, const float scalar, float* mask, const int size); + template void launch_comparescalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, float* mask, const int size); + template void launch_comparescalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, float* mask, const int size); + template void launch_comparescalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, float* mask, const int size); + template void launch_comparescalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, float* mask, const int size); + template void launch_comparescalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, float* mask, const int size); + template void launch_comparescalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, float* mask, const int size); + }; #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh index d3976947..708b6d05 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh @@ -7,6 +7,7 @@ #include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc { + //max template __global__ void max_kernel(const T* A, const T* B, T* C, const int size); @@ -37,6 +38,7 @@ namespace deepx::tensorfunc template <> void launch_max(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); + //maxscalar template __global__ void maxscalar_kernel(const T* A, const T scalar, T* C, const int size); @@ -70,6 +72,7 @@ namespace deepx::tensorfunc template __global__ void min_kernel(const T* A, const T* B, T* C, const int size); + //min 
template void launch_min(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size); @@ -97,6 +100,7 @@ namespace deepx::tensorfunc template <> void launch_min(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); + //minscalar template __global__ void minscalar_kernel(const T* A, const T scalar, T* C, const int size); @@ -127,34 +131,67 @@ namespace deepx::tensorfunc template <> void launch_minscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); + //compare template - __global__ void compare_kernel(const T* A, const T* B, int8_t* mask, const int size); + __global__ void compare_kernel(const T* A, const T* B, float* mask, const int size); template - void launch_compare(int numBlocks, int blockSize, const T* A, const T* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const T* A, const T* B, float* mask, const int size); template <> - void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, float* mask, const int size); template <> - void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, float* mask, const int size); template <> - void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, float* mask, const int size); template <> - void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, int8_t* mask, const int size); + void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, float* mask, const int size); template <> - 
void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, float* mask, const int size); template <> - void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, float* mask, const int size); template <> - void launch_compare(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, float* mask, const int size); template <> - void launch_compare(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* mask, const int size); + void launch_compare(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, float* mask, const int size); + + //comparescalar + template + __global__ void comparescalar_kernel(const T* A, const T scalar, float* mask, const int size); + + template + void launch_comparescalar(int numBlocks, int blockSize, const T* A, const T scalar, float* mask, const int size); + + template <> + void launch_comparescalar(int numBlocks, int blockSize, const double* A, const double scalar, float* mask, const int size); + + template <> + void launch_comparescalar(int numBlocks, int blockSize, const float* A, const float scalar, float* mask, const int size); + + template <> + void launch_comparescalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, float* mask, const int size); + + template <> + void launch_comparescalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, float* mask, const int size); + + template <> + void launch_comparescalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, float* mask, const int size); + + template <> + void 
launch_comparescalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, float* mask, const int size); + + template <> + void launch_comparescalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, float* mask, const int size); + + template <> + void launch_comparescalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, float* mask, const int size); + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp index 3b1b16b9..1d0c49d9 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp @@ -70,7 +70,7 @@ namespace deepx::tensorfunc template struct compareDispatcher { - static void compare(const Tensor &A, const Tensor &B, Tensor &mask) + static void compare(const Tensor &A, const Tensor &B, Tensor &mask) { if (A.shape.size != B.shape.size || A.shape.size != mask.shape.size) { throw TensorShapeError("compare"); @@ -80,6 +80,20 @@ namespace deepx::tensorfunc launch_compare(numBlocks, blockSize, A.data, B.data, mask.data, A.shape.size); } }; + + template + struct comparescalarDispatcher + { + static void comparescalar(const Tensor &A, const T scalar, Tensor &mask) + { + if (A.shape.size != mask.shape.size) { + throw TensorShapeError("comparescalar"); + } + const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_comparescalar(numBlocks, blockSize, A.data, scalar, mask.data, A.shape.size); + } + }; } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp index c29f3cfb..4ec85b83 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp @@ -344,28 +344,28 @@ namespace deepx::tf switch (a_type) { case Precision::Float64: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::BFloat16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - 
tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(a_type); @@ -374,5 +374,79 @@ namespace deepx::tf return 0; } }; + + template + class CompareScalar : public TF + { + public: + CompareScalar(const vector &args, const vector &returns) + { + this->name = "comparescalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + CompareScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "comparescalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "mask=compare(T1, scalar)"; + } + shared_ptr clone() const override + { + return 
make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != mask_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + }; #endif // DEEPX_TF_ELEMENTWISE_COMPARE_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp 
b/excuter/op-mem-ompsimd/src/client/tfs.cpp index add0e9e6..2670deae 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -285,8 +285,17 @@ namespace deepx::tf }), vector( { - Param("mask", DataCategory::Tensor, Precision::Int8), + Param("mask", DataCategory::Tensor, Precision::Float32), }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Float32), + }))); } // matmul void register_matmul(TfFactory &tffactory) @@ -309,6 +318,7 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); + } // // changeshape void register_changeshape(TfFactory &tffactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index a410ddd4..18d0fbe7 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -223,9 +223,6 @@ namespace deepx::tensorfunc } }; - - - // 添加 div 的模板特化实现 template struct divDispatcher @@ -287,7 +284,6 @@ namespace deepx::tensorfunc } }; - template struct sqrtDispatcher>> { @@ -346,8 +342,7 @@ namespace deepx::tensorfunc { output.data[i + j] = std::sqrt(input.data[i + j]); ++j; - } - }); + } }); } else { @@ -568,10 +563,10 @@ namespace deepx::tensorfunc struct maxDispatcher { static void max(const Tensor &A, const Tensor &B, Tensor &C) - { + { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C ](int idx) + C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C](int idx) { int shape_last=C.shape[-1]; const ScalableTag tag; @@ -606,7 +601,6 @@ namespace deepx::tensorfunc } } }; - template struct maxscalarDispatcher @@ -651,8 +645,6 @@ namespace 
deepx::tensorfunc } }; - - template struct minDispatcher { @@ -696,7 +688,6 @@ namespace deepx::tensorfunc } }; - template struct minscalarDispatcher { @@ -738,24 +729,23 @@ namespace deepx::tensorfunc } } }; - + template struct compareDispatcher { - static void compare(const Tensor &A, const Tensor &B,const Tensor &mask) + static void compare(const Tensor &A, const Tensor &B, const Tensor &mask) { - if (A.shape == B.shape && mask.shape == A.shape) + if (A.shape == B.shape && mask.shape == A.shape) { A.shape.rangeParallel(A.shape.dim, [&A, &B, &mask](int idx) { if(A.data[idx]==B.data[idx]){ - mask.data[idx]=2; + mask.data[idx]=0.5; }else if(A.data[idx]>B.data[idx]){ mask.data[idx]=1; }else{ mask.data[idx]=0; - } - }); + } }); } else { @@ -763,5 +753,31 @@ namespace deepx::tensorfunc } } }; -} + + template + struct comparescalarDispatcher + { + static void comparescalar(const Tensor &A, const T scalar, Tensor &mask) + { + if (A.shape == mask.shape) + { + A.shape.rangeParallel(A.shape.dim, [&A, &mask, &scalar](int idx) + { + if(A.data[idx]==scalar){ + mask.data[idx]=0.5; + }else if(A.data[idx]>scalar){ + mask.data[idx]=1; + }else{ + mask.data[idx]=0; + } }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + }; + }; + + +}; #endif // DEEPX_OP_CPU_ELEMENTWISE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 23419f37..c6abe07c 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -1131,22 +1131,22 @@ namespace deepx::tf switch (a_type) { case Precision::Float64: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case 
Precision::Float32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported dtype: " + precision_str(a_type); @@ -1155,7 +1155,63 @@ namespace deepx::tf return 0; } }; - + + + template + class CompareScalar : public TF + { + public: + CompareScalar(vector args, vector returns) + { + this->name = "comparescalar"; + this->author = Author::name(); + this->args = args; + 
this->returns = returns; + } + string math_formula() const override + { + return "mask=compare(T1,scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != mask_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; }; #endif diff --git a/front/py/deepx/autograd/function.py b/front/py/deepx/autograd/function.py index 98796472..e9f5ff06 100644 --- a/front/py/deepx/autograd/function.py +++ b/front/py/deepx/autograd/function.py @@ -1,6 +1,7 @@ from deepx.autograd import Graph class 
Context: - def __init__(self): + def __init__(self,requires_grad=False): + self._requires_grad = requires_grad self._saved_tensors = [] self._non_tensor_data = {} @@ -28,7 +29,8 @@ def backward(ctx:Context, *grad_outputs): @classmethod def apply(cls, *args, **kwargs): - ctx = Context() + requires_grad = kwargs.pop('requires_grad', False) + ctx = Context(requires_grad=requires_grad) result = cls.forward(ctx, *args, **kwargs) return result diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index 81619c15..e6f8d326 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -122,11 +122,12 @@ def add( a:Tensor, b: Optional[Union[Tensor, float, int]] = None, out:Union[Tensor,str]='', + requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Add.apply(a,b,out,author) + return Add.apply(a,b,out,author,requires_grad) else: - return AddScalar.apply(a,b,out,author) + return AddScalar.apply(a,b,out,author,requires_grad) #sub @@ -152,18 +153,21 @@ def backward(ctx:Context, grad_output): def sub( a:Tensor, b: Optional[Union[Tensor, float, int]] = None, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Sub.apply(a,b,out,author) + return Sub.apply(a,b,out,author,requires_grad) else: - return SubScalar.apply(a,b,out,author) + return SubScalar.apply(a,b,out,author,requires_grad) #mul OpNode.register("mul") class Mul(Function): @staticmethod def forward(ctx:Context, a, b,out,author='miaobyte'): - ctx.save_tensors(a,b) + if ctx.requires_grad: + ctx.save_tensors(a,b) return _A_B_elementwiseop_C(a, b, "mul", out,author) @staticmethod @@ -175,7 +179,8 @@ def backward(ctx:Context, out_grad): class MulScalar(Function): @staticmethod def forward(ctx:Context, a, b,out,author='miaobyte'): - ctx.save_data('b',b) + if ctx.requires_grad: + 
ctx.save_data('b',b) return _A_b_elementwiseop_C(a, b, "mulscalar", out,author) @staticmethod def backward(ctx:Context, out_grad): @@ -184,11 +189,13 @@ def backward(ctx:Context, out_grad): def mul( a:Tensor, b: Optional[Union[Tensor, float, int]] = None, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Mul.apply(a,b,out,author) + return Mul.apply(a,b,out,author,requires_grad) else: - return MulScalar.apply(a,b,out,author) + return MulScalar.apply(a,b,out,author,requires_grad) #div @@ -196,7 +203,8 @@ def mul( class Div(Function): @staticmethod def forward(ctx:Context, a, b,out,author='miaobyte'): - ctx.save_tensors(a,b) + if ctx.requires_grad: + ctx.save_tensors(a,b) return _A_B_elementwiseop_C(a, b, "div", out,author) @staticmethod @@ -208,7 +216,8 @@ def backward(ctx:Context, out_grad): class DivScalar(Function): @staticmethod def forward(ctx:Context, a, b,out,author='miaobyte'): - ctx.save_data('b',b) + if ctx.requires_grad: + ctx.save_data('b',b) return _A_b_elementwiseop_C(a, b, "divscalar", out,author) @staticmethod @@ -220,86 +229,127 @@ def backward(ctx:Context, out_grad): class RDivScalar(Function): @staticmethod def forward(ctx:Context, a, b,out,author='miaobyte'): - ctx.save_data('b',b) + if ctx.requires_grad: + ctx.save_data('b',b) return _A_b_elementwiseop_C(a, b, "rdivscalar", out,author) @staticmethod def backward(ctx:Context, out_grad): b=ctx.get_data('b') return out_grad * b, None - -OpNode.register("rdivscalar") def div( a: Optional[Union[Tensor, float, int]] = None, b: Optional[Union[Tensor, float, int]] = None, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: if isinstance(b,Tensor) and isinstance(a,Tensor): - return Div.apply(a,b,out,author) + return Div.apply(a,b,out,author,requires_grad) else: if isinstance(a,Tensor): #C=A/b - 
return DivScalar.apply(a,b,"divscalar",out,author) + return DivScalar.apply(a,b,"divscalar",out,author,requires_grad) else: #C=a/B - return RDivScalar.apply(a,b,"rdivscalar",out,author) + return RDivScalar.apply(a,b,"rdivscalar",out,author,requires_grad) +OpNode.register("compare") +class Compare(Function): + @staticmethod + def forward(ctx:Context,a,b,out,author='miaobyte'): + if ctx.requires_grad: + ctx.save_tensors(a,b) + return _A_B_elementwiseop_C(a,b,"compare",out,author) + OpNode.register("max") +class Max(Function): + @staticmethod + def forward(ctx:Context,a,b,out,author='miaobyte'): + if ctx.requires_grad: + mask=_A_B_elementwiseop_C(a,b,"compare",'mask',author) + ctx.save_tensors(mask) + return _A_B_elementwiseop_C(a,b,"max",out,author) + + @staticmethod + def backward(ctx:Context,out_grad): + mask_a=ctx.get_tensor + mask_b=1-mask_a + return out_grad*mask_a, out_grad*mask_b + + OpNode.register("maxscalar") +class MaxScalar(Function): + @staticmethod + def forward(ctx:Context,a,b,out,author='miaobyte'): + if ctx.requires_grad: + ctx.save_data('b',b) + return _A_b_elementwiseop_C(a,b,"maxscalar",out,author) + + @staticmethod + def backward(ctx:Context,out_grad): + b=ctx.get_data('b') + return out_grad, out_grad + + def max( a:Tensor, b:Union[int,float,Tensor,]=0, - out:Union[Tensor,str]='')->Tensor: + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return _A_b_elementwiseop_C(a,b,"maxscalar",out) + return MaxScalar.apply(a,b,"maxscalar",out,author,requires_grad) else: - return _A_B_elementwiseop_C(a,b,"max",out) + return Max.apply(a,b,"max",out,author,requires_grad) OpNode.register("min") +class Min(Function): + @staticmethod + def forward(ctx:Context,a,b,out,author='miaobyte'): + if ctx.requires_grad: + ctx.save_tensors(a,b) + return _A_B_elementwiseop_C(a,b,"min",out,author) + + @staticmethod + def backward(ctx:Context,out_grad): + a,b=ctx.get_tensors() + return out_grad, 
out_grad + OpNode.register("minscalar") +class MinScalar(Function): + @staticmethod + def forward(ctx:Context,a,b,out,author='miaobyte'): + if ctx.requires_grad: + ctx.save_data('b',b) + return _A_b_elementwiseop_C(a,b,"minscalar",out,author) + + @staticmethod + def backward(ctx:Context,out_grad): + b=ctx.get_data('b') + return out_grad, out_grad + def min( a:Tensor, b:Union[int,float,Tensor,]=0, - out:Union[Tensor,str]='')->Tensor: + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return _A_b_elementwiseop_C(a,b,"minscalar",out) + return MinScalar.apply(a,b,"minscalar",out,author,requires_grad) else: - return _A_B_elementwiseop_C(a,b,"min",out) + return Min.apply(a,b,"min",out,author,requires_grad) -#clamp -OpNode.register("clamp") -def clamp( - a:Tensor, - min: Optional[Union[ float, int]] = None, - max: Optional[Union[ float, int]] = None, - out:Union[Tensor,str]='')->Tensor: - opnode = a.graph.add_op("clamp") - opnode.add_input(a.node) - outtensor=None - if isinstance(out,str): - outtensor=Tensor(shape=a.shape, dtype=a.dtype, device=a.device) - outtensor.addtograph(out) - else: - outtensor=out - if min is not None: - min_node = a.graph.add_var("", min) - opnode.add_input(min_node) - if max is not None: - max_node = a.graph.add_var("", max) - opnode.add_input(max_node) - outtensor.node.add_input(opnode) - if a.graph.eager: - varir=DeepxIR("clamp", a.dtype, [a.node.name,min,max], [outtensor.node.name]) - send(str(varir)) - return outtensor +#clamp,TODO #sqrt OpNode.register("sqrt") class Sqrt(Function): @staticmethod def forward(ctx:Context, a,out,author='miaobyte'): - ctx.save_tensor(a) + if ctx.requires_grad: + ctx.save_tensors(a) return _A_elementwiseop_C(a,"sqrt",out,author) @staticmethod @@ -309,14 +359,17 @@ def backward(ctx:Context, out_grad): def sqrt( input:Tensor, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: - return Sqrt.apply(input,out,author) + 
out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: + return Sqrt.apply(input,out,author,requires_grad) OpNode.register("pow") class Pow(Function): @staticmethod def forward(ctx:Context, a, b,out,author='miaobyte'): - ctx.save_tensors(a,b) + if ctx.requires_grad: + ctx.save_tensors(a,b) return _A_B_elementwiseop_C(a, b, "pow", out,author) @staticmethod @@ -328,7 +381,8 @@ def backward(ctx:Context, out_grad): class PowScalar(Function): @staticmethod def forward(ctx:Context, a, b,out,author='miaobyte'): - ctx.save_data('b',b) + if ctx.requires_grad: + ctx.save_data('b',b) return _A_b_elementwiseop_C(a, b, "powscalar", out,author) @staticmethod @@ -339,18 +393,21 @@ def backward(ctx:Context, out_grad): def pow( a:Tensor, b:Union[int,float,Tensor,]=0, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return PowScalar.apply(a,b,"powscalar",out,author) + return PowScalar.apply(a,b,out,author,requires_grad) else: - return Pow.apply(a,b,"pow",out,author) + return Pow.apply(a,b,out,author,requires_grad) #exp OpNode.register("exp") class Exp(Function): @staticmethod def forward(ctx:Context, a,out,author='miaobyte'): - ctx.save_tensor(a) + if ctx.requires_grad: + ctx.save_tensors(a) return _A_elementwiseop_C(a,"exp",out,author) @staticmethod @@ -360,14 +417,17 @@ def backward(ctx:Context, out_grad): def exp( a:Tensor, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: - return Exp.apply(a,out,author) + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: + return Exp.apply(a,out,author,requires_grad) #log OpNode.register("log") class Log(Function): @staticmethod def forward(ctx:Context, a,out,author='miaobyte'): - ctx.save_tensor(a) + if ctx.requires_grad: + ctx.save_tensors(a) return _A_elementwiseop_C(a,"log",out,author) @staticmethod @@ -377,14 +437,17 @@ def 
backward(ctx:Context, out_grad): def log( a:Tensor, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: - return Log.apply(a,out,author) + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: + return Log.apply(a,out,author,requires_grad) OpNode.register("rsqrt") class Rsqrt(Function): @staticmethod def forward(ctx:Context, a,out,author='miaobyte'): - ctx.save_tensor(a) + if ctx.requires_grad: + ctx.save_tensors(a) return _A_elementwiseop_C(a,"rsqrt",out,author) @staticmethod @@ -394,8 +457,10 @@ def backward(ctx:Context, out_grad): def rsqrt( input:Tensor, - out:Union[Tensor,str]='',author='miaobyte')->Tensor: - return Rsqrt.apply(input,out,author) + out:Union[Tensor,str]='', + requires_grad:bool=False, + author='miaobyte')->Tensor: + return Rsqrt.apply(input,out,author,requires_grad) diff --git a/front/py/deepx/nn/functional/init.py b/front/py/deepx/nn/functional/init.py index bb5b8f40..e79b683b 100644 --- a/front/py/deepx/nn/functional/init.py +++ b/front/py/deepx/nn/functional/init.py @@ -2,22 +2,29 @@ import math from deepx import Tensor -from deepx.autograd.graph import OpNode +from deepx.autograd.graph import OpNode,Function,Context from deepx.nn.deepxir import DeepxIR,Param from deepx.scheduler import send OpNode.register("constant") - -def constant(t:Tensor, value:Optional[Union[ - float,int]]=None,author='miaobyte') -> Tensor: - opnode = t.graph.add_op("constant") - argnode=t.graph.add_var('',value) - opnode.add_input(argnode) - t.node.add_input(opnode) - if t.graph.eager: - ir=DeepxIR("constant", [Param(t.node.name, 'tensor', t.dtype),Param(value)], [],author) - send(ir) - return t +class Constant(Function): + @staticmethod + def forward(ctx:Context, + t:Tensor, + value:Optional[Union[float,int]]=None, + author='miaobyte') -> Tensor: + opnode = t.graph.add_op("constant") + argnode=t.graph.add_var('',value) + opnode.add_input(argnode) + t.node.add_input(opnode) + if t.graph.eager: + ir=DeepxIR("constant", 
[Param(t.node.name, 'tensor', t.dtype),Param(value)], [],author) + send(ir) + return t +def constant(t:Tensor, + value:Optional[Union[float,int]]=None, + author='miaobyte')->Tensor: + return Constant.apply(t,value,author) def full(*shape, value=0, dtype=None, device=None, name:Union[Tensor,str]='')->Tensor: @@ -39,38 +46,56 @@ def ones(*size, dtype=None, device=None, name:Union[str]='')->Tensor: return full(*size, value=1, dtype=dtype, device=device,name=name) +OpNode.register("arange") +class Arange(Function): + @staticmethod + def forward(ctx:Context, + start:Optional[Union[float,int]]=0, + end:Optional[Union[float,int]]=None, + step:Optional[Union[float,int]]=1,dtype=None, device=None,name:Union[Tensor,str]='',author='miaobyte')->Tensor: + outtensor=None + if isinstance(name,str): + shape=[end-start] + outtensor=Tensor(shape=shape, dtype=dtype, device=device) + outtensor.addtograph(name) + else: + outtensor=name + g=outtensor.graph + if g.eager: + ir=DeepxIR("arange", [outtensor.node.name,start,step], [],author) + send(ir) + return outtensor def arange(start=0, end=None, step=1,dtype=None, device=None,name:Union[Tensor,str]='',author='miaobyte')->Tensor: - outtensor=None - if isinstance(name,str): - shape=[end-start] - outtensor=Tensor(shape=shape, dtype=dtype, device=device) - outtensor.addtograph(name) - else: - outtensor=name - g=outtensor.graph - if g.eager: - ir=DeepxIR("arange", [outtensor.node.name,start,step], [],author) - send(ir) - return outtensor + return Arange.apply(start,end,step,dtype,device,name,author) OpNode.register("uniform") +class Uniform(Function): + @staticmethod + def forward(ctx:Context, + t:Tensor, + low:Optional[Union[float,int]]=0, + high:Optional[Union[float,int]]=1, + seed:Optional[int]=0,author='miaobyte')->Tensor: + if low >= high: + raise ValueError(f"low({low})必须小于high({high})") + if t is None: + raise ValueError("t不能为None") + g=t.graph + + opnode = g.add_op("uniform") + opnode.add_input(g.add_var('',low)) + 
opnode.add_input(g.add_var('',high)) + if seed is not None: + opnode.add_input(g.add_var('',seed)) + t.node.add_input(opnode) + if t.graph.eager: + ir=DeepxIR("uniform", [t.node.name,low, high,seed], [],author) + send(ir) + return t + + def uniform(t:Tensor,low=0, high=1,seed:int=0,author='miaobyte')->Tensor: - if low >= high: - raise ValueError(f"low({low})必须小于high({high})") - if t is None: - raise ValueError("t不能为None") - g=t.graph - - opnode = g.add_op("uniform") - opnode.add_input(g.add_var('',low)) - opnode.add_input(g.add_var('',high)) - if seed is not None: - opnode.add_input(g.add_var('',seed)) - t.node.add_input(opnode) - if t.graph.eager: - ir=DeepxIR("uniform", [t.node.name,low, high,seed], [],author) - send(ir) - return t + return Uniform.apply(t,low,high,seed,author) def rand(*size, dtype=None, device=None): #TODO @@ -80,7 +105,6 @@ def randn(*size, dtype=None, device=None): #TODO pass - def eye( n:int, m:Optional[int]=None, diff --git a/front/py/deepx/nn/functional/matmul.py b/front/py/deepx/nn/functional/matmul.py index 4a788f9f..97356b8d 100644 --- a/front/py/deepx/nn/functional/matmul.py +++ b/front/py/deepx/nn/functional/matmul.py @@ -1,31 +1,42 @@ from typing import Optional,Union from deepx import Tensor -from deepx.autograd import OpNode +from deepx.autograd import OpNode,Function,Context from deepx.nn import DeepxIR from deepx.scheduler import send OpNode.register("matmul") +class Matmul(Function): + @staticmethod + def forward(ctx:Context, + a:Tensor, + b: Tensor, + out:Union[Tensor,str]='', + author:str='cublas'): + ctx.save_tensors(a,b) -def matmul( - a:Tensor, - b: Tensor, - out:Union[Tensor,str]='', - author:str='cublas'): - opnode = a.graph.add_op("matmul") - opnode.add_input(a.node) - opnode.add_input(b.node) + opnode = a.graph.add_op("matmul") + opnode.add_input(a.node) + opnode.add_input(b.node) + + outtensor=None + if isinstance(out,str): + matmulshape=a.Shape.matmul(b.shape) + outtensor=Tensor(shape=matmulshape, dtype=a.dtype, 
device=a.device) + outtensor.addtograph(out) + else: + outtensor=out + outtensor.node.add_input(opnode) + if a.graph.eager: + ir=DeepxIR("matmul", [a.node.name,b.node.name], [outtensor.node.name], author=author) + send(ir) + return outtensor - outtensor=None - if isinstance(out,str): - matmulshape=a.Shape.matmul(b.shape) - outtensor=Tensor(shape=matmulshape, dtype=a.dtype, device=a.device) - outtensor.addtograph(out) - else: - outtensor=out - outtensor.node.add_input(opnode) - if a.graph.eager: - ir=DeepxIR("matmul", [a.node.name,b.node.name], [outtensor.node.name], author=author) - send(ir) - return outtensor + @staticmethod + def backward(ctx:Context,out_grad): + a,b=ctx.get_tensors() + return out_grad @ b.T, a.T @ out_grad + +def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',author:str='cublas')->Tensor: + return Matmul.apply(a,b,out,author) diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot index 4e476571..4b3d20f4 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot @@ -2,34 +2,34 @@ digraph { rankdir=TB node [shape=record] - 140074505155728 [label="t1 + 136548958820992 [label="t1 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 140076479891344 [label="t2 + 136551216711568 [label="t2 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 140074503481968 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 140074503482016 [label="var_1 + 136548919477104 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 136548919477152 [label="var_1 2" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 140074503481920 [label=sqrt color=darkslategray fillcolor=lightgray 
fontname="Courier Bold" labeljust=l shape=box style=filled] - 140074503481824 [label="t3 + 136548919476960 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 136548919477248 [label="t3 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 140074503481728 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 140074503482304 [label="t4 + 136548919477728 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 136548919477632 [label="t4 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 140074503482544 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 140074503482640 [label="t5 + 136548919478064 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 136548919477968 [label="t5 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 140074503487056 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 140074503486960 [label="t6 + 136548919478400 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 136548919478304 [label="t6 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 140074503481968 -> 140076479891344 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074503482016 -> 140074503481968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074505155728 -> 140074503481920 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074503481920 -> 140074503481824 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140076479891344 -> 140074503481728 [arrowsize=0.8 color=gray40 penwidth=1.2] - 
140074503481728 -> 140074503482304 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074503482304 -> 140074503482544 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074503482544 -> 140074503482640 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074503482640 -> 140074503487056 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074503481824 -> 140074503487056 [arrowsize=0.8 color=gray40 penwidth=1.2] - 140074503487056 -> 140074503486960 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919477104 -> 136551216711568 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919477152 -> 136548919477104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548958820992 -> 136548919476960 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919476960 -> 136548919477248 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136551216711568 -> 136548919477728 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919477728 -> 136548919477632 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919477632 -> 136548919478064 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919478064 -> 136548919477968 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919477968 -> 136548919478400 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919477248 -> 136548919478400 [arrowsize=0.8 color=gray40 penwidth=1.2] + 136548919478400 -> 136548919478304 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg index a517b63b..a688a2a1 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg @@ -9,148 +9,148 @@ %3 - + -140074505155728 +136548958820992 t1 (60,) - + -140074503481920 +136548919476960 sqrt - + -140074505155728->140074503481920 +136548958820992->136548919476960 - + -140076479891344 +136551216711568 t2 (60,) - + -140074503481728 +136548919477728 log - + -140076479891344->140074503481728 +136551216711568->136548919477728 - + -140074503481968 +136548919477104 
constant - + -140074503481968->140076479891344 +136548919477104->136551216711568 - + -140074503482016 +136548919477152 var_1 2 - + -140074503482016->140074503481968 +136548919477152->136548919477104 - + -140074503481824 +136548919477248 t3 (60,) - + -140074503481920->140074503481824 +136548919476960->136548919477248 - + -140074503487056 +136548919478400 pow - + -140074503481824->140074503487056 +136548919477248->136548919478400 - + -140074503482304 +136548919477632 t4 (60,) - + -140074503481728->140074503482304 +136548919477728->136548919477632 - + -140074503482544 +136548919478064 exp - + -140074503482304->140074503482544 +136548919477632->136548919478064 - + -140074503482640 +136548919477968 t5 (60,) - + -140074503482544->140074503482640 +136548919478064->136548919477968 - + -140074503482640->140074503487056 +136548919477968->136548919478400 - + -140074503486960 +136548919478304 t6 (60,) - + -140074503487056->140074503486960 +136548919478400->136548919478304 From 489c164ba2fcade65436e3270d0151fc6ca2892d Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Wed, 2 Apr 2025 22:06:38 +0800 Subject: [PATCH 3/7] half,bfloat16 --- .../src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index cc94ac00..b3333e9e 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -1,6 +1,9 @@ #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU + +#include +#include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" #include From 5d07d3b6f7ddd6bb2418c8b188cc79349285590d Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Wed, 2 Apr 2025 23:01:46 +0800 Subject: [PATCH 4/7] half,bfloat16 --- 
.../src/deepx/tensorfunc/elementwise_miaobyte_sin.cu | 8 ++++---- .../deepx/tensorfunc/elementwise_miaobyte_sqrt.cu | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu index b45ff9a3..acf7832b 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -32,14 +32,14 @@ namespace deepx::tensorfunc __global__ void sin_kernel(const nv_bfloat16* A, nv_bfloat16* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hsin(A[idx]); + C[idx] = ::hsin(A[idx]); } } template <> __global__ void sin_kernel<__half>(const __half* A, __half* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hsin(A[idx]); + C[idx] = ::hsin(A[idx]); } } @@ -79,14 +79,14 @@ namespace deepx::tensorfunc __global__ void cos_kernel(const nv_bfloat16* A, nv_bfloat16* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hcos(A[idx]); + C[idx] = ::hcos(A[idx]); } } template <> __global__ void cos_kernel<__half>(const __half* A, __half* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hcos(A[idx]); + C[idx] = ::hcos(A[idx]); } } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index b3333e9e..8609763a 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -32,14 +32,14 @@ namespace deepx::tensorfunc __global__ void sqrt_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - 
C[idx] = hsqrt(A[idx]); + C[idx] = ::hsqrt(A[idx]); } } template <> __global__ void sqrt_kernel<__half>(const __half* A, __half* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hsqrt(A[idx]); + C[idx] = ::hsqrt(A[idx]); } } @@ -141,14 +141,14 @@ namespace deepx::tensorfunc __global__ void log_kernel<__half>(const __half* A, __half* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hlog(A[idx]); + C[idx] = ::hlog(A[idx]); } } template <> __global__ void log_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hlog(A[idx]); + C[idx] = ::hlog(A[idx]); } } @@ -187,14 +187,14 @@ namespace deepx::tensorfunc __global__ void exp_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hexp(A[idx]); + C[idx] = ::hexp(A[idx]); } } template <> __global__ void exp_kernel<__half>(const __half* A, __half* C,const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = hexp(A[idx]); + C[idx] = ::hexp(A[idx]); } } From 60b61a83e2affdfd40c899f608394a5954e15b0f Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Wed, 2 Apr 2025 23:24:16 +0800 Subject: [PATCH 5/7] half,bfloat16:fix --- .../tensorfunc/elementwise_miaobyte_sin.cu | 23 +- .../tensorfunc/elementwise_miaobyte_sin_a.cu | 67 +++++ .../tensorfunc/elementwise_miaobyte_sqrt.cu | 252 +++++++++--------- .../tensorfunc/elementwise_miaobyte_sqrt_a.cu | 95 +++++++ 4 files changed, 297 insertions(+), 140 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu 
b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu index acf7832b..354f7a89 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -1,7 +1,6 @@ #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU -#include #include @@ -28,18 +27,12 @@ namespace deepx::tensorfunc C[idx] = sinf(A[idx]); } } - template <> - __global__ void sin_kernel(const nv_bfloat16* A, nv_bfloat16* C, const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hsin(A[idx]); - } - } + template <> __global__ void sin_kernel<__half>(const __half* A, __half* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::hsin(A[idx]); + C[idx] = ::__nv_half(hsin(A[idx])); } } @@ -55,7 +48,6 @@ namespace deepx::tensorfunc template void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); template void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); - template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); template void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); // cos @@ -75,18 +67,12 @@ namespace deepx::tensorfunc C[idx] = cosf(A[idx]); } } - template <> - __global__ void cos_kernel(const nv_bfloat16* A, nv_bfloat16* C, const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hcos(A[idx]); - } - } + template <> __global__ void cos_kernel<__half>(const __half* A, __half* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::hcos(A[idx]); + C[idx] = ::__nv_half(hcos(A[idx])); } } @@ -101,7 +87,6 @@ namespace deepx::tensorfunc } template void launch_cos(int numBlocks, int blockSize, const 
double* a, double* c, const int size); template void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); - template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); template void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); // tan diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu new file mode 100644 index 00000000..0b621601 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu @@ -0,0 +1,67 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU + +#include + +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + // sin + template + __global__ void sin_kernel(const T *A, T *C, const int size); + + template <> + __global__ void sin_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = ::__nv_bfloat16(hsin(A[idx])); + } + } + + template + void launch_sin(int numBlocks, int blockSize, const T *a, T *c, const int size) + { + sin_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch sin kernel: " + + std::string(cudaGetErrorString(err))); + } + } + + template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + + // cos + template + __global__ void cos_kernel(const T *A, T *C, const int size); + + template <> + __global__ void cos_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = ::__nv_bfloat16(hcos(A[idx])); + } + } + template + void launch_cos(int 
numBlocks, int blockSize, const T *a, T *c, const int size) + { + cos_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch cos kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index 8609763a..c30f48d4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -1,8 +1,6 @@ #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU - -#include #include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -11,206 +9,218 @@ namespace deepx::tensorfunc { // sqrt - template - __global__ void sqrt_kernel(const T* A, T* C,const int size); + template + __global__ void sqrt_kernel(const T *A, T *C, const int size); template <> - __global__ void sqrt_kernel(const double* A, double* C,const int size){ + __global__ void sqrt_kernel(const double *A, double *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + if (idx < size) + { C[idx] = sqrt(A[idx]); } } template <> - __global__ void sqrt_kernel(const float* A, float* C,const int size){ + __global__ void sqrt_kernel(const float *A, float *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + if (idx < size) + { C[idx] = sqrtf(A[idx]); } } template <> - __global__ void sqrt_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hsqrt(A[idx]); - } - } - template 
<> - __global__ void sqrt_kernel<__half>(const __half* A, __half* C,const int size){ + __global__ void sqrt_kernel<__half>(const __half *A, __half *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hsqrt(A[idx]); + if (idx < size) + { + C[idx] = ::__nv_half(hsqrt(A[idx])); } } - + template - void launch_sqrt(int numBlocks, int blockSize, const T* a, T* c,const int size){ + void launch_sqrt(int numBlocks, int blockSize, const T *a, T *c, const int size) + { sqrt_kernel<<>>(a, c, size); cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch sqrt kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_sqrt(int numBlocks, int blockSize, const double* a, double* c,const int size); - template void launch_sqrt(int numBlocks, int blockSize, const float* a, float* c,const int size); - template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - - + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch sqrt kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_sqrt(int numBlocks, int blockSize, const double *a, double *c, const int size); + template void launch_sqrt(int numBlocks, int blockSize, const float *a, float *c, const int size); + template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); + // pow template - __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); + __global__ void pow_kernel(const T *A, const T *B, T *C, const int size); template <> - __global__ void pow_kernel(const double* A, const double* B, double* C,const int size){ + __global__ void pow_kernel(const double *A, const double *B, double *C, const int size) + { int idx = blockIdx.x * 
blockDim.x + threadIdx.x; - if (idx < size) { + if (idx < size) + { C[idx] = pow(A[idx], B[idx]); } } template <> - __global__ void pow_kernel(const float* A, const float* B, float* C,const int size){ + __global__ void pow_kernel(const float *A, const float *B, float *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + if (idx < size) + { C[idx] = powf(A[idx], B[idx]); } } template - void launch_pow(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size){ + void launch_pow(int numBlocks, int blockSize, const T *a, const T *b, T *c, const int size) + { pow_kernel<<>>(a, b, c, size); cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch pow kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_pow(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); - template void launch_pow(int numBlocks, int blockSize, const float* a, const float* b, float* c,const int size); - + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch pow kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_pow(int numBlocks, int blockSize, const double *a, const double *b, double *c, const int size); + template void launch_pow(int numBlocks, int blockSize, const float *a, const float *b, float *c, const int size); + // powscalar template - __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); + __global__ void powscalar_kernel(const T *A, const T scalar, T *C, const int size); template <> - __global__ void powscalar_kernel(const double* A, const double scalar, double* C,const int size){ + __global__ void powscalar_kernel(const double *A, const double scalar, double *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = pow(A[idx], scalar); + if (idx < size) + { + C[idx] = pow(A[idx], 
scalar); } } template <> - __global__ void powscalar_kernel(const float* A, const float scalar, float* C,const int size){ + __global__ void powscalar_kernel(const float *A, const float scalar, float *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = powf(A[idx], scalar); + if (idx < size) + { + C[idx] = powf(A[idx], scalar); } } - template __global__ void powscalar_kernel(const double* A, const double scalar, double* C,const int size); - template __global__ void powscalar_kernel(const float* A, const float scalar, float* C,const int size); - + template __global__ void powscalar_kernel(const double *A, const double scalar, double *C, const int size); + template __global__ void powscalar_kernel(const float *A, const float scalar, float *C, const int size); + template - void launch_powscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size){ + void launch_powscalar(int numBlocks, int blockSize, const T *a, const T scalar, T *c, const int size) + { powscalar_kernel<<>>(a, scalar, c, size); cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch powscalar kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_powscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int size); - template void launch_powscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); - + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch powscalar kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_powscalar(int numBlocks, int blockSize, const double *a, const double scalar, double *c, const int size); + template void launch_powscalar(int numBlocks, int blockSize, const float *a, const float scalar, float *c, const int size); + // log template - __global__ void log_kernel(const T* A, T* C,const int 
size); + __global__ void log_kernel(const T *A, T *C, const int size); template <> - __global__ void log_kernel(const double* A, double* C,const int size){ + __global__ void log_kernel(const double *A, double *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = logf(A[idx]); + if (idx < size) + { + C[idx] = logf(A[idx]); } } template <> - __global__ void log_kernel(const float* A, float* C,const int size){ + __global__ void log_kernel(const float *A, float *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = logf(A[idx]); + if (idx < size) + { + C[idx] = logf(A[idx]); } } template <> - __global__ void log_kernel<__half>(const __half* A, __half* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hlog(A[idx]); - } - } - template <> - __global__ void log_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ + __global__ void log_kernel<__half>(const __half *A, __half *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hlog(A[idx]); + if (idx < size) + { + C[idx] = ::__nv_half(hlog(A[idx])); } } - + template - void launch_log(int numBlocks, int blockSize, const T* a, T* c,const int size){ + void launch_log(int numBlocks, int blockSize, const T *a, T *c, const int size) + { log_kernel<<>>(a, c, size); cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch log kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_log(int numBlocks, int blockSize, const double* a, double* c,const int size); - template void launch_log(int numBlocks, int blockSize, const float* a, float* c,const int size); - template void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - template void launch_log<__half>(int numBlocks, int blockSize, const __half* a, 
__half* c,const int size); - - // exp - template - __global__ void exp_kernel(const T* A, T* C,const int size); - template <> - __global__ void exp_kernel(const double* A, double* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = exp(A[idx]); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch log kernel: " + + std::string(cudaGetErrorString(err))); } } + template void launch_log(int numBlocks, int blockSize, const double *a, double *c, const int size); + template void launch_log(int numBlocks, int blockSize, const float *a, float *c, const int size); + template void launch_log<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); + + // exp + template + __global__ void exp_kernel(const T *A, T *C, const int size); template <> - __global__ void exp_kernel(const float* A, float* C,const int size){ + __global__ void exp_kernel(const double *A, double *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = expf(A[idx]); + if (idx < size) + { + C[idx] = exp(A[idx]); } } template <> - __global__ void exp_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size){ + __global__ void exp_kernel(const float *A, float *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hexp(A[idx]); + if (idx < size) + { + C[idx] = expf(A[idx]); } } + template <> - __global__ void exp_kernel<__half>(const __half* A, __half* C,const int size){ + __global__ void exp_kernel<__half>(const __half *A, __half *C, const int size) + { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - C[idx] = ::hexp(A[idx]); + if (idx < size) + { + C[idx] = ::__nv_half(hexp(A[idx])); } } - + template - void launch_exp(int numBlocks, int blockSize, const T* a, T* c,const int size){ + void launch_exp(int numBlocks, int blockSize, const T *a, T *c, const int size) + { exp_kernel<<>>(a, c, size); 
cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch exp kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_exp(int numBlocks, int blockSize, const double* a, double* c,const int size); - template void launch_exp(int numBlocks, int blockSize, const float* a, float* c,const int size); - template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - template void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch exp kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_exp(int numBlocks, int blockSize, const double *a, double *c, const int size); + template void launch_exp(int numBlocks, int blockSize, const float *a, float *c, const int size); + template void launch_exp<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu new file mode 100644 index 00000000..fb261bb0 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu @@ -0,0 +1,95 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU + +#include + +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include + +namespace deepx::tensorfunc +{ + // sqrt + template + __global__ void sqrt_kernel(const T *A, T *C, const int size); + template <> + + template <> + __global__ void sqrt_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = ::__nv_bfloat16(hsqrt(A[idx])); + } + 
} + + template + void launch_sqrt(int numBlocks, int blockSize, const T *a, T *c, const int size) + { + sqrt_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch sqrt kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + + // log + template + __global__ void log_kernel(const T *A, T *C, const int size); + + template <> + __global__ void log_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = ::__nv_bfloat16(hlog(A[idx])); + } + } + + template + void launch_log(int numBlocks, int blockSize, const T *a, T *c, const int size) + { + log_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch log kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_log(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + + // exp + template + __global__ void exp_kernel(const T *A, T *C, const int size); + + template <> + __global__ void exp_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = ::__nv_bfloat16(hexp(A[idx])); + } + } + + template + void launch_exp(int numBlocks, int blockSize, const T *a, T *c, const int size) + { + exp_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch exp kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU From 
e3993e6313583ccd233c24bd6bdecf3c973b47e9 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Wed, 2 Apr 2025 23:40:16 +0800 Subject: [PATCH 6/7] half,bfloat16:fix --- .../src/deepx/tensorfunc/elementwise_miaobyte_sin.cu | 4 ++-- .../src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu | 4 ++-- .../src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu | 6 +++--- .../src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu index 354f7a89..23b78dbf 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -32,7 +32,7 @@ namespace deepx::tensorfunc __global__ void sin_kernel<__half>(const __half* A, __half* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_half(hsin(A[idx])); + C[idx] = hsin(A[idx]); } } @@ -72,7 +72,7 @@ namespace deepx::tensorfunc __global__ void cos_kernel<__half>(const __half* A, __half* C, const int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_half(hcos(A[idx])); + C[idx] = hcos(A[idx]); } } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu index 0b621601..0660c3fa 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu @@ -18,7 +18,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_bfloat16(hsin(A[idx])); + C[idx] = hsin(A[idx]); } } @@ -46,7 +46,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = 
::__nv_bfloat16(hcos(A[idx])); + C[idx] = hcos(A[idx]); } } template diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index c30f48d4..fe5c92ca 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -36,7 +36,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_half(hsqrt(A[idx])); + C[idx] = hsqrt(A[idx]); } } @@ -156,7 +156,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_half(hlog(A[idx])); + C[idx] = hlog(A[idx]); } } @@ -203,7 +203,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_half(hexp(A[idx])); + C[idx] = hexp(A[idx]); } } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu index fb261bb0..4c67fa04 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu @@ -20,7 +20,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_bfloat16(hsqrt(A[idx])); + C[idx] = hsqrt(A[idx]); } } @@ -47,7 +47,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_bfloat16(hlog(A[idx])); + C[idx] = hlog(A[idx]); } } @@ -74,7 +74,7 @@ namespace deepx::tensorfunc int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { - C[idx] = ::__nv_bfloat16(hexp(A[idx])); + C[idx] = hexp(A[idx]); } } From fbd68e01ca653f056258808dce79fcde7d2e6a18 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Wed, 2 Apr 2025 23:50:26 +0800 
Subject: [PATCH 7/7] half,bfloat16:fix --- .github/workflows/excuter-cuda-linux.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/excuter-cuda-linux.yml b/.github/workflows/excuter-cuda-linux.yml index b65169c4..5da2e626 100644 --- a/.github/workflows/excuter-cuda-linux.yml +++ b/.github/workflows/excuter-cuda-linux.yml @@ -2,7 +2,7 @@ name: Excuter/cuda-linux Build on: [push, pull_request] env: - CUDA_VERSION: "12.1.0" + CUDA_VERSION: "12.6.0" CUDA_MAJOR_VERSION: "12" CUDNN_VERSION: "8.9.7.29" CUTLASS_VERSION: "3.4.1" @@ -29,7 +29,7 @@ jobs: run: | docker run --rm -v ${{ github.workspace }}:/workspace \ -w /workspace \ - nvidia/cuda:12.1.0-devel-ubuntu22.04 \ + nvidia/cuda:12.6.0-devel-ubuntu22.04 \ /bin/bash -c " # 安装系统依赖 apt-get update && \