diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index e9f62d96..2d7a12fb 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -10,8 +10,10 @@ | transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | | reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | -| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | compare(tensor A, tensor B)->(tensor mask) | +| equalscalar | miaobyte | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | +| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | +| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | | log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | @@ -22,28 +24,28 @@ | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() | -| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | -| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | | print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | | print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | 
T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | | subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| equal | miaobyte | equal(tensor A, tensor B, var epsilon)->(tensor mask) | mask=compare(T1, T2) | equal(tensor A, tensor B, var epsilon)->(tensor mask) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | | max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) | | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | | deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 9593ef2f..68ea3b70 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -12,6 +12,7 @@ | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | +| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | | log | 
miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | @@ -38,11 +39,12 @@ | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | | max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | | maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp index b755a4c9..af5f0dc6 100644 --- a/excuter/cpp-common/src/deepx/tensor.hpp +++ b/excuter/cpp-common/src/deepx/tensor.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "deepx/shape.hpp" #include "deepx/dtype.hpp" @@ -12,7 +13,7 @@ namespace deepx { using namespace std; - + template struct Tensor : public TensorBase { @@ -28,11 +29,11 @@ namespace deepx CopyFn copyer; // 拷贝内存 Tensor() = default; - Tensor(const vector &s) + Tensor(const vector &s) { shape = Shape(s); } - Tensor(const Shape &s) + Tensor(const Shape &s) { shape = s; } @@ -140,28 +141,6 @@ namespace deepx tensor.newer = nullptr; return *this; } - }; - - // template - // struct TensorSlice { - // Slice slice; - // Tensor tensor; - // }; - - // 添加一个新的类用于类型擦除 - struct TensorVoid : public TensorBase { - void* data; - void (*deleter)(void*); - void (*copyer)(void*, void*, int); - void* (*newer)(int); - - TensorVoid() = default; - ~TensorVoid() { - if (data && deleter) { - deleter(data); - data = nullptr; - } - } - }; + }; } #endif \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index 8c9ebd94..ca44fd13 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -15,7 +15,6 @@ namespace deepx::tensorfunc } }; - // A+B=>C template void add(const Tensor &A, const Tensor &B, Tensor &C) @@ -26,7 +25,8 @@ namespace deepx::tensorfunc template struct addscalarDispatcher { - static void addscalar(const Tensor &input, const T value, Tensor &output){ + static void addscalar(const Tensor &input, const T value, Tensor &output) + { throw NotImplementError("addscalar"); } }; @@ -41,7 +41,8 @@ namespace deepx::tensorfunc template struct subDispatcher { - static void 
sub(const Tensor &A, const Tensor &B, Tensor &C){ + static void sub(const Tensor &A, const Tensor &B, Tensor &C) + { throw NotImplementError("sub"); } }; @@ -56,7 +57,8 @@ namespace deepx::tensorfunc template struct subscalarDispatcher { - static void subscalar(const Tensor &input, const T value, Tensor &output){ + static void subscalar(const Tensor &input, const T value, Tensor &output) + { throw NotImplementError("subscalar"); } }; @@ -94,8 +96,6 @@ namespace deepx::tensorfunc mulscalarDispatcher::mulscalar(input, value, output); } - - template struct divDispatcher { @@ -135,27 +135,12 @@ namespace deepx::tensorfunc rdivscalarDispatcher::rdivscalar(value, input, output); } - - template - struct sqrtDispatcher - { - static void sqrt(const Tensor &input, Tensor &output) = delete; - }; - - // sqrt(A)=>C - template - void sqrt(const Tensor &input, Tensor &output) - { - sqrtDispatcher::sqrt(input, output); - } - + // A^B=>C template struct powDispatcher { static void pow(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; - - // A^B=>C template void pow(const Tensor &A, const Tensor &B, Tensor &C) { @@ -175,6 +160,32 @@ namespace deepx::tensorfunc powscalarDispatcher::powscalar(input, value, output); } + template + struct rpowscalarDispatcher + { + static void rpowscalar(const T value, const Tensor &input, Tensor &output) = delete; + }; + + // scalar^A=>C + template + void rpowscalar(const T value, const Tensor &input, Tensor &output) + { + rpowscalarDispatcher::rpowscalar(value, input, output); + } + + template + struct sqrtDispatcher + { + static void sqrt(const Tensor &input, Tensor &output) = delete; + }; + + // sqrt(A)=>C + template + void sqrt(const Tensor &input, Tensor &output) + { + sqrtDispatcher::sqrt(input, output); + } + template struct logDispatcher { @@ -253,8 +264,6 @@ namespace deepx::tensorfunc maxDispatcher::max(A, B, C); } - - template struct maxscalarDispatcher { @@ -268,8 +277,6 @@ namespace deepx::tensorfunc maxscalarDispatcher::maxscalar(A, b, C); } - - template struct minDispatcher { @@ -295,49 +302,112 @@ namespace deepx::tensorfunc { minscalarDispatcher::minscalar(A, b, C); } - - template - struct compareDispatcher + + // equal(A,B)=>mask + + template + struct equalDispatcher { - static void compare(const Tensor &A, const Tensor &B, Tensor &mask) = delete; + static void equal(const Tensor &A, const Tensor &B, float epsilon, Tensor &mask) = delete; }; - // compare(A,B)=>mask - // if A[i]==B[i], mask[i]=0.5 - // if A[i]>B[i], mask[i]=0 - // if A[i] - void compare(const Tensor &A, const Tensor &B,Tensor &mask) + template + void equal(const Tensor &A, const Tensor &B, float epsilon, Tensor &mask) { - compareDispatcher::compare(A, B, mask); + equalDispatcher::equal(A, B, epsilon, mask); } - template - struct comparescalarDispatcher + // equal(A,scalar)=>mask + template + struct equalscalarDispatcher { - static void comparescalar(const Tensor &A, const T scalar, Tensor &mask) = delete; + static void equalscalar(const Tensor &A, const T scalar, float epsilon, Tensor &mask) = delete; }; - template - void comparescalar(const Tensor &A, const T scalar, Tensor &mask) + template + void equalscalar(const Tensor &A, const T scalar, float epsilon, Tensor &mask) + { + equalscalarDispatcher::equalscalar(A, scalar, epsilon, mask); + } + + // less(A,B)=>mask + template + struct lessDispatcher { - comparescalarDispatcher::comparescalar(A, scalar, mask); + static void less(const Tensor &A, const Tensor &B, Tensor &mask) = delete; + }; + + template + void less(const Tensor &A, 
const Tensor &B, Tensor &mask) + { + lessDispatcher::less(A, B, mask); + } + + // less(A,scalar)=>mask + template + struct lessscalarDispatcher + { + static void lessscalar(const Tensor &A, const T scalar, Tensor &mask) = delete; + }; + + template + void lessscalar(const Tensor &A, const T scalar, Tensor &mask) + { + lessscalarDispatcher::lessscalar(A, scalar, mask); } - // 判断两个张量是否相等,TODO + // greater(A,B)=>C + template + struct greaterDispatcher + { + static void greater(const Tensor &A, const Tensor &B, Tensor &mask) = delete; + }; + + template + void greater(const Tensor &A, const Tensor &B, Tensor &mask) + { + greaterDispatcher::greater(A, B, mask); + } + + // greater(A,scalar)=>C + template + struct greaterscalarDispatcher + { + static void greaterscalar(const Tensor &A, const T scalar, Tensor &mask) = delete; + }; + + template + void greaterscalar(const Tensor &A, const T scalar, Tensor &mask) + { + greaterscalarDispatcher::greaterscalar(A, scalar, mask); + } + + // switch(tensors,cases)=>C + template + struct switchDispatcher + { + static void Switch(const vector *> tensors, const Tensor &cases, Tensor &C) = delete; + }; + + template + void Switch(const vector *> tensors, const Tensor &cases, Tensor &C) + { + switchDispatcher::Switch(tensors, cases, C); + } + + // invert(A)=>C template - struct equalDispatcher + struct invertDispatcher { - static bool equal(const Tensor &A, const Tensor &B, float epsilon=1e-6) = delete; + static void invert(const Tensor &input, Tensor &output) = delete; }; template - bool equal(const Tensor &A, const Tensor &B,float epsilon=1e-6) + void invert(const Tensor &input, Tensor &output) { - return equalDispatcher::equal(A, B, epsilon); + invertDispatcher::invert(input, output); } - - + } // namespace deepx::tensorfunc #endif // DEEPX_TENSORFUNC_ELEMENTWISE_HPP diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp index 95c2956d..430dc4c5 100644 --- a/excuter/cpp-common/src/deepx/tf/tf.hpp +++ b/excuter/cpp-common/src/deepx/tf/tf.hpp @@ -146,7 +146,7 @@ namespace deepx::tf { for (const auto &name : names) { - if (!mem->gettensor(name)) + if (!mem->existstensor(name)) { error = "tensor not found: " + name; return false; diff --git a/excuter/cpp-common/src/stdutil/print.hpp b/excuter/cpp-common/src/stdutil/print.hpp index 31c2737c..139575b4 100644 --- a/excuter/cpp-common/src/stdutil/print.hpp +++ b/excuter/cpp-common/src/stdutil/print.hpp @@ -14,7 +14,12 @@ namespace stdutil { switch (dtype) { - case deepx::Precision::Int8: + case Precision::Bool:{ + bool bool_data = ((bool *)data)[offset]; + printf(format.c_str(), static_cast(bool_data)); + break; + } + case Precision::Int8: printf(format.c_str(), ((int8_t *)data)[offset]); break; case Precision::Int16: @@ -26,15 +31,16 @@ namespace stdutil case Precision::Int64: printf(format.c_str(), ((int64_t *)data)[offset]); break; - case Precision::Float32: - printf(format.c_str(), ((float *)data)[offset]); - break; + case Precision::Float64: printf(format.c_str(), ((double *)data)[offset]); break; + case Precision::Float32: + printf(format.c_str(), ((float *)data)[offset]); + break; case Precision::Float16: printf(format.c_str(), ((float *)data)[offset]); - break; + break; case Precision::BFloat16: printf(format.c_str(), ((float *)data)[offset]); break; @@ -58,25 +64,25 @@ namespace stdutil format = "%d"; } else if (dtype == Precision::String) - { + { format = "%s"; }; - - + return format; } - void print(const std::vector &shape_vec, void *data, const Precision &dtype, const 
std::string &f="") + void print(const std::vector &shape_vec, void *data, const Precision &dtype, const std::string &f = "") { std::string format = f; - if (f.empty()) { + if (f.empty()) + { format = stdutil::default_format(dtype); } // 创建临时Shape对象用于打印和计算 deepx::Shape shape(shape_vec); shape.dtype = dtype; - + shape.print(); if (shape.dim == 1) { @@ -89,10 +95,10 @@ namespace stdutil } std::cout << "]" << std::endl; } - else + else { shape.range(-2, [&format, data, &shape, &dtype](const int idx_linear, const std::vector &indices) - { + { std::cout << indices << "="; std::cout<<"["< 0) std::cout << " "; int offset = idx_linear + i * shape[-1] + j; - stdutil::print_element(data, offset, dtype, format); + print_element(data, offset, dtype, format); } std::cout<<"]"; diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index f8e79c7b..c64973ca 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -205,7 +205,16 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - + //invert + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8), + }))); + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), @@ -227,12 +236,23 @@ namespace deepx::tf tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), - Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Float32), + Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + }))); + //rpowscalar + tffactory.add_tf(std::make_shared>(vector( + { + Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }), vector( { Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); + //log tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), @@ -309,24 +329,77 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - tffactory.add_tf(std::make_shared>(vector( + //equal + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), + Param("epsilon", DataCategory::Var, Precision::Float64), }), vector( { - Param("mask", DataCategory::Tensor, Precision::Int8), + Param("mask", DataCategory::Tensor, Precision::Bool), }))); - tffactory.add_tf(std::make_shared>(vector( + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("scalar", DataCategory::Var, Precision::Any), + Param("epsilon", DataCategory::Var, Precision::Float64), }), vector( { - Param("mask", DataCategory::Tensor, Precision::Int8), + Param("mask", DataCategory::Tensor, Precision::Bool), }))); + //less + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("mask", 
DataCategory::Tensor, Precision::Bool), + }))); + //lessscalar + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Bool), + }))); + //greater + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Bool), + }))); + //greaterscalar + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Bool), + }))); + //switch + tffactory.add_tf(std::make_shared>(vector( + { + Param("tensors", DataCategory::ListTensor, Precision::Any), + Param("cases", DataCategory::Tensor,Precision::Int8), + }), + vector( + { + Param("result", DataCategory::Tensor, Precision::Any), + }))); } // matmul void register_matmul(TfFactory &tffactory) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh index a1fcf9fa..7b0f5d31 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ -15,30 +15,7 @@ namespace deepx::tensorfunc template void launch_transpose( const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - // template <> - // void launch_transpose( const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - - // template <> - // void launch_transpose( const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - - // template <> - // void launch_transpose(const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - - // template <> - // void launch_transpose<__half>(const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - - // template <> - // void launch_transpose(const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - - // template <> - // void launch_transpose(const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - - // template <> - // void launch_transpose(const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - - // template <> - // void launch_transpose(const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - + template __global__ void concat_kernel(const T **tensorsData, const int *inputStrides, @@ -53,30 +30,7 @@ namespace deepx::tensorfunc template void launch_concat(const T **tensorsData, const int *inputStrides, T *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - // template <> - // void 
launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - // template <> - // void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - // template <> - // void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - // template <> - // void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - // template <> - // void launch_concat(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - // template <> - // void launch_concat(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - // template <> - // void launch_concat(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - // template <> - // void launch_concat(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - + __host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices); @@ -91,46 +45,6 @@ namespace deepx::tensorfunc void launch_broadcastTo(const T *input, const int *inputStrides,const int intputDim, const BroadcastMap *broadcastMap, T *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo(const double *input, const int *inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// double *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo(const float *input, const int *inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// float *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo(const nv_bfloat16 *input, const int *inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// nv_bfloat16 *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo<__half>(const __half *input, const int *inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// __half *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo(const int64_t *input, const int *inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// int64_t *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo(const int32_t *input, const int 
*inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// int32_t *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo(const int16_t *input, const int *inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// int16_t *output, const int *outputStrides,const int outputDim,const int outputlen); - -// template <> -// void launch_broadcastTo(const int8_t *input, const int *inputStrides,const int inputDim, -// const BroadcastMap *broadcastMap, -// int8_t *output, const int *outputStrides,const int outputDim,const int outputlen); -// } + }; #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_atomic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_atomic.cuh new file mode 100644 index 00000000..58596f7b --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_atomic.cuh @@ -0,0 +1,260 @@ +#ifndef DEEPX_TENSORFUNC_CUDA_ATOMIC_CUH +#define DEEPX_TENSORFUNC_CUDA_ATOMIC_CUH + +#include +#include +#include +#include +namespace deepx::tensorfunc +{ + // atomicAdd + template + __device__ __forceinline__ void deepx_atomicAdd(T *a, T b); + + template <> + __device__ __forceinline__ void deepx_atomicAdd(double *a, double b) + { + atomicAdd(a, b); + } + + template <> + __device__ __forceinline__ void deepx_atomicAdd(float *a, float b) + { + atomicAdd(a, b); + } + + template <> + __device__ __forceinline__ void deepx_atomicAdd(half *a, half b) + { + atomicAdd(a, b); + } + + template <> + __device__ __forceinline__ void deepx_atomicAdd(nv_bfloat16 *a, nv_bfloat16 b) + { + atomicAdd(a, b); + } + + template <> + __device__ __forceinline__ void deepx_atomicAdd(int64_t *a, int64_t b) + { + int64_t old = *a; + int64_t assumed; + do + { + assumed = old; + old = atomicCAS((unsigned long long *)a, (unsigned long long)assumed, (unsigned long long)(assumed + b)); + } while (assumed != old); + *a = old + b; + } + + template <> + __device__ __forceinline__ void deepx_atomicAdd(int32_t *a, int32_t b) + { + atomicAdd(a, b); + } + + template <> + __device__ __forceinline__ void deepx_atomicAdd(int16_t *a, int16_t b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + unsigned int new_val; + if ((size_t)a & 2) + { + new_val = (old & 0x0000FFFF) | (((unsigned short)(((old >> 16) & 0xFFFF) + b)) << 16); + } + else + { + new_val = (old & 0xFFFF0000) | ((unsigned short)((old & 0xFFFF) + b)); + } + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicAdd(int8_t *a, int8_t b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 3)); + unsigned int old = *address_as_uint; + unsigned int assumed; + unsigned int byte_offset = ((size_t)a & 3) * 8; + unsigned int mask = 0xFF << byte_offset; + + do + { + assumed = old; + unsigned char byte_val = (old >> byte_offset) & 0xFF; + byte_val += b; + unsigned int new_val = (old & ~mask) | (byte_val << byte_offset); + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } + + + // atomicMul + // atomicMul + template + __device__ __forceinline__ void deepx_atomicMul(T *a, T b); + + template <> + __device__ __forceinline__ void deepx_atomicMul(double *a, double b) + { + double old = *a; + double assumed; + do + { + assumed = 
old; + old = __longlong_as_double(atomicCAS((unsigned long long int*)a, + __double_as_longlong(assumed), + __double_as_longlong(assumed * b))); + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicMul(float *a, float b) + { + float old = *a; + float assumed; + do + { + assumed = old; + old = __int_as_float(atomicCAS((int*)a, + __float_as_int(assumed), + __float_as_int(assumed * b))); + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicMul(half *a, half b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + half assumed_half; + if ((size_t)a & 2) + { + assumed_half = __ushort_as_half((unsigned short)(old >> 16)); + half new_half = __hmul(assumed_half, b); + unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__half_as_ushort(new_half) << 16); + old = atomicCAS(address_as_uint, assumed, new_val); + } + else + { + assumed_half = __ushort_as_half((unsigned short)(old & 0xFFFF)); + half new_half = __hmul(assumed_half, b); + unsigned int new_val = (old & 0xFFFF0000) | __half_as_ushort(new_half); + old = atomicCAS(address_as_uint, assumed, new_val); + } + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicMul(nv_bfloat16 *a, nv_bfloat16 b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + nv_bfloat16 assumed_bf16; + if ((size_t)a & 2) + { + assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old >> 16)); + nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b); + unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__bfloat16_as_ushort(new_bf16) << 16); + old = atomicCAS(address_as_uint, assumed, new_val); + } + else + { + assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old & 0xFFFF)); + nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b); + unsigned int new_val = (old & 0xFFFF0000) | __bfloat16_as_ushort(new_bf16); + old = atomicCAS(address_as_uint, assumed, new_val); + } + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicMul(int64_t *a, int64_t b) + { + int64_t old = *a; + int64_t assumed; + do + { + assumed = old; + old = atomicCAS((unsigned long long *)a, + (unsigned long long)assumed, + (unsigned long long)(assumed * b)); + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicMul(int32_t *a, int32_t b) + { + int32_t old = *a; + int32_t assumed; + do + { + assumed = old; + old = atomicCAS((int32_t *)a, assumed, assumed * b); + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicMul(int16_t *a, int16_t b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + unsigned int new_val; + if ((size_t)a & 2) + { + int16_t assumed_short = (int16_t)(old >> 16); + new_val = (old & 0x0000FFFF) | (((unsigned short)(assumed_short * b)) << 16); + } + else + { + int16_t assumed_short = (int16_t)(old & 0xFFFF); + new_val = (old & 0xFFFF0000) | ((unsigned short)(assumed_short * b)); + } + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } + + template <> + __device__ __forceinline__ void deepx_atomicMul(int8_t *a, int8_t b) + { + unsigned int *address_as_uint = 
(unsigned int *)((char *)a - ((size_t)a & 3)); + unsigned int old = *address_as_uint; + unsigned int assumed; + unsigned int byte_offset = ((size_t)a & 3) * 8; + unsigned int mask = 0xFF << byte_offset; + + do + { + assumed = old; + int8_t byte_val = (old >> byte_offset) & 0xFF; + byte_val *= b; + unsigned int new_val = (old & ~mask) | ((byte_val & 0xFF) << byte_offset); + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } +} + +#endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh index d1828724..14764266 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh @@ -9,354 +9,207 @@ namespace deepx::tensorfunc { - // max + //sqrt template - __device__ void deepx_max(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_sqrt(const T *a, T *out); template <> - __device__ void deepx_max(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_sqrt(const double *a, double *out) { - *out = fmax(*a, *b); + *out = sqrt(*a); } template <> - __device__ void deepx_max(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_sqrt(const float *a, float *out) { - *out = fmaxf(*a, *b); + *out = sqrtf(*a); } template <> - __device__ void deepx_max(const half *a, const half *b, half *out) + __device__ __forceinline__ void deepx_sqrt(const half *a, half *out) { - *out = __hmax(*a, *b); + *out = hsqrt(*a); } template <> - __device__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_sqrt(const nv_bfloat16 *a, nv_bfloat16 *out) { - *out = __hmax(*a, *b); - } - template <> - __device__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) - { - *out = *a > *b ? *a : *b; - } - template <> - __device__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) - { - *out = *a > *b ? *a : *b; + *out = hsqrt(*a); } + + //pow + template + __device__ __forceinline__ void deepx_pow(const T *a, const T *b, T *out); + template <> - __device__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) + __device__ __forceinline__ void deepx_pow(const double *a, const double *b, double *out) { - *out = *a > *b ? *a : *b; + *out = pow(*a, *b); } + template <> - __device__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) + __device__ __forceinline__ void deepx_pow(const float *a, const float *b, float *out) { - *out = *a > *b ? 
*a : *b; + *out = powf(*a, *b); } - // min + //log template - __device__ void deepx_min(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_log(const T *a, T *out); template <> - __device__ void deepx_min(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_log(const double *a, double *out) { - *out = fmin(*a, *b); + *out = log(*a); } template <> - __device__ void deepx_min(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_log(const float *a, float *out) { - *out = fminf(*a, *b); + *out = logf(*a); } template <> - __device__ void deepx_min(const half *a, const half *b, half *out) + __device__ __forceinline__ void deepx_log(const half *a, half *out) { - *out = __hmin(*a, *b); - } + *out = hlog(*a); + } template <> - __device__ void deepx_min(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_log(const nv_bfloat16 *a, nv_bfloat16 *out) { - *out = __hmin(*a, *b); + *out = hlog(*a); } + //exp + template + __device__ __forceinline__ void deepx_exp(const T *a, T *out); + template <> - __device__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) + __device__ __forceinline__ void deepx_exp(const double *a, double *out) { - *out = *a < *b ? *a : *b; + *out = exp(*a); } template <> - __device__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) + __device__ __forceinline__ void deepx_exp(const float *a, float *out) { - *out = *a < *b ? *a : *b; + *out = expf(*a); } template <> - __device__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) + __device__ __forceinline__ void deepx_exp(const half *a, half *out) { - *out = *a < *b ? *a : *b; + *out = hexp(*a); } template <> - __device__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) + __device__ __forceinline__ void deepx_exp(const nv_bfloat16 *a, nv_bfloat16 *out) { - *out = *a < *b ? *a : *b; + *out = hexp(*a); } - - // atomicAdd + + // max template - __device__ void deepx_atomicAdd(T *a, T b); + __device__ __forceinline__ void deepx_max(const T *a, const T *b, T *out); template <> - __device__ void deepx_atomicAdd(double *a, double b) + __device__ __forceinline__ void deepx_max(const double *a, const double *b, double *out) { - atomicAdd(a, b); + *out = fmax(*a, *b); } template <> - __device__ void deepx_atomicAdd(float *a, float b) + __device__ __forceinline__ void deepx_max(const float *a, const float *b, float *out) { - atomicAdd(a, b); + *out = fmaxf(*a, *b); } template <> - __device__ void deepx_atomicAdd(half *a, half b) + __device__ __forceinline__ void deepx_max(const half *a, const half *b, half *out) { - atomicAdd(a, b); + *out = __hmax(*a, *b); } template <> - __device__ void deepx_atomicAdd(nv_bfloat16 *a, nv_bfloat16 b) + __device__ __forceinline__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) { - atomicAdd(a, b); + *out = __hmax(*a, *b); } - template <> - __device__ void deepx_atomicAdd(int64_t *a, int64_t b) + __device__ __forceinline__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) { - int64_t old = *a; - int64_t assumed; - do - { - assumed = old; - old = atomicCAS((unsigned long long *)a, (unsigned long long)assumed, (unsigned long long)(assumed + b)); - } while (assumed != old); - *a = old + b; + *out = *a > *b ? 
*a : *b; } - template <> - __device__ void deepx_atomicAdd(int32_t *a, int32_t b) + __device__ __forceinline__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) { - atomicAdd(a, b); + *out = *a > *b ? *a : *b; } - template <> - __device__ void deepx_atomicAdd(int16_t *a, int16_t b) + __device__ __forceinline__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) { - unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); - unsigned int old = *address_as_uint; - unsigned int assumed; - - do - { - assumed = old; - unsigned int new_val; - if ((size_t)a & 2) - { - new_val = (old & 0x0000FFFF) | (((unsigned short)(((old >> 16) & 0xFFFF) + b)) << 16); - } - else - { - new_val = (old & 0xFFFF0000) | ((unsigned short)((old & 0xFFFF) + b)); - } - old = atomicCAS(address_as_uint, assumed, new_val); - } while (assumed != old); + *out = *a > *b ? *a : *b; } - template <> - __device__ void deepx_atomicAdd(int8_t *a, int8_t b) + __device__ __forceinline__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) { - unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 3)); - unsigned int old = *address_as_uint; - unsigned int assumed; - unsigned int byte_offset = ((size_t)a & 3) * 8; - unsigned int mask = 0xFF << byte_offset; - - do - { - assumed = old; - unsigned char byte_val = (old >> byte_offset) & 0xFF; - byte_val += b; - unsigned int new_val = (old & ~mask) | (byte_val << byte_offset); - old = atomicCAS(address_as_uint, assumed, new_val); - } while (assumed != old); + *out = *a > *b ? *a : *b; } - - // atomicMul - // atomicMul + // min template - __device__ void deepx_atomicMul(T *a, T b); + __device__ __forceinline__ void deepx_min(const T *a, const T *b, T *out); template <> - __device__ void deepx_atomicMul(double *a, double b) + __device__ __forceinline__ void deepx_min(const double *a, const double *b, double *out) { - double old = *a; - double assumed; - do - { - assumed = old; - old = __longlong_as_double(atomicCAS((unsigned long long int*)a, - __double_as_longlong(assumed), - __double_as_longlong(assumed * b))); - } while (assumed != old); + *out = fmin(*a, *b); } template <> - __device__ void deepx_atomicMul(float *a, float b) + __device__ __forceinline__ void deepx_min(const float *a, const float *b, float *out) { - float old = *a; - float assumed; - do - { - assumed = old; - old = __int_as_float(atomicCAS((int*)a, - __float_as_int(assumed), - __float_as_int(assumed * b))); - } while (assumed != old); + *out = fminf(*a, *b); } - + template <> - __device__ void deepx_atomicMul(half *a, half b) + __device__ __forceinline__ void deepx_min(const half *a, const half *b, half *out) { - unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); - unsigned int old = *address_as_uint; - unsigned int assumed; - - do - { - assumed = old; - half assumed_half; - if ((size_t)a & 2) - { - assumed_half = __ushort_as_half((unsigned short)(old >> 16)); - half new_half = __hmul(assumed_half, b); - unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__half_as_ushort(new_half) << 16); - old = atomicCAS(address_as_uint, assumed, new_val); - } - else - { - assumed_half = __ushort_as_half((unsigned short)(old & 0xFFFF)); - half new_half = __hmul(assumed_half, b); - unsigned int new_val = (old & 0xFFFF0000) | __half_as_ushort(new_half); - old = atomicCAS(address_as_uint, assumed, new_val); - } - } while (assumed != old); + *out = __hmin(*a, *b); } template <> - __device__ void deepx_atomicMul(nv_bfloat16 *a, 
nv_bfloat16 b) + __device__ __forceinline__ void deepx_min(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) { - unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); - unsigned int old = *address_as_uint; - unsigned int assumed; - - do - { - assumed = old; - nv_bfloat16 assumed_bf16; - if ((size_t)a & 2) - { - assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old >> 16)); - nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b); - unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__bfloat16_as_ushort(new_bf16) << 16); - old = atomicCAS(address_as_uint, assumed, new_val); - } - else - { - assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old & 0xFFFF)); - nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b); - unsigned int new_val = (old & 0xFFFF0000) | __bfloat16_as_ushort(new_bf16); - old = atomicCAS(address_as_uint, assumed, new_val); - } - } while (assumed != old); + *out = __hmin(*a, *b); } template <> - __device__ void deepx_atomicMul(int64_t *a, int64_t b) + __device__ __forceinline__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) { - int64_t old = *a; - int64_t assumed; - do - { - assumed = old; - old = atomicCAS((unsigned long long *)a, - (unsigned long long)assumed, - (unsigned long long)(assumed * b)); - } while (assumed != old); + *out = *a < *b ? *a : *b; } template <> - __device__ void deepx_atomicMul(int32_t *a, int32_t b) + __device__ __forceinline__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) { - int32_t old = *a; - int32_t assumed; - do - { - assumed = old; - old = atomicCAS((int32_t *)a, assumed, assumed * b); - } while (assumed != old); + *out = *a < *b ? *a : *b; } template <> - __device__ void deepx_atomicMul(int16_t *a, int16_t b) + __device__ __forceinline__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) { - unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); - unsigned int old = *address_as_uint; - unsigned int assumed; - - do - { - assumed = old; - unsigned int new_val; - if ((size_t)a & 2) - { - int16_t assumed_short = (int16_t)(old >> 16); - new_val = (old & 0x0000FFFF) | (((unsigned short)(assumed_short * b)) << 16); - } - else - { - int16_t assumed_short = (int16_t)(old & 0xFFFF); - new_val = (old & 0xFFFF0000) | ((unsigned short)(assumed_short * b)); - } - old = atomicCAS(address_as_uint, assumed, new_val); - } while (assumed != old); + *out = *a < *b ? *a : *b; } template <> - __device__ void deepx_atomicMul(int8_t *a, int8_t b) + __device__ __forceinline__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) { - unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 3)); - unsigned int old = *address_as_uint; - unsigned int assumed; - unsigned int byte_offset = ((size_t)a & 3) * 8; - unsigned int mask = 0xFF << byte_offset; - - do - { - assumed = old; - int8_t byte_val = (old >> byte_offset) & 0xFF; - byte_val *= b; - unsigned int new_val = (old & ~mask) | ((byte_val & 0xFF) << byte_offset); - old = atomicCAS(address_as_uint, assumed, new_val); - } while (assumed != old); + *out = *a < *b ? 
*a : *b; } + + } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index 3f54e08e..abe6f223 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -1,284 +1,322 @@ #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU +#include +#include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc { - template - __global__ void add_kernel(const T* A, const T* B, T* C,const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + // add + template + __global__ void add_kernel(const T *A, const T *B, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] + B[idx]; } } - template __global__ void add_kernel(const double* A, const double* B, double* C,const int size); - template __global__ void add_kernel(const float* A, const float* B, float* C,const int size); - template __global__ void add_kernel(const half* A, const half* B, half* C,const int size); - template __global__ void add_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); - template __global__ void add_kernel(const int64_t* A, const int64_t* B, int64_t* C,const int size); - template __global__ void add_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); - template __global__ void add_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); - template __global__ void add_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); - + template - void launch_add(int numBlocks, int blockSize,const T* a, const T* b, T* c,const int size) + void launch_add(const T *a, const T *b, T *c, const int size) { - // 启动kernel - add_kernel<<>>(a, b, c, size); - // 检查kernel执行是否成功 - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + // 启动kernel + auto [numBlocks, blockSize] = BestDims(size); + add_kernel<<>>(a, b, c, size); + // 检查kernel执行是否成功 + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } } - template void launch_add(int numBlocks, int blockSize,const double* a, const double* b, double* c,const int size); - template void launch_add(int numBlocks, int blockSize,const float* a, const float* b, float* c,const int size); - template void launch_add(int numBlocks, int blockSize,const half* a, const half* b, half* c,const int size); - template void launch_add(int numBlocks, int blockSize,const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); - template void launch_add(int numBlocks, int blockSize,const int64_t* a, const int64_t* b, int64_t* c,const int size); - template void launch_add(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); - template void launch_add(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); - template void launch_add(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); - + template void launch_add(const double *a, const double *b, double *c, const 
int size); + template void launch_add(const float *a, const float *b, float *c, const int size); + template void launch_add(const half *a, const half *b, half *c, const int size); + template void launch_add(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *c, const int size); + template void launch_add(const int64_t *a, const int64_t *b, int64_t *c, const int size); + template void launch_add(const int32_t *a, const int32_t *b, int32_t *c, const int size); + template void launch_add(const int16_t *a, const int16_t *b, int16_t *c, const int size); + template void launch_add(const int8_t *a, const int8_t *b, int8_t *c, const int size); + // addscalar template - __global__ void addscalar_kernel(const T* A, const T scalar, T* C,const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void addscalar_kernel(const T *A, const T scalar, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] + scalar; } - } - template __global__ void addscalar_kernel(const double* A, const double scalar, double* C,const int size); - template __global__ void addscalar_kernel(const float* A, const float scalar, float* C,const int size); - template __global__ void addscalar_kernel(const half* A, const half scalar, half* C,const int size); - template __global__ void addscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); - template __global__ void addscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); - template __global__ void addscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); - template __global__ void addscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); - template __global__ void addscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); - + } + template - void launch_addscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { + void launch_addscalar(const T *a, const T scalar, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); addscalar_kernel<<>>(a, scalar, c, size); - } - template void launch_addscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size); - template void launch_addscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size); - template void launch_addscalar(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size); - template void launch_addscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size); - template void launch_addscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size); - template void launch_addscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size); - template void launch_addscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size); - template void launch_addscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size); - + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch addscalar kernel: " + + 
std::string(cudaGetErrorString(err))); + } + } + template void launch_addscalar(const double *a, const double scalar, double *c, const int size); + template void launch_addscalar(const float *a, const float scalar, float *c, const int size); + template void launch_addscalar(const half *a, const half scalar, half *c, const int size); + template void launch_addscalar(const nv_bfloat16 *a, const nv_bfloat16 scalar, nv_bfloat16 *c, const int size); + template void launch_addscalar(const int64_t *a, const int64_t scalar, int64_t *c, const int size); + template void launch_addscalar(const int32_t *a, const int32_t scalar, int32_t *c, const int size); + template void launch_addscalar(const int16_t *a, const int16_t scalar, int16_t *c, const int size); + template void launch_addscalar(const int8_t *a, const int8_t scalar, int8_t *c, const int size); + // sub template - __global__ void sub_kernel(const T* A, const T* B, T* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void sub_kernel(const T *A, const T *B, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] - B[idx]; - } + } } - template __global__ void sub_kernel(const double* A, const double* B, double* C, const int size); - template __global__ void sub_kernel(const float* A, const float* B, float* C, const int size); - template __global__ void sub_kernel(const half* A, const half* B, half* C, const int size); - template __global__ void sub_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); - template __global__ void sub_kernel(const int64_t* A, const int64_t* B, int64_t* C, const int size); - template __global__ void sub_kernel(const int32_t* A, const int32_t* B, int32_t* C, const int size); - template __global__ void sub_kernel(const int16_t* A, const int16_t* B, int16_t* C, const int size); - template __global__ void sub_kernel(const int8_t* A, const int8_t* B, int8_t* C, const int size); template - void launch_sub(const int numBlocks, const int blockSize, const T* a, const T* b, T* c, const int size) { + void launch_sub(const T *a, const T *b, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); sub_kernel<<>>(a, b, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch sub kernel: " + + std::string(cudaGetErrorString(err))); + } } - template void launch_sub(const int numBlocks, const int blockSize, const double* a, const double* b, double* c, const int size); - template void launch_sub(const int numBlocks, const int blockSize, const float* a, const float* b, float* c, const int size); - template void launch_sub(const int numBlocks, const int blockSize, const half* a, const half* b, half* c, const int size); - template void launch_sub(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, const int size); - template void launch_sub(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c, const int size); - template void launch_sub(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c, const int size); - template void launch_sub(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size); - template void launch_sub(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int 
size); - + template void launch_sub(const double *a, const double *b, double *c, const int size); + template void launch_sub(const float *a, const float *b, float *c, const int size); + template void launch_sub(const half *a, const half *b, half *c, const int size); + template void launch_sub(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *c, const int size); + template void launch_sub(const int64_t *a, const int64_t *b, int64_t *c, const int size); + template void launch_sub(const int32_t *a, const int32_t *b, int32_t *c, const int size); + template void launch_sub(const int16_t *a, const int16_t *b, int16_t *c, const int size); + template void launch_sub(const int8_t *a, const int8_t *b, int8_t *c, const int size); + + // subscalar template - __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void subscalar_kernel(const T *A, const T scalar, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] - scalar; } - } - template __global__ void subscalar_kernel(const double* A, const double scalar, double* C,const int size); - template __global__ void subscalar_kernel(const float* A, const float scalar, float* C,const int size); - template __global__ void subscalar_kernel(const half* A, const half scalar, half* C,const int size); - template __global__ void subscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); - template __global__ void subscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); - template __global__ void subscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); - template __global__ void subscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); - template __global__ void subscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); + } template - void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { + void launch_subscalar(const T *a, const T scalar, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); subscalar_kernel<<>>(a, scalar, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch subscalar kernel: " + + std::string(cudaGetErrorString(err))); + } } - template void launch_subscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size); - template void launch_subscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size); - template void launch_subscalar(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size); - template void launch_subscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size); - template void launch_subscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size); - template void launch_subscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size); - template void launch_subscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size); - template void launch_subscalar(const 
int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size); - - template - __global__ void mul_kernel(const T* A, const T* B, T* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + template void launch_subscalar(const double *a, const double scalar, double *c, const int size); + template void launch_subscalar(const float *a, const float scalar, float *c, const int size); + template void launch_subscalar(const half *a, const half scalar, half *c, const int size); + template void launch_subscalar(const nv_bfloat16 *a, const nv_bfloat16 scalar, nv_bfloat16 *c, const int size); + template void launch_subscalar(const int64_t *a, const int64_t scalar, int64_t *c, const int size); + template void launch_subscalar(const int32_t *a, const int32_t scalar, int32_t *c, const int size); + template void launch_subscalar(const int16_t *a, const int16_t scalar, int16_t *c, const int size); + template void launch_subscalar(const int8_t *a, const int8_t scalar, int8_t *c, const int size); + + // mul + template + __global__ void mul_kernel(const T *A, const T *B, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] * B[idx]; } - } - template __global__ void mul_kernel(const double* A, const double* B, double* C,const int size); - template __global__ void mul_kernel(const float* A, const float* B, float* C,const int size); - template __global__ void mul_kernel(const half* A, const half* B, half* C,const int size); - template __global__ void mul_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); - template __global__ void mul_kernel(const int64_t* A, const int64_t* B, int64_t* C,const int size); - template __global__ void mul_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); - template __global__ void mul_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); - template __global__ void mul_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); + } template - void launch_mul(const int numBlocks, const int blockSize, const T* a, const T* b, T* c, const int size) { + void launch_mul(const T *a, const T *b, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); mul_kernel<<>>(a, b, c, size); - } - template void launch_mul(const int numBlocks, const int blockSize, const double* a, const double* b, double* c, const int size); - template void launch_mul(const int numBlocks, const int blockSize, const float* a, const float* b, float* c, const int size); - template void launch_mul(const int numBlocks, const int blockSize, const half* a, const half* b, half* c, const int size); - template void launch_mul(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, const int size); - template void launch_mul(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c, const int size); - template void launch_mul(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c, const int size); - template void launch_mul(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size); - template void launch_mul(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size); - + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed 
to launch mul kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_mul(const double *a, const double *b, double *c, const int size); + template void launch_mul(const float *a, const float *b, float *c, const int size); + template void launch_mul(const half *a, const half *b, half *c, const int size); + template void launch_mul(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *c, const int size); + template void launch_mul(const int64_t *a, const int64_t *b, int64_t *c, const int size); + template void launch_mul(const int32_t *a, const int32_t *b, int32_t *c, const int size); + template void launch_mul(const int16_t *a, const int16_t *b, int16_t *c, const int size); + template void launch_mul(const int8_t *a, const int8_t *b, int8_t *c, const int size); + + // mulscalar template - __global__ void mulscalar_kernel(const T* A, const T scalar, T* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void mulscalar_kernel(const T *A, const T scalar, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] * scalar; } - } - template __global__ void mulscalar_kernel(const double* A, const double scalar, double* C,const int size); - template __global__ void mulscalar_kernel(const float* A, const float scalar, float* C,const int size); - template __global__ void mulscalar_kernel(const half* A, const half scalar, half* C,const int size); - template __global__ void mulscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); - template __global__ void mulscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); - template __global__ void mulscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); - template __global__ void mulscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); - template __global__ void mulscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); - + } + template - void launch_mulscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { + void launch_mulscalar(const T *a, const T scalar, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); mulscalar_kernel<<>>(a, scalar, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch mulscalar kernel: " + + std::string(cudaGetErrorString(err))); + } } - template void launch_mulscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size); - template void launch_mulscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size); - template void launch_mulscalar(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size); - template void launch_mulscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size); - template void launch_mulscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size); - template void launch_mulscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size); - template void launch_mulscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, 
int16_t* c, const int size); - template void launch_mulscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size); - + template void launch_mulscalar(const double *a, const double scalar, double *c, const int size); + template void launch_mulscalar(const float *a, const float scalar, float *c, const int size); + template void launch_mulscalar(const half *a, const half scalar, half *c, const int size); + template void launch_mulscalar(const nv_bfloat16 *a, const nv_bfloat16 scalar, nv_bfloat16 *c, const int size); + template void launch_mulscalar(const int64_t *a, const int64_t scalar, int64_t *c, const int size); + template void launch_mulscalar(const int32_t *a, const int32_t scalar, int32_t *c, const int size); + template void launch_mulscalar(const int16_t *a, const int16_t scalar, int16_t *c, const int size); + template void launch_mulscalar(const int8_t *a, const int8_t scalar, int8_t *c, const int size); + + // div template - __global__ void div_kernel(const T* A, const T* B, T* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void div_kernel(const T *A, const T *B, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] / B[idx]; } - } - template __global__ void div_kernel(const double* A, const double* B, double* C,const int size); - template __global__ void div_kernel(const float* A, const float* B, float* C,const int size); - template __global__ void div_kernel(const half* A, const half* B, half* C,const int size); - template __global__ void div_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); - template __global__ void div_kernel(const int64_t* A, const int64_t* B, int64_t* C,const int size); - template __global__ void div_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); - template __global__ void div_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); - template __global__ void div_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); - + } + template - void launch_div(const int numBlocks, const int blockSize, const T* a, const T* b, T* c, const int size) { + void launch_div(const T *a, const T *b, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); div_kernel<<>>(a, b, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch div kernel: " + + std::string(cudaGetErrorString(err))); + } } - template void launch_div(const int numBlocks, const int blockSize, const double* a, const double* b, double* c, const int size); - template void launch_div(const int numBlocks, const int blockSize, const float* a, const float* b, float* c, const int size); - template void launch_div(const int numBlocks, const int blockSize, const half* a, const half* b, half* c, const int size); - template void launch_div(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, const int size); - template void launch_div(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c, const int size); - template void launch_div(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c, const int size); - template void launch_div(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size); - 
template void launch_div(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size); - + template void launch_div(const double *a, const double *b, double *c, const int size); + template void launch_div(const float *a, const float *b, float *c, const int size); + template void launch_div(const half *a, const half *b, half *c, const int size); + template void launch_div(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *c, const int size); + template void launch_div(const int64_t *a, const int64_t *b, int64_t *c, const int size); + template void launch_div(const int32_t *a, const int32_t *b, int32_t *c, const int size); + template void launch_div(const int16_t *a, const int16_t *b, int16_t *c, const int size); + template void launch_div(const int8_t *a, const int8_t *b, int8_t *c, const int size); + + // divscalar template - __global__ void divscalar_kernel(const T* A, const T scalar, T* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void divscalar_kernel(const T *A, const T scalar, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = A[idx] / scalar; } - } - template __global__ void divscalar_kernel(const double* A, const double scalar, double* C,const int size); - template __global__ void divscalar_kernel(const float* A, const float scalar, float* C,const int size); - template __global__ void divscalar_kernel(const half* A, const half scalar, half* C,const int size); - template __global__ void divscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); - template __global__ void divscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); - template __global__ void divscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); - template __global__ void divscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); - template __global__ void divscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); - + } + template - void launch_divscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { + void launch_divscalar(const T *a, const T scalar, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); divscalar_kernel<<>>(a, scalar, c, size); - } - template void launch_divscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size); - template void launch_divscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size); - template void launch_divscalar(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size); - template void launch_divscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size); - template void launch_divscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size); - template void launch_divscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size); - template void launch_divscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size); - template void launch_divscalar(const int numBlocks, const int blockSize, const int8_t* a, 
const int8_t scalar, int8_t* c, const int size); - + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch divscalar kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_divscalar(const double *a, const double scalar, double *c, const int size); + template void launch_divscalar(const float *a, const float scalar, float *c, const int size); + template void launch_divscalar(const half *a, const half scalar, half *c, const int size); + template void launch_divscalar(const nv_bfloat16 *a, const nv_bfloat16 scalar, nv_bfloat16 *c, const int size); + template void launch_divscalar(const int64_t *a, const int64_t scalar, int64_t *c, const int size); + template void launch_divscalar(const int32_t *a, const int32_t scalar, int32_t *c, const int size); + template void launch_divscalar(const int16_t *a, const int16_t scalar, int16_t *c, const int size); + template void launch_divscalar(const int8_t *a, const int8_t scalar, int8_t *c, const int size); + + // rdivscalar template - __global__ void rdivscalar_kernel(const T scalar, const T* A, T* C,const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void rdivscalar_kernel(const T scalar, const T *A, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { C[idx] = scalar / A[idx]; } - } - template __global__ void rdivscalar_kernel(const double scalar, const double* A, double* C,const int size); - template __global__ void rdivscalar_kernel(const float scalar, const float* A, float* C,const int size); - template __global__ void rdivscalar_kernel(const half scalar, const half* A, half* C,const int size); - template __global__ void rdivscalar_kernel(const nv_bfloat16 scalar, const nv_bfloat16* A, nv_bfloat16* C,const int size); - template __global__ void rdivscalar_kernel(const int64_t scalar, const int64_t* A, int64_t* C,const int size); - template __global__ void rdivscalar_kernel(const int32_t scalar, const int32_t* A, int32_t* C,const int size); - template __global__ void rdivscalar_kernel(const int16_t scalar, const int16_t* A, int16_t* C,const int size); - template __global__ void rdivscalar_kernel(const int8_t scalar, const int8_t* A, int8_t* C,const int size); - + } + template - void launch_rdivscalar(const int numBlocks, const int blockSize, const T scalar, const T* a, T* c, const int size) { + void launch_rdivscalar(const T scalar, const T *a, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); rdivscalar_kernel<<>>(scalar, a, c, size); - } - template void launch_rdivscalar(const int numBlocks, const int blockSize, const double scalar, const double* a, double* c, const int size); - template void launch_rdivscalar(const int numBlocks, const int blockSize, const float scalar, const float* a, float* c, const int size); - template void launch_rdivscalar(const int numBlocks, const int blockSize, const half scalar, const half* a, half* c, const int size); - template void launch_rdivscalar(const int numBlocks, const int blockSize, const nv_bfloat16 scalar, const nv_bfloat16* a, nv_bfloat16* c, const int size); - template void launch_rdivscalar(const int numBlocks, const int blockSize, const int64_t scalar, const int64_t* a, int64_t* c, const int size); - template void launch_rdivscalar(const int numBlocks, const int blockSize, const int32_t scalar, const int32_t* a, int32_t* c, const int size); - template void 
launch_rdivscalar(const int numBlocks, const int blockSize, const int16_t scalar, const int16_t* a, int16_t* c, const int size); - template void launch_rdivscalar(const int numBlocks, const int blockSize, const int8_t scalar, const int8_t* a, int8_t* c, const int size); - - + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch rdivscalar kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_rdivscalar(const double scalar, const double *a, double *c, const int size); + template void launch_rdivscalar(const float scalar, const float *a, float *c, const int size); + template void launch_rdivscalar(const half scalar, const half *a, half *c, const int size); + template void launch_rdivscalar(const nv_bfloat16 scalar, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + template void launch_rdivscalar(const int64_t scalar, const int64_t *a, int64_t *c, const int size); + template void launch_rdivscalar(const int32_t scalar, const int32_t *a, int32_t *c, const int size); + template void launch_rdivscalar(const int16_t scalar, const int16_t *a, int16_t *c, const int size); + template void launch_rdivscalar(const int8_t scalar, const int8_t *a, int8_t *c, const int size); + + // invert + template + __global__ void invert_kernel(const T *A, T *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { + C[idx] = ~A[idx]; + } + } + + template + void launch_invert(const T *a, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); + invert_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch invert kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_invert(const int64_t *a, int64_t *c, const int size); + template void launch_invert(const int32_t *a, int32_t *c, const int size); + template void launch_invert(const int16_t *a, int16_t *c, const int size); + template void launch_invert(const int8_t *a, int8_t *c, const int size); + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh index 0f4da083..604421c4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh @@ -1,8 +1,7 @@ #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH -#include -#include + #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -12,282 +11,74 @@ namespace deepx::tensorfunc __global__ void add_kernel(const T* A, const T* B, T* C,const int size); template - void launch_add(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size); - - template <> - void launch_add(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); - - template <> - void launch_add(int numBlocks, int blockSize, const float* a, const float* b, float* c,const int size); - - template <> - void launch_add(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); - - template <> - void launch_add<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); - - template <> - void 
launch_add(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); - - template <> - void launch_add(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); - - template <> - void launch_add(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); - - template <> - void launch_add(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + void launch_add(const T* a, const T* b, T* c,const int size); + // addscalar template __global__ void addscalar_kernel(const T* A, const T scalar, T* C,const int size); template - void launch_addscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size); - - template <> - void launch_addscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int size); - - template <> - void launch_addscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); - - template <> - void launch_addscalar(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); - - template <> - void launch_addscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); - - template <> - void launch_addscalar(int numBlocks, int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); - - template <> - void launch_addscalar(int numBlocks, int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); - - template <> - void launch_addscalar(int numBlocks, int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); - - template <> - void launch_addscalar(int numBlocks, int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); - + void launch_addscalar(const T* a, const T scalar, T* c,const int size); + // sub template __global__ void sub_kernel(const T* A, const T* B, T* C,const int size); template - void launch_sub(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size); - - template <> - void launch_sub(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); - - template <> - void launch_sub(int numBlocks, int blockSize, const float* a, const float* b, float* c,const int size); - - template <> - void launch_sub(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); - - template <> - void launch_sub<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); - - template <> - void launch_sub(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); - - template <> - void launch_sub(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); - - template <> - void launch_sub(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); - - template <> - void launch_sub(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); - + void launch_sub(const T* a, const T* b, T* c,const int size); + // subscalar template __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size); template - void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size); - - template <> - void launch_subscalar(const int numBlocks, const int 
blockSize, const double* a, const double scalar, double* c,const int size); - - template <> - void launch_subscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size); - - template <> - void launch_subscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); - - template <> - void launch_subscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size); - - template <> - void launch_subscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); - - template <> - void launch_subscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); - - template <> - void launch_subscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); - - template <> - void launch_subscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); - + void launch_subscalar(const T* a, const T scalar, T* c,const int size); + // mul template __global__ void mul_kernel(const T* A, const T* B, T* C,const int size); template - void launch_mul(const int numBlocks, const int blockSize, const T* a, const T* b, T* c,const int size); - - template <> - void launch_mul(const int numBlocks, const int blockSize, const double* a, const double* b, double* c,const int size); - - template <> - void launch_mul(const int numBlocks, const int blockSize, const float* a, const float* b, float* c,const int size); - - template <> - void launch_mul(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); - - template <> - void launch_mul<__half>(const int numBlocks, const int blockSize, const __half* a, const __half* b, __half* c,const int size); - - template <> - void launch_mul(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); - - template <> - void launch_mul(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); - - template <> - void launch_mul(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); - - template <> - void launch_mul(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); - + void launch_mul(const T* a, const T* b, T* c,const int size); + // mulscalar template __global__ void mulscalar_kernel(const T* A, const T scalar, T* C,const int size); template - void launch_mulscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size); - - template <> - void launch_mulscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c,const int size); - - template <> - void launch_mulscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size); - - template <> - void launch_mulscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); - - template <> - void launch_mulscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size); - - template <> - void launch_mulscalar(const int numBlocks, const int blockSize, const int64_t* a, const 
int64_t scalar, int64_t* c,const int size); - - template <> - void launch_mulscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); - - template <> - void launch_mulscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); - - template <> - void launch_mulscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); + void launch_mulscalar(const T* a, const T scalar, T* c,const int size); + // div template __global__ void div_kernel(const T* A, const T* B, T* C,const int size); template - void launch_div(const int numBlocks, const int blockSize, const T* a, const T* b, T* c,const int size); - - template <> - void launch_div(const int numBlocks, const int blockSize, const double* a, const double* b, double* c,const int size); - - template <> - void launch_div(const int numBlocks, const int blockSize, const float* a, const float* b, float* c,const int size); - - template <> - void launch_div(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); - - template <> - void launch_div<__half>(const int numBlocks, const int blockSize, const __half* a, const __half* b, __half* c,const int size); - - template <> - void launch_div(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); - - template <> - void launch_div(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); - - template <> - void launch_div(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); - - template <> - void launch_div(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + void launch_div(const T* a, const T* b, T* c,const int size); + // divscalar template __global__ void divscalar_kernel(const T* A, const T scalar, T* C,const int size); template - void launch_divscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size); - - template <> - void launch_divscalar(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c,const int size); - - template <> - void launch_divscalar(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size); - - template <> - void launch_divscalar(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); - - template <> - void launch_divscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size); - - template <> - void launch_divscalar(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); - - template <> - void launch_divscalar(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); - - template <> - void launch_divscalar(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); - - template <> - void launch_divscalar(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); - + void launch_divscalar(const T* a, const T scalar, T* c,const int size); + // rdivscalar template __global__ void rdivscalar_kernel(const T scalar, const 
T* A, T* C,const int size); template - void launch_rdivscalar(const int numBlocks, const int blockSize, const T scalar, const T* a, T* c,const int size); - - template <> - void launch_rdivscalar(const int numBlocks, const int blockSize, const double scalar, const double* a, double* c,const int size); - - template <> - void launch_rdivscalar(const int numBlocks, const int blockSize, const float scalar, const float* a, float* c,const int size); - - template <> - void launch_rdivscalar(const int numBlocks, const int blockSize, const nv_bfloat16 scalar, const nv_bfloat16* a, nv_bfloat16* c,const int size); - - template <> - void launch_rdivscalar<__half>(const int numBlocks, const int blockSize, const __half scalar, const __half* a, __half* c,const int size); - - template <> - void launch_rdivscalar(const int numBlocks, const int blockSize, const int64_t scalar, const int64_t* a, int64_t* c,const int size); - - template <> - void launch_rdivscalar(const int numBlocks, const int blockSize, const int32_t scalar, const int32_t* a, int32_t* c,const int size); - - template <> - void launch_rdivscalar(const int numBlocks, const int blockSize, const int16_t scalar, const int16_t* a, int16_t* c,const int size); + void launch_rdivscalar(const T scalar, const T* a, T* c,const int size); + + // invert + template + __global__ void invert_kernel(const T* A, T* C,const int size); - template <> - void launch_rdivscalar(const int numBlocks, const int blockSize, const int8_t scalar, const int8_t* a, int8_t* c,const int size); - - + template + void launch_invert(const T* a, T* c,const int size); } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp index e263b65b..82cb4cbf 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp @@ -21,9 +21,7 @@ namespace deepx::tensorfunc if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { throw TensorShapeError("add"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_add(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + launch_add(A.data, B.data, C.data, A.shape.size); } }; @@ -36,9 +34,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("addscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_addscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + launch_addscalar(A.data, scalar, C.data, A.shape.size); } }; @@ -50,9 +46,7 @@ namespace deepx::tensorfunc if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { throw TensorShapeError("sub"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_sub(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + launch_sub(A.data, B.data, C.data, A.shape.size); } }; @@ -64,9 +58,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("subscalar"); } - const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_subscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + launch_subscalar(A.data, scalar, C.data, A.shape.size); } }; @@ -78,9 +70,7 @@ namespace deepx::tensorfunc if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { throw TensorShapeError("mul"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_mul(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + launch_mul(A.data, B.data, C.data, A.shape.size); } }; @@ -92,9 +82,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("mulscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_mulscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + launch_mulscalar(A.data, scalar, C.data, A.shape.size); } }; @@ -106,9 +94,7 @@ namespace deepx::tensorfunc if (A.shape.size != B.shape.size || A.shape.size != C.shape.size) { throw TensorShapeError("div"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_div(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + launch_div(A.data, B.data, C.data, A.shape.size); } }; @@ -120,9 +106,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("divscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_divscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + launch_divscalar(A.data, scalar, C.data, A.shape.size); } }; @@ -134,12 +118,21 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("rdivscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_rdivscalar(numBlocks, blockSize, scalar, A.data, C.data, A.shape.size); + launch_rdivscalar(scalar, A.data, C.data, A.shape.size); + } + }; + + template + struct invertDispatcher + { + static void invert(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("invert"); + } + launch_invert( A.data, C.data, A.shape.size); } }; - } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu index cb117037..f5e93fc5 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu @@ -3,249 +3,420 @@ #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" - +#include "deepx/tensorfunc/vector_cuda.cuh" namespace deepx::tensorfunc { template - __global__ void max_kernel(const T* A, const T* B, T* C, const int size){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { + __global__ void max_kernel(const T *A, const T *B, T *C, const int size) + { + int stride = blockDim.x * gridDim.x; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) + { C[idx] = A[idx] > B[idx] ? 
A[idx] : B[idx];
         }
     }
-    template __global__ void max_kernel(const double* A, const double* B, double* C, const int size);
-    template __global__ void max_kernel(const float* A, const float* B, float* C, const int size);
-    template __global__ void max_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size);
-    template __global__ void max_kernel<__half>(const __half* A, const __half* B, __half* C, const int size);
-    template __global__ void max_kernel(const int64_t* A, const int64_t* B, int64_t* C, const int size);
-    template __global__ void max_kernel(const int32_t* A, const int32_t* B, int32_t* C, const int size);
-    template __global__ void max_kernel(const int16_t* A, const int16_t* B, int16_t* C, const int size);
-    template __global__ void max_kernel(const int8_t* A, const int8_t* B, int8_t* C, const int size);
-
     template <typename T>
-    void launch_max(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size)
+    void launch_max(const T *A, const T *B, T *C, const int size)
     {
+        auto [numBlocks, blockSize] = BestDims(size);
         max_kernel<<<numBlocks, blockSize>>>(A, B, C, size);
-        cudaError_t err = cudaGetLastError();
-        if (err != cudaSuccess) {
-            throw std::runtime_error("Failed to launch add kernel: " +
-                                     std::string(cudaGetErrorString(err)));
-        }
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch max kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
     }
-    template void launch_max(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size);
-    template void launch_max(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size);
-    template void launch_max(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size);
-    template void launch_max<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size);
-    template void launch_max(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size);
-    template void launch_max(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size);
-    template void launch_max(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int16_t* C, const int size);
-    template void launch_max(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size);
+    template void launch_max(const double *A, const double *B, double *C, const int size);
+    template void launch_max(const float *A, const float *B, float *C, const int size);
+    template void launch_max(const nv_bfloat16 *A, const nv_bfloat16 *B, nv_bfloat16 *C, const int size);
+    template void launch_max<__half>(const __half *A, const __half *B, __half *C, const int size);
+    template void launch_max(const int64_t *A, const int64_t *B, int64_t *C, const int size);
+    template void launch_max(const int32_t *A, const int32_t *B, int32_t *C, const int size);
+    template void launch_max(const int16_t *A, const int16_t *B, int16_t *C, const int size);
+    template void launch_max(const int8_t *A, const int8_t *B, int8_t *C, const int size);
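All of the rewritten launchers call `BestDims(size)` from `deepx/tensorfunc/cuda.hpp` to choose the launch configuration instead of taking `numBlocks`/`blockSize` from the caller. That helper is not shown in this patch; the following is only a minimal sketch of the assumed behaviour (a capped grid paired with the grid-stride loops in the kernels). The block size of 256 and the grid cap of 1024 are illustrative assumptions, not the library's actual values:

```cpp
// Hypothetical stand-in for BestDims() declared in deepx/tensorfunc/cuda.hpp.
// Returns {numBlocks, blockSize}; the grid is capped because the kernels'
// grid-stride loops cover any elements beyond numBlocks * blockSize.
#include <utility>

inline std::pair<int, int> BestDims(int size)
{
    const int blockSize = 256;                          // assumed threads per block
    int numBlocks = (size + blockSize - 1) / blockSize; // enough blocks to cover size
    if (numBlocks > 1024)                               // assumed grid cap
        numBlocks = 1024;
    if (numBlocks < 1)
        numBlocks = 1;
    return {numBlocks, blockSize};
}
```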
     template <typename T>
-    __global__ void maxscalar_kernel(const T* A, const T scalar, T* C, const int size)
+    __global__ void maxscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
-        int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if (idx < size) {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
             C[idx] = A[idx] > scalar ? A[idx] : scalar;
         }
     }
-    template __global__ void maxscalar_kernel(const double* A, const double scalar, double* C, const int size);
-    template __global__ void maxscalar_kernel(const float* A, const float scalar, float* C, const int size);
-    template __global__ void maxscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size);
-    template __global__ void maxscalar_kernel<__half>(const __half* A, const __half scalar, __half* C, const int size);
-    template __global__ void maxscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C, const int size);
-    template __global__ void maxscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C, const int size);
-    template __global__ void maxscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C, const int size);
-    template __global__ void maxscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C, const int size);
-
-    template <typename T>
-    void launch_maxscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size)
+    template <typename T>
+    void launch_maxscalar(const T *A, const T scalar, T *C, const int size)
     {
+        auto [numBlocks, blockSize] = BestDims(size);
         maxscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, C, size);
         cudaError_t err = cudaGetLastError();
-        if (err != cudaSuccess) {
-            throw std::runtime_error("Failed to launch add kernel: " +
-                                     std::string(cudaGetErrorString(err)));
-        }
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch maxscalar kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
     }
-    template void launch_maxscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size);
-    template void launch_maxscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size);
-    template void launch_maxscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size);
-    template void launch_maxscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size);
-    template void launch_maxscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size);
-    template void launch_maxscalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size);
-    template void launch_maxscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size);
-    template void launch_maxscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size);
+    template void launch_maxscalar(const double *A, const double scalar, double *C, const int size);
+    template void launch_maxscalar(const float *A, const float scalar, float *C, const int size);
+    template void launch_maxscalar(const nv_bfloat16 *A, const nv_bfloat16 scalar, nv_bfloat16 *C, const int size);
+    template void launch_maxscalar<__half>(const __half *A, const __half scalar, __half *C, const int size);
+    template void launch_maxscalar(const int64_t *A, const int64_t scalar, int64_t *C, const int size);
+    template void launch_maxscalar(const int32_t *A, const int32_t scalar, int32_t *C, const int size);
+    template void launch_maxscalar(const int16_t *A, const int16_t scalar, int16_t *C, const int size);
+    template void launch_maxscalar(const int8_t *A, const int8_t scalar, int8_t *C, const int size);
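With the new signatures a host-side caller only passes device pointers and the element count. As a usage sketch that is not part of the patch, clamping a float tensor at zero (a ReLU) can be expressed with `launch_maxscalar`; the buffer names and the synchronization call are illustrative assumptions:

```cpp
// Illustrative host-side call; d_x and d_y are assumed to be float device
// buffers of length n that were allocated and filled elsewhere.
#include <cuda_runtime.h>

void relu_example(const float *d_x, float *d_y, int n)
{
    // Elementwise d_y[i] = max(d_x[i], 0.0f); the launcher picks its own
    // grid/block dimensions via BestDims() and throws on launch errors.
    launch_maxscalar<float>(d_x, 0.0f, d_y, n);
    cudaDeviceSynchronize(); // kernel launches are asynchronous
}
```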
     template <typename T>
-    __global__ void min_kernel(const T* A, const T* B, T* C, const int size)
+    __global__ void min_kernel(const T *A, const T *B, T *C, const int size)
     {
-        int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if (idx < size) {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
             C[idx] = A[idx] < B[idx] ? A[idx] : B[idx];
-        }
+        }
     }
-    template __global__ void min_kernel(const double* A, const double* B, double* C, const int size);
-    template __global__ void min_kernel(const float* A, const float* B, float* C, const int size);
-    template __global__ void min_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size);
-    template __global__ void min_kernel<__half>(const __half* A, const __half* B, __half* C, const int size);
-    template __global__ void min_kernel(const int64_t* A, const int64_t* B, int64_t* C, const int size);
-    template __global__ void min_kernel(const int32_t* A, const int32_t* B, int32_t* C, const int size);
-    template __global__ void min_kernel(const int16_t* A, const int16_t* B, int16_t* C, const int size);
-    template __global__ void min_kernel(const int8_t* A, const int8_t* B, int8_t* C, const int size);
-
     template <typename T>
-    void launch_min(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size)
+    void launch_min(const T *A, const T *B, T *C, const int size)
     {
+        auto [numBlocks, blockSize] = BestDims(size);
         min_kernel<<<numBlocks, blockSize>>>(A, B, C, size);
         cudaError_t err = cudaGetLastError();
-        if (err != cudaSuccess) {
-            throw std::runtime_error("Failed to launch add kernel: " +
-                                     std::string(cudaGetErrorString(err)));
-        }
-    }
-
-    template void launch_min(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size);
-    template void launch_min(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size);
-    template void launch_min(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size);
-    template void launch_min<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size);
-    template void launch_min(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size);
-    template void launch_min(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size);
-    template void launch_min(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int16_t* C, const int size);
-    template void launch_min(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size);
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch min kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
+
+    template void launch_min(const double *A, const double *B, double *C, const int size);
+    template void launch_min(const float *A, const float *B, float *C, const int size);
+    template void launch_min(const nv_bfloat16 *A, const nv_bfloat16 *B, nv_bfloat16 *C, const int size);
+    template void launch_min<__half>(const __half *A, const __half *B, __half *C, const int size);
+    template void launch_min(const int64_t *A, const int64_t *B, int64_t *C, const int size);
+    template void launch_min(const int32_t *A, const int32_t *B, int32_t *C, const int size);
+    template void launch_min(const int16_t *A, const int16_t *B, int16_t *C, const int size);
+    template void launch_min(const int8_t *A, const int8_t *B, int8_t *C, const int size);

     template <typename T>
-    __global__ void minscalar_kernel(const T* A, const T scalar, T* C, const int size)
+    __global__ void minscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
-        int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if (idx < size) {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
             C[idx] = A[idx] < scalar ? A[idx] : scalar;
         }
     }
-    template __global__ void minscalar_kernel(const double* A, const double scalar, double* C, const int size);
-    template __global__ void minscalar_kernel(const float* A, const float scalar, float* C, const int size);
-    template __global__ void minscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size);
-    template __global__ void minscalar_kernel<__half>(const __half* A, const __half scalar, __half* C, const int size);
-    template __global__ void minscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C, const int size);
-    template __global__ void minscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C, const int size);
-    template __global__ void minscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C, const int size);
-    template __global__ void minscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C, const int size);
-
     template <typename T>
-    void launch_minscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size)
+    void launch_minscalar(const T *A, const T scalar, T *C, const int size)
     {
+        auto [numBlocks, blockSize] = BestDims(size);
         minscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, C, size);
         cudaError_t err = cudaGetLastError();
-        if (err != cudaSuccess) {
-            throw std::runtime_error("Failed to launch add kernel: " +
-                                     std::string(cudaGetErrorString(err)));
-        }
-    }
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch minscalar kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }

-    template void launch_minscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size);
-    template void launch_minscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size);
-    template void launch_minscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size);
-    template void launch_minscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size);
-    template void launch_minscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size);
-    template void launch_minscalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size);
-    template void launch_minscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size);
-    template void launch_minscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size);
+    template void launch_minscalar(const double *A, const double scalar, double *C, const int size);
+    template void launch_minscalar(const float *A, const float scalar, float *C, const int size);
+    template void launch_minscalar(const nv_bfloat16 *A, const nv_bfloat16 scalar, nv_bfloat16 *C, const int size);
+    template void launch_minscalar<__half>(const __half *A, const __half scalar, __half *C, const int size);
+    template void launch_minscalar(const int64_t *A, const int64_t scalar, int64_t *C, const int size);
+    template void launch_minscalar(const int32_t *A, const int32_t scalar, int32_t *C, const int size);
+    template void launch_minscalar(const int16_t *A, const int16_t scalar, int16_t *C, const int size);
+    template void launch_minscalar(const int8_t *A, const int8_t scalar, int8_t *C, const int size);
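In the equality launchers that follow, `epsilon == 0` selects exact comparison and a positive `epsilon` selects the `|A - B| < epsilon` tolerance path, writing the result into a mask buffer. A minimal host-side sketch of the intended call pattern, assuming `d_a`, `d_b` and `d_mask` are device buffers allocated elsewhere and that `1e-6f` is just an illustrative tolerance:

```cpp
// Illustrative only; d_a and d_b are float device buffers and d_mask is a
// bool device buffer, all of length n.
#include <cuda_runtime.h>

void equal_mask_example(const float *d_a, const float *d_b, bool *d_mask, int n)
{
    // Tolerance-based comparison: d_mask[i] = (|d_a[i] - d_b[i]| < 1e-6f).
    launch_equal<float, bool>(d_a, d_b, 1e-6f, d_mask, n);

    // Exact comparison: d_mask[i] = (d_a[i] == d_b[i]), overwriting the mask.
    launch_equal<float, bool>(d_a, d_b, 0.0f, d_mask, n);
    cudaDeviceSynchronize(); // kernel launches are asynchronous
}
```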
const int16_t scalar, int16_t *C, const int size); + template void launch_minscalar(const int8_t *A, const int8_t scalar, int8_t *C, const int size); - template - __global__ void compare_kernel(const T* A, const T* B, float* mask, const int size) + // equal + template + __global__ void equalwithepsilon_kernel(const T *A, const T *B, const float epsilon, MaskT *mask, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) { - if (A[idx] == B[idx]) { - mask[idx] = 0.5; - } else if (A[idx] > B[idx]) { + int stride = blockDim.x * gridDim.x; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) + { + float diff = fabsf(static_cast(A[idx]) - static_cast(B[idx])); + if (diff < epsilon) + { mask[idx] = 1; - } else { + } + else + { mask[idx] = 0; } } } - template __global__ void compare_kernel(const double* A, const double* B, float* mask, const int size); - template __global__ void compare_kernel(const float* A, const float* B, float* mask, const int size); - template __global__ void compare_kernel(const nv_bfloat16* A, const nv_bfloat16* B, float* mask, const int size); - template __global__ void compare_kernel<__half>(const __half* A, const __half* B, float* mask, const int size); - template __global__ void compare_kernel(const int64_t* A, const int64_t* B, float* mask, const int size); - template __global__ void compare_kernel(const int32_t* A, const int32_t* B, float* mask, const int size); - template __global__ void compare_kernel(const int16_t* A, const int16_t* B, float* mask, const int size); - template __global__ void compare_kernel(const int8_t* A, const int8_t* B, float* mask, const int size); + template + __global__ void equal_kernel(const T *A, const T *B, MaskT *mask, const int size) + { + int stride = blockDim.x * gridDim.x; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) + { + mask[idx] = (A[idx] == B[idx]); + } + } - template - void launch_compare(int numBlocks, int blockSize, const T* A, const T* B, float* mask, const int size) + template + void launch_equal(const T *A, const T *B, const float epsilon, MaskT *mask, const int size) { - compare_kernel<<>>(A, B, mask, size); + auto [numBlocks, blockSize] = BestDims(size); + if (epsilon == 0) + { + equal_kernel<<>>(A, B, mask, size); + } + else + { + equalwithepsilon_kernel<<>>(A, B, epsilon, mask, size); + } cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch add kernel: " + + std::string(cudaGetErrorString(err))); + } } - template void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, float* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, float* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, float* mask, const int size); - template void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, float* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, float* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, float* mask, const int size); - template void launch_compare(int numBlocks, int blockSize, const 
-    template void launch_compare<double>(int numBlocks, int blockSize, const double* A, const double* B, float* mask, const int size);
-    template void launch_compare<float>(int numBlocks, int blockSize, const float* A, const float* B, float* mask, const int size);
-    template void launch_compare<nv_bfloat16>(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, float* mask, const int size);
-    template void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, float* mask, const int size);
-    template void launch_compare<int64_t>(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, float* mask, const int size);
-    template void launch_compare<int32_t>(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, float* mask, const int size);
-    template void launch_compare<int16_t>(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, float* mask, const int size);
-    template void launch_compare<int8_t>(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, float* mask, const int size);
-
-    //comparescalar
-    template <typename T>
-    __global__ void comparescalar_kernel(const T* A, const T scalar, float* mask, const int size)
+    template void launch_equal<double,bool>(const double *A, const double *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<float,bool>(const float *A, const float *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<nv_bfloat16,bool>(const nv_bfloat16 *A, const nv_bfloat16 *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<__half,bool>(const __half *A, const __half *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<int64_t,bool>(const int64_t *A, const int64_t *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<int32_t,bool>(const int32_t *A, const int32_t *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<int16_t,bool>(const int16_t *A, const int16_t *B, const float epsilon, bool *mask, const int size);
+    template void launch_equal<int8_t,bool>(const int8_t *A, const int8_t *B, const float epsilon, bool *mask, const int size);
+
+    // equalscalar
+    template <typename T, typename MaskT>
+    __global__ void equalscalarwithepsilon_kernel(const T *A, const T scalar, const float epsilon, MaskT *mask, const int size)
    {
-        int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if (idx < size) {
-            if (A[idx] == scalar) {
-                mask[idx] = 0.5;
-            } else if (A[idx] > scalar) {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            float diff = fabsf(static_cast<float>(A[idx]) - static_cast<float>(scalar));
+            if (diff < epsilon)
+            {
                mask[idx] = 1;
-            } else {
+            }
+            else
+            {
                mask[idx] = 0;
            }
        }
    }
-    template __global__ void comparescalar_kernel<double>(const double* A, const double scalar, float* mask, const int size);
-    template __global__ void comparescalar_kernel<float>(const float* A, const float scalar, float* mask, const int size);
-    template __global__ void comparescalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, float* mask, const int size);
-    template __global__ void comparescalar_kernel<__half>(const __half* A, const __half scalar, float* mask, const int size);
-    template __global__ void comparescalar_kernel<int64_t>(const int64_t* A, const int64_t scalar, float* mask, const int size);
-    template __global__ void comparescalar_kernel<int32_t>(const int32_t* A, const int32_t scalar, float* mask, const int size);
-    template __global__ void comparescalar_kernel<int16_t>(const int16_t* A, const int16_t scalar, float* mask, const int size);
-    template __global__ void comparescalar_kernel<int8_t>(const int8_t* A, const int8_t scalar, float* mask, const int size);
+    template <typename T, typename MaskT>
+    __global__ void equalscalar_kernel(const T *A, const T scalar, MaskT *mask, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            mask[idx] = (A[idx] == scalar);
+        }
+    }
-    template <typename T>
-    void launch_comparescalar(int numBlocks, int blockSize, const T* A, const T scalar, float* mask, const int size)
+    template <typename T, typename MaskT>
+    void launch_equalscalar(const T *A, const T scalar, const float epsilon, MaskT *mask, const int size)
    {
-        comparescalar_kernel<<<numBlocks, blockSize>>>(A, scalar, mask, size);
+        auto [numBlocks, blockSize] = BestDims(size);
+        if (epsilon == 0)
+        {
+            equalscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, mask, size);
+        }
+        else
+        {
+            equalscalarwithepsilon_kernel<<<numBlocks, blockSize>>>(A, scalar, epsilon, mask, size);
+        }
        cudaError_t err = cudaGetLastError();
-        if (err != cudaSuccess) {
-            throw std::runtime_error("Failed to launch add kernel: " +
-                                     std::string(cudaGetErrorString(err)));
-        }
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch equalscalar kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
+
+    template void launch_equalscalar<double,bool>(const double *A, const double scalar, const float epsilon, bool *mask, const int size);
+    template void launch_equalscalar<float,bool>(const float *A, const float scalar, const float epsilon, bool *mask, const int size);
+    template void launch_equalscalar<nv_bfloat16,bool>(const nv_bfloat16 *A, const nv_bfloat16 scalar, const float epsilon, bool *mask, const int size);
+    template void launch_equalscalar<__half,bool>(const __half *A, const __half scalar, const float epsilon, bool *mask, const int size);
+    template void launch_equalscalar<int64_t,bool>(const int64_t *A, const int64_t scalar, const float epsilon, bool *mask, const int size);
+    template void launch_equalscalar<int32_t,bool>(const int32_t *A, const int32_t scalar, const float epsilon, bool *mask, const int size);
+    template void launch_equalscalar<int16_t,bool>(const int16_t *A, const int16_t scalar, const float epsilon, bool *mask, const int size);
+    template void launch_equalscalar<int8_t,bool>(const int8_t *A, const int8_t scalar, const float epsilon, bool *mask, const int size);
+
+    // less
+    template <typename T, typename MaskT>
+    __global__ void less_kernel(const T *A, const T *B, MaskT *mask, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            mask[idx] = (A[idx] < B[idx]);
+        }
+    }
+
+    template <typename T, typename MaskT>
+    void launch_less(const T *A, const T *B, MaskT *mask, const int size)
+    {
+        auto [numBlocks, blockSize] = BestDims(size);
+        less_kernel<<<numBlocks, blockSize>>>(A, B, mask, size);
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch less kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
+
+    template void launch_less<double,bool>(const double *A, const double *B, bool *mask, const int size);
+    template void launch_less<float,bool>(const float *A, const float *B, bool *mask, const int size);
+    template void launch_less<nv_bfloat16,bool>(const nv_bfloat16 *A, const nv_bfloat16 *B, bool *mask, const int size);
+    template void launch_less<__half,bool>(const __half *A, const __half *B, bool *mask, const int size);
+    template void launch_less<int64_t,bool>(const int64_t *A, const int64_t *B, bool *mask, const int size);
+    template void launch_less<int32_t,bool>(const int32_t *A, const int32_t *B, bool *mask, const int size);
+    template void launch_less<int16_t,bool>(const int16_t *A, const int16_t *B, bool *mask, const int size);
+    template void launch_less<int8_t,bool>(const int8_t *A, const int8_t *B, bool *mask, const int size);
+
+    // lessscalar
+
+    template <typename T, typename MaskT>
+    __global__ void lessscalar_kernel(const T *A, const T scalar, MaskT *mask, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            mask[idx] = (A[idx] < scalar);
+        }
+    }
+
+    template <typename T, typename MaskT>
+    void launch_lessscalar(const T *A, const T scalar, MaskT *mask, const int size)
+    {
+        auto [numBlocks, blockSize] = BestDims(size);
+        lessscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, mask, size);
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch lessscalar kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
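Note: every comparison launcher in this family is instantiated with MaskT = bool, so the mask tensors now hold one bool per element instead of the removed float encoding (0 for less, 0.5 for equal, 1 for greater). A usage sketch under that assumption; the header path, the namespace qualification, and the device buffers d_A / d_mask are illustrative assumptions, not taken from the patch.

    #include "deepx/tensorfunc/elementwise_miaobyte_compare.cuh"

    // d_A and d_mask are device buffers with n elements each.
    void lessscalar_example(const float *d_A, bool *d_mask, int n)
    {
        // mask[i] = (A[i] < 0.0f); launch dimensions are chosen internally via BestDims.
        deepx::tensorfunc::launch_lessscalar<float, bool>(d_A, 0.0f, d_mask, n);
    }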
+
+    template void launch_lessscalar<double,bool>(const double *A, const double scalar, bool *mask, const int size);
+    template void launch_lessscalar<float,bool>(const float *A, const float scalar, bool *mask, const int size);
+    template void launch_lessscalar<nv_bfloat16,bool>(const nv_bfloat16 *A, const nv_bfloat16 scalar, bool *mask, const int size);
+    template void launch_lessscalar<__half,bool>(const __half *A, const __half scalar, bool *mask, const int size);
+    template void launch_lessscalar<int64_t,bool>(const int64_t *A, const int64_t scalar, bool *mask, const int size);
+    template void launch_lessscalar<int32_t,bool>(const int32_t *A, const int32_t scalar, bool *mask, const int size);
+    template void launch_lessscalar<int16_t,bool>(const int16_t *A, const int16_t scalar, bool *mask, const int size);
+    template void launch_lessscalar<int8_t,bool>(const int8_t *A, const int8_t scalar, bool *mask, const int size);
+
+    // greater
+    template <typename T, typename MaskT>
+    __global__ void greater_kernel(const T *A, const T *B, MaskT *mask, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            mask[idx] = (A[idx] > B[idx]);
+        }
+    }
+
+    template <typename T, typename MaskT>
+    void launch_greater(const T *A, const T *B, MaskT *mask, const int size)
+    {
+        auto [numBlocks, blockSize] = BestDims(size);
+        greater_kernel<<<numBlocks, blockSize>>>(A, B, mask, size);
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch greater kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
+
+    template void launch_greater<double,bool>(const double *A, const double *B, bool *mask, const int size);
+    template void launch_greater<float,bool>(const float *A, const float *B, bool *mask, const int size);
+    template void launch_greater<nv_bfloat16,bool>(const nv_bfloat16 *A, const nv_bfloat16 *B, bool *mask, const int size);
+    template void launch_greater<__half,bool>(const __half *A, const __half *B, bool *mask, const int size);
+    template void launch_greater<int64_t,bool>(const int64_t *A, const int64_t *B, bool *mask, const int size);
+    template void launch_greater<int32_t,bool>(const int32_t *A, const int32_t *B, bool *mask, const int size);
+    template void launch_greater<int16_t,bool>(const int16_t *A, const int16_t *B, bool *mask, const int size);
+    template void launch_greater<int8_t,bool>(const int8_t *A, const int8_t *B, bool *mask, const int size);
+
+    // greaterscalar
+    template <typename T, typename MaskT>
+    __global__ void greaterscalar_kernel(const T *A, const T scalar, MaskT *mask, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            mask[idx] = (A[idx] > scalar);
+        }
    }
-    template void launch_comparescalar<double>(int numBlocks, int blockSize, const double* A, const double scalar, float* mask, const int size);
-    template void launch_comparescalar<float>(int numBlocks, int blockSize, const float* A, const float scalar, float* mask, const int size);
-    template void launch_comparescalar<nv_bfloat16>(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, float* mask, const int size);
-    template void launch_comparescalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, float* mask, const int size);
-    template void launch_comparescalar<int64_t>(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, float* mask, const int size);
-    template void launch_comparescalar<int32_t>(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, float* mask, const int size);
-    template void launch_comparescalar<int16_t>(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, float* mask, const int size);
-    template void launch_comparescalar<int8_t>(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, float* mask, const int size);
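Note: the switch kernel introduced just below gathers output element i from one of several input tensors, selected by an int8_t case index per element, and its launcher stages the host array of device pointers on the GPU via cudaVector before the launch. The reference semantics, spelled out on the host as a sketch (not part of the patch); note that cases[i] is not bounds-checked against the number of tensors.

    #include <cstdint>
    #include <vector>

    template <typename T>
    void ref_switch(const std::vector<const T *> &inputs, const int8_t *cases, T *out, int size)
    {
        for (int i = 0; i < size; ++i)
            out[i] = inputs[cases[i]][i];   // take element i from the tensor named by cases[i]
    }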
+    template <typename T, typename MaskT>
+    void launch_greaterscalar(const T *A, const T scalar, MaskT *mask, const int size)
+    {
+        auto [numBlocks, blockSize] = BestDims(size);
+        greaterscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, mask, size);
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch greaterscalar kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
+
+    template void launch_greaterscalar<double,bool>(const double *A, const double scalar, bool *mask, const int size);
+    template void launch_greaterscalar<float,bool>(const float *A, const float scalar, bool *mask, const int size);
+    template void launch_greaterscalar<nv_bfloat16,bool>(const nv_bfloat16 *A, const nv_bfloat16 scalar, bool *mask, const int size);
+    template void launch_greaterscalar<__half,bool>(const __half *A, const __half scalar, bool *mask, const int size);
+    template void launch_greaterscalar<int64_t,bool>(const int64_t *A, const int64_t scalar, bool *mask, const int size);
+    template void launch_greaterscalar<int32_t,bool>(const int32_t *A, const int32_t scalar, bool *mask, const int size);
+    template void launch_greaterscalar<int16_t,bool>(const int16_t *A, const int16_t scalar, bool *mask, const int size);
+    template void launch_greaterscalar<int8_t,bool>(const int8_t *A, const int8_t scalar, bool *mask, const int size);
-};
+    // switch
+    template <typename T, typename casesT>
+    __global__ void switch_kernel(const T **tensorsdata, const int numTensors, const casesT *cases, T *C, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            C[idx] = tensorsdata[cases[idx]][idx];
+        }
+    }
+
+    template <typename T, typename casesT>
+    void launch_switch(const T **tensorsdata, const int numTensors, const casesT *cases, T *C, const int size)
+    {
+        auto [numBlocks, blockSize] = BestDims(size);
+        cudaVector<const T *> tensorsdataList(tensorsdata, numTensors, cudaMemcpyHostToDevice);
+        switch_kernel<<<numBlocks, blockSize>>>(tensorsdataList.data, numTensors, cases, C, size);
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch switch kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
+    template void launch_switch<double,int8_t>(const double **tensorsdata, const int numTensors, const int8_t *cases, double *C, const int size);
+    template void launch_switch<float,int8_t>(const float **tensorsdata, const int numTensors, const int8_t *cases, float *C, const int size);
+    template void launch_switch<nv_bfloat16,int8_t>(const nv_bfloat16 **tensorsdata, const int numTensors, const int8_t *cases, nv_bfloat16 *C, const int size);
+    template void launch_switch<__half,int8_t>(const __half **tensorsdata, const int numTensors, const int8_t *cases, __half *C, const int size);
+    template void launch_switch<int64_t,int8_t>(const int64_t **tensorsdata, const int numTensors, const int8_t *cases, int64_t *C, const int size);
+    template void launch_switch<int32_t,int8_t>(const int32_t **tensorsdata, const int numTensors, const int8_t *cases, int32_t *C, const int size);
+    template void launch_switch<int16_t,int8_t>(const int16_t **tensorsdata, const int numTensors, const int8_t *cases, int16_t *C, const int size);
+    template void launch_switch<int8_t,int8_t>(const int8_t **tensorsdata, const int numTensors, const int8_t *cases, int8_t *C, const int size);
+    template void launch_switch<bool,int8_t>(const bool **tensorsdata, const int numTensors, const int8_t *cases, bool *C, const int size);
+
+}
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CU
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh
index 708b6d05..ee9ea259 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh
+++
b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh @@ -12,186 +12,82 @@ namespace deepx::tensorfunc __global__ void max_kernel(const T* A, const T* B, T* C, const int size); template - void launch_max(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size); - - template <> - void launch_max(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size); - - template <> - void launch_max(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size); - - template <> - void launch_max(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); - - template <> - void launch_max<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size); - - template <> - void launch_max(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size); - - template <> - void launch_max(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size); - - template <> - void launch_max(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int16_t* C, const int size); - - template <> - void launch_max(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); - + void launch_max(const T* A, const T* B, T* C, const int size); + //maxscalar template __global__ void maxscalar_kernel(const T* A, const T scalar, T* C, const int size); template - void launch_maxscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size); - - template <> - void launch_maxscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size); - - template <> - void launch_maxscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size); - - template <> - void launch_maxscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size); - - template <> - void launch_maxscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size); - - template <> - void launch_maxscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size); - - template <> - void launch_maxscalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size); - - template <> - void launch_maxscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size); - - template <> - void launch_maxscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); - + void launch_maxscalar(const T* A, const T scalar, T* C, const int size); + + //min template __global__ void min_kernel(const T* A, const T* B, T* C, const int size); - //min template - void launch_min(int numBlocks, int blockSize, const T* A, const T* B, T* C, const int size); - - template <> - void launch_min(int numBlocks, int blockSize, const double* A, const double* B, double* C, const int size); - - template <> - void launch_min(int numBlocks, int blockSize, const float* A, const float* B, float* C, const int size); - - template <> - void launch_min(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size); - - template <> - void launch_min<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, __half* C, const int size); - - 
template <> - void launch_min(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, int64_t* C, const int size); - - template <> - void launch_min(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, int32_t* C, const int size); - - template <> - void launch_min(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, int16_t* C, const int size); - - template <> - void launch_min(int numBlocks, int blockSize, const int8_t* A, const int8_t* B, int8_t* C, const int size); + void launch_min(const T* A, const T* B, T* C, const int size); + //minscalar template __global__ void minscalar_kernel(const T* A, const T scalar, T* C, const int size); template - void launch_minscalar(int numBlocks, int blockSize, const T* A, const T scalar, T* C, const int size); + void launch_minscalar(const T* A, const T scalar, T* C, const int size); - template <> - void launch_minscalar(int numBlocks, int blockSize, const double* A, const double scalar, double* C, const int size); - - template <> - void launch_minscalar(int numBlocks, int blockSize, const float* A, const float scalar, float* C, const int size); - - template <> - void launch_minscalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size); - - template <> - void launch_minscalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, __half* C, const int size); - - template <> - void launch_minscalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, int64_t* C, const int size); - - template <> - void launch_minscalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, int32_t* C, const int size); - - template <> - void launch_minscalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, int16_t* C, const int size); - - template <> - void launch_minscalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, int8_t* C, const int size); - - //compare - template - __global__ void compare_kernel(const T* A, const T* B, float* mask, const int size); - - template - void launch_compare(int numBlocks, int blockSize, const T* A, const T* B, float* mask, const int size); - - template <> - void launch_compare(int numBlocks, int blockSize, const double* A, const double* B, float* mask, const int size); - - template <> - void launch_compare(int numBlocks, int blockSize, const float* A, const float* B, float* mask, const int size); - - template <> - void launch_compare(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16* B, float* mask, const int size); - - template <> - void launch_compare<__half>(int numBlocks, int blockSize, const __half* A, const __half* B, float* mask, const int size); - - template <> - void launch_compare(int numBlocks, int blockSize, const int64_t* A, const int64_t* B, float* mask, const int size); - - template <> - void launch_compare(int numBlocks, int blockSize, const int32_t* A, const int32_t* B, float* mask, const int size); + + //equal + template + __global__ void equal_kernel(const T* A, const T* B,const float epsilon, MaskT* mask, const int size); - template <> - void launch_compare(int numBlocks, int blockSize, const int16_t* A, const int16_t* B, float* mask, const int size); + template + __global__ void equal_kernel(const T* A, const T* B, float* mask, const int size); + + template + void launch_equal(const T* A, const T* B,const float epsilon, MaskT* mask, const int size); - template <> - void launch_compare(int numBlocks, int 
blockSize, const int8_t* A, const int8_t* B, float* mask, const int size); + //equalscalar + template + __global__ void equalscalar_kernel(const T* A, const T scalar,const float epsilon, MaskT* mask, const int size); - //comparescalar - template - __global__ void comparescalar_kernel(const T* A, const T scalar, float* mask, const int size); + template + void launch_equalscalar(const T* A, const T scalar,const float epsilon, MaskT* mask, const int size); - template - void launch_comparescalar(int numBlocks, int blockSize, const T* A, const T scalar, float* mask, const int size); + //less + template + __global__ void less_kernel(const T* A, const T* B, MaskT* mask, const int size); - template <> - void launch_comparescalar(int numBlocks, int blockSize, const double* A, const double scalar, float* mask, const int size); + template + void launch_less(const T* A, const T* B, MaskT* mask, const int size); - template <> - void launch_comparescalar(int numBlocks, int blockSize, const float* A, const float scalar, float* mask, const int size); + //lessscalar + template + __global__ void lessscalar_kernel(const T* A, const T scalar, MaskT* mask, const int size); - template <> - void launch_comparescalar(int numBlocks, int blockSize, const nv_bfloat16* A, const nv_bfloat16 scalar, float* mask, const int size); + template + void launch_lessscalar(const T* A, const T scalar, MaskT* mask, const int size); - template <> - void launch_comparescalar<__half>(int numBlocks, int blockSize, const __half* A, const __half scalar, float* mask, const int size); + //greater + template + __global__ void greater_kernel(const T* A, const T* B, MaskT* mask, const int size); - template <> - void launch_comparescalar(int numBlocks, int blockSize, const int64_t* A, const int64_t scalar, float* mask, const int size); + template + void launch_greater(const T* A, const T* B, MaskT* mask, const int size); - template <> - void launch_comparescalar(int numBlocks, int blockSize, const int32_t* A, const int32_t scalar, float* mask, const int size); + //greaterscalar + template + __global__ void greaterscalar_kernel(const T* A, const T scalar, MaskT* mask, const int size); - template <> - void launch_comparescalar(int numBlocks, int blockSize, const int16_t* A, const int16_t scalar, float* mask, const int size); + template + void launch_greaterscalar(const T* A, const T scalar, MaskT* mask, const int size); + + //switch + template + __global__ void switch_kernel(const T** tensorsdata,const int numTensors, const casesT* cases, T* C, const int size); - template <> - void launch_comparescalar(int numBlocks, int blockSize, const int8_t* A, const int8_t scalar, float* mask, const int size); - + template + void launch_switch(const T** tensorsdata,const int numTensors, const casesT* cases, T* C, const int size); + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_COMPARE_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp index 1d0c49d9..ed58ac6a 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.hpp @@ -10,20 +10,18 @@ namespace deepx::tensorfunc { // CUDA kernel函数声明 - template struct maxDispatcher { static void max(const Tensor &A, const Tensor &B, Tensor &C) { - if (A.shape.size != C.shape.size) { + if (A.shape.size != C.shape.size) + { throw TensorShapeError("max"); } - const int blockSize = A.shape.size > 256 
? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_max(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); - } + launch_max(A.data, B.data, C.data, A.shape.size); + } }; template @@ -31,26 +29,25 @@ namespace deepx::tensorfunc { static void maxscalar(const Tensor &A, const T scalar, Tensor &C) { - if (A.shape.size != C.shape.size) { + if (A.shape.size != C.shape.size) + { throw TensorShapeError("maxscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_maxscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + launch_maxscalar(A.data, scalar, C.data, A.shape.size); } }; - + template struct minDispatcher { static void min(const Tensor &A, const Tensor &B, Tensor &C) { - if (A.shape.size != C.shape.size) { + if (A.shape.size != C.shape.size) + { throw TensorShapeError("min"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_min(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + + launch_min(A.data, B.data, C.data, A.shape.size); } }; @@ -59,41 +56,122 @@ namespace deepx::tensorfunc { static void minscalar(const Tensor &A, const T scalar, Tensor &C) { - if (A.shape.size != C.shape.size) { + if (A.shape.size != C.shape.size) + { throw TensorShapeError("minscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_minscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + + launch_minscalar(A.data, scalar, C.data, A.shape.size); } }; - template - struct compareDispatcher + // equal(A,B)=>C + template + struct equalDispatcher + { + static void equal(const Tensor &A, const Tensor &B, float epsilon, Tensor &mask) + { + if (A.shape.size != B.shape.size || A.shape.size != mask.shape.size) + { + throw TensorShapeError("equal"); + } + if (epsilon < 0) + { + throw std::invalid_argument("equal epsilon must be positive"); + } + launch_equal(A.data, B.data, epsilon, mask.data, A.shape.size); + } + }; + // equalscalar(A,scalar)=>C + template + struct equalscalarDispatcher { - static void compare(const Tensor &A, const Tensor &B, Tensor &mask) + static void equalscalar(const Tensor &A, const T scalar, float epsilon, Tensor &mask) { - if (A.shape.size != B.shape.size || A.shape.size != mask.shape.size) { - throw TensorShapeError("compare"); + if (A.shape.size != mask.shape.size) + { + throw TensorShapeError("equalscalar"); + } + if (epsilon < 0) + { + throw std::invalid_argument("equal epsilon must be positive"); } - const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_compare(numBlocks, blockSize, A.data, B.data, mask.data, A.shape.size); + launch_equalscalar(A.data, scalar, epsilon, mask.data, A.shape.size); } }; - template - struct comparescalarDispatcher + // less(A,B)=>C + template + struct lessDispatcher + { + static void less(const Tensor &A, const Tensor &B, Tensor &mask) + { + if (A.shape.size != B.shape.size || A.shape.size != mask.shape.size) + { + throw TensorShapeError("less"); + } + launch_less(A.data, B.data, mask.data, A.shape.size); + } + }; + // lessscalar(A,scalar)=>C + template + struct lessscalarDispatcher { - static void comparescalar(const Tensor &A, const T scalar, Tensor &mask) + static void lessscalar(const Tensor &A, const T scalar, Tensor &mask) { - if (A.shape.size != mask.shape.size) { - throw TensorShapeError("comparescalar"); + if (A.shape.size != mask.shape.size) + { + throw TensorShapeError("lessscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_comparescalar(numBlocks, blockSize, A.data, scalar, mask.data, A.shape.size); + launch_lessscalar(A.data, scalar, mask.data, A.shape.size); } }; + // greater(A,B)=>C + template + struct greaterDispatcher + { + static void greater(const Tensor &A, const Tensor &B, Tensor &mask) + { + if (A.shape.size != B.shape.size || A.shape.size != mask.shape.size) + { + throw TensorShapeError("greater"); + } + launch_greater(A.data, B.data, mask.data, A.shape.size); + } + }; + // greaterscalar(A,scalar)=>C + template + struct greaterscalarDispatcher + { + static void greaterscalar(const Tensor &A, const T scalar, Tensor &mask) + { + if (A.shape.size != mask.shape.size) + { + throw TensorShapeError("greaterscalar"); + } + launch_greaterscalar(A.data, scalar, mask.data, A.shape.size); + } + }; + // switch(tensors,cases)=>C + template + struct switchDispatcher + { + static void Switch(const vector *> tensors, const Tensor &cases, Tensor &C) + { + if (cases.shape.size != C.shape.size) + { + throw TensorShapeError("Switch"); + } + + vector tensorsData(tensors.size()); + for (int i = 0; i < tensors.size(); i++) + { + tensorsData[i] = tensors[i]->data; + } + + launch_switch(tensorsData.data(), tensors.size(), cases.data, C.data, C.shape.size); + } + }; + } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index 95307389..45b24be2 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -5,53 +5,24 @@ #include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" -#include +#include "deepx/tensorfunc/cuda_math.cuh" namespace deepx::tensorfunc { // sqrt template - __global__ void sqrt_kernel(const T *A, T *C, const int size); - template <> - __global__ void sqrt_kernel(const double *A, double *C, const int size) + __global__ void sqrt_kernel(const T *A, T *C, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { - C[idx] = sqrt(A[idx]); - } - } - template <> - __global__ void sqrt_kernel(const float *A, float *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) 
- { - C[idx] = sqrtf(A[idx]); - } - } - - template <> - __global__ void sqrt_kernel<__half>(const __half *A, __half *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hsqrt(A[idx]); - } - } - template <> - __global__ void sqrt_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hsqrt(A[idx]); + deepx_sqrt(A + idx, C + idx); } } + template - void launch_sqrt(int numBlocks, int blockSize, const T *a, T *c, const int size) + void launch_sqrt(const T *a, T *c, const int size) { + auto [numBlocks, blockSize] = BestDims(size); sqrt_kernel<<>>(a, c, size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -60,35 +31,25 @@ namespace deepx::tensorfunc std::string(cudaGetErrorString(err))); } } - template void launch_sqrt(int numBlocks, int blockSize, const double *a, double *c, const int size); - template void launch_sqrt(int numBlocks, int blockSize, const float *a, float *c, const int size); - template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); - template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + template void launch_sqrt(const double *a, double *c, const int size); + template void launch_sqrt(const float *a, float *c, const int size); + template void launch_sqrt<__half>(const __half *a, __half *c, const int size); + template void launch_sqrt(const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + // pow template - __global__ void pow_kernel(const T *A, const T *B, T *C, const int size); - template <> - __global__ void pow_kernel(const double *A, const double *B, double *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = pow(A[idx], B[idx]); - } - } - template <> - __global__ void pow_kernel(const float *A, const float *B, float *C, const int size) + __global__ void pow_kernel(const T *A, const T *B, T *C, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { - C[idx] = powf(A[idx], B[idx]); + deepx_pow(A + idx, B + idx, C + idx); } } - + template - void launch_pow(int numBlocks, int blockSize, const T *a, const T *b, T *c, const int size) + void launch_pow(const T *a, const T *b, T *c, const int size) { + auto [numBlocks, blockSize] = BestDims(size); pow_kernel<<>>(a, b, c, size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -97,36 +58,23 @@ namespace deepx::tensorfunc std::string(cudaGetErrorString(err))); } } - template void launch_pow(int numBlocks, int blockSize, const double *a, const double *b, double *c, const int size); - template void launch_pow(int numBlocks, int blockSize, const float *a, const float *b, float *c, const int size); + template void launch_pow(const double *a, const double *b, double *c, const int size); + template void launch_pow(const float *a, const float *b, float *c, const int size); // powscalar template - __global__ void powscalar_kernel(const T *A, const T scalar, T *C, const int size); - template <> - __global__ void powscalar_kernel(const double *A, const double scalar, double *C, const int size) + __global__ void powscalar_kernel(const T *A, const T scalar, T *C, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + for (int idx = 
blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { - C[idx] = pow(A[idx], scalar); + deepx_pow(A + idx, &scalar, C + idx); } } - template <> - __global__ void powscalar_kernel(const float *A, const float scalar, float *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = powf(A[idx], scalar); - } - } - template __global__ void powscalar_kernel(const double *A, const double scalar, double *C, const int size); - template __global__ void powscalar_kernel(const float *A, const float scalar, float *C, const int size); - + template - void launch_powscalar(int numBlocks, int blockSize, const T *a, const T scalar, T *c, const int size) + void launch_powscalar(const T *a, const T scalar, T *c, const int size) { + auto [numBlocks, blockSize] = BestDims(size); powscalar_kernel<<>>(a, scalar, c, size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -135,52 +83,48 @@ namespace deepx::tensorfunc std::string(cudaGetErrorString(err))); } } - template void launch_powscalar(int numBlocks, int blockSize, const double *a, const double scalar, double *c, const int size); - template void launch_powscalar(int numBlocks, int blockSize, const float *a, const float scalar, float *c, const int size); + template void launch_powscalar(const double *a, const double scalar, double *c, const int size); + template void launch_powscalar(const float *a, const float scalar, float *c, const int size); - // log + // rpowscalar template - __global__ void log_kernel(const T *A, T *C, const int size); - template <> - __global__ void log_kernel(const double *A, double *C, const int size) + __global__ void rpowscalar_kernel(const T scalar, const T *A, T *C, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { - C[idx] = logf(A[idx]); + deepx_pow(&scalar, A + idx, C + idx); } } - template <> - __global__ void log_kernel(const float *A, float *C, const int size) + + template + void launch_rpowscalar(const T scalar, const T *a, T *c, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + auto [numBlocks, blockSize] = BestDims(size); + rpowscalar_kernel<<>>(scalar, a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { - C[idx] = logf(A[idx]); + throw std::runtime_error("Failed to launch rpowscalar kernel: " + + std::string(cudaGetErrorString(err))); } } - template <> - __global__ void log_kernel<__half>(const __half *A, __half *C, const int size) + template void launch_rpowscalar(const double scalar, const double *a, double *c, const int size); + template void launch_rpowscalar(const float scalar, const float *a, float *c, const int size); + + // log + template + __global__ void log_kernel(const T *A, T *C, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { - C[idx] = hlog(A[idx]); - } - } - template <> - __global__ void log_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hlog(A[idx]); + deepx_log(A + idx, C + idx); } } - + template - void launch_log(int numBlocks, int blockSize, const T *a, T *c, const int size) + void launch_log(const T *a, T *c, const int size) { + auto [numBlocks, blockSize] = 
BestDims(size); log_kernel<<>>(a, c, size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -189,54 +133,24 @@ namespace deepx::tensorfunc std::string(cudaGetErrorString(err))); } } - template void launch_log(int numBlocks, int blockSize, const double *a, double *c, const int size); - template void launch_log(int numBlocks, int blockSize, const float *a, float *c, const int size); - template void launch_log<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); - template void launch_log(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + template void launch_log(const double *a, double *c, const int size); + template void launch_log(const float *a, float *c, const int size); + template void launch_log<__half>(const __half *a, __half *c, const int size); + template void launch_log(const nv_bfloat16 *a, nv_bfloat16 *c, const int size); // exp template - __global__ void exp_kernel(const T *A, T *C, const int size); - template <> - __global__ void exp_kernel(const double *A, double *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = exp(A[idx]); - } - } - template <> - __global__ void exp_kernel(const float *A, float *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = expf(A[idx]); - } - } - - template <> - __global__ void exp_kernel<__half>(const __half *A, __half *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hexp(A[idx]); - } - } - template <> - __global__ void exp_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + __global__ void exp_kernel(const T *A, T *C, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) { - C[idx] = hexp(A[idx]); + deepx_exp(A + idx, C + idx); } } - + template - void launch_exp(int numBlocks, int blockSize, const T *a, T *c, const int size) + void launch_exp(const T *a, T *c, const int size) { + auto [numBlocks, blockSize] = BestDims(size); exp_kernel<<>>(a, c, size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -245,9 +159,9 @@ namespace deepx::tensorfunc std::string(cudaGetErrorString(err))); } } - template void launch_exp(int numBlocks, int blockSize, const double *a, double *c, const int size); - template void launch_exp(int numBlocks, int blockSize, const float *a, float *c, const int size); - template void launch_exp<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); - template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); + template void launch_exp(const double *a, double *c, const int size); + template void launch_exp(const float *a, float *c, const int size); + template void launch_exp<__half>(const __half *a, __half *c, const int size); + template void launch_exp(const nv_bfloat16 *a, nv_bfloat16 *c, const int size); } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh index 341a0295..6f4ffa42 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh @@ -1,7 +1,5 @@ #ifndef 
DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH -#include -#include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -13,99 +11,46 @@ namespace deepx::tensorfunc __global__ void sqrt_kernel(const T* A, T* C,const int size); template - void launch_sqrt(int numBlocks, int blockSize, const T* a, T* c,const int size); - - template <> - void launch_sqrt(int numBlocks, int blockSize, const double* a, double* c,const int size); - - template <> - void launch_sqrt(int numBlocks, int blockSize, const float* a, float* c,const int size); - - template <> - void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - - template <> - void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - + void launch_sqrt(const T* a, T* c,const int size); + // pow template __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); template - void launch_pow(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size); - - template <> - void launch_pow(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); - - template <> - void launch_pow(int numBlocks, int blockSize, const float* a, const float* b, float* c,const int size); - - template <> - void launch_pow(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); - - template <> - void launch_pow<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); - + void launch_pow(const T* a, const T* b, T* c,const int size); + // powscalar template __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); template - void launch_powscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size); - - template <> - void launch_powscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int size); + void launch_powscalar(const T* a, const T scalar, T* c,const int size); - template <> - void launch_powscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); - - template <> - void launch_powscalar(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); - - template <> - void launch_powscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); + // rpowscalar + template + __global__ void rpowscalar_kernel(const T scalar, const T* A, T* C, const int size); + template + void launch_rpowscalar(const T scalar, const T* a, T* c, const int size); // log template __global__ void log_kernel(const T* A, T* C,const int size); template - void launch_log(int numBlocks, int blockSize, const T* a, T* c,const int size); - - template <> - void launch_log(int numBlocks, int blockSize, const double* a, double* c,const int size); - - template <> - void launch_log(int numBlocks, int blockSize, const float* a, float* c,const int size); - - template <> - void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - - template <> - void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + void launch_log(const T* a, T* c,const int size); // exp template __global__ void exp_kernel(const T* A, T* C,const int size); template - void launch_exp(int numBlocks, int blockSize, const T* 
a, T* c,const int size); + void launch_exp(const T* a, T* c,const int size); - template <> - void launch_exp(int numBlocks, int blockSize, const double* a, double* c,const int size); - - template <> - void launch_exp(int numBlocks, int blockSize, const float* a, float* c,const int size); - - template <> - void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); - - template <> - void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); - + } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp index 38afe270..141bf51f 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp @@ -20,9 +20,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("sqrt"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_sqrt(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_sqrt(A.data, C.data, A.shape.size); } }; @@ -34,9 +32,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("pow"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_pow(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + launch_pow(A.data, B.data, C.data, A.shape.size); } }; @@ -48,12 +44,24 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("powscalar"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_powscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + launch_powscalar(A.data, scalar, C.data, A.shape.size); + } + }; + + // rpowscalar + template + struct rpowscalarDispatcher + { + static void rpowscalar(const T value, const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("rpowscalar"); + } + launch_rpowscalar(value, A.data, C.data, A.shape.size); } }; + // log template struct logDispatcher { @@ -62,9 +70,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("log"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_log(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_log(A.data, C.data, A.shape.size); } }; @@ -76,9 +82,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("exp"); } - const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_exp(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_exp(A.data, C.data, A.shape.size); } }; diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu index 43717698..c9e185c8 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu @@ -9,8 +9,9 @@ #include "deepx/tensorfunc/reduce_miaobyte.cuh" #include "deepx/tensorfunc/tensor_cuda.cuh" #include "deepx/tensorfunc/vector_cuda.cuh" -#include "deepx/tensorfunc/cuda_math.cuh" +#include "deepx/tensorfunc/cuda_atomic.cuh" +#include "deepx/tensorfunc/cuda_math.cuh" namespace deepx::tensorfunc { diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp index 91fa6326..8611a227 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp @@ -40,6 +40,10 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { + if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; @@ -58,7 +62,7 @@ namespace deepx::tf break; case Precision::Float16: tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::BFloat16: tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; @@ -112,7 +116,11 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != c_type) @@ -123,14 +131,14 @@ namespace deepx::tf switch (a_type) { case Precision::Float64: - tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float16: tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::BFloat16: tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; @@ -184,7 +192,11 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = 
mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; @@ -203,7 +215,7 @@ namespace deepx::tf break; case Precision::Float16: tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::BFloat16: tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; @@ -230,7 +242,7 @@ namespace deepx::tf template class SubScalar : public TF { - public: + public: SubScalar(const vector &args, const vector &returns) { this->name = "subscalar"; @@ -255,9 +267,13 @@ namespace deepx::tf shared_ptr clone() const override { return make_shared>(*this); - } + } int run(shared_ptr mem, string &error) override { + if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != c_type) @@ -287,7 +303,7 @@ namespace deepx::tf break; case Precision::Int16: tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::Int8: tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; @@ -296,9 +312,9 @@ namespace deepx::tf return 1; } return 0; - } + } }; - + template class Mul : public TF { @@ -329,7 +345,11 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; @@ -348,7 +368,7 @@ namespace deepx::tf break; case Precision::Float16: tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::BFloat16: tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; @@ -375,7 +395,7 @@ namespace deepx::tf template class MulScalar : public TF { - public: + public: MulScalar(const vector &args, const vector &returns) { this->name = "mulscalar"; @@ -400,9 +420,13 @@ namespace deepx::tf shared_ptr clone() const override { return make_shared>(*this); - } + } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != c_type) @@ -432,7 +456,7 @@ namespace deepx::tf break; case Precision::Int16: tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::Int8: 
tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; @@ -441,7 +465,7 @@ namespace deepx::tf return 1; } return 0; - } + } }; template @@ -454,7 +478,7 @@ namespace deepx::tf this->author = Author::name(); this->args = args; this->returns = returns; - } + } Div(string text) { @@ -464,7 +488,7 @@ namespace deepx::tf { throw std::runtime_error("Invalid name: " + this->name); } - } + } string math_formula() const override { return "T3=T1/T2"; @@ -472,12 +496,16 @@ namespace deepx::tf shared_ptr clone() const override { return make_shared>(*this); - } + } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; - Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != b_type || a_type != c_type) { error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); @@ -485,7 +513,7 @@ namespace deepx::tf } switch (a_type) { - case Precision::Float64: + case Precision::Float64: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: @@ -493,25 +521,25 @@ namespace deepx::tf break; case Precision::Float16: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::BFloat16: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::Int32: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::Int8: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported dtype: " + precision_str(a_type); - return 1; + return 1; } return 0; } @@ -545,9 +573,13 @@ namespace deepx::tf shared_ptr clone() const override { return make_shared>(*this); - } + } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != c_type) @@ -558,36 +590,36 @@ namespace deepx::tf switch (a_type) { case Precision::Float64: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + 
tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float16: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::BFloat16: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - tensorfunc::divscalar( *mem->gettensor(this->args[0].textvalue),this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported dtype: " + precision_str(a_type); return 1; } return 0; - } - }; + } + }; template class RDivScalar : public TF @@ -599,7 +631,7 @@ namespace deepx::tf this->author = Author::name(); this->args = args; this->returns = returns; - } + } RDivScalar(string text) { @@ -617,9 +649,13 @@ namespace deepx::tf shared_ptr clone() const override { return make_shared>(*this); - } + } int run(shared_ptr mem, string &error) override { + if(!checktensors({this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != c_type) @@ -631,7 +667,7 @@ namespace deepx::tf { case Precision::Float64: tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::Float32: tensorfunc::rdivscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; @@ -656,10 +692,66 @@ namespace deepx::tf default: error = "Unsupported dtype: " + precision_str(a_type); return 1; - } + } + return 0; + } + }; + + // invert + template + class Invert : public TF + 
{ + public: + Invert(const vector &args, const vector &returns) + { + this->name = "invert"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=~T1"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + return 1; + } + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Int64: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } return 0; - } + } }; + }; #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp index 4ec85b83..694ad3db 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp @@ -20,15 +20,6 @@ namespace deepx::tf this->returns = returns; } - Max(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "max") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } string math_formula() const override { return "T3=max(T1, T2)"; @@ -72,7 +63,7 @@ namespace deepx::tf break; case Precision::Int8: tensorfunc::max(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -93,15 +84,6 @@ namespace deepx::tf this->returns = returns; } - MaxScalar(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "maxscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } string math_formula() const override { return "T3=max(T1, scalar)"; @@ -139,8 +121,8 @@ namespace deepx::tf break; case Precision::Int32: tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: + break; + case Precision::Int16: tensorfunc::maxscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: @@ -166,15 +148,6 @@ namespace deepx::tf this->returns = returns; } - Min(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "min") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } string math_formula() const override { return "T3=min(T1, T2)"; @@ -201,14 +174,14 @@ namespace deepx::tf break; case Precision::Float32: 
tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + break; case Precision::Float16: tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::BFloat16: tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int64: + break; + case Precision::Int64: tensorfunc::min(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: @@ -240,15 +213,6 @@ namespace deepx::tf this->returns = returns; } - MinScalar(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "minscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } string math_formula() const override { return "T3=min(T1, scalar)"; @@ -279,7 +243,7 @@ namespace deepx::tf tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::BFloat16: - tensorfunc::minscalar (*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: tensorfunc::minscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -302,33 +266,156 @@ namespace deepx::tf }; template - class Compare : public TF + class Equal : public TF { public: - Compare(const vector &args, const vector &returns) + Equal(const vector &args, const vector &returns) { - this->name = "compare"; + this->name = "equal"; this->author = Author::name(); this->args = args; this->returns = returns; } - Compare(string text) + string math_formula() const override + { + return "mask=compare(T1, T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + float epsilon = this->getvar(2, mem); + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || mask_type != Precision::Bool) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " or " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class EqualScalar : public TF + { + public: + EqualScalar(const vector &args, const vector &returns) { - this->parse(text); + this->name = "equalscalar"; this->author = Author::name(); - if (this->name != "compare") + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "mask=compare(T1, scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + float epsilon = this->getvar(2, mem); + if (a_type != mask_type || mask_type != Precision::Bool) { - throw std::runtime_error("Invalid name: " + this->name); + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; } + switch (a_type) + { + case Precision::Float64: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), epsilon, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; } + }; + + // less + template + class Less : public TF + { + public: + Less(const 
vector &args, const vector &returns) + { + this->name = "less"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override { return "mask=compare(T1, T2)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared>(*this); } int run(shared_ptr mem, string &error) override @@ -336,37 +423,37 @@ namespace deepx::tf Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (a_type != b_type) + if (a_type != b_type || mask_type != Precision::Bool) { - error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type); + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " or " + precision_str(a_type) + " != " + precision_str(mask_type); return 1; } switch (a_type) { case Precision::Float64: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::BFloat16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; case Precision::Int16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; + 
tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -375,41 +462,164 @@ namespace deepx::tf } }; + // lessscalar template - class CompareScalar : public TF + class LessScalar : public TF { public: - CompareScalar(const vector &args, const vector &returns) - { - this->name = "comparescalar"; + LessScalar(const vector &args, const vector &returns) + { + this->name = "lessscalar"; this->author = Author::name(); this->args = args; this->returns = returns; } - CompareScalar(string text) + string math_formula() const override + { + return "mask=compare(T1, scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != mask_type || mask_type != Precision::Bool) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + // greater + template + class Greater : public TF + { + public: + Greater(const vector &args, const vector &returns) { - this->parse(text); + this->name = "greater"; this->author = Author::name(); - if (this->name != "comparescalar") + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "mask=compare(T1, T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || mask_type != Precision::Bool) + { + error = "Type mismatch: " + 
precision_str(a_type) + " != " + precision_str(b_type) + " or " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) { - throw std::runtime_error("Invalid name: " + this->name); + case Precision::Float64: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; } + return 0; + } + }; + + // greaterscalar + template + class GreaterScalar : public TF + { + public: + GreaterScalar(const vector &args, const vector &returns) + { + this->name = "greaterscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; } + string math_formula() const override { - return "mask=compare(T1, scalar)"; + return "mask=compare(T1, scalar)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared>(*this); } int run(shared_ptr mem, string &error) override { Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (a_type != mask_type) + if (a_type != mask_type || mask_type != Precision::Bool) { error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); return 1; @@ -417,29 +627,29 @@ namespace deepx::tf switch (a_type) { case Precision::Float64: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float16: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + 
tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::BFloat16: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); - break; + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -447,6 +657,70 @@ namespace deepx::tf return 0; } }; - + + // switch + template + class Switch : public TF + { + public: + Switch(const vector &args, const vector &returns) + { + this->name = "switch"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "C=switch(tensors,cases)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + + Precision C_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + + switch (C_type) + { + case Precision::Float64: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), 
*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Bool: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(C_type); + return 1; + } + return 0; + } + }; + }; #endif // DEEPX_TF_ELEMENTWISE_COMPARE_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp index 204fae9e..ae417bfe 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp @@ -7,60 +7,61 @@ namespace deepx::tf { - + // Pow template - class Sqrt : public TF + class Pow : public TF { public: - Sqrt(const vector &args, const vector &returns) + Pow(const vector &args, const vector &returns) { - this->name = "sqrt"; + this->name = "pow"; this->author = Author::name(); this->args = args; this->returns = returns; } - Sqrt(string text) + Pow(string text) { this->parse(text); this->author = Author::name(); - if (this->name != "sqrt") + if (this->name != "pow") { throw std::runtime_error("Invalid name: " + this->name); } } string math_formula() const override { - return "T3=sqrt(T1)"; + return "T3=pow(T1, T2)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (a_type != c_type) + if (a_type != c_type || b_type != c_type) { - error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); return 1; } switch (a_type) { case Precision::Float64: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Float16: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::BFloat16: - tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; + default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -69,55 +70,59 @@ namespace deepx::tf } }; + // Powscalar template - class Pow : public TF + class PowScalar : public TF { public: - Pow(const vector &args, const vector &returns) + PowScalar(const vector &args, const 
vector &returns) { - this->name = "pow"; + this->name = "powscalar"; this->author = Author::name(); this->args = args; this->returns = returns; } - Pow(string text) + PowScalar(string text) { this->parse(text); this->author = Author::name(); - if (this->name != "pow") + if (this->name != "powscalar") { throw std::runtime_error("Invalid name: " + this->name); } } string math_formula() const override { - return "T3=pow(T1, T2)"; + return "T3=pow(T1, scalar)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (a_type != c_type || b_type != c_type) + if (a_type != c_type) { - error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " != " + precision_str(c_type); return 1; } switch (a_type) { case Precision::Float64: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; - + default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -126,55 +131,114 @@ namespace deepx::tf } }; + // Rpowscalar template - class PowScalar : public TF + class RpowScalar : public TF { public: - PowScalar(const vector &args, const vector &returns) + RpowScalar(const vector &args, const vector &returns) { - this->name = "powscalar"; + this->name = "rpowscalar"; this->author = Author::name(); this->args = args; this->returns = returns; } + string math_formula() const override + { + return "T3=pow(scalar, T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } - PowScalar(string text) + int run(shared_ptr mem, string &error) override + { + if(!checktensors({this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (b_type != c_type) + { + error = "Type mismatch: " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (b_type) + { + case Precision::Float64: + tensorfunc::rpowscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::rpowscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(b_type); + return 1; + } + return 0; + } + }; + + // Sqrt + 
template + class Sqrt : public TF + { + public: + Sqrt(const vector &args, const vector &returns) + { + this->name = "sqrt"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Sqrt(string text) { this->parse(text); this->author = Author::name(); - if (this->name != "powscalar") + if (this->name != "sqrt") { throw std::runtime_error("Invalid name: " + this->name); } } string math_formula() const override { - return "T3=pow(T1, scalar)"; + return "T3=sqrt(T1)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared>(*this); } int run(shared_ptr mem, string &error) override { + if(!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (a_type != c_type || b_type != c_type) + if (a_type != c_type) { - error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); return 1; } switch (a_type) { case Precision::Float64: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; - default: error = "Unsupported type: " + precision_str(a_type); return 1; @@ -214,7 +278,11 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != c_type) @@ -275,7 +343,11 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override - { + { + if(!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (a_type != c_type) diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index e769edc6..2208863c 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -55,14 +55,14 @@ namespace deepx::tf { Param("t", DataCategory::Tensor, Precision::Any), }))); - //copytensor + // copytensor tffactory.add_tf(std::make_shared(vector( - { - Param("src", DataCategory::Tensor, Precision::Any), - Param("dst", DataCategory::Tensor, Precision::Any), - }), - vector())); 
- //deltensor + { + Param("src", DataCategory::Tensor, Precision::Any), + Param("dst", DataCategory::Tensor, Precision::Any), + }), + vector())); + // deltensor tffactory.add_tf(std::make_shared(vector( { Param("t", DataCategory::Tensor, Precision::Any), @@ -219,6 +219,15 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); + // invert author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8), + }))); // sqrt author=miaobyte tffactory.add_tf(std::make_shared>(vector( { @@ -249,6 +258,17 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); + // rpowscalar author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + // log author=miaobyte tffactory.add_tf(std::make_shared>(vector( { @@ -307,27 +327,76 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - // compare author=miaobyte - tffactory.add_tf(std::make_shared>(vector( + // equal author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Bool), + }))); + // equal scalar author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Bool), + }))); + // less author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Bool), + }))); + // less scalar author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Bool), + }))); + // greater author=miaobyte + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), - }), vector( { - Param("mask", DataCategory::Tensor, Precision::Float32), + Param("mask", DataCategory::Tensor, Precision::Bool), }))); - // compare scalar author=miaobyte - tffactory.add_tf(std::make_shared>(vector( + // greater scalar author=miaobyte + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("scalar", DataCategory::Var, Precision::Any), }), vector( { - Param("mask", DataCategory::Tensor, Precision::Float32), + Param("mask", DataCategory::Tensor, Precision::Bool), }))); + // switch author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("tensors", DataCategory::ListTensor, Precision::Any), + Param("cases", DataCategory::Tensor, Precision::Int8), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); } // matmul void register_matmul(TfFactory &tffactory) @@ -421,31 +490,31 @@ namespace deepx::tf vector( { Param("B", 
DataCategory::Tensor, Precision::Any), - }))); + }))); // reducemax author=miaobyte tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("axis", DataCategory::Vector, Precision::Int32), - Param("keepdims", DataCategory::Var, Precision::Bool), - }), - vector( - { - Param("B", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("axis", DataCategory::Vector, Precision::Int32), + Param("keepdims", DataCategory::Var, Precision::Bool), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); // reducemin author=miaobyte tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("axis", DataCategory::Vector, Precision::Int32), - Param("keepdims", DataCategory::Var, Precision::Bool), - }), - vector( - { - Param("B", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("axis", DataCategory::Vector, Precision::Int32), + Param("keepdims", DataCategory::Var, Precision::Bool), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); } - + int register_all(TfFactory &tffactory) { register_lifecycle(tffactory); diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index 18d0fbe7..1e863ae3 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -284,6 +284,29 @@ namespace deepx::tensorfunc } }; + // invert + template + struct invertDispatcher + { + static void invert(const Tensor &A, Tensor &C) + { + if (A.shape == C.shape) + { + A.shape.rangeParallel(A.shape.dim-1, [&A, &C](int idx) + { + for (int j=0;j struct sqrtDispatcher>> { @@ -392,6 +415,26 @@ namespace deepx::tensorfunc } }; + // rpowscalar + template + struct rpowscalarDispatcher + { + static void rpowscalar(const T value, const Tensor &input, Tensor &output) + { + if (input.shape == output.shape) + { + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output, &value](int i) + { + for (int j = 0; j < output.shape[-1]; j++) + output.data[i+j] = std::pow(value, input.data[i+j]); }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; + template struct logDispatcher { @@ -730,22 +773,27 @@ namespace deepx::tensorfunc } }; - template - struct compareDispatcher + //equal + template + struct equalDispatcher { - static void compare(const Tensor &A, const Tensor &B, const Tensor &mask) + static void equal(const Tensor &A, const Tensor &B,const float epsilon, Tensor &mask) { if (A.shape == B.shape && mask.shape == A.shape) - { - A.shape.rangeParallel(A.shape.dim, [&A, &B, &mask](int idx) + { + A.shape.rangeParallel(A.shape.dim-1, [&A, &B, &mask,epsilon](int idx) { - if(A.data[idx]==B.data[idx]){ - mask.data[idx]=0.5; - }else if(A.data[idx]>B.data[idx]){ - mask.data[idx]=1; - }else{ - mask.data[idx]=0; - } }); + for (int i = 0; i < A.shape[-1]; i++) + { + if (epsilon == 0) + { + mask.data[idx+i]=A.data[idx+i]==B.data[idx+i]; + } + else{ + mask.data[idx+i]=std::abs(A.data[idx+i]-B.data[idx+i])<=epsilon; + } + } + }); } else { @@ -754,22 +802,27 @@ namespace deepx::tensorfunc } }; - template - struct comparescalarDispatcher + //equalscalar + template + struct equalscalarDispatcher { - static void comparescalar(const Tensor &A, const T scalar, Tensor &mask) + static 
void equalscalar(const Tensor &A, const T scalar,const float epsilon, Tensor &mask) { if (A.shape == mask.shape) { - A.shape.rangeParallel(A.shape.dim, [&A, &mask, &scalar](int idx) + A.shape.rangeParallel(A.shape.dim-1, [&A, &mask, &scalar,epsilon](int idx) { - if(A.data[idx]==scalar){ - mask.data[idx]=0.5; - }else if(A.data[idx]>scalar){ - mask.data[idx]=1; - }else{ - mask.data[idx]=0; - } }); + for (int i = 0; i < A.shape[-1]; i++) + { + if (epsilon == 0) + { + mask.data[idx+i]=A.data[idx+i]==scalar; + } + else{ + mask.data[idx+i]=std::abs(A.data[idx+i]-scalar)<=epsilon; + } + } + }); } else { @@ -778,6 +831,121 @@ namespace deepx::tensorfunc }; }; + //less + template + struct lessDispatcher + { + static void less(const Tensor &A, const Tensor &B, Tensor &mask) + { + if (A.shape == B.shape && mask.shape == A.shape) + { + A.shape.rangeParallel(A.shape.dim-1, [&A, &B, &mask](int idx) + { + for (int i = 0; i < A.shape[-1]; i++) + { + mask.data[idx+i]=A.data[idx+i]<B.data[idx+i]; + } + }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; + + //lessscalar + template + struct lessscalarDispatcher + { + static void lessscalar(const Tensor &A, const T scalar, Tensor &mask) + { + if (A.shape == mask.shape) + { + A.shape.rangeParallel(A.shape.dim-1, [&A, &mask, &scalar](int idx) + { + for (int i = 0; i < A.shape[-1]; i++) + { + mask.data[idx+i]=A.data[idx+i]<scalar; + } + }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; + + //greater + template + struct greaterDispatcher + { + static void greater(const Tensor &A, const Tensor &B, Tensor &mask) + { + if (A.shape == B.shape && mask.shape == A.shape) + { + A.shape.rangeParallel(A.shape.dim-1, [&A, &B, &mask](int idx) + { + for (int i = 0; i < A.shape[-1]; i++) + { + mask.data[idx+i]=A.data[idx+i]>B.data[idx+i]; + } + }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; + + //greaterscalar + template + struct greaterscalarDispatcher + { + static void greaterscalar(const Tensor &A, const T scalar, Tensor &mask) + { + if (A.shape == mask.shape) + { + A.shape.rangeParallel(A.shape.dim-1, [&A, &mask, &scalar](int idx) + { + for (int i = 0; i < A.shape[-1]; i++) + { + mask.data[idx+i]=A.data[idx+i]>scalar; + } + }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; + + //switch + template + struct switchDispatcher + { + static void Switch(const vector*> tensors,const Tensor &cases, Tensor &C) + { + if (cases.shape == C.shape) + { + C.shape.rangeParallel(C.shape.dim-1, [&tensors, &cases, &C](int idx) + { + for (int i = 0; i < C.shape[-1]; i++) + { + int which_tensor=cases.data[idx+i]; + C.data[idx+i]=tensors[which_tensor]->data[idx+i]; + } + }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; }; #endif // DEEPX_OP_CPU_ELEMENTWISE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 8f1e3d0c..53f0b504 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -546,6 +546,61 @@ namespace deepx::tf } }; + // invert + template + class Invert : public TF + { + public: + Invert(vector args, vector returns) + { + this->name = "invert"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=~T1"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; +
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Int64: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::invert(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + template class Sqrt : public TF { @@ -679,6 +734,51 @@ namespace deepx::tf } }; + // rpowscalar + template + class RpowScalar : public TF + { + public: + RpowScalar(vector args, vector returns) + { + this->name = "rpowscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=scalar^T1"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::rpowscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::rpowscalar(this->getvar(0, mem), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + template class Log : public TF { @@ -1126,54 +1226,56 @@ namespace deepx::tf } }; + //equal template - class Compare : public TF + class Equal : public TF { public: - Compare(vector args, vector returns) + Equal(vector args, vector returns) { - this->name = "compare"; + this->name = "equal"; this->author = Author::name(); this->args = args; this->returns = returns; } string math_formula() const override { - return "mask=compare(T1,T2)"; + return "mask=equal(T1,T2)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared>(*this); } int run(shared_ptr mem, string &error) override { Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + float epsilon = this->getvar(2,mem,true); Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (a_type != b_type || a_type != mask_type) + if (a_type != b_type || mask_type!=Precision::Bool) { - error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(mask_type); + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " " + precision_str(mask_type)+"!=bool"; return 1; } switch (a_type) { case Precision::Float64: - 
tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - tensorfunc::compare(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equal(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported dtype: " + precision_str(a_type); @@ -1185,28 +1287,29 @@ namespace deepx::tf template - class CompareScalar : public TF + class EqualScalar : public TF { public: - CompareScalar(vector args, vector returns) + EqualScalar(vector args, vector returns) { - this->name = "comparescalar"; + this->name = "equalscalar"; this->author = Author::name(); this->args = args; this->returns = returns; } string math_formula() const override { - return "mask=compare(T1,scalar)"; + return "mask=equal(T1,scalar)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared>(*this); } int run(shared_ptr mem, string &error) override { Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + float epsilon = this->getvar(2,mem,true); if (a_type != mask_type) { error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); @@ -1215,22 +1318,22 @@ namespace deepx::tf switch (a_type) { case Precision::Float64: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), 
*mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - tensorfunc::comparescalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::equalscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), epsilon, *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported dtype: " + precision_str(a_type); @@ -1239,6 +1342,293 @@ namespace deepx::tf return 0; } }; -}; + //less + template + class Less : public TF + { + public: + Less(vector args, vector returns) + { + this->name = "less"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "mask=less(T1,T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || mask_type!=Precision::Bool) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " " + precision_str(mask_type)+"!=bool"; + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::less(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + //lessscalar + template + class LessScalar : public TF + { + public: + LessScalar(vector args, vector returns) + { + this->name = "lessscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "mask=less(T1,scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != mask_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::lessscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + //greater + template + class Greater : public TF + { + public: + Greater(vector args, vector returns) + { + this->name = "greater"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "mask=greater(T1,T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || mask_type!=Precision::Bool) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " " + precision_str(mask_type)+"!=bool"; + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case 
Precision::Int32: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::greater(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + //greaterscalar + template + class GreaterScalar : public TF + { + public: + GreaterScalar(vector args, vector returns) + { + this->name = "greaterscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "mask=greater(T1,scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision mask_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != mask_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(mask_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::greaterscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + //switch + template + class Switch : public TF + { + public: + Switch(vector args, vector returns) + { + this->name = "switch"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "C=switch([tensors],case)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision cases_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision C_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (cases_type != Precision::Int8 ) + { + error = "Type mismatch: " + precision_str(cases_type) + " != int8"; + return 1; + } + + switch (cases_type) + { + case Precision::Float64: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case 
Precision::Float32: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::Switch(mem->gettensors(this->getvector(0)), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(cases_type); + return 1; + } + return 0; + } + }; +}; #endif diff --git a/front/py/deepx/__init__.py b/front/py/deepx/__init__.py index 355a250b..37f47669 100644 --- a/front/py/deepx/__init__.py +++ b/front/py/deepx/__init__.py @@ -1,11 +1,10 @@ -from .tensor import Tensor,Shape +from .tensor import Tensor,Shape,Number from deepx.nn.functional import * # 导入所有functional函数 from deepx.nn.functional import __all__ as _func_all # 获取functional的导出列表 __all__ = [ #tensor - 'Tensor', - 'Shape', + 'Tensor','Shape','Number', *_func_all ] diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index 9cacf7d4..1e215d7e 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -21,7 +21,7 @@ "printtensor", "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out", "add","sub","mul","div","sqrt","pow","exp","log", - "leaffunc_matmul", + "matmul", "reducemax","reducemin","sum","prod", "reshape","permute","transpose","concat","broadcastTo", diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 5b42d56f..91e0573b 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -1,4 +1,10 @@ defaultauthor=dict({ + #io + 'print':'miaobyte', + #init + 'uniform':'miaobyte', + 'constant':'miaobyte', + 'arange':'miaobyte', #elementwise 'add':'miaobyte', 'addscalar':'miaobyte', @@ -10,7 +16,7 @@ 'divscalar':'miaobyte', 'rdiv':'miaobyte', 'rdivscalar':'miaobyte', - + 'invert':'miaobyte', 'compare':'miaobyte', 'min':'miaobyte', 'minscalar':'miaobyte', @@ -20,6 +26,7 @@ 'log':'miaobyte', 'pow':'miaobyte', 'powscalar':'miaobyte', + 'rpowscalar':'miaobyte', 'sqrt':'miaobyte', #changeshape 'reshape':'miaobyte', @@ -27,7 +34,8 @@ 'broadcastTo':'miaobyte', 'concat':'miaobyte', #matmul - 'matmul':'miaobyte', + # 'matmul':'miaobyte', + 'matmul':'cublas', #reduce 'sum':'miaobyte', 'prod':'miaobyte', diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index 7adcb28f..28e5b199 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -1,4 +1,5 @@ -from deepx.tensor import Tensor +from typing import Union +from deepx.tensor import Tensor,Number from deepx.nn.functional import newtensor def rsqrt(input:Tensor)->Tensor: @@ -8,5 +9,5 @@ def rsqrt(input:Tensor)->Tensor: outtensor=newtensor(input.shape, dtype=input.dtype) 
sqrt(input,out= outtensor) return div(1,outtensor,outtensor) - + diff --git a/front/py/deepx/nn/functional/leaffunc.py b/front/py/deepx/nn/functional/leaffunc.py index 62fbb767..58d21105 100644 --- a/front/py/deepx/nn/functional/leaffunc.py +++ b/front/py/deepx/nn/functional/leaffunc.py @@ -17,9 +17,6 @@ def op_func( b: Union[Tensor, float, int] = None, out: Union[Tensor, str] = None) -> Tensor: outtensor = out - if isinstance(out, str): - outtensor = newtensor(a.shape, dtype=a.dtype, name=out) - rtf_module = importlib.import_module('deepx.nn.functional.rtf_elementwise') if isinstance(b, Tensor): an=a @@ -28,9 +25,16 @@ def op_func( newshape = Shape.broadcast_shape(a.shape, b.shape) an = a.broadcastTo(newshape) bn = b.broadcastTo(newshape) + if isinstance(out,str): + outtensor=newtensor(newshape,dtype=a.dtype,name=out) + else: + if isinstance(out,str): + outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_func = getattr(rtf_module, f'rtf_{op_name}') rtf_func(an, bn, outtensor, defaultauthor[op_name]) else: + if isinstance(out,str): + outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_func = getattr(rtf_module, f'rtf_{op_name}scalar') rtf_func(a, b, outtensor, defaultauthor[f'{op_name}scalar']) return outtensor diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py index 3cfe5157..6aa54077 100644 --- a/front/py/deepx/nn/functional/leaffunc_elementwise.py +++ b/front/py/deepx/nn/functional/leaffunc_elementwise.py @@ -1,5 +1,5 @@ from typing import Optional, Union -from deepx import Tensor,Shape +from deepx import Tensor,Shape,Number from .leaffunc import create_A_B_tf_C,create_A_tf_C from .leaffunc_life import newtensor @@ -9,53 +9,48 @@ add = create_A_B_tf_C('add') sub = create_A_B_tf_C('sub') mul = create_A_B_tf_C('mul') +_div=create_A_B_tf_C('div') -#div def div( - a: Optional[Union[Tensor, float, int]] = None, - b: Optional[Union[Tensor, float, int]] = None, - out:Union[Tensor,str]=None, - requires_grad:bool=False, - author='miaobyte')->Tensor: - if isinstance(b,Tensor) and isinstance(a,Tensor): - #C=A/B - outtensor=out - if isinstance(out,str): - outtensor=newtensor(a.shape,dtype=a.dtype,name=out) - an=a - bn=b - if a.shape!=b.shape: - newshape=Shape.broadcast_shape(a.shape,b.shape) - an=a.broadcastTo(newshape) - bn=b.broadcastTo(newshape) - from .rtf_elementwise import rtf_div - rtf_div(an,bn,outtensor,defaultauthor['div']) - return outtensor + a: Union[Tensor, float, int], + b: Union[Tensor, float, int], + out:Union[Tensor,str]=None)->Tensor: + if isinstance(a,Tensor): + return _div(a,b,out) + elif isinstance(a,float) or isinstance(a,int): + return rdiv(a,b,out) else: - if isinstance(a,Tensor): - #C=A/b - outtensor=out - if isinstance(out,str): - outtensor=newtensor(a.shape,dtype=a.dtype,name=out) - from .rtf_elementwise import rtf_divscalar - rtf_divscalar(a,b,outtensor,defaultauthor['divscalar']) - return outtensor - elif isinstance(a,float) or isinstance(a,int): - #C=a/B - outtensor=out - if isinstance(out,str): - outtensor=newtensor(b.shape,dtype=b.dtype,name=out) - from .rtf_elementwise import rtf_rdivscalar - rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar']) - return outtensor + raise ValueError(f"Invalid type for a: {type(a)}") + +#div +def rdiv( + a: Union[float, int], + b: Tensor, + out:Union[Tensor,str]=None)->Tensor: + outtensor=out + if isinstance(out,str): + outtensor=newtensor(b.shape,dtype=b.dtype,name=out) + from .rtf_elementwise import rtf_rdivscalar + 
rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar']) + return outtensor max=create_A_B_tf_C('max') min=create_A_B_tf_C('min') #pow pow=create_A_B_tf_C('pow') +def rpow(a:Number,b:Tensor,out:Union[Tensor,str]=None)->Tensor: + outtensor=out + if isinstance(out,str): + outtensor=newtensor(b.shape,dtype=b.dtype,name=out) + from .rtf_elementwise import rtf_rpowscalar + rtf_rpowscalar(a,b,outtensor,defaultauthor['rpowscalar']) + return outtensor #sqrt sqrt=create_A_tf_C('sqrt') exp=create_A_tf_C('exp') -log=create_A_tf_C('log') \ No newline at end of file +log=create_A_tf_C('log') + +#invert +invert=create_A_tf_C('invert') \ No newline at end of file diff --git a/front/py/deepx/nn/functional/leaffunc_init.py b/front/py/deepx/nn/functional/leaffunc_init.py index e0b0da90..454dc09d 100644 --- a/front/py/deepx/nn/functional/leaffunc_init.py +++ b/front/py/deepx/nn/functional/leaffunc_init.py @@ -1,18 +1,18 @@ from typing import Union import math +import time +import os from .leaffunc_life import newtensor,parse_shape from .rtf_init import * -from deepx import Tensor - +from deepx import Tensor,Number +from .authormap import defaultauthor # 命名规则 # inplace操作的函数,其名为_后缀, 返回值为空 # 非inplace操作的函数,其名为_后缀, 返回值为Tensor -def constant_(t:Tensor, - value: Union[float,int], - author='miaobyte')->Tensor: - rtf_constant(t,value,author) +def constant_(t:Tensor,value: Union[float,int])->Tensor: + rtf_constant(t,value,defaultauthor['constant']) def constant(*shape, value:Union[float,int], dtype:str='float32',name:str)->Tensor: @@ -33,22 +33,27 @@ def ones(*shape, dtype:str='float32',name:str=None)->Tensor: s = parse_shape(shape) return constant(s, value=1, dtype=dtype,name=name) -def arange_(t:Tensor,start=0,step=1,author='miaobyte')->Tensor: +def arange_(t:Tensor,start=0,step=1)->Tensor: from .rtf_init import rtf_arange - rtf_arange(t,start,step,author) -def arange(*shape,start=0,step=1,dtype:str='float32',name:str=None,author='miaobyte')->Tensor: - s = parse_shape(shape) + rtf_arange(t,start,step,defaultauthor['arange']) +#pytorch style +def arange(start:Number,end:Number,step:Number=1,dtype:str='float32',name:str=None)->Tensor: + s =[int((end-start)/step)] outtensor=newtensor(s,dtype=dtype,name=name) - arange_(outtensor,start,step,author) + arange_(outtensor,start,step) return outtensor -def uniform_(t:Tensor,low=0, high=1,seed:int=0,author='miaobyte')->Tensor: +def uniform_(t:Tensor,low=0, high=1,seed:int=None)->Tensor: + if seed is None: + seed = int(time.time() * 1000) & 0xffffffff + seed = (seed + os.getpid()) & 0xffffffff from .rtf_init import rtf_uniform - rtf_uniform(t,low,high,seed,author) -def uniform(*shape,low=0, high=1,seed:int=0,dtype:str='float32',name:str=None,author='miaobyte')->Tensor: + rtf_uniform(t,low,high,seed,defaultauthor['uniform']) + +def uniform(*shape,low=0, high=1,seed:int=None,dtype:str='float32',name:str=None)->Tensor: s = parse_shape(shape) outtensor=newtensor(s,dtype=dtype,name=name) - uniform_(outtensor,low,high,seed,author) + uniform_(outtensor,low,high,seed) return outtensor # def rand(*size, dtype=None, device=None): diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py index 98d221da..b4490803 100644 --- a/front/py/deepx/nn/functional/leaffunc_io.py +++ b/front/py/deepx/nn/functional/leaffunc_io.py @@ -1,7 +1,8 @@ from deepx.tensor import Tensor +from .authormap import defaultauthor -def printtensor(t:Tensor,format='',author='miaobyte'): +def printtensor(t:Tensor,format=''): from .rtf_io import rtf_printtensor - 
rtf_printtensor(t,format,author) + rtf_printtensor(t,format,defaultauthor['print']) return '' diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py index cf4d0905..abf6a530 100644 --- a/front/py/deepx/nn/functional/leaffunc_life.py +++ b/front/py/deepx/nn/functional/leaffunc_life.py @@ -12,7 +12,10 @@ def newtensor(*shape,dtype:str='float32',name:str=None): from .rtf_life import rtf_newtensor rtf_newtensor(t) return t - +def rnewtensor(t:Tensor): + from .rtf_life import rtf_newtensor + rtf_newtensor(t) + return t def copytensor(t:Tensor,out:Tensor): from .rtf_life import rtf_copytensor rtf_copytensor(t,out) diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py b/front/py/deepx/nn/functional/leaffunc_matmul.py index 11b793a4..bb69b838 100644 --- a/front/py/deepx/nn/functional/leaffunc_matmul.py +++ b/front/py/deepx/nn/functional/leaffunc_matmul.py @@ -1,13 +1,14 @@ from typing import Union -from deepx import Tensor +from deepx import Tensor,Shape from .leaffunc_life import newtensor from .authormap import defaultauthor def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='')->Tensor: outtensor=out if isinstance(out,str): - outtensor=newtensor(a.shape,dtype=a.dtype,name=out) + outshape=Shape.matmul(a.shape,b.shape) + outtensor=newtensor(outshape,dtype=a.dtype,name=out) from .rtf_matmul import rtf_matmul rtf_matmul(a,b,outtensor,defaultauthor['matmul']) return outtensor diff --git a/front/py/deepx/nn/functional/rtf_elementwise.py b/front/py/deepx/nn/functional/rtf_elementwise.py index 414c09f6..3b7df4a6 100644 --- a/front/py/deepx/nn/functional/rtf_elementwise.py +++ b/front/py/deepx/nn/functional/rtf_elementwise.py @@ -1,7 +1,6 @@ -from deepx.tensor import Tensor +from deepx.tensor import Tensor,Number from deepx.nn.deepxir import DeepxIR,Param from deepx.scheduler import send -from typing import Union from .rtf import A_B_op_C,A_scalar_op_C,A_op_C def rtf_add(a:Tensor, b:Tensor, out:Tensor, author='miaobyte')->Tensor: @@ -55,6 +54,13 @@ def rtf_powscalar(a:Tensor, b:float, out:Tensor, author='miaobyte')->Tensor: A_scalar_op_C("powscalar",a,b,out,author) return out +def rtf_rpowscalar(a:Number,b:Tensor,out:Tensor,author='miaobyte')->Tensor: + args = [ Param.varnum(a),Param.tensor(b)] + returns = [Param.tensor(out)] + ir = DeepxIR("rpowscalar", args, returns, author) + send(ir) + return out + def rtf_exp(a:Tensor, out:Tensor, author='miaobyte')->Tensor: A_op_C("exp",a,out,author) return out @@ -97,4 +103,8 @@ def rtf_min(a:Tensor, b:Tensor, out:Tensor, author='miaobyte')->Tensor: def rtf_minscalar(a:Tensor, b:float, out:Tensor, author='miaobyte')->Tensor: A_scalar_op_C("minscalar",a,b,out,author) + return out + +def rtf_invert(a:Tensor, out:Tensor, author='miaobyte')->Tensor: + A_op_C("invert",a,out,author) return out \ No newline at end of file diff --git a/front/py/deepx/nn/modules/container.py b/front/py/deepx/nn/modules/container.py deleted file mode 100644 index e69de29b..00000000 diff --git a/front/py/deepx/nn/modules/linear.py b/front/py/deepx/nn/modules/linear.py index c1ef3238..f1eb86e3 100644 --- a/front/py/deepx/nn/modules/linear.py +++ b/front/py/deepx/nn/modules/linear.py @@ -1,6 +1,6 @@ from .module import Module from deepx import Tensor -from deepx.nn.functional import uniform,kaiming_uniform_,calculate_fan_in_and_fan_out +from deepx.nn.functional import uniform_,kaiming_uniform_,calculate_fan_in_and_fan_out import math class Linear(Module): @@ -35,14 +35,17 @@ def reset_parameters(self) -> None: if self.bias is not 
None: fan_in, _ = calculate_fan_in_and_fan_out(self.weight) bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - uniform(self.bias, -bound, bound) + uniform_(self.bias, -bound, bound) def forward(self, input: Tensor) -> Tensor: - #`y = xA^T + b` - if self.bias is None: - return input @ self.weight.T - else: - return input @ self.weight.T + self.bias + #`y = xA^T + b` + y=input @ self.weight.T + oldshape=y.shape + if self.bias is not None: + y.reshape_(y.shape[1]) + y=y+self.bias + y.reshape_(*oldshape) + return y def extra_repr(self) -> str: return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}" diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py index bda175d4..5c7be9b2 100644 --- a/front/py/deepx/nn/modules/module.py +++ b/front/py/deepx/nn/modules/module.py @@ -1,13 +1,10 @@ import re -from typing import (Dict, Iterator, Optional, Tuple, Union, - Any, List, overload) +from typing import Dict, Iterator, Optional, Tuple, Any from collections import OrderedDict from deepx import Tensor class Module: def __init__(self, name: Optional[str] = None): - from deepx.autograd import Graph - self._graph=Graph.get_default() self._name = name or self._generate_default_name() self._parent: Optional[Module] = None self._modules: OrderedDict[str, Module] = OrderedDict() @@ -15,17 +12,14 @@ def __init__(self, name: Optional[str] = None): def _generate_default_name(self) -> str: class_name = self.__class__.__name__ - base_name = re.sub(r'(? None: self._parameters.pop(name, None) else: self._parameters[name] = param - param.addtograph(self.full_name + '.' + name) + param.name=self.full_name + '.' + name + from deepx.nn.functional.leaffunc_life import rnewtensor + rnewtensor(param) def parameters(self, recurse: bool = True) -> Iterator[Tensor]: for name, param in self.named_parameters(recurse=recurse): diff --git a/front/py/deepx/tensor/__init__.py b/front/py/deepx/tensor/__init__.py index 70f2f16f..b46990e9 100644 --- a/front/py/deepx/tensor/__init__.py +++ b/front/py/deepx/tensor/__init__.py @@ -1,4 +1,4 @@ -from .tensor import Tensor,tensor_method +from .tensor import * from .shape import Shape from .elementwise import * # 导入所有包含@tensor_method装饰的方法 from .matmul import * # 导入矩阵乘法相关方法 @@ -10,7 +10,7 @@ 'Shape', 'Tensor', 'tensor_method', - + 'Number', # 'lt', 'gt', 'eq', # 'sin', 'cos', 'tan', # 'DType', diff --git a/front/py/deepx/tensor/elementwise.py b/front/py/deepx/tensor/elementwise.py index b6cb2ef9..33ff1b97 100644 --- a/front/py/deepx/tensor/elementwise.py +++ b/front/py/deepx/tensor/elementwise.py @@ -1,6 +1,6 @@ from typing import Optional,Union -from deepx.tensor import Tensor,tensor_method +from deepx.tensor import Tensor,tensor_method,Number @tensor_method def add(self, @@ -129,6 +129,13 @@ def pow_(self, from deepx.nn.functional import pow as pow_func pow_func(self,b,self) +@tensor_method +def rpow(self, + a:Number, + out:Union[Tensor,str]=''): + from deepx.nn.functional import rpow as rpow_func + return rpow_func(a,self,out) + @tensor_method def sqrt(self,out:Union[Tensor,str]='')->Tensor: @@ -149,3 +156,10 @@ def rsqrt(self,out:Union[Tensor,str]='')->Tensor: def rsqrt_(self): from deepx.nn.functional import rsqrt as rsqrt_func rsqrt_func(self,self) + +@tensor_method +def invert(self,out:Union[Tensor,str]='')->Tensor: + from deepx.nn.functional import invert as invert_func + return invert_func(self,out) + + diff --git a/front/py/deepx/tensor/reduce.py b/front/py/deepx/tensor/reduce.py index 
b6d5bc3f..cdba12f8 100644 --- a/front/py/deepx/tensor/reduce.py +++ b/front/py/deepx/tensor/reduce.py @@ -4,28 +4,28 @@ from deepx.tensor import Tensor,tensor_method @tensor_method -def reducemax(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''): +def reducemax(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''): from deepx.nn.functional import reducemax as reduce_max_func return reduce_max_func(self,dim,keepdim,out) @tensor_method -def reducemin(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''): +def reducemin(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''): from deepx.nn.functional import reducemin as reduce_min_func return reduce_min_func(self,dim,keepdim,out) @tensor_method -def sum(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''): +def sum(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''): from deepx.nn.functional import sum as sum_func return sum_func(self,dim,keepdim,out) @tensor_method -def prod(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''): +def prod(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''): from deepx.nn.functional import prod as prod_func return prod_func(self,dim,keepdim,out) @tensor_method -def mean(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''): +def mean(self,dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''): from deepx.nn.functional import mean as mean_func - return mean_func(self,dim,keepdim,out) + return mean_func(self,dim,keepdim) \ No newline at end of file diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index ee60948d..8843bc66 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -1,6 +1,9 @@ -from typing import Optional,Union +from typing import Optional,Union,TypeAlias from .shape import Shape + +Number: TypeAlias = Union[int, float, bool] + tensorid=1 class Tensor: @@ -27,9 +30,7 @@ def __init__(self,shape:Union[tuple[int],list[int],Shape],dtype:str='float32',na self._shape = shape else: raise ValueError("Invalid shape") - - self._graph = None - self._node = None + def copy_to(self,t:'Tensor'): from deepx.nn.functional import copytensor copytensor(self,t) @@ -44,7 +45,10 @@ def clone(self,name:str=None): @property def name(self): return self._name - + @name.setter + def name(self,name:str): + self._name=name + # shape @property def shape(self,dim:int=None): @@ -87,40 +91,40 @@ def numel(self)->int: @property def dtype(self): return self._dtype - - - @property - def graph(self): - return self._graph - - @property - def node(self): - return self._node + #elementwise - def __add__(self, other): + def __add__(self, other:Union[Number,'Tensor']): return self.add(other) - def __sub__(self, other): + def __sub__(self, other:Union[Number,'Tensor']): return self.sub(other) - def __mul__(self, other): + def __mul__(self, other:Union[Number,'Tensor']): return self.mul(other) - def __truediv__(self, other): + def __truediv__(self, other:Union[Number,'Tensor']): return self.div(other) - def __rtruediv__(self, other): + def __rtruediv__(self, other:Union[Number,'Tensor']): return self.rdiv(other) + def __pow__(self, other:Union[Number,'Tensor']): + return self.pow(other) + + def __rpow__(self, other:Union[Number,'Tensor']): + return self.rpow(other) + + def __invert__(self): + return self.invert() #矩阵乘法 - def __matmul__(self, other): + def __matmul__(self, other:Union[Number,'Tensor']): return self.matmul(other) #shape操作 @property def T(self) -> str: - return 
self.transpose(1,0,out=self.node.name+".T") + return self.transpose() # 打印 def autoformat(self): @@ -128,6 +132,8 @@ def autoformat(self): self._format = '%.4f' elif self._dtype == 'int32' or self._dtype == 'int64' or self._dtype == 'int8' or self._dtype == 'int16': self._format = '%d' + elif self._dtype == 'bool': + self._format = '%d' else: self._format = '%s' def set_format(self,format:str): diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py new file mode 100644 index 00000000..0e6dd1ed --- /dev/null +++ b/front/py/deepx/transformer/modeling_rope_utils.py @@ -0,0 +1,289 @@ +from typing import Tuple +import math +from deepx import arange,Tensor + +def _compute_default_rope_parameters( + base: float = 10000.0, + head_dim: int = 0, + partial_rotary_factor: float = 1.0, +) -> Tuple[Tensor, float]: + attention_factor = 1.0 # 在这种类型的RoPE中未使用 + dim = head_dim*partial_rotary_factor + # 计算逆频率 + inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim)) + return inv_freq, attention_factor + +# def _compute_linear_scaling_rope_parameters( +# config: Optional[PretrainedConfig] = None, +# device: Optional["torch.device"] = None, +# seq_len: Optional[int] = None, +# **rope_kwargs, +# ) -> Tuple["torch.Tensor", float]: +# """ +# Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev +# Args: +# config ([`~transformers.PretrainedConfig`]): +# The model configuration. +# device (`torch.device`): +# The device to use for initialization of the inverse frequencies. +# seq_len (`int`, *optional*): +# The current sequence length. Unused for this type of RoPE. +# rope_kwargs (`Dict`, *optional*): +# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. +# Returns: +# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the +# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). +# """ +# if config is not None and len(rope_kwargs) > 0: +# raise ValueError( +# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " +# f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" +# ) +# if len(rope_kwargs) > 0: +# factor = rope_kwargs["factor"] +# elif config is not None: +# factor = config.rope_scaling["factor"] + +# # Gets the default RoPE parameters +# inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + +# # Then applies linear scaling to the frequencies. +# # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so +# # applying scaling to the inverse frequencies is equivalent. +# inv_freq /= factor +# return inv_freq, attention_factor + + +# def _compute_dynamic_ntk_parameters( +# config: Optional[PretrainedConfig] = None, +# device: Optional["torch.device"] = None, +# seq_len: Optional[int] = None, +# **rope_kwargs, +# ) -> Tuple["torch.Tensor", float]: +# """ +# Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla +# Args: +# config ([`~transformers.PretrainedConfig`]): +# The model configuration. +# device (`torch.device`): +# The device to use for initialization of the inverse frequencies. +# seq_len (`int`, *optional*): +# The current sequence length, used to update the dynamic RoPE at inference time. 
+# rope_kwargs (`Dict`, *optional*): +# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. +# Returns: +# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the +# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). +# """ +# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling +# if config is not None and len(rope_kwargs) > 0: +# raise ValueError( +# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " +# f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" +# ) +# if len(rope_kwargs) > 0: +# base = rope_kwargs["base"] +# dim = rope_kwargs["dim"] +# max_position_embeddings = rope_kwargs["max_position_embeddings"] +# factor = rope_kwargs["factor"] +# elif config is not None: +# base = config.rope_theta +# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 +# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) +# dim = int(head_dim * partial_rotary_factor) +# max_position_embeddings = config.max_position_embeddings +# factor = config.rope_scaling["factor"] + +# attention_factor = 1.0 # Unused in this type of RoPE + +# # seq_len: default to max_position_embeddings, e.g. at init time +# seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings + +# # Compute the inverse frequencies +# base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) +# inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) +# return inv_freq, attention_factor + + +# def _compute_yarn_parameters( +# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +# ) -> Tuple["torch.Tensor", float]: +# """ +# Computes the inverse frequencies with NTK scaling. Please refer to the +# [original paper](https://arxiv.org/abs/2309.00071) +# Args: +# config ([`~transformers.PretrainedConfig`]): +# The model configuration. +# device (`torch.device`): +# The device to use for initialization of the inverse frequencies. +# seq_len (`int`, *optional*): +# The current sequence length. Unused for this type of RoPE. +# rope_kwargs (`Dict`, *optional*): +# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. +# Returns: +# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the +# post-processing scaling factor applied to the computed cos/sin. +# """ +# # No need to keep BC with yarn, unreleased when this new pattern was created. 
+# if len(rope_kwargs) > 0: +# raise ValueError( +# f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" +# ) + +# base = config.rope_theta +# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 +# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) +# dim = int(head_dim * partial_rotary_factor) +# max_position_embeddings = config.max_position_embeddings +# factor = config.rope_scaling["factor"] + +# # Sets the attention factor as suggested in the paper +# attention_factor = config.rope_scaling.get("attention_factor") +# if attention_factor is None: +# attention_factor = 0.1 * math.log(factor) + 1.0 + +# # Optional config options +# # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) +# beta_fast = config.rope_scaling.get("beta_fast") or 32 +# beta_slow = config.rope_scaling.get("beta_slow") or 1 + +# # Compute the inverse frequencies +# def find_correction_dim(num_rotations, dim, base, max_position_embeddings): +# """Inverse dimension formula to find the dimension based on the number of rotations""" +# return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + +# def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): +# """Find dimension range bounds based on rotations""" +# low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) +# high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) +# return max(low, 0), min(high, dim - 1) + +# def linear_ramp_factor(min, max, dim): +# if min == max: +# max += 0.001 # Prevent singularity + +# linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) +# ramp_func = torch.clamp(linear_func, 0, 1) +# return ramp_func + +# # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs +# # to expand the possible context length. In other words, interpolation = apply scaling factor. +# pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim) +# inv_freq_extrapolation = 1.0 / pos_freqs +# inv_freq_interpolation = 1.0 / (factor * pos_freqs) + +# low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings) + +# # Get n-dimensional rotational scaling corrected for extrapolation +# inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device) +# inv_freq = ( +# inv_freq_interpolation * (1 - inv_freq_extrapolation_factor) +# + inv_freq_extrapolation * inv_freq_extrapolation_factor +# ) + +# return inv_freq, attention_factor + + +# def _compute_longrope_parameters( +# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +# ) -> Tuple["torch.Tensor", float]: +# """ +# Computes the inverse frequencies with LongRoPE scaling. Please refer to the +# [original implementation](https://github.com/microsoft/LongRoPE) +# Args: +# config ([`~transformers.PretrainedConfig`]): +# The model configuration. +# device (`torch.device`): +# The device to use for initialization of the inverse frequencies. +# seq_len (`int`, *optional*): +# The current sequence length. +# rope_kwargs (`Dict`, *optional*): +# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. 
+# Returns: +# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the +# post-processing scaling factor applied to the computed cos/sin. +# """ +# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling +# # No need to keep BC with longrope, unreleased when this new pattern was created. +# if len(rope_kwargs) > 0: +# raise ValueError( +# "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " +# f"{rope_kwargs}" +# ) + +# base = config.rope_theta +# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 +# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) +# dim = int(head_dim * partial_rotary_factor) +# long_factor = config.rope_scaling["long_factor"] +# short_factor = config.rope_scaling["short_factor"] +# factor = config.rope_scaling.get("factor") +# attention_factor = config.rope_scaling.get("attention_factor") + +# # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a +# # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two +# # values to compute the default attention scaling factor, instead of using `factor`. +# if hasattr(config, "original_max_position_embeddings"): +# original_max_position_embeddings = config.original_max_position_embeddings +# factor = config.max_position_embeddings / config.original_max_position_embeddings +# else: +# original_max_position_embeddings = config.max_position_embeddings + +# # Sets the attention factor as suggested in the paper +# if attention_factor is None: +# if factor <= 1.0: +# attention_factor = 1.0 +# else: +# attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings)) + +# # Compute the inverse frequencies -- scaled based on the target sequence length +# if seq_len and seq_len > original_max_position_embeddings: +# ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) +# else: +# ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) +# inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim +# inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) + +# return inv_freq, attention_factor + + +def _compute_llama3_parameters( base: float = 10000.0, + head_dim: int = 0, + partial_rotary_factor: float = 1.0, + factor:float=8, + low_freq_factor:float=1, + high_freq_factor:float=4, + old_context_len:int=8192, + seq_len: Optional[int] = None +) -> Tuple[Tensor, float]: + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(base, head_dim, partial_rotary_factor) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / inv_freq + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, 
inv_freq_llama) + + return inv_freq_llama, attention_factor + + +# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters +# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE +# parameterizations, as long as the callable has the same signature. +ROPE_INIT_FUNCTIONS = { + "default": _compute_default_rope_parameters, + # "linear": _compute_linear_scaling_rope_parameters, + # "dynamic": _compute_dynamic_ntk_parameters, + # "yarn": _compute_yarn_parameters, + # "longrope": _compute_longrope_parameters, + "llama3": _compute_llama3_parameters, +} + \ No newline at end of file diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/transformer/models/llama/modeling_llama.py index f9850f81..c60f34f5 100644 --- a/front/py/deepx/transformer/models/llama/modeling_llama.py +++ b/front/py/deepx/transformer/models/llama/modeling_llama.py @@ -1,7 +1,9 @@ from deepx.nn.modules import Module from deepx import Tensor,ones,rsqrt +# RMSNorm # copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# 数学公式 class LlamaRMSNorm(Module): def __init__(self, hidden_size, eps=1e-6): """ @@ -11,11 +13,80 @@ def __init__(self, hidden_size, eps=1e-6): self.weight = ones(hidden_size) self.variance_epsilon = eps - + # 和官方实现相比,尽可能inplace化 def forward(self, hidden_states:Tensor): - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states + input_clone = hidden_states.clone() + input_clone.pow_(2) + variance = input_clone.mean([-1], keepdim=True) + + variance.add_(self.variance_epsilon) + variance = rsqrt(variance) + + hidden_states.mul_(variance) + hidden_states.mul_(self.weight) + return hidden_states def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" \ No newline at end of file + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class LlamaRotaryEmbedding(Module): + from transformers.models.llama.configuration_llama import LlamaConfig + def __init__(self, config: LlamaConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = 
seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + # This .to() is needed if the model has been moved to a device after being initialized (because + # the buffer is automatically moved, but not the original copy) + self.original_inv_freq = self.original_inv_freq.to(device) + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) diff --git a/front/py/examples/1_tensor/1_new.py b/front/py/examples/1_tensor/1_new.py index c2475364..aed5e7cc 100644 --- a/front/py/examples/1_tensor/1_new.py +++ b/front/py/examples/1_tensor/1_new.py @@ -3,7 +3,7 @@ from deepx.tensor import Tensor def printall(t): - print("t=",t) + print("t.name",t.name) print("t.shape=",t.shape) print("t.shape[0]=",t.shape[0]) @@ -13,12 +13,20 @@ def printall(t): print("t.ndimension=",t.ndimension) print("t.numel=",t.numel()) print("t.dtype=", t.dtype) + t.print() -def newtensor(): +def newtensor(dtype): from deepx.nn.functional import newtensor - t=newtensor(1,2,3) + t=newtensor(1,2,3,dtype=dtype) printall(t) + if __name__ == "__main__": - newtensor() + args=sys.argv[1:] + if len(args)==0: + newtensor('float32') + elif len(args)==1: + newtensor(args[0]) + else: + print("Usage: python 1_new.py [dtype]") diff --git a/front/py/examples/2_ir/2_elementwise_compare.py b/front/py/examples/2_ir/2_elementwise_compare.py new file mode 100644 index 00000000..7f010870 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_compare.py @@ -0,0 +1,26 @@ +############-------PyTorch-------################ + +print() +import torch +torch_t1 = torch.full((2,3,4, ), 10, dtype=torch.int8) +torch_t2 = ~torch_t1 +print(torch_t2) +torch_t3 = torch.full((2,3,4, ), 2, dtype=torch.int64) +torch_t4 = ~torch_t3 +print(torch_t4) + + + +############-------DEEPX-------################ + +from deepx import Tensor,full + +print() + +t1 = full(2,3,4, value=10,dtype="int8") +t2 = ~t1 +t2.print() + +t3 = full(2,3,4, value=2,dtype="int64") +t4 = ~t3 +t4.print() \ No newline at end of file diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.py b/front/py/examples/2_ir/2_elementwise_sqrtlog.py index feb8e9f2..705219ed 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.py +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.py @@ -13,22 +13,23 @@ print(torch_t5) torch_t6 = torch.pow(torch_t5,torch_t3) print(torch_t6) - +torch_t7 = 2**torch_t1 +print(torch_t7) ############-------DEEPX-------################ import deepx print() -t1 
= deepx.arange(3*4*5,dtype='float32',name="t1") -t2 = deepx.full([3*4*5],value=2,dtype='float32',name="t2") +t1 = deepx.arange(start=0,end=3*4*5,dtype='float32',name="t1") +t2 = deepx.full((3*4*5,),value=2,dtype='float32',name="t2") t3 = deepx.sqrt(t1,out='t3') -print(t3) +t3.print() t4 = deepx.log(t2,out='t4') -print(t4) +t4.print() t5 = deepx.exp(t4,out='t5') -print(t5) +t5.print() t6 = deepx.pow(t5,t3,out='t6') -print(t6) - - +t6.print() +t7 = 2**t1 +t7.print() diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py index 5cc0cffd..144cbdf7 100644 --- a/front/py/examples/2_ir/3_matmul.py +++ b/front/py/examples/2_ir/3_matmul.py @@ -16,7 +16,7 @@ t1 = ones([3,4],dtype='float32',name="t1") t2 = ones([4,5],dtype='float32',name="t2") t3 = t1 @ t2 -print(t3) +t3.print() diff --git a/front/py/examples/3_functional/1_mean.py b/front/py/examples/3_functional/1_mean.py index 12f4c0f5..64511555 100644 --- a/front/py/examples/3_functional/1_mean.py +++ b/front/py/examples/3_functional/1_mean.py @@ -14,7 +14,7 @@ t3=arange(4,5,6,name="t3") -print(t3) +t3.print() t3_mean=mean(t3,dim=(0,1)) -print(t3_mean) +t3_mean.print() diff --git a/front/py/examples/3_functional/1_relu.py b/front/py/examples/3_functional/1_relu.py index 22b1e8cc..9cd1737e 100644 --- a/front/py/examples/3_functional/1_relu.py +++ b/front/py/examples/3_functional/1_relu.py @@ -21,7 +21,7 @@ # 当tensor.name为str时,说明其是中间变量,执行inplace操作 t2=uniform(10,10,low=-1,high=1) -print(t2) +t2.print() relu_t2=relu(t2) -print(relu_t2) +relu_t2.print() diff --git a/front/py/examples/3_functional/1_rsqrt.py b/front/py/examples/3_functional/1_rsqrt.py index c0706691..aa4926a6 100644 --- a/front/py/examples/3_functional/1_rsqrt.py +++ b/front/py/examples/3_functional/1_rsqrt.py @@ -13,6 +13,6 @@ from deepx.nn.functional import rsqrt t=arange(2,3,4,name='t') -print((t)) +t.print() rsqrt_t=rsqrt(t) -print(rsqrt_t) +rsqrt_t.print() diff --git a/front/py/examples/3_functional/1_sigmoid.py b/front/py/examples/3_functional/1_sigmoid.py index 1eace7bf..dbdfd614 100644 --- a/front/py/examples/3_functional/1_sigmoid.py +++ b/front/py/examples/3_functional/1_sigmoid.py @@ -20,8 +20,8 @@ x.sub_(3.0) print("\nDEEPX tensor:") -print(x) +x.print() out=sigmoid(x) print("\nDEEPX sigmoid result:") -print(out) +out.print() diff --git a/front/py/examples/3_functional/1_swish.py b/front/py/examples/3_functional/1_swish.py index d2ce1082..f4e8c7c3 100644 --- a/front/py/examples/3_functional/1_swish.py +++ b/front/py/examples/3_functional/1_swish.py @@ -20,8 +20,8 @@ x.sub_(3.0) print("\nDEEPX tensor:") -print(x) +x.print() out=swish(x) print("\nDEEPX swish result:") -print(out) +out.print() diff --git a/front/py/examples/3_module/1_linear.py b/front/py/examples/3_module/1_linear.py index 06eb7cfd..7ad43a91 100644 --- a/front/py/examples/3_module/1_linear.py +++ b/front/py/examples/3_module/1_linear.py @@ -3,22 +3,18 @@ import torch.nn as nn net = nn.Linear(64, 4) -input = torch.ones(1, 64) -output = net(input) +torch_input = torch.ones(1, 64) +torch_output = net(torch_input) print() -print(output) +print(torch_output) ############-------DEEPX-------################ -from deepx.nn.modules import Linear, Module -from deepx import Tensor,ones +from deepx.nn.modules import Linear +from deepx import ones net = Linear(64, 4) input=ones(1,64,name='input') out=net.forward(input) -print(out) +out.print() -import os -script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名 -str=out.graph.to_dot() 
-str.render(script_name+".dot", format='svg')
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot b/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot
deleted file mode 100644
index f2e9db0c..00000000
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg b/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg
deleted file mode 100644
index 331e5566..00000000
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
index 938e593c..8dfacfa1 100644
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
+++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
@@ -5,29 +5,21 @@
 ############### DeepX implementation ###############
-from deepx import arange, constant
+from deepx import arange, constant_
 from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm
 
 # Use the same data as the PyTorch script
-dx_input = arange(0, 48, 1, dtype="float32").reshape_(2, 3, hidden_size)
-dx_input.div_(10.0)
-dx_input.sub_(2.0)
+input = arange(2, 3, hidden_size, dtype="float32")
+input.div_(10.0)
+input.sub_(2.0)
 eps = 1e-6
-print("\nDeepX input:")
-print(dx_input)
+input.print()
 
 # DeepX computation flow
-dx_norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
+norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
 # Set the same weights
-constant(dx_norm.weight, 0.5)
+constant_(norm.weight, 0.5)
 # Forward pass
-dx_output = dx_norm(dx_input)
-
-print("\nDeepX RMSNorm result:")
-print(dx_output)
-
-import os
-script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # script name without extension
-str=dx_output.graph.to_dot()
-str.render(script_name+".dot", format='svg')
\ No newline at end of file
+output = norm(input)
+output.print()
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
index 4099feee..85ef6ced 100644
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
+++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
@@ -1,6 +1,6 @@
 ############### PyTorch implementation ###############
 import torch
-from transformers.models.llama.modeling_llama import LlamaRMSNorm as PTLlamaRMSNorm
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
 
 # Use small data so the full result can be printed
 hidden_size = 8
@@ -10,7 +10,7 @@
 print("PyTorch input:")
 print(pt_input)
 # Use the official LlamaRMSNorm implementation from the transformers library
-pt_norm = PTLlamaRMSNorm(hidden_size, eps=eps)
+pt_norm = LlamaRMSNorm(hidden_size, eps=eps)
 # Set the weight to a fixed value of 0.5
 with torch.no_grad():
     pt_norm.weight.fill_(0.5)
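
For reference: the example pair above compares DeepX's LlamaRMSNorm against the transformers implementation on the same toy input, and the deleted 1_llamarmsnorm.dot/.dot.svg files were only a rendered computational graph of that forward pass (square, mean over the last axis, add eps, reciprocal square root, scale by the weight). Below is a minimal standalone NumPy sketch of the same computation, assuming the scripts' toy setup (hidden_size 8, eps 1e-6, weight fixed at 0.5, input arange(48)/10 - 2 reshaped to (2, 3, 8)); the names are illustrative and not part of this patch.

import numpy as np

hidden_size = 8
eps = 1e-6

# Assumed toy input matching the examples: 0..47, scaled and shifted, shape (2, 3, 8)
x = (np.arange(48, dtype=np.float32) / 10.0 - 2.0).reshape(2, 3, hidden_size)
weight = np.full((hidden_size,), 0.5, dtype=np.float32)

# RMSNorm as computed by LlamaRMSNorm: x * rsqrt(mean(x^2, last axis) + eps), scaled by weight
variance = np.mean(np.square(x), axis=-1, keepdims=True)
y = weight * (x / np.sqrt(variance + eps))

print(y)  # expected to match the example scripts' output under this assumed input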