rsubscalar:增加rsubscalar #61

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

miaobyte merged 1 commit into array2d:main from miaobyte:main

May 18, 2025

doc/excuter/op-mem-cuda/list.md

-Original file line number
+Diff line change
@@ Expand Up / @@ -57,6 +57,9 @@ @@
     | maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
     | tan | miaobyte | T3=tan(T1) | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) |
     | sin | miaobyte | T3=sin(T1) | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
+    | less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
+    | powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
+    | rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
     | divscalar | miaobyte | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
     | log | miaobyte | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
     | addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
@@ Expand All / @@ -67,8 +70,6 @@ @@
     | minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
     | rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float32|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
     | rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
-    | less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
-    | powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
     | todtype |  none  | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> a)->(tensor<any> b) |
     | add | cublas | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
     | add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
@@ Expand Down @@

doc/excuter/op-mem-ompsimd/list.md

-Original file line number
+Diff line change
@@ Expand Up / @@ -58,6 +58,9 @@ @@
     | maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
     | tan | miaobyte | T3=tan(T1) | tan(tensor<any> A)->(tensor<any> C) |
     | sin | miaobyte | T3=sin(T1) | sin(tensor<any> A)->(tensor<any> C) |
+    | less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
+    | powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
+    | rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var<any> scalar, tensor<any> a)->(tensor<any> c) |
     | divscalar | miaobyte | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
     | log | miaobyte | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
     | addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
@@ Expand All / @@ -68,8 +71,6 @@ @@
     | minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
     | rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var<float32> scalar, tensor<any> A)->(tensor<any> C) |
     | rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
-    | less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
-    | powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
     | todtype |  none  | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> A)->(tensor<any> C) |
     | add | cblas | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
     | add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
@@ Expand Down @@

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -58,6 +58,7 @@ namespace deepx::tensorfunc @@
             subDispatcher<Author, T>::sub(A, B, C);
         }
+        // A-scalar=>C
         template <typename Author, typename T>
         struct subscalarDispatcher
         {
@@ Expand All / @@ -66,20 +67,34 @@ namespace deepx::tensorfunc @@
                 throw NotImplementError("subscalar");
             }
         };
-        // A-scalar=>C
         template <typename Author, typename T>
         void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output)
         {
             subscalarDispatcher<Author, T>::subscalar(input, value, output);
         }
+        // scalar-A=>C
+        // Primary dispatcher template for "scalar minus tensor". The member is
+        // declared deleted here, so a call only compiles when a specialization
+        // for the given Author supplies its own rsubscalar.
+        template <typename Author, typename T>
+        struct rsubscalarDispatcher
+        {
+            static void rsubscalar(const T value, const Tensor<T> &input, Tensor<T> &output) = delete;
+        };
+        // Entry point for "scalar minus tensor" (scalar-A=>C): hands the call to the
+        // rsubscalarDispatcher specialization selected by Author and T.
+        template <typename Author, typename T>
+        void rsubscalar(const T value, const Tensor<T> &input, Tensor<T> &output)
+        {
+            using Backend = rsubscalarDispatcher<Author, T>;
+            Backend::rsubscalar(value, input, output);
+        }
         // Primary dispatcher template for mul. The member is declared deleted here,
         // so a call only compiles when a specialization for the given Author
         // supplies its own mul.
         template <typename Author, typename T>
         struct mulDispatcher
         {
             static void mul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
         };
         // A*B=>C
         template <typename Author, typename T>
         void mul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
@@ Expand Down @@

excuter/op-mem-cuda/src/client/tfs.cpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -232,6 +232,16 @@ namespace deepx::tf @@
                                                                        {
                                                                            Param("C", DataCategory::Tensor, Precision::Any),
                                                                        })));
+            // rsubscalar author=miaobyte
+            // Argument order is (scalar, A) -> (C): the scalar variable comes first,
+            // the input tensor second.
+            tffactory.add_tf(std::make_shared<RSubScalar<miaobyte>>(vector<Param>(
+                                                                       {
+                                                                           Param("scalar", DataCategory::Var, Precision::Any),
+                                                                           Param("A", DataCategory::Tensor, Precision::Any),
+                                                                       }),
+                                                                   vector<Param>(
+                                                                       {
+                                                                           Param("C", DataCategory::Tensor, Precision::Any),
+                                                                       })));
             tffactory.add_tf(std::make_shared<Mul<miaobyte>>(vector<Param>(
                                                                  {
                                                                      Param("A", DataCategory::Tensor, Precision::Any),
@@ Expand Down @@

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu

-Original file line number
+Diff line change
@@ Expand Up / @@ -225,6 +225,37 @@ namespace deepx::tensorfunc @@
         template void launch_subscalar<int16_t>(const int16_t *a, const int16_t scalar, int16_t *c, const int size);
         template void launch_subscalar<int8_t>(const int8_t *a, const int8_t scalar, int8_t *c, const int size);
+        // rsubscalar: writes C[i] = scalar - A[i] for every index i below size.
+        // Each thread starts at its global index and advances by the total number
+        // of launched threads until it runs past size.
+        template <typename T>
+        __global__ void rsubscalar_kernel(const T scalar, const T *A, T *C, const int size)
+        {
+            const int step = blockDim.x * gridDim.x;
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            while (i < size)
+            {
+                C[i] = scalar - A[i];
+                i += step;
+            }
+        }
+        // Host-side launcher for rsubscalar_kernel. Launch dimensions come from
+        // BestDims(size); a failure reported by cudaGetLastError after the launch
+        // is turned into a std::runtime_error.
+        template <typename T>
+        void launch_rsubscalar(const T scalar, const T *a, T *c, const int size)
+        {
+            auto [numBlocks, blockSize] = BestDims(size);
+            rsubscalar_kernel<<<numBlocks, blockSize>>>(scalar, a, c, size);
+            const cudaError_t status = cudaGetLastError();
+            if (status == cudaSuccess)
+            {
+                return;
+            }
+            throw std::runtime_error("Failed to launch rsubscalar kernel: " + std::string(cudaGetErrorString(status)));
+        }
+        // Explicit instantiations of launch_rsubscalar, one per element type
+        // (double, float, half, nv_bfloat16, int64_t, int32_t, int16_t, int8_t).
+        template void launch_rsubscalar<double>(const double scalar, const double* a, double* c,const int size);
+        template void launch_rsubscalar<float>(const float scalar, const float* a, float* c,const int size);
+        template void launch_rsubscalar<half>(const half scalar, const half* a, half* c,const int size);
+        template void launch_rsubscalar<nv_bfloat16>(const nv_bfloat16 scalar, const nv_bfloat16* a, nv_bfloat16* c,const int size);
+        template void launch_rsubscalar<int64_t>(const int64_t scalar, const int64_t* a, int64_t* c,const int size);
+        template void launch_rsubscalar<int32_t>(const int32_t scalar, const int32_t* a, int32_t* c,const int size);
+        template void launch_rsubscalar<int16_t>(const int16_t scalar, const int16_t* a, int16_t* c,const int size);
+        template void launch_rsubscalar<int8_t>(const int8_t scalar, const int8_t* a, int8_t* c,const int size);
         // mul
         template <typename T>
         __global__ void mul_kernel(const T *A, const T *B, T *C, const int size)
@@ Expand Down @@

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh

-Original file line number
+Diff line change
@@ Expand Up / @@ -44,7 +44,15 @@ namespace deepx::tensorfunc @@
         // Declaration of the host launcher for subscalar.
         // NOTE(review): presumably computes c[i] = a[i] - scalar for size elements - confirm against the .cu definition.
         template <typename T>
         void launch_subscalar(const T* a, const T scalar, T* c,const int size);
+        // rsubscalar (scalar - A => C)
+        // Kernel declaration: takes the scalar, the input pointer A, the output
+        // pointer C and the element count.
+        template <typename T>
+        __global__ void rsubscalar_kernel(const T scalar, const T* A, T* C,const int size);
+        // Host launcher declaration with the same parameter order as the kernel.
+        template <typename T>
+        void launch_rsubscalar(const T scalar, const T* a, T* c,const int size);
         // mul
         // Kernel declaration: takes two input pointers A and B, the output
         // pointer C and the element count.
         template <typename T>
         __global__ void mul_kernel(const T* A, const T* B, T* C,const int size);
@@ Expand Down @@

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -69,6 +69,18 @@ namespace deepx::tensorfunc @@
             }
         };
+        // miaobyte specialization of rsubscalarDispatcher: rejects tensors whose
+        // element counts differ, then forwards the raw buffers to launch_rsubscalar.
+        template <typename T>
+        struct rsubscalarDispatcher<miaobyte, T>
+        {
+            static void rsubscalar(const T scalar, const Tensor<T> &A, Tensor<T> &C)
+            {
+                const bool same_size = (A.shape.size == C.shape.size);
+                if (!same_size)
+                {
+                    throw TensorShapeError("rsubscalar");
+                }
+                launch_rsubscalar(scalar, A.data, C.data, A.shape.size);
+            }
+        };
         template <typename T>
         struct mulDispatcher<miaobyte, T>
         {
@@ Expand Down @@

excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -636,6 +636,75 @@ namespace deepx::tf @@
             }
         };
+        // rsubscalar: C = scalar - A.
+        // The op is registered as rsubscalar(scalar, A) -> (C), so args[0] is the
+        // scalar variable, args[1] is the input tensor and returns[0] is the output.
+        template <typename Author>
+        class RSubScalar : public TF
+        {
+        public:
+            RSubScalar(const vector<Param> &args, const vector<Param> &returns)
+            {
+                this->name = "rsubscalar";
+                this->metadata.author = Author::name();
+                this->tftype = "elementwise";
+                this->args = args;
+                this->returns = returns;
+            }
+            string math_formula() const override
+            {
+                return "T3=scalar-T1";
+            }
+            shared_ptr<TF> clone() const override
+            {
+                return make_shared<RSubScalar<Author>>(*this);
+            }
+            // Validates that the input and output tensors exist and share a dtype,
+            // then dispatches tensorfunc::rsubscalar on that dtype.
+            // Returns 0 on success and 1 on failure; dtype problems are described
+            // in `error`, which is also passed to checktensors.
+            int run(shared_ptr<MemBase> mem, string &error) override
+            {
+                // The tensor operand is the SECOND argument; the first is the scalar.
+                const auto &a_name = this->args[1].textvalue;
+                const auto &c_name = this->returns[0].textvalue;
+                if (!checktensors({a_name, c_name}, mem, error))
+                {
+                    return 1;
+                }
+                Precision a_type = mem->gettensor(a_name).get()->shape.dtype;
+                Precision c_type = mem->gettensor(c_name).get()->shape.dtype;
+                if (a_type != c_type)
+                {
+                    error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                    return 1;
+                }
+                // The scalar is read from argument index 0 in every case below.
+                switch (a_type)
+                {
+                case Precision::Float64:
+                    tensorfunc::rsubscalar<Author, double>(this->getvar<double>(0, mem), *mem->gettensor<double>(a_name), *mem->gettensor<double>(c_name));
+                    break;
+                case Precision::Float32:
+                    tensorfunc::rsubscalar<Author, float>(this->getvar<float>(0, mem), *mem->gettensor<float>(a_name), *mem->gettensor<float>(c_name));
+                    break;
+                case Precision::Float16:
+                    tensorfunc::rsubscalar<Author, half>(this->getvar<half>(0, mem), *mem->gettensor<half>(a_name), *mem->gettensor<half>(c_name));
+                    break;
+                case Precision::BFloat16:
+                    tensorfunc::rsubscalar<Author, nv_bfloat16>(this->getvar<nv_bfloat16>(0, mem), *mem->gettensor<nv_bfloat16>(a_name), *mem->gettensor<nv_bfloat16>(c_name));
+                    break;
+                case Precision::Int64:
+                    // Int64 tensors must be processed as int64_t, not int32_t.
+                    tensorfunc::rsubscalar<Author, int64_t>(this->getvar<int64_t>(0, mem), *mem->gettensor<int64_t>(a_name), *mem->gettensor<int64_t>(c_name));
+                    break;
+                case Precision::Int32:
+                    tensorfunc::rsubscalar<Author, int32_t>(this->getvar<int32_t>(0, mem), *mem->gettensor<int32_t>(a_name), *mem->gettensor<int32_t>(c_name));
+                    break;
+                case Precision::Int16:
+                    tensorfunc::rsubscalar<Author, int16_t>(this->getvar<int16_t>(0, mem), *mem->gettensor<int16_t>(a_name), *mem->gettensor<int16_t>(c_name));
+                    break;
+                case Precision::Int8:
+                    tensorfunc::rsubscalar<Author, int8_t>(this->getvar<int8_t>(0, mem), *mem->gettensor<int8_t>(a_name), *mem->gettensor<int8_t>(c_name));
+                    break;
+                default:
+                    error = "Unsupported dtype: " + precision_str(a_type);
+                    return 1;
+                }
+                return 0;
+            }
+        };
         template <typename Author>
         class Mul : public TF
         {
@@ Expand Down @@

excuter/op-mem-ompsimd/src/client/tfs.cpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -246,6 +246,18 @@ namespace deepx::tf @@
                                                                        {
                                                                            Param("c", DataCategory::Tensor, Precision::Any),
                                                                        })));
+            // rsubscalar author=miaobyte
+            // Argument order is (scalar, a) -> (c): the scalar variable comes first,
+            // the input tensor second.
+            tffactory.add_tf(std::make_shared<RSubScalar<miaobyte>>(vector<Param>(
+                                                                       {
+                                                                           Param("scalar", DataCategory::Var, Precision::Any),
+                                                                           Param("a", DataCategory::Tensor, Precision::Any),
+                                                                       }),
+                                                                   vector<Param>(
+                                                                       {
+                                                                           Param("c", DataCategory::Tensor, Precision::Any),
+                                                                       })));
             // mul author=miaobyte
             tffactory.add_tf(std::make_shared<Mul<miaobyte>>(vector<Param>(
                                                                  {
@@ Expand Down @@

excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -194,6 +194,27 @@ namespace deepx::tensorfunc @@
             }
         };
+        // miaobyte specialization of rsubscalarDispatcher (scalar - A => C).
+        // Supplies a per-element callback and a vector callback to elementwise_A_b_C.
+        template <typename T>
+        struct rsubscalarDispatcher<miaobyte, T>
+        {
+            static void rsubscalar(const T scalar, const Tensor<T> &A, Tensor<T> &C)
+            {
+                // Scalar operation: one element at a time.
+                auto scalar_op = [](const T &x, const T &s, T &out)
+                { out = s - x; };
+                // SIMD operation: loads one vector from src, subtracts it from the
+                // broadcast scalar and stores the result to dst.
+                // NOTE(review): `size` is not used here; presumably elementwise_A_b_C
+                // invokes this once per vector-sized chunk - confirm.
+                auto simd_op = [](const T *src, const T s, T *dst, size_t size)
+                {
+                    const ScalableTag<T> tag;
+                    auto lanes_in = Load(tag, src);
+                    auto lanes_scalar = Set(tag, s);
+                    auto lanes_out = Sub(lanes_scalar, lanes_in);
+                    Store(lanes_out, tag, dst);
+                };
+                elementwise_A_b_C<T>(A, scalar, C, scalar_op, simd_op);
+            }
+        };
         // 添加 mul 的模板特化实现
         template <typename T>
         struct mulDispatcher<miaobyte, T>
@@ Expand Down @@

excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -495,6 +495,67 @@ namespace deepx::tf @@
                 return 0;
             }
         };
+        // rsubscalar: c = scalar - a.
+        // The op is registered as rsubscalar(scalar, a) -> (c), so args[0] is the
+        // scalar variable, args[1] is the input tensor and returns[0] is the output.
+        template <typename Author>
+        class RSubScalar : public TF
+        {
+        public:
+            RSubScalar(vector<Param> args, vector<Param> returns)
+            {
+                this->name = "rsubscalar";
+                this->metadata.author = Author::name();
+                this->tftype = "elementwise";
+                this->args = args;
+                this->returns = returns;
+            }
+            string math_formula() const override
+            {
+                return "T3=scalar-T1";
+            }
+            shared_ptr<TF> clone() const override
+            {
+                return make_shared<RSubScalar<Author>>(*this);
+            }
+            // Validates that the input and output tensors exist and share a dtype,
+            // then dispatches tensorfunc::rsubscalar on that dtype.
+            // Returns 0 on success and 1 on failure; dtype problems are described
+            // in `error`, which is also passed to checktensors.
+            int run(shared_ptr<MemBase> mem, string &error) override
+            {
+                // The tensor operand is the SECOND argument; the first is the scalar.
+                const auto &a_name = this->args[1].textvalue;
+                const auto &c_name = this->returns[0].textvalue;
+                if (!checktensors({a_name, c_name}, mem, error))
+                {
+                    return 1;
+                }
+                Precision a_type = mem->gettensor(a_name).get()->shape.dtype;
+                Precision c_type = mem->gettensor(c_name).get()->shape.dtype;
+                if (a_type != c_type)
+                {
+                    error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                    return 1;
+                }
+                // The scalar lives at argument index 0; index 1 is the tensor.
+                switch (a_type)
+                {
+                case Precision::Float64:
+                    tensorfunc::rsubscalar<Author, double>(this->getvar<double>(0, mem), *mem->gettensor<double>(a_name), *mem->gettensor<double>(c_name));
+                    break;
+                case Precision::Float32:
+                    tensorfunc::rsubscalar<Author, float>(this->getvar<float>(0, mem), *mem->gettensor<float>(a_name), *mem->gettensor<float>(c_name));
+                    break;
+                case Precision::Int64:
+                    tensorfunc::rsubscalar<Author, int64_t>(this->getvar<int64_t>(0, mem), *mem->gettensor<int64_t>(a_name), *mem->gettensor<int64_t>(c_name));
+                    break;
+                case Precision::Int32:
+                    tensorfunc::rsubscalar<Author, int32_t>(this->getvar<int32_t>(0, mem), *mem->gettensor<int32_t>(a_name), *mem->gettensor<int32_t>(c_name));
+                    break;
+                case Precision::Int16:
+                    tensorfunc::rsubscalar<Author, int16_t>(this->getvar<int16_t>(0, mem), *mem->gettensor<int16_t>(a_name), *mem->gettensor<int16_t>(c_name));
+                    break;
+                case Precision::Int8:
+                    tensorfunc::rsubscalar<Author, int8_t>(this->getvar<int8_t>(0, mem), *mem->gettensor<int8_t>(a_name), *mem->gettensor<int8_t>(c_name));
+                    break;
+                default:
+                    error = "Unsupported dtype: " + precision_str(a_type);
+                    return 1;
+                }
+                return 0;
+            }
+        };
         template <typename Author>
         class Mul : public TF
@@ Expand Down @@

front/py/deepx/nn/functional/authormap.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ @@
             'addscalar':'miaobyte',
             'sub':'miaobyte',
             'subscalar':'miaobyte',
+            'rsubscalar':'miaobyte',
             'mul':'miaobyte',
             'mulscalar':'miaobyte',
             'div':'miaobyte',
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

rsubscalar:增加rsubscalar #61

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!