excuter(cpu/cuda):subscalar #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

miaobyte merged 1 commit into array2d:main from miaobyte:main

Mar 20, 2025

doc/excuter/op-mem-cuda/list.md

-Original file line number
+Diff line change
@@ Expand Up / @@ -5,8 +5,10 @@ @@
     | Operation | Author | Func Def | Math Formula | IR Instruction |
     |-----------|--------|------------|--------------|----------------|
     | addscalar | miaobyte | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
+    | add | cublas | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
     | add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
     | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+    | subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
     | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
     | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
     | print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
@@ Expand Down @@

doc/excuter/op-mem-ompsimd/list.md

-Original file line number
+Diff line change
@@ Expand Up / @@ -9,6 +9,7 @@ @@
     | add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
     | add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
     | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+    | subscalar | miaobyte | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1-scalar | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
     | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
     | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
     | print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
@@ Expand Down @@

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -24,7 +24,9 @@ namespace deepx::tensorfunc @@
         template <typename Author, typename T>
         struct addscalarDispatcher
         {
-            static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
+            static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output){
+                throw NotImplementError("addscalar");
+            }
         };
         template <typename Author, typename T>
@@ Expand All / @@ -36,7 +38,9 @@ namespace deepx::tensorfunc @@
         template <typename Author, typename T>
         struct subDispatcher
         {
-            static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
+            static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C){
+                throw NotImplementError("sub");
+            }
         };
         template <typename Author, typename T>
@@ Expand All / @@ -48,7 +52,9 @@ namespace deepx::tensorfunc @@
         template <typename Author, typename T>
         struct subscalarDispatcher
         {
-            static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
+            static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output){
+                throw NotImplementError("subscalar");
+            }
         };
         template <typename Author, typename T>
@@ Expand Down @@

excuter/op-mem-cuda/src/client/tfs.cpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -114,7 +114,7 @@ namespace deepx::tf @@
                                                                  {
                                                                      Param("c", DataCategory::Tensor, Precision::Any),
                                                                  })));
-            tffactory.add_tf(std::make_shared<Addscalar<miaobyte>>(vector<Param>(
+            tffactory.add_tf(std::make_shared<AddScalar<miaobyte>>(vector<Param>(
                                                                        {
                                                                            Param("A", DataCategory::Tensor, Precision::Any),
                                                                            Param("b", DataCategory::Var, Precision::Any),
@@ Expand All / @@ -133,7 +133,16 @@ namespace deepx::tf @@
                                                                        {
                                                                            Param("C", DataCategory::Tensor, Precision::Any),
                                                                        })));
+            tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
+                                                                       {
+                                                                           Param("A", DataCategory::Tensor, Precision::Any),
+                                                                           Param("b", DataCategory::Var, Precision::Any),
+                                                                       }),
+                                                                   vector<Param>(
+                                                                       {
+                                                                           Param("C", DataCategory::Tensor, Precision::Any),
+                                                                       })));
             //     opfactory.add_op(Sub_cblas<float>());
             //     opfactory.add_op(Sub_cblas<double>());
@@ Expand Down @@

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu

-Original file line number
+Diff line change
@@ Expand Up / @@ -105,7 +105,34 @@ namespace deepx::tensorfunc @@
         template void launch_sub<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size);
         template void launch_sub<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size);
+        // Element-wise C[i] = A[i] - scalar for the first `size` elements.
+        // Each thread handles the single element at its global 1-D index;
+        // threads whose index is at or beyond `size` do nothing.
+        template <typename T>
+        __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size){
+            const int tid = threadIdx.x + blockDim.x * blockIdx.x;
+            if (tid >= size) {
+                return;
+            }
+            C[tid] = A[tid] - scalar;
+        }
+        template __global__ void subscalar_kernel<double>(const double* A, const double scalar, double* C,const int size);
+        template __global__ void subscalar_kernel<float>(const float* A, const float scalar, float* C,const int size);
+        template __global__ void subscalar_kernel<half>(const half* A, const half scalar, half* C,const int size);
+        template __global__ void subscalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size);
+        template __global__ void subscalar_kernel<int64_t>(const int64_t* A, const int64_t scalar, int64_t* C,const int size);
+        template __global__ void subscalar_kernel<int32_t>(const int32_t* A, const int32_t scalar, int32_t* C,const int size);
+        template __global__ void subscalar_kernel<int16_t>(const int16_t* A, const int16_t scalar, int16_t* C,const int size);
+        template __global__ void subscalar_kernel<int8_t>(const int8_t* A, const int8_t scalar, int8_t* C,const int size);
+        // Host-side wrapper: launches subscalar_kernel<T> with a 1-D grid of
+        // `numBlocks` blocks and `blockSize` threads per block.
+        template <typename T>
+        void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) {
+            const dim3 grid(numBlocks);
+            const dim3 block(blockSize);
+            subscalar_kernel<T><<<grid, block>>>(a, scalar, c, size);
+        }
+        template void launch_subscalar<double>(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size);
+        template void launch_subscalar<float>(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size);
+        template void launch_subscalar<half>(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size);
+        template void launch_subscalar<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size);
+        template void launch_subscalar<int64_t>(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size);
+        template void launch_subscalar<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size);
+        template void launch_subscalar<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size);
+        template void launch_subscalar<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size);
     }
     #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh

-Original file line number
+Diff line change
@@ Expand Up / @@ -103,7 +103,38 @@ namespace deepx::tensorfunc @@
         template <>
         void launch_sub<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size);
+        // subscalar
+        template <typename T>
+        __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size);
+        template <typename T>
+        void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size);
+        template <>
+        void launch_subscalar<double>(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c,const int size);
+        template <>
+        void launch_subscalar<float>(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size);
+        template <>
+        void launch_subscalar<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size);
+        template <>
+        void launch_subscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size);
+        template <>
+        void launch_subscalar<int64_t>(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size);
+        template <>
+        void launch_subscalar<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size);
+        template <>
+        void launch_subscalar<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size);
+        template <>
+        void launch_subscalar<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size);
     }
     #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -55,6 +55,20 @@ namespace deepx::tensorfunc @@
                 launch_sub(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size);
             }
         };
+        // miaobyte implementation of subscalar: C = A - scalar, element-wise.
+        // Throws TensorShapeError when A and C hold a different number of elements.
+        template <typename T>
+        struct subscalarDispatcher<miaobyte, T>
+        {
+            static void subscalar(const Tensor<T> &A, const T scalar, Tensor<T> &C)
+            {
+                if (A.shape.size != C.shape.size) {
+                    throw TensorShapeError("subscalar");
+                }
+                // Nothing to compute for an empty tensor. Without this guard
+                // blockSize would be 0 and the numBlocks computation below
+                // would divide by zero.
+                if (A.shape.size == 0) {
+                    return;
+                }
+                // Cap the block at 256 threads; smaller tensors use one block
+                // with exactly as many threads as elements.
+                const int blockSize = A.shape.size > 256 ? 256 : A.shape.size;
+                // Ceiling division so every element is covered by a thread.
+                int numBlocks = (A.shape.size + blockSize - 1) / blockSize;
+                launch_subscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size);
+            }
+        };
     }
     #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP

excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -83,18 +83,18 @@ namespace deepx::tf
  
        };

        template <typename Author>

        class Addscalar : public TF

        class AddScalar : public TF

        {

        public:

            Addscalar(const vector<Param> &args, const vector<Param> &returns)

            AddScalar(const vector<Param> &args, const vector<Param> &returns)

            {

                this->name = "addscalar";

                this->author = Author::name();

                this->args = args;

                this->returns = returns;

            }

            Addscalar(string text)

            AddScalar(string text)

            {

                this->parse(text);

                this->author = Author::name();

    @@ -109,7 +109,7 @@ namespace deepx::tf
  
            }

            shared_ptr<TF> clone() const override

            {

                return make_shared<Addscalar<Author>>(*this);

                return make_shared<AddScalar<Author>>(*this);

            }

            int run(shared_ptr<MemBase> mem, string &error) override

            {

    @@ -226,6 +226,80 @@ namespace deepx::tf
  
                return 0;

            }

        };

        // TF node for the "subscalar" operation (T3 = T1 - scalar).
        //
        // args[0] names the input tensor, args[1] is the scalar variable and
        // returns[0] names the output tensor. `Author` selects which
        // tensorfunc::subscalar implementation is dispatched to.
        template <typename Author>
        class SubScalar : public TF
        {
        public:
            // Builds the node from explicit argument / return parameter lists.
            SubScalar(const vector<Param> &args, const vector<Param> &returns)
            {
                this->name = "subscalar";
                this->author = Author::name();
                this->args = args;
                this->returns = returns;
            }

            // Builds the node by parsing its textual form.
            // Throws std::runtime_error when the parsed name is not "subscalar".
            SubScalar(string text)
            {
                this->parse(text);
                this->author = Author::name();
                if (this->name != "subscalar")
                {
                    throw std::runtime_error("Invalid name: " + this->name);
                }
            }

            string math_formula() const override
            {
                return "T3=T1-scalar";
            }

            shared_ptr<TF> clone() const override
            {
                return make_shared<SubScalar<Author>>(*this);
            }

            // Executes the subtraction on the tensors held by `mem`.
            // Returns 0 on success. Returns 1 and fills `error` when the input
            // and output dtypes differ or when the dtype is not handled below.
            int run(shared_ptr<MemBase> mem, string &error) override
            {
                const auto &a_name = this->args[0].textvalue;
                const auto &c_name = this->returns[0].textvalue;

                Precision a_type = mem->gettensor(a_name).get()->shape.dtype;
                Precision c_type = mem->gettensor(c_name).get()->shape.dtype;
                if (a_type != c_type)
                {
                    error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
                    return 1;
                }
                switch (a_type)
                {
                case Precision::Float64:
                    tensorfunc::subscalar<Author, double>(*mem->gettensor<double>(a_name), this->getvar<double>(1, mem), *mem->gettensor<double>(c_name));
                    break;
                case Precision::Float32:
                    tensorfunc::subscalar<Author, float>(*mem->gettensor<float>(a_name), this->getvar<float>(1, mem), *mem->gettensor<float>(c_name));
                    break;
                case Precision::Float16:
                    tensorfunc::subscalar<Author, half>(*mem->gettensor<half>(a_name), this->getvar<half>(1, mem), *mem->gettensor<half>(c_name));
                    break;
                case Precision::BFloat16:
                    tensorfunc::subscalar<Author, nv_bfloat16>(*mem->gettensor<nv_bfloat16>(a_name), this->getvar<nv_bfloat16>(1, mem), *mem->gettensor<nv_bfloat16>(c_name));
                    break;
                case Precision::Int64:
                    // Int64 tensors must be dispatched with the 64-bit element
                    // type; using int32_t here would fetch and process them
                    // as 32-bit data.
                    tensorfunc::subscalar<Author, int64_t>(*mem->gettensor<int64_t>(a_name), this->getvar<int64_t>(1, mem), *mem->gettensor<int64_t>(c_name));
                    break;
                case Precision::Int32:
                    tensorfunc::subscalar<Author, int32_t>(*mem->gettensor<int32_t>(a_name), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(c_name));
                    break;
                case Precision::Int16:
                    tensorfunc::subscalar<Author, int16_t>(*mem->gettensor<int16_t>(a_name), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(c_name));
                    break;
                case Precision::Int8:
                    tensorfunc::subscalar<Author, int8_t>(*mem->gettensor<int8_t>(a_name), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(c_name));
                    break;
                default:
                    error = "Unsupported dtype: " + precision_str(a_type);
                    return 1;
                }
                return 0;
            }
        };

    };

    #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP

excuter/op-mem-ompsimd/src/client/tfs.cpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -140,6 +140,16 @@ namespace deepx::tf @@
                                                                      {
                                                                          Param("c", DataCategory::Tensor, Precision::Any),
                                                                      })));
+            tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
+                                                                     {
+                                                                         Param("a", DataCategory::Tensor, Precision::Any),
+                                                                         Param("scalar", DataCategory::Var, Precision::Any),
+                                                                     }),
+                                                                 vector<Param>(
+                                                                     {
+                                                                         Param("c", DataCategory::Tensor, Precision::Any),
+                                                                     })));
             //     opfactory.add_op(Addscalar_miaobyte<float>());
             //     opfactory.add_op(Addscalar_miaobyte<double>());
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

excuter(cpu/cuda):subscalar #4

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!