From f71f809bfd2b3ce3280d5ff440c854ad891d23b0 Mon Sep 17 00:00:00 2001
From: miaobyte <734991033@qq.com>
Date: Thu, 20 Mar 2025 16:04:06 +0800
Subject: [PATCH 1/4] excuter(cpu/cuda):subscalar

---
 doc/excuter/op-mem-cuda/list.md               |   2 +
 doc/excuter/op-mem-ompsimd/list.md            |   1 +
 .../src/deepx/tensorfunc/elementwise.hpp      |  12 +-
 excuter/op-mem-cuda/src/client/tfs.cpp        |  13 +-
 .../tensorfunc/elementwise_miaobyte_basic.cu  |  29 ++-
 .../tensorfunc/elementwise_miaobyte_basic.cuh |  33 ++-
 .../tensorfunc/elementwise_miaobyte_basic.hpp |  14 ++
 .../src/deepx/tf/elementwise_basic.hpp        |  82 ++++++-
 excuter/op-mem-ompsimd/src/client/tfs.cpp     |  10 +
 .../src/deepx/tf/elementwise.hpp              | 222 +++++++++++-------
 10 files changed, 320 insertions(+), 98 deletions(-)
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index 171779a4..9913a248 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -5,8 +5,10 @@
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | addscalar | miaobyte | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
+| add | cublas | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+| subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
 | print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
index 581ab8f9..f10183f4 100644
--- a/doc/excuter/op-mem-ompsimd/list.md
+++ b/doc/excuter/op-mem-ompsimd/list.md
@@ -9,6 +9,7 @@
 | add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
 | add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+| subscalar | miaobyte | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1-scalar | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
 | print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
index bf6fd053..e05506f7 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
@@ -24,7 +24,9 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct addscalarDispatcher
     {
-        static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
+        static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output){
+            throw NotImplementError("addscalar"); // primary-template fallback: an <Author, T> pair without its own specialization throws at call time
+        }
     };
 
     template <typename Author, typename T>
@@ -36,7 +38,9 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct subDispatcher
     {
-        static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
+        static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C){
+            throw NotImplementError("sub"); // primary-template fallback: an <Author, T> pair without its own specialization throws at call time
+        }
     };
 
     template <typename Author, typename T>
@@ -48,7 +52,9 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct subscalarDispatcher
     {
-        static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
+        static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output){
+            throw NotImplementError("subscalar"); // primary-template fallback: an <Author, T> pair without its own specialization throws at call time
+        }
     };
 
     template <typename Author, typename T>
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index 41108a7f..cfcbec3b 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -114,7 +114,7 @@ namespace deepx::tf
                                                              {
                                                                  Param("c", DataCategory::Tensor, Precision::Any),
                                                              })));
-        tffactory.add_tf(std::make_shared<Addscalar<miaobyte>>(vector<Param>(
+        tffactory.add_tf(std::make_shared<AddScalar<miaobyte>>(vector<Param>(
                                                                    {
                                                                        Param("A", DataCategory::Tensor, Precision::Any),
                                                                        Param("b", DataCategory::Var, Precision::Any),
@@ -133,7 +133,16 @@ namespace deepx::tf
                                                                    {
                                                                        Param("C", DataCategory::Tensor, Precision::Any),
                                                                    })));
-        
+        tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
+                                                                   {
+                                                                       Param("A", DataCategory::Tensor, Precision::Any),
+                                                                       Param("b", DataCategory::Var, Precision::Any),
+                                                                   }),
+                                                               vector<Param>(
+                                                                   {
+                                                                       Param("C", DataCategory::Tensor, Precision::Any),
+                                                                   })));
+
         //     opfactory.add_op(Sub_cblas<float>());
         //     opfactory.add_op(Sub_cblas<double>());
 
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
index f66950ac..f4836cd6 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -105,7 +105,34 @@ namespace deepx::tensorfunc
     template void launch_sub<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size);
     template void launch_sub<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size);    
     
-    
+    template <typename T> // each thread handles at most one element: C[idx] = A[idx] - scalar when idx < size
+    __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size){
+        int idx = blockIdx.x * blockDim.x + threadIdx.x; // flat index of this thread across the 1-D launch
+        if (idx < size) { // threads whose index falls past the last element write nothing
+            C[idx] = A[idx] - scalar;
+        }
+    }   
+    template __global__ void subscalar_kernel<double>(const double* A, const double scalar, double* C,const int size);
+    template __global__ void subscalar_kernel<float>(const float* A, const float scalar, float* C,const int size);
+    template __global__ void subscalar_kernel<half>(const half* A, const half scalar, half* C,const int size);
+    template __global__ void subscalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size);
+    template __global__ void subscalar_kernel<int64_t>(const int64_t* A, const int64_t scalar, int64_t* C,const int size);  
+    template __global__ void subscalar_kernel<int32_t>(const int32_t* A, const int32_t scalar, int32_t* C,const int size);  
+    template __global__ void subscalar_kernel<int16_t>(const int16_t* A, const int16_t scalar, int16_t* C,const int size);  
+    template __global__ void subscalar_kernel<int8_t>(const int8_t* A, const int8_t scalar, int8_t* C,const int size);  
+    // Host-side launcher: starts subscalar_kernel with numBlocks blocks of blockSize threads; no launch-error check or synchronization is done here.
+    template <typename T>
+    void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { 
+        subscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
+    }
+    template void launch_subscalar<double>(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size);
+    template void launch_subscalar<float>(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size);
+    template void launch_subscalar<half>(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size);
+    template void launch_subscalar<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size);  
+    template void launch_subscalar<int64_t>(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size);  
+    template void launch_subscalar<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size);  
+    template void launch_subscalar<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size);  
+    template void launch_subscalar<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size);    
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
index 77102fc9..966cfa1c 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
@@ -103,7 +103,38 @@ namespace deepx::tensorfunc
 
     template <> 
     void launch_sub<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size);
-    
+
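+    // subscalar: element-wise C[i] = A[i] - scalar (kernel and launcher are defined and instantiated in elementwise_miaobyte_basic.cu)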
+    template <typename T>
+    __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size);
+
+    template <typename T>
+    void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size);
+
+    template <>
+    void launch_subscalar<double>(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c,const int size);
+
+    template <>
+    void launch_subscalar<float>(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size);
+
+    template <>
+    void launch_subscalar<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size);
+
+    template <>
+    void launch_subscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size);
+
+    template <>
+    void launch_subscalar<int64_t>(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size);
+
+    template <>
+    void launch_subscalar<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size);
+
+    template <>
+    void launch_subscalar<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size);
+
+    template <>
+    void launch_subscalar<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size);    
+ 
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
index 2da8ec9c..0500dd60 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
@@ -55,6 +55,20 @@ namespace deepx::tensorfunc
             launch_sub(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size);
         }
     };
+    // subscalar (miaobyte, CUDA): C[i] = A[i] - scalar; throws TensorShapeError when A and C differ in element count.
+    template <typename T>
+    struct subscalarDispatcher<miaobyte, T>
+    {
+        static void subscalar(const Tensor<T> &A, const T scalar, Tensor<T> &C)
+        {
+            if (A.shape.size != C.shape.size) throw TensorShapeError("subscalar");
+            // Empty tensor: nothing to compute, and blockSize would be 0, making the numBlocks division below divide by zero.
+            if (A.shape.size == 0) return;
+            const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; // at most 256 threads per block
+            int numBlocks = (A.shape.size + blockSize - 1) / blockSize;    // ceil(size / blockSize)
+            launch_subscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size);
+        }
+    };  
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
index 218432a8..c0910a99 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
@@ -83,10 +83,10 @@ namespace deepx::tf
     };
 
     template <typename Author>
-    class Addscalar : public TF
+    class AddScalar : public TF
     {
     public:
-        Addscalar(const vector<Param> &args, const vector<Param> &returns)
+        AddScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "addscalar";
             this->author = Author::name();
@@ -94,7 +94,7 @@ namespace deepx::tf
             this->returns = returns;
         }
 
-        Addscalar(string text)
+        AddScalar(string text)
         {
             this->parse(text);
             this->author = Author::name();
@@ -109,7 +109,7 @@ namespace deepx::tf
         }
         shared_ptr<TF> clone() const override
         {
-            return make_shared<Addscalar<Author>>(*this);
+            return make_shared<AddScalar<Author>>(*this);
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
@@ -226,6 +226,80 @@ namespace deepx::tf
             return 0;
         }
     };
+    // SubScalar: TF named "subscalar" (T3=T1-scalar); run() dispatches on the input tensor's dtype to tensorfunc::subscalar<Author, T>.
+    template <typename Author>
+    class SubScalar : public TF
+    {
+    public:
+        SubScalar(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "subscalar";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+        // Builds the TF from instruction text; throws std::runtime_error unless the parsed name is "subscalar".
+        SubScalar(string text)
+        {
+            this->parse(text);
+            this->author = Author::name();
+            if (this->name != "subscalar")
+            {
+                throw std::runtime_error("Invalid name: " + this->name);
+            }
+        }
+        string math_formula() const override
+        {
+            return "T3=T1-scalar";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<SubScalar<Author>>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override // returns 0 on success; 1 with `error` set on dtype mismatch or unsupported dtype
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != c_type) // input and output tensors must share one dtype
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            switch (a_type)
+            {
+            case Precision::Float64:
+                tensorfunc::subscalar<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<double>(1, mem), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::subscalar<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Float16:
+                tensorfunc::subscalar<Author, half>(*mem->gettensor<half>(this->args[0].textvalue), this->getvar<half>(1, mem), *mem->gettensor<half>(this->returns[0].textvalue));
+                break;
+            case Precision::BFloat16:
+                tensorfunc::subscalar<Author, nv_bfloat16>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), this->getvar<nv_bfloat16>(1, mem), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64: // 64-bit tensors must use the int64_t instantiation; int32_t would misread the element storage
+                tensorfunc::subscalar<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), this->getvar<int64_t>(1, mem), *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::subscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::subscalar<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::subscalar<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+ 
+    
 };
 
 #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
index 6bae8e79..b2de5145 100644
--- a/excuter/op-mem-ompsimd/src/client/tfs.cpp
+++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ -140,6 +140,16 @@ namespace deepx::tf
                                                                  {
                                                                      Param("c", DataCategory::Tensor, Precision::Any),
                                                                  }))); 
+
+        tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
+                                                                 {
+                                                                     Param("a", DataCategory::Tensor, Precision::Any),
+                                                                     Param("scalar", DataCategory::Var, Precision::Any),
+                                                                 }),
+                                                             vector<Param>(
+                                                                 {
+                                                                     Param("c", DataCategory::Tensor, Precision::Any),
+                                                                 }))); 
         //     opfactory.add_op(Addscalar_miaobyte<float>());
         //     opfactory.add_op(Addscalar_miaobyte<double>());
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
index 1702c644..5487a2a7 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
@@ -8,11 +8,13 @@
 #include "deepx/tensorfunc/authors.hpp"
 #include "deepx/tensorfunc/elementwise_miaobyte.hpp"
 #include "deepx/tensorfunc/elementwise_cblas.hpp"
-namespace deepx::tf {
+namespace deepx::tf
+{
 
     template <typename Author>
-    class Add : public TF {
-        public:
+    class Add : public TF
+    {
+    public:
         Add(vector<Param> args, vector<Param> returns)
         {
             this->name = "add";
@@ -23,7 +25,7 @@ namespace deepx::tf {
         string math_formula() const override
         {
             return "T3=T1+T2";
-        }   
+        }
         shared_ptr<TF> clone() const override
         {
             return make_shared<Add<Author>>(*this);
@@ -38,38 +40,38 @@ namespace deepx::tf {
                 error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
                 return 1;
             }
-            switch (a_type) 
+            switch (a_type)
             {
-                case Precision::Float64:
-                    tensorfunc::add<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
-                    break;
-                case Precision::Float32:
-                    tensorfunc::add<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int64:
-                    tensorfunc::add<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int32:
-                    tensorfunc::add<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int16:
-                    tensorfunc::add<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int8:
-                    tensorfunc::add<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
-                    break;
-                default:
-                    error = "Unsupported dtype: " + precision_str(a_type);
-                    return 1;
+            case Precision::Float64:
+                tensorfunc::add<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::add<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                tensorfunc::add<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::add<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::add<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::add<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
             }
             return 0;
         }
-        
     };
 
     template <typename Author>
-    class AddScalar : public TF {
-        public:
+    class AddScalar : public TF
+    {
+    public:
         AddScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "addscalar";
@@ -80,7 +82,7 @@ namespace deepx::tf {
         string math_formula() const override
         {
             return "T3=T1+scalar";
-        }   
+        }
         shared_ptr<TF> clone() const override
         {
             return make_shared<AddScalar<Author>>(*this);
@@ -94,37 +96,37 @@ namespace deepx::tf {
                 error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
                 return 1;
             }
-            switch (a_type) 
+            switch (a_type)
             {
-                case Precision::Float64:
-                    tensorfunc::addscalar<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<double>(1, mem), *mem->gettensor<double>(this->returns[0].textvalue));
-                    break;
-                case Precision::Float32:
-                    tensorfunc::addscalar<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), *mem->gettensor<float>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int64:
-                    tensorfunc::addscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int32:
-                    tensorfunc::addscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int16:
-                    tensorfunc::addscalar<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int8:
-                    tensorfunc::addscalar<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
-                    break;
-                default:
-                    error = "Unsupported dtype: " + precision_str(a_type);
-                    return 1;
+            case Precision::Float64:
+                tensorfunc::addscalar<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<double>(1, mem), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::addscalar<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                tensorfunc::addscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::addscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::addscalar<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::addscalar<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
             }
             return 0;
         }
-        
     };
     template <typename Author>
-    class Sub : public TF {
-        public:
+    class Sub : public TF
+    {
+    public:
         Sub(vector<Param> args, vector<Param> returns)
         {
             this->name = "sub";
@@ -135,7 +137,7 @@ namespace deepx::tf {
         string math_formula() const override
         {
             return "T3=T1-T2";
-        }   
+        }
         shared_ptr<TF> clone() const override
         {
             return make_shared<Sub<Author>>(*this);
@@ -150,43 +152,89 @@ namespace deepx::tf {
                 error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
                 return 1;
             }
-            switch (a_type) 
+            switch (a_type)
             {
-                case Precision::Float64:
-                    tensorfunc::sub<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
-                    break;
-                case Precision::Float32:
-                    tensorfunc::sub<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int64:
-                    tensorfunc::sub<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int32:
-                    tensorfunc::sub<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int16:
-                    tensorfunc::sub<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
-                    break;
-                case Precision::Int8:
-                    tensorfunc::sub<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
-                    break;
-                default:
-                    error = "Unsupported dtype: " + precision_str(a_type);
-                    return 1;
+            case Precision::Float64:
+                tensorfunc::sub<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::sub<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                tensorfunc::sub<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::sub<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::sub<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::sub<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
             }
             return 0;
         }
-        
     };
-    
 
+    template <typename Author>
+    class SubScalar : public TF // elementwise "tensor - scalar" op; implementation selected by Author
+    {
+    public:
+        SubScalar(vector<Param> args, vector<Param> returns) // args[0]: input tensor name, arg 1: scalar; returns[0]: output tensor name
+        {
+            this->name = "subscalar";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override // human-readable formula of this op
+        {
+            return "T3=T1-scalar";
+        }
+        shared_ptr<TF> clone() const override // copy of this op wrapped in a shared_ptr<TF>
+        {
+            return make_shared<SubScalar<Author>>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override // returns 0 on success, 1 with `error` filled in on failure
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != c_type) // input and output tensors must share one dtype
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            switch (a_type) // pick the template instantiation matching the element type
+            {
+            case Precision::Float64:
+                tensorfunc::subscalar<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<double>(1, mem), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::subscalar<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64: // 64-bit tensors need the int64_t instantiation, not int32_t
+                tensorfunc::subscalar<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), this->getvar<int64_t>(1, mem), *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::subscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::subscalar<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::subscalar<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default: // any other precision is rejected
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
 }
 
-
-
-
-
-
-
-
 #endif

From 1a92984eae9942d88c45247b68fe797925252507 Mon Sep 17 00:00:00 2001
From: miaobyte <734991033@qq.com>
Date: Wed, 26 Mar 2025 18:19:30 +0800
Subject: [PATCH 2/4] =?UTF-8?q?front:newtensor,print=20=E8=81=94=E5=90=88?=
 =?UTF-8?q?=E8=B0=83=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/deepx/tf/elementwise.hpp              | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
index 5487a2a7..d3342110 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
@@ -235,6 +235,119 @@ namespace deepx::tf
             return 0;
         }
     };
+
+    template <typename Author>
+    class Mul : public TF // elementwise "tensor * tensor" op; implementation selected by Author
+    {
+    public:
+        Mul(vector<Param> args, vector<Param> returns) // args[0], args[1]: input tensor names; returns[0]: output tensor name
+        {
+            this->name = "mul";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override // human-readable formula of this op
+        {
+            return "T3=T1*T2";
+        }
+        shared_ptr<TF> clone() const override // copy of this op wrapped in a shared_ptr<TF>
+        {
+            return make_shared<Mul<Author>>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override // returns 0 on success, 1 with `error` filled in on failure
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != b_type || a_type != c_type) // both inputs and the output must share one dtype
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            switch (a_type) // pick the template instantiation matching the element type
+            {
+            case Precision::Float64:
+                tensorfunc::mul<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::mul<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64: // 64-bit tensors need the int64_t instantiation, not int32_t
+                tensorfunc::mul<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::mul<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::mul<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::mul<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default: // any other precision is rejected
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+
+    template <typename Author>
+    class MulScalar : public TF // elementwise "tensor * scalar" op; implementation selected by Author
+    {
+    public:
+        MulScalar(vector<Param> args, vector<Param> returns) // args[0]: input tensor name, arg 1: scalar; returns[0]: output tensor name
+        {
+            this->name = "mulscalar";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override // human-readable formula of this op
+        {
+            return "T3=T1*scalar";
+        }
+        shared_ptr<TF> clone() const override // copy of this op wrapped in a shared_ptr<TF>
+        {
+            return make_shared<MulScalar<Author>>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override // returns 0 on success, 1 with `error` filled in on failure
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != c_type) // input and output tensors must share one dtype
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            switch (a_type) // pick the template instantiation matching the element type
+            {
+            case Precision::Float64:
+                tensorfunc::mulscalar<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<double>(1, mem), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::mulscalar<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64: // 64-bit tensors need the int64_t instantiation, not int32_t
+                tensorfunc::mulscalar<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), this->getvar<int64_t>(1, mem), *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::mulscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::mulscalar<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::mulscalar<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default: // any other precision is rejected
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
 }
 
 #endif

From 640d3cf5950b811a1720e8520eac4308b2dc6cb1 Mon Sep 17 00:00:00 2001
From: miaobyte <734991033@qq.com>
Date: Wed, 26 Mar 2025 18:20:00 +0800
Subject: [PATCH 3/4] =?UTF-8?q?front:newtensor,print=20=E8=81=94=E5=90=88?=
 =?UTF-8?q?=E8=B0=83=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 front/py/deepx/nn/deepxir.py          | 40 +++++++++++++++++++--------
 front/py/deepx/nn/functional/new.py   |  4 +--
 front/py/deepx/nn/functional/print.py |  4 +--
 3 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/front/py/deepx/nn/deepxir.py b/front/py/deepx/nn/deepxir.py
index 6dce38ea..988afdc2 100644
--- a/front/py/deepx/nn/deepxir.py
+++ b/front/py/deepx/nn/deepxir.py
@@ -1,14 +1,36 @@
-from typing import Tuple, List, Optional
+from typing import Tuple, List, Optional,Union
 import time
 from datetime import datetime  # 添加datetime模块
 
+class Param:  # one IR parameter: a text value plus optional category / precision tags
+    def __init__(self, value:Optional[Union[str,int,float,list,tuple]], category:str=None,precision:str=None):
+        # Convert each supported value kind to its text form; anything else is rejected.
+        if isinstance(value,str):
+            text=value
+        elif isinstance(value,(int,float)):
+            text=str(value)
+        elif isinstance(value,(list,tuple)):
+            text='['+' '.join(map(str,value))+']'
+        else:
+            raise ValueError(f"Invalid value type: {type(value)}")
+        self._textvalue=text
+        self._category=category
+        self._precision=precision
+
+    def __str__(self):
+        # Yields "category<precision> text", "category text", or the bare text when no category is set.
+        if self._category is None:
+            return self._textvalue
+        if self._precision is None:
+            return f"{self._category} {self._textvalue}"
+        return f"{self._category}<{self._precision}> {self._textvalue}"
+
 class DeepxIR:
     def __init__(self, 
                 name:str,
-                dtype:str,
-                args: List[str], 
-                returns: List[str],
-                author:str):
+                args: List[Param], 
+                returns: List[Param],
+                author:str=''):
         """
         初始化操作节点
         Args:
@@ -17,8 +39,7 @@ def __init__(self,
             author: tensorfunc的作者名称,如"miaobyte"
         """
  
-        self._name = name  
-        self._dtype = dtype
+        self._name = name
         self._args = args
         self._returns = returns
         self._author = author
@@ -28,10 +49,7 @@ def __init__(self,
 
     def __str__(self):
         # 函数名部分
-        if self._dtype == None or self._dtype == '':
-            parts = [self._name]
-        else:
-            parts = [f"{self._name}@{self._dtype}"]
+        parts = [self._name]
         
         # 处理输入参数部分 - 使用括号和逗号分隔
         args_parts = []
diff --git a/front/py/deepx/nn/functional/new.py b/front/py/deepx/nn/functional/new.py
index 879eda7d..1cc14dff 100644
--- a/front/py/deepx/nn/functional/new.py
+++ b/front/py/deepx/nn/functional/new.py
@@ -1,6 +1,6 @@
 from deepx.tensor import Tensor
 from deepx.autograd.graph import Graph
-from deepx.nn.deepxir import DeepxIR
+from deepx.nn.deepxir import DeepxIR,Param
 from deepx.scheduler import send
 
 def newtensor(t:Tensor,name:str=None):
@@ -8,7 +8,7 @@ def newtensor(t:Tensor,name:str=None):
     t._graph = graph
     t._node=graph.add_tensor(name,t=t)
     if t.graph.eager:
-        ir2=DeepxIR("newtensor", t.dtype, t.shape, [t._node.name])
+        ir2=DeepxIR("newtensor",[Param(t.shape)], [Param(t._node.name,category='tensor',precision=t.dtype)])
         send(ir2)
 def copytensor(t:Tensor,out:Tensor):
     graph = Graph.get_default()
diff --git a/front/py/deepx/nn/functional/print.py b/front/py/deepx/nn/functional/print.py
index b4c11fb6..2eb2bb25 100644
--- a/front/py/deepx/nn/functional/print.py
+++ b/front/py/deepx/nn/functional/print.py
@@ -4,8 +4,8 @@
 from deepx.scheduler import send
 
 OpNode.register("print")
-def printtensor(t:Tensor,format=''):
-    ir=DeepxIR("print",'', [t.node.name,format], [])
+def printtensor(t:Tensor,format='',author='miaobyte'):  # builds a "print" IR for tensor t and sends it; always returns ''
+    ir=DeepxIR("print",[t.node.name,format], [],author)  # args: node name and format as plain strings, no returns. NOTE(review): DeepxIR annotates args as List[Param] - confirm plain strings are accepted
     send(ir)
     return ''
 

From 73bc7ba905da6451fdfe3127102d739ca85e050c Mon Sep 17 00:00:00 2001
From: harryharrygo <harryharrygogogo@gmail.com>
Date: Thu, 20 Mar 2025 22:47:55 +0800
Subject: [PATCH 4/4] Fix build error in gcc compiler. (#5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the gcc/g++ 13 compiler, the build fails with this error:
```
dtype.hpp:8:29: error: found ‘:’ in nested-name-specifier, expected ‘::’
8 | enum class DataCategory : uint8_t
```
---
 excuter/cpp-common/src/deepx/dtype.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/excuter/cpp-common/src/deepx/dtype.hpp b/excuter/cpp-common/src/deepx/dtype.hpp
index 9b9e24e8..b93a2a7a 100644
--- a/excuter/cpp-common/src/deepx/dtype.hpp
+++ b/excuter/cpp-common/src/deepx/dtype.hpp
@@ -2,6 +2,8 @@
 #define DEEPX_DTYPE_HPP
 
 #include <string>
+#include <cstdint>
+
 namespace deepx
 {