From ab95117a1b69ae803814206c5a756a8fdb9b4cb1 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Mon, 31 Mar 2025 18:53:51 +0800 Subject: [PATCH] front&excuter:sqrt,pow,powscalar,log,exp --- doc/excuter/op-mem-cuda/list.md | 5 + doc/excuter/op-mem-ompsimd/list.md | 5 + .../src/deepx/tensorfunc/elementwise.hpp | 2 +- excuter/op-mem-cuda/src/client/tfs.cpp | 118 ++++-- .../tensorfunc/elementwise_miaobyte_sin.hpp.a | 61 +++ .../tensorfunc/elementwise_miaobyte_sqrt.cu | 188 +++++++++ .../tensorfunc/elementwise_miaobyte_sqrt.cuh | 169 ++++++++ .../tensorfunc/elementwise_miaobyte_sqrt.hpp | 88 ++++ .../src/deepx/tf/elementwise_sqrt.hpp | 378 ++++++++++++++++++ excuter/op-mem-ompsimd/src/client/tfs.cpp | 173 ++++---- .../deepx/tensorfunc/elementwise_miaobyte.hpp | 153 ++++--- .../src/deepx/tf/elementwise.hpp | 303 +++++++++++++- front/py/deepx/nn/functional/__init__.py | 2 +- front/py/deepx/nn/functional/elementwise.py | 30 +- .../examples/2_ir/2_elementwise_sqrtlog.dot | 35 ++ .../2_ir/2_elementwise_sqrtlog.dot.svg | 158 ++++++++ .../py/examples/2_ir/2_elementwise_sqrtlog.py | 38 ++ 17 files changed, 1709 insertions(+), 197 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp create mode 100644 excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp create mode 100644 front/py/examples/2_ir/2_elementwise_sqrtlog.dot create mode 100644 front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg create mode 100644 front/py/examples/2_ir/2_elementwise_sqrtlog.py diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 27bdd297..a314b05d 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -5,11 +5,15 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var 
shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | @@ -19,6 +23,7 @@ | constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | | arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 6e878c3a..47325905 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -7,11 +7,15 @@ | concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() | | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | | sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | @@ -21,6 +25,7 @@ | constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | | arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | 
add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index 4ee525c3..4e0edc6e 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -195,7 +195,7 @@ namespace deepx::tensorfunc divaddbetaDispatcher::divaddbeta(A, B, alpha, C, beta, D); } - template + template struct sqrtDispatcher { static void sqrt(const Tensor &input, Tensor &output) = delete; diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 27361136..473e55d3 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -4,6 +4,7 @@ #include "deepx/tf/print.hpp" #include "deepx/tf/init.hpp" #include "deepx/tf/elementwise_basic.hpp" +#include "deepx/tf/elementwise_sqrt.hpp" #include "deepx/tf/matmul.hpp" #include "deepx/dtype.hpp" #include "deepx/tf/tffactory.hpp" @@ -107,14 +108,14 @@ namespace deepx::tf Param("c", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("b", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); + { + Param("a", DataCategory::Tensor, Precision::Any), + Param("b", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -126,14 +127,14 @@ namespace deepx::tf }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -148,31 +149,31 @@ namespace deepx::tf Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), }), - vector( + vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), Param("b", DataCategory::Var, Precision::Any), }), vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), }), - vector( + vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), Param("scalar", DataCategory::Var, Precision::Any), }), vector( @@ -180,41 +181,72 @@ namespace deepx::tf Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( + { + Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + 
}))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( { - Param("scalar", DataCategory::Var, Precision::Any), Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), }), - vector( + vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); - - - // opfactory.add_op(Sqrt_miaobyte()); - // opfactory.add_op(Sqrt_miaobyte()); - - // opfactory.add_op(Exp_miaobyte()); - // opfactory.add_op(Exp_miaobyte()); - - // opfactory.add_op(Pow_miaobyte()); - // opfactory.add_op(Pow_miaobyte()); - - // opfactory.add_op(Powscalar_miaobyte()); - // opfactory.add_op(Powscalar_miaobyte()); - } - // matmul - void register_matmul(TfFactory &tffactory) - { - tffactory.add_tf(std::make_shared>(vector( + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), }), vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); } + // matmul + void register_matmul(TfFactory &tffactory) + { + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + } // // changeshape void register_changeshape(TfFactory &tffactory) { diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a new file mode 100644 index 00000000..f31973f3 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp.a @@ -0,0 +1,61 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/elementwise_miaobyte_basic.cuh" + +#include "stdutil/error.hpp" + +namespace deepx::tensorfunc +{ + // CUDA kernel函数声明 + + + template + struct sinDispatcher + { + static void sin(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("sin"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_sin(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct cosDispatcher + { + static void cos(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("cos"); + } + const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_cos(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct tanDispatcher + { + static void tan(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("tan"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_tan(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu new file mode 100644 index 00000000..a808d5bc --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -0,0 +1,188 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH + +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include + +namespace deepx::tensorfunc +{ + // sqrt + template + __global__ void sqrt_kernel(const T* A, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = sqrtf(A[idx]); + } + } + template __global__ void sqrt_kernel(const double* A, double* C,const int size); + template __global__ void sqrt_kernel(const float* A, float* C,const int size); + // template __global__ void sqrt_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); + // template __global__ void sqrt_kernel<__half>(const __half* A, __half* C,const int size); + template __global__ void sqrt_kernel(const int64_t* A, int64_t* C,const int size); + template __global__ void sqrt_kernel(const int32_t* A, int32_t* C,const int size); + template __global__ void sqrt_kernel(const int16_t* A, int16_t* C,const int size); + template __global__ void sqrt_kernel(const int8_t* A, int8_t* C,const int size); + + template + void launch_sqrt(int numBlocks, int blockSize, const T* a, T* c,const int size){ + sqrt_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch sqrt kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_sqrt(int numBlocks, int blockSize, const double* a, double* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const float* a, float* c,const int size); + // template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + // template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + template void launch_sqrt(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + + // pow + template + __global__ void pow_kernel(const T* A, const T* B, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = powf(A[idx], B[idx]); + } + } + template __global__ void pow_kernel(const double* A, const double* B, double* C,const int size); + template __global__ void pow_kernel(const float* A, const float* B, 
float* C,const int size); + // template __global__ void pow_kernel(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size); + // template __global__ void pow_kernel<__half>(const __half* A, const __half* B, __half* C,const int size); + template __global__ void pow_kernel(const int64_t* A, const int64_t* B, int64_t* C,const int size); + template __global__ void pow_kernel(const int32_t* A, const int32_t* B, int32_t* C,const int size); + template __global__ void pow_kernel(const int16_t* A, const int16_t* B, int16_t* C,const int size); + template __global__ void pow_kernel(const int8_t* A, const int8_t* B, int8_t* C,const int size); + + template + void launch_pow(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size){ + pow_kernel<<>>(a, b, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch pow kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_pow(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const float* a, const float* b, float* c,const int size); + // template void launch_pow(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); + // template void launch_pow<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); + template void launch_pow(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + + // powscalar + template + __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = powf(A[idx], scalar); + } + } + template __global__ void powscalar_kernel(const double* A, const double scalar, double* C,const int size); + template __global__ void powscalar_kernel(const float* A, const float scalar, float* C,const int size); + // template __global__ void powscalar_kernel(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size); + // template __global__ void powscalar_kernel<__half>(const __half* A, const __half scalar, __half* C,const int size); + template __global__ void powscalar_kernel(const int64_t* A, const int64_t scalar, int64_t* C,const int size); + template __global__ void powscalar_kernel(const int32_t* A, const int32_t scalar, int32_t* C,const int size); + template __global__ void powscalar_kernel(const int16_t* A, const int16_t scalar, int16_t* C,const int size); + template __global__ void powscalar_kernel(const int8_t* A, const int8_t scalar, int8_t* C,const int size); + + template + void launch_powscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size){ + powscalar_kernel<<>>(a, scalar, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch powscalar kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_powscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int 
size); + template void launch_powscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); + // template void launch_powscalar(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); + // template void launch_powscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); + template void launch_powscalar(int numBlocks, int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); + + // log + template + __global__ void log_kernel(const T* A, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = logf(A[idx]); + } + } + template __global__ void log_kernel(const double* A, double* C,const int size); + template __global__ void log_kernel(const float* A, float* C,const int size); + // template __global__ void log_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); + // template __global__ void log_kernel<__half>(const __half* A, __half* C,const int size); + template __global__ void log_kernel(const int64_t* A, int64_t* C,const int size); + template __global__ void log_kernel(const int32_t* A, int32_t* C,const int size); + template __global__ void log_kernel(const int16_t* A, int16_t* C,const int size); + template __global__ void log_kernel(const int8_t* A, int8_t* C,const int size); + + template + void launch_log(int numBlocks, int blockSize, const T* a, T* c,const int size){ + log_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch log kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_log(int numBlocks, int blockSize, const double* a, double* c,const int size); + template void launch_log(int numBlocks, int blockSize, const float* a, float* c,const int size); + // template void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + // template void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + template void launch_log(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + // exp + template + __global__ void exp_kernel(const T* A, T* C,const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = expf(A[idx]); + } + } + template __global__ void exp_kernel(const double* A, double* C,const int size); + template __global__ void exp_kernel(const float* A, float* C,const int size); + // template __global__ void exp_kernel(const nv_bfloat16* A, nv_bfloat16* C,const int size); + // template __global__ void exp_kernel<__half>(const __half* A, __half* C,const int size); + template __global__ void exp_kernel(const int64_t* A, int64_t* 
C,const int size); + template __global__ void exp_kernel(const int32_t* A, int32_t* C,const int size); + template __global__ void exp_kernel(const int16_t* A, int16_t* C,const int size); + template __global__ void exp_kernel(const int8_t* A, int8_t* C,const int size); + + template + void launch_exp(int numBlocks, int blockSize, const T* a, T* c,const int size){ + exp_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error("Failed to launch exp kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_exp(int numBlocks, int blockSize, const double* a, double* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const float* a, float* c,const int size); + // template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + // template void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + template void launch_exp(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh new file mode 100644 index 00000000..dd428cbd --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh @@ -0,0 +1,169 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH +#include +#include + + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + // sqrt + template + __global__ void sqrt_kernel(const T* A, T* C,const int size); + + template + void launch_sqrt(int numBlocks, int blockSize, const T* a, T* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const double* a, double* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const float* a, float* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + + template <> + void launch_sqrt<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + + template <> + void launch_sqrt(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + // pow + template + __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); + + template + void launch_pow(int numBlocks, int blockSize, const T* a, const T* b, T* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const double* a, const double* b, double* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, 
const float* a, const float* b, float* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size); + + template <> + void launch_pow<__half>(int numBlocks, int blockSize, const __half* a, const __half* b, __half* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size); + + template <> + void launch_pow(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size); + + // powscalar + template + __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); + + template + void launch_powscalar(int numBlocks, int blockSize, const T* a, const T scalar, T* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const double* a, const double scalar, double* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const float* a, const float scalar, float* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size); + + template <> + void launch_powscalar<__half>(int numBlocks, int blockSize, const __half* a, const __half scalar, __half* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size); + + template <> + void launch_powscalar(int numBlocks, int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size); + + // log + template + __global__ void log_kernel(const T* A, T* C,const int size); + + template + void launch_log(int numBlocks, int blockSize, const T* a, T* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const double* a, double* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const float* a, float* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + + template <> + void launch_log<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + + template <> + void launch_log(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); + + // exp + template + __global__ void exp_kernel(const T* A, T* C,const int size); + + template + void launch_exp(int numBlocks, int blockSize, const T* a, T* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const double* a, double* c,const int size); + + template <> + 
void launch_exp(int numBlocks, int blockSize, const float* a, float* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c,const int size); + + template <> + void launch_exp<__half>(int numBlocks, int blockSize, const __half* a, __half* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int64_t* a, int64_t* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int32_t* a, int32_t* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int16_t* a, int16_t* c,const int size); + + template <> + void launch_exp(int numBlocks, int blockSize, const int8_t* a, int8_t* c,const int size); +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp new file mode 100644 index 00000000..38afe270 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp @@ -0,0 +1,88 @@ +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_HPP +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_HPP + +#include "deepx/tensorfunc/elementwise.hpp" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh" +#include "stdutil/error.hpp" + +namespace deepx::tensorfunc +{ + // CUDA kernel函数声明 + + + template + struct sqrtDispatcher + { + static void sqrt(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("sqrt"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_sqrt(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct powDispatcher + { + static void pow(const Tensor &A, const Tensor &B, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("pow"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_pow(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size); + } + }; + + template + struct powscalarDispatcher + { + static void powscalar(const Tensor &A, const T scalar, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("powscalar"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_powscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size); + } + }; + + template + struct logDispatcher + { + static void log(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("log"); + } + const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_log(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + template + struct expDispatcher + { + static void exp(const Tensor &A, Tensor &C) + { + if (A.shape.size != C.shape.size) { + throw TensorShapeError("exp"); + } + const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; + int numBlocks = (A.shape.size + blockSize - 1) / blockSize; + launch_exp(numBlocks, blockSize, A.data, C.data, A.shape.size); + } + }; + + +} + +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp new file mode 100644 index 00000000..3865c03b --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp @@ -0,0 +1,378 @@ +#ifndef DEEPX_TF_ELEMENTWISE_SQRT_HPP +#define DEEPX_TF_ELEMENTWISE_SQRT_HPP + +#include +#include +#include "deepx/tensorfunc/elementwise_miaobyte_sqrt.hpp" + +namespace deepx::tf +{ + + template + class Sqrt : public TF + { + public: + Sqrt(const vector &args, const vector &returns) + { + this->name = "sqrt"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Sqrt(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "sqrt") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=sqrt(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::Float16: + // tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::Float16: + // tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Pow : public TF + { + public: + Pow(const vector &args, const vector &returns) + { + this->name = "pow"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Pow(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "pow") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=pow(T1, T2)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = 
mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type || b_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::BFloat16: + // tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::Float16: + // tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class PowScalar : public TF + { + public: + PowScalar(const vector &args, const vector &returns) + { + this->name = "powscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + PowScalar(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "powscalar") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=pow(T1, scalar)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type || b_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::BFloat16: + // tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), 
*mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::Float16: + // tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Log : public TF + { + public: + Log(const vector &args, const vector &returns) + { + this->name = "log"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Log(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "log") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } + string math_formula() const override + { + return "T3=log(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::Float16: + // tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::BFloat16: + // tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Exp : public TF + { + public: + Exp(const vector &args, const vector &returns) + { + this->name = "exp"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + Exp(string text) + { + this->parse(text); + this->author = Author::name(); + if (this->name != "exp") + { + throw std::runtime_error("Invalid name: " + this->name); + } + } 
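+        // (Annotation added for clarity; not part of the original patch.) The text
+        // constructor parses an IR line of the form "exp(tensor A)->(tensor C)" and
+        // rejects any op whose parsed name is not "exp". run() below then switches on
+        // the input tensor's Precision and forwards to the author-specific
+        // tensorfunc::exp dispatcher; the Float16/BFloat16 cases are still commented out.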
+ string math_formula() const override + { + return "T3=exp(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + // case Precision::Float16: + // tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + // case Precision::BFloat16: + // tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + // break; + case Precision::Int64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(a_type); + return 1; + } + return 0; + } + }; +}; +#endif // DEEPX_TF_ELEMENTWISE_SQRT_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 59dfba65..afd1ee0a 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -112,44 +112,44 @@ namespace deepx::tf Param("c", DataCategory::Tensor, Precision::Any), }))); - tffactory.add_tf(std::make_shared>(vector( + tffactory.add_tf(std::make_shared>(vector( + { + Param("a", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("b", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("a", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( { - Param("a", DataCategory::Tensor, Precision::Float64|Precision::Float32), - Param("b", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("a", DataCategory::Tensor, Precision::Any), + Param("b", DataCategory::Tensor, Precision::Any), }), vector( { - Param("c", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("c", DataCategory::Tensor, Precision::Any), }))); - - tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); - tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("b", DataCategory::Tensor, 
Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("a", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("c", DataCategory::Tensor, Precision::Any), - }))); + { + Param("a", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("c", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { @@ -159,7 +159,7 @@ namespace deepx::tf vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -169,7 +169,7 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -179,44 +179,35 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); - - + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("scalar", DataCategory::Var, Precision::Any), - Param("A", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); - - // opfactory.add_op(Sqrt_miaobyte()); - // opfactory.add_op(Sqrt_miaobyte()); - - // opfactory.add_op(Exp_miaobyte()); - // opfactory.add_op(Exp_miaobyte()); - - // opfactory.add_op(Pow_miaobyte()); - // opfactory.add_op(Pow_miaobyte()); + { + Param("scalar", DataCategory::Var, Precision::Any), + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); - // opfactory.add_op(Powscalar_miaobyte()); - // opfactory.add_op(Powscalar_miaobyte()); - } - // matmul - void register_matmul(TfFactory &tffactory) - { - tffactory.add_tf(std::make_shared>(vector( + tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), Param("B", DataCategory::Tensor, Precision::Any), @@ -225,15 +216,53 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + } + // matmul + void register_matmul(TfFactory &tffactory) + { + tffactory.add_tf(std::make_shared>(vector( + { + 
Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), - Param("B", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("B", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), - }))); + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + }))); } // // changeshape void register_changeshape(TfFactory &tffactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index e2e85677..f7bacc0b 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -101,7 +101,7 @@ namespace deepx::tensorfunc // 通用实现 template - struct addDispatcher + struct addDispatcher { static void add(const Tensor &A, const Tensor &B, Tensor &C) { @@ -122,7 +122,7 @@ namespace deepx::tensorfunc }; template - struct addscalarDispatcher + struct addscalarDispatcher { static void addscalar(const Tensor &A, const T value, Tensor &C) { @@ -143,7 +143,7 @@ namespace deepx::tensorfunc // 添加 sub 的模板特化实现 template - struct subDispatcher + struct subDispatcher { static void sub(const Tensor &A, const Tensor &B, Tensor &C) { @@ -163,7 +163,7 @@ namespace deepx::tensorfunc }; template - struct subscalarDispatcher + struct subscalarDispatcher { static void subscalar(const Tensor &A, const T value, Tensor &C) { @@ -184,7 +184,7 @@ namespace deepx::tensorfunc // 添加 mul 的模板特化实现 template - struct mulDispatcher + struct mulDispatcher { static void mul(const Tensor &A, const Tensor &B, Tensor &C) { @@ -204,7 +204,7 @@ namespace deepx::tensorfunc }; template - struct mulscalarDispatcher + struct mulscalarDispatcher { static void mulscalar(const Tensor &A, const T value, Tensor &C) { @@ -224,10 +224,10 @@ namespace deepx::tensorfunc }; template - struct muladdDispatcher + struct muladdDispatcher { // A*B+C=D - static void muladd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) + static void muladd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) @@ -270,10 +270,10 @@ namespace deepx::tensorfunc }; template - struct muladdscalarDispatcher + struct muladdscalarDispatcher { // A*B*alpha+C*beta=D - static void muladdscalar(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) + static void muladdscalar(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) { @@ -329,10 +329,10 @@ namespace deepx::tensorfunc }; template - struct mulscalaraddDispatcher + struct mulscalaraddDispatcher { // A*alpha+B*beta=C - static void mulscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) + static void mulscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) { @@ -379,7 +379,7 @@ namespace deepx::tensorfunc // 添加 div 的模板特化实现 template - struct divDispatcher + struct 
divDispatcher { static void div(const Tensor &A, const Tensor &B, Tensor &C) { @@ -399,7 +399,7 @@ namespace deepx::tensorfunc }; template - struct divscalarDispatcher + struct divscalarDispatcher { static void divscalar(const Tensor &A, const T value, Tensor &C) { @@ -419,7 +419,7 @@ namespace deepx::tensorfunc }; template - struct rdivscalarDispatcher + struct rdivscalarDispatcher { static void rdivscalar(const T value, const Tensor &In, Tensor &Out) { @@ -439,10 +439,10 @@ namespace deepx::tensorfunc }; template - struct divaddDispatcher + struct divaddDispatcher { // D= A/B+ C - static void divadd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) + static void divadd(const Tensor &A, const Tensor &B, const Tensor &C, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) { @@ -481,13 +481,13 @@ namespace deepx::tensorfunc throw std::invalid_argument("shape mismatch"); } } - }; + }; template - struct divscalaraddDispatcher + struct divscalaraddDispatcher { // C= A/alpha+ B/beta - static void divscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) + static void divscalaradd(const Tensor &A, const T alpha, const Tensor &B, const T beta, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) { @@ -533,10 +533,10 @@ namespace deepx::tensorfunc }; template - struct divaddbetaDispatcher + struct divaddbetaDispatcher { // D= A/B*alpha+ C*beta - static void divaddbeta(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) + static void divaddbeta(const Tensor &A, const Tensor &B, const T alpha, const Tensor &C, const T beta, Tensor &D) { if (A.shape == B.shape && A.shape == C.shape && A.shape == D.shape) { @@ -584,7 +584,7 @@ namespace deepx::tensorfunc }; template - struct sqrtDispatcher + struct sqrtDispatcher>> { static void sqrt(const Tensor &input, Tensor &output) { @@ -624,31 +624,66 @@ namespace deepx::tensorfunc } } }; + template + struct sqrtDispatcher>> + { + static void sqrt(const Tensor &input, Tensor &output) + { + if (input.shape == output.shape) + { + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i) + { + int shape_last = output.shape[-1]; + + size_t j = 0; + + while (j < shape_last) + { + output.data[i + j] = std::sqrt(input.data[i + j]); + ++j; + } + }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; template - struct powDispatcher + struct powDispatcher { // C=A^B - static void pow(const Tensor &A, Tensor &B, Tensor &C) + static void pow(const Tensor &A, const Tensor &B, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim, [&A, &B, &C](int i) - { C.data[i] = std::pow(A.data[i], B.data[i]); }); + C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C](int i) + { + for (int j = 0; j < C.shape[-1]; j++) + C.data[i+j] = std::pow(A.data[i+j], B.data[i+j]); }); } else { throw std::invalid_argument("shape mismatch"); } } + }; + + template + struct powscalarDispatcher + { // C=A^value // highway 不支持POW static void powscalar(const Tensor &input, const T value, Tensor &output) { if (input.shape == output.shape) { - output.shape.rangeParallel(output.shape.dim, [&input, &output, &value](int i) - { output.data[i] = std::pow(input.data[i], value); }); + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output, &value](int i) + { + for (int j = 0; j < output.shape[-1]; j++) + output.data[i+j] = std::pow(input.data[i+j], value); }); } else { @@ -658,15 +693,16 @@ namespace 
deepx::tensorfunc }; template - struct logDispatcher - { + struct logDispatcher + { // hwy库没有log函数,所以只能用std::log static void log(const Tensor &input, Tensor &output) { if (input.shape == output.shape) { - output.shape.rangeParallel(output.shape.dim, [&input, &output](int i) - { output.data[i] = std::log(input.data[i]); }); + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i) + { for (int j = 0; j < output.shape[-1]; j++) + output.data[i+j] = std::log(input.data[i+j]); }); } else { @@ -676,15 +712,16 @@ namespace deepx::tensorfunc }; template - struct expDispatcher - { + struct expDispatcher + { // 发现hwy库没有exp函数,所以只能用std::exp static void exp(const Tensor &input, Tensor &output) { if (input.shape == output.shape) { - output.shape.rangeParallel(output.shape.dim, [&input, &output](int i) - { output.data[i] = std::exp(input.data[i]); }); + output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i) + { for (int j = 0; j < output.shape[-1]; j++) + output.data[i+j] = std::exp(input.data[i+j]); }); } else { @@ -694,9 +731,9 @@ namespace deepx::tensorfunc }; template - struct sinDispatcher - { - + struct sinDispatcher + { + static void sin(const Tensor &input, Tensor &output) { if (input.shape == output.shape) @@ -737,9 +774,9 @@ namespace deepx::tensorfunc }; template - struct cosDispatcher - { - + struct cosDispatcher + { + static void cos(const Tensor &input, Tensor &output) { if (input.shape == output.shape) @@ -780,9 +817,9 @@ namespace deepx::tensorfunc }; template - struct tanDispatcher - { - + struct tanDispatcher + { + static void tan(const Tensor &input, Tensor &output) { if (input.shape == output.shape) @@ -823,8 +860,8 @@ namespace deepx::tensorfunc }; template - struct maxDispatcher - { + struct maxDispatcher + { static void max(const Tensor &A, const Tensor &B, Tensor &C) { if (A.shape == B.shape && A.shape == C.shape) @@ -866,7 +903,7 @@ namespace deepx::tensorfunc }; template - struct maxgradDispatcher + struct maxgradDispatcher { static void maxgrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) { @@ -893,9 +930,9 @@ namespace deepx::tensorfunc }; template - struct maxscalarDispatcher + struct maxscalarDispatcher { - static void maxscalar(const Tensor &A,const T b, Tensor &C) + static void maxscalar(const Tensor &A, const T b, Tensor &C) { if (A.shape == C.shape) { @@ -936,7 +973,7 @@ namespace deepx::tensorfunc }; template - struct maxscalargradDispatcher + struct maxscalargradDispatcher { static void maxscalargrad(const Tensor &A, const T b, Tensor &A_grad, const Tensor &output_grad) { @@ -960,7 +997,7 @@ namespace deepx::tensorfunc }; template - struct minDispatcher + struct minDispatcher { static void min(const Tensor &A, const Tensor &B, Tensor &C) { @@ -1003,7 +1040,7 @@ namespace deepx::tensorfunc }; template - struct mingradDispatcher + struct mingradDispatcher { static void mingrad(const Tensor &A, const Tensor &B, Tensor &A_grad, Tensor &B_grad, const Tensor &output_grad) { @@ -1030,9 +1067,9 @@ namespace deepx::tensorfunc }; template - struct minscalarDispatcher + struct minscalarDispatcher { - static void minscalar(const Tensor &A,const T b, Tensor &C) + static void minscalar(const Tensor &A, const T b, Tensor &C) { if (A.shape == C.shape) { @@ -1069,10 +1106,10 @@ namespace deepx::tensorfunc throw std::invalid_argument("shape mismatch"); } } - }; + }; template - struct minscalargradDispatcher + struct minscalargradDispatcher { static void minscalargrad(const Tensor &A, const T b, Tensor 
&A_grad, const Tensor &output_grad) { diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 622463d5..26dde852 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -49,7 +49,7 @@ namespace deepx::tf tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::add(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -105,7 +105,7 @@ namespace deepx::tf tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::addscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -161,7 +161,7 @@ namespace deepx::tf tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::sub(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -217,7 +217,7 @@ namespace deepx::tf tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::subscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -274,7 +274,7 @@ namespace deepx::tf tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::mul(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -330,7 +330,7 @@ namespace deepx::tf tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::mulscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -387,7 +387,7 @@ namespace deepx::tf tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::div(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -443,7 +443,7 @@ namespace deepx::tf tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::divscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); @@ -499,7 +499,7 @@ namespace deepx::tf tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: tensorfunc::rdivscalar(this->getvar(0, mem),*mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); @@ -517,7 +517,288 @@ namespace deepx::tf return 0; } }; - -} + template + class Sqrt : public TF + { + public: + Sqrt(vector args, vector returns) + { + this->name = "sqrt"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=sqrt(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::sqrt(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Pow : public TF + { + public: + Pow(vector args, vector returns) + { + this->name = "pow"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=T1^T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != b_type || a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::pow(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class PowScalar : public TF + { + public: + PowScalar(vector args, vector returns) + { + this->name = "powscalar"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=T1^scalar"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + 
precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::powscalar(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Log : public TF + { + public: + Log(vector args, vector returns) + { + this->name = "log"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=log(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::log(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + + template + class Exp : public TF + { + public: + Exp(vector args, vector returns) + { + this->name = "exp"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T3=exp(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 
1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::exp(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + +}; + #endif diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index f724b698..07610c30 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -10,7 +10,7 @@ "newtensor", "printtensor", "constant","full","zeros","ones","uniform","arange","rand","randn","eye","kaiming_uniform_","calculate_fan_in_and_fan_out", - "add","sub","mul","div","clamp","exp","sqrt","rsqrt", + "add","sub","mul","div","clamp","sqrt","pow","exp","log","rsqrt", "matmul", "max","min","sum","prod","mean", "transpose","reshape","broadcast_shape","broadcast_to","unsqueeze", diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index ecf3c0c3..56acc975 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -208,14 +208,14 @@ def clamp( varir=DeepxIR("clamp", a.dtype, [a.node.name,min,max], [outtensor.node.name]) send(str(varir)) return outtensor -#exp -OpNode.register("exp") -def exp( - a:Tensor, + +#sqrt +OpNode.register("sqrt") +def sqrt( + input:Tensor, out:Union[Tensor,str]='')->Tensor: - return _A_elementwiseop_C(a,"exp",out) -#pow -# todo + return _A_elementwiseop_C(input,"sqrt",out) + OpNode.register("pow") OpNode.register("powscalar") def pow( @@ -226,12 +226,20 @@ def pow( return _A_b_elementwiseop_C(a,b,"powscalar",out) else: return _A_B_elementwiseop_C(a,b,"pow",out) -#sqrt -OpNode.register("sqrt") -def sqrt( + +#exp +OpNode.register("exp") +def exp( + a:Tensor, + out:Union[Tensor,str]='')->Tensor: + return _A_elementwiseop_C(a,"exp",out) +#log +OpNode.register("log") +def log( input:Tensor, out:Union[Tensor,str]='')->Tensor: - return _A_elementwiseop_C(input,"sqrt",out) + return _A_elementwiseop_C(input,"log",out) + def rsqrt( input:Tensor, diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot new file mode 100644 index 00000000..4e476571 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot @@ -0,0 +1,35 @@ +// Computational Graph +digraph { + rankdir=TB + node [shape=record] + 140074505155728 [label="t1 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140076479891344 [label="t2 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481968 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503482016 
[label="var_1 +2" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481920 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503481824 [label="t3 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481728 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503482304 [label="t4 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503482544 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503482640 [label="t5 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503487056 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 140074503486960 [label="t6 +(60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 140074503481968 -> 140076479891344 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482016 -> 140074503481968 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074505155728 -> 140074503481920 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503481920 -> 140074503481824 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140076479891344 -> 140074503481728 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503481728 -> 140074503482304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482304 -> 140074503482544 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482544 -> 140074503482640 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503482640 -> 140074503487056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503481824 -> 140074503487056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 140074503487056 -> 140074503486960 [arrowsize=0.8 color=gray40 penwidth=1.2] +} diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg new file mode 100644 index 00000000..a517b63b --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg @@ -0,0 +1,158 @@ + + + + + + +%3 + + + +140074505155728 + +t1 +(60,) + + + +140074503481920 + +sqrt + + + +140074505155728->140074503481920 + + + + + +140076479891344 + +t2 +(60,) + + + +140074503481728 + +log + + + +140076479891344->140074503481728 + + + + + +140074503481968 + +constant + + + +140074503481968->140076479891344 + + + + + +140074503482016 + +var_1 +2 + + + +140074503482016->140074503481968 + + + + + +140074503481824 + +t3 +(60,) + + + +140074503481920->140074503481824 + + + + + +140074503487056 + +pow + + + +140074503481824->140074503487056 + + + + + +140074503482304 + +t4 +(60,) + + + +140074503481728->140074503482304 + + + + + +140074503482544 + +exp + + + +140074503482304->140074503482544 + + + + + +140074503482640 + +t5 +(60,) + + + +140074503482544->140074503482640 + + + + + +140074503482640->140074503487056 + + + + + +140074503486960 + +t6 +(60,) + + + +140074503487056->140074503486960 + + + + + diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.py b/front/py/examples/2_ir/2_elementwise_sqrtlog.py new file mode 100644 index 00000000..908efd69 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.py @@ -0,0 +1,38 @@ + +############-------PyTorch-------################ + +import torch +torch_t1 = torch.arange(3*4*5, dtype=torch.float32) 
+torch_t2 = torch.full((3*4*5,), 2, dtype=torch.float32)
+
+torch_t3 = torch.sqrt(torch_t1)
+print(torch_t3)
+torch_t4 = torch.log(torch_t2)
+print(torch_t4)
+torch_t5 = torch.exp(torch_t4)
+print(torch_t5)
+torch_t6 = torch.pow(torch_t5, torch_t3)
+print(torch_t6)
+
+############-------DEEPX-------################
+
+import deepx
+print()
+
+t1 = deepx.arange(end=3*4*5, dtype='float32', name="t1")
+t2 = deepx.full([3*4*5], value=2, dtype='float32', name="t2")
+t3 = deepx.sqrt(t1, out='t3')
+print(t3)
+t4 = deepx.log(t2, out='t4')
+print(t4)
+t5 = deepx.exp(t4, out='t5')
+print(t5)
+t6 = deepx.pow(t5, t3, out='t6')
+print(t6)
+
+import os
+script_name = os.path.splitext(os.path.basename(os.path.abspath(__file__)))[0]  # script name without extension
+dot = t3.graph.to_dot()  # dot source of t3's computational graph ("dot" avoids shadowing the builtin str)
+dot.render(script_name + ".dot", format='svg')
+
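
Note on the example above: as a quick sanity check of what the new sqrt/log/exp/pow front-end ops are expected to produce, here is a minimal NumPy sketch. It is not part of the patch; NumPy serves only as a reference implementation, and the t1..t6 names simply mirror the example. Since t5 = exp(log(2)) = 2 up to float32 rounding, the whole chain reduces to t6 = 2 ** sqrt(t1), and both the PyTorch and DeepX halves of 2_elementwise_sqrtlog.py should print values matching this reference.

    # reference-only sketch: expected values for the sqrt/log/exp/pow example
    import numpy as np

    t1 = np.arange(3 * 4 * 5, dtype=np.float32)      # 0, 1, ..., 59
    t2 = np.full((3 * 4 * 5,), 2, dtype=np.float32)  # all twos

    t3 = np.sqrt(t1)       # sqrt(T1)
    t4 = np.log(t2)        # log(2) ~= 0.6931
    t5 = np.exp(t4)        # exp(log(2)) == 2, up to rounding
    t6 = np.power(t5, t3)  # pow: 2 ** sqrt(t1)

    print(t3, t4, t5, t6, sep="\n")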