From ea84ac1b5853ad76ef2e30bb31340c4f3fdceaca Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Thu, 3 Apr 2025 15:58:06 +0800 Subject: [PATCH 1/7] =?UTF-8?q?fp16&bf16:cuda=E7=89=88=E6=9C=AC=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=E3=80=8212.1->12.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tensorfunc/elementwise_miaobyte_sin.cu | 24 ++++- .../tensorfunc/elementwise_miaobyte_sin_a.cu | 67 ------------- .../tensorfunc/elementwise_miaobyte_sqrt.cu | 37 +++++++- .../tensorfunc/elementwise_miaobyte_sqrt_a.cu | 95 ------------------- 4 files changed, 51 insertions(+), 172 deletions(-) delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu index 23b78dbf..00bd232c 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -2,7 +2,7 @@ #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU #include - +#include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -35,7 +35,15 @@ namespace deepx::tensorfunc C[idx] = hsin(A[idx]); } } - + template <> + __global__ void sin_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hsin(A[idx]); + } + } + template void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size){ sin_kernel<<>>(a, c, size); @@ -49,7 +57,7 @@ namespace deepx::tensorfunc template void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); template void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); template void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); // cos template __global__ void cos_kernel(const T* A, T* C, const int size); @@ -75,7 +83,13 @@ namespace deepx::tensorfunc C[idx] = hcos(A[idx]); } } - + template <> + __global__ void cos_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + C[idx] = hcos(A[idx]); + } + } template void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size){ cos_kernel<<>>(a, c, size); @@ -88,7 +102,7 @@ namespace deepx::tensorfunc template void launch_cos(int numBlocks, int blockSize, const double* a, double* c, const int size); template void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); template void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); // tan template __global__ void tan_kernel(const T* A, T* C, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu deleted file mode 100644 index 0660c3fa..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin_a.cu +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef 
DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU -#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU - -#include - -#include "deepx/tensorfunc/cuda.hpp" -#include "deepx/tensorfunc/authors.hpp" - -namespace deepx::tensorfunc -{ - // sin - template - __global__ void sin_kernel(const T *A, T *C, const int size); - - template <> - __global__ void sin_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hsin(A[idx]); - } - } - - template - void launch_sin(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - sin_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sin kernel: " + - std::string(cudaGetErrorString(err))); - } - } - - template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - - // cos - template - __global__ void cos_kernel(const T *A, T *C, const int size); - - template <> - __global__ void cos_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hcos(A[idx]); - } - } - template - void launch_cos(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - cos_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch cos kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - -} - -#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_A_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index fe5c92ca..95307389 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -2,6 +2,7 @@ #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU #include +#include #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" #include @@ -39,7 +40,15 @@ namespace deepx::tensorfunc C[idx] = hsqrt(A[idx]); } } - + template <> + __global__ void sqrt_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hsqrt(A[idx]); + } + } template void launch_sqrt(int numBlocks, int blockSize, const T *a, T *c, const int size) { @@ -54,7 +63,7 @@ namespace deepx::tensorfunc template void launch_sqrt(int numBlocks, int blockSize, const double *a, double *c, const int size); template void launch_sqrt(int numBlocks, int blockSize, const float *a, float *c, const int size); template void launch_sqrt<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); - + template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); // pow template __global__ void pow_kernel(const T *A, const T *B, T *C, const int size); @@ -159,7 +168,16 @@ namespace deepx::tensorfunc C[idx] = hlog(A[idx]); } } - + template <> + __global__ void log_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hlog(A[idx]); + } + } + template void launch_log(int numBlocks, int blockSize, const T *a, T *c, const int size) 
{ @@ -174,7 +192,7 @@ namespace deepx::tensorfunc template void launch_log(int numBlocks, int blockSize, const double *a, double *c, const int size); template void launch_log(int numBlocks, int blockSize, const float *a, float *c, const int size); template void launch_log<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); - + template void launch_log(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); // exp template __global__ void exp_kernel(const T *A, T *C, const int size); @@ -206,6 +224,15 @@ namespace deepx::tensorfunc C[idx] = hexp(A[idx]); } } + template <> + __global__ void exp_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + C[idx] = hexp(A[idx]); + } + } template void launch_exp(int numBlocks, int blockSize, const T *a, T *c, const int size) @@ -221,6 +248,6 @@ namespace deepx::tensorfunc template void launch_exp(int numBlocks, int blockSize, const double *a, double *c, const int size); template void launch_exp(int numBlocks, int blockSize, const float *a, float *c, const int size); template void launch_exp<__half>(int numBlocks, int blockSize, const __half *a, __half *c, const int size); + template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); } - #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu deleted file mode 100644 index 4c67fa04..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt_a.cu +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU -#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU - -#include - -#include "deepx/tensorfunc/cuda.hpp" -#include "deepx/tensorfunc/authors.hpp" -#include - -namespace deepx::tensorfunc -{ - // sqrt - template - __global__ void sqrt_kernel(const T *A, T *C, const int size); - template <> - - template <> - __global__ void sqrt_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hsqrt(A[idx]); - } - } - - template - void launch_sqrt(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - sqrt_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sqrt kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_sqrt(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - - // log - template - __global__ void log_kernel(const T *A, T *C, const int size); - - template <> - __global__ void log_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hlog(A[idx]); - } - } - - template - void launch_log(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - log_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch log kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_log(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); - - // exp - template - __global__ void exp_kernel(const T *A, T *C, const int 
size); - - template <> - __global__ void exp_kernel(const nv_bfloat16 *A, nv_bfloat16 *C, const int size) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - C[idx] = hexp(A[idx]); - } - } - - template - void launch_exp(int numBlocks, int blockSize, const T *a, T *c, const int size) - { - exp_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch exp kernel: " + - std::string(cudaGetErrorString(err))); - } - } - template void launch_exp(int numBlocks, int blockSize, const nv_bfloat16 *a, nv_bfloat16 *c, const int size); -} - -#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_A_CU From 0830944dcd8540a32efd905ffcf41787830f3b72 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Thu, 3 Apr 2025 16:04:43 +0800 Subject: [PATCH 2/7] =?UTF-8?q?tf:todo=E6=B8=85=E5=8D=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/deepx/tensorfunc/changeshape.hpp | 144 ++++++++++++++---- .../src/deepx/tensorfunc/elementwise.hpp | 26 ++++ .../src/deepx/tensorfunc/matmul.hpp | 13 -- .../src/deepx/tensorfunc/reduce.hpp | 56 +++++++ 4 files changed, 197 insertions(+), 42 deletions(-) create mode 100644 excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 6ae0ba86..dd190dac 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -6,43 +6,129 @@ namespace deepx::tensorfunc { - - // 通用模板声明 + template - struct InitDispatcher + struct reshapeDispatcher { static void reshape(Tensor &tensor, const Shape &new_shape) = delete; }; + // reshape(A,new_shape)=>B template void reshape(Tensor &tensor, const Shape &new_shape) { - InitDispatcher::reshape(tensor, new_shape); - } - - // // 作者特化示例(类型无关实现) - // template - // struct InitDispatcher - // { - // static void reshape(Tensor &tensor, const Shape &new_shape) - // { - // // 统一实现,不依赖T的类型 - // if (tensor.shape.size() != new_shape.size()) - // { - // throw std::invalid_argument("Total elements must match"); - // } - // tensor.shape = new_shape; - // } - // }; - // 特化作者和具体精度 - // template <> - // struct InitDispatcher - // { - // static void reshape(Tensor &tensor, const Shape &new_shape) - // { - // // CUDA实现 - // } - // }; + reshapeDispatcher::reshape(tensor, new_shape); + } + + template + struct transposeDispatcher + { + static void transpose(Tensor &tensor, const std::vector &dim_order) = delete; + }; + + // transpose(A,dim_order)=>B + template + void transpose(Tensor &tensor, const std::vector &dim_order) + { + transposeDispatcher::transpose(tensor, dim_order); + } + + template + struct concatDispatcher + { + static void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) = delete; + }; + // concat(tensors,axis)=>C + template + void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) + { + concatDispatcher::concat(tensors, num_tensors, axis, C); + } + + // https://onnx.ai/onnx/operators/onnx__Split.html + template + struct splitDispatcher + { + static void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) = delete; + }; + // split(tensor,axis,splits)=>tensors + template + void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) + { + splitDispatcher::split(A, axis, splits, B); + + } + 
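Each dispatcher in this header pairs a deleted primary template with author-specific specializations supplied by the executor backends; the free functions above only forward to them. A minimal sketch of how a backend satisfies one of these dispatchers and how operator code selects it (the `miaobyte` author tag and the call site are illustrative assumptions, not lines added by this hunk):

    // Sketch only: an executor backend specializes the dispatcher declared above.
    template <typename T>
    struct transposeDispatcher<miaobyte, T>
    {
        static void transpose(Tensor<T> &tensor, const std::vector<int> &dim_order)
        {
            // backend-specific permutation of the tensor's dimensions
        }
    };

    // Operator code then picks the backend through the free function:
    //   transpose<miaobyte>(A, {0, 2, 1});
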
template + struct splitDispatcher + { + static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; + }; + // split(tensor,axis,num_outputs)=>tensors + template + void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) + { + splitDispatcher::split(A, axis, num_outputs, B); + } + + template + struct expandDispatcher + { + static void expand(const Tensor &A, const Shape &new_shape, Tensor &B) = delete; + }; + + template + void expand(const Tensor &A, const Shape &new_shape, Tensor &B) + { + expandDispatcher::expand(A, new_shape, B); + } + + template + struct squeezeDispatcher + { + static void squeeze(Tensor &tensor) = delete; + }; + + template + void squeeze(Tensor &tensor) + { + squeezeDispatcher::squeeze(tensor); + } + + template + struct unsqueezeDispatcher + { + static void unsqueeze(Tensor &tensor, const int axis) = delete; + }; + + template + void unsqueeze(Tensor &tensor, const int axis) + { + unsqueezeDispatcher::unsqueeze(tensor, axis); + } + + template + struct flattenDispatcher + { + static void flatten(Tensor &tensor) = delete; + }; + + template + void flatten(Tensor &tensor) + { + flattenDispatcher::flatten(tensor); + } + + template + struct paddingDispatcher + { + static void padding(Tensor &tensor, const Shape &new_shape) = delete; + }; + + template + void padding(Tensor &tensor, const Shape &new_shape) + { + paddingDispatcher::padding(tensor, new_shape); + } } #endif diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index 415e4449..4f50c6a2 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -15,6 +15,8 @@ namespace deepx::tensorfunc } }; + + // A+B=>C template void add(const Tensor &A, const Tensor &B, Tensor &C) { @@ -29,6 +31,7 @@ namespace deepx::tensorfunc } }; + // A+scalar=>C template void addscalar(const Tensor &input, const T value, Tensor &output) { @@ -43,6 +46,7 @@ namespace deepx::tensorfunc } }; + // A-B=>C template void sub(const Tensor &A, const Tensor &B, Tensor &C) { @@ -57,6 +61,7 @@ namespace deepx::tensorfunc } }; + // A-scalar=>C template void subscalar(const Tensor &input, const T value, Tensor &output) { @@ -69,6 +74,7 @@ namespace deepx::tensorfunc static void mul(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // A*B=>C template void mul(const Tensor &A, const Tensor &B, Tensor &C) { @@ -81,6 +87,7 @@ namespace deepx::tensorfunc static void mulscalar(const Tensor &input, const T value, Tensor &output) = delete; }; + // A*scalar=>C template void mulscalar(const Tensor &input, const T value, Tensor &output) { @@ -95,6 +102,7 @@ namespace deepx::tensorfunc static void div(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // A/B=>C template void div(const Tensor &A, const Tensor &B, Tensor &C) { @@ -107,6 +115,7 @@ namespace deepx::tensorfunc static void divscalar(const Tensor &input, const T value, Tensor &output) = delete; }; + // A/scalar=>C template void divscalar(const Tensor &input, const T value, Tensor &output) { @@ -119,6 +128,7 @@ namespace deepx::tensorfunc static void rdivscalar(const T value, const Tensor &input, Tensor &output) = delete; }; + // scalar/A=>C template void rdivscalar(const T value, const Tensor &input, Tensor &output) { @@ -132,6 +142,7 @@ namespace deepx::tensorfunc static void sqrt(const Tensor &input, Tensor &output) = delete; }; + // sqrt(A)=>C template void sqrt(const Tensor &input, 
Tensor &output) { @@ -144,6 +155,7 @@ namespace deepx::tensorfunc static void pow(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // A^B=>C template void pow(const Tensor &A, const Tensor &B, Tensor &C) { @@ -156,6 +168,7 @@ namespace deepx::tensorfunc static void powscalar(const Tensor &input, const T value, Tensor &output) = delete; }; + // A^scalar=>C template void powscalar(const Tensor &input, const T value, Tensor &output) { @@ -168,6 +181,7 @@ namespace deepx::tensorfunc static void log(const Tensor &input, Tensor &output) = delete; }; + // log(A)=>C template void log(const Tensor &input, Tensor &output) { @@ -180,6 +194,7 @@ namespace deepx::tensorfunc static void exp(const Tensor &input, Tensor &output) = delete; }; + // exp(A)=>C template void exp(const Tensor &input, Tensor &output) { @@ -192,6 +207,7 @@ namespace deepx::tensorfunc static void sin(const Tensor &input, Tensor &output) = delete; }; + // sin(A)=>C template void sin(const Tensor &input, Tensor &output) { @@ -204,6 +220,7 @@ namespace deepx::tensorfunc static void cos(const Tensor &input, Tensor &output) = delete; }; + // cos(A)=>C template void cos(const Tensor &input, Tensor &output) { @@ -216,6 +233,7 @@ namespace deepx::tensorfunc static void tan(const Tensor &input, Tensor &output) = delete; }; + // tan(A)=>C template void tan(const Tensor &input, Tensor &output) { @@ -228,6 +246,7 @@ namespace deepx::tensorfunc static void max(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // max(A,B)=>C template void max(const Tensor &A, const Tensor &B, Tensor &C) { @@ -242,6 +261,7 @@ namespace deepx::tensorfunc static void maxscalar(const Tensor &A, T b, Tensor &C) = delete; }; + // max(A,scalar)=>C template void maxscalar(const Tensor &A, T b, Tensor &C) { @@ -256,6 +276,7 @@ namespace deepx::tensorfunc static void min(const Tensor &A, const Tensor &B, Tensor &C) = delete; }; + // min(A,B)=>C template void min(const Tensor &A, const Tensor &B, Tensor &C) { @@ -268,6 +289,7 @@ namespace deepx::tensorfunc static void minscalar(const Tensor &A, T b, Tensor &C) = delete; }; + // min(A,scalar)=>C template void minscalar(const Tensor &A, T b, Tensor &C) { @@ -280,6 +302,10 @@ namespace deepx::tensorfunc static void compare(const Tensor &A, const Tensor &B, Tensor &mask) = delete; }; + // compare(A,B)=>mask + // if A[i]==B[i], mask[i]=0.5 + // if A[i]>B[i], mask[i]=0 + // if A[i] void compare(const Tensor &A, const Tensor &B,Tensor &mask) { diff --git a/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp b/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp index ca844f4b..2e099aba 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp @@ -40,19 +40,6 @@ namespace deepx::tensorfunc { matmulDispatcher::matmul(A, B, C); } - - template - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &A, const Tensor &B, const T &alpha, const T &beta, Tensor &C) = delete; - }; - - template - void matmuladd(const Tensor &A, const Tensor &B, const T &alpha, const T &beta, Tensor &C) - { - matmuladdDispatcher::matmuladd(A, B, alpha, beta, C); - } - } #endif diff --git a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp new file mode 100644 index 00000000..c9f3b2a7 --- /dev/null +++ b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp @@ -0,0 +1,56 @@ +#ifndef DEEPX_TENSORFUNC_REDUCE_HPP +#define DEEPX_TENSORFUNC_REDUCE_HPP + + #include "deepx/tensor.hpp" +#include 
"deepx/tensorfunc/authors.hpp" +#include "stdutil/error.hpp" + +namespace deepx::tensorfunc +{ + template + struct reducesumDispatcher + { + static void reducesum(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + template + void reducesum(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reducesumDispatcher::reducesum(A, axis, keepdims, B); + } + + template + struct reduceprodDispatcher + { + static void reduceprod(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + + template + void reduceprod(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reduceprodDispatcher::reduceprod(A, axis, keepdims, B); + } + + template + struct reducemaxDispatcher + { + static void reducemax(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + template + void reducemax(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reducemaxDispatcher::reducemax(A, axis, keepdims, B); + } + + template + struct reduceminDispatcher + { + static void reducemin(const Tensor &A, const int axis,const bool keepdims, Tensor &B) = delete; + }; + template + void reducemin(const Tensor &A, const int axis,const bool keepdims, Tensor &B) + { + reduceminDispatcher::reducemin(A, axis, keepdims, B); + } + +} +#endif // DEEPX_TENSORFUNC_REDUCE_HPP From b2fce2584c10bc28fa71f3d6e71781a9d4372408 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 01:46:38 +0800 Subject: [PATCH 3/7] excuter(cpu/cuda):reshape,transpose,concat --- doc/excuter/op-mem-cuda/list.md | 1 + doc/excuter/op-mem-ompsimd/list.md | 3 +- excuter/cpp-common/src/deepx/mem/mem.hpp | 1 + excuter/cpp-common/src/deepx/shape_concat.hpp | 31 ++- .../src/deepx/tensorfunc/changeshape.hpp | 26 +- .../deepx/tensorfunc/changeshape_miaobyte.cu | 231 ++++++++++++++++++ .../deepx/tensorfunc/changeshape_miaobyte.cuh | 82 +++++++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 81 ++++++ .../src/deepx/tensorfunc/concat.hpp | 13 - .../op-mem-cuda/src/deepx/tensorfunc/cuda.hpp | 24 +- .../src/deepx/tensorfunc/matmul_cublas.hpp | 119 +-------- .../src/deepx/tensorfunc/tensor_cuda.cuh | 39 +++ .../src/deepx/tensorfunc/vector_cuda.cuh | 100 ++++++++ .../test/tensorfunc/2_changeshape.cpp | 46 ++++ .../test/tensorfunc/CMakeLists.txt | 5 +- ...angeshape.hpp => changeshape_miaobyte.hpp} | 96 ++++---- .../src/deepx/tensorfunc/matmul_cblas.hpp | 145 ----------- .../test/tensorfunc/7_tensor_transpose.cpp | 6 +- .../test/tensorfunc/8_tensor_concat.cpp | 7 +- front/py/deepx/nn/functional/__init__.py | 2 +- front/py/deepx/nn/functional/init.py | 2 +- src/deepx/tensorfunc/changeshape_miaobyte.cu | 1 + 22 files changed, 715 insertions(+), 346 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh create mode 100644 excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp rename excuter/op-mem-ompsimd/src/deepx/tensorfunc/{changeshape.hpp => changeshape_miaobyte.hpp} (71%) create mode 100644 src/deepx/tensorfunc/changeshape_miaobyte.cu diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md 
index 4982da4c..5967d738 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -5,6 +5,7 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 84f46f87..b4ab6cd7 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -7,7 +7,7 @@ | concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() | | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | +| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | @@ -32,6 +32,7 @@ | subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | | log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | +| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | diff --git a/excuter/cpp-common/src/deepx/mem/mem.hpp b/excuter/cpp-common/src/deepx/mem/mem.hpp index a4db9d9d..504db01f 100644 --- a/excuter/cpp-common/src/deepx/mem/mem.hpp +++ b/excuter/cpp-common/src/deepx/mem/mem.hpp @@ -150,6 +150,7 @@ namespace deepx::mem return tensors; } + void delete_tensor(const string &name) { diff --git a/excuter/cpp-common/src/deepx/shape_concat.hpp b/excuter/cpp-common/src/deepx/shape_concat.hpp index 885a7678..91884e5e 100644 --- a/excuter/cpp-common/src/deepx/shape_concat.hpp +++ b/excuter/cpp-common/src/deepx/shape_concat.hpp @@ -3,6 +3,7 @@ #include "deepx/shape.hpp" 
#include "deepx/tensor.hpp" +#include "stdutil/error.hpp" namespace deepx { @@ -18,6 +19,32 @@ namespace deepx } return concatShape(shapes,axis); } -} -#endif \ No newline at end of file + template + bool checkShapeConcat(const std::vector*> &tensors,const int axis,const Tensor &output){ + int axisDim=0; + for (int i = 0; i < tensors.size(); i++) + { + if (tensors[i]->shape.dim != output.shape.dim) + { + throw TensorShapeError("All input tensors must have the same dimension size for concat"); + } + for (int j = 0; j < tensors[i]->shape.dim; j++) + { + if (j != axis) + { + if (tensors[i]->shape[j] != output.shape[j]) + { + throw TensorShapeError("All input tensors must have the same dimension size for concat"); + } + } + else + { + axisDim += tensors[i]->shape[j]; + } + } + } + return axisDim == output.shape[axis]; + } +}; +#endif // DEEPX_SHAPE_CONCAT_HPP \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index dd190dac..5acd644e 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -1,21 +1,22 @@ #ifndef DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP #define DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP +#include #include "deepx/tensor.hpp" #include "stdutil/error.hpp" namespace deepx::tensorfunc { - + using namespace std; template struct reshapeDispatcher { - static void reshape(Tensor &tensor, const Shape &new_shape) = delete; + static void reshape(Tensor &tensor, const std::vector &new_shape) = delete; }; // reshape(A,new_shape)=>B template - void reshape(Tensor &tensor, const Shape &new_shape) + void reshape(Tensor &tensor, const std::vector &new_shape) { reshapeDispatcher::reshape(tensor, new_shape); } @@ -23,26 +24,26 @@ namespace deepx::tensorfunc template struct transposeDispatcher { - static void transpose(Tensor &tensor, const std::vector &dim_order) = delete; + static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) = delete; }; // transpose(A,dim_order)=>B template - void transpose(Tensor &tensor, const std::vector &dim_order) + void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) { - transposeDispatcher::transpose(tensor, dim_order); + transposeDispatcher::transpose(tensor, dim_order, output); } template struct concatDispatcher { - static void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) = delete; + static void concat(const vector*> tensors, const int axis, Tensor &C) = delete; }; // concat(tensors,axis)=>C template - void concat(const Tensor *tensors, const int num_tensors, const int axis, Tensor &C) + void concat(const vector*> tensors, const int axis, Tensor &C) { - concatDispatcher::concat(tensors, num_tensors, axis, C); + concatDispatcher::concat(tensors, axis, C); } // https://onnx.ai/onnx/operators/onnx__Split.html @@ -50,6 +51,7 @@ namespace deepx::tensorfunc struct splitDispatcher { static void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) = delete; + static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; }; // split(tensor,axis,splits)=>tensors template @@ -58,11 +60,7 @@ namespace deepx::tensorfunc splitDispatcher::split(A, axis, splits, B); } - template - struct splitDispatcher - { - static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; - }; + // split(tensor,axis,num_outputs)=>tensors template void 
split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu new file mode 100644 index 00000000..82c893c0 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -0,0 +1,231 @@ +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CU +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CU + +#include +#include +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/tensor_cuda.cuh" +#include "deepx/tensorfunc/vector_cuda.cuh" +namespace deepx::tensorfunc +{ + // transpose + // DIM=2^n + template + __global__ void transpose_kernel(const T *inputData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int len, + const int *dimOrder) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < len; thread_id += grid_stride) + { + int input_indices[DIM]; + + // 计算当前线程需要处理的索引 + linearTo(inputStrides, dim, input_indices, thread_id); + + int output_indices[DIM]; + + // 根据 dim_order 和输入输出的形状计算新索引 + reorder(input_indices, dimOrder, dim, output_indices); + int inputIdx = linearAt(inputStrides, dim, input_indices); + int outputIdx = linearAt(outputStrides, dim, output_indices); + outputData[outputIdx] = inputData[inputIdx]; + } + } + + inline int nextPowerOf2(int n) + { + if (n <= 0) + return 1; + if ((n & (n - 1)) == 0) + return n; // 如果n已经是2的幂 + + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + return n + 1; + } + + template + void launch_transpose(const int numBlocks, const int blockSize, + const T *input, + const int *inputStrides, + T *output, + const int *outputStrides, + const int dim, + const int len, + const int *dimOrder) + { + cudaVector strides_d(inputStrides, dim); + cudaVector newStrides_d(outputStrides, dim); + cudaVector dimOrder_d(dimOrder, dim); + + int powDim = nextPowerOf2(dim); + + // 根据计算出的2的幂次选择对应的模板实例 + switch (powDim) + { + case 1: + transpose_kernel<1, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 2: + transpose_kernel<2, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 4: + transpose_kernel<4, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 8: + transpose_kernel<8, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 16: + transpose_kernel<16, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 32: + transpose_kernel<32, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 64: + transpose_kernel<64, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 128: + transpose_kernel<128, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + } + + template void launch_transpose(const int numBlocks, const int blockSize, const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const float 
*input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int numBlocks, const int blockSize, const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + + // concat + template + __global__ void concat_kernel(const T **tensorsData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int outputLen, + const int axis, + const int numTensors, + const int *shapeAtAxis) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + cudaVector outputIndices(DIM); + cudaVector currentTensorIndices(DIM); + for (; thread_id < outputLen; thread_id += grid_stride) + { + linearTo(outputStrides, dim, outputIndices.data, thread_id); + int concatIdxResult = outputIndices[axis]; + int concatIdxCurrentTensor = concatIdxResult; + int tensorIdx = 0; + while (tensorIdx < numTensors) + { + if (concatIdxCurrentTensor < shapeAtAxis[tensorIdx]) + { + break; + } + else + { + concatIdxCurrentTensor -= shapeAtAxis[tensorIdx]; + tensorIdx++; + } + } + currentTensorIndices.copyFromDevice(outputIndices.data, dim); + currentTensorIndices[axis] = concatIdxCurrentTensor; + + int idxCurrentTensor = linearAt(inputStrides+tensorIdx*dim, dim, currentTensorIndices.data); + + int idx = linearAt(outputStrides, dim, outputIndices.data); + outputData[idx] = tensorsData[tensorIdx][idxCurrentTensor]; + } + } + + + template + void launch_concat( + const T **tensorsData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int outputLen, + const int axis, + const int numTensors, + const int *shapeAtAxis) + { + auto [numBlocks, blockSize] = BestDims(outputLen); + + //output + cudaVector outputStrides_d(outputStrides, dim, cudaMemcpyHostToDevice); + + //input + //datas + cudaVector tensorsDataList(tensorsData, numTensors, cudaMemcpyHostToDevice); + //strides + cudaVector inputStrides_d(inputStrides, numTensors*dim, cudaMemcpyHostToDevice); + + + //shapeAtAxis + cudaVector shapeAtAxis_d(shapeAtAxis, numTensors, cudaMemcpyHostToDevice); + + int powDim = nextPowerOf2(dim); + + // 根据计算出的2的幂次选择对应的模板实例 + switch (powDim) + { + case 1: + concat_kernel<1, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, 
outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 2: + concat_kernel<2, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 4: + concat_kernel<4, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 8: + concat_kernel<8, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 16: + concat_kernel<16, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 32: + concat_kernel<32, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 64: + concat_kernel<64, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 128: + concat_kernel<128, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + } + template void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + template void launch_concat(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + +} +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh new file mode 100644 index 00000000..9e9a8629 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ 
-0,0 +1,82 @@ +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH + +#include +#include +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" + +namespace deepx::tensorfunc +{ + //transpose + template + __global__ void transpose_kernel(const T* input, const int* inputStrides, T* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template + void launch_transpose(const int numBlocks, const int blockSize, const T* input, const int* inputStrides, T* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const double* input, const int* inputStrides, double* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const float* input, const int* inputStrides, float* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const nv_bfloat16* input, const int* inputStrides, nv_bfloat16* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half* input, const int* inputStrides, __half* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int64_t* input, const int* inputStrides, int64_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int32_t* input, const int* inputStrides, int32_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int16_t* input, const int* inputStrides, int16_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template <> + void launch_transpose(const int numBlocks, const int blockSize, const int8_t* input, const int* inputStrides, int8_t* output, const int* outputStrides, const int dim, const int len, const int* dimOrder); + + template + __global__ void concat_kernel(const T **tensorsData, + const int *inputStrides, + T *outputData, + const int *outputStrides, + const int dim, + const int len, + const int axis, + const int numTensors, + const int *shapeAtAxis); + + template + void launch_concat(const T **tensorsData, const int *inputStrides, T *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, 
const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + template <> + void launch_concat(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + + +} +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp new file mode 100644 index 00000000..4c9f5c4d --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -0,0 +1,81 @@ +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP + +#include +#include +#include "deepx/tensor.hpp" +#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.cuh" +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/shape_concat.hpp" +namespace deepx::tensorfunc +{ + template + struct reshapeDispatcher + { + static void reshape(Tensor &tensor, const std::vector &new_shape) + { + if (tensor.shape.dim != new_shape.size()) + { + throw std::runtime_error("Tensor shapes must match for reshape"); + } + tensor.shape = Shape(new_shape); + } + }; + + template + struct transposeDispatcher + { + static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) + { + if (dim_order.size() != tensor.shape.dim) + { + throw std::runtime_error("Dimension order size must match tensor dimension size for transpose"); + } + auto [actual_blocks, optimal_block_size] = BestDims(tensor.shape.size); + launch_transpose(actual_blocks, optimal_block_size, + tensor.data, tensor.shape.strides.data(), + output.data, output.shape.strides.data(), + tensor.shape.dim, tensor.shape.size, dim_order.data()); + } + }; + + template + struct concatDispatcher + { + static void concat(const vector*> tensors, const int axis, Tensor &C) + { + //checkshape + if (!checkShapeConcat(tensors, axis, C)) + { + throw TensorShapeError("Output tensor shape size must match the sum of input tensor shape sizes for concat"); + } + + vector tensorsData(tensors.size()); + for (int i = 0; i < tensors.size(); i++) + { + tensorsData[i] = tensors[i]->data; + } + + vector< int> inputStrides; + for (int i = 0; i < tensors.size(); i++) + { + std::copy(tensors[i]->shape.strides.data(), tensors[i]->shape.strides.data() + tensors[i]->shape.dim, std::back_inserter(inputStrides)); + } + + vector shapeAtAxis(tensors.size()); + for (int i = 0; i < 
tensors.size(); i++) + { + shapeAtAxis[i] = tensors[i]->shape[axis]; + } + + launch_concat(tensorsData.data(), inputStrides.data(), + C.data, C.shape.strides.data(), + C.shape.dim, + C.shape.size, + axis, tensors.size(), shapeAtAxis.data()); + }; + }; +} +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp deleted file mode 100644 index eb009b75..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/concat.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_CONCAT_HPP -#define DEEPX_TENSORFUNC_CONCAT_HPP - -#include -#include -#include "deepx/tensor.hpp" -#include "deepx/shape_concat.hpp" -#include "deepx/tensorfunc/new.hpp" -namespace deepx::tensorfunc -{ - -} -#endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index bfee6cae..a9b6886f 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -1,7 +1,7 @@ #ifndef DEEPX_TENSORFUNC_CUDA_HPP #define DEEPX_TENSORFUNC_CUDA_HPP -#include +#include #include #include @@ -29,7 +29,27 @@ namespace deepx::tensorfunc private: cublasHandle_t handle_; }; - + + inline std::pair BestDims(int total_elements) + { + // 默认块大小 + int optimal_block_size = 256; // 一般256或512是较好的选择 + // 计算设备属性以确定最佳配置 + int device_id; + cudaGetDevice(&device_id); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_id); + + // 根据SM数量和每个SM的最大线程数决定块数 + int sm_count = props.multiProcessorCount; + int optimal_blocks = sm_count * 8; // 每个SM分配多个块以增加并行度 + + // 确保至少启动足够的线程来处理所有数据 + int min_blocks = (total_elements + optimal_block_size - 1) / optimal_block_size; + int actual_blocks = std::min(optimal_blocks, min_blocks); + + return {actual_blocks, optimal_block_size}; + }; } #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp index 067a94c1..5931c054 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp @@ -144,123 +144,6 @@ namespace deepx::tensorfunc } }; - template <> - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &A, const Tensor &B, const float &alpha, const float &beta, Tensor &C) - { - if (!check_matmul_shape(A.shape, B.shape)) - { - throw std::invalid_argument("A.shape could not matmul with B.shape"); - } - - static CublasHandle handle; - int64_t batch_size = A.shape.size / (A.shape[-2] * A.shape[-1]); - - int m = A.shape[-2]; - int k = A.shape[-1]; - int n = B.shape[-1]; - - // 计算步长 - int64_t stride_a = m * k; - int64_t stride_b = k * n; - int64_t stride_c = m * n; - - if (batch_size > 1) - { - auto status = cublasSgemmStridedBatched(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, m, k, // 交换m,n - &alpha, - B.data, n, stride_b, // B在前 - A.data, k, stride_a, // A在后 - &beta, - C.data, n, stride_c, // 调整leading dimension - batch_size); // 添加缺失的batch_size参数 - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasSgemmStridedBatched failed"); - } - } - else - { - auto status = cublasSgemm(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, m, k, // 交换m,n - &alpha, - B.data, n, // B在前 - A.data, k, // A在后 - &beta, - C.data, n); // 调整leading dimension - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasSgemm failed"); - } - } 
- } - }; - template <> - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &A, const Tensor &B, const double &alpha, const double &beta, Tensor &C) - { - if (!check_matmul_shape(A.shape, B.shape)) - { - throw std::invalid_argument("A.shape could not matmul with B.shape"); - } - - static CublasHandle handle; - int m = A.shape[-2]; - int k = A.shape[-1]; - int n = B.shape[-1]; - - int64_t batch_size = A.shape.size / (A.shape[-2] * A.shape[-1]); - - if (batch_size > 1) - { - // 计算步长 - int64_t stride_a = m * k; - int64_t stride_b = k * n; - int64_t stride_c = m * n; - - auto status = cublasDgemmStridedBatched(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, m, k, // 交换m,n处理行主序 - &alpha, - B.data, n, stride_b, // B在前 - A.data, k, stride_a, // A在后 - &beta, - C.data, n, stride_c, // 输出维度对应调整 - batch_size); - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasDgemmStridedBatched failed"); - } - } - else - { - auto status = cublasDgemm(handle.get(), - CUBLAS_OP_N, - CUBLAS_OP_N, - m, n, k, - &alpha, - A.data, m, - B.data, k, - &beta, - C.data, m); - - if (status != CUBLAS_STATUS_SUCCESS) - { - throw std::runtime_error("cublasDgemm failed"); - } - } - }; - }; + }; #endif // DEEPX_TENSORFUNC_MATMUL_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh new file mode 100644 index 00000000..a042d6d1 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh @@ -0,0 +1,39 @@ +#ifndef DEEPX_TENSORFUNC_TENSOR_CUDA_CUH +#define DEEPX_TENSORFUNC_TENSOR_CUDA_CUH + +#include "deepx/tensor.hpp" + +namespace deepx::tensorfunc +{ + __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id) + { + int linearIndex = id; + for (int i = 0; i < dim; i++) + { + indices[i] = linearIndex / strides[i]; + linearIndex %= strides[i]; + } + } + + __host__ __device__ int linearAt(const int *strides, const int dim, int *indices) + { + int idx = 0; + for (int i = 0; i < dim; i++) + { + idx += indices[i] * strides[i]; + } + return idx; + } + + template + __device__ __host__ void reorder(const T *order, const int *dimOrder, int dim, T *neworder) + { + for (int i = 0; i < dim; i++) + { + neworder[i] = order[dimOrder[i]]; + } + } + +} + +#endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh new file mode 100644 index 00000000..4fe17030 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh @@ -0,0 +1,100 @@ +#ifndef DEEPX_TENSORFUNC_VECTOR_CUDA_CUH +#define DEEPX_TENSORFUNC_VECTOR_CUDA_CUH + +namespace deepx::tensorfunc +{ + template + __device__ void GridStrideLoopCopy(const T* src, T* dst, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i = idx; i < size; i += stride) { + dst[i] = src[i]; + } + } + + // 全局复制函数,可从主机调用 + template + __global__ void GridStrideLoopCopyKernel(const T* src, T* dst, int size) { + GridStrideLoopCopy(src, dst, size); + } + + //cudaVector + template + struct cudaVector + { + T *data; + int size; + __device__ __host__ cudaVector(int size) : size(size) + { + cudaMalloc(&data, size * sizeof(T)); + } + __host__ cudaVector(const T *src, int size, cudaMemcpyKind kind = cudaMemcpyHostToDevice) : size(size) + { + cudaMalloc(&data, size * sizeof(T)); + cudaMemcpy(data, src, size * sizeof(T), kind); + } + 
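    // Copy constructor (below): allocates a fresh device buffer of the same size
    // and duplicates the contents with a device-to-device cudaMemcpy, so each
    // cudaVector owns its own storage.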
__host__ cudaVector(const cudaVector &other) : size(other.size) + { + cudaMalloc(&data, size * sizeof(T)); + cudaMemcpy(data, other.data, size * sizeof(T), cudaMemcpyDeviceToDevice); + } + __device__ __host__ cudaVector(cudaVector &&other) noexcept : data(other.data), size(other.size) + { + other.data = nullptr; + other.size = 0; + } + __device__ __host__ cudaVector &operator=(const cudaVector &other) + { + if (this != &other) + { + cudaFree(data); + data = other.data; + size = other.size; + } + return *this; + } + __device__ __host__ cudaVector &operator=(cudaVector &&other) noexcept + { + if (this != &other) + { + cudaFree(data); + data = other.data; + size = other.size; + other.data = nullptr; + other.size = 0; + } + return *this; + } + __device__ __host__ ~cudaVector() + { + cudaFree(data); + } + __device__ __host__ void copyFromHost(const T *hostData, int size,int offset=0) + { + cudaMemcpy(data+offset, hostData, size * sizeof(T), cudaMemcpyHostToDevice); + } + __device__ __host__ void copyToHost(T *hostData, int size,int offset=0) + { + cudaMemcpy(hostData, data+offset, size * sizeof(T), cudaMemcpyDeviceToHost); + } + __device__ __host__ void copyFromDevice(const T *deviceData, int size,int offset=0) + { + for (int i = 0; i < size; i++) + { + data[offset+i] = deviceData[i]; + } + } + __device__ __host__ T &operator[](int idx) + { + return data[idx]; + } + __device__ __host__ const T &operator[](int idx) const + { + return data[idx]; + } + + }; +} + +#endif // DEEPX_TENSORFUNC_VECTOR_CUDA_CUH diff --git a/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp b/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp new file mode 100644 index 00000000..66f5dc39 --- /dev/null +++ b/excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp @@ -0,0 +1,46 @@ +#include "deepx/tensorfunc/init_miaobyte.hpp" +#include "deepx/tensor.hpp" +#include "deepx/tensorfunc/new.hpp" +#include "deepx/tensorfunc/print_miaobyte.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" +using namespace deepx::tensorfunc; +using namespace deepx; +void test_transpose() +{ + Tensor a=New({3,4,6}); + arange(a, 1.0f, 1.0f); + print(a,"%.0f"); + Tensor b=New({3,6,4}); + transpose(a, {0,2,1}, b); + print(b,"%.0f"); +} + +void test_concat() +{ + Tensor a=New({3,2,6}); + arange(a, 1.0f, 1.0f); + print(a,"%.0f"); + Tensor b=New({3,4,6}); + constant(b, 2.0f); + print(b,"%.0f"); + Tensor c=New({3,6,6}); + constant(c, 3.0f); + print(c,"%.0f"); + Tensor d=New({3,12,6}); + concat({&a,&b,&c},1,d); + print(d,"%.0f"); +} +int main(int argc, char **argv) +{ + int casearg=atoi(argv[1]); + switch (casearg) + { + case 0: + test_transpose(); + break; + case 1: + test_concat(); + break; + } + return 0; +} \ No newline at end of file diff --git a/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt b/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt index cbea6433..91fbb357 100644 --- a/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt +++ b/excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt @@ -5,4 +5,7 @@ add_executable(1_cublas_add 1_cublas_add.cpp) target_link_libraries(1_cublas_add deepx CUDA::cudart) add_executable(1_cublas_matmul 1_cublas_matmul.cpp) -target_link_libraries(1_cublas_matmul deepx CUDA::cudart) \ No newline at end of file +target_link_libraries(1_cublas_matmul deepx CUDA::cudart) + +add_executable(2_changeshape 2_changeshape.cpp) +target_link_libraries(2_changeshape deepx CUDA::cudart) \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape.hpp 
b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp similarity index 71% rename from excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape.hpp rename to excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index fe562099..4ca0f747 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -6,59 +6,72 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" +#include "deepx/tensorfunc/changeshape.hpp" #include "deepx/shape_broadcast.hpp" - +#include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc { template - void reshape(Tensor &tensor, Tensor &output, const std::vector &shape) - { // 参数改为单个tensor引用 + struct reshapeDispatcher + { + void reshape(Tensor &tensor, Tensor &output, const std::vector &shape) + { // 参数改为单个tensor引用 - int new_prod = 1; - for (int dim : shape) - { - new_prod *= dim; - } + int new_prod = 1; + for (int dim : shape) + { + new_prod *= dim; + } - if (tensor.shape.size != new_prod) - { - throw std::invalid_argument("Shape size mismatch"); - } - if (tensor.data != output.data) - { - tensorfunc::copytensor(tensor, output); + if (tensor.shape.size != new_prod) + { + throw std::invalid_argument("Shape size mismatch"); + } + if (tensor.data != output.data) + { + tensorfunc::copytensor(tensor, output); + } + output.shape = Shape(shape); // 直接修改原tensor的shape } - output.shape = Shape(shape); // 直接修改原tensor的shape - } + }; template - void transpose(const Tensor &tensor, Tensor &result, const std::vector &dimOrder) + struct transposeDispatcher { - if (dimOrder.size() != tensor.shape.dim) - { - throw std::invalid_argument("dimOrder size does not match the number of dimensions in the TensorCPU."); - } - if (result.shape.size != tensor.shape.size) + static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) { - throw std::runtime_error("transpose error!shape"); - } - result.shape.rangeParallel(dimOrder.size(), [&tensor, &result, &dimOrder](int idx_linear, const std::vector &indices, std::vector &newIndices) - { - - for (size_t i = 0; i < dimOrder.size(); ++i) { - newIndices[dimOrder[i]] = indices[i]; + + if (dim_order.size() != tensor.shape.dim) + { + throw std::invalid_argument("dimOrder size does not match the number of dimensions in the TensorCPU."); + } + if (output.shape.size != tensor.shape.size) + { + throw std::runtime_error("transpose error!shape"); + } + output.shape.rangeParallel(dim_order.size(), [&tensor, &output, &dim_order](int idx_linear, const std::vector &indices, std::vector &newIndices) + { + + for (size_t i = 0; i < dim_order.size(); ++i) { + newIndices[dim_order[i]] = indices[i]; } - result.data[idx_linear]= tensor.data[tensor.shape.linearat(newIndices)]; }, tensor.shape.dim); - } + output.data[idx_linear]= tensor.data[tensor.shape.linearat(newIndices)]; }, tensor.shape.dim); + } + }; template - void concat(const std::vector *> &tensors, const int axis, Tensor &result) + struct concatDispatcher { - // Shape shape=concatShape(tensors,axis); - // result=New(shape.shape); - int dimC = axis + 1; - result.shape.rangeParallel(dimC, [&](const int idx, const std::vector &indices) - { + static void concat(const vector *> tensors, const int axis, Tensor &result) + { + //checkshape + if (!checkShapeConcat(tensors, axis, result)) + { + throw TensorShapeError("Output tensor shape size must match the sum of input tensor shape sizes for concat"); + } + int dimC = axis + 1; + 
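// ---- Editor's note (illustrative sketch, not part of the original patch) ----
// The loop below copies whole contiguous runs rather than single elements:
// rangeParallel walks every index prefix up to `axis`, finds the input tensor
// whose slice of the concat axis contains indices[axis], rebuilds that
// tensor's local indices, and copies strides[axis] elements with a single
// std::copy. Roughly, per output prefix (variable names here are hypothetical):
//
//     int j = indices[axis];
//     int t = 0;
//     while (j >= tensors[t]->shape[axis]) { j -= tensors[t]->shape[axis]; ++t; }
//     int from = tensors[t]->shape.linearat(localIndices);   // axis index = j
//     std::copy(tensors[t]->data + from,
//               tensors[t]->data + from + tensors[t]->shape.strides[axis],
//               result.data + idx);
// -----------------------------------------------------------------------------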
result.shape.rangeParallel(dimC, [&](const int idx, const std::vector &indices) + { int concatIdxCurrentTensor=indices[axis];; int tensorIdx=0; while (tensorIdx < tensors.size() ) { @@ -76,7 +89,8 @@ namespace deepx::tensorfunc int idxCurrentTensor=tensors[tensorIdx]->shape.linearat(currentTensorIndices); int copylen=tensors[tensorIdx]->shape.strides[axis]; std::copy(tensors[tensorIdx]->data+idxCurrentTensor,tensors[tensorIdx]->data+idxCurrentTensor+copylen,result.data+idx); }); - } + } + }; template void split(const Tensor &tensor, const int axis, std::vector *> &results) @@ -111,7 +125,7 @@ namespace deepx::tensorfunc throw std::invalid_argument("expand维度不匹配: 输入维度 " + std::to_string(input.shape.dim) + ", 目标维度 " + - std::to_string(output.shape.dim)+ + std::to_string(output.shape.dim) + "请先前dim补1的方式reshape"); } @@ -162,7 +176,5 @@ namespace deepx::tensorfunc output.data[idx_linear] = input.data[idx_old]; }, input.shape.dim); } } - - } #endif \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp index 1e1371f6..8656191b 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp @@ -115,150 +115,5 @@ namespace deepx::tensorfunc } } }; - - template - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &a, const Tensor &b, const T &alpha, const T &beta, Tensor &c) - { - if (!check_shape(a.shape, b.shape)) - { - throw std::invalid_argument("a.shape could matmul with b.shape"); - } - c.shape.rangeParallel(c.shape.dim - 2, [&](const std::vector &indices) - { - int aIdx=a.shape.linearat(indices); - int bIdx=b.shape.linearat(indices); - int cIdx=c.shape.linearat(indices); - int m=a.shape[-2]; - int k=a.shape[-1]; - int n=b.shape[-1]; - for(int i=0;i - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &a, const Tensor &b, const float &alpha, const float &beta, Tensor &c) - { - if (!check_matmul_shape(a.shape, b.shape)) - { - throw std::invalid_argument("a.shape could matmul with b.shape"); - } - // 计算batch size (将除最后两维外的所有维度展平) - // 计算batch size (将除最后两维外的所有维度展平) - int64_t batch_size = 1; - for (int i = 0; i < a.shape.dim - 2; ++i) - { - batch_size *= a.shape[i]; - } - - // 获取矩阵维度 - int64_t m = a.shape[-2]; // 倒数第二维 - int64_t k = a.shape[-1]; // 最后一维 - int64_t n = b.shape[-1]; // B的最后一维 - - // 设置每个矩阵的步长 - int64_t lda = k; - int64_t ldb = n; - int64_t ldc = n; - - // 计算每个batch的指针偏移 - std::vector a_array(batch_size); - std::vector b_array(batch_size); - std::vector c_array(batch_size); - - for (int64_t i = 0; i < batch_size; ++i) - { - a_array[i] = a.data + i * m * k; - b_array[i] = b.data + i * k * n; - c_array[i] = c.data + i * m * n; - } - - for (int64_t i = 0; i < batch_size; ++i) - { - // C = α * op(A) * op(B) + β * C - cblas_sgemm(CblasRowMajor, // 存储顺序 - CblasNoTrans, // op(A) = A - CblasNoTrans, // op(B) = B - m, n, k, // A[m×k], B[k×n], C[m×n] - alpha, // α = 1.0 - a_array[i], // A矩阵指针 - lda, // A的leading dimension(行主序时为列数k) - b_array[i], // B矩阵指针 - ldb, // B的leading dimension(行主序时为列数n) - beta, // β = 0.0 - c_array[i], // C矩阵指针 - ldc); // C的leading dimension(行主序时为列数n) - } - } - }; - - template <> - struct matmuladdDispatcher - { - static void matmuladd(const Tensor &a, const Tensor &b, const double &alpha, const double &beta, Tensor &c) - { - if (!check_matmul_shape(a.shape, b.shape)) - { - throw std::invalid_argument("a.shape could matmul with b.shape"); - } - // 
计算batch size (将除最后两维外的所有维度展平) - // 计算batch size (将除最后两维外的所有维度展平) - int64_t batch_size = 1; - for (int i = 0; i < a.shape.dim - 2; ++i) - { - batch_size *= a.shape[i]; - } - - // 获取矩阵维度 - int64_t m = a.shape[-2]; // 倒数第二维 - int64_t k = a.shape[-1]; // 最后一维 - int64_t n = b.shape[-1]; // B的最后一维 - - // 设置每个矩阵的步长 - int64_t lda = k; - int64_t ldb = n; - int64_t ldc = n; - - // 计算每个batch的指针偏移 - std::vector a_array(batch_size); - std::vector b_array(batch_size); - std::vector c_array(batch_size); - - for (int64_t i = 0; i < batch_size; ++i) - { - a_array[i] = a.data + i * m * k; - b_array[i] = b.data + i * k * n; - c_array[i] = c.data + i * m * n; - } - - for (int64_t i = 0; i < batch_size; ++i) - { - // C = α * op(A) * op(B) + β * C - cblas_dgemm(CblasRowMajor, // 存储顺序 - CblasNoTrans, // op(A) = A - CblasNoTrans, // op(B) = B - m, n, k, // A[m×k], B[k×n], C[m×n] - alpha, // α = 1.0 - a_array[i], // A矩阵指针 - lda, // A的leading dimension(行主序时为列数k) - b_array[i], // B矩阵指针 - ldb, // B的leading dimension(行主序时为列数n) - beta, // β = 0.0 - c_array[i], // C矩阵指针 - ldc); // C的leading dimension(行主序时为列数n) - } - } - }; } #endif // DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp index f6e55be4..90188489 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp @@ -3,13 +3,13 @@ #include #include "deepx/tensor.hpp" -#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" #include "deepx/tensorfunc/new.hpp" +#include "deepx/tensorfunc/authors.hpp" #include "deepx/tensorfunc/print_miaobyte.hpp" #include "stdutil/vector.hpp" #include "tensorutil.hpp" #include "deepx/shape_transpose.hpp" -#include "deepx/tensorfunc/authors.hpp" using namespace deepx::tensorfunc; using namespace deepx; @@ -25,7 +25,7 @@ void test_transpose() std::vector resultshape = transposeShape(tensor.shape.shape, dimOrder); Tensor result = New(resultshape); - transpose(tensor, result, dimOrder); + transpose(tensor, dimOrder, result); print(result); } diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp index 59d66dd1..3a6bafdc 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp @@ -3,12 +3,12 @@ #include -#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" #include "deepx/tensor.hpp" #include "deepx/shape.hpp" #include "deepx/shape_concat.hpp" #include "deepx/tensorfunc/new.hpp" -#include "deepx/tensorfunc/init.hpp" +#include "deepx/tensorfunc/init_miaobyte.hpp" #include "deepx/tensorfunc/print_miaobyte.hpp" #include "stdutil/vector.hpp" #include "deepx/mem/mem.hpp" @@ -22,6 +22,7 @@ shared_ptr makeMem(int cnt,std::vector shape){ for (int j=0; j(shape); + arange(ptr,0.0f,1.0f); mem->addtensor("tensor"+std::to_string(j), ptr); } return mem; @@ -39,7 +40,7 @@ void test_concat(){ for (int i=0;ishape.dim;i++){ Shape shape=concatShape(tensors,i); Tensor result=New(shape.shape); - concat(tensors,i,result); + concat(tensors,i,result); print(result); } std::cout<<"================"< Date: Sun, 6 Apr 2025 17:51:18 +0800 Subject: [PATCH 4/7] excuter(cpu/cuda):reshape,transpose --- .../src/deepx/tensorfunc/changeshape.hpp | 6 +- excuter/cpp-common/src/deepx/tf/tf.hpp | 2 +- 
excuter/op-mem-cuda/src/client/tfs.cpp | 128 ++++++----- .../op-mem-cuda/src/deepx/tf/changeshape.hpp | 143 ++++++++++++ excuter/op-mem-ompsimd/src/client/tfs.cpp | 78 ++++--- .../deepx/tensorfunc/changeshape_miaobyte.hpp | 14 +- .../src/deepx/tf/changeshape.hpp | 213 ++++++++++++------ 7 files changed, 416 insertions(+), 168 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 5acd644e..c2eff6b1 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -1,5 +1,5 @@ -#ifndef DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP -#define DEEPX_TENSORFUNC_CHANGE_SHAPE_HPP +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_HPP +#define DEEPX_TENSORFUNC_CHANGESHAPE_HPP #include #include "deepx/tensor.hpp" @@ -14,7 +14,7 @@ namespace deepx::tensorfunc static void reshape(Tensor &tensor, const std::vector &new_shape) = delete; }; - // reshape(A,new_shape)=>B + // A.reshape(new_shape) template void reshape(Tensor &tensor, const std::vector &new_shape) { diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp index 2508a121..3425e9a8 100644 --- a/excuter/cpp-common/src/deepx/tf/tf.hpp +++ b/excuter/cpp-common/src/deepx/tf/tf.hpp @@ -77,7 +77,7 @@ namespace deepx::tf } template - vector argvector( int from=0, int to=0,bool arg=true){ + vector getvector( int from=0, int to=0,bool arg=true){ vector &vars=arg?args:returns; if(from<0){ from = vars.size()+from; diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index f8d33bd5..109847c9 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -8,6 +8,7 @@ #include "deepx/tf/elementwise_sin.hpp" #include "deepx/tf/elementwise_compare.hpp" #include "deepx/tf/matmul.hpp" +#include "deepx/tf/changeshape.hpp" #include "deepx/dtype.hpp" #include "deepx/tf/tffactory.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -194,80 +195,80 @@ namespace deepx::tf tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32 ), - Param("B", DataCategory::Tensor, Precision::Float64|Precision::Float32 ), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("B", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), - Param("scalar", DataCategory::Var, Precision::Float64|Precision::Float32), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), + Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Float32), }), 
vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32|Precision::Float16|Precision::BFloat16), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16), }))); tffactory.add_tf(std::make_shared>(vector( { - Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }), vector( { - Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32), + Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -276,16 +277,16 @@ namespace deepx::tf vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + { + Param("A", 
DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -294,26 +295,26 @@ namespace deepx::tf vector( { Param("C", DataCategory::Tensor, Precision::Any), - }))); + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Int8), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Int8), + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Int8), - }))); - } + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Int8), + }))); + } // matmul void register_matmul(TfFactory &tffactory) { @@ -330,10 +331,23 @@ namespace deepx::tf // // changeshape void register_changeshape(TfFactory &tffactory) { - // opfactory.add_op(Transpose()); - // opfactory.add_op(Reshape()); - // opfactory.add_op(Expand()); - // tffactory.add_tf(std::make_shared()); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("shape", DataCategory::Vector, Precision::Int32), + }), + vector())); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dim_order", DataCategory::Vector, Precision::Int32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp new file mode 100644 index 00000000..551f7d51 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp @@ -0,0 +1,143 @@ +#ifndef DEEPX_TF_CHANGESHAPE_HPP +#define DEEPX_TF_CHANGESHAPE_HPP + +#include +#include +#include + + +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" + +namespace deepx::tf +{ + using namespace deepx::tensorfunc; + using namespace std; + template + class Reshape : public TF + { + public: + Reshape(const vector &args, const vector &returns) + { + this->name = "reshape"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T2=T1.reshape(shape)"; + } + + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector shape = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Float32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; 
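// ---- Editor's note (illustrative, not part of the original patch) ----------
// Every branch of this switch only changes the concrete element type; the
// work is always forwarded to the author-tagged dispatcher. For example, an
// IR call along the lines of (exact textual syntax aside)
//
//     reshape(tensor A, vector shape=[3,20]) -> ()
//
// resolves A from mem, reads the target shape from args[1] via getvector,
// and for a float32 tensor dispatches to the miaobyte reshape implementation
// for float, which checks that the element count is unchanged and rewrites
// A.shape in place -- no output tensor is produced.
// -----------------------------------------------------------------------------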
+ case Precision::Int64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int16: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int8: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + + template + class Transpose : public TF + { + public: + Transpose(const vector &args, const vector &returns) + { + this->name = "transpose"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T2 = T1.transpose(dimorder=[1,0])"; + } + + shared_ptr clone() const override + { + return make_shared>(*this); + } + + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dim_order = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + +} +#endif // DEEPX_TF_CHANGESHAPE_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 2670deae..cdf7ac6c 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -250,15 +250,15 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); - tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("C", DataCategory::Tensor, Precision::Any), - }))); + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); tffactory.add_tf(std::make_shared>(vector( { Param("A", DataCategory::Tensor, Precision::Any), @@ -272,31 
+272,31 @@ namespace deepx::tf { Param("A", DataCategory::Tensor, Precision::Any), Param("scalar", DataCategory::Var, Precision::Any), - }), + }), vector( { Param("C", DataCategory::Tensor, Precision::Any), }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("B", DataCategory::Tensor, Precision::Any), - - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Float32), - }))); + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("B", DataCategory::Tensor, Precision::Any), + + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Float32), + }))); tffactory.add_tf(std::make_shared>(vector( - { - Param("A", DataCategory::Tensor, Precision::Any), - Param("scalar", DataCategory::Var, Precision::Any), - }), - vector( - { - Param("mask", DataCategory::Tensor, Precision::Float32), - }))); - } + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("scalar", DataCategory::Var, Precision::Any), + }), + vector( + { + Param("mask", DataCategory::Tensor, Precision::Float32), + }))); + } // matmul void register_matmul(TfFactory &tffactory) { @@ -318,15 +318,27 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32), }))); - } // // changeshape void register_changeshape(TfFactory &tffactory) { - // opfactory.add_op(Transpose()); - // opfactory.add_op(Reshape()); - // opfactory.add_op(Expand()); - tffactory.add_tf(std::make_shared()); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("shape", DataCategory::Vector, Precision::Int32), + }), + vector())); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dim_order", DataCategory::Vector, Precision::Int32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 4ca0f747..de1f277a 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -1,5 +1,5 @@ -#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_HPP -#define DEEPX_TENSORFUNC_CHANGESHAPE_HPP +#ifndef DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP +#define DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP #include #include @@ -14,7 +14,7 @@ namespace deepx::tensorfunc template struct reshapeDispatcher { - void reshape(Tensor &tensor, Tensor &output, const std::vector &shape) + static void reshape(Tensor &tensor, const std::vector &shape) { // 参数改为单个tensor引用 int new_prod = 1; @@ -27,11 +27,7 @@ namespace deepx::tensorfunc { throw std::invalid_argument("Shape size mismatch"); } - if (tensor.data != output.data) - { - tensorfunc::copytensor(tensor, output); - } - output.shape = Shape(shape); // 直接修改原tensor的shape + tensor.shape = Shape(shape); } }; @@ -177,4 +173,4 @@ namespace deepx::tensorfunc } } } -#endif \ No newline at end of file +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 838f2879..2c9707db 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -1,45 +1,171 @@ #ifndef DEEPX_TF_CHANGESHAPE_HPP 
#define DEEPX_TF_CHANGESHAPE_HPP +#include #include "deepx/tf/tf.hpp" -#include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/tensorfunc/changeshape_miaobyte.hpp" #include "deepx/dtype.hpp" namespace deepx::tf { - class Concat : public TF + using namespace deepx::tensorfunc; + using namespace std; + + template + class Reshape : public TF { - private: - const string _name="concat"; public: - Concat() + Reshape(const vector &args, const vector &returns) + { + this->name = "reshape"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override { - this->name=_name; + return "T2=T1.reshape(shape)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); } - Concat(string text) + + int run(shared_ptr mem, string &error) override { - this->parse(text); - if (this->name!=_name){ - throw std::runtime_error("Invalid name: "+this->name); + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector shape = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Float32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int64: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int32: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int16: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + case Precision::Int8: + reshape(*mem->gettensor(this->args[0].textvalue), shape); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; } + return 0; + } + }; + + template + class Transpose : public TF + { + public: + Transpose(const vector &args, const vector &returns) + { + this->name = "transpose"; + this->author = Author::name(); + this->args = args; + this->returns = returns; } string math_formula() const override { - return "Tresult = concat([T1, T2...], axis=3)"; + return "T2 = T1.transpose(dimorder=[1,0])"; } + + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override { - //TODO,去掉T - // std::vector *> input; - // for (int i = 0; i < this->args.size() - 1; i++) - // { - // input.push_back(mem.gettensor(this->args[i].name).get()); - // } - // auto output = mem.gettensor(this->returns[0].name).get(); - // int axis = this->getvar(-1,mem,false); - // tensorfunc::concat(input, axis, *output); + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dim_order = this->getvector(1, -1); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + + switch (input_type) + { + case Precision::Float64: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + 
transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + transpose(*mem->gettensor(this->args[0].textvalue), dim_order, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + + + + + + + +template + class Concat : public TF + { + public: + Concat(const vector &args, const vector &returns) + { + this->name = _name; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + + string math_formula() const override + { + return "Tresult = concat([T1, T2...], axis=3)"; + } + int run(shared_ptr mem, string &error) override + { + // TODO,去掉T + // std::vector *> input; + // for (int i = 0; i < this->args.size() - 1; i++) + // { + // input.push_back(mem.gettensor(this->args[i].name).get()); + // } + // auto output = mem.gettensor(this->returns[0].name).get(); + // int axis = this->getvar(-1,mem,false); + // tensorfunc::concat(input, axis, *output); return 0; }; shared_ptr clone() const override @@ -65,8 +191,8 @@ namespace deepx::tf // void funcdef() override // { // this->parse("split(float32 T1,int32 3)->(float32 T2,T3)"); - // } - // string math_formula() const override + // } + // string math_formula() const override // { // return "T2,T3 = split(T1, axis=3)"; // } @@ -82,49 +208,6 @@ namespace deepx::tf // tensorfunc::split(*output, axis, input); // } // }; - // template - // class Reshape : public TF - // { - // public: - // Reshape() - // { - // this->init("reshape", "any", {}, {}, false, {}, {}); - // } - // void forward(mem::Mem &mem) override - // { - // auto input = mem.gettensor(this->args[0]).get(); - // auto output = mem.gettensor(this->returns[0]).get(); - // vector shape; - // if (this->args.size() == 2 && !is_integer(this->args[1])) - // { - // shape = mem.getvector(this->args[1]); - // } - // else - // { - // for (int i = 1; i < this->args.size(); i++) - // { - // shape.push_back(atoi(this->args[i].c_str())); - // } - // } - // tensorfunc::reshape(*input, *output, shape); - // } - // void backward(mem::Mem &mem) override - // { - // auto return_grad = mem.gettensor(this->returns_grad[0]).get(); - // auto input_grad = mem.gettensor(this->args_grad[0]).get(); - // auto input = mem.gettensor(this->args[0]).get(); - // vector shape = input->shape.shape; - // tensorfunc::reshape(*return_grad, *input_grad, shape); - // } - // void funcdef() override - // { - // this->init("reshape", "float32", {"T1", "2", "3", "4"}, {"T2"}, false, {}, {}); - // } - // string math_formula() const override - // { - // return "T2 = reshape(T1, [2,3,4])"; - // } - // }; // template // class Transpose : public Op From 731b833015bd0a7f49c28c17a3fd788a6dbf5e65 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 18:28:23 +0800 Subject: [PATCH 5/7] excuter(cpu/cuda):reshape,transpose,concat --- excuter/cpp-common/src/deepx/dtype.hpp | 25 +++- excuter/cpp-common/src/deepx/tf/tf.hpp | 99 +++++++++----- excuter/op-mem-cuda/src/client/tfs.cpp | 10 ++ .../op-mem-cuda/src/deepx/tf/changeshape.hpp | 124 ++++++++++++++++++ excuter/op-mem-ompsimd/src/client/tfs.cpp | 10 ++ 
.../deepx/tensorfunc/changeshape_miaobyte.hpp | 1 + .../src/deepx/tf/changeshape.hpp | 110 ++++++++++++---- 7 files changed, 316 insertions(+), 63 deletions(-) diff --git a/excuter/cpp-common/src/deepx/dtype.hpp b/excuter/cpp-common/src/deepx/dtype.hpp index b93a2a7a..e1ca3627 100644 --- a/excuter/cpp-common/src/deepx/dtype.hpp +++ b/excuter/cpp-common/src/deepx/dtype.hpp @@ -3,9 +3,30 @@ #include #include +#include namespace deepx { + template + T to(const std::string &textvalue) + { + if constexpr (std::is_same_v) + { + return textvalue; + } + else if constexpr (std::is_arithmetic_v) + { + return static_cast(std::stof(textvalue)); + } + else + { + // 对于其他类型,尝试从字符串转换 + T value; + std::istringstream iss(textvalue); + iss >> value; + return value; + } + } enum class DataCategory : uint8_t { @@ -112,7 +133,7 @@ namespace deepx // 布尔类型 (13位) Bool = 1 << 13, // 0010 0000 0000 0000 String = 1 << 15, // 0100 0000 0000 0000 - // 常用组合 + // 常用组合 Any = 0xFFFF, // 1111 1111 1111 1111 Float = Float64 | Float32 | Float16 | BFloat16 | Float8E5M2 | Float8E4M3 | Float4E2M1, Float8 = Float8E5M2 | Float8E4M3, // 所有FP8格式 @@ -230,8 +251,6 @@ namespace deepx return TypeDef(category, precision); } - - // 修改precision_str函数以使用标准命名格式 inline std::string precision_str(Precision p) { diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp index 3425e9a8..90c0d3cb 100644 --- a/excuter/cpp-common/src/deepx/tf/tf.hpp +++ b/excuter/cpp-common/src/deepx/tf/tf.hpp @@ -20,16 +20,17 @@ namespace deepx::tf using mem::MemBase; using namespace std; using namespace std::chrono; - - struct Param { + + struct Param + { TypeDef dtype; string textvalue; any value; - Param(const string& textvalue = "", const DataCategory& dt = DataCategory::Unknown, const Precision& prec = Precision::Any) + Param(const string &textvalue = "", const DataCategory &dt = DataCategory::Unknown, const Precision &prec = Precision::Any) : textvalue(textvalue), dtype(make_dtype(dt, prec)) {} }; - //TF:Tensor Function的缩写 + // TF:Tensor Function的缩写 class TF { public: @@ -42,55 +43,76 @@ namespace deepx::tf system_clock::time_point created_at; system_clock::time_point sent_at; system_clock::time_point recv_at; + public: TF() = default; TF(const TF &) = default; TF(const string text); TF &operator=(const TF &) = default; - + string op_name(); - virtual int run(shared_ptr mem,string &error){ + virtual int run(shared_ptr mem, string &error) + { throw NotImplementError(name); } virtual string math_formula() const; void parse(const string &str); - std::string to_string(bool show_extra=false, bool show_name=true) const; + std::string to_string(bool show_extra = false, bool show_name = true) const; void init(const string &opname, const vector &args, const vector &returns); - template - T getvar(int idx, shared_ptr mem,bool arg=true){ - vector &vars=arg?args:returns; - if(idx<0){ - idx = vars.size()+idx; + template + T getvar(int idx, shared_ptr mem, bool arg = true) + { + vector &vars = arg ? 
args : returns; + if (idx < 0) + { + idx = vars.size() + idx; } - if(idx<0 || idx>=vars.size()){ + if (idx < 0 || idx >= vars.size()) + { throw std::invalid_argument("Invalid argument index"); } - if (is_float(vars[idx].textvalue)){ - T value=T(std::stof(vars[idx].textvalue)); + if (is_float(vars[idx].textvalue)) + { + T value = T(std::stof(vars[idx].textvalue)); return value; } return mem->getarg(vars[idx].textvalue); } - template - vector getvector( int from=0, int to=0,bool arg=true){ - vector &vars=arg?args:returns; - if(from<0){ - from = vars.size()+from; - } - if(to<0){ - to = vars.size()+to; + + + template + vector getvector(int idx,bool arg = true) + { + vector &vars = arg ? args : returns; + if (idx < 0) + { + idx = vars.size() + idx; } - if(from>to){ + if (idx < 0 || idx >= vars.size()) + { throw std::invalid_argument("Invalid argument index"); } + if (idx < 0 || idx >= vars.size()) + { + throw std::invalid_argument("Invalid argument index"); + } + vector result; - for(int i=from;i<=to;i++){ - result.push_back(T(std::stof(vars[i].textvalue))); + string textvalue = vars[idx].textvalue; + if (textvalue.empty()) + { + throw std::invalid_argument("Invalid argument index"); + } + std::stringstream ss(textvalue); + std::string item; + while (std::getline(ss, item, ',')) + { + result.push_back(to(item)); } return result; } @@ -99,7 +121,8 @@ namespace deepx::tf bool check_dtype(const TF &other) const; // 添加虚拟克隆方法 - virtual shared_ptr clone() const { + virtual shared_ptr clone() const + { return make_shared(*this); } }; @@ -113,35 +136,41 @@ namespace deepx::tf system_clock::time_point start_at; system_clock::time_point finish_at; string message; + public: OpResp() = default; OpResp(const OpResp &) = default; OpResp &operator=(const OpResp &) = default; - - std::string to_string() const{ + + std::string to_string() const + { std::stringstream stream; stream << id << " " << result; stream << "// recv_at="; stream << duration_cast(recv_at.time_since_epoch()).count(); stream << " start_at="; stream << duration_cast(start_at.time_since_epoch()).count(); - stream << " finish_at="; + stream << " finish_at="; stream << duration_cast(finish_at.time_since_epoch()).count(); - if (message.size()>0){ - stream << " "<< message; + if (message.size() > 0) + { + stream << " " << message; } return stream.str(); } - void init(int id,system_clock::time_point recv_at){ + void init(int id, system_clock::time_point recv_at) + { this->id = id; this->recv_at = recv_at; } - void finish(const string &message){ + void finish(const string &message) + { this->result = "ok"; this->finish_at = system_clock::now(); this->message = message; } - void error(const string &message){ + void error(const string &message) + { this->result = "error"; this->finish_at = system_clock::now(); this->message = message; diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 109847c9..c0d9bd9c 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -348,6 +348,16 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("tensors", DataCategory::ListTensor, Precision::Any), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("result", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp 
b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp index 551f7d51..68d03767 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp @@ -139,5 +139,129 @@ namespace deepx::tf } }; + template + class Concat : public TF + { + public: + Concat(const vector &args, const vector &returns) + { + this->name = "concat"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "Tresult = concat([T1, T2...], axis=3)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + vector tensor_names = this->getvector(0, true); + Precision input_type = mem->gettensor(tensor_names[0]).get()->shape.dtype; + int axis = this->getvar(1, mem, false); + switch (input_type) + { + case Precision::Float64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Float32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Float16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::BFloat16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int8: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + + return 0; + }; + }; + } #endif // DEEPX_TF_CHANGESHAPE_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index cdf7ac6c..57e2c206 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -339,6 +339,16 @@ namespace deepx::tf { Param("C", 
DataCategory::Tensor, Precision::Any), }))); + + tffactory.add_tf(std::make_shared>(vector( + { + Param("tensors", DataCategory::ListTensor, Precision::Any), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("result", DataCategory::Tensor, Precision::Any), + }))); } // // reduce // void register_reduce(OpFactory &opfactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index de1f277a..c03f3c0d 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -7,6 +7,7 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/new.hpp" #include "deepx/tensorfunc/changeshape.hpp" +#include "deepx/shape_concat.hpp" #include "deepx/shape_broadcast.hpp" #include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 2c9707db..fa125766 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -76,7 +76,7 @@ namespace deepx::tf { public: Transpose(const vector &args, const vector &returns) - { + { this->name = "transpose"; this->author = Author::name(); this->args = args; @@ -103,7 +103,7 @@ namespace deepx::tf error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); return 1; } - + switch (input_type) { case Precision::Float64: @@ -132,46 +132,106 @@ namespace deepx::tf } }; - - - - - - -template + template class Concat : public TF { public: Concat(const vector &args, const vector &returns) { - this->name = _name; - this->author = Author::name(); + this->name = "concat"; + this->author = Author::name(); this->args = args; this->returns = returns; } - string math_formula() const override { return "Tresult = concat([T1, T2...], axis=3)"; } - int run(shared_ptr mem, string &error) override - { - // TODO,去掉T - // std::vector *> input; - // for (int i = 0; i < this->args.size() - 1; i++) - // { - // input.push_back(mem.gettensor(this->args[i].name).get()); - // } - // auto output = mem.gettensor(this->returns[0].name).get(); - // int axis = this->getvar(-1,mem,false); - // tensorfunc::concat(input, axis, *output); - return 0; - }; shared_ptr clone() const override { return make_shared(*this); } + int run(shared_ptr mem, string &error) override + { + vector tensor_names = this->getvector(0, true); + Precision input_type = mem->gettensor(tensor_names[0]).get()->shape.dtype; + int axis = this->getvar(1, mem, false); + switch (input_type) + { + case Precision::Float64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Float32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int64: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, 
*output); + break; + } + case Precision::Int32: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int16: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + case Precision::Int8: + { + std::vector *> input; + for (int i = 0; i < tensor_names.size(); i++) + { + input.push_back(mem->gettensor(tensor_names[i]).get()); + } + auto output = mem->gettensor(this->returns[0].textvalue).get(); + concat(input, axis, *output); + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + + return 0; + }; }; // class Split : public TF From 72e5334d5c7524fb06972ac2045fe69fe1af5acb Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 19:29:28 +0800 Subject: [PATCH 6/7] front:Apply fix,requires_grad=requires_grad --- doc/excuter/op-mem-cuda/list.md | 3 + doc/excuter/op-mem-ompsimd/list.md | 60 +++---- front/py/deepx/nn/functional/elementwise.py | 32 ++-- front/py/examples/2_ir/1_init_zeroones.dot | 76 ++++----- .../py/examples/2_ir/1_init_zeroones.dot.svg | 152 +++++++++--------- 5 files changed, 164 insertions(+), 159 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 5967d738..09937bd1 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -4,6 +4,9 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| concat | miaobyte | concat(listtensor tensors, var axis)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var axis)->(tensor result) | +| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | reshape(tensor A, vector shape)->() | T2=T1.reshape(shape) | reshape(tensor A, vector shape)->() | | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index b4ab6cd7..32aadf9d 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -4,36 +4,38 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| -| concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() | -| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | -| min | miaobyte | min(tensor A, tensor 
B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | -| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | -| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | -| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | -| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | -| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | -| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | -| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | +| concat | miaobyte | concat(listtensor tensors, var axis)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var axis)->(tensor result) | +| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| comparescalar | miaobyte | comparescalar(tensor A, var 
scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | +| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| reshape | miaobyte | reshape(tensor A, vector shape)->() | T2=T1.reshape(shape) | reshape(tensor A, vector shape)->() | +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | +| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | +| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | +| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | +| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor tensor1) | +| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | +| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | +| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | +| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | +| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | +| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | +| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| matmul | miaobyte | 
matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index e6f8d326..358c88d0 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -125,9 +125,9 @@ def add( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Add.apply(a,b,out,author,requires_grad) + return Add.apply(a,b,out,author,requires_grad=requires_grad) else: - return AddScalar.apply(a,b,out,author,requires_grad) + return AddScalar.apply(a,b,out,author,requires_grad=requires_grad) #sub @@ -157,9 +157,9 @@ def sub( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Sub.apply(a,b,out,author,requires_grad) + return Sub.apply(a,b,out,author,requires_grad=requires_grad) else: - return SubScalar.apply(a,b,out,author,requires_grad) + return SubScalar.apply(a,b,out,author,requires_grad=requires_grad) #mul OpNode.register("mul") @@ -195,7 +195,7 @@ def mul( if isinstance(b,Tensor): return Mul.apply(a,b,out,author,requires_grad) else: - return MulScalar.apply(a,b,out,author,requires_grad) + return MulScalar.apply(a,b,out,author,requires_grad=requires_grad) #div @@ -248,10 +248,10 @@ def div( else: if isinstance(a,Tensor): #C=A/b - return DivScalar.apply(a,b,"divscalar",out,author,requires_grad) + return DivScalar.apply(a,b,"divscalar",out,author,requires_grad=requires_grad) else: #C=a/B - return RDivScalar.apply(a,b,"rdivscalar",out,author,requires_grad) + return RDivScalar.apply(a,b,"rdivscalar",out,author,requires_grad=requires_grad) OpNode.register("compare") class Compare(Function): @@ -301,7 +301,7 @@ def max( if isinstance(b,int) or isinstance(b,float): return MaxScalar.apply(a,b,"maxscalar",out,author,requires_grad) else: - return Max.apply(a,b,"max",out,author,requires_grad) + return Max.apply(a,b,"max",out,author,requires_grad=requires_grad) OpNode.register("min") @@ -337,9 +337,9 @@ def min( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return MinScalar.apply(a,b,"minscalar",out,author,requires_grad) + return MinScalar.apply(a,b,"minscalar",out,author,requires_grad=requires_grad) else: - return Min.apply(a,b,"min",out,author,requires_grad) + return Min.apply(a,b,"min",out,author,requires_grad=requires_grad) #clamp,TODO @@ -362,7 +362,7 @@ def sqrt( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Sqrt.apply(input,out,author,requires_grad) + return Sqrt.apply(input,out,author,requires_grad=requires_grad) OpNode.register("pow") class Pow(Function): @@ -397,9 +397,9 @@ def pow( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return PowScalar.apply(a,b,out,author,requires_grad) + return PowScalar.apply(a,b,out,author,requires_grad=requires_grad) else: - return Pow.apply(a,b,out,author,requires_grad) + return Pow.apply(a,b,out,author,requires_grad=requires_grad) #exp OpNode.register("exp") @@ -420,7 +420,7 @@ def exp( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Exp.apply(a,out,author,requires_grad) + return Exp.apply(a,out,author,requires_grad=requires_grad) #log OpNode.register("log") class Log(Function): @@ -440,7 +440,7 @@ def log( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Log.apply(a,out,author,requires_grad) + return 
Log.apply(a,out,author,requires_grad=requires_grad) OpNode.register("rsqrt") class Rsqrt(Function): @@ -460,7 +460,7 @@ def rsqrt( out:Union[Tensor,str]='', requires_grad:bool=False, author='miaobyte')->Tensor: - return Rsqrt.apply(input,out,author,requires_grad) + return Rsqrt.apply(input,out,author,requires_grad=requires_grad) diff --git a/front/py/examples/2_ir/1_init_zeroones.dot b/front/py/examples/2_ir/1_init_zeroones.dot index 33d54af8..55722399 100644 --- a/front/py/examples/2_ir/1_init_zeroones.dot +++ b/front/py/examples/2_ir/1_init_zeroones.dot @@ -2,55 +2,55 @@ digraph { rankdir=TB node [shape=record] - 132815942520016 [label="t1 + 129052233125168 [label="t1 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813646230768 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132814271881056 [label="var_1 + 129049935907280 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049975249392 [label="var_1 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298272 [label="t2 + 129049975250448 [label="t2 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298464 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645298080 [label="var_2 + 129049935907424 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935907568 [label="var_2 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298704 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645298512 [label="t3 + 129049935907472 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935907760 [label="t3 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298800 [label="t4 + 129049935907616 [label="t4 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645299136 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645299088 [label="var_3 + 129049935907952 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935908240 [label="var_3 0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645298944 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645299424 [label="t5 + 129049935908432 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935908144 [label="t5 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645299664 [label="tensor_6 + 129049935908576 [label="tensor_6 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293616 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645293664 
[label="var_4 + 129049935908624 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935908912 [label="var_4 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293280 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 132813645299616 [label="var_5 + 129049935909056 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 129049935909008 [label="var_5 -0.5477225575051661" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293568 [label="var_6 + 129049935909200 [label="var_6 0.5477225575051661" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813645293424 [label="var_7 + 129049935909152 [label="var_7 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 132813646230768 -> 132815942520016 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132814271881056 -> 132813646230768 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298464 -> 132813645298272 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298080 -> 132813645298464 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132815942520016 -> 132813645298704 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298272 -> 132813645298704 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298704 -> 132813645298512 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645299136 -> 132813645298800 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645299088 -> 132813645299136 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298800 -> 132813645298944 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298512 -> 132813645298944 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645298944 -> 132813645299424 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293616 -> 132813645299664 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293280 -> 132813645299664 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293664 -> 132813645293616 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645299616 -> 132813645293280 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293568 -> 132813645293280 [arrowsize=0.8 color=gray40 penwidth=1.2] - 132813645293424 -> 132813645293280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907280 -> 129052233125168 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049975249392 -> 129049935907280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907424 -> 129049975250448 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907568 -> 129049935907424 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129052233125168 -> 129049935907472 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049975250448 -> 129049935907472 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907472 -> 129049935907760 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907952 -> 129049935907616 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935908240 -> 129049935907952 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907616 -> 129049935908432 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935907760 -> 129049935908432 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935908432 -> 129049935908144 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935908624 -> 129049935908576 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909056 -> 129049935908576 [arrowsize=0.8 
color=gray40 penwidth=1.2] + 129049935908912 -> 129049935908624 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909008 -> 129049935909056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909200 -> 129049935909056 [arrowsize=0.8 color=gray40 penwidth=1.2] + 129049935909152 -> 129049935909056 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/1_init_zeroones.dot.svg b/front/py/examples/2_ir/1_init_zeroones.dot.svg index 474bbbd8..b5bf6542 100644 --- a/front/py/examples/2_ir/1_init_zeroones.dot.svg +++ b/front/py/examples/2_ir/1_init_zeroones.dot.svg @@ -9,244 +9,244 @@ %3 - + -132815942520016 +129052233125168 t1 (3, 4, 5) - + -132813645298704 +129049935907472 add - + -132815942520016->132813645298704 +129052233125168->129049935907472 - + -132813646230768 +129049935907280 constant - + -132813646230768->132815942520016 +129049935907280->129052233125168 - + -132814271881056 +129049975249392 var_1 0 - + -132814271881056->132813646230768 +129049975249392->129049935907280 - + -132813645298272 +129049975250448 t2 (3, 4, 5) - + -132813645298272->132813645298704 +129049975250448->129049935907472 - + -132813645298464 +129049935907424 constant - + -132813645298464->132813645298272 +129049935907424->129049975250448 - + -132813645298080 +129049935907568 var_2 1 - + -132813645298080->132813645298464 +129049935907568->129049935907424 - + -132813645298512 +129049935907760 t3 (3, 4, 5) - + -132813645298704->132813645298512 +129049935907472->129049935907760 - + -132813645298944 +129049935908432 add - + -132813645298512->132813645298944 +129049935907760->129049935908432 - + -132813645298800 +129049935907616 t4 (3, 4, 5) - + -132813645298800->132813645298944 +129049935907616->129049935908432 - + -132813645299136 +129049935907952 constant - + -132813645299136->132813645298800 +129049935907952->129049935907616 - + -132813645299088 +129049935908240 var_3 0.5 - + -132813645299088->132813645299136 +129049935908240->129049935907952 - + -132813645299424 +129049935908144 t5 (3, 4, 5) - + -132813645298944->132813645299424 +129049935908432->129049935908144 - + -132813645299664 +129049935908576 tensor_6 (3, 4, 5) - + -132813645293616 +129049935908624 constant - + -132813645293616->132813645299664 +129049935908624->129049935908576 - + -132813645293664 +129049935908912 var_4 0 - + -132813645293664->132813645293616 +129049935908912->129049935908624 - + -132813645293280 +129049935909056 uniform - + -132813645293280->132813645299664 +129049935909056->129049935908576 - + -132813645299616 +129049935909008 var_5 -0.5477225575051661 - + -132813645299616->132813645293280 +129049935909008->129049935909056 - + -132813645293568 +129049935909200 var_6 0.5477225575051661 - + -132813645293568->132813645293280 +129049935909200->129049935909056 - + -132813645293424 +129049935909152 var_7 0 - + -132813645293424->132813645293280 +129049935909152->129049935909056 From fd2ed6aa9e1baf4b0575980e24370cf62d3e66b6 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 6 Apr 2025 19:41:28 +0800 Subject: [PATCH 7/7] front:Apply --- front/py/deepx/autograd/function.py | 4 + front/py/deepx/nn/functional/elementwise.py | 20 +- front/py/examples/2_ir/2_elementwise_add.dot | 40 ++-- .../examples/2_ir/2_elementwise_add.dot.svg | 80 ++++---- .../examples/2_ir/2_elementwise_operator.dot | 92 ++++----- .../2_ir/2_elementwise_operator.dot.svg | 184 +++++++++--------- .../examples/2_ir/2_elementwise_sqrtlog.dot | 46 ++--- .../2_ir/2_elementwise_sqrtlog.dot.svg | 92 ++++----- front/py/examples/2_ir/3_matmul.dot 
| 30 +-- front/py/examples/2_ir/3_matmul.dot.svg | 60 +++--- 10 files changed, 326 insertions(+), 322 deletions(-) diff --git a/front/py/deepx/autograd/function.py b/front/py/deepx/autograd/function.py index e9f5ff06..98f63def 100644 --- a/front/py/deepx/autograd/function.py +++ b/front/py/deepx/autograd/function.py @@ -18,6 +18,10 @@ def save_data(self, key, value): def get_data(self, key): return self._non_tensor_data.get(key) + @property + def requires_grad(self): + return self._requires_grad + class Function: @staticmethod def forward(ctx:Context, *args, **kwargs): diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index 358c88d0..7b524afc 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -193,7 +193,7 @@ def mul( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor): - return Mul.apply(a,b,out,author,requires_grad) + return Mul.apply(a,b,out,author,requires_grad=requires_grad) else: return MulScalar.apply(a,b,out,author,requires_grad=requires_grad) @@ -228,10 +228,10 @@ def backward(ctx:Context, out_grad): OpNode.register("rdivscalar") class RDivScalar(Function): @staticmethod - def forward(ctx:Context, a, b,out,author='miaobyte'): + def forward(ctx:Context, a,b,out,author='miaobyte'): if ctx.requires_grad: ctx.save_data('b',b) - return _A_b_elementwiseop_C(a, b, "rdivscalar", out,author) + return _a_B_elementwiseop_C(a, b, "rdivscalar", out,author) @staticmethod def backward(ctx:Context, out_grad): @@ -244,14 +244,14 @@ def div( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,Tensor) and isinstance(a,Tensor): - return Div.apply(a,b,out,author,requires_grad) + return Div.apply(a,b,out,author,requires_grad=requires_grad) else: if isinstance(a,Tensor): #C=A/b - return DivScalar.apply(a,b,"divscalar",out,author,requires_grad=requires_grad) + return DivScalar.apply(a,b,out,author,requires_grad=requires_grad) else: #C=a/B - return RDivScalar.apply(a,b,"rdivscalar",out,author,requires_grad=requires_grad) + return RDivScalar.apply(a,b,out,author,requires_grad=requires_grad) OpNode.register("compare") class Compare(Function): @@ -299,9 +299,9 @@ def max( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return MaxScalar.apply(a,b,"maxscalar",out,author,requires_grad) + return MaxScalar.apply(a,b,out,author,requires_grad) else: - return Max.apply(a,b,"max",out,author,requires_grad=requires_grad) + return Max.apply(a,b,out,author,requires_grad=requires_grad) OpNode.register("min") @@ -337,9 +337,9 @@ def min( requires_grad:bool=False, author='miaobyte')->Tensor: if isinstance(b,int) or isinstance(b,float): - return MinScalar.apply(a,b,"minscalar",out,author,requires_grad=requires_grad) + return MinScalar.apply(a,b,out,author,requires_grad=requires_grad) else: - return Min.apply(a,b,"min",out,author,requires_grad=requires_grad) + return Min.apply(a,b,out,author,requires_grad=requires_grad) #clamp,TODO diff --git a/front/py/examples/2_ir/2_elementwise_add.dot b/front/py/examples/2_ir/2_elementwise_add.dot index 02ebe50a..1c2ae8c9 100644 --- a/front/py/examples/2_ir/2_elementwise_add.dot +++ b/front/py/examples/2_ir/2_elementwise_add.dot @@ -2,30 +2,30 @@ digraph { rankdir=TB node [shape=record] - 124025483544560 [label="t1 + 125643920431552 [label="t1 (2, 3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089136 [label=constant 
color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124025483558240 [label="var_1 + 125643920431744 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643920432608 [label="var_1 10" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224086304 [label="t2 + 125645612091328 [label="t2 (2, 3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089184 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124023224085296 [label="var_2 + 125643918940608 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643918940416 [label="var_2 10" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224088848 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124023224089376 [label="t3 + 125643918940464 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643918940272 [label="t3 (2, 3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089568 [label=addscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124023224089520 [label="var_3 + 125643918940128 [label=addscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 125643918939936 [label="var_3 0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124023224089136 -> 124025483544560 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124025483558240 -> 124023224089136 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089184 -> 124023224086304 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224085296 -> 124023224089184 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124025483544560 -> 124023224088848 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224086304 -> 124023224088848 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224088848 -> 124023224089376 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089568 -> 124023224089376 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089376 -> 124023224089568 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124023224089520 -> 124023224089568 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643920431744 -> 125643920431552 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643920432608 -> 125643920431744 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940608 -> 125645612091328 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940416 -> 125643918940608 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643920431552 -> 125643918940464 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125645612091328 -> 125643918940464 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940464 -> 125643918940272 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940128 -> 125643918940272 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918940272 -> 125643918940128 [arrowsize=0.8 color=gray40 penwidth=1.2] + 125643918939936 -> 125643918940128 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/2_elementwise_add.dot.svg b/front/py/examples/2_ir/2_elementwise_add.dot.svg index 16598880..86968ef0 100644 
--- a/front/py/examples/2_ir/2_elementwise_add.dot.svg +++ b/front/py/examples/2_ir/2_elementwise_add.dot.svg @@ -9,129 +9,129 @@ %3 - + -124025483544560 +125643920431552 t1 (2, 3, 4) - + -124023224088848 +125643918940464 add - + -124025483544560->124023224088848 +125643920431552->125643918940464 - + -124023224089136 +125643920431744 constant - + -124023224089136->124025483544560 +125643920431744->125643920431552 - + -124025483558240 +125643920432608 var_1 10 - + -124025483558240->124023224089136 +125643920432608->125643920431744 - + -124023224086304 +125645612091328 t2 (2, 3, 4) - + -124023224086304->124023224088848 +125645612091328->125643918940464 - + -124023224089184 +125643918940608 constant - + -124023224089184->124023224086304 +125643918940608->125645612091328 - + -124023224085296 +125643918940416 var_2 10 - + -124023224085296->124023224089184 +125643918940416->125643918940608 - + -124023224089376 +125643918940272 t3 (2, 3, 4) - + -124023224088848->124023224089376 +125643918940464->125643918940272 - + -124023224089568 +125643918940128 addscalar - + -124023224089376->124023224089568 +125643918940272->125643918940128 - + -124023224089568->124023224089376 +125643918940128->125643918940272 - + -124023224089520 +125643918939936 var_3 0.5 - + -124023224089520->124023224089568 +125643918939936->125643918940128 diff --git a/front/py/examples/2_ir/2_elementwise_operator.dot b/front/py/examples/2_ir/2_elementwise_operator.dot index b39fa214..46fb2a94 100644 --- a/front/py/examples/2_ir/2_elementwise_operator.dot +++ b/front/py/examples/2_ir/2_elementwise_operator.dot @@ -2,63 +2,63 @@ digraph { rankdir=TB node [shape=record] - 134854829346096 [label="t1 + 130352998993200 [label="t1 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854521156512 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854521844832 [label="var_1 + 130350739524192 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350741117520 [label="var_1 0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462386816 [label="t2 + 130350741118576 [label="t2 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387008 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462386624 [label="var_2 + 130350739524336 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739524480 [label="var_2 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387248 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462387056 [label="t3 + 130350739524384 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739524672 [label="t3 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387344 [label="t4 + 130350739524528 [label="t4 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387680 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462387632 
[label="var_3 + 130350739524864 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525152 [label="var_3 0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462387488 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462387776 [label="t5 + 130350739525104 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525296 [label="t5 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388016 [label="t6 + 130350739525440 [label="t6 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388400 [label=div color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462388256 [label=rdivscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462388352 [label="var_4 + 130350739525632 [label=div color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525776 [label=rdivscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739525968 [label="var_4 0.05" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388688 [label="t7 + 130350739526016 [label="t7 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388832 [label=mulscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462388880 [label="var_5 + 130350739526304 [label=mulscalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739526352 [label="var_5 2.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854462388736 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134854462389168 [label="t8 + 130350739526544 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 130350739526256 [label="t8 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134854521156512 -> 134854829346096 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854521844832 -> 134854521156512 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387008 -> 134854462386816 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386624 -> 134854462387008 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854829346096 -> 134854462387248 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462387248 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387248 -> 134854462387056 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387680 -> 134854462387344 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387632 -> 134854462387680 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387344 -> 134854462387488 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387056 -> 134854462387488 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462387488 -> 134854462387776 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388400 -> 134854462388016 [arrowsize=0.8 
color=gray40 penwidth=1.2] - 134854829346096 -> 134854462388400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462388400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388352 -> 134854462388256 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462388256 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388256 -> 134854462388688 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388832 -> 134854462388688 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388688 -> 134854462388832 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388880 -> 134854462388832 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388688 -> 134854462388736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462386816 -> 134854462388736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134854462388736 -> 134854462389168 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524192 -> 130352998993200 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741117520 -> 130350739524192 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524336 -> 130350741118576 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524480 -> 130350739524336 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130352998993200 -> 130350739524384 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739524384 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524384 -> 130350739524672 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524864 -> 130350739524528 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525152 -> 130350739524864 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524528 -> 130350739525104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739524672 -> 130350739525104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525104 -> 130350739525296 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525632 -> 130350739525440 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130352998993200 -> 130350739525632 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739525632 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525968 -> 130350739525776 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739525776 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739525776 -> 130350739526016 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526304 -> 130350739526016 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526016 -> 130350739526304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526352 -> 130350739526304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526016 -> 130350739526544 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350741118576 -> 130350739526544 [arrowsize=0.8 color=gray40 penwidth=1.2] + 130350739526544 -> 130350739526256 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/2_elementwise_operator.dot.svg b/front/py/examples/2_ir/2_elementwise_operator.dot.svg index 1c50be16..bc9389b5 100644 --- a/front/py/examples/2_ir/2_elementwise_operator.dot.svg +++ b/front/py/examples/2_ir/2_elementwise_operator.dot.svg @@ -9,292 +9,292 @@ %3 - + -134854829346096 +130352998993200 t1 (3, 4, 5) - + -134854462387248 +130350739524384 add - + -134854829346096->134854462387248 +130352998993200->130350739524384 - + -134854462388400 +130350739525632 div - + -134854829346096->134854462388400 +130352998993200->130350739525632 - + -134854521156512 +130350739524192 constant - + -134854521156512->134854829346096 +130350739524192->130352998993200 - + -134854521844832 +130350741117520 var_1 0 - + 
-134854521844832->134854521156512 +130350741117520->130350739524192 - + -134854462386816 +130350741118576 t2 (3, 4, 5) - + -134854462386816->134854462387248 +130350741118576->130350739524384 - + -134854462386816->134854462388400 +130350741118576->130350739525632 - + -134854462388256 +130350739525776 rdivscalar - + -134854462386816->134854462388256 +130350741118576->130350739525776 - + -134854462388736 +130350739526544 mul - + -134854462386816->134854462388736 +130350741118576->130350739526544 - + -134854462387008 +130350739524336 constant - + -134854462387008->134854462386816 +130350739524336->130350741118576 - + -134854462386624 +130350739524480 var_2 1 - + -134854462386624->134854462387008 +130350739524480->130350739524336 - + -134854462387056 +130350739524672 t3 (3, 4, 5) - + -134854462387248->134854462387056 +130350739524384->130350739524672 - + -134854462387488 +130350739525104 add - + -134854462387056->134854462387488 +130350739524672->130350739525104 - + -134854462387344 +130350739524528 t4 (3, 4, 5) - + -134854462387344->134854462387488 +130350739524528->130350739525104 - + -134854462387680 +130350739524864 constant - + -134854462387680->134854462387344 +130350739524864->130350739524528 - + -134854462387632 +130350739525152 var_3 0.5 - + -134854462387632->134854462387680 +130350739525152->130350739524864 - + -134854462387776 +130350739525296 t5 (3, 4, 5) - + -134854462387488->134854462387776 +130350739525104->130350739525296 - + -134854462388016 +130350739525440 t6 (3, 4, 5) - + -134854462388400->134854462388016 +130350739525632->130350739525440 - + -134854462388688 +130350739526016 t7 (3, 4, 5) - + -134854462388256->134854462388688 +130350739525776->130350739526016 - + -134854462388352 +130350739525968 var_4 0.05 - + -134854462388352->134854462388256 +130350739525968->130350739525776 - + -134854462388832 +130350739526304 mulscalar - + -134854462388688->134854462388832 +130350739526016->130350739526304 - + -134854462388688->134854462388736 +130350739526016->130350739526544 - + -134854462388832->134854462388688 +130350739526304->130350739526016 - + -134854462388880 +130350739526352 var_5 2.5 - + -134854462388880->134854462388832 +130350739526352->130350739526304 - + -134854462389168 +130350739526256 t8 (3, 4, 5) - + -134854462388736->134854462389168 +130350739526544->130350739526256 diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot index 4b3d20f4..fa272de2 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot @@ -2,34 +2,34 @@ digraph { rankdir=TB node [shape=record] - 136548958820992 [label="t1 + 127569128262912 [label="t1 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136551216711568 [label="t2 + 127569128261520 [label="t2 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919477104 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477152 [label="var_1 + 127566868400736 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868400688 [label="var_1 2" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919476960 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477248 [label="t3 + 
127566868400880 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868401024 [label="t3 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919477728 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477632 [label="t4 + 127566868401456 [label=log color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868401360 [label="t4 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919478064 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919477968 [label="t5 + 127566868401792 [label=exp color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868401696 [label="t5 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919478400 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 136548919478304 [label="t6 + 127566868402128 [label=pow color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 127566868402032 [label="t6 (60,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 136548919477104 -> 136551216711568 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477152 -> 136548919477104 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548958820992 -> 136548919476960 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919476960 -> 136548919477248 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136551216711568 -> 136548919477728 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477728 -> 136548919477632 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477632 -> 136548919478064 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919478064 -> 136548919477968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477968 -> 136548919478400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919477248 -> 136548919478400 [arrowsize=0.8 color=gray40 penwidth=1.2] - 136548919478400 -> 136548919478304 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868400736 -> 127569128261520 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868400688 -> 127566868400736 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127569128262912 -> 127566868400880 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868400880 -> 127566868401024 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127569128261520 -> 127566868401456 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401456 -> 127566868401360 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401360 -> 127566868401792 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401792 -> 127566868401696 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401696 -> 127566868402128 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868401024 -> 127566868402128 [arrowsize=0.8 color=gray40 penwidth=1.2] + 127566868402128 -> 127566868402032 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg index a688a2a1..3ca8df83 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.dot.svg @@ -9,148 +9,148 @@ %3 - + 
-136548958820992 +127569128262912 t1 (60,) - + -136548919476960 +127566868400880 sqrt - + -136548958820992->136548919476960 +127569128262912->127566868400880 - + -136551216711568 +127569128261520 t2 (60,) - + -136548919477728 +127566868401456 log - + -136551216711568->136548919477728 +127569128261520->127566868401456 - + -136548919477104 +127566868400736 constant - + -136548919477104->136551216711568 +127566868400736->127569128261520 - + -136548919477152 +127566868400688 var_1 2 - + -136548919477152->136548919477104 +127566868400688->127566868400736 - + -136548919477248 +127566868401024 t3 (60,) - + -136548919476960->136548919477248 +127566868400880->127566868401024 - + -136548919478400 +127566868402128 pow - + -136548919477248->136548919478400 +127566868401024->127566868402128 - + -136548919477632 +127566868401360 t4 (60,) - + -136548919477728->136548919477632 +127566868401456->127566868401360 - + -136548919478064 +127566868401792 exp - + -136548919477632->136548919478064 +127566868401360->127566868401792 - + -136548919477968 +127566868401696 t5 (60,) - + -136548919478064->136548919477968 +127566868401792->127566868401696 - + -136548919477968->136548919478400 +127566868401696->127566868402128 - + -136548919478304 +127566868402032 t6 (60,) - + -136548919478400->136548919478304 +127566868402128->127566868402032 diff --git a/front/py/examples/2_ir/3_matmul.dot b/front/py/examples/2_ir/3_matmul.dot index f44682c8..232ef59c 100644 --- a/front/py/examples/2_ir/3_matmul.dot +++ b/front/py/examples/2_ir/3_matmul.dot @@ -2,24 +2,24 @@ digraph { rankdir=TB node [shape=record] - 135996949875968 [label="t1 + 137036194614528 [label="t1 (3, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994975499600 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135994976203776 [label="var_1 + 137033935129088 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137033936738800 [label="var_1 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994974384672 [label="t2 + 137033936739856 [label="t2 (4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994974384864 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135994974384480 [label="var_2 + 137033935129232 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137033935129376 [label="var_2 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994974385104 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 135994974385152 [label="tensor_3 + 137033935129280 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137033935129472 [label="tensor_3 (3, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 135994975499600 -> 135996949875968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994976203776 -> 135994975499600 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974384864 -> 135994974384672 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974384480 -> 135994974384864 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135996949875968 -> 135994974385104 
[arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974384672 -> 135994974385104 [arrowsize=0.8 color=gray40 penwidth=1.2] - 135994974385104 -> 135994974385152 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129088 -> 137036194614528 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033936738800 -> 137033935129088 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129232 -> 137033936739856 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129376 -> 137033935129232 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137036194614528 -> 137033935129280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033936739856 -> 137033935129280 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137033935129280 -> 137033935129472 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/3_matmul.dot.svg b/front/py/examples/2_ir/3_matmul.dot.svg index 3e1d97eb..d14a0efd 100644 --- a/front/py/examples/2_ir/3_matmul.dot.svg +++ b/front/py/examples/2_ir/3_matmul.dot.svg @@ -9,98 +9,98 @@ %3 - + -135996949875968 +137036194614528 t1 (3, 4) - + -135994974385104 +137033935129280 matmul - + -135996949875968->135994974385104 +137036194614528->137033935129280 - + -135994975499600 +137033935129088 constant - + -135994975499600->135996949875968 +137033935129088->137036194614528 - + -135994976203776 +137033936738800 var_1 1 - + -135994976203776->135994975499600 +137033936738800->137033935129088 - + -135994974384672 +137033936739856 t2 (4, 5) - + -135994974384672->135994974385104 +137033936739856->137033935129280 - + -135994974384864 +137033935129232 constant - + -135994974384864->135994974384672 +137033935129232->137033936739856 - + -135994974384480 +137033935129376 var_2 1 - + -135994974384480->135994974384864 +137033935129376->137033935129232 - + -135994974385152 +137033935129472 tensor_3 (3, 5) - + -135994974385104->135994974385152 +137033935129280->137033935129472
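
Editor's note on the autograd change in this series: PATCH 7/7 adds a read-only requires_grad property to Context in front/py/deepx/autograd/function.py, and the elementwise.py hunks rewrite call sites from positional requires_grad to requires_grad=requires_grad. Below is a minimal sketch of that pattern, not the real deepx implementation: the Context constructor signature and the keyword-only apply() shown here are assumptions made purely to illustrate why the keyword form is safer (a trailing positional flag can no longer be absorbed by a later positional parameter such as author).

class Context:
    def __init__(self, requires_grad=False):
        self._non_tensor_data = {}
        self._requires_grad = requires_grad

    def save_data(self, key, value):
        self._non_tensor_data[key] = value

    def get_data(self, key):
        return self._non_tensor_data.get(key)

    @property
    def requires_grad(self):
        # mirrors the property added in PATCH 7/7
        return self._requires_grad

class Function:
    @staticmethod
    def forward(ctx, *args, **kwargs):
        raise NotImplementedError

    @staticmethod
    def backward(ctx, out_grad):
        raise NotImplementedError

    @classmethod
    def apply(cls, *args, requires_grad=False, **kwargs):
        # requires_grad is keyword-only here (assumption): a call like
        # Add.apply(a, b, out, author, flag) cannot silently bind flag to a
        # positional parameter of forward(), which is the failure mode the
        # requires_grad=requires_grad rewrites in this patch guard against.
        ctx = Context(requires_grad=requires_grad)
        return cls.forward(ctx, *args, **kwargs)

class SubScalar(Function):
    @staticmethod
    def forward(ctx, a, scalar, out='', author='miaobyte'):
        if ctx.requires_grad:
            ctx.save_data('scalar', scalar)  # stash the non-tensor operand for backward
        return a - scalar                    # placeholder for the real elementwise kernel

    @staticmethod
    def backward(ctx, out_grad):
        return out_grad                      # d(a - scalar)/da = 1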
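
The functional wrappers touched in front/py/deepx/nn/functional/elementwise.py dispatch on operand types: Tensor-Tensor pairs go to Add/Sub/Mul/Div/Max/Min/Pow, a Tensor paired with a Python scalar goes to the *Scalar variants, and a scalar divided by a Tensor goes to RDivScalar (PATCH 7/7 also reroutes it through _a_B_elementwiseop_C instead of _A_b_elementwiseop_C). A hedged usage sketch follows; the import path and the Tensor(shape=...) constructor are assumptions based on the operator table above, not verified API.

from deepx import Tensor                  # assumed import path
import deepx.nn.functional as F           # assumed re-export of elementwise ops

a = Tensor(shape=[3, 4, 5])
b = Tensor(shape=[3, 4, 5])

t3 = F.add(a, b, requires_grad=True)      # Tensor + Tensor -> Add
t4 = F.add(t3, 0.5, requires_grad=True)   # Tensor + scalar -> AddScalar
t5 = F.div(a, b)                          # Tensor / Tensor -> Div
t6 = F.div(a, 0.05)                       # Tensor / scalar -> DivScalar
t7 = F.div(0.05, a)                       # scalar / Tensor -> RDivScalar
t8 = F.mul(t7, b)                         # Tensor * Tensor -> Mul

This mirrors the graph built by examples/2_ir/2_elementwise_operator.dot, whose node-id-only changes make up most of the .dot and .svg hunks in this series.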