From 7a343f0cc9de16f81cc2848b3dd304606fbc423a Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sat, 19 Apr 2025 19:35:30 +0800 Subject: [PATCH 1/3] =?UTF-8?q?normal:=E5=88=9D=E5=A7=8B=E5=8C=96=E3=80=82?= =?UTF-8?q?=E8=A7=A3=E4=BA=86=E4=B8=80=E5=A4=A9bug=EF=BC=8C=E6=9C=80?= =?UTF-8?q?=E5=90=8E=E5=8F=91=E7=8E=B0init.hpp=20switch=E9=87=8C=E6=BC=8F?= =?UTF-8?q?=E5=86=99=E4=BA=86break=EF=BC=8C=E5=AF=BC=E8=87=B4normal?= =?UTF-8?q?=E8=B0=83=E7=94=A8=E5=90=8E=E6=95=B0=E6=8D=AE=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- front/py/deepx/nn/modules/sparse.py | 41 +++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 front/py/deepx/nn/modules/sparse.py diff --git a/front/py/deepx/nn/modules/sparse.py b/front/py/deepx/nn/modules/sparse.py new file mode 100644 index 00000000..cab749e0 --- /dev/null +++ b/front/py/deepx/nn/modules/sparse.py @@ -0,0 +1,41 @@ +from .module import Module +from deepx.tensor import Tensor + +class Embedding(Module): + def __init__(self, + num_embeddings:int, + embedding_dim:int, + padding_idx:int=None, + max_norm:float=None, + norm_type:float=2.0, + scale_grad_by_freq:bool=False, + sparse:bool=False): + super(Embedding, self).__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + self.weight = Tensor(num_embeddings, embedding_dim) + self.weight.uniform_(-0.01, 0.01) + if padding_idx is not None: + self.weight[padding_idx] = 0 + + def forward(self, input:Tensor)->Tensor: + return self.weight[input] + + def backward(self, grad:Tensor)->Tensor: + self.weight.grad = grad + return None + + def __str__(self)->str: + return f"Embedding(num_embeddings={self.num_embeddings}, embedding_dim={self.embedding_dim})" + + def __repr__(self)->str: 
+ return self.__str__() + + def __len__(self)->int: + return self.num_embeddings + From c7d9ad1aa6efa7168b2723a04692a1a738861ed2 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sat, 19 Apr 2025 19:35:38 +0800 Subject: [PATCH 2/3] =?UTF-8?q?normal:=E5=88=9D=E5=A7=8B=E5=8C=96=E3=80=82?= =?UTF-8?q?=E8=A7=A3=E4=BA=86=E4=B8=80=E5=A4=A9bug=EF=BC=8C=E6=9C=80?= =?UTF-8?q?=E5=90=8E=E5=8F=91=E7=8E=B0init.hpp=20switch=E9=87=8C=E6=BC=8F?= =?UTF-8?q?=E5=86=99=E4=BA=86break=EF=BC=8C=E5=AF=BC=E8=87=B4normal?= =?UTF-8?q?=E8=B0=83=E7=94=A8=E5=90=8E=E6=95=B0=E6=8D=AE=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/excuter/op-mem-cuda/list.md | 12 +- doc/excuter/op-mem-ompsimd/list.md | 22 ++- excuter/cpp-common/src/deepx/dtype.hpp | 4 +- .../cpp-common/src/deepx/tensorfunc/init.hpp | 20 +- excuter/cpp-common/src/stdutil/print.hpp | 18 +- excuter/op-mem-cuda/src/client/tfs.cpp | 8 + .../op-mem-cuda/src/deepx/tensorfunc/cuda.hpp | 33 ++-- .../src/deepx/tensorfunc/init_miaobyte.cu | 171 ++++++++++------- .../src/deepx/tensorfunc/init_miaobyte.cuh | 68 ++----- .../src/deepx/tensorfunc/init_miaobyte.hpp | 28 ++- excuter/op-mem-cuda/src/deepx/tf/init.hpp | 172 ++++++++++++------ excuter/op-mem-ompsimd/src/client/tfs.cpp | 9 + .../src/deepx/tensorfunc/init_miaobyte.hpp | 34 +++- excuter/op-mem-ompsimd/src/deepx/tf/init.hpp | 62 ++++++- front/py/deepx/nn/functional/authormap.py | 1 + front/py/deepx/nn/functional/leaffunc_init.py | 29 ++- front/py/deepx/nn/functional/rtf_init.py | 7 + front/py/deepx/tensor/init.py | 26 +-- front/py/deepx/tensor/tensor.py | 30 +-- front/py/examples/2_ir/1_init_zeroones.py | 52 ++++-- 20 files changed, 517 insertions(+), 289 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 2d7a12fb..8704844c 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -10,10 +10,13 @@ | transpose | 
miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | | reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | | matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor result) | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | +| greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | | equalscalar | miaobyte | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | | prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | | maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | +| normal | miaobyte | normal(tensor t, var mean, var stddev, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var stddev, var seed)->() | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | | log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | @@ -24,22 +27,24 @@ | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | 
add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() | +| greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | | print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | | print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | +| less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | | constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | | powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | -| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | -| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | rpowscalar | miaobyte | rpowscalar(var 
scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | | equal | miaobyte | equal(tensor A, tensor B, var epsilon)->(tensor mask) | mask=compare(T1, T2) | equal(tensor A, tensor B, var epsilon)->(tensor mask) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | @@ -48,5 +53,6 @@ | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | +| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | | deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | | cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | diff --git a/doc/excuter/op-mem-ompsimd/list.md 
b/doc/excuter/op-mem-ompsimd/list.md index 68ea3b70..65ffb758 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -11,8 +11,10 @@ | reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | | matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1,scalar) | comparescalar(tensor A, var scalar)->(tensor mask) | -| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1,T2) | compare(tensor A, tensor B)->(tensor mask) | +| switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | +| greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | +| equalscalar | miaobyte | equalscalar(tensor A, var scalar)->(tensor mask) | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | +| normal | miaobyte | normal(tensor t, var mean, var std, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var std, var seed)->() | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | | log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | @@ -23,20 +25,27 @@ | copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() | | prod 
| miaobyte | prod(tensor A, vector axis, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | | print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | | print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | +| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | | deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | +| less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | | constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | | powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | -| reducemin | miaobyte | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | -| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| minscalar | miaobyte | 
minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | | sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | | sum | miaobyte | sum(tensor A, vector axis, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | +| reducemin | miaobyte | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | +| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| equal | miaobyte | equal(tensor A, tensor B)->(tensor mask) | mask=equal(T1,T2) | equal(tensor A, tensor B)->(tensor mask) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | | div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | | invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | @@ -45,6 +54,3 @@ | maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| rdivscalar 
| miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | diff --git a/excuter/cpp-common/src/deepx/dtype.hpp b/excuter/cpp-common/src/deepx/dtype.hpp index 810566c8..5b9551a9 100644 --- a/excuter/cpp-common/src/deepx/dtype.hpp +++ b/excuter/cpp-common/src/deepx/dtype.hpp @@ -174,6 +174,7 @@ namespace deepx return 8; case Precision::Float8E4M3: return 8; + //TODO 需要根据平台支持 case Precision::Float4E2M1: return 4; case Precision::Int64: @@ -184,10 +185,11 @@ namespace deepx return 16; case Precision::Int8: return 8; + //TODO,int4 需要根据平台支持 case Precision::Int4: return 4; case Precision::Bool: - return 1; + return 8; case Precision::String: case Precision::Any: default: diff --git a/excuter/cpp-common/src/deepx/tensorfunc/init.hpp b/excuter/cpp-common/src/deepx/tensorfunc/init.hpp index 56e106e5..fdca437b 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/init.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/init.hpp @@ -6,6 +6,7 @@ namespace deepx::tensorfunc { + //constant template struct constantDispatcher { @@ -17,7 +18,8 @@ namespace deepx::tensorfunc { constantDispatcher::constant(tensor, value); } - + + //arange template struct arangeDispatcher { @@ -30,6 +32,7 @@ namespace deepx::tensorfunc arangeDispatcher::arange(tensor, start, step); } + //uniform template struct uniformDispatcher { @@ -41,6 +44,19 @@ namespace deepx::tensorfunc { uniformDispatcher::uniform(tensor, low, high, seed); } -} + //normal + template + struct normalDispatcher + { + static void normal(Tensor &tensor, const T mean , const T stddev , const unsigned int seed) = delete; + }; + + template + void normal(Tensor &tensor, const T mean = T(0), const T stddev = 
T(1), const unsigned int seed = 0) + { + normalDispatcher::normal(tensor, mean, stddev, seed); + } + +} #endif diff --git a/excuter/cpp-common/src/stdutil/print.hpp b/excuter/cpp-common/src/stdutil/print.hpp index 139575b4..046a4c83 100644 --- a/excuter/cpp-common/src/stdutil/print.hpp +++ b/excuter/cpp-common/src/stdutil/print.hpp @@ -35,16 +35,22 @@ namespace stdutil case Precision::Float64: printf(format.c_str(), ((double *)data)[offset]); break; - case Precision::Float32: - printf(format.c_str(), ((float *)data)[offset]); + case Precision::Float32:{ + float result = ((float *)data)[offset]; + printf(format.c_str(), result); break; - case Precision::Float16: - printf(format.c_str(), ((float *)data)[offset]); + } + case Precision::Float16:{ + float result = ((float *)data)[offset]; + printf(format.c_str(), result); break; - case Precision::BFloat16: - printf(format.c_str(), ((float *)data)[offset]); + } + case Precision::BFloat16:{ + float result = ((float *)data)[offset]; + printf(format.c_str(), result); break; } + } } inline std::string default_format(const deepx::Precision &dtype) diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index c64973ca..f92bcfc4 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -93,6 +93,14 @@ namespace deepx::tf Param("seed", DataCategory::Var, Precision::Int32), }), vector())); + tffactory.add_tf(std::make_shared>(vector( + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("mean", DataCategory::Var, Precision::Any), + Param("stddev", DataCategory::Var, Precision::Any), + Param("seed", DataCategory::Var, Precision::Int32), + }), + vector())); } // io void register_util(TfFactory &opfactory) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index a9b6886f..7dc0320e 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ 
b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -17,7 +17,6 @@ namespace deepx::tensorfunc throw std::runtime_error("Failed to create cuBLAS handle"); } } - ~CublasHandle() { if (handle_) @@ -29,26 +28,36 @@ namespace deepx::tensorfunc private: cublasHandle_t handle_; }; - - inline std::pair BestDims(int total_elements) + //TODO + inline int deviceblocksize(){ + int device_id; + cudaGetDevice(&device_id); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_id); + return props.maxThreadsPerBlock; + } + inline int deviceblock() { - // 默认块大小 - int optimal_block_size = 256; // 一般256或512是较好的选择 - // 计算设备属性以确定最佳配置 int device_id; cudaGetDevice(&device_id); cudaDeviceProp props; cudaGetDeviceProperties(&props, device_id); - // 根据SM数量和每个SM的最大线程数决定块数 + // 根据SM数量计算建议的块数上限 int sm_count = props.multiProcessorCount; int optimal_blocks = sm_count * 8; // 每个SM分配多个块以增加并行度 + return optimal_blocks; + } - // 确保至少启动足够的线程来处理所有数据 - int min_blocks = (total_elements + optimal_block_size - 1) / optimal_block_size; - int actual_blocks = std::min(optimal_blocks, min_blocks); - - return {actual_blocks, optimal_block_size}; + // 计算最佳的块大小和块数 + inline std::pair BestDims(int total_elements) + { + // 默认块大小 + int blocksize = total_elements > 256 ? 
256 : total_elements; + int blocks = (total_elements + blocksize - 1) / blocksize; // 向上取整除法 + int optimal_blocks = deviceblock(); + blocks = std::min(blocks, optimal_blocks); + return {blocks, blocksize}; }; } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu index aaeed355..0e98773c 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu @@ -4,123 +4,154 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/authors.hpp" - +#include "deepx/tensorfunc/cuda.hpp" namespace deepx::tensorfunc { template - __global__ void kernel_constant(T *data, const int size, const T value) + __global__ void kernel_constant(T *data, const T value, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + int stride = blockDim.x * gridDim.x; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) { data[idx] = value; } } - template __global__ void kernel_constant(double *data, const int size, const double value); - template __global__ void kernel_constant(float *data, const int size, const float value); - template __global__ void kernel_constant(half *data, const int size, const half value); - template __global__ void kernel_constant(nv_bfloat16 *data, const int size, const nv_bfloat16 value); - template __global__ void kernel_constant(int64_t *data, const int size, const int64_t value); - template __global__ void kernel_constant(int32_t *data, const int size, const int32_t value); - template __global__ void kernel_constant(int16_t *data, const int size, const int16_t value); - template __global__ void kernel_constant(int8_t *data, const int size, const int8_t value); template - void launch_constant(const int numBlocks, const int blockSize, T *a, const T value, const int size) + void launch_constant(T *a, const T value, const int size) { - kernel_constant<<>>(a, 
size, value); + auto [numBlocks, blockSize] = BestDims(size); + kernel_constant<<>>(a, value, size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) throw std::runtime_error("Failed to launch constant kernel"); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) + throw std::runtime_error("Failed to synchronize device"); } - template void launch_constant(const int numBlocks, const int blockSize, double *a, const double value, const int size); - template void launch_constant(const int numBlocks, const int blockSize, float *a, const float value, const int size); - template void launch_constant(const int numBlocks, const int blockSize, half *a, const half value, const int size); - template void launch_constant(const int numBlocks, const int blockSize, nv_bfloat16 *a, const nv_bfloat16 value, const int size); - template void launch_constant(const int numBlocks, const int blockSize, int64_t *a, const int64_t value, const int size); - template void launch_constant(int numBlocks, int blockSize, int32_t *a, int32_t value, int size); - template void launch_constant(const int numBlocks, const int blockSize, int16_t *a, const int16_t value, const int size); - template void launch_constant(const int numBlocks, const int blockSize, int8_t *a, const int8_t value, const int size); + template void launch_constant(double *a, const double value, const int size); + template void launch_constant(float *a, const float value, const int size); + template void launch_constant(half *a, const half value, const int size); + template void launch_constant(nv_bfloat16 *a, const nv_bfloat16 value, const int size); + template void launch_constant(int64_t *a, const int64_t value, const int size); + template void launch_constant(int32_t *a, const int32_t value, const int size); + template void launch_constant(int16_t *a, const int16_t value, const int size); + template void launch_constant(int8_t *a, const int8_t value, const int size); + template void launch_constant(bool *a, 
const bool value, const int size); // 添加kernel函数 template - __global__ void kernel_arange(T *data, const int size, const T start, const T step) + __global__ void kernel_arange(T *data, const float start, const float step, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) + int stride = blockDim.x * gridDim.x; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) { - data[idx] = start + step * static_cast(static_cast(idx)); + data[idx] = static_cast(start + step * static_cast(idx)); } } - template __global__ void kernel_arange(double *data, const int size, const double start, const double step); - template __global__ void kernel_arange(float *data, const int size, const float start, const float step); - template __global__ void kernel_arange(half *data, const int size, const half start, const half step); - template __global__ void kernel_arange(nv_bfloat16 *data, const int size, const nv_bfloat16 start, const nv_bfloat16 step); - template __global__ void kernel_arange(int64_t *data, const int size, const int64_t start, const int64_t step); - template __global__ void kernel_arange(int32_t *data, const int size, const int32_t start, const int32_t step); - template __global__ void kernel_arange(int16_t *data, const int size, const int16_t start, const int16_t step); - template __global__ void kernel_arange(int8_t *data, const int size, const int8_t start, const int8_t step); template - void launch_arange(const int numBlocks, const int blockSize, T *a, const T start, const T step, const int size) + void launch_arange(T *a, const T start, const T step, const int size) { - kernel_arange<<>>(a, size, start, step); + auto [numBlocks, blockSize] = BestDims(size); + kernel_arange<<>>(a, static_cast(start), static_cast(step), size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) throw std::runtime_error("Failed to launch arange kernel"); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) + 
throw std::runtime_error("Failed to synchronize device"); } - template void launch_arange(const int numBlocks, const int blockSize, double *a, const double start, const double step, const int size); - template void launch_arange(const int numBlocks, const int blockSize, float *a, const float start, const float step, const int size); - template void launch_arange(const int numBlocks, const int blockSize, half *a, const half start, const half step, const int size); - template void launch_arange(const int numBlocks, const int blockSize, nv_bfloat16 *a, const nv_bfloat16 start, const nv_bfloat16 step, const int size); - template void launch_arange(const int numBlocks, const int blockSize, int64_t *a, const int64_t start, const int64_t step, const int size); - template void launch_arange(const int numBlocks, const int blockSize, int32_t *a, const int32_t start, const int32_t step, const int size); - template void launch_arange(const int numBlocks, const int blockSize, int16_t *a, const int16_t start, const int16_t step, const int size); - template void launch_arange(const int numBlocks, const int blockSize, int8_t *a, const int8_t start, const int8_t step, const int size); + template void launch_arange(double *a, const double start, const double step, const int size); + template void launch_arange(float *a, const float start, const float step, const int size); + template void launch_arange(half *a, const half start, const half step, const int size); + template void launch_arange(nv_bfloat16 *a, const nv_bfloat16 start, const nv_bfloat16 step, const int size); + template void launch_arange(int64_t *a, const int64_t start, const int64_t step, const int size); + template void launch_arange(int32_t *a, const int32_t start, const int32_t step, const int size); + template void launch_arange(int16_t *a, const int16_t start, const int16_t step, const int size); + template void launch_arange(int8_t *a, const int8_t start, const int8_t step, const int size); // 添加kernel函数 
template - __global__ void kernel_uniform(T *data, const int size, const T low, const T high, const unsigned int seed) + __global__ void kernel_uniform(T *data, const float low, const float high, const unsigned int seed, const int size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) - { - // 为每个线程创建独立的随机数生成器状态 - curandState state; - curand_init(seed, idx, 0, &state); + int stride = blockDim.x * gridDim.x; + curandState state; + curand_init(seed, threadIdx.x, 0, &state); // 仅初始化一次 + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) + { // 生成[0,1)范围的随机数 float rand = curand_uniform(&state); // 先用float类型进行计算,然后转换为目标类型 - float result = static_cast(low) + (static_cast(high) - static_cast(low)) * rand; + float result = low + (high - low) * rand; + printf("threadIdx: %d, idx: %d, result: %f\n", threadIdx.x, idx, result); data[idx] = static_cast(result); } } - template __global__ void kernel_uniform(double *data, const int size, const double low, const double high, const unsigned int seed); - template __global__ void kernel_uniform(float *data, const int size, const float low, const float high, const unsigned int seed); - template __global__ void kernel_uniform(half *data, const int size, const half low, const half high, const unsigned int seed); - template __global__ void kernel_uniform(nv_bfloat16 *data, const int size, const nv_bfloat16 low, const nv_bfloat16 high, const unsigned int seed); - template __global__ void kernel_uniform(int64_t *data, const int size, const int64_t low, const int64_t high, const unsigned int seed); - template __global__ void kernel_uniform(int32_t *data, int size, int32_t low, int32_t high, unsigned int seed); - template __global__ void kernel_uniform(int16_t *data, const int size, const int16_t low, const int16_t high, const unsigned int seed); - template __global__ void kernel_uniform(int8_t *data, const int size, const int8_t low, const int8_t high, const unsigned int seed); template - 
void launch_uniform(const int numBlocks, const int blockSize, T *a, const T low, const T high, const unsigned int seed, const int size) + void launch_uniform(T *a, const T low, const T high, const unsigned int seed, const int size) { - kernel_uniform<<>>(a, size, low, high, seed); + auto [numBlocks, blockSize] = BestDims(size); + kernel_uniform<<>>(a, float(low), float(high), seed, size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) throw std::runtime_error("Failed to launch uniform kernel"); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) + throw std::runtime_error("Failed to synchronize device"); + } + template void launch_uniform(double *a, const double low, const double high, const unsigned int seed, const int size); + template void launch_uniform(float *a, const float low, const float high, const unsigned int seed, const int size); + template void launch_uniform(half *a, const half low, const half high, const unsigned int seed, const int size); + template void launch_uniform(nv_bfloat16 *a, const nv_bfloat16 low, const nv_bfloat16 high, const unsigned int seed, const int size); + template void launch_uniform(int64_t *a, const int64_t low, const int64_t high, const unsigned int seed, const int size); + template void launch_uniform(int32_t *a, const int32_t low, const int32_t high, const unsigned int seed, const int size); + template void launch_uniform(int16_t *a, const int16_t low, const int16_t high, const unsigned int seed, const int size); + template void launch_uniform(int8_t *a, const int8_t low, const int8_t high, const unsigned int seed, const int size); + + // normal + template + __global__ void kernel_normal(T *data, const float mean, const float stddev, const unsigned int seed, const int size) + { + int stride = blockDim.x * gridDim.x; + curandState state; + curand_init(seed, threadIdx.x, 0, &state); // 仅初始化一次 + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) + { + // 生成[0,1)范围的随机数 + 
float rand = curand_uniform(&state); + + // 先用float类型进行计算,然后转换为目标类型 + float result = rand; + // float result = mean + stddev * rand; + printf("threadIdx: %d, idx: %d, result: %f\n", threadIdx.x, idx, result); + data[idx] = static_cast(rand); + } + } + template + void launch_normal(T *a, const T mean, const T stddev, const unsigned int seed, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); + kernel_normal<<>>(a,float(mean), float(stddev), seed, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + throw std::runtime_error("Failed to launch normal kernel"); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) + throw std::runtime_error("Failed to synchronize device"); } - template void launch_uniform(const int numBlocks, const int blockSize, double *a, const double low, const double high, const unsigned int seed, const int size); - template void launch_uniform(const int numBlocks, const int blockSize, float *a, const float low, const float high, const unsigned int seed, const int size); - template void launch_uniform(const int numBlocks, const int blockSize, half *a, const half low, const half high, const unsigned int seed, const int size); - template void launch_uniform(const int numBlocks, const int blockSize, nv_bfloat16 *a, const nv_bfloat16 low, const nv_bfloat16 high, const unsigned int seed, const int size); - template void launch_uniform(const int numBlocks, const int blockSize, int64_t *a, const int64_t low, const int64_t high, const unsigned int seed, const int size); - template void launch_uniform(const int numBlocks, const int blockSize, int32_t *a, const int32_t low, const int32_t high, const unsigned int seed, const int size); - template void launch_uniform(const int numBlocks, const int blockSize, int16_t *a, const int16_t low, const int16_t high, const unsigned int seed, const int size); - template void launch_uniform(const int numBlocks, const int blockSize, int8_t *a, const int8_t low, const int8_t high, 
const unsigned int seed, const int size); + template void launch_normal(double *a, const double mean, const double stddev, const unsigned int seed, const int size); + template void launch_normal(float *a, const float mean, const float stddev, const unsigned int seed, const int size); + template void launch_normal(half *a, const half mean, const half stddev, const unsigned int seed, const int size); + template void launch_normal(nv_bfloat16 *a, const nv_bfloat16 mean, const nv_bfloat16 stddev, const unsigned int seed, const int size); + template void launch_normal(int64_t *a, const int64_t mean, const int64_t stddev, const unsigned int seed, const int size); + template void launch_normal(int32_t *a, const int32_t mean, const int32_t stddev, const unsigned int seed, const int size); + template void launch_normal(int16_t *a, const int16_t mean, const int16_t stddev, const unsigned int seed, const int size); + template void launch_normal(int8_t *a, const int8_t mean, const int8_t stddev, const unsigned int seed, const int size); + } \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cuh index b2c2c42c..1acbb322 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cuh @@ -11,73 +11,29 @@ namespace deepx::tensorfunc { template - __global__ void kernel_constant(T *data, const int size, const T value); + __global__ void kernel_constant(T *data, const T value, const int size); template - void launch_constant(const int numBlocks, const int blockSize, T *a, const T value, const int size); - - template <> - void launch_constant(const int numBlocks, const int blockSize, double *a, const double value, const int size); - template <> - void launch_constant(const int numBlocks, const int blockSize, float *a, const float value, const int size); - template <> - void launch_constant(const int 
numBlocks, const int blockSize, half *a, const half value, const int size); - template <> - void launch_constant(const int numBlocks, const int blockSize, nv_bfloat16 *a, const nv_bfloat16 value, const int size); - template <> - void launch_constant(const int numBlocks, const int blockSize, int64_t *a, const int64_t value, const int size); - template <> - void launch_constant(const int numBlocks, const int blockSize, int32_t *a, const int32_t value, const int size); - template <> - void launch_constant(const int numBlocks, const int blockSize, int16_t *a, const int16_t value, const int size); - template <> - void launch_constant(const int numBlocks, const int blockSize, int8_t *a, const int8_t value, const int size); - + void launch_constant(T *a, const T value, const int size); + template - __global__ void kernel_arange(T *data, const int size, const T start, const T step); + __global__ void kernel_arange(T *data, const float start, const float step, const int size); template - void launch_arange(const int numBlocks, const int blockSize, T *a, const T start, const T step, const int size); + void launch_arange(T *a, const T start, const T step, const int size); + + template + __global__ void kernel_uniform(T *data, const float low, const float high, const unsigned int seed, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, double *a, const double start, const double step, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, float *a, const float start, const float step, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, half *a, const half start, const half step, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, nv_bfloat16 *a, const nv_bfloat16 start, const nv_bfloat16 step, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, int64_t *a, const int64_t 
start, const int64_t step, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, int32_t *a, const int32_t start, const int32_t step, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, int16_t *a, const int16_t start, const int16_t step, const int size); - template <> - void launch_arange(const int numBlocks, const int blockSize, int8_t *a, const int8_t start, const int8_t step, const int size); + template + void launch_uniform(T *a, const T low, const T high, const unsigned int seed, const int size); template - __global__ void kernel_uniform(T *data, const int size, const T low, const T high, const unsigned int seed); + __global__ void kernel_normal(T *data, const float mean, const float stddev, const unsigned int seed, const int size); template - void launch_uniform(const int numBlocks, const int blockSize, T *a, const T low, const T high, const unsigned int seed, const int size); + void launch_normal(T *a, const T mean, const T stddev, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int blockSize, double *a, const double low, const double high, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int blockSize, float *a, const float low, const float high, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int blockSize, half *a, const half low, const half high, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int blockSize, nv_bfloat16 *a, const nv_bfloat16 low, const nv_bfloat16 high, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int blockSize, int64_t *a, const int64_t low, const int64_t high, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int 
blockSize, int32_t *a, const int32_t low, const int32_t high, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int blockSize, int16_t *a, const int16_t low, const int16_t high, const unsigned int seed, const int size); - template <> - void launch_uniform(const int numBlocks, const int blockSize, int8_t *a, const int8_t low, const int8_t high, const unsigned int seed, const int size); } #endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp index f909eefd..8340d2b6 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp @@ -1,43 +1,51 @@ #ifndef DEEPX_TENSORFUNC_INIT_MIAO_BYTE_HPP #define DEEPX_TENSORFUNC_INIT_MIAO_BYTE_HPP +#include + #include "deepx/tensorfunc/authors.hpp" #include "deepx/tensorfunc/init.hpp" #include "deepx/tensor.hpp" #include "deepx/tensorfunc/init_miaobyte.cuh" namespace deepx::tensorfunc { - // 分发器实现 + // constant template struct constantDispatcher { static void constant(Tensor &tensor, const T value) { - const int BLOCKSIZE = tensor.shape.size > 256 ? 256 : tensor.shape.size; - int numBlocks = (tensor.shape.size + BLOCKSIZE - 1) / BLOCKSIZE; - launch_constant(numBlocks, BLOCKSIZE, tensor.data, value, tensor.shape.size); + launch_constant(tensor.data, value, tensor.shape.size); } }; + // arange template struct arangeDispatcher { static void arange(Tensor &tensor, const T start, const T step) { - const int BLOCKSIZE = tensor.shape.size > 256 ? 
256 : tensor.shape.size; - int numBlocks = (tensor.shape.size + BLOCKSIZE - 1) / BLOCKSIZE; - launch_arange(numBlocks, BLOCKSIZE, tensor.data, start, step, tensor.shape.size); + launch_arange(tensor.data, start, step, tensor.shape.size); } }; + // uniform template struct uniformDispatcher { static void uniform(Tensor &tensor, const T low, const T high, const unsigned int seed) { - const int BLOCKSIZE = tensor.shape.size > 256 ? 256 : tensor.shape.size; - int numBlocks = (tensor.shape.size + BLOCKSIZE - 1) / BLOCKSIZE; - launch_uniform(numBlocks, BLOCKSIZE, tensor.data, low, high, seed, tensor.shape.size); + launch_uniform(tensor.data, low, high, seed, tensor.shape.size); + } + }; + + // normal + template + struct normalDispatcher + { + static void normal(Tensor &tensor, const T mean, const T stddev, const unsigned int seed) + { + launch_normal(tensor.data, mean, stddev, seed, tensor.shape.size); } }; } diff --git a/excuter/op-mem-cuda/src/deepx/tf/init.hpp b/excuter/op-mem-cuda/src/deepx/tf/init.hpp index 2e60ba62..bbdb883e 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/init.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/init.hpp @@ -33,66 +33,59 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override - { + { string name = this->args[0].textvalue; auto tensor = mem->gettensor(name).get(); - auto type=tensor->shape.dtype; + auto type = tensor->shape.dtype; switch (type) { case Precision::Float64: - { - auto output = mem->gettensor(name).get(); - tensorfunc::constant(*output, this->getvar(1, mem)); + + tensorfunc::constant(*mem->gettensor(name).get(), this->getvar(1, mem)); break; - } + case Precision::Float32: - { - auto output = mem->gettensor(name).get(); - tensorfunc::constant(*output, this->getvar(1, mem)); + + tensorfunc::constant(*mem->gettensor(name).get(), this->getvar(1, mem)); break; - } + case Precision::Float16: - { - auto output = mem->gettensor<__half>(name).get(); - tensorfunc::constant(*output, this->getvar<__half>(1, mem)); + + 
tensorfunc::constant(*mem->gettensor<__half>(name).get(), this->getvar<__half>(1, mem)); break; - } + case Precision::BFloat16: - { - auto output = mem->gettensor<__nv_bfloat16>(name).get(); - tensorfunc::constant(*output, this->getvar<__nv_bfloat16>(1, mem)); + + tensorfunc::constant(*mem->gettensor<__nv_bfloat16>(name).get(), this->getvar<__nv_bfloat16>(1, mem)); break; - } + case Precision::Int64: - { - auto output = mem->gettensor(name).get(); - tensorfunc::constant(*output, this->getvar(1, mem)); + + tensorfunc::constant(*mem->gettensor(name).get(), this->getvar(1, mem)); break; - } + case Precision::Int32: - { - auto output = mem->gettensor(name).get(); - tensorfunc::constant(*output, this->getvar(1, mem)); + + tensorfunc::constant(*mem->gettensor(name).get(), this->getvar(1, mem)); break; - } + case Precision::Int16: - { - auto output = mem->gettensor(name).get(); - tensorfunc::constant(*output, this->getvar(1, mem)); + + tensorfunc::constant(*mem->gettensor(name).get(), this->getvar(1, mem)); break; - } + case Precision::Int8: - { - auto output = mem->gettensor(name).get(); - tensorfunc::constant(*output, this->getvar(1, mem)); + + tensorfunc::constant(*mem->gettensor(name).get(), this->getvar(1, mem)); + break; + case Precision::Bool: + tensorfunc::constant(*mem->gettensor(name).get(), this->getvar(1, mem)); break; - } default: { error = "unsupported dtype: " + precision_str(type); return 1; } - } return 0; }; @@ -103,7 +96,7 @@ namespace deepx::tf shared_ptr clone() const override { return make_shared>(*this); - } + } }; template @@ -121,7 +114,7 @@ namespace deepx::tf { string name = this->args[0].textvalue; auto tensor = mem->gettensor(name).get(); - auto type=tensor->shape.dtype; + auto type = tensor->shape.dtype; switch (type) { case Precision::Float64: @@ -141,13 +134,13 @@ namespace deepx::tf auto output = mem->gettensor<__half>(name).get(); tensorfunc::arange(*output, this->getvar<__half>(1, mem), this->getvar<__half>(2, mem)); break; - } + } case 
Precision::BFloat16: { auto output = mem->gettensor<__nv_bfloat16>(name).get(); tensorfunc::arange(*output, this->getvar<__nv_bfloat16>(1, mem), this->getvar<__nv_bfloat16>(2, mem)); break; - } + } case Precision::Int64: { auto output = mem->gettensor(name).get(); @@ -177,7 +170,6 @@ namespace deepx::tf error = "unsupported dtype: " + precision_str(type); return 1; } - } return 0; } @@ -189,8 +181,8 @@ namespace deepx::tf { return make_shared>(*this); } - }; - + }; + template class Uniform : public TF { @@ -200,61 +192,62 @@ namespace deepx::tf this->name = "uniform"; this->author = Author::name(); this->args = args; - this->returns = returns; + this->returns = returns; } int run(shared_ptr mem, string &error) override { string name = this->args[0].textvalue; auto tensor = mem->gettensor(name).get(); - auto type=tensor->shape.dtype; + auto type = tensor->shape.dtype; + unsigned int seed = static_cast( this->getvar(3, mem)); switch (type) { case Precision::Float64: { auto output = mem->gettensor(name).get(); - tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), seed); break; } case Precision::Float32: { auto output = mem->gettensor(name).get(); - tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), seed); break; } case Precision::Float16: { auto output = mem->gettensor<__half>(name).get(); - tensorfunc::uniform(*output, this->getvar<__half>(1, mem), this->getvar<__half>(2, mem), this->getvar(3, mem)); + tensorfunc::uniform(*output, this->getvar<__half>(1, mem), this->getvar<__half>(2, mem), seed); break; } case Precision::BFloat16: { auto output = mem->gettensor<__nv_bfloat16>(name).get(); - tensorfunc::uniform(*output, this->getvar<__nv_bfloat16>(1, mem), this->getvar<__nv_bfloat16>(2, mem), this->getvar(3, mem)); + 
tensorfunc::uniform(*output, this->getvar<__nv_bfloat16>(1, mem), this->getvar<__nv_bfloat16>(2, mem), seed); break; - } + } case Precision::Int64: { auto output = mem->gettensor(name).get(); - tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), seed); break; } case Precision::Int32: { auto output = mem->gettensor(name).get(); - tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), seed); break; } case Precision::Int16: { auto output = mem->gettensor(name).get(); - tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), seed); break; } case Precision::Int8: { auto output = mem->gettensor(name).get(); - tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + tensorfunc::uniform(*output, this->getvar(1, mem), this->getvar(2, mem), seed); break; } default: @@ -262,7 +255,6 @@ namespace deepx::tf error = "unsupported dtype: " + precision_str(type); return 1; } - } return 0; } @@ -274,8 +266,76 @@ namespace deepx::tf { return make_shared>(*this); } - }; - + }; + + template + class Normal : public TF + { + public: + Normal(const vector &args, const vector &returns) + { + this->name = "normal"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "normal(T1,mean,stddev,seed)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + string name = this->args[0].textvalue; + auto tensor = mem->gettensor(name).get(); + auto type = tensor->shape.dtype; + unsigned int seed = static_cast( this->getvar(3, mem)); + switch (type) + { + 
case Precision::Float64: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), seed); + break; + + case Precision::Float32: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), seed); + break; + case Precision::Float16: + tensorfunc::normal(*mem->gettensor<__half>(name).get(), this->getvar<__half>(1, mem), this->getvar<__half>(2, mem), seed); + break; + + case Precision::BFloat16: + tensorfunc::normal(*mem->gettensor<__nv_bfloat16>(name).get(), this->getvar<__nv_bfloat16>(1, mem), this->getvar<__nv_bfloat16>(2, mem), seed); + break; + + case Precision::Int64: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), seed); + break; + + case Precision::Int32: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), seed); + break; + + case Precision::Int16: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), seed); + break; + + case Precision::Int8: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), seed); + break; + + default: + { + error = "unsupported dtype: " + precision_str(type); + return 1; + } + } + return 0; + } + }; } #endif diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 2208863c..defad8c6 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -97,6 +97,15 @@ namespace deepx::tf Param("seed", DataCategory::Var, Precision::Int32), }), vector())); + // normal author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("mean", DataCategory::Var, Precision::Any), + Param("std", DataCategory::Var, Precision::Any), + Param("seed", DataCategory::Var, Precision::Int32), + }), + vector())); } // io void register_util(TfFactory &opfactory) diff --git 
a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/init_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/init_miaobyte.hpp index 27c63e99..4afda582 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/init_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/init_miaobyte.hpp @@ -10,7 +10,8 @@ #include "deepx/tensorfunc/init.hpp" namespace deepx::tensorfunc -{ +{ + //constant template struct constantDispatcher { @@ -20,6 +21,7 @@ namespace deepx::tensorfunc } }; + //uniform template struct uniformDispatcher { @@ -47,6 +49,7 @@ namespace deepx::tensorfunc } }; + //arange template struct arangeDispatcher { @@ -59,7 +62,34 @@ namespace deepx::tensorfunc } } }; - + + //normal + template + struct normalDispatcher + { + static void normal(Tensor &tensor, const T mean, const T stddev, const unsigned int seed = 0) + { + std::normal_distribution dist(mean, stddev); + std::default_random_engine generator; + + // 设置随机数生成器种子 + if (seed == 0) + { + std::random_device rd; + generator.seed(rd()); + } + else + { + generator.seed(seed); + } + + // 单线程循环填充数据 + for (int i = 0; i < tensor.shape.size; ++i) + { + tensor.data[i] = static_cast(dist(generator)); + } + } + }; } #endif // DEEPX_OP_CPU_INIT_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp index e208b540..dd5cdc45 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp @@ -7,7 +7,7 @@ #include "stdutil/num.hpp" namespace deepx::tf { - + // constant template class Constant : public TF { @@ -33,7 +33,8 @@ namespace deepx::tf { string name = this->args[0].textvalue; auto tensor = mem->gettensor(name).get(); - if (tensor==nullptr) { + if (tensor == nullptr) + { error = "tensor not found: " + name; return 1; } @@ -95,6 +96,7 @@ namespace deepx::tf } }; + // arange template class Arange : public TF { @@ -168,6 +170,7 @@ namespace deepx::tf } }; + // uniform template class 
Uniform : public TF { @@ -241,6 +244,61 @@ namespace deepx::tf } }; + // normal + template + class Normal : public TF + { + public: + Normal(const vector &args, const vector &returns) + { + this->name = "normal"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "normal(T1,mean,stddev,seed)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + string name = this->args[0].textvalue; + auto tensor = mem->gettensor(name).get(); + auto type = tensor->shape.dtype; + switch (type) + { + case Precision::Float64: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + break; + case Precision::Float32: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + break; + case Precision::Int64: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + break; + case Precision::Int32: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + break; + case Precision::Int16: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + break; + case Precision::Int8: + tensorfunc::normal(*mem->gettensor(name).get(), this->getvar(1, mem), this->getvar(2, mem), this->getvar(3, mem)); + break; + default: + { + error = "unsupported dtype: " + precision_str(type); + return 1; + } + } + return 0; + } + }; } #endif diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 91e0573b..6c32a4d6 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -5,6 +5,7 @@ 'uniform':'miaobyte', 'constant':'miaobyte', 'arange':'miaobyte', + 
def normal_(t:Tensor, mean:float=0, stddev:float=1, seed:int=None)->Tensor:
    """In-place init: fill ``t`` with samples from N(mean, stddev**2).

    When ``seed`` is None, a 32-bit seed is derived from the wall clock and
    the pid so that separate runs (and concurrent processes started within
    the same millisecond) get different streams.  Returns ``t`` for chaining.
    """
    if seed is None:
        seed = int(time.time() * 1000) & 0xffffffff
        seed = (seed + os.getpid()) & 0xffffffff
    from .rtf_init import rtf_normal
    # BUG FIX: the result was previously dropped, so normal_() returned None
    # despite its ``-> Tensor`` annotation; rtf_normal returns ``t``.
    return rtf_normal(t, mean, stddev, seed, defaultauthor['normal'])

def normal(*shape, mean:float=0, stddev:float=1, seed:int=None, dtype:str='float32', name:str=None, author='miaobyte')->Tensor:
    """Allocate a tensor of ``shape`` and fill it with N(mean, stddev**2) samples.

    NOTE(review): ``author`` is accepted for signature parity with the other
    initializers but is currently ignored — dispatch uses
    defaultauthor['normal'] inside normal_(); confirm intended behavior.
    """
    s = parse_shape(shape)
    outtensor = newtensor(s, dtype=dtype, name=name)
    normal_(outtensor, mean, stddev, seed)
    return outtensor
stddev:float=1,seed:int=0,author='miaobyte')->Tensor: + args=[Param.tensor(t),Param.varnum(mean),Param.varnum(stddev),Param.varnum(seed)] + returns=[] + ir=DeepxIR("normal", args, returns,author) + send(ir) return t \ No newline at end of file diff --git a/front/py/deepx/tensor/init.py b/front/py/deepx/tensor/init.py index f50073ca..803ba866 100644 --- a/front/py/deepx/tensor/init.py +++ b/front/py/deepx/tensor/init.py @@ -3,24 +3,34 @@ @tensor_method def full_(self,value:Union[float,int]): - from deepx.nn.functional import constant as constant_func + from deepx.nn.functional import constant_ as constant_func constant_func(self,value=value) @tensor_method def zeros_(self): - from deepx.nn.functional import constant as constant_func + from deepx.nn.functional import constant_ as constant_func constant_func(self,value=0) @tensor_method def ones_(self): - from deepx.nn.functional import constant as constant_func + from deepx.nn.functional import constant_ as constant_func constant_func(self,value=1) @tensor_method -def uniform_(self,low=0, high=1,seed:int=0): - from deepx.nn.functional import uniform as uniform_func +def uniform_(self,low=0, high=1,seed:int=None): + from deepx.nn.functional import uniform_ as uniform_func uniform_func(self,low=low, high=high,seed=seed) +@tensor_method +def arange_(self,start=0,step=1): + from deepx.nn.functional import arange_ as arange_func + arange_func(self,start,step) + +@tensor_method +def normal_(self,mean=0, stddev=1,seed:int=None): + from deepx.nn.functional import normal_ as normal_func + normal_func(self,mean,stddev,seed) + @tensor_method def rand_(self): #todo @@ -30,12 +40,6 @@ def rand_(self): def randn_(self): #todo pass - -@tensor_method -def arange_(self,start=0,step=1,author='miaobyte'): - from deepx.nn.functional import arange_ as arange_func - arange_func(self,start,step,author) - @tensor_method def eye_(self,n,m=None): #todo diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index 
8843bc66..5b909d88 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -19,9 +19,7 @@ def __init__(self,shape:Union[tuple[int],list[int],Shape],dtype:str='float32',na tensorid+=1 # dtype self._dtype = dtype - - # format - self.autoformat() + # shape if isinstance(shape, (tuple, list)) and all(isinstance(i, int) for i in shape): @@ -127,20 +125,22 @@ def T(self) -> str: return self.transpose() # 打印 - def autoformat(self): - if self._dtype == 'float32' or self._dtype == 'float64' or self._dtype == 'float16' or self._dtype == 'bfloat16': - self._format = '%.4f' - elif self._dtype == 'int32' or self._dtype == 'int64' or self._dtype == 'int8' or self._dtype == 'int16': - self._format = '%d' - elif self._dtype == 'bool': - self._format = '%d' + @staticmethod + def autoformat(dtype): + if dtype == 'float32' or dtype == 'float64' or dtype == 'float16' or dtype == 'bfloat16': + return '%.4f' + elif dtype == 'int32' or dtype == 'int64' or dtype == 'int8' or dtype == 'int16': + return '%d' + elif dtype == 'bool': + return '%d' else: - self._format = '%s' - def set_format(self,format:str): - self._format = format - def print(self): + return '%s' + + def print(self,format:str=None): + if format is None: + format=self.autoformat(self.dtype) from deepx.nn.functional import printtensor - printtensor(self,format=self._format) + printtensor(self,format) def __repr__(self) -> str: return 'Tensor(shape={},dtype={},name={})'.format(self.shape,self.dtype,self.name) diff --git a/front/py/examples/2_ir/1_init_zeroones.py b/front/py/examples/2_ir/1_init_zeroones.py index d6f63c34..d6de4318 100644 --- a/front/py/examples/2_ir/1_init_zeroones.py +++ b/front/py/examples/2_ir/1_init_zeroones.py @@ -2,26 +2,42 @@ ############-------PyTorch-------################ import torch -torch_t1 = torch.zeros(3, 4, 5, dtype=torch.float32) -torch_t2 = torch.ones(3, 4, 5, dtype=torch.float32) -torch_t4 = torch.full((3, 4, 5), 0.5) -print(torch_t4) - -torch_t6 = 
torch.zeros(3, 4, 5, dtype=torch.float32) -torch.nn.init.kaiming_uniform_(torch_t6) -print(torch_t6) - - +# torch_t1 = torch.zeros(3, 4, 5, dtype=torch.float32) +# torch_t2 = torch.ones(3, 4, 5, dtype=torch.float32) +# torch_t4 = torch.full((3, 4, 5), 0.5) +# print(torch_t4) +# torch_t5=torch.nn.init.uniform_(torch.zeros(3,4,5),0,1) +# print(torch_t5) + + +# torch_t6 = torch.zeros(3, 4, 5, dtype=torch.float32) +# torch.nn.init.kaiming_uniform_(torch_t6) +# print(torch_t6) +# +torch_t7 = torch.zeros(3, 4, 5, dtype=torch.float32) +torch_t7.normal_(mean=0,std=0.02) +print(torch_t7) ############-------DEEPX-------################ -from deepx import zeros,ones,full,kaiming_uniform +import deepx print() -t1 = zeros([3,4,5],dtype='float32') -t2 = ones([3,4,5],dtype='float32') -t4=full([3,4,5],value=0.5) -print(t4) - -t6=kaiming_uniform(3,4,5,dtype='float32') -print(t6) +# t1 = deepx.zeros([3,4,5],dtype='float32') +# t2 = deepx.ones([3,4,5],dtype='float32') +# t4=deepx.full([3,4,5],value=0.5) +# t4.print() +# t5=deepx.uniform(3,4,5,low=0,high=1) +# t5.print() +# t6=deepx.kaiming_uniform(3,4,5,dtype='float32') +# t6.print() + +t7=deepx.zeros(3,4,5,dtype='float32') +t7.normal_(mean=0,stddev=0.02) +t7.print("%.6f") + +# t7.uniform_(low=0,high=1) +# t7.print("%.6f") +# +# t7.arange_(start=0,step=1) +# t7.print("%.0f") From e9c0411a63589c0c45e7b3c1829560b4738bc602 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sat, 19 Apr 2025 20:23:34 +0800 Subject: [PATCH 3/3] doc:tensor list --- doc/excuter/op-mem-cuda/list.md | 80 +++++++++---- doc/excuter/op-mem-ompsimd/list.md | 82 +++++++++---- excuter/cpp-common/src/deepx/tf/tf.hpp | 1 + excuter/cpp-common/src/deepx/tf/tffactory.cpp | 42 ++++--- excuter/op-mem-cuda/src/deepx/tf/arg.hpp | 21 +--- .../op-mem-cuda/src/deepx/tf/changeshape.hpp | 4 + .../src/deepx/tf/elementwise_basic.hpp | 109 +++--------------- .../src/deepx/tf/elementwise_compare.hpp | 11 ++ .../src/deepx/tf/elementwise_sin.hpp | 36 +----- 
.../src/deepx/tf/elementwise_sqrt.hpp | 60 ++-------- excuter/op-mem-cuda/src/deepx/tf/init.hpp | 15 +-- excuter/op-mem-cuda/src/deepx/tf/io.hpp | 11 +- excuter/op-mem-cuda/src/deepx/tf/matmul.hpp | 1 + excuter/op-mem-cuda/src/deepx/tf/reduce.hpp | 6 +- .../op-mem-cuda/src/deepx/tf/tensorlife.hpp | 3 + excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp | 24 +--- .../src/deepx/tf/changeshape.hpp | 4 + .../src/deepx/tf/elementwise.hpp | 30 +++++ excuter/op-mem-ompsimd/src/deepx/tf/init.hpp | 15 +-- excuter/op-mem-ompsimd/src/deepx/tf/io.hpp | 11 +- .../op-mem-ompsimd/src/deepx/tf/matmul.hpp | 12 +- .../op-mem-ompsimd/src/deepx/tf/reduce.hpp | 4 + .../src/deepx/tf/tensorlife.hpp | 13 +-- front/py/examples/2_ir/1_init_zeroones.py | 46 ++++---- 24 files changed, 292 insertions(+), 349 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 8704844c..34de625c 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -2,47 +2,61 @@ 本页面由 `excuter/op-mem-cuda 生成,请勿手动修改 +### arg + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | + +### io + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | +| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | + +### tensorlife + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() | 
+| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | +| deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | + +### init + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| normal | miaobyte | normal(tensor t, var mean, var stddev, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var stddev, var seed)->() | +| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | +| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | + +### elementwise + | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| -| reducemax | miaobyte | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | -| broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | -| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | 
T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | -| matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor result) | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | | greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | | equalscalar | miaobyte | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | -| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | | maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| normal | miaobyte | normal(tensor t, var mean, var stddev, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var stddev, var seed)->() | -| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | | log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | | sin | miaobyte | sin(tensor A)->(tensor C) | T3=sin(T1) | sin(tensor 
A)->(tensor C) | | tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor A)->(tensor C) | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() | | greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | | less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | -| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() | | powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | -| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor 
C) | -| sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | -| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | -| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | | equal | miaobyte | equal(tensor A, tensor B, var epsilon)->(tensor mask) | mask=compare(T1, T2) | equal(tensor A, tensor B, var epsilon)->(tensor mask) | @@ -54,5 +68,29 @@ | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | | lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | | cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | + +### matmul + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | + +### changeshape + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | +| concat | 
miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | +| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | + +### reduce + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| reducemax | miaobyte | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | +| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | +| sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | +| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | + diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 65ffb758..b396a357 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -2,47 +2,59 @@ 本页面由 `excuter/op-mem-ompsimd 生成,请勿手动修改 +### arg + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | + +### io + +| 
Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | +| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | + +### tensorlife + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() | +| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | newtensor(var shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | +| deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | + +### init + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| normal | miaobyte | normal(tensor t, var mean, var std, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var std, var seed)->() | +| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | +| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | + +### elementwise + | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| -| reducemax | miaobyte | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | -| broadcastTo | 
miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | -| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | -| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | | switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | | greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | | equalscalar | miaobyte | equalscalar(tensor A, var scalar)->(tensor mask) | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | -| normal | miaobyte | normal(tensor t, var mean, var std, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var std, var seed)->() | -| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | | addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | | log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, 
var step)->() | | divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() | -| prod | miaobyte | prod(tensor A, vector axis, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | | greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | | lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | | less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | -| constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | | powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | -| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] 
| vecset(vector value)->(vector name) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | | sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | -| sum | miaobyte | sum(tensor A, vector axis, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | -| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | -| reducemin | miaobyte | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | | equal | miaobyte | equal(tensor A, tensor B)->(tensor mask) | mask=equal(T1,T2) | equal(tensor A, tensor B)->(tensor mask) | @@ -54,3 +66,29 @@ | maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | + +### matmul + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| 
matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | + +### changeshape + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | +| concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | +| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | + +### reduce + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| reducemax | miaobyte | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | +| prod | miaobyte | prod(tensor A, vector axis, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | +| sum | miaobyte | sum(tensor A, vector axis, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | +| reducemin | miaobyte | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | + diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp index 430dc4c5..e123f10c 100644 --- a/excuter/cpp-common/src/deepx/tf/tf.hpp +++ 
b/excuter/cpp-common/src/deepx/tf/tf.hpp @@ -39,6 +39,7 @@ namespace deepx::tf public: string name; string author; + string tftype; vector args; vector returns; // diff --git a/excuter/cpp-common/src/deepx/tf/tffactory.cpp b/excuter/cpp-common/src/deepx/tf/tffactory.cpp index 5d80ef14..1936acfc 100644 --- a/excuter/cpp-common/src/deepx/tf/tffactory.cpp +++ b/excuter/cpp-common/src/deepx/tf/tffactory.cpp @@ -78,24 +78,36 @@ namespace deepx::tf std::stringstream ss; ss << "## " << excuter_name << " 支持算子列表 \n\n"; ss << "本页面由 `excuter/" << excuter_name << " 生成,请勿手动修改 \n\n"; - ss << "| Operation | Author | Func Def | Math Formula | IR Instruction |\n"; - ss << "|-----------|--------|------------|--------------|----------------|\n"; - - // 输出每个操作及其信息 - for (const auto &[name, tf_family] : tf_families) - { - for (const auto &[author, tf_author] : tf_family->tf_authors) - { - for (const auto &tf : tf_author->tfs) - { - ss << "| " << name << " | "; - ss << (author.empty() ? " none " : author) << " | "; - ss << tf->to_string(false, true) << " | "; - ss << tf->math_formula() << " | "; - ss << tf->to_string(false, true) << " |\n"; + + // 首先按tftype分组 + unordered_map>> tf_by_type; + + // 收集所有TF并按tftype分组 + for (const auto &[name, tf_family] : tf_families) { + for (const auto &[author, tf_author] : tf_family->tf_authors) { + for (const auto &tf : tf_author->tfs) { + tf_by_type[tf->tftype].push_back(tf); } } } + + // 为每个tftype生成一个表格 + for (const auto &[tftype, tfs] : tf_by_type) { + ss << "### " << tftype << "\n\n"; + ss << "| Operation | Author | Func Def | Math Formula | IR Instruction |\n"; + ss << "|-----------|--------|------------|--------------|----------------|\n"; + + for (const auto &tf : tfs) { + ss << "| " << tf->name << " | "; + ss << (tf->author.empty() ? 
" none " : tf->author) << " | "; + ss << tf->to_string(false, true) << " | "; + ss << tf->math_formula() << " | "; + ss << tf->to_string(false, true) << " |\n"; + } + + ss << "\n"; + } + return ss.str(); } } \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tf/arg.hpp b/excuter/op-mem-cuda/src/deepx/tf/arg.hpp index b4221840..5c8fa93b 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/arg.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/arg.hpp @@ -15,19 +15,11 @@ namespace deepx::tf { this->name = "argset"; this->author = ""; + this->tftype = "arg"; this->args = args; this->returns = returns; } - ArgSet(string text) - { - this->parse(text); - if (this->name != "argset") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } - string math_formula() const override { return "var argname = argvalue"; @@ -85,18 +77,11 @@ namespace deepx::tf { this->name = "vecset"; this->author = ""; + this->tftype = "arg"; this->args = args; this->returns = returns; } - VecSet(string text) - { - this->parse(text); - if (this->name != "vecset") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } - + string math_formula() const override { return "shape = [3 4 5]"; diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp index a45f5ba8..99186de5 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp @@ -19,6 +19,7 @@ namespace deepx::tf { this->name = "reshape"; this->author = Author::name(); + this->tftype = "changeshape"; this->args = args; this->returns = returns; } @@ -79,6 +80,7 @@ namespace deepx::tf { this->name = "transpose"; this->author = Author::name(); + this->tftype = "changeshape"; this->args = args; this->returns = returns; } @@ -146,6 +148,7 @@ namespace deepx::tf { this->name = "concat"; this->author = Author::name(); + this->tftype = "changeshape"; this->args = args; this->returns = returns; } @@ -279,6 +282,7 
@@ namespace deepx::tf { this->name = "broadcastTo"; this->author = Author::name(); + this->tftype = "changeshape"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp index 8611a227..d18eeb88 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp @@ -16,20 +16,12 @@ namespace deepx::tf Add(const vector &args, const vector &returns) { this->name = "add"; - this->author = Author::name(); + this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - Add(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "add") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } string math_formula() const override { return "T3=T1+T2"; @@ -94,19 +86,11 @@ namespace deepx::tf { this->name = "addscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - AddScalar(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "addscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=T1+scalar"; @@ -170,19 +154,11 @@ namespace deepx::tf { this->name = "sub"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Sub(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "sub") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=T1-T2"; @@ -247,19 +223,11 @@ namespace deepx::tf { this->name = "subscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - SubScalar(string text) - { - this->parse(text); - 
this->author = Author::name(); - if (this->name != "subscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=T1-scalar"; @@ -323,19 +291,11 @@ namespace deepx::tf { this->name = "mul"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Mul(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "mul") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=T1*T2"; @@ -400,19 +360,11 @@ namespace deepx::tf { this->name = "mulscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - MulScalar(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "mulscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=T1*scalar"; @@ -476,19 +428,11 @@ namespace deepx::tf { this->name = "div"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Div(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "div") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=T1/T2"; @@ -553,19 +497,11 @@ namespace deepx::tf { this->name = "divscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - DivScalar(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "divscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=scalar/T1"; @@ -629,19 +565,11 @@ namespace deepx::tf { this->name = "rdivscalar"; this->author = Author::name(); + this->tftype 
= "elementwise"; this->args = args; this->returns = returns; } - - RDivScalar(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "rdivscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=scalar/T1"; @@ -706,6 +634,7 @@ namespace deepx::tf { this->name = "invert"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp index 694ad3db..0ee58de8 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp @@ -16,6 +16,7 @@ namespace deepx::tf { this->name = "max"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -80,6 +81,7 @@ namespace deepx::tf { this->name = "maxscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -144,6 +146,7 @@ namespace deepx::tf { this->name = "min"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -209,6 +212,7 @@ namespace deepx::tf { this->name = "minscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -273,6 +277,7 @@ namespace deepx::tf { this->name = "equal"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -339,6 +344,7 @@ namespace deepx::tf { this->name = "equalscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -405,6 +411,7 @@ namespace deepx::tf { this->name = "less"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ 
-471,6 +478,7 @@ namespace deepx::tf { this->name = "lessscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -536,6 +544,7 @@ namespace deepx::tf { this->name = "greater"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -602,6 +611,7 @@ namespace deepx::tf { this->name = "greaterscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -667,6 +677,7 @@ namespace deepx::tf { this->name = "switch"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp index bea0a9f4..d5eae4e2 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp @@ -16,19 +16,11 @@ namespace deepx::tf { this->name = "sin"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Sin(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "sin") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=sin(T1)"; @@ -77,19 +69,11 @@ namespace deepx::tf { this->name = "cos"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Cos(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "cos") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=cos(T1)"; @@ -140,19 +124,11 @@ namespace deepx::tf { this->name = "tan"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Tan(string text) - { - 
this->parse(text); - this->author = Author::name(); - if (this->name != "tan") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=tan(T1)"; diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp index ae417bfe..d13b22f6 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp @@ -16,19 +16,11 @@ namespace deepx::tf { this->name = "pow"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Pow(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "pow") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=pow(T1, T2)"; @@ -79,19 +71,11 @@ namespace deepx::tf { this->name = "powscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - PowScalar(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "powscalar") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=pow(T1, scalar)"; @@ -140,6 +124,7 @@ namespace deepx::tf { this->name = "rpowscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -190,19 +175,12 @@ namespace deepx::tf { this->name = "sqrt"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - Sqrt(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "sqrt") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=sqrt(T1)"; @@ -255,19 +233,11 @@ namespace deepx::tf { this->name = "log"; 
this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Log(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "log") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=log(T1)"; @@ -320,19 +290,11 @@ namespace deepx::tf { this->name = "exp"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } - - Exp(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "exp") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=exp(T1)"; diff --git a/excuter/op-mem-cuda/src/deepx/tf/init.hpp b/excuter/op-mem-cuda/src/deepx/tf/init.hpp index bbdb883e..480bf18c 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/init.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/init.hpp @@ -19,19 +19,11 @@ namespace deepx::tf { this->name = "constant"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } - Constant(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "constant") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } - + int run(shared_ptr mem, string &error) override { string name = this->args[0].textvalue; @@ -107,6 +99,7 @@ namespace deepx::tf { this->name = "arange"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } @@ -191,6 +184,7 @@ namespace deepx::tf { this->name = "uniform"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } @@ -276,6 +270,7 @@ namespace deepx::tf { this->name = "normal"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-cuda/src/deepx/tf/io.hpp 
b/excuter/op-mem-cuda/src/deepx/tf/io.hpp index 6118471a..14315a85 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/io.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/io.hpp @@ -16,18 +16,11 @@ namespace deepx::tf { this->name = "print"; this->author = Author::name(); + this->tftype = "io"; this->args = args; this->returns = returns; } - Print(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "print") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + int run(shared_ptr mem, string &error) override { string name = this->args[0].textvalue; diff --git a/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp b/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp index 9a4c5bc6..a61b5c04 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp @@ -19,6 +19,7 @@ namespace deepx::tf { this->name = "matmul"; this->author = Author::name(); + this->tftype = "matmul"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp index 4a1643ea..a23319d7 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp @@ -17,6 +17,7 @@ namespace deepx::tf { this->name = "sum"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; this->returns = returns; } @@ -82,6 +83,7 @@ namespace deepx::tf { this->name = "prod"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; this->returns = returns; } @@ -140,6 +142,7 @@ namespace deepx::tf { this->name = "reducemax"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; this->returns = returns; } @@ -190,7 +193,7 @@ namespace deepx::tf } }; - template + template class ReduceMin : public TF { public: @@ -198,6 +201,7 @@ namespace deepx::tf { this->name = "reducemin"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; 
this->returns = returns; } diff --git a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp index 8e303037..43041188 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp @@ -17,6 +17,7 @@ namespace deepx::tf NewTensor(vector args, vector returns) { this->name = "newtensor"; + this->tftype = "tensorlife"; this->args = args; this->returns = returns; } @@ -138,6 +139,7 @@ namespace deepx::tf CopyTensor(vector args, vector returns) { this->name = "copytensor"; + this->tftype = "tensorlife"; this->args = args; this->returns = returns; } @@ -226,6 +228,7 @@ namespace deepx::tf DelTensor(vector args, vector returns) { this->name = "deltensor"; + this->tftype = "tensorlife"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp index 2c3f4f5e..13733fa1 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp @@ -16,19 +16,11 @@ namespace deepx::tf ArgSet(vector args, vector returns) { this->name = "argset"; - this->author = ""; + this->tftype = "arg"; this->args = args; this->returns = returns; } - ArgSet(string text, bool call = false) - { - this->parse(text); - if (this->name != "argset") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } - + string math_formula() const override { return "var argname = argvalue"; @@ -85,19 +77,11 @@ namespace deepx::tf VecSet(vector args, vector returns) { this->name = "vecset"; - this->author = ""; + this->tftype = "arg"; this->args = args; this->returns = returns; } - VecSet(string text) - { - this->parse(text); - if (this->name != "vecset") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } - + string math_formula() const override { return "shape = [3 4 5]"; diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp 
b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 784abfc0..e9c534b7 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -21,6 +21,7 @@ namespace deepx::tf this->author = Author::name(); this->args = args; this->returns = returns; + this->tftype = "changeshape"; } string math_formula() const override @@ -84,6 +85,7 @@ namespace deepx::tf { this->name = "transpose"; this->author = Author::name(); + this->tftype = "changeshape"; this->args = args; this->returns = returns; } @@ -149,6 +151,7 @@ namespace deepx::tf { this->name = "concat"; this->author = Author::name(); + this->tftype = "changeshape"; this->args = args; this->returns = returns; } @@ -260,6 +263,7 @@ namespace deepx::tf { this->name = "broadcastTo"; this->author = Author::name(); + this->tftype = "changeshape"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 53f0b504..e1914688 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -19,6 +19,7 @@ namespace deepx::tf { this->name = "add"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -80,6 +81,7 @@ namespace deepx::tf { this->name = "addscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -139,6 +141,7 @@ namespace deepx::tf { this->name = "sub"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -200,6 +203,7 @@ namespace deepx::tf { this->name = "subscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -260,6 +264,7 @@ namespace deepx::tf { this->name = "mul"; this->author = Author::name(); + this->tftype = "elementwise"; 
this->args = args; this->returns = returns; } @@ -321,6 +326,7 @@ namespace deepx::tf { this->name = "mulscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -381,6 +387,7 @@ namespace deepx::tf { this->name = "div"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -442,6 +449,7 @@ namespace deepx::tf { this->name = "divscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -498,6 +506,7 @@ namespace deepx::tf { this->name = "rdivscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -555,6 +564,7 @@ namespace deepx::tf { this->name = "invert"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -609,6 +619,7 @@ namespace deepx::tf { this->name = "sqrt"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -653,6 +664,7 @@ namespace deepx::tf { this->name = "pow"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -698,6 +710,7 @@ namespace deepx::tf { this->name = "powscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -743,6 +756,7 @@ namespace deepx::tf { this->name = "rpowscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -787,6 +801,7 @@ namespace deepx::tf { this->name = "log"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -831,6 +846,7 @@ namespace deepx::tf { this->name = "exp"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -875,6 +891,7 @@ namespace 
deepx::tf { this->name = "sin"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -919,6 +936,7 @@ namespace deepx::tf { this->name = "cos"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -963,6 +981,7 @@ namespace deepx::tf { this->name = "tan"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1007,6 +1026,7 @@ namespace deepx::tf { this->name = "max"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1064,6 +1084,7 @@ namespace deepx::tf { this->name = "maxscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1120,6 +1141,7 @@ namespace deepx::tf { this->name = "min"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1177,6 +1199,7 @@ namespace deepx::tf { this->name = "minscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1235,6 +1258,7 @@ namespace deepx::tf { this->name = "equal"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1294,6 +1318,7 @@ namespace deepx::tf { this->name = "equalscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1352,6 +1377,7 @@ namespace deepx::tf { this->name = "less"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1410,6 +1436,7 @@ namespace deepx::tf { this->name = "lessscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1467,6 +1494,7 @@ namespace deepx::tf { this->name = "greater"; this->author = Author::name(); + 
this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1525,6 +1553,7 @@ namespace deepx::tf { this->name = "greaterscalar"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } @@ -1582,6 +1611,7 @@ namespace deepx::tf { this->name = "switch"; this->author = Author::name(); + this->tftype = "elementwise"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp index dd5cdc45..c28b569e 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp @@ -16,19 +16,11 @@ namespace deepx::tf { this->name = "constant"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } - Constant(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "constant") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } - + int run(shared_ptr mem, string &error) override { string name = this->args[0].textvalue; @@ -105,6 +97,7 @@ namespace deepx::tf { this->name = "arange"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } @@ -179,6 +172,7 @@ namespace deepx::tf { this->name = "uniform"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } @@ -253,6 +247,7 @@ namespace deepx::tf { this->name = "normal"; this->author = Author::name(); + this->tftype = "init"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp index ba180f3d..0fd86a18 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp @@ -15,19 +15,12 @@ namespace deepx::tf Print(vector args, vector returns) { this->name = "print"; + this->tftype = "io"; this->author = 
Author::name(); this->args = args; this->returns = returns; } - Print(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "print") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + int run(shared_ptr mem, string &error) override { string name = this->args[0].textvalue; diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp index 77d61208..89804a18 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp @@ -17,19 +17,11 @@ namespace deepx::tf { this->name = "matmul"; this->author = Author::name(); + this->tftype = "matmul"; this->args = args; this->returns = returns; } - - MatMul(string text) - { - this->parse(text); - this->author = Author::name(); - if (this->name != "matmul") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + string math_formula() const override { return "T3=T1 @ T2"; diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp index 4e2bd1b9..f8b43e53 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp @@ -17,6 +17,7 @@ namespace deepx::tf { this->name = "sum"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; this->returns = returns; } @@ -76,6 +77,7 @@ namespace deepx::tf { this->name = "prod"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; this->returns = returns; } @@ -134,6 +136,7 @@ namespace deepx::tf { this->name = "reducemax"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; this->returns = returns; } @@ -192,6 +195,7 @@ namespace deepx::tf { this->name = "reducemin"; this->author = Author::name(); + this->tftype = "reduce"; this->args = args; this->returns = returns; } diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp 
b/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp index 92a45098..d703355c 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp @@ -15,18 +15,11 @@ namespace deepx::tf NewTensor(vector args, vector returns) { this->name = "newtensor"; + this->tftype = "tensorlife"; this->args = args; this->returns = returns; } - - NewTensor(string text, bool call = false) - { - this->parse(text); - if (this->name != "newtensor") - { - throw std::runtime_error("Invalid name: " + this->name); - } - } + int run(shared_ptr mem, string &error) override { string name = this->returns[0].textvalue; @@ -139,6 +132,7 @@ namespace deepx::tf this->name = "copytensor"; this->args = args; this->returns = returns; + this->tftype = "tensorlife"; } int run(shared_ptr mem, string &error) override @@ -218,6 +212,7 @@ namespace deepx::tf this->name = "deltensor"; this->args = args; this->returns = returns; + this->tftype = "tensorlife"; } int run(shared_ptr mem, string &error) override { diff --git a/front/py/examples/2_ir/1_init_zeroones.py b/front/py/examples/2_ir/1_init_zeroones.py index d6de4318..ca788569 100644 --- a/front/py/examples/2_ir/1_init_zeroones.py +++ b/front/py/examples/2_ir/1_init_zeroones.py @@ -2,18 +2,18 @@ ############-------PyTorch-------################ import torch -# torch_t1 = torch.zeros(3, 4, 5, dtype=torch.float32) -# torch_t2 = torch.ones(3, 4, 5, dtype=torch.float32) -# torch_t4 = torch.full((3, 4, 5), 0.5) -# print(torch_t4) -# torch_t5=torch.nn.init.uniform_(torch.zeros(3,4,5),0,1) -# print(torch_t5) - - -# torch_t6 = torch.zeros(3, 4, 5, dtype=torch.float32) -# torch.nn.init.kaiming_uniform_(torch_t6) -# print(torch_t6) -# +torch_t1 = torch.zeros(3, 4, 5, dtype=torch.float32) +torch_t2 = torch.ones(3, 4, 5, dtype=torch.float32) +torch_t4 = torch.full((3, 4, 5), 0.5) +print(torch_t4) +torch_t5=torch.nn.init.uniform_(torch.zeros(3,4,5),0,1) +print(torch_t5) + + +torch_t6 = torch.zeros(3, 4, 5, 
dtype=torch.float32) +torch.nn.init.kaiming_uniform_(torch_t6) +print(torch_t6) + torch_t7 = torch.zeros(3, 4, 5, dtype=torch.float32) torch_t7.normal_(mean=0,std=0.02) print(torch_t7) @@ -23,21 +23,15 @@ import deepx print() -# t1 = deepx.zeros([3,4,5],dtype='float32') -# t2 = deepx.ones([3,4,5],dtype='float32') -# t4=deepx.full([3,4,5],value=0.5) -# t4.print() -# t5=deepx.uniform(3,4,5,low=0,high=1) -# t5.print() -# t6=deepx.kaiming_uniform(3,4,5,dtype='float32') -# t6.print() +t1 = deepx.zeros([3,4,5],dtype='float32') +t2 = deepx.ones([3,4,5],dtype='float32') +t4=deepx.full([3,4,5],value=0.5) +t4.print() +t5=deepx.uniform(3,4,5,low=0,high=1) +t5.print() +t6=deepx.kaiming_uniform(3,4,5,dtype='float32') +t6.print() t7=deepx.zeros(3,4,5,dtype='float32') t7.normal_(mean=0,stddev=0.02) t7.print("%.6f") - -# t7.uniform_(low=0,high=1) -# t7.print("%.6f") -# -# t7.arange_(start=0,step=1) -# t7.print("%.0f")