From 75063c12cab7273e8539783a69812d8d5ebb26c5 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Thu, 24 Apr 2025 00:09:34 +0800
Subject: [PATCH 1/7] front/py    deepxutil

---
 front/py/deepx/nn/functional/__init__.py      |   2 +-
 front/py/deepx/nn/functional/activite.py      |   1 -
 front/py/deepx/nn/functional/elementwise.py   |   3 +-
 front/py/deepx/nn/functional/leaffunc_init.py |   1 -
 front/py/deepx/nn/functional/leaffunc_io.py   |  27 +---
 front/py/deepx/nn/functional/normalization.py |   4 -
 front/py/deepx/nn/functional/reduce.py        |  13 +-
 front/py/deepx/nn/functional/rtf_matmul.py    |   1 -
 front/py/deepx/nn/functional/rtf_reduce.py    |   3 -
 front/py/deepx/nn/modules/module.py           |   4 +-
 front/py/deepx/nn/modules/sparse.py           |   2 +-
 front/py/deepx/tensor/reduce.py               |  25 +++-
 front/py/deepx/tensor/tensor.py               |   6 +-
 .../models/llama/modeling_llama.py            | 136 ++++++++----------
 front/py/deepxutil/numpy/__init__.py          |   4 +
 front/py/deepxutil/numpy/io.py                |  17 +++
 front/py/deepxutil/torch/__init__.py          |   4 +
 front/py/deepxutil/torch/io.py                |  10 ++
 front/py/examples/3_module/1_embedding.py     |   3 +-
 .../4_transformer/llama/1_llamarmsnorm.py     |  25 ----
 .../llama/1_llamarmsnorm_torch.py             |  35 ++++-
 21 files changed, 161 insertions(+), 165 deletions(-)
 create mode 100644 front/py/deepxutil/numpy/__init__.py
 create mode 100644 front/py/deepxutil/numpy/io.py
 create mode 100644 front/py/deepxutil/torch/__init__.py
 create mode 100644 front/py/deepxutil/torch/io.py
 delete mode 100644 front/py/examples/4_transformer/llama/1_llamarmsnorm.py

diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
index ff85363e..93825af3 100644
--- a/front/py/deepx/nn/functional/__init__.py
+++ b/front/py/deepx/nn/functional/__init__.py
@@ -19,7 +19,7 @@
 
     #leaffunc
     "newtensor","printtensor","load", #life
-    "printtensor","save","save_npy","save_torch",#io
+    "printtensor","save",#io
     "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out",
     "add","sub","mul","div","sqrt","pow","exp","log",
     "matmul",
diff --git a/front/py/deepx/nn/functional/activite.py b/front/py/deepx/nn/functional/activite.py
index 97be29f3..8510a052 100644
--- a/front/py/deepx/nn/functional/activite.py
+++ b/front/py/deepx/nn/functional/activite.py
@@ -1,4 +1,3 @@
-from typing import Union
 from deepx.tensor import Tensor
 from deepx.nn.functional import newtensor
 
diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py
index 28e5b199..d6a4e6ea 100644
--- a/front/py/deepx/nn/functional/elementwise.py
+++ b/front/py/deepx/nn/functional/elementwise.py
@@ -1,5 +1,4 @@
-from typing import Union
-from deepx.tensor import Tensor,Number
+from deepx.tensor import Tensor
 from deepx.nn.functional import newtensor
 
 def rsqrt(input:Tensor)->Tensor:
diff --git a/front/py/deepx/nn/functional/leaffunc_init.py b/front/py/deepx/nn/functional/leaffunc_init.py
index d6188592..48160e3d 100644
--- a/front/py/deepx/nn/functional/leaffunc_init.py
+++ b/front/py/deepx/nn/functional/leaffunc_init.py
@@ -1,4 +1,3 @@
-from typing import Union
 import math
 import time
 import os
diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py
index d9551c6c..9797eea9 100644
--- a/front/py/deepx/nn/functional/leaffunc_io.py
+++ b/front/py/deepx/nn/functional/leaffunc_io.py
@@ -1,4 +1,4 @@
-from deepx.tensor import Tensor,Shape,saveShape
+from deepx.tensor import Tensor
 from .authormap import defaultauthor
 
 def printtensor(t:Tensor,format=''):
@@ -10,28 +10,3 @@ def save(t:Tensor,path:str):
     from .rtf_io import rtf_save
     rtf_save(t,path)
     return t
-
-def save_npy(t,path:str):
-    r'''
-    保存numpy.ndarray为deepx.tensor格式
-    '''
-    from numpy import ascontiguousarray
-    shape=Shape(t.shape)
-    shape._dtype=str(t.dtype)
-    saveShape(shape,path+".shape")
-
-    array = ascontiguousarray(t)
-    array.tofile(path+'.data')
-    return t
-
-def save_torch(t,path:str):
-    r'''
-    保存torch.Tensor为deepx.tensor格式
-    '''
-    from torch import Tensor as torch_Tensor
-    if isinstance(t,torch_Tensor):
-        t=t.detach().cpu().numpy()
-    else:
-        raise ValueError("t must be a torch.Tensor")
-    save_npy(t,path)
-    
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/normalization.py b/front/py/deepx/nn/functional/normalization.py
index 87eacbbe..8a9d6030 100644
--- a/front/py/deepx/nn/functional/normalization.py
+++ b/front/py/deepx/nn/functional/normalization.py
@@ -1,8 +1,4 @@
-
-from typing import Union
 from deepx import Tensor
-from deepx.nn.functional import sub
-from deepx.nn.functional import newtensor
 
 # 数学公式：softmax(x_i) = e^{x_i} / sum(e^{x_j})
 def softmax(t: Tensor,dim:int=-1)->Tensor:
diff --git a/front/py/deepx/nn/functional/reduce.py b/front/py/deepx/nn/functional/reduce.py
index f2731fd4..474e98b1 100644
--- a/front/py/deepx/nn/functional/reduce.py
+++ b/front/py/deepx/nn/functional/reduce.py
@@ -1,18 +1,19 @@
 from deepx.tensor import Tensor,Shape
-from typing import Optional,Union
 from .leaffunc_reduce import sum
 from .leaffunc_life import newtensor
 #mean
  
 def mean(a:Tensor,dim:tuple[int,...]=None,keepdim:bool=False)->Tensor:
-    # 如果dim为None,则对所有维度求平均
+    assert isinstance(a,Tensor)
     if dim is None:
-        dim = list(range(a.ndim))
-    dim=list(dim)
+        dim = list(range(a.ndim))  # dim=None: average over every axis
+    else:
+        dim=list(dim)
+        for pos,d in enumerate(dim):  # rewrite by position; indexing dim with the axis value hit the wrong slot
+            if d < 0:
+                dim[pos] = d + a.dim()  # negative axis counts from the end
     total = 1
     for i in dim:
-        if i < 0:
-            dim[i] = i + a.dim()
         total *= a.shape[i]
     reduceshape=Shape.reduceshape(a.shape,dim,keepdim)
     out=newtensor(reduceshape,dtype=a.dtype)
diff --git a/front/py/deepx/nn/functional/rtf_matmul.py b/front/py/deepx/nn/functional/rtf_matmul.py
index bfe23cee..93f4d5c8 100644
--- a/front/py/deepx/nn/functional/rtf_matmul.py
+++ b/front/py/deepx/nn/functional/rtf_matmul.py
@@ -1,7 +1,6 @@
 from deepx.tensor import Tensor
 from deepx.nn import DeepxIR,Param
 from deepx.scheduler import send
-from .rtf import A_B_op_C
 
 def rtf_matmul(a:Tensor,b:Tensor,out: Tensor ,author='cublas',bench:int=None):
     args=[Param.tensor(a),Param.tensor(b)]
diff --git a/front/py/deepx/nn/functional/rtf_reduce.py b/front/py/deepx/nn/functional/rtf_reduce.py
index 61da95b1..f089bfab 100644
--- a/front/py/deepx/nn/functional/rtf_reduce.py
+++ b/front/py/deepx/nn/functional/rtf_reduce.py
@@ -1,9 +1,6 @@
 from deepx.tensor import Tensor
-from deepx.nn.deepxir import DeepxIR,Param
-from deepx.scheduler import send
 from .rtf import A_b1_b2_op_C
 
-
 def rtf_sum(a:Tensor,dim:tuple[int],keepdim:bool,out: Tensor, author:str='miaobyte')->Tensor:
     A_b1_b2_op_C("sum",a,dim,keepdim,out,author)
  
diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py
index 003223ad..7e55c5f3 100644
--- a/front/py/deepx/nn/modules/module.py
+++ b/front/py/deepx/nn/modules/module.py
@@ -17,7 +17,6 @@ def _generate_default_name(self) -> str:
             self.__class__._instance_counter = 0
         count = self.__class__._instance_counter
         self.__class__._instance_counter += 1
-        return count
         return f"{base_name}_{count}"
  
     @property
@@ -40,9 +39,10 @@ def register_parameter(self, name: str, param: Optional[Tensor]) -> None:
             self._parameters.pop(name, None)
         else:
             self._parameters[name] = param
-            param.name=self.full_name + '.' + name
+            param._name=self.full_name + '.' + name
             from deepx.nn.functional.leaffunc_life import rnewtensor
             rnewtensor(param)
+    
 
     def parameters(self, recurse: bool = True) -> Iterator[Tensor]:
         for name, param in self.named_parameters(recurse=recurse):
diff --git a/front/py/deepx/nn/modules/sparse.py b/front/py/deepx/nn/modules/sparse.py
index ca00f28c..1cdae141 100644
--- a/front/py/deepx/nn/modules/sparse.py
+++ b/front/py/deepx/nn/modules/sparse.py
@@ -128,7 +128,7 @@ def __init__(self,
                 embedding_dim,
             ], "权重形状与num_embeddings和embedding_dim不匹配"
             self.weight = weight
-        
+            self.weight.rtf_rename('embedding_0.weight')
         # self.sparse = sparse
         
         # if padding_idx is not None:
diff --git a/front/py/deepx/tensor/reduce.py b/front/py/deepx/tensor/reduce.py
index cdba12f8..654c8e4b 100644
--- a/front/py/deepx/tensor/reduce.py
+++ b/front/py/deepx/tensor/reduce.py
@@ -4,28 +4,43 @@
 from deepx.tensor import Tensor,tensor_method
 
 @tensor_method
-def reducemax(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
+def reducemax(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(dim,tuple)
+    for i in dim:
+        assert isinstance(i,int)
     from deepx.nn.functional import reducemax as reduce_max_func
     return reduce_max_func(self,dim,keepdim,out)
 
 @tensor_method
-def reducemin(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
+def reducemin(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(dim,tuple)
+    for i in dim:
+        assert isinstance(i,int)
     from deepx.nn.functional import reducemin as reduce_min_func
     return reduce_min_func(self,dim,keepdim,out)
 
 
 @tensor_method
-def sum(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
+def sum(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(dim,tuple)
+    for i in dim:
+        assert isinstance(i,int)
     from deepx.nn.functional import  sum as sum_func
     return  sum_func(self,dim,keepdim,out)
 
 @tensor_method
-def prod(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
+def prod(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(dim,tuple)
+    for i in dim:
+        assert isinstance(i,int)
     from deepx.nn.functional import prod as prod_func
     return prod_func(self,dim,keepdim,out)   
 
 @tensor_method
-def mean(self,dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
+def mean(self,dim:tuple[int,...],keepdim:bool=False)->Tensor:
+    assert isinstance(dim,tuple)
+    for i in dim:
+        assert isinstance(i,int)
     from deepx.nn.functional import mean as mean_func
     return mean_func(self,dim,keepdim)
  
\ No newline at end of file
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index efe2255d..661bd4de 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -22,7 +22,7 @@ def __init__(self,shape:tuple[int,...],dtype:str='float32',name:str=None):
                 self.__class__._instance_counter = 0
             count = self.__class__._instance_counter
             self.__class__._instance_counter += 1
-            self._name = count
+            self._name = str(count)
         # dtype
         self._dtype = dtype
 
@@ -49,8 +49,8 @@ def clone(self,name:str=None):
     @property
     def name(self):
         return self._name
-    @name.setter
-    def name(self,name:str):
+    
+    def rtf_rename(self,name:str):
         assert isinstance(name,str) and name != ''
         assert self.name is not None and self.name != ''
 
diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/transformer/models/llama/modeling_llama.py
index c60f34f5..c8d9c403 100644
--- a/front/py/deepx/transformer/models/llama/modeling_llama.py
+++ b/front/py/deepx/transformer/models/llama/modeling_llama.py
@@ -10,83 +10,69 @@ def __init__(self, hidden_size, eps=1e-6):
         LlamaRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
-        self.weight =  ones(hidden_size)
+        self.weight=ones(hidden_size)
+        self.register_parameter("weight",self.weight)
         self.variance_epsilon = eps
- 
-    # 和官方实现相比，尽可能inplace化
     def forward(self, hidden_states:Tensor):
-        input_clone = hidden_states.clone()
-        input_clone.pow_(2)
-        variance = input_clone.mean([-1], keepdim=True)
-
-        variance.add_(self.variance_epsilon)
-        variance = rsqrt(variance)
-
-        hidden_states.mul_(variance)
-        hidden_states.mul_(self.weight)
-        return hidden_states
-
+        variance =  hidden_states.pow(2).mean((-1,), keepdim=True)
+        hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states
+ 
     def extra_repr(self):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
     
-
-class LlamaRotaryEmbedding(Module):
-    from transformers.models.llama.configuration_llama import LlamaConfig
-    def __init__(self, config: LlamaConfig, device=None):
-        super().__init__()
-        # BC: "rope_type" was originally "type"
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
-            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-        else:
-            self.rope_type = "default"
-        self.max_seq_len_cached = config.max_position_embeddings
-        self.original_max_seq_len = config.max_position_embeddings
-
-        self.config = config
-        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
-
-    def _dynamic_frequency_update(self, position_ids, device):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
-        seq_len = torch.max(position_ids) + 1
-        if seq_len > self.max_seq_len_cached:  # growth
-            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
-            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
-            self.max_seq_len_cached = seq_len
-
-        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            self.original_inv_freq = self.original_inv_freq.to(device)
-            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
-            self.max_seq_len_cached = self.original_max_seq_len
-
-    @torch.no_grad()
-    def forward(self, x, position_ids):
-        if "dynamic" in self.rope_type:
-            self._dynamic_frequency_update(position_ids, device=x.device)
-
-        # Core RoPE block
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-        position_ids_expanded = position_ids[:, None, :].float()
-        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
-        device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos()
-            sin = emb.sin()
-
-        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+#
+# class LlamaRotaryEmbedding(Module):
+#     def __init__(self,rope_type:str="default",max_seq_len:int=1024,device=None):
+#         super().__init__()
+#         self.max_seq_len_cached = config.max_position_embeddings
+#         self.original_max_seq_len = config.max_position_embeddings
+#
+#         self.config = config
+#         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+#
+#         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+#         self.register_buffer("inv_freq", inv_freq, persistent=False)
+#         self.original_inv_freq = self.inv_freq
+#
+#     def _dynamic_frequency_update(self, position_ids, device):
+#         """
+#         dynamic RoPE layers should recompute `inv_freq` in the following situations:
+#         1 - growing beyond the cached sequence length (allow scaling)
+#         2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+#         """
+#         seq_len = torch.max(position_ids) + 1
+#         if seq_len > self.max_seq_len_cached:  # growth
+#             inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+#             self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+#             self.max_seq_len_cached = seq_len
+#
+#         if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+#             # This .to() is needed if the model has been moved to a device after being initialized (because
+#             # the buffer is automatically moved, but not the original copy)
+#             self.original_inv_freq = self.original_inv_freq.to(device)
+#             self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+#             self.max_seq_len_cached = self.original_max_seq_len
+#
+#     @torch.no_grad()
+#     def forward(self, x, position_ids):
+#         if "dynamic" in self.rope_type:
+#             self._dynamic_frequency_update(position_ids, device=x.device)
+#
+#         # Core RoPE block
+#         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+#         position_ids_expanded = position_ids[:, None, :].float()
+#         # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+#         device_type = x.device.type
+#         device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+#         with torch.autocast(device_type=device_type, enabled=False):
+#             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+#             emb = torch.cat((freqs, freqs), dim=-1)
+#             cos = emb.cos()
+#             sin = emb.sin()
+#
+#         # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+#         cos = cos * self.attention_scaling
+#         sin = sin * self.attention_scaling
+#
+#         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
diff --git a/front/py/deepxutil/numpy/__init__.py b/front/py/deepxutil/numpy/__init__.py
new file mode 100644
index 00000000..3bc44bf0
--- /dev/null
+++ b/front/py/deepxutil/numpy/__init__.py
@@ -0,0 +1,4 @@
+from .io import save_numpy
+__all__ = [
+    'save_numpy',
+]
diff --git a/front/py/deepxutil/numpy/io.py b/front/py/deepxutil/numpy/io.py
new file mode 100644
index 00000000..ae95a17a
--- /dev/null
+++ b/front/py/deepxutil/numpy/io.py
@@ -0,0 +1,17 @@
+from deepx.tensor import Shape,saveShape
+ 
+def save_numpy(t,tensorpath:str):
+    r'''
+    Save a numpy.ndarray in the deepx.tensor format.
+    t: the numpy.ndarray to save; it is returned unchanged. Raises TypeError for any other type.
+    tensorpath: path prefix; writes tensorpath+".shape" and tensorpath+".data".
+    '''
+    from numpy import ascontiguousarray,ndarray
+    if not isinstance(t,ndarray):
+        raise TypeError("t must be a numpy.ndarray")
+    shape=Shape(t.shape)
+    shape._dtype=str(t.dtype)
+    saveShape(shape,tensorpath+".shape")
+    # dump the raw element bytes of a contiguous copy next to the shape file
+    ascontiguousarray(t).tofile(tensorpath+'.data')
+    return t
diff --git a/front/py/deepxutil/torch/__init__.py b/front/py/deepxutil/torch/__init__.py
new file mode 100644
index 00000000..6780053c
--- /dev/null
+++ b/front/py/deepxutil/torch/__init__.py
@@ -0,0 +1,4 @@
+from .io import save_torch
+__all__ = [
+    'save_torch',
+]
diff --git a/front/py/deepxutil/torch/io.py b/front/py/deepxutil/torch/io.py
new file mode 100644
index 00000000..388e0bd9
--- /dev/null
+++ b/front/py/deepxutil/torch/io.py
@@ -0,0 +1,10 @@
+def save_torch(t,path:str):
+    r'''
+    Save a torch.Tensor in the deepx.tensor format (detached, moved to CPU, written via save_numpy).
+    Raises ValueError if t is not a torch.Tensor.
+    '''
+    from torch import Tensor as torch_Tensor
+    from deepxutil.numpy.io import save_numpy
+    if not isinstance(t,torch_Tensor):
+        raise ValueError("t must be a torch.Tensor")
+    save_numpy(t.detach().cpu().numpy(),path)
\ No newline at end of file
diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py
index ddf52f44..3204fc00 100644
--- a/front/py/examples/3_module/1_embedding.py
+++ b/front/py/examples/3_module/1_embedding.py
@@ -26,7 +26,7 @@ def tokenize_text(text, tokenizer):
 # 创建输入
 text = "这是一个测试文本，用于演示嵌入层的使用。"
 torch_input = tokenize_text(text, tokenizer)
-from deepx.nn.functional import save_torch
+from deepxutil.torch import save_torch
 save_torch(torch_input,dir+'input')
 print(torch_input)
 # 创建网络
@@ -47,7 +47,6 @@ def tokenize_text(text, tokenizer):
 input.print()
 
 weight=load(dir+'weight')
-weight.name='embedding_0.weight'
 net = Embedding(tokenizer.vocab_size, 4096,weight=weight)
 out=net.forward(input)
 out.print()
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
deleted file mode 100644
index 8dfacfa1..00000000
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
+++ /dev/null
@@ -1,25 +0,0 @@
-
-# 使用小规模数据以便打印完整结果
-hidden_size = 8
-eps = 1e-6
-
-
-############### DeepX 实现部分 ###############
-from deepx import arange, constant_
-from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm
-
-# 使用相同的数据
-input = arange(2, 3, hidden_size, dtype="float32")
-input.div_(10.0)
-input.sub_(2.0)
-eps = 1e-6
-
-input.print()
-
-# DeepX计算流程
-norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
-# 设置相同的权重
-constant_(norm.weight, 0.5)
-# 前向计算
-output = norm(input)
-output.print()
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
index 85ef6ced..0e350582 100644
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
+++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
@@ -1,16 +1,21 @@
+hidden_size = 8
+eps = 1e-6
+dir='/home/lipeng/model/deepxmodel/llama/'
+
+
+
 ############### PyTorch 实现部分 ###############
 import torch
-from transformers.models.llama.modeling_llama import LlamaRMSNorm
-
 # 使用小规模数据以便打印完整结果
-hidden_size = 8
-pt_input_data = torch.arange(48, dtype=torch.float32).reshape(2, 3, hidden_size) / 10.0 - 2.0
-pt_input = pt_input_data.clone()
-eps = 1e-6
+pt_input = torch.arange(48, dtype=torch.float32).reshape(2, 3, hidden_size) / 10.0 - 2.0
 print("PyTorch 输入:")
 print(pt_input)
+
+from transformers.models.llama.modeling_llama import LlamaRMSNorm as TransformersLlamaRMSNorm
+from deepxutil.torch import save_torch
+save_torch(pt_input,dir+'rmsnorm_input')
 # 使用transformers库中的官方LlamaRMSNorm实现
-pt_norm = LlamaRMSNorm(hidden_size, eps=eps)
+pt_norm = TransformersLlamaRMSNorm(hidden_size, eps=eps)
 # 设置权重为固定值0.5
 with torch.no_grad():
     pt_norm.weight.fill_(0.5)
@@ -19,4 +24,20 @@
 
 
 print("\nPyTorch RMSNorm 结果:")
+print(pt_output.shape)
 print(pt_output)
+
+ 
+############### DeepX 实现部分 ###############
+from deepx import  constant_,load
+from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm
+
+input=load(dir+'rmsnorm_input')
+
+# DeepX计算流程
+norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
+# 设置相同的权重
+constant_(norm.weight, 0.5)
+# 前向计算
+output = norm(input)
+output.print()

From 70a181f5b53eec2e217bac9414238005168c9369 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Thu, 24 Apr 2025 22:46:49 +0800
Subject: [PATCH 2/7] save,load,loadtensordata,loadshape

---
 excuter/cpp-common/src/deepx/tensor.hpp       |  21 ++-
 .../cpp-common/src/deepx/tensorfunc/io.hpp    |  11 +-
 excuter/cpp-common/src/stdutil/fs.cpp         |  64 ++++++-
 excuter/cpp-common/src/stdutil/fs.hpp         |  10 +
 excuter/cpp-common/test/2_saveload.cpp        |  31 ++++
 excuter/cpp-common/test/CMakeLists.txt        |   4 +-
 .../op-mem-cuda/src/deepx/tensorfunc/cuda.hpp |  21 ++-
 .../src/deepx/tensorfunc/io_miaobyte.hpp      | 149 +++------------
 .../deepx/tensorfunc/tensorlife_miaobyte.hpp  |  65 +++++--
 .../src/deepx/tensorfunc/vector_cuda.cuh      |   3 +-
 excuter/op-mem-cuda/src/deepx/tf/arg.hpp      |   6 +-
 .../op-mem-cuda/src/deepx/tf/changeshape.hpp  |   4 +-
 excuter/op-mem-cuda/src/deepx/tf/init.hpp     | 174 +++++++++---------
 excuter/op-mem-cuda/src/deepx/tf/io.hpp       | 121 ++++++++++++
 .../deepx/tensorfunc/tensorlife_miaobyte.hpp  |   6 +-
 excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp   |   6 +-
 .../src/deepx/tf/changeshape.hpp              |   4 +-
 excuter/op-mem-ompsimd/src/deepx/tf/init.hpp  | 115 ++++++------
 excuter/op-mem-ompsimd/src/deepx/tf/io.hpp    |  35 ++++
 19 files changed, 550 insertions(+), 300 deletions(-)
 create mode 100644 excuter/cpp-common/test/2_saveload.cpp

diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp
index 463e42cc..fc33ecde 100644
--- a/excuter/cpp-common/src/deepx/tensor.hpp
+++ b/excuter/cpp-common/src/deepx/tensor.hpp
@@ -28,6 +28,12 @@ namespace deepx
         using CopyFn = void (*)(T *, T *, int);
         CopyFn copyer; // 拷贝内存
 
+        using SaveFn = void (*)(T *,size_t,const std::string &);
+        SaveFn saver; // hook that saves the tensor memory
+
+        using LoadFn = int (*)(const std::string &, T *,int);
+        LoadFn loader; // hook that loads the tensor memory
+
         Tensor() = default;
         Tensor(const vector<int> &s)
         {
@@ -57,6 +63,8 @@ namespace deepx
             newer = tensor.newer;
             deleter = tensor.deleter;
             copyer = tensor.copyer;
+            loader = tensor.loader;
+            saver = tensor.saver;
 
             data = newer(shape.size);
             copyer(tensor.data, data, tensor.shape.size);
@@ -76,6 +84,8 @@ namespace deepx
             deleter = other.deleter;
             copyer = other.copyer;
             newer = other.newer;
+            loader = other.loader;
+            saver = other.saver;
 
             data = other.data;
 
@@ -84,6 +94,8 @@ namespace deepx
             other.deleter = nullptr;
             other.copyer = nullptr;
             other.newer = nullptr;
+            other.loader = nullptr;
+            other.saver = nullptr;
         }
 
         /**
@@ -102,7 +114,8 @@ namespace deepx
             deleter = tensor.deleter;
             copyer = tensor.copyer;
             newer = tensor.newer;
-
+            loader = tensor.loader;
+            saver = tensor.saver;
             data = newer(shape.size);
             if (data != nullptr)
             {
@@ -126,6 +139,10 @@ namespace deepx
             newer = tensor.newer;
             deleter = tensor.deleter;
             copyer = tensor.copyer;
+            loader = tensor.loader;
+            saver = tensor.saver;
+
+
             if (data != nullptr)
             {
                 deleter(data);
@@ -135,6 +152,8 @@ namespace deepx
             tensor.deleter = nullptr;
             tensor.copyer = nullptr;
             tensor.newer = nullptr;
+            tensor.loader = nullptr;
+            tensor.saver = nullptr;
             return *this;
         }
     }; 
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
index 86a1c396..ae4876bf 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
@@ -15,10 +15,13 @@ namespace deepx::tensorfunc{
     void print(const Tensor<T> &t, const std::string &f=""){
         printDispatcher<Author,T>::print(t, f);
     }
-
+    
     template <typename T>
-    void save(Tensor<T> &tensor,const std::string &path);
+    void save(const Tensor<T> &tensor,const std::string &path);
 
+    
+    
+    //load
     template <typename T>
     pair<std::string,shared_ptr<Tensor<T>>> load(const std::string &path);
 
@@ -33,7 +36,9 @@ namespace deepx::tensorfunc{
         std::string tensor_name = filename.substr(0, filename.find_last_of('.'));
         return std::make_pair(tensor_name, shape);
     }
-    
+    // loadData is implemented differently by each executor: a GPU has device memory, so it can load straight from the file into it.
+    template <typename T>
+    void loadData(const std::string &path,Tensor<T> &tensor);
 }
 
 #endif // DEEPX_TENSORFUNC_IO_HPP
diff --git a/excuter/cpp-common/src/stdutil/fs.cpp b/excuter/cpp-common/src/stdutil/fs.cpp
index 63d297e0..7f7a57fe 100644
--- a/excuter/cpp-common/src/stdutil/fs.cpp
+++ b/excuter/cpp-common/src/stdutil/fs.cpp
@@ -1,7 +1,65 @@
 #include "fs.hpp"
-
-namespace  stdutil{
-    string filename(const string &path){
+#include <fstream>
+ 
+namespace stdutil
+{
+    string filename(const string &path)
+    {
         return path.substr(path.find_last_of('/') + 1);
     }
+
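+    /*
+    std::ios::binary  open in binary mode
+    std::ios::in      open for reading
+    std::ios::out     open for writing; the file is created if it does not exist
+    std::ios::trunc   discard the existing contents if the file already exists
+    */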
+
+    void save(const byte *data, size_t size, const string &path)
+    { // Writes `size` bytes to `path`, replacing any existing file; throws std::runtime_error on failure.
+        ofstream ofs(path, ios::binary | ios::out | ios::trunc);
+        if (!ofs.is_open()) throw std::runtime_error("Failed to open file: " + path);
+        ofs.write(reinterpret_cast<const char *>(data), size);
+        if (!ofs.flush()) throw std::runtime_error("Failed to write file: " + path); // flush so buffered write errors surface here
+    }
+
+    void load(const string &path,byte *data,size_t target_size){
+        ifstream ifs(path, ios::binary | ios::in);
+        if (!ifs.is_open())
+        {
+            throw std::runtime_error("Failed to open file: " + path);
+        }
+        ifs.seekg(0, ios::end);
+        size_t size = ifs.tellg();
+        ifs.seekg(0, ios::beg);
+        if(size!=target_size){
+            throw std::runtime_error("file size mismatch: " + path);
+        }
+        ifs.read(reinterpret_cast<char *>(data), size);
+        if (ifs.fail())
+        {
+            throw std::runtime_error("Failed to read file: " + path);
+        }
+        ifs.close();
+    }
+
+    std::pair<size_t,shared_ptr<byte[]>> load(const string &path)
+    {
+        ifstream ifs(path, ios::binary | ios::in);
+        if (!ifs.is_open())
+        {
+            throw std::runtime_error("Failed to open file: " + path);
+        }
+        ifs.seekg(0, ios::end);
+        size_t size = ifs.tellg();
+        ifs.seekg(0, ios::beg);
+        shared_ptr<byte[]> data(new byte[size]);
+        ifs.read(reinterpret_cast<char *>(data.get()), size);
+        if (ifs.fail())
+        {
+            throw std::runtime_error("Failed to read file: " + path);
+        }
+        ifs.close();
+        return std::make_pair(size, data);
+    }
 }
\ No newline at end of file
diff --git a/excuter/cpp-common/src/stdutil/fs.hpp b/excuter/cpp-common/src/stdutil/fs.hpp
index 26826d9b..9917e09d 100644
--- a/excuter/cpp-common/src/stdutil/fs.hpp
+++ b/excuter/cpp-common/src/stdutil/fs.hpp
@@ -2,10 +2,20 @@
 #define DEEPX_STDUTIL_FS_HPP
 
 #include <string>
+#include <memory>
 
 namespace stdutil{
+
+    
+
     using namespace std;
     string filename(const string &path);
+
+    using byte = unsigned char;
+
+    void save(const byte *data,size_t size,const string &path);
+    void load(const string &path,byte *data,size_t target_size);
+    pair<size_t,shared_ptr<byte[]>> load(const string &path);
 }
 
 #endif // DEEPX_STDUTIL_FS_HPP
\ No newline at end of file
diff --git a/excuter/cpp-common/test/2_saveload.cpp b/excuter/cpp-common/test/2_saveload.cpp
new file mode 100644
index 00000000..9e6f4b31
--- /dev/null
+++ b/excuter/cpp-common/test/2_saveload.cpp
@@ -0,0 +1,31 @@
+#include "stdutil/fs.hpp"
+#include <iostream>
+using namespace stdutil;
+// Fills a buffer so that byte i holds 32+i (narrowed to a byte) and hands it to save() for test.bin.
+void test_save(int total_size){
+    stdutil::byte *buffer = new stdutil::byte[total_size];
+    for(int idx=0;idx<total_size;++idx)
+        buffer[idx] = 32 + idx;
+    save(buffer,total_size,"test.bin");
+    delete[] buffer;
+}
+
+// Loads test.bin and prints its bytes; prints "load failed" and stops on a size mismatch.
+void test_load(int total_size ){
+    auto [size,dataptr]=load("test.bin");
+    if (size != static_cast<size_t>(total_size)){
+        cout<<"load failed"<<endl;
+        return; // the buffer holds only `size` bytes, so reading total_size could overrun it
+    }
+    for (size_t i=0;i<size;i++){
+        cout<<dataptr[i]<<" ";
+    }
+    cout<<endl;
+}
+
+// Round trip: write 96 bytes through test_save, then read and print them with test_load.
+int main(int argc,char **argv){
+    const int total_size = 96;
+    test_save(total_size);
+    test_load(total_size);
+}
diff --git a/excuter/cpp-common/test/CMakeLists.txt b/excuter/cpp-common/test/CMakeLists.txt
index d78128d8..127aea0a 100644
--- a/excuter/cpp-common/test/CMakeLists.txt
+++ b/excuter/cpp-common/test/CMakeLists.txt
@@ -7,4 +7,6 @@ target_link_libraries(test_tf deepx_common)
 
 add_executable(test_tfcheck  1_tfcheck.cpp)
 target_link_libraries(test_tfcheck deepx_common)
- 
\ No newline at end of file
+ 
+add_executable(test_saveload  2_saveload.cpp)
+target_link_libraries(test_saveload deepx_common)
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp
index 603415b8..6fce29d0 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp
@@ -1,12 +1,17 @@
 #ifndef DEEPX_TENSORFUNC_CUDA_HPP
 #define DEEPX_TENSORFUNC_CUDA_HPP
 
-#include <cublas_v2.h>
 #include <cstdint>
 #include <stdexcept>
+#include <memory>
+
 
+#include <cublas_v2.h>
+
+ 
 namespace deepx::tensorfunc
 {
+    
     class CublasHandle
     {
     public:
@@ -61,6 +66,20 @@ namespace deepx::tensorfunc
         return {blocks, blocksize};
     };
 
+    using std::shared_ptr;
+    
+    // Copies `size` bytes from device pointer `data` into a new host buffer and
+    // returns {size, buffer}; throws std::runtime_error if CUDA reports an error.
+    inline std::pair<int, std::shared_ptr<unsigned char[]>> device_offload(unsigned char *data,int size)
+    {
+        std::shared_ptr<unsigned char[]> staging(new unsigned char[size]);
+        cudaMemcpy(staging.get(), data, size, cudaMemcpyDeviceToHost);
+        // NOTE(review): `size` is int while callers pass int64_t byte counts — confirm >2GiB is never needed.
+        if (cudaGetLastError() != cudaSuccess)
+            throw std::runtime_error("Failed to copy data from device to host");
+        return {size, staging};
+    }
+
     
 }
 
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
index 00d338fe..5c606393 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -13,7 +13,7 @@
 #include <stdutil/print.hpp>
 #include "deepx/tensorfunc/authors.hpp"
 #include "deepx/tensorfunc/io.hpp"
-
+#include "deepx/tensorfunc/cuda.hpp"
 namespace deepx::tensorfunc
 {
     template <typename T>
@@ -24,20 +24,9 @@ namespace deepx::tensorfunc
             int64_t total_bytes = t.shape.bytes();
 
             // 统一分配CPU内存
-            unsigned char *host_data = new unsigned char[total_bytes];
-            if (host_data == nullptr)
-            {
-                throw std::runtime_error("Failed to allocate host memory");
-            }
-            cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost);
-            if (err != cudaSuccess)
-            {
-                delete[] host_data;
-                throw std::runtime_error("Failed to copy data from device to host");
-            }
-
-            stdutil::print(t.shape.shape, host_data, t.shape.dtype, f);
-            delete[] host_data;
+            unsigned char* device_data=reinterpret_cast<unsigned char*>(t.data);
+            auto [_,host_data]= device_offload(device_data,total_bytes);
+            stdutil::print(t.shape.shape, host_data.get(), t.shape.dtype, f);
         };
     };
 
@@ -50,36 +39,17 @@ namespace deepx::tensorfunc
             int64_t total_bytes = t.shape.bytes();
 
             // 统一分配CPU内存
-            unsigned char *host_data = new unsigned char[total_bytes];
-            if (host_data == nullptr)
-            {
-                throw std::runtime_error("Failed to allocate host memory");
-            }
-
-            // 统一复制数据到CPU
-            cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost);
-            if (err != cudaSuccess)
-            {
-                delete[] host_data;
-                throw std::runtime_error("Failed to copy data from device to host");
-            }
-
-            float *host_float = new float[t.shape.size];
-            if (host_float == nullptr)
-            {
-                delete[] host_data;
-                throw std::runtime_error("Failed to allocate host memory for float conversion");
-            }
-
+            unsigned char* device_data=reinterpret_cast<unsigned char*>(t.data);
+            auto [_,host_data_]= device_offload(device_data,total_bytes);
+            half* host_data=reinterpret_cast<half*>(host_data_.get());
+            shared_ptr<float[]> host_float(new float[t.shape.size]);
             for (size_t i = 0; i < t.shape.size; i++)
             {
-                host_float[i] = __half2float(((half *)host_data)[i]);
+                host_float[i] = __half2float(host_data[i]);
             }
 
-            delete[] host_data;
             // 打印转换后的float数据
-            stdutil::print(t.shape.shape, host_float, Precision::Float32, f);
-            delete[] host_float;
+            stdutil::print(t.shape.shape, host_float.get(), Precision::Float32, f);
         }
     };
 
@@ -91,79 +61,27 @@ namespace deepx::tensorfunc
             int64_t total_bytes = t.shape.bytes();
 
             // 统一分配CPU内存
-            unsigned char *host_data = new unsigned char[total_bytes];
-            if (host_data == nullptr)
-            {
-                throw std::runtime_error("Failed to allocate host memory");
-            }
-
-            // 统一复制数据到CPU
-            cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost);
-            if (err != cudaSuccess)
-            {
-                delete[] host_data;
-                throw std::runtime_error("Failed to copy data from device to host");
-            }
-
-            float *host_float = new float[t.shape.size];
-            if (host_float == nullptr)
-            {
-                delete[] host_data;
-                throw std::runtime_error("Failed to allocate host memory for float conversion");
-            }
+            unsigned char* device_data=reinterpret_cast<unsigned char*>(t.data);
+            auto [_,host_data_]= device_offload(device_data,total_bytes);
+            nv_bfloat16* host_data=reinterpret_cast<nv_bfloat16*>(host_data_.get());
+            shared_ptr<float[]> host_float(new float[t.shape.size]);
 
             for (size_t i = 0; i < t.shape.size; i++)
             {
-                host_float[i] = __bfloat162float(((nv_bfloat16 *)host_data)[i]);
-            }
-            delete[] host_data;
+                host_float[i] = __bfloat162float(host_data[i]);
+            }           
             // 打印转换后的float数据
-            stdutil::print(t.shape.shape, host_float, Precision::Float32, f);
-            delete[] host_float;
+            stdutil::print(t.shape.shape, host_float.get(), Precision::Float32, f); 
         }
     };
 
     template <typename T>
     void save(Tensor<T> &tensor, const std::string &path)
     {
-        // 保存shape
-        std::string shapepath = path + ".shape";
-        std::string shapedata = tensor.shape.toYaml();
-        std::ofstream shape_fs(shapepath, std::ios::binary);
-        shape_fs.write(shapedata.c_str(), shapedata.size());
-        shape_fs.close();
-
-        // 保存data
-        int64_t total_bytes = tensor.shape.bytes();
-
         // 统一分配CPU内存
-        unsigned char *host_data = new unsigned char[total_bytes];
-        if (host_data == nullptr)
-        {
-            throw std::runtime_error("Failed to allocate host memory");
-        }
-
-        // 统一复制数据到CPU
-        cudaError_t err = cudaMemcpy(host_data, tensor.data, total_bytes, cudaMemcpyDeviceToHost);
-        if (err != cudaSuccess)
-        {
-            delete[] host_data;
-            throw std::runtime_error("Failed to copy data from device to host");
-        }
-
-        std::string datapath = path + ".data";
-        std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out);
-
-        if (!data_fs.is_open())
-        {
-            // 如果文件不存在，则创建新文件
-            data_fs.open(datapath, std::ios::binary | std::ios::out);
-        }
-        data_fs.seekp(0);
-        data_fs.write(reinterpret_cast<const char *>(host_data), total_bytes);
-        data_fs.close();
-
-        delete[] host_data;
+        unsigned char* device_data=reinterpret_cast<unsigned char*>(tensor.data);
+        auto [size,host_data]= device_offload(device_data,tensor.shape.bytes());
+        stdutil::save(host_data.get(),size,path);
     };
 
     template <typename T>
@@ -183,36 +101,19 @@ namespace deepx::tensorfunc
 
         // 检查file.size，是否是tensor.size*sizeof(T)
         std::string datapath = path + ".data";
-        std::ifstream data_fs(datapath, std::ios::binary);
-        data_fs.seekg(0, std::ios::end);
-        std::streamsize fileSize = data_fs.tellg();
-        std::streamsize expectedSize = shape.bytes();
-
-        if (fileSize != expectedSize)
-        {
-            throw std::runtime_error("数据文件大小不足: 需要 " + std::to_string(expectedSize) +
+        auto [fileSize,hostdata]=stdutil::load(datapath);
+        if(fileSize!=shape.bytes()){
+            throw std::runtime_error("数据文件大小不足: 需要 " + std::to_string(shape.bytes()) +
                                      " 字节，但文件只有 " + std::to_string(fileSize) + " 字节");
         }
-        data_fs.seekg(0);
-
-        // TODO 从文件，到cuda内存（可能是显存）
-
+        T *host_data=reinterpret_cast<T*>(hostdata.get());
         shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape));
-        unsigned char *host_data = new unsigned char[fileSize];
-        if (host_data == nullptr)
-        {
-            throw std::runtime_error("Failed to allocate host memory");
-        }
-        data_fs.read(reinterpret_cast<char *>(host_data), fileSize);
-        data_fs.close();
-
+        
         cudaError_t err = cudaMemcpy(tensor->data, host_data, fileSize, cudaMemcpyHostToDevice);
         if (err != cudaSuccess)
         {
-            delete[] host_data;
             throw std::runtime_error("Failed to copy data from host to device");
         }
-        delete[] host_data;
         return std::make_pair(tensor_name, tensor);
     }
 }
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
index 8e776a14..5e0c96c0 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
@@ -3,57 +3,92 @@
 
 #include <cuda_runtime.h>
 #include <stdexcept>
+
+#include "stdutil/fs.hpp"
 #include "deepx/tensor.hpp"
 #include "deepx/dtype_cuda.hpp"
 #include "deepx/tensorfunc/tensorlife.hpp"
+#include "deepx/tensorfunc/cuda.hpp"
 // 具体的张量类
 namespace deepx::tensorfunc
 {
+    // NewFn
     template <typename T>
-    static T* dataNew(int size)
+    static T *newFn(int size)
     {
-        T* data;
+        T *data;
         cudaError_t err = cudaMalloc(&data, size * sizeof(T));
-        if (err != cudaSuccess) {
+        if (err != cudaSuccess)
+        {
             throw std::runtime_error("Failed to allocate Unified Memory");
         }
         return data;
     }
 
     template <typename T>
-    static void dataFree(T *data)
+    static void freeFn(T *data)
     {
         cudaFree(data);
     }
 
     template <typename T>
-    static void dataCopy(T *data, T *data2, int size)
+    static void copyFn(T *src, T *dest, int size)
     {
-        cudaMemcpy(data2, data, size * sizeof(T), cudaMemcpyDefault);
+        cudaMemcpy(dest, src, size * sizeof(T), cudaMemcpyDeviceToDevice);
+    }
+
+    // Copies `size` elements of T from device memory to a host staging buffer and
+    // passes the raw bytes to stdutil::save for `path`; device_offload throws if the copy fails.
+    template <typename T>
+    static void saveFn(T *tensorData, size_t size, const std::string &path)
+    {
+        const size_t total_bytes = size * sizeof(T);
+        // device_offload takes a byte pointer and returns {byte count, host buffer}.
+        auto [copied, host_data] = device_offload(reinterpret_cast<unsigned char *>(tensorData), total_bytes);
+        stdutil::save(host_data.get(), total_bytes, path);
+    }
+
+    // Loads the raw bytes of the file at `path` into device memory at `data`
+    // without any conversion. Returns `count`; throws std::runtime_error when
+    // the file size is not count*sizeof(T) or CUDA reports an error.
+    template <typename T>
+    static int loadFn(const std::string &path, T *data, int count)
+    {
+        auto [byte_count, staging] = stdutil::load(path);
+        const bool size_matches = (byte_count == count * sizeof(T));
+        if (!size_matches)
+        {
+            Precision dtype = precision<T>();
+            throw std::runtime_error("file_size!=count*" + precision_str(dtype));
+        }
+        cudaMemcpy(data, staging.get(), byte_count, cudaMemcpyHostToDevice);
+        if (cudaGetLastError() != cudaSuccess)
+            throw std::runtime_error("Failed to copy data from host to device");
+        return count;
+    }
 
     template <typename T>
     Tensor<T> New(const std::vector<int> &shapedata)
     {
         Shape shape(shapedata);
-        shape.dtype=precision<T>();
+        shape.dtype = precision<T>();
         Tensor<T> tensor(shape);
-        tensor.deleter = dataFree<T>;
-        tensor.copyer = dataCopy<T>;
-        tensor.newer = dataNew<T>;
+        tensor.deleter = freeFn<T>;
+        tensor.copyer = copyFn<T>;
+        tensor.newer = newFn<T>;
 
-        tensor.data = dataNew<T>(shape.size);
+        tensor.data = newFn<T>(shape.size);
         return tensor;
     }
- 
+
     template <typename T>
-    void copy(const Tensor<T> &src,Tensor<T> &dst)
+    void copy(const Tensor<T> &src, Tensor<T> &dst)
     {
-        dst.shape=src.shape;
+        dst.shape = src.shape;
         dst.copyer(src.data, dst.data, src.shape.size);
     }
 
-    //rename
+    // rename
 
 }
 #endif // DEEPX_TENSORFUNC_TENSORLIFE_MIAOBYTE_HPP
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh
index 4fe17030..042543d8 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/vector_cuda.cuh
@@ -3,6 +3,7 @@
 
 namespace deepx::tensorfunc
 {
+    //TODO 待验证
     template <typename T>
     __device__ void GridStrideLoopCopy(const T* src, T* dst, int size) {
         int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -12,7 +13,7 @@ namespace deepx::tensorfunc
             dst[i] = src[i];
         }
     }
-    
+     //TODO 待验证
     // 全局复制函数，可从主机调用
     template <typename T>
     __global__ void GridStrideLoopCopyKernel(const T* src, T* dst, int size) {
diff --git a/excuter/op-mem-cuda/src/deepx/tf/arg.hpp b/excuter/op-mem-cuda/src/deepx/tf/arg.hpp
index dcc845fd..4b33c457 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/arg.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/arg.hpp
@@ -21,11 +21,11 @@ namespace deepx::tf
  
         string math_formula() const override
         {
-            return "var argname = argvalue";
+            return "argvalue->argname";
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             if (this->args.size() != 1)
             {
                 error = "argset(int32) must have 1 argument";
@@ -82,7 +82,7 @@ namespace deepx::tf
  
         string math_formula() const override
         {
-            return "shape = [3  4  5]";
+            return "[3  4  5]->shape";
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
index 8b4604f9..db447136 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
@@ -39,7 +39,7 @@ namespace deepx::tf
         int run(shared_ptr<MemBase> mem, string &error) override
         {
             Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            vector<int> shape = this->getvector<int>(1, -1);
+            vector<int> shape = this->getvector<int>(1, true);
             Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
             if (input_type != output_type)
             {
@@ -101,7 +101,7 @@ namespace deepx::tf
         int run(shared_ptr<MemBase> mem, string &error) override
         {
             Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            vector<int> dim_order = this->getvector<int>(1, -1);
+            vector<int> dim_order = this->getvector<int>(1, true);
             Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
 
             if (input_type != output_type)
diff --git a/excuter/op-mem-cuda/src/deepx/tf/init.hpp b/excuter/op-mem-cuda/src/deepx/tf/init.hpp
index 839c83bf..055bb65c 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/init.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/init.hpp
@@ -18,60 +18,67 @@ namespace deepx::tf
         Constant(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "constant";
-            this->metadata.author= Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
         }
- 
+        string math_formula() const override
+        {
+            return "constant(value)->T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Constant<Author>>(*this);
+        }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             auto type = tensor->shape.dtype;
             switch (type)
             {
             case Precision::Float64:
-             
-                tensorfunc::constant<Author, double>(*mem->gettensor<double>(name).get(), this->getvar<double>(1, mem));
+
+                tensorfunc::constant<Author, double>(*mem->gettensor<double>(name).get(), this->getvar<double>(0, mem));
                 break;
-           
+
             case Precision::Float32:
-            
-                tensorfunc::constant<Author, float>(*mem->gettensor<float>(name).get(), this->getvar<float>(1, mem));
+
+                tensorfunc::constant<Author, float>(*mem->gettensor<float>(name).get(), this->getvar<float>(0, mem));
                 break;
-            
+
             case Precision::Float16:
-            
-                tensorfunc::constant<Author, __half>(*mem->gettensor<__half>(name).get(), this->getvar<__half>(1, mem));
+
+                tensorfunc::constant<Author, __half>(*mem->gettensor<__half>(name).get(), this->getvar<__half>(0, mem));
                 break;
-             
+
             case Precision::BFloat16:
-            
-                tensorfunc::constant<Author, __nv_bfloat16>(*mem->gettensor<__nv_bfloat16>(name).get(), this->getvar<__nv_bfloat16>(1, mem));
+
+                tensorfunc::constant<Author, __nv_bfloat16>(*mem->gettensor<__nv_bfloat16>(name).get(), this->getvar<__nv_bfloat16>(0, mem));
                 break;
-            
+
             case Precision::Int64:
-            
-                tensorfunc::constant<Author, int64_t>(*mem->gettensor<int64_t>(name).get(), this->getvar<int64_t>(1, mem));
+
+                tensorfunc::constant<Author, int64_t>(*mem->gettensor<int64_t>(name).get(), this->getvar<int64_t>(0, mem));
                 break;
-            
+
             case Precision::Int32:
-            
-                tensorfunc::constant<Author, int32_t>(*mem->gettensor<int32_t>(name).get(), this->getvar<int32_t>(1, mem));
+
+                tensorfunc::constant<Author, int32_t>(*mem->gettensor<int32_t>(name).get(), this->getvar<int32_t>(0, mem));
                 break;
-            
+
             case Precision::Int16:
-            
-                tensorfunc::constant<Author, int16_t>(*mem->gettensor<int16_t>(name).get(), this->getvar<int16_t>(1, mem));
+
+                tensorfunc::constant<Author, int16_t>(*mem->gettensor<int16_t>(name).get(), this->getvar<int16_t>(0, mem));
                 break;
-            
+
             case Precision::Int8:
-            
-                tensorfunc::constant<Author, int8_t>(*mem->gettensor<int8_t>(name).get(), this->getvar<int8_t>(1, mem));
+
+                tensorfunc::constant<Author, int8_t>(*mem->gettensor<int8_t>(name).get(), this->getvar<int8_t>(0, mem));
                 break;
             case Precision::Bool:
-                tensorfunc::constant<Author, bool>(*mem->gettensor<bool>(name).get(), this->getvar<bool>(1, mem));
+                tensorfunc::constant<Author, bool>(*mem->gettensor<bool>(name).get(), this->getvar<bool>(0, mem));
                 break;
             default:
             {
@@ -81,14 +88,6 @@ namespace deepx::tf
             }
             return 0;
         };
-        string math_formula() const override
-        {
-            return "constant(T1)";
-        }
-        shared_ptr<TF> clone() const override
-        {
-            return make_shared<Constant<Author>>(*this);
-        }
     };
 
     template <typename Author>
@@ -98,14 +97,22 @@ namespace deepx::tf
         Arange(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "arange";
-            this->metadata.author= Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
         }
+        string math_formula() const override
+        {
+            return "arange(start,step)->T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Arange<Author>>(*this);
+        }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             auto type = tensor->shape.dtype;
             switch (type)
@@ -113,49 +120,49 @@ namespace deepx::tf
             case Precision::Float64:
             {
                 auto output = mem->gettensor<double>(name).get();
-                tensorfunc::arange<Author, double>(*output, this->getvar<double>(1, mem), this->getvar<double>(2, mem));
+                tensorfunc::arange<Author, double>(*output, this->getvar<double>(0, mem), this->getvar<double>(1, mem));
                 break;
             }
             case Precision::Float32:
             {
                 auto output = mem->gettensor<float>(name).get();
-                tensorfunc::arange<Author, float>(*output, this->getvar<float>(1, mem), this->getvar<float>(2, mem));
+                tensorfunc::arange<Author, float>(*output, this->getvar<float>(0, mem), this->getvar<float>(1, mem));
                 break;
             }
             case Precision::Float16:
             {
                 auto output = mem->gettensor<__half>(name).get();
-                tensorfunc::arange<Author, __half>(*output, this->getvar<__half>(1, mem), this->getvar<__half>(2, mem));
+                tensorfunc::arange<Author, __half>(*output, this->getvar<__half>(0, mem), this->getvar<__half>(1, mem));
                 break;
             }
             case Precision::BFloat16:
             {
                 auto output = mem->gettensor<__nv_bfloat16>(name).get();
-                tensorfunc::arange<Author, __nv_bfloat16>(*output, this->getvar<__nv_bfloat16>(1, mem), this->getvar<__nv_bfloat16>(2, mem));
+                tensorfunc::arange<Author, __nv_bfloat16>(*output, this->getvar<__nv_bfloat16>(0, mem), this->getvar<__nv_bfloat16>(1, mem));
                 break;
             }
             case Precision::Int64:
             {
                 auto output = mem->gettensor<int64_t>(name).get();
-                tensorfunc::arange<Author, int64_t>(*output, this->getvar<int64_t>(1, mem), this->getvar<int64_t>(2, mem));
+                tensorfunc::arange<Author, int64_t>(*output, this->getvar<int64_t>(0, mem), this->getvar<int64_t>(1, mem));
                 break;
             }
             case Precision::Int32:
             {
                 auto output = mem->gettensor<int32_t>(name).get();
-                tensorfunc::arange<Author, int32_t>(*output, this->getvar<int32_t>(1, mem), this->getvar<int32_t>(2, mem));
+                tensorfunc::arange<Author, int32_t>(*output, this->getvar<int32_t>(0, mem), this->getvar<int32_t>(1, mem));
                 break;
             }
             case Precision::Int16:
             {
                 auto output = mem->gettensor<int16_t>(name).get();
-                tensorfunc::arange<Author, int16_t>(*output, this->getvar<int16_t>(1, mem), this->getvar<int16_t>(2, mem));
+                tensorfunc::arange<Author, int16_t>(*output, this->getvar<int16_t>(0, mem), this->getvar<int16_t>(1, mem));
                 break;
             }
             case Precision::Int8:
             {
                 auto output = mem->gettensor<int8_t>(name).get();
-                tensorfunc::arange<Author, int8_t>(*output, this->getvar<int8_t>(1, mem), this->getvar<int8_t>(2, mem));
+                tensorfunc::arange<Author, int8_t>(*output, this->getvar<int8_t>(0, mem), this->getvar<int8_t>(1, mem));
                 break;
             }
             default:
@@ -166,14 +173,7 @@ namespace deepx::tf
             }
             return 0;
         }
-        string math_formula() const override
-        {
-            return "arange(T1,start,step)";
-        }
-        shared_ptr<TF> clone() const override
-        {
-            return make_shared<Arange<Author>>(*this);
-        }
+        
     };
 
     template <typename Author>
@@ -183,65 +183,75 @@ namespace deepx::tf
         Uniform(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "uniform";
-            this->metadata.author= Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
         }
+        string math_formula() const override
+        {
+            return "uniform(low,high,seed)->T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Uniform<Author>>(*this);
+        }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             auto type = tensor->shape.dtype;
-            unsigned int seed = static_cast<unsigned int>( this->getvar<int>(3, mem));
+            int low_pos=0;
+            int high_pos=1;
+            unsigned int seed = static_cast<unsigned int>(this->getvar<int>(2, mem));
             switch (type)
             {
             case Precision::Float64:
             {
                 auto output = mem->gettensor<double>(name).get();
-                tensorfunc::uniform<Author, double>(*output, this->getvar<double>(1, mem), this->getvar<double>(2, mem), seed);
+                tensorfunc::uniform<Author, double>(*output, this->getvar<double>(low_pos, mem), this->getvar<double>(high_pos, mem), seed);
                 break;
             }
             case Precision::Float32:
             {
                 auto output = mem->gettensor<float>(name).get();
-                tensorfunc::uniform<Author, float>(*output, this->getvar<float>(1, mem), this->getvar<float>(2, mem), seed);
+                tensorfunc::uniform<Author, float>(*output, this->getvar<float>(low_pos, mem), this->getvar<float>(high_pos, mem), seed);
                 break;
             }
             case Precision::Float16:
             {
                 auto output = mem->gettensor<__half>(name).get();
-                tensorfunc::uniform<Author, __half>(*output, this->getvar<__half>(1, mem), this->getvar<__half>(2, mem), seed);
+                tensorfunc::uniform<Author, __half>(*output, this->getvar<__half>(low_pos, mem), this->getvar<__half>(high_pos, mem), seed);
                 break;
             }
             case Precision::BFloat16:
             {
                 auto output = mem->gettensor<__nv_bfloat16>(name).get();
-                tensorfunc::uniform<Author, __nv_bfloat16>(*output, this->getvar<__nv_bfloat16>(1, mem), this->getvar<__nv_bfloat16>(2, mem), seed);
+                tensorfunc::uniform<Author, __nv_bfloat16>(*output, this->getvar<__nv_bfloat16>(low_pos, mem), this->getvar<__nv_bfloat16>(high_pos, mem), seed);
                 break;
             }
             case Precision::Int64:
             {
                 auto output = mem->gettensor<int64_t>(name).get();
-                tensorfunc::uniform<Author, int64_t>(*output, this->getvar<int64_t>(1, mem), this->getvar<int64_t>(2, mem), seed);
+                tensorfunc::uniform<Author, int64_t>(*output, this->getvar<int64_t>(low_pos, mem), this->getvar<int64_t>(high_pos, mem), seed);
                 break;
             }
             case Precision::Int32:
             {
                 auto output = mem->gettensor<int32_t>(name).get();
-                tensorfunc::uniform<Author, int32_t>(*output, this->getvar<int32_t>(1, mem), this->getvar<int32_t>(2, mem), seed);
+                tensorfunc::uniform<Author, int32_t>(*output, this->getvar<int32_t>(low_pos, mem), this->getvar<int32_t>(high_pos, mem), seed);
                 break;
             }
             case Precision::Int16:
             {
                 auto output = mem->gettensor<int16_t>(name).get();
-                tensorfunc::uniform<Author, int16_t>(*output, this->getvar<int16_t>(1, mem), this->getvar<int16_t>(2, mem), seed);
+                tensorfunc::uniform<Author, int16_t>(*output, this->getvar<int16_t>(low_pos, mem), this->getvar<int16_t>(high_pos, mem), seed);
                 break;
             }
             case Precision::Int8:
             {
                 auto output = mem->gettensor<int8_t>(name).get();
-                tensorfunc::uniform<Author, int8_t>(*output, this->getvar<int8_t>(1, mem), this->getvar<int8_t>(2, mem), seed);
+                tensorfunc::uniform<Author, int8_t>(*output, this->getvar<int8_t>(low_pos, mem), this->getvar<int8_t>(high_pos, mem), seed);
                 break;
             }
             default:
@@ -252,16 +262,10 @@ namespace deepx::tf
             }
             return 0;
         }
-        string math_formula() const override
-        {
-            return "uniform(T1,low,high,seed)";
-        }
-        shared_ptr<TF> clone() const override
-        {
-            return make_shared<Uniform<Author>>(*this);
-        }
+        
     };
 
+    //
     template <typename Author>
     class Normal : public TF
     {
@@ -269,7 +273,7 @@ namespace deepx::tf
         Normal(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "normal";
-            this->metadata.author= Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
@@ -277,7 +281,7 @@ namespace deepx::tf
 
         string math_formula() const override
         {
-            return "normal(T1,mean,stddev,seed)";
+            return "normal(mean,stddev,seed)->T1";
         }
         shared_ptr<TF> clone() const override
         {
@@ -285,41 +289,43 @@ namespace deepx::tf
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             auto type = tensor->shape.dtype;
-            unsigned int seed = static_cast<unsigned int>( this->getvar<int>(3, mem));
+            int mean_pos=0;
+            int stddev_pos=1;
+            unsigned int seed = static_cast<unsigned int>(this->getvar<int>(2, mem));
             switch (type)
             {
             case Precision::Float64:
-                tensorfunc::normal<Author, double>(*mem->gettensor<double>(name).get(), this->getvar<double>(1, mem), this->getvar<double>(2, mem), seed);
+                tensorfunc::normal<Author, double>(*mem->gettensor<double>(name).get(), this->getvar<double>(mean_pos, mem), this->getvar<double>(stddev_pos, mem), seed);
                 break;
 
             case Precision::Float32:
-                tensorfunc::normal<Author, float>(*mem->gettensor<float>(name).get(), this->getvar<float>(1, mem), this->getvar<float>(2, mem), seed);
+                tensorfunc::normal<Author, float>(*mem->gettensor<float>(name).get(), this->getvar<float>(mean_pos, mem), this->getvar<float>(stddev_pos, mem), seed);
                 break;
             case Precision::Float16:
-                tensorfunc::normal<Author, __half>(*mem->gettensor<__half>(name).get(), this->getvar<__half>(1, mem), this->getvar<__half>(2, mem), seed);
+                tensorfunc::normal<Author, __half>(*mem->gettensor<__half>(name).get(), this->getvar<__half>(mean_pos, mem), this->getvar<__half>(stddev_pos, mem), seed);
                 break;
 
             case Precision::BFloat16:
-                tensorfunc::normal<Author, __nv_bfloat16>(*mem->gettensor<__nv_bfloat16>(name).get(), this->getvar<__nv_bfloat16>(1, mem), this->getvar<__nv_bfloat16>(2, mem), seed);
+                tensorfunc::normal<Author, __nv_bfloat16>(*mem->gettensor<__nv_bfloat16>(name).get(), this->getvar<__nv_bfloat16>(mean_pos, mem), this->getvar<__nv_bfloat16>(stddev_pos, mem), seed);
                 break;
 
             case Precision::Int64:
-                tensorfunc::normal<Author, int64_t>(*mem->gettensor<int64_t>(name).get(), this->getvar<int64_t>(1, mem), this->getvar<int64_t>(2, mem), seed);
+                tensorfunc::normal<Author, int64_t>(*mem->gettensor<int64_t>(name).get(), this->getvar<int64_t>(mean_pos, mem), this->getvar<int64_t>(stddev_pos, mem), seed);
                 break;
 
             case Precision::Int32:
-                tensorfunc::normal<Author, int32_t>(*mem->gettensor<int32_t>(name).get(), this->getvar<int32_t>(1, mem), this->getvar<int32_t>(2, mem), seed);
+                tensorfunc::normal<Author, int32_t>(*mem->gettensor<int32_t>(name).get(), this->getvar<int32_t>(mean_pos, mem), this->getvar<int32_t>(stddev_pos, mem), seed);
                 break;
 
             case Precision::Int16:
-                tensorfunc::normal<Author, int16_t>(*mem->gettensor<int16_t>(name).get(), this->getvar<int16_t>(1, mem), this->getvar<int16_t>(2, mem), seed);
+                tensorfunc::normal<Author, int16_t>(*mem->gettensor<int16_t>(name).get(), this->getvar<int16_t>(mean_pos, mem), this->getvar<int16_t>(stddev_pos, mem), seed);
                 break;
 
             case Precision::Int8:
-                tensorfunc::normal<Author, int8_t>(*mem->gettensor<int8_t>(name).get(), this->getvar<int8_t>(1, mem), this->getvar<int8_t>(2, mem), seed);
+                tensorfunc::normal<Author, int8_t>(*mem->gettensor<int8_t>(name).get(), this->getvar<int8_t>(mean_pos, mem), this->getvar<int8_t>(stddev_pos, mem), seed);
                 break;
 
             default:
diff --git a/excuter/op-mem-cuda/src/deepx/tf/io.hpp b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
index 8049fc81..64466b15 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
@@ -178,5 +178,126 @@ namespace deepx::tf
             return 0;
         }
     };
+
+    //loadtensor
+    class LoadTensor : public TF
+    {
+    public:
+        LoadTensor(vector<Param> args, vector<Param> returns)
+        {
+            this->name = "loadtensor";
+            this->tftype = "io";
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override
+        {
+            return "loadtensor(path)->tensor";
+        }
+        shared_ptr<TF> clone() const override   
+        {
+            return make_shared<LoadTensor>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            string path = this->args[0].textvalue;  
+            string tensorname = this->returns[0].textvalue;
+            if(!mem->existstensor(tensorname))
+            {   
+                error = "loadtensor " + tensorname + " not exists";
+                return 1;
+            }
+            pair<std::string,Shape> shape_name=tensorfunc::loadShape(path);
+            std::string tensor_name=shape_name.first;
+            Shape shape=shape_name.second;
+            switch (shape.dtype)
+            {
+            case Precision::Float64:{
+                pair<std::string,shared_ptr<Tensor<double>>> t = tensorfunc::load<double>(path);
+                mem->gettensor<double>(tensorname)->copyer(t.second->data,mem->gettensor<double>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            case Precision::Float32:{
+                pair<std::string,shared_ptr<Tensor<float>>> t = tensorfunc::load<float>(path);
+                mem->gettensor<float>(tensorname)->copyer(t.second->data,mem->gettensor<float>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            case Precision::Float16:{
+                pair<std::string,shared_ptr<Tensor<half>>> t = tensorfunc::load<half>(path);
+                mem->gettensor<half>(tensorname)->copyer(t.second->data,mem->gettensor<half>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            case Precision::BFloat16:{
+                pair<std::string,shared_ptr<Tensor<nv_bfloat16>>> t = tensorfunc::load<nv_bfloat16>(path);
+                mem->gettensor<nv_bfloat16>(tensorname)->copyer(t.second->data,mem->gettensor<nv_bfloat16>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            case Precision::Int64:{
+                pair<std::string,shared_ptr<Tensor<int64_t>>> t = tensorfunc::load<int64_t>(path);
+                mem->gettensor<int64_t>(tensorname)->copyer(t.second->data,mem->gettensor<int64_t>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            case Precision::Int32:{
+                pair<std::string,shared_ptr<Tensor<int32_t>>> t = tensorfunc::load<int32_t>(path);
+                mem->gettensor<int32_t>(tensorname)->copyer(t.second->data,mem->gettensor<int32_t>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            case Precision::Int16:{
+                pair<std::string,shared_ptr<Tensor<int16_t>>> t = tensorfunc::load<int16_t>(path);
+                mem->gettensor<int16_t>(tensorname)->copyer(t.second->data,mem->gettensor<int16_t>(tensorname)->data,t.second->shape.size);
+                break;
+            }   
+            case Precision::Int8:{
+                pair<std::string,shared_ptr<Tensor<int8_t>>> t = tensorfunc::load<int8_t>(path);
+                mem->gettensor<int8_t>(tensorname)->copyer(t.second->data,mem->gettensor<int8_t>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            case Precision::Bool:{
+                pair<std::string,shared_ptr<Tensor<bool>>> t = tensorfunc::load<bool>(path);
+                mem->gettensor<bool>(tensorname)->copyer(t.second->data,mem->gettensor<bool>(tensorname)->data,t.second->shape.size);
+                break;
+            }
+            default:
+                break;
+            }
+
+
+            return 0;
+        }
+    };
+
+    //loadtensordata
+    class LoadTensorData : public TF
+    {
+    public:
+        LoadTensorData(vector<Param> args, vector<Param> returns)
+        {   
+            this->name = "loadtensordata";
+            this->tftype = "io";
+            this->args = args;
+            this->returns = returns;
+        }   
+        string math_formula() const override
+        {
+            return "loadtensordata(path)->tensor";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<LoadTensorData>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            string path = this->args[0].textvalue;
+            string tensorname = this->returns[0].textvalue;
+            if(!mem->existstensor(tensorname))
+            {
+                error = "loadtensordata " + tensorname + " not found";
+                return 1;
+            }
+            auto t = *mem->gettensor(tensorname);
+            t.loader(path,t.data,t.shape.size);
+            return 0;
+        }   
+    };
 }
 #endif
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
index c514faa8..b1bc509e 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
@@ -12,7 +12,7 @@ namespace deepx::tensorfunc
 {
 
     template <typename T>
-    static T *dataNew(int size)
+    static T *newFn(int size)
     {
         return static_cast<T *>(MemoryPool::Malloc(size * sizeof(T)));
     }
@@ -39,8 +39,8 @@ namespace deepx::tensorfunc
         Tensor<T> tensor(shape);
         tensor.deleter = dataFree<T>;
         tensor.copyer = dataCopy<T>;
-        tensor.newer = dataNew<T>;
-        tensor.data = dataNew<T>(shape.size);
+        tensor.newer = newFn<T>;
+        tensor.data = newFn<T>(shape.size);
         return tensor;
     };
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp
index 13733fa1..2418cd32 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/arg.hpp
@@ -23,11 +23,11 @@ namespace deepx::tf
  
         string math_formula() const override
         {
-            return "var argname = argvalue";
+            return "argvalue->argname";
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             if (this->args.size() != 1)
             {
                 error = "argset(int32) must have 1 argument";
@@ -84,7 +84,7 @@ namespace deepx::tf
         
         string math_formula() const override
         {
-            return "shape = [3  4  5]";
+            return "[3  4  5]->shape";
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
index e0d96019..0f64432c 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
@@ -43,7 +43,7 @@ namespace deepx::tf
                 return 1;
             }
             Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            vector<int> shape = this->getvector<int>(1, -1);
+            vector<int> shape = this->getvector<int>(1, true);
             Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
             if (input_type != output_type)
             {
@@ -109,7 +109,7 @@ namespace deepx::tf
                 return 1;
             }
             Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            vector<int> dim_order = this->getvector<int>(1, -1);
+            vector<int> dim_order = this->getvector<int>(1,true);
             Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
             if (input_type != output_type)
             {
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp
index 54410653..756f880d 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp
@@ -20,10 +20,17 @@ namespace deepx::tf
             this->args = args;
             this->returns = returns;
         }
- 
+        string math_formula() const override
+        {
+            return "constant(value)->T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Constant<Author>>(*this);
+        }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             if (tensor == nullptr)
             {
@@ -36,38 +43,38 @@ namespace deepx::tf
             case Precision::Float64:
             {
                 auto output = mem->gettensor<double>(name).get();
-                tensorfunc::constant<Author, double>(*output, this->getvar<double>(1, mem));
+                tensorfunc::constant<Author, double>(*output, this->getvar<double>(0, mem));
                 break;
             }
             case Precision::Float32:
             {
                 auto output = mem->gettensor<float>(name).get();
-                tensorfunc::constant<Author, float>(*output, this->getvar<float>(1, mem));
+                tensorfunc::constant<Author, float>(*output, this->getvar<float>(0, mem));
                 break;
             }
 
             case Precision::Int64:
             {
                 auto output = mem->gettensor<int64_t>(name).get();
-                tensorfunc::constant<Author, int64_t>(*output, this->getvar<int64_t>(1, mem));
+                tensorfunc::constant<Author, int64_t>(*output, this->getvar<int64_t>(0, mem));
                 break;
             }
             case Precision::Int32:
             {
                 auto output = mem->gettensor<int32_t>(name).get();
-                tensorfunc::constant<Author, int32_t>(*output, this->getvar<int32_t>(1, mem));
+                tensorfunc::constant<Author, int32_t>(*output, this->getvar<int32_t>(0, mem));
                 break;
             }
             case Precision::Int16:
             {
                 auto output = mem->gettensor<int16_t>(name).get();
-                tensorfunc::constant<Author, int16_t>(*output, this->getvar<int16_t>(1, mem));
+                tensorfunc::constant<Author, int16_t>(*output, this->getvar<int16_t>(0, mem));
                 break;
             }
             case Precision::Int8:
             {
                 auto output = mem->gettensor<int8_t>(name).get();
-                tensorfunc::constant<Author, int8_t>(*output, this->getvar<int8_t>(1, mem));
+                tensorfunc::constant<Author, int8_t>(*output, this->getvar<int8_t>(0, mem));
                 break;
             }
             default:
@@ -78,14 +85,6 @@ namespace deepx::tf
             }
             return 0;
         };
-        string math_formula() const override
-        {
-            return "constant(T1,value)";
-        }
-        shared_ptr<TF> clone() const override
-        {
-            return make_shared<Constant<Author>>(*this);
-        }
     };
 
     // arange
@@ -100,10 +99,18 @@ namespace deepx::tf
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
+        }
+         string math_formula() const override
+        {
+            return "arange(start,step)->T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Arange<Author>>(*this);
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             auto type = tensor->shape.dtype;
             switch (type)
@@ -111,38 +118,38 @@ namespace deepx::tf
             case Precision::Float64:
             {
                 auto output = mem->gettensor<double>(name).get();
-                tensorfunc::arange<Author, double>(*output, this->getvar<double>(1, mem), this->getvar<double>(2, mem));
+                tensorfunc::arange<Author, double>(*output, this->getvar<double>(0, mem), this->getvar<double>(1, mem));
                 break;
             }
             case Precision::Float32:
             {
                 auto output = mem->gettensor<float>(name).get();
-                tensorfunc::arange<Author, float>(*output, this->getvar<float>(1, mem), this->getvar<float>(2, mem));
+                tensorfunc::arange<Author, float>(*output, this->getvar<float>(0, mem), this->getvar<float>(1, mem));
                 break;
             }
 
             case Precision::Int64:
             {
                 auto output = mem->gettensor<int64_t>(name).get();
-                tensorfunc::arange<Author, int64_t>(*output, this->getvar<int64_t>(1, mem), this->getvar<int64_t>(2, mem));
+                tensorfunc::arange<Author, int64_t>(*output, this->getvar<int64_t>(0, mem), this->getvar<int64_t>(1, mem));
                 break;
             }
             case Precision::Int32:
             {
                 auto output = mem->gettensor<int32_t>(name).get();
-                tensorfunc::arange<Author, int32_t>(*output, this->getvar<int32_t>(1, mem), this->getvar<int32_t>(2, mem));
+                tensorfunc::arange<Author, int32_t>(*output, this->getvar<int32_t>(0, mem), this->getvar<int32_t>(1, mem));
                 break;
             }
             case Precision::Int16:
             {
                 auto output = mem->gettensor<int16_t>(name).get();
-                tensorfunc::arange<Author, int16_t>(*output, this->getvar<int16_t>(1, mem), this->getvar<int16_t>(2, mem));
+                tensorfunc::arange<Author, int16_t>(*output, this->getvar<int16_t>(0, mem), this->getvar<int16_t>(1, mem));
                 break;
             }
             case Precision::Int8:
             {
                 auto output = mem->gettensor<int8_t>(name).get();
-                tensorfunc::arange<Author, int8_t>(*output, this->getvar<int8_t>(1, mem), this->getvar<int8_t>(2, mem));
+                tensorfunc::arange<Author, int8_t>(*output, this->getvar<int8_t>(0, mem), this->getvar<int8_t>(1, mem));
                 break;
             }
             default:
@@ -153,14 +160,7 @@ namespace deepx::tf
             }
             return 0;
         }
-        string math_formula() const override
-        {
-            return "arange(T1,start,step)";
-        }
-        shared_ptr<TF> clone() const override
-        {
-            return make_shared<Arange<Author>>(*this);
-        }
+       
     };
 
     // uniform
@@ -176,48 +176,59 @@ namespace deepx::tf
             this->args = args;
             this->returns = returns;
         }
+        string math_formula() const override
+        {
+            return "uniform(low,high,seed)->T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Uniform<Author>>(*this);
+        }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             auto type = tensor->shape.dtype;
+            int low_pos=0;
+            int high_pos=1;
+            unsigned int seed = static_cast<unsigned int>(this->getvar<int>(2, mem));
             switch (type)
             {
             case Precision::Float64:
             {
                 auto output = mem->gettensor<double>(name).get();
-                tensorfunc::uniform<Author, double>(*output, this->getvar<double>(1, mem), this->getvar<double>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::uniform<Author, double>(*output, this->getvar<double>(low_pos, mem), this->getvar<double>(high_pos, mem), seed);
                 break;
             }
             case Precision::Float32:
             {
                 auto output = mem->gettensor<float>(name).get();
-                tensorfunc::uniform<Author, float>(*output, this->getvar<float>(1, mem), this->getvar<float>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::uniform<Author, float>(*output, this->getvar<float>(low_pos, mem), this->getvar<float>(high_pos, mem), seed);
                 break;
             }
 
             case Precision::Int64:
             {
                 auto output = mem->gettensor<int64_t>(name).get();
-                tensorfunc::uniform<Author, int64_t>(*output, this->getvar<int64_t>(1, mem), this->getvar<int64_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::uniform<Author, int64_t>(*output, this->getvar<int64_t>(low_pos, mem), this->getvar<int64_t>(high_pos, mem), seed);
                 break;
             }
             case Precision::Int32:
             {
                 auto output = mem->gettensor<int32_t>(name).get();
-                tensorfunc::uniform<Author, int32_t>(*output, this->getvar<int32_t>(1, mem), this->getvar<int32_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::uniform<Author, int32_t>(*output, this->getvar<int32_t>(low_pos, mem), this->getvar<int32_t>(high_pos, mem), seed);
                 break;
             }
             case Precision::Int16:
             {
                 auto output = mem->gettensor<int16_t>(name).get();
-                tensorfunc::uniform<Author, int16_t>(*output, this->getvar<int16_t>(1, mem), this->getvar<int16_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::uniform<Author, int16_t>(*output, this->getvar<int16_t>(low_pos, mem), this->getvar<int16_t>(high_pos, mem), seed);
                 break;
             }
             case Precision::Int8:
             {
                 auto output = mem->gettensor<int8_t>(name).get();
-                tensorfunc::uniform<Author, int8_t>(*output, this->getvar<int8_t>(1, mem), this->getvar<int8_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::uniform<Author, int8_t>(*output, this->getvar<int8_t>(low_pos, mem), this->getvar<int8_t>(high_pos, mem), seed);
                 break;
             }
             default:
@@ -228,14 +239,7 @@ namespace deepx::tf
             }
             return 0;
         }
-        string math_formula() const override
-        {
-            return "uniform(T1,low,high,seed)";
-        }
-        shared_ptr<TF> clone() const override
-        {
-            return make_shared<Uniform<Author>>(*this);
-        }
+        
     };
 
     // normal
@@ -254,7 +258,7 @@ namespace deepx::tf
 
         string math_formula() const override
         {
-            return "normal(T1,mean,stddev,seed)";
+            return "normal(mean,stddev,seed)->T1";
         }
         shared_ptr<TF> clone() const override
         {
@@ -262,28 +266,31 @@ namespace deepx::tf
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             auto tensor = mem->gettensor(name).get();
             auto type = tensor->shape.dtype;
+            int mean_pos=0;
+            int stddev_pos=1;
+            unsigned int seed = static_cast<unsigned int>(this->getvar<int>(2, mem));
             switch (type)
             {
             case Precision::Float64:
-                tensorfunc::normal<Author, double>(*mem->gettensor<double>(name).get(), this->getvar<double>(1, mem), this->getvar<double>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::normal<Author, double>(*mem->gettensor<double>(name).get(), this->getvar<double>(mean_pos, mem), this->getvar<double>(stddev_pos, mem), seed);
                 break;
             case Precision::Float32:
-                tensorfunc::normal<Author, float>(*mem->gettensor<float>(name).get(), this->getvar<float>(1, mem), this->getvar<float>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::normal<Author, float>(*mem->gettensor<float>(name).get(), this->getvar<float>(mean_pos, mem), this->getvar<float>(stddev_pos, mem), seed);
                 break;
             case Precision::Int64:
-                tensorfunc::normal<Author, int64_t>(*mem->gettensor<int64_t>(name).get(), this->getvar<int64_t>(1, mem), this->getvar<int64_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::normal<Author, int64_t>(*mem->gettensor<int64_t>(name).get(), this->getvar<int64_t>(mean_pos, mem), this->getvar<int64_t>(stddev_pos, mem), seed);
                 break;
             case Precision::Int32:
-                tensorfunc::normal<Author, int32_t>(*mem->gettensor<int32_t>(name).get(), this->getvar<int32_t>(1, mem), this->getvar<int32_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::normal<Author, int32_t>(*mem->gettensor<int32_t>(name).get(), this->getvar<int32_t>(mean_pos, mem), this->getvar<int32_t>(stddev_pos, mem), seed);
                 break;
             case Precision::Int16:
-                tensorfunc::normal<Author, int16_t>(*mem->gettensor<int16_t>(name).get(), this->getvar<int16_t>(1, mem), this->getvar<int16_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::normal<Author, int16_t>(*mem->gettensor<int16_t>(name).get(), this->getvar<int16_t>(mean_pos, mem), this->getvar<int16_t>(stddev_pos, mem), seed);
                 break;
             case Precision::Int8:
-                tensorfunc::normal<Author, int8_t>(*mem->gettensor<int8_t>(name).get(), this->getvar<int8_t>(1, mem), this->getvar<int8_t>(2, mem), this->getvar<unsigned int>(3, mem));
+                tensorfunc::normal<Author, int8_t>(*mem->gettensor<int8_t>(name).get(), this->getvar<int8_t>(mean_pos, mem), this->getvar<int8_t>(stddev_pos, mem), seed);
                 break;
             default:
             {
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
index 8c6fdc56..eb540f32 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
@@ -168,5 +168,40 @@ namespace deepx::tf
             return 0;
         }
     };
+
+    //loadtensordata
+    class LoadTensorData : public TF
+    {
+    public:
+        LoadTensorData(vector<Param> args, vector<Param> returns)
+        {   
+            this->name = "loadtensordata";
+            this->tftype = "io";
+            this->args = args;
+            this->returns = returns;
+        }   
+        string math_formula() const override
+        {
+            return "loadtensordata(path)->tensor.data";
+        }
+        shared_ptr<TF> clone() const override
+        {   
+            return make_shared<LoadTensorData>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            string path = this->args[0].textvalue;
+            string tensorname = this->returns[0].textvalue;
+            if(!mem->existstensor(tensorname))
+            {
+                error = "loadtensordata " + tensorname + " not found";
+                return 1;
+            }
+            auto t = *mem->gettensor(tensorname);
+            t.loader(path,t.data,t.shape.size);
+            return 0;
+        }
+    };
+    
 }
 #endif // DEEPX_TF_IO_HPP

From fc7d2c500792a5ef913238a9bb99941badba9cc6 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 25 Apr 2025 00:48:56 +0800
Subject: [PATCH 3/7] =?UTF-8?q?save,load=EF=BC=8C=E4=BB=94=E7=BB=86?=
 =?UTF-8?q?=E9=AA=8C=E8=AF=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/excuter/op-mem-cuda/list.md               |  35 +-
 doc/excuter/op-mem-ompsimd/list.md            |  39 +-
 excuter/cpp-common/src/deepx/tensor.hpp       |   2 +-
 .../cpp-common/src/deepx/tensorfunc/io.hpp    |  25 +-
 excuter/op-mem-cuda/src/client/tfs.cpp        | 391 +++++++++---------
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |  39 +-
 .../deepx/tensorfunc/tensorlife_miaobyte.hpp  |   9 +-
 excuter/op-mem-cuda/src/deepx/tf/io.hpp       |  91 +++-
 .../op-mem-cuda/src/deepx/tf/tensorlife.hpp   |  48 +--
 excuter/op-mem-ompsimd/src/client/tfs.cpp     |  60 ++-
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |  46 +--
 .../deepx/tensorfunc/tensorlife_miaobyte.hpp  |  29 +-
 excuter/op-mem-ompsimd/src/deepx/tf/io.hpp    |  51 ++-
 .../src/deepx/tf/tensorlife.hpp               |  52 ++-
 front/py/deepx/nn/functional/__init__.py      |   2 +-
 front/py/deepx/nn/functional/leaffunc_io.py   |   4 +
 front/py/deepx/nn/functional/rtf_init.py      |  17 +-
 front/py/deepx/nn/functional/rtf_io.py        |   7 +
 front/py/deepx/nn/functional/rtf_life.py      |  12 +-
 front/py/deepx/tensor/io.py                   |  19 +-
 front/py/deepx/tensor/shape.py                |   9 +-
 front/py/examples/1_tensor/1_clone.py         |   6 +-
 front/py/examples/1_tensor/1_copy.py          |   3 +-
 front/py/examples/1_tensor/1_print.py         |   2 +-
 front/py/examples/1_tensor/2_saveload.py      |  13 +-
 25 files changed, 579 insertions(+), 432 deletions(-)

diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index aef5e0a8..f281979d 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -6,36 +6,43 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| vecset |  none  | vecset(vector<any> value)->(vector<any> name) | shape = [3  4  5] | vecset(vector<any> value)->(vector<any> name) |
-| argset |  none  | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |
+| vecset |  none  | vecset(vector<any> value)->(vector<any> name) | [3  4  5]->shape | vecset(vector<any> value)->(vector<any> name) |
+| argset |  none  | argset(var<any> value)->(var<any> name) | argvalue->argname | argset(var<any> value)->(var<any> name) |
 
 ### tensorlife
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| renametensor |  none  | renametensor(tensor<any> t, var<string> new_name)->() | rename T1 to T2 | renametensor(tensor<any> t, var<string> new_name)->() |
+| renametensor |  none  | renametensor(var<string> new_name)->(tensor<any> t) | rename(newname)->T1 | renametensor(var<string> new_name)->(tensor<any> t) |
 | newtensor |  none  | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
 | newtensor |  none  | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
-| deltensor |  none  | deltensor(tensor<any> t)->() | del T1 | deltensor(tensor<any> t)->() |
-| copytensor |  none  | copytensor(tensor<any> src, tensor<any> dst)->() | T2.data = T1.data | copytensor(tensor<any> src, tensor<any> dst)->() |
+| deltensor |  none  | deltensor()->(tensor<any> t) | del->T1 | deltensor()->(tensor<any> t) |
+| copytensor |  none  | copytensor(tensor<any> src)->(tensor<any> dst) | T1.data->T2.data | copytensor(tensor<any> src)->(tensor<any> dst) |
 
 ### io
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
+| loadtensordata |  none  | loadtensordata(var<string> path)->(tensor<any> t) | loadtensordata(path)->tensor.data | loadtensordata(var<string> path)->(tensor<any> t) |
 | save |  none  | save(tensor<any> t, var<string> path)->() | save(T1,path) | save(tensor<any> t, var<string> path)->() |
 | print | miaobyte | print(tensor<any> t)->() | print(T1) | print(tensor<any> t)->() |
 | print | miaobyte | print(tensor<any> t, var<string> format)->() | print(T1) | print(tensor<any> t, var<string> format)->() |
 | load |  none  | load(var<string> path)->() | load(path) | load(var<string> path)->() |
 
+### matmul
+
+| Operation | Author | Func Def | Math Formula | IR Instruction |
+|-----------|--------|------------|--------------|----------------|
+| matmul | cublas | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
+
 ### init
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| normal | miaobyte | normal(tensor<any> t, var<any> mean, var<any> stddev, var<int32> seed)->() | normal(T1,mean,stddev,seed) | normal(tensor<any> t, var<any> mean, var<any> stddev, var<int32> seed)->() |
-| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
-| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
-| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
+| normal | miaobyte | normal(var<any> mean, var<any> stddev, var<int32> seed)->(tensor<any> t) | normal(mean,stddev,seed)->T1 | normal(var<any> mean, var<any> stddev, var<int32> seed)->(tensor<any> t) |
+| uniform | miaobyte | uniform(var<any> low, var<any> high, var<int32> seed)->(tensor<any> t) | uniform(low,high,seed)->T1 | uniform(var<any> low, var<any> high, var<int32> seed)->(tensor<any> t) |
+| arange | miaobyte | arange(var<any> start, var<any> step)->(tensor<any> t) | arange(start,step)->T1 | arange(var<any> start, var<any> step)->(tensor<any> t) |
+| constant | miaobyte | constant(var<any> value)->(tensor<any> t) | constant(value)->T1 | constant(var<any> value)->(tensor<any> t) |
 
 ### elementwise
 
@@ -64,20 +71,14 @@
 | sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | sqrt | miaobyte | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=sqrt(T1) | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
 | subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
+| exp | miaobyte | exp(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=exp(T1) | exp(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
+| mul | miaobyte | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1*T2 | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | equal | miaobyte | equal(tensor<any> A, tensor<any> B, var<float64> epsilon)->(tensor<bool> mask) | mask=compare(T1, T2) | equal(tensor<any> A, tensor<any> B, var<float64> epsilon)->(tensor<bool> mask) |
 | mulscalar | miaobyte | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1*scalar | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
 | div | miaobyte | div(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1/T2 | div(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | invert | miaobyte | invert(tensor<int64|int32|int16|int8> A)->(tensor<int64|int32|int16|int8> C) | T3=~T1 | invert(tensor<int64|int32|int16|int8> A)->(tensor<int64|int32|int16|int8> C) |
 | max | miaobyte | max(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=max(T1, T2) | max(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | pow | miaobyte | pow(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=pow(T1, T2) | pow(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
-| mul | miaobyte | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1*T2 | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
-| exp | miaobyte | exp(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=exp(T1) | exp(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
-
-### matmul
-
-| Operation | Author | Func Def | Math Formula | IR Instruction |
-|-----------|--------|------------|--------------|----------------|
-| matmul | cublas | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 
 ### reduce
 
diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
index 9f43ccf4..8bcfcdb0 100644
--- a/doc/excuter/op-mem-ompsimd/list.md
+++ b/doc/excuter/op-mem-ompsimd/list.md
@@ -6,36 +6,44 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| vecset |  none  | vecset(vector<any> value)->(vector<any> name) | shape = [3  4  5] | vecset(vector<any> value)->(vector<any> name) |
-| argset |  none  | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |
+| vecset |  none  | vecset(vector<any> value)->(vector<any> name) | [3  4  5]->shape | vecset(vector<any> value)->(vector<any> name) |
+| argset |  none  | argset(var<any> value)->(var<any> name) | argvalue->argname | argset(var<any> value)->(var<any> name) |
 
 ### tensorlife
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| renametensor |  none  | renametensor(tensor<any> t, var<string> new_name)->() | rename T1 to T2 | renametensor(tensor<any> t, var<string> new_name)->() |
-| newtensor |  none  | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
+| renametensor |  none  | renametensor(var<string> new_name)->(tensor<any> t) | rename(newname)->T1 | renametensor(var<string> new_name)->(tensor<any> t) |
+| newtensor |  none  | newtensor(vector<int32> shape)->(tensor<any> t) | T1 =Tensor(shape=[...]) | newtensor(vector<int32> shape)->(tensor<any> t) |
 | newtensor |  none  | newtensor(var<string> shape)->(tensor<any> t) | T1 =Tensor(shape=[...]) | newtensor(var<string> shape)->(tensor<any> t) |
-| deltensor |  none  | deltensor(tensor<any> t)->() | del T1 | deltensor(tensor<any> t)->() |
-| copytensor |  none  | copytensor(tensor<any> src, tensor<any> dst)->() | T2.data = T1.data | copytensor(tensor<any> src, tensor<any> dst)->() |
+| deltensor |  none  | deltensor()->(tensor<any> t) | del->T1 | deltensor()->(tensor<any> t) |
+| copytensor |  none  | copytensor(tensor<any> src)->(tensor<any> dst) | T1.data->T2.data | copytensor(tensor<any> src)->(tensor<any> dst) |
 
 ### io
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
+| loadtensordata |  none  | loadtensordata(var<string> path)->(tensor<any> t) | loadtensordata(path)->tensor.data | loadtensordata(var<string> path)->(tensor<any> t) |
 | save |  none  | save(tensor<any> t, var<string> path)->() | save(T1,path) | save(tensor<any> t, var<string> path)->() |
 | print | miaobyte | print(tensor<any> t)->() | print(T1) | print(tensor<any> t)->() |
 | print | miaobyte | print(tensor<any> t, var<string> format)->() | print(T1) | print(tensor<any> t, var<string> format)->() |
 | load |  none  | load(var<string> path)->() | mem.load(path) | load(var<string> path)->() |
 
+### matmul
+
+| Operation | Author | Func Def | Math Formula | IR Instruction |
+|-----------|--------|------------|--------------|----------------|
+| matmul | cblas | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=T1 @ T2 | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
+| matmul | miaobyte | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
+
 ### init
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| normal | miaobyte | normal(tensor<any> t, var<any> mean, var<any> std, var<int32> seed)->() | normal(T1,mean,stddev,seed) | normal(tensor<any> t, var<any> mean, var<any> std, var<int32> seed)->() |
-| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
-| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
-| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
+| normal | miaobyte | normal(var<any> mean, var<any> std, var<int32> seed)->(tensor<any> t) | normal(mean,stddev,seed)->T1 | normal(var<any> mean, var<any> std, var<int32> seed)->(tensor<any> t) |
+| uniform | miaobyte | uniform(var<any> low, var<any> high, var<int32> seed)->(tensor<any> t) | uniform(low,high,seed)->T1 | uniform(var<any> low, var<any> high, var<int32> seed)->(tensor<any> t) |
+| arange | miaobyte | arange(var<any> start, var<any> step)->(tensor<any> t) | arange(start,step)->T1 | arange(var<any> start, var<any> step)->(tensor<any> t) |
+| constant | miaobyte | constant(var<any> value)->(tensor<any> t) | constant(value)->T1 | constant(var<any> value)->(tensor<any> t) |
 
 ### elementwise
 
@@ -61,21 +69,14 @@
 | sub | miaobyte | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1-T2 | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | sqrt | miaobyte | sqrt(tensor<any> A)->(tensor<any> C) | T3=sqrt(T1) | sqrt(tensor<any> A)->(tensor<any> C) |
 | subscalar | miaobyte | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1-scalar | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
+| exp | miaobyte | exp(tensor<any> A)->(tensor<any> C) | T3=exp(T1) | exp(tensor<any> A)->(tensor<any> C) |
+| mul | miaobyte | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1*T2 | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | equal | miaobyte | equal(tensor<any> A, tensor<any> B)->(tensor<bool> mask) | mask=equal(T1,T2) | equal(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
 | mulscalar | miaobyte | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1*scalar | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
 | div | miaobyte | div(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1/T2 | div(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | invert | miaobyte | invert(tensor<int64|int32|int16|int8> A)->(tensor<int64|int32|int16|int8> C) | T3=~T1 | invert(tensor<int64|int32|int16|int8> A)->(tensor<int64|int32|int16|int8> C) |
 | max | miaobyte | max(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=max(T1,T2) | max(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | pow | miaobyte | pow(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1^T2 | pow(tensor<any> A, tensor<any> B)->(tensor<any> C) |
-| mul | miaobyte | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1*T2 | mul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
-| exp | miaobyte | exp(tensor<any> A)->(tensor<any> C) | T3=exp(T1) | exp(tensor<any> A)->(tensor<any> C) |
-
-### matmul
-
-| Operation | Author | Func Def | Math Formula | IR Instruction |
-|-----------|--------|------------|--------------|----------------|
-| matmul | cblas | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=T1 @ T2 | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
-| matmul | miaobyte | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 
 ### reduce
 
diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp
index fc33ecde..c5a6ca55 100644
--- a/excuter/cpp-common/src/deepx/tensor.hpp
+++ b/excuter/cpp-common/src/deepx/tensor.hpp
@@ -31,7 +31,7 @@ namespace deepx
         using SaveFn = void (*)(T *,size_t,const std::string &);
         SaveFn saver; // 保存内存
 
-        using LoadFn = int (*)(const std::string &, T *,int);
+        using LoadFn = void (*)(const std::string &, T *,int);
         LoadFn loader; // 加载内存
 
         Tensor() = default;
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
index ae4876bf..73542616 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
@@ -16,19 +16,24 @@ namespace deepx::tensorfunc{
         printDispatcher<Author,T>::print(t, f);
     }
     
-    template <typename T>
-    void save(const Tensor<T> &tensor,const std::string &path);
-
-    
+    // Write `shape` as YAML to "<tensorPath>.shape"; throws std::runtime_error if that file cannot be opened.
+    inline void saveShape(const Shape &shape,const std::string &tensorPath){
+        const std::string shapepath = tensorPath + ".shape";
+        const std::string shapedata = shape.toYaml();
+        std::ofstream shape_fs(shapepath, std::ios::binary);
+        if (!shape_fs.is_open()) throw std::runtime_error("Failed to open shape file: " + shapepath); // same guard as loadShape
+        shape_fs.write(shapedata.c_str(), shapedata.size()); // the ofstream destructor flushes and closes the file
+    }
     
-    //load
-    template <typename T>
-    pair<std::string,shared_ptr<Tensor<T>>> load(const std::string &path);
-
+ 
     inline pair<std::string,Shape> loadShape(const std::string &path)
     {
         std::string shapepath = path + ".shape";
         std::ifstream shape_fs(shapepath, std::ios::binary);
+        if (!shape_fs.is_open())
+        {
+            throw std::runtime_error("Failed to open shape file: " + shapepath);
+        }
         std::string shapedata((std::istreambuf_iterator<char>(shape_fs)), std::istreambuf_iterator<char>());
         Shape shape;
         shape.fromYaml(shapedata);
@@ -36,9 +41,7 @@ namespace deepx::tensorfunc{
         std::string tensor_name = filename.substr(0, filename.find_last_of('.'));
         return std::make_pair(tensor_name, shape);
     }
-    //对loaddata，不同excuter的实现不同。gpu具有显存，可以直接从文件到显存。
-    template <typename T>
-    void loadData(const std::string &path,Tensor<T> &tensor);
+ 
 }
 
 #endif // DEEPX_TENSORFUNC_IO_HPP
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index 4e9095ca..c98748e2 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -52,26 +52,31 @@ namespace deepx::tf
                                                          {
                                                              Param("tensor1", DataCategory::Tensor, Precision::Any),
                                                          })));
-        //copytensor
+        // copytensor
         tffactory.add_tf(std::make_shared<CopyTensor>(vector<Param>(
-                                                         {
-                                                             Param("src", DataCategory::Tensor, Precision::Any),
-                                                             Param("dst", DataCategory::Tensor, Precision::Any),
-                                                         }),
-                                                     vector<Param>()));
-        //deltensor
-        tffactory.add_tf(std::make_shared<DelTensor>(vector<Param>(
+                                                          {
+                                                              Param("src", DataCategory::Tensor, Precision::Any),
+                                                          }),
+                                                      vector<Param>(
+                                                          {
+                                                              Param("dst", DataCategory::Tensor, Precision::Any),
+                                                          })));
+        // deltensor
+        tffactory.add_tf(std::make_shared<DelTensor>(vector<Param>(),
+                                                     vector<Param>(
                                                          {
                                                              Param("t", DataCategory::Tensor, Precision::Any),
-                                                         }),
-                                                     vector<Param>()));
-        //renametensor
+                                                         })));
+        // renametensor
         tffactory.add_tf(std::make_shared<RenameTensor>(vector<Param>(
-                                                         {
-                                                              Param("t", DataCategory::Tensor, Precision::Any),
-                                                             Param("new_name", DataCategory::Var, Precision::String),
-                                                        }),
-                                                     vector<Param>()));
+                                                            {
+
+                                                                Param("new_name", DataCategory::Var, Precision::String),
+                                                            }),
+                                                        vector<Param>(
+                                                            {
+                                                                Param("t", DataCategory::Tensor, Precision::Any),
+                                                            })));
     }
 
     // init
@@ -80,34 +85,42 @@ namespace deepx::tf
 
         tffactory.add_tf(std::make_shared<Constant<miaobyte>>(vector<Param>(
                                                                   {
-                                                                      Param("t", DataCategory::Tensor, Precision::Any),
+
                                                                       Param("value", DataCategory::Var, Precision::Any),
                                                                   }),
-                                                              vector<Param>()));
+                                                              vector<Param>({
+                                                                  Param("t", DataCategory::Tensor, Precision::Any),
+                                                              })));
 
         tffactory.add_tf(std::make_shared<Arange<miaobyte>>(vector<Param>(
                                                                 {
-                                                                    Param("t", DataCategory::Tensor, Precision::Any),
+
                                                                     Param("start", DataCategory::Var, Precision::Any),
                                                                     Param("step", DataCategory::Var, Precision::Any),
                                                                 }),
-                                                            vector<Param>()));
+                                                            vector<Param>({
+                                                                Param("t", DataCategory::Tensor, Precision::Any),
+                                                            })));
         tffactory.add_tf(std::make_shared<Uniform<miaobyte>>(vector<Param>(
                                                                  {
-                                                                     Param("t", DataCategory::Tensor, Precision::Any),
+
                                                                      Param("low", DataCategory::Var, Precision::Any),
                                                                      Param("high", DataCategory::Var, Precision::Any),
                                                                      Param("seed", DataCategory::Var, Precision::Int32),
                                                                  }),
-                                                             vector<Param>()));
+                                                             vector<Param>({
+                                                                 Param("t", DataCategory::Tensor, Precision::Any),
+                                                             })));
         tffactory.add_tf(std::make_shared<Normal<miaobyte>>(vector<Param>(
-                                                                 {
-                                                                     Param("t", DataCategory::Tensor, Precision::Any),
-                                                                     Param("mean", DataCategory::Var, Precision::Any),
-                                                                     Param("stddev", DataCategory::Var, Precision::Any),
-                                                                     Param("seed", DataCategory::Var, Precision::Int32),    
-                                                                 }),
-                                                             vector<Param>())); 
+                                                                {
+
+                                                                    Param("mean", DataCategory::Var, Precision::Any),
+                                                                    Param("stddev", DataCategory::Var, Precision::Any),
+                                                                    Param("seed", DataCategory::Var, Precision::Int32),
+                                                                }),
+                                                            vector<Param>({
+                                                                Param("t", DataCategory::Tensor, Precision::Any),
+                                                            })));
     }
     // io
     void register_io(TfFactory &opfactory)
@@ -126,18 +139,26 @@ namespace deepx::tf
                                                            vector<Param>()));
 
         opfactory.add_tf(std::make_shared<Save>(vector<Param>(
-                                                               {
-                                                                   Param("t", DataCategory::Tensor, Precision::Any),
-                                                                   Param("path", DataCategory::Var, Precision::String),
-                                                               }),
-                                                           vector<Param>()));
+                                                    {
+                                                        Param("t", DataCategory::Tensor, Precision::Any),
+                                                        Param("path", DataCategory::Var, Precision::String),
+                                                    }),
+                                                vector<Param>()));
 
         opfactory.add_tf(std::make_shared<Load>(vector<Param>(
-                                                               {
-                                                                   Param("path", DataCategory::Var, Precision::String),
-                                                               }),
-                                                           vector<Param>())); 
-                                                           
+                                                    {
+                                                        Param("path", DataCategory::Var, Precision::String),
+                                                    }),
+                                                vector<Param>()));
+        // loadtensordata
+        opfactory.add_tf(std::make_shared<LoadTensorData>(vector<Param>(
+                                                              {
+                                                                  Param("path", DataCategory::Var, Precision::String),
+                                                              }),
+                                                          vector<Param>(
+                                                              {
+                                                                  Param("t", DataCategory::Tensor, Precision::Any),
+                                                              })));
     }
 
     // elementwise
@@ -234,16 +255,16 @@ namespace deepx::tf
                                                                     {
                                                                         Param("C", DataCategory::Tensor, Precision::Any),
                                                                     })));
-        //invert
+        // invert
         tffactory.add_tf(std::make_shared<Invert<miaobyte>>(vector<Param>(
-                                                                 {
-                                                                     Param("A", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8),
-                                                                 }),
-                                                                 vector<Param>(
-                                                                     {
-                                                                         Param("C", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8),
-                                                                     })));
-                                                                     
+                                                                {
+                                                                    Param("A", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8),
+                                                                }),
+                                                            vector<Param>(
+                                                                {
+                                                                    Param("C", DataCategory::Tensor, Precision::Int64 | Precision::Int32 | Precision::Int16 | Precision::Int8),
+                                                                })));
+
         tffactory.add_tf(std::make_shared<Sqrt<miaobyte>>(vector<Param>(
                                                               {
                                                                   Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16),
@@ -271,17 +292,17 @@ namespace deepx::tf
                                                                    {
                                                                        Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
                                                                    })));
-        //rpowscalar
+        // rpowscalar
         tffactory.add_tf(std::make_shared<RpowScalar<miaobyte>>(vector<Param>(
-                                                                   {
-                                                                       Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
-                                                                       Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
-                                                                   }),
-                                                               vector<Param>(
-                                                                   {
-                                                                       Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
-                                                                   })));
-        //log
+                                                                    {
+                                                                        Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
+                                                                        Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
+                                                                    }),
+                                                                vector<Param>(
+                                                                    {
+                                                                        Param("C", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
+                                                                    })));
+        // log
         tffactory.add_tf(std::make_shared<Log<miaobyte>>(vector<Param>(
                                                              {
                                                                  Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32 | Precision::Float16 | Precision::BFloat16),
@@ -358,77 +379,77 @@ namespace deepx::tf
                                                                    {
                                                                        Param("C", DataCategory::Tensor, Precision::Any),
                                                                    })));
-        //equal
+        // equal
         tffactory.add_tf(std::make_shared<Equal<miaobyte>>(vector<Param>(
+                                                               {
+                                                                   Param("A", DataCategory::Tensor, Precision::Any),
+                                                                   Param("B", DataCategory::Tensor, Precision::Any),
+                                                                   Param("epsilon", DataCategory::Var, Precision::Float64),
+                                                               }),
+                                                           vector<Param>(
+                                                               {
+                                                                   Param("mask", DataCategory::Tensor, Precision::Bool),
+                                                               })));
+        tffactory.add_tf(std::make_shared<EqualScalar<miaobyte>>(vector<Param>(
+                                                                     {
+                                                                         Param("A", DataCategory::Tensor, Precision::Any),
+                                                                         Param("scalar", DataCategory::Var, Precision::Any),
+                                                                         Param("epsilon", DataCategory::Var, Precision::Float64),
+                                                                     }),
+                                                                 vector<Param>(
+                                                                     {
+                                                                         Param("mask", DataCategory::Tensor, Precision::Bool),
+                                                                     })));
+        // less
+        tffactory.add_tf(std::make_shared<Less<miaobyte>>(vector<Param>(
+                                                              {
+                                                                  Param("A", DataCategory::Tensor, Precision::Any),
+                                                                  Param("B", DataCategory::Tensor, Precision::Any),
+                                                              }),
+                                                          vector<Param>(
+                                                              {
+                                                                  Param("mask", DataCategory::Tensor, Precision::Bool),
+                                                              })));
+        // lessscalar
+        tffactory.add_tf(std::make_shared<LessScalar<miaobyte>>(vector<Param>(
+                                                                    {
+                                                                        Param("A", DataCategory::Tensor, Precision::Any),
+                                                                        Param("scalar", DataCategory::Var, Precision::Any),
+                                                                    }),
+                                                                vector<Param>(
+                                                                    {
+                                                                        Param("mask", DataCategory::Tensor, Precision::Bool),
+                                                                    })));
+        // greater
+        tffactory.add_tf(std::make_shared<Greater<miaobyte>>(vector<Param>(
                                                                  {
                                                                      Param("A", DataCategory::Tensor, Precision::Any),
                                                                      Param("B", DataCategory::Tensor, Precision::Any),
-                                                                     Param("epsilon", DataCategory::Var, Precision::Float64),
                                                                  }),
                                                              vector<Param>(
                                                                  {
                                                                      Param("mask", DataCategory::Tensor, Precision::Bool),
                                                                  })));
-        tffactory.add_tf(std::make_shared<EqualScalar<miaobyte>>(vector<Param>(
+        // greaterscalar
+        tffactory.add_tf(std::make_shared<GreaterScalar<miaobyte>>(vector<Param>(
                                                                        {
                                                                            Param("A", DataCategory::Tensor, Precision::Any),
                                                                            Param("scalar", DataCategory::Var, Precision::Any),
-                                                                           Param("epsilon", DataCategory::Var, Precision::Float64),
                                                                        }),
                                                                    vector<Param>(
                                                                        {
                                                                            Param("mask", DataCategory::Tensor, Precision::Bool),
                                                                        })));
-        //less
-        tffactory.add_tf(std::make_shared<Less<miaobyte>>(vector<Param>(
-                                                                 {
-                                                                     Param("A", DataCategory::Tensor, Precision::Any),
-                                                                     Param("B", DataCategory::Tensor, Precision::Any),
-                                                                 }),    
-                                                                 vector<Param>(
-                                                                     {
-                                                                         Param("mask", DataCategory::Tensor, Precision::Bool),
-                                                                     })));
-        //lessscalar
-        tffactory.add_tf(std::make_shared<LessScalar<miaobyte>>(vector<Param>(
-                                                                   {
-                                                                       Param("A", DataCategory::Tensor, Precision::Any),
-                                                                       Param("scalar", DataCategory::Var, Precision::Any),
-                                                                   }),
-                                                               vector<Param>(
-                                                                   {
-                                                                       Param("mask", DataCategory::Tensor, Precision::Bool),
-                                                                   })));
-        //greater
-        tffactory.add_tf(std::make_shared<Greater<miaobyte>>(vector<Param>(     
-                                                                 {
-                                                                     Param("A", DataCategory::Tensor, Precision::Any),
-                                                                     Param("B", DataCategory::Tensor, Precision::Any),
-                                                                 }),
-                                                                 vector<Param>(
-                                                                     {
-                                                                         Param("mask", DataCategory::Tensor, Precision::Bool),
-                                                                     })));
-        //greaterscalar
-        tffactory.add_tf(std::make_shared<GreaterScalar<miaobyte>>(vector<Param>(
-                                                                   {
-                                                                       Param("A", DataCategory::Tensor, Precision::Any),
-                                                                       Param("scalar", DataCategory::Var, Precision::Any),
-                                                                   }),  
-                                                               vector<Param>(
-                                                                   {
-                                                                       Param("mask", DataCategory::Tensor, Precision::Bool),
-                                                                   })));
-        //switch    
-        tffactory.add_tf(std::make_shared<Switch<miaobyte>>(vector<Param>(  
-                                                                 {
-                                                                     Param("tensors", DataCategory::ListTensor, Precision::Any),
-                                                                     Param("cases", DataCategory::Tensor,Precision::Int8),
-                                                                 }),
-                                                                 vector<Param>(
-                                                                     {
-                                                                         Param("result", DataCategory::Tensor, Precision::Any), 
-                                                                     })));
+        // switch
+        tffactory.add_tf(std::make_shared<Switch<miaobyte>>(vector<Param>(
+                                                                {
+                                                                    Param("tensors", DataCategory::ListTensor, Precision::Any),
+                                                                    Param("cases", DataCategory::Tensor, Precision::Int8),
+                                                                }),
+                                                            vector<Param>(
+                                                                {
+                                                                    Param("result", DataCategory::Tensor, Precision::Any),
+                                                                })));
     }
     // matmul
     void register_matmul(TfFactory &tffactory)
@@ -458,96 +479,96 @@ namespace deepx::tf
                                                                  })));
         // transpose
         tffactory.add_tf(std::make_shared<Transpose<miaobyte>>(vector<Param>(
-                {
-                    Param("A", DataCategory::Tensor, Precision::Any),
-                    Param("dim_order", DataCategory::Vector, Precision::Int32),
-                }),
-            vector<Param>(
-                {
-                    Param("C", DataCategory::Tensor, Precision::Any),
-                })));
+                                                                   {
+                                                                       Param("A", DataCategory::Tensor, Precision::Any),
+                                                                       Param("dim_order", DataCategory::Vector, Precision::Int32),
+                                                                   }),
+                                                               vector<Param>(
+                                                                   {
+                                                                       Param("C", DataCategory::Tensor, Precision::Any),
+                                                                   })));
         // concat
         tffactory.add_tf(std::make_shared<Concat<miaobyte>>(vector<Param>(
-                {
-                    Param("tensors", DataCategory::ListTensor, Precision::Any),
-                    Param("dim", DataCategory::Var, Precision::Int32),
-                }),
-            vector<Param>(
-                {
-                    Param("result", DataCategory::Tensor, Precision::Any),
-                })));
+                                                                {
+                                                                    Param("tensors", DataCategory::ListTensor, Precision::Any),
+                                                                    Param("dim", DataCategory::Var, Precision::Int32),
+                                                                }),
+                                                            vector<Param>(
+                                                                {
+                                                                    Param("result", DataCategory::Tensor, Precision::Any),
+                                                                })));
         // broadcastTo
         tffactory.add_tf(std::make_shared<BroadcastTo<miaobyte>>(vector<Param>(
-                {
-                    Param("A", DataCategory::Tensor, Precision::Any),
-                    Param("new_shape", DataCategory::Vector, Precision::Int32),
-                }),
-            vector<Param>(
-                {
-                    Param("B", DataCategory::Tensor, Precision::Any),
-                })));
+                                                                     {
+                                                                         Param("A", DataCategory::Tensor, Precision::Any),
+                                                                         Param("new_shape", DataCategory::Vector, Precision::Int32),
+                                                                     }),
+                                                                 vector<Param>(
+                                                                     {
+                                                                         Param("B", DataCategory::Tensor, Precision::Any),
+                                                                     })));
         // indexselect
         tffactory.add_tf(std::make_shared<IndexSelect<miaobyte>>(vector<Param>(
-                {
-                    Param("A", DataCategory::Tensor, Precision::Any),
-                    Param("indices", DataCategory::Tensor, Precision::Int64|Precision::Int32),
-                    Param("axis", DataCategory::Var, Precision::Int32),
-                }), 
-            vector<Param>(
-                {
-                    Param("B", DataCategory::Tensor, Precision::Any),
-                })));
+                                                                     {
+                                                                         Param("A", DataCategory::Tensor, Precision::Any),
+                                                                         Param("indices", DataCategory::Tensor, Precision::Int64 | Precision::Int32),
+                                                                         Param("axis", DataCategory::Var, Precision::Int32),
+                                                                     }),
+                                                                 vector<Param>(
+                                                                     {
+                                                                         Param("B", DataCategory::Tensor, Precision::Any),
+                                                                     })));
     }
-   // reduce
-     void register_reduce(TfFactory &tffactory)
-    {   
+    // reduce
+    void register_reduce(TfFactory &tffactory)
+    {
         // sum
         tffactory.add_tf(std::make_shared<Sum<miaobyte>>(vector<Param>(
-            {
-                Param("A", DataCategory::Tensor, Precision::Any),
-                Param("dims", DataCategory::Vector, Precision::Int32),
-                Param("keepdims", DataCategory::Var, Precision::Bool),
-            }),
-            vector<Param>(
-                {
-                    Param("B", DataCategory::Tensor, Precision::Any),
-                })));   
+                                                             {
+                                                                 Param("A", DataCategory::Tensor, Precision::Any),
+                                                                 Param("dims", DataCategory::Vector, Precision::Int32),
+                                                                 Param("keepdims", DataCategory::Var, Precision::Bool),
+                                                             }),
+                                                         vector<Param>(
+                                                             {
+                                                                 Param("B", DataCategory::Tensor, Precision::Any),
+                                                             })));
         // prod
         tffactory.add_tf(std::make_shared<Prod<miaobyte>>(vector<Param>(
-            {
-                Param("A", DataCategory::Tensor, Precision::Any),
-                Param("dims", DataCategory::Vector, Precision::Int32),
-                Param("keepdims", DataCategory::Var, Precision::Bool),
-            }), 
-            vector<Param>(
-                {
-                    Param("B", DataCategory::Tensor, Precision::Any),
-                })));  
+                                                              {
+                                                                  Param("A", DataCategory::Tensor, Precision::Any),
+                                                                  Param("dims", DataCategory::Vector, Precision::Int32),
+                                                                  Param("keepdims", DataCategory::Var, Precision::Bool),
+                                                              }),
+                                                          vector<Param>(
+                                                              {
+                                                                  Param("B", DataCategory::Tensor, Precision::Any),
+                                                              })));
 
         // max
         tffactory.add_tf(std::make_shared<ReduceMax<miaobyte>>(vector<Param>(
-            {
-                Param("A", DataCategory::Tensor, Precision::Any),
-                Param("dims", DataCategory::Vector, Precision::Int32),
-                Param("keepdims", DataCategory::Var, Precision::Bool),
-            }),
-            vector<Param>(
-                {
-                    Param("B", DataCategory::Tensor, Precision::Any),
-                })));
+                                                                   {
+                                                                       Param("A", DataCategory::Tensor, Precision::Any),
+                                                                       Param("dims", DataCategory::Vector, Precision::Int32),
+                                                                       Param("keepdims", DataCategory::Var, Precision::Bool),
+                                                                   }),
+                                                               vector<Param>(
+                                                                   {
+                                                                       Param("B", DataCategory::Tensor, Precision::Any),
+                                                                   })));
         // min
         tffactory.add_tf(std::make_shared<ReduceMin<miaobyte>>(vector<Param>(
-            {
-                Param("A", DataCategory::Tensor, Precision::Any),
-                Param("dims", DataCategory::Vector, Precision::Int32),
-                Param("keepdims", DataCategory::Var, Precision::Bool),
-            }), 
-            vector<Param>(
-                {
-                    Param("B", DataCategory::Tensor, Precision::Any),
-                })));
+                                                                   {
+                                                                       Param("A", DataCategory::Tensor, Precision::Any),
+                                                                       Param("dims", DataCategory::Vector, Precision::Int32),
+                                                                       Param("keepdims", DataCategory::Var, Precision::Bool),
+                                                                   }),
+                                                               vector<Param>(
+                                                                   {
+                                                                       Param("B", DataCategory::Tensor, Precision::Any),
+                                                                   })));
     }
- 
+
     int register_all(TfFactory &tffactory)
     {
         register_lifecycle(tffactory);
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
index 5c606393..4d0a096f 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -75,22 +75,15 @@ namespace deepx::tensorfunc
         }
     };
 
+     //load
     template <typename T>
-    void save(Tensor<T> &tensor, const std::string &path)
-    {
-        // 统一分配CPU内存
-        unsigned char* device_data=reinterpret_cast<unsigned char*>(tensor.data);
-        auto [size,host_data]= device_offload(device_data,tensor.shape.bytes());
-        stdutil::save(host_data.get(),size,path);
-    };
-
-    template <typename T>
-    pair<std::string, shared_ptr<Tensor<T>>> load(const std::string &path)
+    pair<std::string,shared_ptr<Tensor<T>>> load(const std::string &path)
     {
         // 加载shape
-        pair<std::string, Shape> shape_name = loadShape(path);
-        Shape shape = shape_name.second;
-        std::string tensor_name = shape_name.first;
+        pair<std::string,Shape> shape_name=loadShape(path);
+        Shape shape=shape_name.second;
+        std::string tensor_name=shape_name.first;
+ 
 
         // 检查T 和 shape.dtype 是否匹配
         if (shape.dtype != precision<T>())
@@ -98,23 +91,11 @@ namespace deepx::tensorfunc
             throw std::runtime_error("调用load<" + precision_str(shape.dtype) + "> 不匹配: 需要 " + precision_str(shape.dtype) +
                                      " 类型，但文件为" + precision_str(precision<T>()) + " 类型");
         }
-
-        // 检查file.size，是否是tensor.size*sizeof(T)
-        std::string datapath = path + ".data";
-        auto [fileSize,hostdata]=stdutil::load(datapath);
-        if(fileSize!=shape.bytes()){
-            throw std::runtime_error("数据文件大小不足: 需要 " + std::to_string(shape.bytes()) +
-                                     " 字节，但文件只有 " + std::to_string(fileSize) + " 字节");
-        }
-        T *host_data=reinterpret_cast<T*>(hostdata.get());
+ 
         shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape));
-        
-        cudaError_t err = cudaMemcpy(tensor->data, host_data, fileSize, cudaMemcpyHostToDevice);
-        if (err != cudaSuccess)
-        {
-            throw std::runtime_error("Failed to copy data from host to device");
-        }
+        tensor->loader(path,tensor->data,tensor->shape.size); // NOTE(review): the removed code read path + ".data"; confirm loader expects the bare path
         return std::make_pair(tensor_name, tensor);
-    }
+    };
+
 }
 #endif // DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
index 5e0c96c0..7334301a 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
@@ -44,13 +44,14 @@ namespace deepx::tensorfunc
         int64_t total_bytes = size * sizeof(T);
 
         // 统一分配CPU内存
-        auto [host_data, err] = device_offload(tensorData, total_bytes);
+
+        auto [_,host_data] = device_offload(reinterpret_cast<unsigned char*>(tensorData), total_bytes);
         stdutil::save(host_data.get(), total_bytes, path);
     }
 
     // 不做任何转换，直接从内存到文件，或从文件到内存
     template <typename T>
-    static int loadFn(const std::string &path, T *data, int count)
+    static void loadFn(const std::string &path, T *data, int count)
     {
         auto [file_size, hostdata] = stdutil::load(path);
         if (file_size != count * sizeof(T))
@@ -64,7 +65,6 @@ namespace deepx::tensorfunc
         {
             throw std::runtime_error("Failed to copy data from host to device");
         }
-        return count;
     }
 
     template <typename T>
@@ -76,6 +76,9 @@ namespace deepx::tensorfunc
         tensor.deleter = freeFn<T>;
         tensor.copyer = copyFn<T>;
         tensor.newer = newFn<T>;
+        tensor.saver = saveFn<T>;
+        tensor.loader = loadFn<T>;
+
 
         tensor.data = newFn<T>(shape.size);
         return tensor;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/io.hpp b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
index 64466b15..cb0a52b9 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
@@ -78,17 +78,64 @@ namespace deepx::tf
         {
             string name = this->args[0].textvalue;
             string path = this->args[1].textvalue;
-            if (mem->existstensor(name))
-            {
-                auto t = mem->gettensor(name);
-                tensorfunc::save<void>(*t, path);
-            }
-            else
+            if (!mem->existstensor(name))
             {
+   
                 std::cerr << "save " << name << " not found" << std::endl;
                 error = "save " + name + " not found";
                 return 1;
             }
+            Precision dtype = mem->gettensor(name)->shape.dtype;
+            switch (dtype)
+            {   
+            case Precision::Float64:{
+                auto t = mem->gettensor<double>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Float32:{
+                auto t = mem->gettensor<float>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Float16:{
+                auto t = mem->gettensor<half>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::BFloat16:{
+                auto t = mem->gettensor<nv_bfloat16>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Int64:{
+                auto t = mem->gettensor<int64_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;  
+            }
+            case Precision::Int32:{
+                auto t = mem->gettensor<int32_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Int16:{
+                auto t = mem->gettensor<int16_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;  
+            }
+            case Precision::Int8:{
+                auto t = mem->gettensor<int8_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Bool:{
+                auto t = mem->gettensor<bool>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;  
+            }
+            default:
+                error = "save " + name + ": unsupported dtype"; return 1; // report failure instead of returning 0 without saving
+            }
             return 0;
         }
     };
@@ -202,59 +249,61 @@ namespace deepx::tf
         {
             string path = this->args[0].textvalue;  
             string tensorname = this->returns[0].textvalue;
-            if(!mem->existstensor(tensorname))
-            {   
-                error = "loadtensor " + tensorname + " not exists";
-                return 1;
-            }
+            
             pair<std::string,Shape> shape_name=tensorfunc::loadShape(path);
             std::string tensor_name=shape_name.first;
             Shape shape=shape_name.second;
+
+            if(mem->existstensor(tensor_name))
+            {
+                cout<<"warning: "<<tensor_name<<" already exists,deepx will delete it,create new one"<<endl;
+                mem->delete_tensor(tensor_name);
+            }
             switch (shape.dtype)
             {
             case Precision::Float64:{
-                pair<std::string,shared_ptr<Tensor<double>>> t = tensorfunc::load<double>(path);
-                mem->gettensor<double>(tensorname)->copyer(t.second->data,mem->gettensor<double>(tensorname)->data,t.second->shape.size);
+                pair<std::string,shared_ptr<Tensor<double>>> t = tensorfunc::load<double>(path); // T must match shape.dtype: load<T> throws on mismatch
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             case Precision::Float32:{
                 pair<std::string,shared_ptr<Tensor<float>>> t = tensorfunc::load<float>(path);
-                mem->gettensor<float>(tensorname)->copyer(t.second->data,mem->gettensor<float>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             case Precision::Float16:{
                 pair<std::string,shared_ptr<Tensor<half>>> t = tensorfunc::load<half>(path);
-                mem->gettensor<half>(tensorname)->copyer(t.second->data,mem->gettensor<half>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             case Precision::BFloat16:{
                 pair<std::string,shared_ptr<Tensor<nv_bfloat16>>> t = tensorfunc::load<nv_bfloat16>(path);
-                mem->gettensor<nv_bfloat16>(tensorname)->copyer(t.second->data,mem->gettensor<nv_bfloat16>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             case Precision::Int64:{
                 pair<std::string,shared_ptr<Tensor<int64_t>>> t = tensorfunc::load<int64_t>(path);
-                mem->gettensor<int64_t>(tensorname)->copyer(t.second->data,mem->gettensor<int64_t>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             case Precision::Int32:{
                 pair<std::string,shared_ptr<Tensor<int32_t>>> t = tensorfunc::load<int32_t>(path);
-                mem->gettensor<int32_t>(tensorname)->copyer(t.second->data,mem->gettensor<int32_t>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             case Precision::Int16:{
                 pair<std::string,shared_ptr<Tensor<int16_t>>> t = tensorfunc::load<int16_t>(path);
-                mem->gettensor<int16_t>(tensorname)->copyer(t.second->data,mem->gettensor<int16_t>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }   
             case Precision::Int8:{
                 pair<std::string,shared_ptr<Tensor<int8_t>>> t = tensorfunc::load<int8_t>(path);
-                mem->gettensor<int8_t>(tensorname)->copyer(t.second->data,mem->gettensor<int8_t>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             case Precision::Bool:{
                 pair<std::string,shared_ptr<Tensor<bool>>> t = tensorfunc::load<bool>(path);
-                mem->gettensor<bool>(tensorname)->copyer(t.second->data,mem->gettensor<bool>(tensorname)->data,t.second->shape.size);
+                mem->addtensor(tensor_name, t.second);
                 break;
             }
             default:
diff --git a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
index 620b81e1..d21cb60f 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
@@ -145,62 +145,64 @@ namespace deepx::tf
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            if (!checktensors({this->args[0].textvalue, this->args[1].textvalue}, mem, error) != 0)
+            if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0)
             {
                 return 1;
             }
             Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            Precision type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
             if (input_type != type)
             {
                 error = "copytensor: input type and return type must be the same";
                 return 1;
             }
-            switch (input_type)
+            auto src = mem->gettensor(this->args[0].textvalue);
+            auto dst = mem->gettensor(this->returns[0].textvalue);
+            switch (type)
             {
             case Precision::Float64:
             {
-                tensorfunc::copy(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Float32:
             {
-                tensorfunc::copy(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Float16:
             {
-                tensorfunc::copy(*mem->gettensor<__half>(this->args[0].textvalue), *mem->gettensor<__half>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::BFloat16:
             {
-                tensorfunc::copy(*mem->gettensor<__nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<__nv_bfloat16>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int64:
             {
-                tensorfunc::copy(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int32:
             {
-                tensorfunc::copy(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int16:
             {
-                tensorfunc::copy(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int8:
             {
-                tensorfunc::copy(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Bool:
             {
-                tensorfunc::copy(*mem->gettensor<bool>(this->args[0].textvalue), *mem->gettensor<bool>(this->args[1].textvalue));
+                dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             default:
@@ -208,7 +210,7 @@ namespace deepx::tf
                 error = "copytensor: unsupported precision";
                 return 1;
             }
-            };
+            }
             return 0;
         }
 
@@ -234,14 +236,14 @@ namespace deepx::tf
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             mem->delete_tensor(name);
             return 0;
         }
 
         string math_formula() const override
         {
-            return "del T1";
+            return "del->T1";
         }
         shared_ptr<TF> clone() const override
         {
@@ -249,33 +251,33 @@ namespace deepx::tf
         }
     };
 
-    //rename
+    // rename
     class RenameTensor : public TF
     {
     public:
         RenameTensor(vector<Param> args, vector<Param> returns)
         {
-            this->name = "renametensor";    
+            this->name = "renametensor";
             this->tftype = "tensorlife";
             this->args = args;
             this->returns = returns;
         }
-        int run(shared_ptr<MemBase> mem, string &error) override    
+        int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string old_name = this->args[0].textvalue;
-            if (!checktensors({this->args[0].textvalue}, mem, error) != 0)
+            string old_name = this->returns[0].textvalue;
+            if (!checktensors({old_name}, mem, error) != 0)
             {
                 return 1;
             }
 
-            string new_name = this->args[1].textvalue;
- 
+            string new_name = this->args[0].textvalue;
+
             mem->rename_tensor(old_name, new_name);
             return 0;
         }
         string math_formula() const override
         {
-            return "rename T1 to T2";
+            return "rename(newname)->T1";
         }
         shared_ptr<TF> clone() const override
         {
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
index 07df04bd..5080a673 100644
--- a/excuter/op-mem-ompsimd/src/client/tfs.cpp
+++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ -44,7 +44,7 @@ namespace deepx::tf
                                                          }),
                                                      vector<Param>(
                                                          {
-                                                             Param("tensor1", DataCategory::Tensor, Precision::Any),
+                                                             Param("t", DataCategory::Tensor, Precision::Any),
                                                          })));
         // newtensor author=miaobyte
         tffactory.add_tf(std::make_shared<NewTensor>(vector<Param>(
@@ -59,22 +59,28 @@ namespace deepx::tf
         tffactory.add_tf(std::make_shared<CopyTensor>(vector<Param>(
                                                           {
                                                               Param("src", DataCategory::Tensor, Precision::Any),
-                                                              Param("dst", DataCategory::Tensor, Precision::Any),
-                                                          }),
-                                                      vector<Param>()));
+                                                             }),
+                                                      vector<Param>({
+                                                          Param("dst", DataCategory::Tensor, Precision::Any),
+                                                          
+                                                      })));
         // deltensor
         tffactory.add_tf(std::make_shared<DelTensor>(vector<Param>(
                                                          {
-                                                             Param("t", DataCategory::Tensor, Precision::Any),
+                                                           
                                                          }),
-                                                     vector<Param>()));
+                                                     vector<Param>({
+                                                          Param("t", DataCategory::Tensor, Precision::Any),
+                                                     })));
         //renametensor
         tffactory.add_tf(std::make_shared<RenameTensor>(vector<Param>(
                                                          {
-                                                             Param("t", DataCategory::Tensor, Precision::Any),
+                                                           
                                                              Param("new_name", DataCategory::Var, Precision::String),
                                                          }),
-                                                     vector<Param>()));
+                                                     vector<Param>({
+                                                          Param("t", DataCategory::Tensor, Precision::Any),
+                                                     })));
     }
 
     // init
@@ -83,36 +89,47 @@ namespace deepx::tf
         // constant author=miaobyte
         tffactory.add_tf(std::make_shared<Constant<miaobyte>>(vector<Param>(
                                                                   {
-                                                                      Param("t", DataCategory::Tensor, Precision::Any),
+                                                                     
                                                                       Param("value", DataCategory::Var, Precision::Any),
                                                                   }),
-                                                              vector<Param>()));
+                                                              vector<Param>({
+                                                                 Param("t", DataCategory::Tensor, Precision::Any),
+                                                              })));
         // arange author=miaobyte
         tffactory.add_tf(std::make_shared<Arange<miaobyte>>(vector<Param>(
                                                                 {
-                                                                    Param("t", DataCategory::Tensor, Precision::Any),
+                                                                    
                                                                     Param("start", DataCategory::Var, Precision::Any),
                                                                     Param("step", DataCategory::Var, Precision::Any),
                                                                 }),
-                                                            vector<Param>()));
+                                                            vector<Param>({
+                                                                Param("t", DataCategory::Tensor, Precision::Any),
+                                                            })));
         // uniform author=miaobyte
         tffactory.add_tf(std::make_shared<Uniform<miaobyte>>(vector<Param>(
                                                                  {
-                                                                     Param("t", DataCategory::Tensor, Precision::Any),
+                                                                    
                                                                      Param("low", DataCategory::Var, Precision::Any),
                                                                      Param("high", DataCategory::Var, Precision::Any),
                                                                      Param("seed", DataCategory::Var, Precision::Int32),
                                                                  }),
-                                                             vector<Param>()));
+                                                             vector<Param>(
+                                                                {
+                                                                     Param("t", DataCategory::Tensor, Precision::Any),
+                                                                }
+                                                             )));
         // normal author=miaobyte
         tffactory.add_tf(std::make_shared<Normal<miaobyte>>(vector<Param>(
                                                                 {
-                                                                    Param("t", DataCategory::Tensor, Precision::Any),
+                                                                    
                                                                     Param("mean", DataCategory::Var, Precision::Any),
                                                                     Param("std", DataCategory::Var, Precision::Any),
                                                                     Param("seed", DataCategory::Var, Precision::Int32),
                                                                 }),
-                                                            vector<Param>()));
+                                                            vector<Param>(
+                                                                {
+                                                                     Param("t", DataCategory::Tensor, Precision::Any),
+                                                                })));
     }
     // io
     void register_io(TfFactory &opfactory)
@@ -143,7 +160,16 @@ namespace deepx::tf
                                                                {
                                                                    Param("path", DataCategory::Var, Precision::String),
                                                                }),
-                                                           vector<Param>()));                                                
+                                                           vector<Param>()));
+        //loadtensordata
+        opfactory.add_tf(std::make_shared<LoadTensorData>(vector<Param>(
+                                                               {
+                                                                   Param("path", DataCategory::Var, Precision::String),
+                                                               }),
+                                                           vector<Param>(
+                                                               {
+                                                                   Param("t", DataCategory::Tensor, Precision::Any),
+                                                               })));   
     }
 
     // elementwise
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
index 38c12f32..784fe767 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -36,34 +36,8 @@ namespace deepx::tensorfunc
         }
     };
 
-    template <typename T>
-    void save(Tensor<T> &tensor, const std::string &path)
-    {
-
-        // 保存shape
-        std::string shapepath = path + ".shape";
-        std::string shapedata = tensor.shape.toYaml();
-        std::ofstream shape_fs(shapepath, std::ios::binary);
-        shape_fs.write(shapedata.c_str(), shapedata.size());
-        shape_fs.close();
-
-        // 保存data
-        std::string datapath = path + ".data";
-        std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out);
-
-        if (!data_fs.is_open())
-        {
-            // 如果文件不存在，则创建新文件
-            data_fs.open(datapath, std::ios::binary | std::ios::out);
-        }
-        int data_size = tensor.shape.size * precision_bits(tensor.shape.dtype) / 8;
-        data_fs.write(reinterpret_cast<const char *>(tensor.data), data_size);
-        data_fs.close();
-    }
 
     //load
-
-
     template <typename T>
     pair<std::string,shared_ptr<Tensor<T>>> load(const std::string &path)
     {
@@ -79,25 +53,9 @@ namespace deepx::tensorfunc
             throw std::runtime_error("调用load<" + precision_str(shape.dtype) + "> 不匹配: 需要 " + precision_str(shape.dtype) +
                                      " 类型，但文件为" + precision_str(precision<T>()) + " 类型");
         }
-
-        // 检查file.size，是否是tensor.size*sizeof(T)
-        std::string datapath = path + ".data";
-        std::ifstream data_fs(datapath, std::ios::binary);
-        data_fs.seekg(0, std::ios::end);
-        std::streamsize fileSize = data_fs.tellg();
-        std::streamsize expectedSize = shape.size * (precision_bits(shape.dtype) / 8);
-
-        if (fileSize != expectedSize)
-        {
-            throw std::runtime_error("数据文件大小不足: 需要 " + std::to_string(expectedSize) +
-                                     " 字节，但文件只有 " + std::to_string(fileSize) + " 字节");
-        }
-        data_fs.seekg(0);
-
-        // 创建tensor
+ 
         shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape));
-        data_fs.read(reinterpret_cast<char *>(tensor->data), fileSize);
-        data_fs.close();
+        tensor->loader(path,tensor->data,tensor->shape.size);
         return std::make_pair(tensor_name, tensor);
     };
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
index b1bc509e..ca177a4a 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
@@ -1,6 +1,7 @@
 #ifndef DEEPX_TENSORFUNC_TENSORLIFE_MIAOBYTE_HPP
 #define DEEPX_TENSORFUNC_TENSORLIFE_MIAOBYTE_HPP
 
+#include "stdutil/fs.hpp"
 #include "deepx/tensorfunc/tensorlife.hpp"
 #include "deepx/tensorfunc/authors.hpp"
 #include "deepx/tensor.hpp"
@@ -18,17 +19,33 @@ namespace deepx::tensorfunc
     }
 
     template <typename T>
-    static void dataFree(T *data)
+    static void freeFn(T *data)
     {
         MemoryPool::Free(data);
     }
 
     template <typename T>
-    static void dataCopy(T *data, T *data2, int size)
+    static void copyFn(T *data, T *data2, int size)
     {
         std::copy(data, data + size, data2);
     }
 
+    template <typename T>
+    static void saveFn(T *data, size_t size, const std::string &path)
+    {   
+        unsigned char *udata = reinterpret_cast<unsigned char *>(data);
+        size_t udatasize = size * sizeof(T);
+        stdutil::save(udata,udatasize,path);
+    }
+    
+
+    template <typename T>
+    static void loadFn(const std::string &path, T *data, int size)
+    {
+        unsigned char *udata = reinterpret_cast<unsigned char *>(data);
+        size_t udatasize = size * sizeof(T);
+        stdutil::load(path,udata,udatasize);
+    }
     // New
     template <typename T>
     Tensor<T> New(const std::vector<int> &shapedata)
@@ -37,9 +54,13 @@ namespace deepx::tensorfunc
         shape.dtype = precision<T>();
 
         Tensor<T> tensor(shape);
-        tensor.deleter = dataFree<T>;
-        tensor.copyer = dataCopy<T>;
+        tensor.deleter = freeFn<T>;
+        tensor.copyer = copyFn<T>;
         tensor.newer = newFn<T>;
+        tensor.saver = saveFn<T>;
+        tensor.loader = loadFn<T>;
+
+
         tensor.data = newFn<T>(shape.size);
         return tensor;
     };
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
index eb540f32..e964bc87 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
@@ -78,17 +78,56 @@ namespace deepx::tf
         {
             string name = this->args[0].textvalue;
             string path = this->args[1].textvalue;
-            if (mem->existstensor(name))
-            {
-                auto t = mem->gettensor(name);
-                tensorfunc::save<void>(*t, path);
-            }
-            else
+            if (!mem->existstensor(name))
             {
+   
                 std::cerr << "save " << name << " not found" << std::endl;
                 error = "save " + name + " not found";
                 return 1;
             }
+            Precision dtype = mem->gettensor(name)->shape.dtype;
+            tensorfunc::saveShape(mem->gettensor(name)->shape,path);
+            switch (dtype)
+            {   
+            case Precision::Float64:{
+                auto t = mem->gettensor<double>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Float32:{
+                auto t = mem->gettensor<float>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+
+            case Precision::Int64:{
+                auto t = mem->gettensor<int64_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;  
+            }
+            case Precision::Int32:{
+                auto t = mem->gettensor<int32_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Int16:{
+                auto t = mem->gettensor<int16_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;  
+            }
+            case Precision::Int8:{
+                auto t = mem->gettensor<int8_t>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;
+            }
+            case Precision::Bool:{
+                auto t = mem->gettensor<bool>(name);
+                t->saver(t->data,t->shape.size,path);
+                break;  
+            }
+            default:
+                break;
+            }
             return 0;
         }
     };
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp
index 8d4d4f23..ba97ad7e 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp
@@ -137,52 +137,66 @@ namespace deepx::tf
  
          int run(shared_ptr<MemBase> mem, string &error) override
         {
-            if (!checktensors({this->args[0].textvalue, this->args[1].textvalue}, mem, error) != 0)
+            if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0)
             {
                 return 1;
             }
-            Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
-            if (input_type != type)
+            Precision src_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision dst_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (src_type != dst_type)
             {
                 error = "copytensor: input type and return type must be the same";
                 return 1;
             }
-            switch (input_type)
+            switch (src_type)
             {
             case Precision::Float64:
             {
-                tensorfunc::copy(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue));
+                auto src = mem->gettensor<double>(this->args[0].textvalue);
+                auto dst = mem->gettensor<double>(this->returns[0].textvalue);
+                dst->copyer(src->data,dst->data,src->shape.size);
                 break;
             }
             case Precision::Float32:
             {
-                tensorfunc::copy(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue));
+                auto src = mem->gettensor<float>(this->args[0].textvalue);
+                auto dst = mem->gettensor<float>(this->returns[0].textvalue);
+                dst->copyer(src->data,dst->data,src->shape.size);
                 break;
             }
             case Precision::Int64:
             {
-                tensorfunc::copy(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue));
+                auto src = mem->gettensor<int64_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int64_t>(this->returns[0].textvalue);
+                dst->copyer(src->data,dst->data,src->shape.size);
                 break;
             }
             case Precision::Int32:
             {
-                tensorfunc::copy(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue));
+                auto src = mem->gettensor<int32_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int32_t>(this->returns[0].textvalue);
+                dst->copyer(src->data,dst->data,src->shape.size);
                 break;
             }
             case Precision::Int16:
             {
-                tensorfunc::copy(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue));
+                auto src = mem->gettensor<int16_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int16_t>(this->returns[0].textvalue);
+                dst->copyer(src->data,dst->data,src->shape.size);
                 break;
             }
             case Precision::Int8:
             {
-                tensorfunc::copy(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue));
+                auto src = mem->gettensor<int8_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int8_t>(this->returns[0].textvalue);
+                dst->copyer(src->data,dst->data,src->shape.size);
                 break;
             }
             case Precision::Bool:
             {
-                tensorfunc::copy(*mem->gettensor<bool>(this->args[0].textvalue), *mem->gettensor<bool>(this->args[1].textvalue));
+                auto src = mem->gettensor<bool>(this->args[0].textvalue);
+                auto dst = mem->gettensor<bool>(this->returns[0].textvalue);
+                dst->copyer(src->data,dst->data,src->shape.size);
                 break;
             }
             default:
@@ -196,7 +210,7 @@ namespace deepx::tf
 
         string math_formula() const override
         {
-            return "T2.data = T1.data";
+            return "T1.data->T2.data";
         }
         shared_ptr<TF> clone() const override
         {
@@ -216,14 +230,14 @@ namespace deepx::tf
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string name = this->args[0].textvalue;
+            string name = this->returns[0].textvalue;
             mem->delete_tensor(name);
             return 0;
         }
 
         string math_formula() const override
         {
-            return "del T1";
+            return "del->T1";
         }
         shared_ptr<TF> clone() const override
         {
@@ -244,18 +258,18 @@ namespace deepx::tf
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
-            string old_name = this->args[0].textvalue;
-            if (!checktensors({this->args[0].textvalue}, mem, error) != 0)
+            string old_name = this->returns[0].textvalue;
+            if (!checktensors({old_name}, mem, error) != 0)
             {
                 return 1;
             }
-            string new_name = this->args[1].textvalue;
+            string new_name = this->args[0].textvalue;
             mem->rename_tensor(old_name, new_name);
             return 0;
         }
         string math_formula() const override
         {
-            return "rename T1 to T2";
+            return "rename(newname)->T1";
         }
         shared_ptr<TF> clone() const override
         {
diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
index 93825af3..4a17d8ca 100644
--- a/front/py/deepx/nn/functional/__init__.py
+++ b/front/py/deepx/nn/functional/__init__.py
@@ -18,7 +18,7 @@
 __all__ = [
 
     #leaffunc
-    "newtensor","printtensor","load", #life
+    "newtensor","rnewtensor","printtensor","load", #life
     "printtensor","save",#io
     "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out",
     "add","sub","mul","div","sqrt","pow","exp","log",
diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py
index 9797eea9..77dbd9f7 100644
--- a/front/py/deepx/nn/functional/leaffunc_io.py
+++ b/front/py/deepx/nn/functional/leaffunc_io.py
@@ -10,3 +10,7 @@ def save(t:Tensor,path:str):
     from .rtf_io import rtf_save
     rtf_save(t,path)
     return t
+ 
+def loadData(t:Tensor,path:str)->Tensor:
+    from .rtf_io import rtf_loadtensordata
+    return rtf_loadtensordata(t,path)
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/rtf_init.py b/front/py/deepx/nn/functional/rtf_init.py
index bc46205c..2fbcde8e 100644
--- a/front/py/deepx/nn/functional/rtf_init.py
+++ b/front/py/deepx/nn/functional/rtf_init.py
@@ -6,26 +6,29 @@
 
 
 def rtf_constant(t:Tensor,value:Union[float,int]=0,author='miaobyte')->Tensor:
-    A_scalar_op("constant",t,value,author)
+    args=[Param.varnum(value)]
+    returns=[Param.tensor(t)]
+    ir=DeepxIR("constant", args, returns,author)
+    send(ir)
     return t
   
 def rtf_arange(t:Tensor,start:Optional[Union[float,int]]=0,step:Optional[Union[float,int]]=1,author='miaobyte')->Tensor:
-    args=[Param.tensor(t),Param.varnum(start),Param.varnum(step)]
-    returns=[]
+    args=[Param.varnum(start),Param.varnum(step)]
+    returns=[Param.tensor(t)]
     ir=DeepxIR("arange", args, returns,author)
     send(ir)
     return t
  
 def rtf_uniform(t:Tensor,low=0, high=1,seed:int=0,author='miaobyte')->Tensor:
-    args=[Param.tensor(t),Param.varnum(low),Param.varnum(high),Param.varnum(seed)]
-    returns=[]
+    args=[Param.varnum(low),Param.varnum(high),Param.varnum(seed)]
+    returns=[Param.tensor(t)]
     ir=DeepxIR("uniform", args, returns,author)
     send(ir)
     return t
 
 def rtf_normal(t:Tensor,mean:float=0, stddev:float=1,seed:int=0,author='miaobyte')->Tensor:
-    args=[Param.tensor(t),Param.varnum(mean),Param.varnum(stddev),Param.varnum(seed)]
-    returns=[]
+    args=[Param.varnum(mean),Param.varnum(stddev),Param.varnum(seed)]
+    returns=[Param.tensor(t)]
     ir=DeepxIR("normal", args, returns,author)
     send(ir)
     return t
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/rtf_io.py b/front/py/deepx/nn/functional/rtf_io.py
index 918f63de..f5066a7a 100644
--- a/front/py/deepx/nn/functional/rtf_io.py
+++ b/front/py/deepx/nn/functional/rtf_io.py
@@ -24,3 +24,10 @@ def rtf_load(path:str)->Tensor:
     shapefile=path+'.shape'
     tensor_name,shape,dtype=loadShape(shapefile)
     return Tensor(shape.shape,dtype,tensor_name)
+
+def rtf_loadtensordata(t:Tensor,path:str)->Tensor:
+    args=[Param.varstr(path)]
+    returns=[Param.tensor(t)]
+    ir=DeepxIR("loadtensordata", args, returns)
+    send(ir)
+    return t
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/rtf_life.py b/front/py/deepx/nn/functional/rtf_life.py
index 014cd505..21547b99 100644
--- a/front/py/deepx/nn/functional/rtf_life.py
+++ b/front/py/deepx/nn/functional/rtf_life.py
@@ -10,19 +10,19 @@ def rtf_newtensor(t:Tensor):
 
 
 def rtf_copytensor(t:Tensor,out:Tensor):
-    args=[Param.tensor(t),Param.tensor(out)]
-    returns=[]
+    args=[Param.tensor(t)]
+    returns=[Param.tensor(out)]
     ir=DeepxIR("copytensor", args, returns,'')
     send(ir)
 
 def rtf_deltensor(t:Tensor):
-    args=[Param.tensor(t)]
-    returns=[]
+    args=[]
+    returns=[Param.tensor(t)]
     ir=DeepxIR("deltensor", args, returns,'')
     send(ir)
 
 def rtf_renametensor(t:Tensor,new_name:str):
-    args=[Param.tensor(t),Param.varstr(new_name)]
-    returns=[]
+    args=[Param.varstr(new_name)]
+    returns=[Param.tensor(t)]
     ir=DeepxIR("renametensor", args, returns,'')
     send(ir)
diff --git a/front/py/deepx/tensor/io.py b/front/py/deepx/tensor/io.py
index 35e3b0f7..45e14a1f 100644
--- a/front/py/deepx/tensor/io.py
+++ b/front/py/deepx/tensor/io.py
@@ -1,6 +1,6 @@
 import yaml
 import os
-from deepx.tensor import Shape
+from deepx.tensor import Shape,Tensor,tensor_method
 
 def loadShape(path:str)->tuple[str,Shape,str]:
     filename = os.path.basename(path)
@@ -12,11 +12,14 @@ def loadShape(path:str)->tuple[str,Shape,str]:
  
     tensor_name = filename[:-6]  # 移除'.shape'后缀
     return (tensor_name,Shape(tuple(shape['shape'])),shape['dtype'])
+@tensor_method
+def loadData(self,path:str):
+    from deepx.nn.functional import loadData as loadData_func
+    loadData_func(self,path)
+    
+@tensor_method
+def save(self,path:str):
+    from deepx.nn.functional import save  as save_func
+    save_func(self,path)
 
-def saveShape(t:Shape,path:str):
-    if path.endswith('.shape'):
-        with open(path, 'w') as f:
-            yaml.dump({'shape': list(t.shape), 'dtype': t._dtype,'size':t.numel(),'dim':t.ndim,'stride':list(t.stride)}, f)
-    else:
-        raise ValueError("文件名必须以.shape结尾")
-
+ 
diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py
index 4ce87937..1098586d 100644
--- a/front/py/deepx/tensor/shape.py
+++ b/front/py/deepx/tensor/shape.py
@@ -179,4 +179,11 @@ def reduceshape(cls,shape:tuple[int,...],dim:tuple[int,...],keepdim:bool)->tuple
     @classmethod
     def indexselectshape(cls,input_shape:tuple[int,...],index_shape:tuple[int,...],gatheraxis:int)->tuple[int,...]:
         return input_shape[:gatheraxis]+index_shape+input_shape[gatheraxis+1:]
-    
\ No newline at end of file
+
+    def save(self,path:str):
+        if path.endswith('.shape'):
+            import yaml
+            with open(path, 'w') as f:
+                yaml.dump({'shape': list(self.shape), 'dtype': self._dtype,'size':self.numel(),'dim':self.ndim,'stride':list(self.stride)}, f)
+        else:
+            raise ValueError("文件名必须以.shape结尾")
\ No newline at end of file
diff --git a/front/py/examples/1_tensor/1_clone.py b/front/py/examples/1_tensor/1_clone.py
index 78654fb5..e0b2fcd4 100644
--- a/front/py/examples/1_tensor/1_clone.py
+++ b/front/py/examples/1_tensor/1_clone.py
@@ -1,9 +1,11 @@
-from deepx.tensor import Tensor
+
+from deepx  import Tensor,newtensor,rnewtensor
 
 def clonetest():
     t1=Tensor(shape=(1,2,3),dtype='float32',name='t1')
+    rnewtensor(t1)
     t2=t1.clone(name='t2')
-    print(t2)
+    t2.print()
 
 if __name__ == "__main__":
     clonetest()
\ No newline at end of file
diff --git a/front/py/examples/1_tensor/1_copy.py b/front/py/examples/1_tensor/1_copy.py
index 2afeddcb..244d4bfa 100644
--- a/front/py/examples/1_tensor/1_copy.py
+++ b/front/py/examples/1_tensor/1_copy.py
@@ -4,8 +4,9 @@ def copytest():
     from deepx.nn.functional import newtensor
     t1= newtensor(1, 2, 3,name='t1')
     t2= newtensor(1, 2, 3,name='t2')
+    t1.print()
     t1.copy_to(t2)
-    print(t2)
+    t2.print()
 
 
 if __name__ == "__main__":
diff --git a/front/py/examples/1_tensor/1_print.py b/front/py/examples/1_tensor/1_print.py
index 1bfebe9c..7a5a205e 100644
--- a/front/py/examples/1_tensor/1_print.py
+++ b/front/py/examples/1_tensor/1_print.py
@@ -7,7 +7,7 @@ def newtensor():
 
    from deepx.nn.functional import newtensor
    t=newtensor(1,2,3,name='t')
-   print(t)
+   t.print()
 
 if __name__ == "__main__":
    newtensor()
diff --git a/front/py/examples/1_tensor/2_saveload.py b/front/py/examples/1_tensor/2_saveload.py
index 0b88d544..ec2225d7 100644
--- a/front/py/examples/1_tensor/2_saveload.py
+++ b/front/py/examples/1_tensor/2_saveload.py
@@ -1,21 +1,22 @@
-from deepx.tensor import Tensor
 from deepx.nn.functional import arange,save,load
 
 def saveloadfloat32():
-    t1=arange(start=0,end=60 ,dtype='float32',name='t1').reshape_(3,4,5)
+    t1=arange(start=0,end=60 ,dtype='float32').reshape_((3,4,5))
     dir='/home/lipeng/model/deepxmodel/tester/'
+    t1.save(dir+'t1')
 
-    t2=load(dir+t1.name)
+    t2=load(dir+'t1')
     t2.print()
 
 def saveloadint8():
-    t=arange(start=0,end=60 ,dtype='int8',name='t.int8').reshape_(3,4,5)
+    t=arange(start=0,end=60 ,dtype='int8').reshape_((3,4,5))
     dir='/home/lipeng/model/deepxmodel/tester/'
+    t.save(dir+'tint8')
 
-    t2=load(dir+t.name)
+    t2=load(dir+"tint8")
     t2.print()
 
 
 if __name__ == "__main__":
-    saveloadfloat32()
+    #saveloadfloat32()
     saveloadint8()
\ No newline at end of file

From 21a1dab7679a2a0e49af465fa85163a22c17e4ce Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 25 Apr 2025 01:11:11 +0800
Subject: [PATCH 4/7] save,load,loadtensordata,loadshape

---
 excuter/cpp-common/src/stdutil/fs.cpp         |  1 +
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |  4 ++--
 excuter/op-mem-cuda/src/deepx/tf/io.hpp       |  2 ++
 .../op-mem-cuda/src/deepx/tf/tensorlife.hpp   | 21 +++++++++++++++++--
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |  2 +-
 excuter/op-mem-ompsimd/src/deepx/tf/io.hpp    |  1 +
 front/py/deepxutil/numpy/io.py                |  4 ++--
 front/py/examples/2_ir/2_elementwise_add.py   |  4 ++--
 .../examples/2_ir/2_elementwise_operator.py   |  8 +++----
 front/py/examples/2_ir/3_matmul.py            |  8 +++----
 10 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/excuter/cpp-common/src/stdutil/fs.cpp b/excuter/cpp-common/src/stdutil/fs.cpp
index 7f7a57fe..6b30d551 100644
--- a/excuter/cpp-common/src/stdutil/fs.cpp
+++ b/excuter/cpp-common/src/stdutil/fs.cpp
@@ -29,6 +29,7 @@ namespace stdutil
         {
             throw std::runtime_error("Failed to open file: " + path);
         }
+
         ifs.seekg(0, ios::end);
         size_t size = ifs.tellg();
         ifs.seekg(0, ios::beg);
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
index 4d0a096f..0967736f 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -91,9 +91,9 @@ namespace deepx::tensorfunc
             throw std::runtime_error("调用load<" + precision_str(shape.dtype) + "> 不匹配: 需要 " + precision_str(shape.dtype) +
                                      " 类型，但文件为" + precision_str(precision<T>()) + " 类型");
         }
- 
+
         shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape));
-        tensor->loader(path,tensor->data,tensor->shape.size);
+        tensor->loader(path+".data",tensor->data,tensor->shape.size);
         return std::make_pair(tensor_name, tensor);
     };
 
diff --git a/excuter/op-mem-cuda/src/deepx/tf/io.hpp b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
index cb0a52b9..81d739f2 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
@@ -86,6 +86,8 @@ namespace deepx::tf
                 return 1;
             }
             Precision dtype = mem->gettensor(name)->shape.dtype;
+            tensorfunc::saveShape(mem->gettensor(name)->shape,path);
+            path+=".data";
             switch (dtype)
             {   
             case Precision::Float64:{
diff --git a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
index d21cb60f..0db28933 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
@@ -156,52 +156,69 @@ namespace deepx::tf
                 error = "copytensor: input type and return type must be the same";
                 return 1;
             }
-            auto src = mem->gettensor(this->args[0].textvalue);
-            auto dst = mem->gettensor(this->returns[0].textvalue);
+
             switch (type)
             {
             case Precision::Float64:
             {
+                auto src = mem->gettensor<double>(this->args[0].textvalue);
+                auto dst = mem->gettensor<double>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Float32:
             {
+                auto src = mem->gettensor<float>(this->args[0].textvalue);
+                auto dst = mem->gettensor<float>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Float16:
             {
+                auto src = mem->gettensor<half>(this->args[0].textvalue);
+                auto dst = mem->gettensor<half>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::BFloat16:
             {
+                auto src = mem->gettensor<nv_bfloat16>(this->args[0].textvalue);
+                auto dst = mem->gettensor<nv_bfloat16>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int64:
             {
+                auto src = mem->gettensor<int64_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int64_t>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int32:
             {
+                auto src = mem->gettensor<int32_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int32_t>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int16:
             {
+                auto src = mem->gettensor<int16_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int16_t>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Int8:
             {
+                auto src = mem->gettensor<int8_t>(this->args[0].textvalue);
+                auto dst = mem->gettensor<int8_t>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
             case Precision::Bool:
             {
+                auto src = mem->gettensor<bool>(this->args[0].textvalue);
+                auto dst = mem->gettensor<bool>(this->returns[0].textvalue);
                 dst->copyer(src->data, dst->data, src->shape.size);
                 break;
             }
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
index 784fe767..d7c26ccc 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -55,7 +55,7 @@ namespace deepx::tensorfunc
         }
  
         shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape));
-        tensor->loader(path,tensor->data,tensor->shape.size);
+        tensor->loader(path+".data",tensor->data,tensor->shape.size);
         return std::make_pair(tensor_name, tensor);
     };
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
index e964bc87..d0d9ae5d 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
@@ -87,6 +87,7 @@ namespace deepx::tf
             }
             Precision dtype = mem->gettensor(name)->shape.dtype;
             tensorfunc::saveShape(mem->gettensor(name)->shape,path);
+            path+=".data";
             switch (dtype)
             {   
             case Precision::Float64:{
diff --git a/front/py/deepxutil/numpy/io.py b/front/py/deepxutil/numpy/io.py
index ae95a17a..4af9a79d 100644
--- a/front/py/deepxutil/numpy/io.py
+++ b/front/py/deepxutil/numpy/io.py
@@ -1,4 +1,4 @@
-from deepx.tensor import Shape,saveShape
+from deepx.tensor import Shape
  
 def save_numpy(t,tensorpath:str):
     r'''
@@ -10,7 +10,7 @@ def save_numpy(t,tensorpath:str):
     assert isinstance(t,ndarray)
     shape=Shape(t.shape)
     shape._dtype=str(t.dtype)
-    saveShape(shape,tensorpath+".shape")
+    shape.save(tensorpath+".shape")
 
     array = ascontiguousarray(t)
     array.tofile(tensorpath+'.data')
diff --git a/front/py/examples/2_ir/2_elementwise_add.py b/front/py/examples/2_ir/2_elementwise_add.py
index ceba8d73..dca1fa93 100644
--- a/front/py/examples/2_ir/2_elementwise_add.py
+++ b/front/py/examples/2_ir/2_elementwise_add.py
@@ -10,7 +10,7 @@
 
 ############-------DEEPX-------################
 
-from deepx import Tensor,full
+from deepx import  full
 
 print()
 
@@ -18,4 +18,4 @@
 t2 = t1.clone()
 t3 = t1+t2
 t3.add_(0.5)
-print(t3)
\ No newline at end of file
+t3.print()
\ No newline at end of file
diff --git a/front/py/examples/2_ir/2_elementwise_operator.py b/front/py/examples/2_ir/2_elementwise_operator.py
index c5870c79..3d053781 100644
--- a/front/py/examples/2_ir/2_elementwise_operator.py
+++ b/front/py/examples/2_ir/2_elementwise_operator.py
@@ -25,14 +25,14 @@
 t3 = t1.add(t2,out='t3')
 t4=deepx.full([3,4,5],value=0.5,name='t4')
 t5=t4.add(t3,out='t5')
-print(t5)
+t5.print()
 t6=t1.div(t2,out='t6')
-print(t6)
+t6.print()
 t7=t2.rdiv(0.05,out='t7')
 t7.mul_(2.5)
-print(t7)
+t7.print()
 t8=t7.mul(t2,out='t8')
-print(t8)
+t8.print()
 
 
 
diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
index 3c22593f..205b27c0 100644
--- a/front/py/examples/2_ir/3_matmul.py
+++ b/front/py/examples/2_ir/3_matmul.py
@@ -1,13 +1,13 @@
 benchcnt=100
 
-from deepx.nn.functional import save_npy
+from deepxutil.numpy  import save_numpy
 import numpy as np
 np_T1 = np.random.randn(1024, 1024).astype(np.float32)
 np_T2 = np.random.randn(1024, 1024).astype(np.float32)
 
-npy_path = '/home/lipeng/model/deepxmodel/tester/'
-save_npy(np_T1,npy_path+'t1')
-save_npy(np_T2,npy_path+'t2')
+npy_path = '/home/lipeng/model/deepxmodel/matmul/'
+save_numpy(np_T1,npy_path+'t1')
+save_numpy(np_T2,npy_path+'t2')
 
 ############-------PyTorch-------################
 

From f15f6cc7d0682524677da6ff72cfd9cebf988aee Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 25 Apr 2025 01:33:39 +0800
Subject: [PATCH 5/7] =?UTF-8?q?reduce:=E7=B3=BB=E5=88=97=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E7=AA=81=E7=84=B6=E5=87=BA=E9=97=AE=E9=A2=98=EF=BC=8C=E5=BE=85?=
 =?UTF-8?q?=E8=A7=A3=E5=86=B3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 front/py/deepx/nn/functional/leaffunc_init.py | 37 ++++++++-----------
 front/py/deepx/scheduler/client/udpconn.py    |  2 +-
 .../examples/2_ir/4_changeshape_broadcast.py  |  6 +--
 .../py/examples/2_ir/4_changeshape_concat.py  |  2 +-
 .../py/examples/2_ir/4_changeshape_gather.py  | 21 ++++++-----
 .../py/examples/2_ir/4_changeshape_reshape.py | 12 +++---
 .../examples/2_ir/4_changeshape_transpose.py  | 16 ++++----
 .../py/examples/2_ir/5_reduce_sum_keepdim.py  | 32 ++++++++--------
 front/py/examples/2_ir/5_reduce_sumprod.py    | 29 ++++++---------
 9 files changed, 73 insertions(+), 84 deletions(-)

diff --git a/front/py/deepx/nn/functional/leaffunc_init.py b/front/py/deepx/nn/functional/leaffunc_init.py
index 48160e3d..7711e2be 100644
--- a/front/py/deepx/nn/functional/leaffunc_init.py
+++ b/front/py/deepx/nn/functional/leaffunc_init.py
@@ -1,7 +1,7 @@
 import math
 import time
 import os
-from .leaffunc_life import newtensor,parse_shape
+from .leaffunc_life import newtensor
 from .rtf_init import *
 from deepx import Tensor,Number
 from .authormap import defaultauthor
@@ -12,25 +12,20 @@
 
 def constant_(t:Tensor,value: Union[float,int])->Tensor:
     rtf_constant(t,value,defaultauthor['constant'])
- 
-
-def constant(*shape, value:Union[float,int], dtype:str='float32',name:str)->Tensor:
-    s = parse_shape(shape)
-    outtensor=newtensor(s,dtype=dtype,name=name)
+    
+def constant(shape:tuple[int,...], value:Union[float,int], dtype:str='float32',name:str=None)->Tensor:
+    outtensor=newtensor(shape,dtype=dtype,name=name)
     constant_(outtensor, value)
     return outtensor
 
-def full(*shape, value:Union[float,int], dtype:str='float32',name:str=None)->Tensor:
-    s = parse_shape(shape)
-    return constant(s, value=value, dtype=dtype,name=name)
+def full(shape:tuple[int,...], value:Union[float,int], dtype:str='float32',name:str=None)->Tensor:
+    return constant(shape, value=value, dtype=dtype,name=name)
 
-def zeros(*shape, dtype:str='float32',name:str=None)->Tensor:
-    s = parse_shape(shape)
-    return constant(s, value=0, dtype=dtype,name=name)
+def zeros(shape:tuple[int,...], dtype:str='float32',name:str=None)->Tensor:
+    return constant(shape, value=0, dtype=dtype,name=name)
 
-def ones(*shape, dtype:str='float32',name:str=None)->Tensor:
-    s = parse_shape(shape)
-    return constant(s, value=1, dtype=dtype,name=name)
+def ones(shape:tuple[int,...], dtype:str='float32',name:str=None)->Tensor:
+    return constant(shape, value=1, dtype=dtype,name=name)
  
 def arange_(t:Tensor,start=0,step=1)->Tensor:
     from .rtf_init import rtf_arange
@@ -49,9 +44,8 @@ def uniform_(t:Tensor,low=0, high=1,seed:int=None)->Tensor:
     from .rtf_init import rtf_uniform
     rtf_uniform(t,low,high,seed,defaultauthor['uniform'])
 
-def uniform(*shape,low=0, high=1,seed:int=None,dtype:str='float32',name:str=None)->Tensor:
-    s = parse_shape(shape)
-    outtensor=newtensor(s,dtype=dtype,name=name)
+def uniform(shape:tuple[int,...],low=0, high=1,seed:int=None,dtype:str='float32',name:str=None)->Tensor:
+    outtensor=newtensor(shape,dtype=dtype,name=name)
     uniform_(outtensor,low,high,seed)
     return outtensor
 
@@ -166,7 +160,7 @@ def kaiming_uniform_(
     bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
     return  uniform_(tensor,-bound, bound)
 
-def kaiming_uniform(*shape,a:float=0,mode:str='fan_in',nonlinearity:str='leaky_relu',dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
+def kaiming_uniform(shape:tuple[int,...],a:float=0,mode:str='fan_in',nonlinearity:str='leaky_relu',dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
     outtensor=newtensor(shape,dtype=dtype,name=name)
     kaiming_uniform_(outtensor,a,mode,nonlinearity)
     return outtensor
@@ -178,8 +172,7 @@ def normal_(t:Tensor,mean:float=0, stddev:float=1,seed:int=None)->Tensor:
     from .rtf_init import rtf_normal
     rtf_normal(t,mean,stddev,seed,defaultauthor['normal'])
 
-def normal(*shape,mean:float=0, stddev:float=1,seed:int=None,dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
-    s = parse_shape(shape)
-    outtensor=newtensor(s,dtype=dtype,name=name)
+def normal(shape:tuple[int,...],mean:float=0, stddev:float=1,seed:int=None,dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
+    outtensor=newtensor(shape,dtype=dtype,name=name)
     normal_(outtensor,mean,stddev,seed)
     return outtensor
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index 6a12c26a..a25b0963 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@
 import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:9090"):
+    def __init__(self, endpoint: str = "localhost:8080"):
         # 解析endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
diff --git a/front/py/examples/2_ir/4_changeshape_broadcast.py b/front/py/examples/2_ir/4_changeshape_broadcast.py
index 9811201e..84199680 100644
--- a/front/py/examples/2_ir/4_changeshape_broadcast.py
+++ b/front/py/examples/2_ir/4_changeshape_broadcast.py
@@ -10,10 +10,10 @@
 ########====DEEPX====########
 from deepx import Tensor,arange,broadcastTo
 
-a=arange(4,2,3,name="a")
-b=arange(2,1,name='b')
+a=arange(start=0,end=4*2*3,name="a").reshape_((4,2,3))
+b=arange(start=0,end=2,name='b').reshape((2,1))
 bb=b.broadcastTo( a.shape,out="b.broadcasted")
-print(bb)
+bb.print()
 
 
  
diff --git a/front/py/examples/2_ir/4_changeshape_concat.py b/front/py/examples/2_ir/4_changeshape_concat.py
index 24a8c9a4..cda373fb 100644
--- a/front/py/examples/2_ir/4_changeshape_concat.py
+++ b/front/py/examples/2_ir/4_changeshape_concat.py
@@ -20,4 +20,4 @@
 t3=ones([3,4,5],dtype='float32',name='t3')
  
 t=concat([t1,t2,t3],dim=1,out='t')
-print(t)
+t.print()
diff --git a/front/py/examples/2_ir/4_changeshape_gather.py b/front/py/examples/2_ir/4_changeshape_gather.py
index e05013bd..df12e5f2 100644
--- a/front/py/examples/2_ir/4_changeshape_gather.py
+++ b/front/py/examples/2_ir/4_changeshape_gather.py
@@ -8,21 +8,22 @@
 import torch
 torch_t = torch.arange(10*5, dtype=torch.float32).reshape(10,5)
 torch_indices = torch.tensor(indices_np)
-torch_t = torch.gather(torch_t, 1,torch_indices)
-print(torch_t.shape)
-print(torch_t)
+torch_t2 = torch.index_select(torch_t, 1,torch_indices)
+print(torch_t2.shape)
+print(torch_t2)
 
 
 ############-------DEEPX-------################
 
-from deepx import Tensor,arange,Shape
-from deepx.nn.functional import load,save_npy
+from deepx import Tensor,arange,Shape,load
+from deepxutil.numpy import save_numpy
 
- 
-save_npy(indices_np,'/home/lipeng/model/deepxmodel/tester/testindices')
+save_numpy(indices_np,'/home/lipeng/model/deepxmodel/tester/testindices')
 
-t = arange(start=0,end=10*5,dtype='float32',name='t').reshape(10,5)
+t = arange(start=0,end=10*5,dtype='float32',name='t').reshape_((10,5))
 indices = load('/home/lipeng/model/deepxmodel/tester/testindices')
 indices.print()
-t = t.gather(indices,dim=1)
-t.print()
\ No newline at end of file
+t2 = t.indexselect(indices,axis=1)
+t2.print()
+
+### indexselect 行为和tensorflow.gather保持一致，支持index为多维
\ No newline at end of file
diff --git a/front/py/examples/2_ir/4_changeshape_reshape.py b/front/py/examples/2_ir/4_changeshape_reshape.py
index 151dc842..a8dde5f5 100644
--- a/front/py/examples/2_ir/4_changeshape_reshape.py
+++ b/front/py/examples/2_ir/4_changeshape_reshape.py
@@ -14,10 +14,10 @@
 
 from deepx import Tensor,zeros, ones, full, arange
 print()
-t1 = ones([3,4],dtype='float32',name='t1')
-print(t1)
-t2=t1.reshape(3,2,2)
-print(t2)
+t1 = ones((3,4),dtype='float32',name='t1')
+t1.print()
+t2=t1.reshape((3,2,2))
+t2.print()
 
-t3=ones([4,5],dtype='float32').reshape_(20)
-print(t3)
+t3=ones((4,5),dtype='float32').reshape_((20,))
+t3.print()
diff --git a/front/py/examples/2_ir/4_changeshape_transpose.py b/front/py/examples/2_ir/4_changeshape_transpose.py
index 8ced7ade..bc1efd08 100644
--- a/front/py/examples/2_ir/4_changeshape_transpose.py
+++ b/front/py/examples/2_ir/4_changeshape_transpose.py
@@ -1,5 +1,5 @@
 ############-------PyTorch-------################
-
+print()
 import torch
 torch_t1 = torch.ones(3, 4, dtype=torch.float32)
 print(torch_t1)
@@ -12,15 +12,15 @@
 
 ############-------DEEPX-------################
 
-from deepx import Tensor,zeros, ones, full, arange
+from deepx import  ones
+
 
-print()
 
-t1 = ones([3,4],dtype='float32',name='t1')
-print(t1)
+t1 = ones((3,4),dtype='float32',name='t1')
+t1.print()
 t2=t1.transpose(out='t2')
-print(t2)
+t2.print()
 
-t3=ones([2,3,4],dtype='float32',name='t3')
+t3=ones((2,3,4),dtype='float32',name='t3')
 t4=t3.transpose(out='t4')
-print(t4)
+t4.print()
diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.py b/front/py/examples/2_ir/5_reduce_sum_keepdim.py
index 3a582b47..da4cf110 100644
--- a/front/py/examples/2_ir/5_reduce_sum_keepdim.py
+++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.py
@@ -1,4 +1,5 @@
 ############-------PyTorch-------################
+print()
 
 import torch
 torch_t = torch.arange(0,60).reshape(3,4,5)
@@ -19,19 +20,18 @@
 from deepx import Tensor,ones,zeros,arange
 from deepx.nn.functional import sum,prod
 
-t=arange(3,4,5,name='t')
-t.set_format("%.0f")
-print(t)
-s=sum(t,dim=[0,2],out="s",keepdim=True)
-s.set_format("%.0f")
-print(s)
-p=prod(t,dim=[1],out="p",keepdim=True)
-p.set_format("%.0f")
-print(p)
-
-t1=ones(4,5,6,name="t1")
-t1.set_format("%.0f")
-print(t1)
-t2=sum(t1,dim=[0,1],out='t2',keepdim=True)
-t2.set_format("%.0f")
-print(t2)
+t=arange(0,60,name='t').reshape_((3,4,5))
+t.print()
+s=sum(t,dim=(0,2),out="s",keepdim=True)
+
+s.print()
+p=prod(t,dim=(1,),out="p",keepdim=True)
+
+p.print()
+
+t1=ones((4,5,6),name="t1")
+
+t1.print()
+t2=sum(t1,dim=(0,1),out='t2',keepdim=True)
+
+t2.print()
diff --git a/front/py/examples/2_ir/5_reduce_sumprod.py b/front/py/examples/2_ir/5_reduce_sumprod.py
index cc4360f5..f141ca13 100644
--- a/front/py/examples/2_ir/5_reduce_sumprod.py
+++ b/front/py/examples/2_ir/5_reduce_sumprod.py
@@ -19,20 +19,15 @@
 from deepx import Tensor,ones,zeros,arange
 from deepx.nn.functional import sum,prod
 
-t=arange(3,4,5,name='t')
-t.arange_(0,1)
-t.set_format("%.0f")
-print(t)
-s=sum(t,dim=[0,2],out="s")
-s.set_format("%.0f")
-print(s)
-p=prod(t,dim=[1],out="p")
-p.set_format("%.0f")
-print(p)
-
-t1=ones(4,5,6,name="t1")
-t1.set_format("%.0f")
-print(t1)
-t2=sum(t1,dim=[0,1],out='t2')
-t2.set_format("%.0f")
-print(t2)
+t=arange(0,60,name='t').reshape_((3,4,5))
+
+t.print()
+s=sum(t,dim=(0,2),out="s")
+s.print()
+p=prod(t,dim=(1,),out="p")
+p.print()
+
+t1=ones((4,5,6),name="t1")
+t1.print()
+t2=sum(t1,dim=(0,1),out='t2')
+t2.print()

From dd61422a75098e6fd02067111947bcbc48662e9e Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sat, 26 Apr 2025 16:12:13 +0800
Subject: [PATCH 6/7] =?UTF-8?q?reduce:=E7=B3=BB=E5=88=97=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98=E6=98=AF=E7=94=B1=E4=BA=8Edim?=
 =?UTF-8?q?=E4=BB=8Eshape=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96=E6=97=B6?=
 =?UTF-8?q?=EF=BC=8Cdim=E6=9C=AC=E8=BA=AB=E6=98=AF=E9=94=99=E7=9A=84?=
 =?UTF-8?q?=E3=80=82=E5=8E=9F=E5=9B=A0=E6=98=AFreshape=E5=90=8E=EF=BC=8Cdi?=
 =?UTF-8?q?m=E6=9C=AA=E4=BF=AE=E6=94=B9=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

现在已经删除了dim这个成员，改为实时计算shape.size()
---
 doc/design.md                                 | 21 +++++++++
 .../{5_reduce_sumprod.py => 5_reduce_prod.py} |  8 ++--
 front/py/examples/2_ir/5_reduce_sum.py        | 33 ++++++++++++++
 front/py/examples/3_module/1_swiglu.py        | 43 +++++++++++++++++++
 4 files changed, 101 insertions(+), 4 deletions(-)
 create mode 100644 doc/design.md
 rename front/py/examples/2_ir/{5_reduce_sumprod.py => 5_reduce_prod.py} (86%)
 create mode 100644 front/py/examples/2_ir/5_reduce_sum.py
 create mode 100644 front/py/examples/3_module/1_swiglu.py

diff --git a/doc/design.md b/doc/design.md
new file mode 100644
index 00000000..bbc4df71
--- /dev/null
+++ b/doc/design.md
@@ -0,0 +1,21 @@
+# deepx默认原则
+
+## 一.DeepxIR
+
+### 1.deepIR结构
+```
+deepIR{
+    Meta{
+        int id
+        string author
+    } meta
+    string name 
+    []Param args
+    []Param returns
+}
+```
+
+excuter执行deepxIR的规则
+
++ excuter执行DeepxIR时，不得修改args中的tensor
++ 但DeepxIR不限制args和returns中的Param同名，这样可以实现类似inplace的操作
\ No newline at end of file
diff --git a/front/py/examples/2_ir/5_reduce_sumprod.py b/front/py/examples/2_ir/5_reduce_prod.py
similarity index 86%
rename from front/py/examples/2_ir/5_reduce_sumprod.py
rename to front/py/examples/2_ir/5_reduce_prod.py
index f141ca13..a6f0eb63 100644
--- a/front/py/examples/2_ir/5_reduce_sumprod.py
+++ b/front/py/examples/2_ir/5_reduce_prod.py
@@ -5,8 +5,8 @@
 print(torch_t)
 torch_s = torch.sum(torch_t, dim=[0, 2])
 print(torch_s)
-torch_p=torch.prod(torch_t,dim=1)
-print(torch_p)
+# torch_p=torch.prod(torch_t,dim=1)
+# print(torch_p)
 
 torch_t1 = torch.ones(4, 5, 6,dtype=torch.float)
 print(torch_t1)
@@ -24,8 +24,8 @@
 t.print()
 s=sum(t,dim=(0,2),out="s")
 s.print()
-p=prod(t,dim=(1,),out="p")
-p.print()
+# p=prod(t,dim=(1,),out="p")
+# p.print()
 
 t1=ones((4,5,6),name="t1")
 t1.print()
diff --git a/front/py/examples/2_ir/5_reduce_sum.py b/front/py/examples/2_ir/5_reduce_sum.py
new file mode 100644
index 00000000..a6f0eb63
--- /dev/null
+++ b/front/py/examples/2_ir/5_reduce_sum.py
@@ -0,0 +1,33 @@
+############-------PyTorch-------################
+
+import torch
+torch_t = torch.arange(0,60).reshape(3,4,5)
+print(torch_t)
+torch_s = torch.sum(torch_t, dim=[0, 2])
+print(torch_s)
+# torch_p=torch.prod(torch_t,dim=1)
+# print(torch_p)
+
+torch_t1 = torch.ones(4, 5, 6,dtype=torch.float)
+print(torch_t1)
+torch_t2 = torch.sum(torch_t1, dim=[0, 1])
+print(torch_t2)
+
+
+############-------DEEPX-------################
+
+from deepx import Tensor,ones,zeros,arange
+from deepx.nn.functional import sum,prod
+
+t=arange(0,60,name='t').reshape_((3,4,5))
+
+t.print()
+s=sum(t,dim=(0,2),out="s")
+s.print()
+# p=prod(t,dim=(1,),out="p")
+# p.print()
+
+t1=ones((4,5,6),name="t1")
+t1.print()
+t2=sum(t1,dim=(0,1),out='t2')
+t2.print()
diff --git a/front/py/examples/3_module/1_swiglu.py b/front/py/examples/3_module/1_swiglu.py
new file mode 100644
index 00000000..0e350582
--- /dev/null
+++ b/front/py/examples/3_module/1_swiglu.py
@@ -0,0 +1,43 @@
+hidden_size = 8
+eps = 1e-6
+dir='/home/lipeng/model/deepxmodel/llama/'
+
+
+
+############### PyTorch 实现部分 ###############
+import torch
+# 使用小规模数据以便打印完整结果
+pt_input = torch.arange(48, dtype=torch.float32).reshape(2, 3, hidden_size) / 10.0 - 2.0
+print("PyTorch 输入:")
+print(pt_input)
+
+from transformers.models.llama.modeling_llama import LlamaRMSNorm as TransformersLlamaRMSNorm
+from deepxutil.torch import save_torch
+save_torch(pt_input,dir+'rmsnorm_input')
+# 使用transformers库中的官方LlamaRMSNorm实现
+pt_norm = TransformersLlamaRMSNorm(hidden_size, eps=eps)
+# 设置权重为固定值0.5
+with torch.no_grad():
+    pt_norm.weight.fill_(0.5)
+# 前向传播
+pt_output = pt_norm(pt_input)
+
+
+print("\nPyTorch RMSNorm 结果:")
+print(pt_output.shape)
+print(pt_output)
+
+ 
+############### DeepX 实现部分 ###############
+from deepx import  constant_,load
+from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm
+
+input=load(dir+'rmsnorm_input')
+
+# DeepX计算流程
+norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
+# 设置相同的权重
+constant_(norm.weight, 0.5)
+# 前向计算
+output = norm(input)
+output.print()

From 66499590b42543e31e74123827d3e44f22b79485 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sat, 26 Apr 2025 16:12:21 +0800
Subject: [PATCH 7/7] =?UTF-8?q?reduce:=E7=B3=BB=E5=88=97=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98=E6=98=AF=E7=94=B1=E4=BA=8Edim?=
 =?UTF-8?q?=E4=BB=8Eshape=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96=E6=97=B6?=
 =?UTF-8?q?=EF=BC=8Cdim=E6=9C=AC=E8=BA=AB=E6=98=AF=E9=94=99=E7=9A=84?=
 =?UTF-8?q?=E3=80=82=E5=8E=9F=E5=9B=A0=E6=98=AFreshape=E5=90=8E=EF=BC=8Cdi?=
 =?UTF-8?q?m=E6=9C=AA=E4=BF=AE=E6=94=B9=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

现在已经删除了dim这个成员，改为实时计算shape.size()
---
 excuter/cpp-common/src/deepx/shape.cpp        |  51 ++++++-
 excuter/cpp-common/src/deepx/shape.hpp        |  18 ++-
 .../src/deepx/shape_changeshape.cpp           |   4 +-
 .../src/deepx/shape_changeshape.hpp           |   4 +-
 excuter/cpp-common/src/deepx/shape_matmul.cpp |   8 +-
 excuter/cpp-common/src/deepx/shape_range.cpp  |  18 +--
 excuter/cpp-common/src/deepx/shape_reduce.cpp |   8 +-
 .../cpp-common/src/deepx/shape_tensorinit.cpp |   6 +-
 excuter/cpp-common/src/deepx/shapeslice.cpp   |  34 -----
 excuter/cpp-common/src/deepx/shapeslice.hpp   |  21 ---
 excuter/cpp-common/src/deepx/tensor.hpp       |  11 ++
 .../cpp-common/src/deepx/tensorfunc/io.hpp    |  25 +---
 .../src/deepx/tensorfunc/matmul.hpp           |   4 +-
 excuter/cpp-common/src/deepx/tensorslice.hpp  |  26 ----
 excuter/cpp-common/src/stdutil/print.hpp      |   2 +-
 .../deepx/tensorfunc/changeshape_miaobyte.hpp |  20 +--
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |   2 +-
 .../src/deepx/tensorfunc/reduce_miaobyte.hpp  |  16 +-
 excuter/op-mem-cuda/src/deepx/tf/io.hpp       |   6 +-
 .../deepx/tensorfunc/changeshape_miaobyte.hpp |  30 ++--
 .../deepx/tensorfunc/elementwise_miaobyte.hpp |  48 +++---
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |   2 +-
 .../src/deepx/tensorfunc/matmul_cblas.hpp     |   4 +-
 .../src/deepx/tensorfunc/matmul_miaobyte.hpp  |   4 +-
 .../src/deepx/tensorfunc/reduce_miaobyte.hpp  |  56 +++----
 excuter/op-mem-ompsimd/src/deepx/tf/io.hpp    |   4 +-
 .../op-mem-ompsimd/src/deepx/tf/reduce.hpp    |   2 +-
 .../test/tensorfunc/1_shape.cpp               |   2 +-
 .../test/tensorfunc/2_tensor_new.cpp          |   4 +-
 .../test/tensorfunc/5_tensor_sum.cpp          |   4 +-
 .../test/tensorfunc/8_tensor_concat.cpp       |   2 +-
 .../models/llama/modeling_llama.py            | 140 ++++++++++--------
 front/py/examples/2_ir/5_reduce_sum.py        |   8 +-
 33 files changed, 283 insertions(+), 311 deletions(-)
 delete mode 100644 excuter/cpp-common/src/deepx/shapeslice.cpp
 delete mode 100644 excuter/cpp-common/src/deepx/shapeslice.hpp
 delete mode 100644 excuter/cpp-common/src/deepx/tensorslice.hpp

diff --git a/excuter/cpp-common/src/deepx/shape.cpp b/excuter/cpp-common/src/deepx/shape.cpp
index fa207e98..9f51a2e2 100644
--- a/excuter/cpp-common/src/deepx/shape.cpp
+++ b/excuter/cpp-common/src/deepx/shape.cpp
@@ -8,18 +8,19 @@
 #include "deepx/dtype.hpp"
 namespace deepx
 {
-
     Shape::Shape(const int *shape, int dim)
     {
         setshape(shape, dim);
     }
+    // Number of dimensions, i.e. the length of the shape vector.
+    int Shape::dim() const
+    { return static_cast<int>(shape.size()); }
     int64_t Shape::bytes() const{
         return size * (precision_bits(dtype) / 8);
     }
     void Shape::setshape(const int *shape, int dim)
     {
         this->shape.resize(dim);
-        this->dim = dim;
         std::copy(shape, shape + dim, this->shape.begin());
         strides.resize(dim);
         strides[dim - 1] = 1;
@@ -57,10 +58,10 @@ namespace deepx
     void Shape::print() const
     {
         std::cout << "shape:[";
-        for (int i = 0; i < dim; ++i)
+        for (int i = 0; i < dim(); ++i)
         {
             std::cout << shape[i];
-            if (i < dim - 1)
+            if (i < dim() - 1)
                 std::cout << ", ";
         }
         std::cout << "]" << std::endl;
@@ -73,8 +74,8 @@ namespace deepx
         return idx;
     }
     std::vector<int> Shape::linearto(int idx_linear) const{
-        std::vector<int> indices(dim,0);
-        for(int i=0;i<dim;i++){
+        std::vector<int> indices(dim(),0);
+        for(int i=0;i<dim();i++){
             indices[i]=idx_linear/strides[i];
             idx_linear%=strides[i];
         }
@@ -84,7 +85,7 @@ namespace deepx
     std::string Shape::toYaml() const{
         YAML::Node node;
         node["dtype"] = precision_str(dtype);
-        node["dim"] = dim;
+        node["dim"] = dim();
         node["shape"] = shape;
         node["stride"] = strides;
         node["size"] = size;
@@ -93,9 +94,43 @@ namespace deepx
     void Shape::fromYaml(const std::string &yaml){
         YAML::Node node = YAML::Load(yaml);
         dtype = precision(node["dtype"].as<std::string>());
-        dim = node["dim"].as<int>();
         shape = node["shape"].as<std::vector<int>>();
         strides=node["stride"].as<std::vector<int>>();
         size=node["size"].as<int>();
+        
+        // check: build a Shape from the loaded dims and require its shape/strides/size to equal the values read from the YAML
+        Shape checkedshape(shape);
+        if(checkedshape.shape!=shape){
+            throw std::runtime_error("Shape::fromYaml: shape mismatch");
+        }
+        if(checkedshape.strides!=strides){
+            throw std::runtime_error("Shape::fromYaml: strides mismatch");
+        }
+         if(checkedshape.size!=size){
+            throw std::runtime_error("Shape::fromYaml: size mismatch");
+        }
     }
+    // Write this shape's YAML form (toYaml) to "<tensorPath>.shape"; throws std::runtime_error if the file cannot be opened.
+    void Shape::saveShape(const std::string &tensorPath) const {
+        const std::string shapedata = toYaml();
+        std::ofstream shape_fs(tensorPath + ".shape", std::ios::binary);
+        if (!shape_fs.is_open()) throw std::runtime_error("Failed to open shape file: " + tensorPath + ".shape");
+        shape_fs.write(shapedata.c_str(), shapedata.size()); // the ofstream destructor flushes and closes the file
+    }
+
+    // Read "<path>.shape", parse it with fromYaml, and return {tensor name, shape}.
+    // The tensor name is stdutil::filename(path) cut at its last '.' (kept whole when it has none).
+    // Throws std::runtime_error if the shape file cannot be opened.
+    pair<std::string, Shape> Shape::loadShape(const std::string &path)
+    {
+        const std::string shape_file = path + ".shape";
+        std::ifstream in(shape_file, std::ios::binary);
+        if (!in.is_open())
+            throw std::runtime_error("Failed to open shape file: " + shape_file);
+        const std::string yaml((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
+        Shape loaded;
+        loaded.fromYaml(yaml);
+        const std::string base = stdutil::filename(path);
+        return std::make_pair(base.substr(0, base.find_last_of('.')), loaded);
+    }
 }
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/shape.hpp b/excuter/cpp-common/src/deepx/shape.hpp
index 655dce38..b314e891 100644
--- a/excuter/cpp-common/src/deepx/shape.hpp
+++ b/excuter/cpp-common/src/deepx/shape.hpp
@@ -4,11 +4,14 @@
 #include <vector>
 #include <string>
 #include <functional>
+#include <fstream>
+#include <utility>
 
+#include "stdutil/fs.hpp"
 #include "deepx/dtype.hpp"
 namespace deepx
 {
-    //omp内线程局部变量
+    // omp内线程局部变量
     class ThreadLocalVectors
     {
     private:
@@ -43,7 +46,6 @@ namespace deepx
         Precision dtype;
         std::vector<int> shape;
         std::vector<int> strides;
-        int dim;
         int64_t size;
         int64_t bytes() const;
 
@@ -52,6 +54,7 @@ namespace deepx
         Shape(const std::initializer_list<int> &shape);
         Shape(const int *shape, int dim);
         void setshape(const int *shape, int dim);
+        int dim() const;
         int operator[](int index) const;
         int &operator[](int index);
         bool operator==(const Shape &shape) const { return shape.shape == shape.shape; }
@@ -67,16 +70,19 @@ namespace deepx
         void rangeParallel(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices)> func) const;
 
         // 支持omp,但omp内需要线程local变量
-        void rangeParallel(int dimCount, std::function<void(const std::vector<int> &indices, ThreadLocalVectors &tlv)> func,const vector<int> tlv_sizes) const;
-        void rangeParallel(int dimCount, std::function<void(const int idx_linear, ThreadLocalVectors &tlv)> func,const vector<int> tlv_sizes) const;
-        void rangeParallel(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)> func,const vector<int> tlv_sizes) const;
+        void rangeParallel(int dimCount, std::function<void(const std::vector<int> &indices, ThreadLocalVectors &tlv)> func, const vector<int> tlv_sizes) const;
+        void rangeParallel(int dimCount, std::function<void(const int idx_linear, ThreadLocalVectors &tlv)> func, const vector<int> tlv_sizes) const;
+        void rangeParallel(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)> func, const vector<int> tlv_sizes) const;
         int linearat(const std::vector<int> &indices) const;
         std::vector<int> linearto(int idx_linear) const;
 
         std::string toYaml() const;
         void fromYaml(const std::string &yaml);
-    };
 
+        void saveShape(const std::string &tensorPath) const;
+
+        static pair<std::string, Shape> loadShape(const std::string &path);
+    };
 }
 
 #endif // DEEPX_SHAPE_HPP
diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.cpp b/excuter/cpp-common/src/deepx/shape_changeshape.cpp
index f3a60bd1..d3a89d90 100644
--- a/excuter/cpp-common/src/deepx/shape_changeshape.cpp
+++ b/excuter/cpp-common/src/deepx/shape_changeshape.cpp
@@ -32,11 +32,11 @@ namespace deepx
 
     Shape concatShape(const std::vector<Shape> &shapes, const int axis)
     {
-        std::vector<int> outputShape(shapes[0].dim);
+        std::vector<int> outputShape(shapes[0].dim());
         outputShape = shapes[0].shape;
         for (int i = 1; i < shapes.size(); ++i)
         {
-            if (shapes[i].dim != outputShape.size())
+            if (shapes[i].dim() != outputShape.size())
             {
                 throw std::invalid_argument("All tensors must have the same number of dimensions.");
             }
diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.hpp b/excuter/cpp-common/src/deepx/shape_changeshape.hpp
index 3f299885..71cbcb63 100644
--- a/excuter/cpp-common/src/deepx/shape_changeshape.hpp
+++ b/excuter/cpp-common/src/deepx/shape_changeshape.hpp
@@ -38,11 +38,11 @@ namespace deepx
         int axisDim = 0;
         for (int i = 0; i < tensors.size(); i++)
         {
-            if (tensors[i]->shape.dim != output.shape.dim)
+            if (tensors[i]->shape.dim() != output.shape.dim())
             {
                 throw TensorShapeError("All input tensors must have the same dimension size for concat");
             }
-            for (int j = 0; j < tensors[i]->shape.dim; j++)
+            for (int j = 0; j < tensors[i]->shape.dim(); j++)
             {
                 if (j != axis)
                 {
diff --git a/excuter/cpp-common/src/deepx/shape_matmul.cpp b/excuter/cpp-common/src/deepx/shape_matmul.cpp
index 3ed0d51a..46247c70 100644
--- a/excuter/cpp-common/src/deepx/shape_matmul.cpp
+++ b/excuter/cpp-common/src/deepx/shape_matmul.cpp
@@ -4,9 +4,9 @@
 
 namespace deepx
 {
-        Shape matmul_shape(const Shape &A, const Shape &B)
+    Shape matmul_shape(const Shape &A, const Shape &B)
     {
-        if (A.dim < 2 || B.dim < 2)
+        if (A.dim() < 2 || B.dim() < 2)
         {
             throw std::invalid_argument("A and B must >= 2D tensors");
         }
@@ -14,8 +14,8 @@ namespace deepx
         {
             throw std::invalid_argument("A[-1] must be equal to B[-2]");
         }
-        std::vector<int> resultshape(A.dim);
-        std::copy(A.shape.begin(), A.shape.begin() + A.dim, resultshape.begin());
+        std::vector<int> resultshape(A.dim());
+        std::copy(A.shape.begin(), A.shape.begin() + A.dim(), resultshape.begin());
         Shape result(resultshape);
         result[-1] = B[-1];
         return result;
diff --git a/excuter/cpp-common/src/deepx/shape_range.cpp b/excuter/cpp-common/src/deepx/shape_range.cpp
index 31a125e1..1f7fad54 100644
--- a/excuter/cpp-common/src/deepx/shape_range.cpp
+++ b/excuter/cpp-common/src/deepx/shape_range.cpp
@@ -42,7 +42,7 @@ namespace deepx
     }
     void Shape::range(int dimCount, std::function<void(const std::vector<int> &indices)> func) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int totalSize = checkTotalSize(dimCount, shape);
 
         std::vector<int> indices(dimCount, 0);
@@ -61,7 +61,7 @@ namespace deepx
     }
     void Shape::range(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices)> func) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int totalSize = checkTotalSize(dimCount, shape);
 
         int stride = checkStride(dimCount, shape);
@@ -82,7 +82,7 @@ namespace deepx
 
     void Shape::range(int dimCount, std::function<void(const int idx_linear)> func) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int totalSize = checkTotalSize(dimCount, shape);
         int stride = checkStride(dimCount, shape);
         for (int idx = 0; idx < totalSize; idx++)
@@ -93,7 +93,7 @@ namespace deepx
 
     void Shape::rangeParallel(int dimCount, std::function<void(const std::vector<int> &indices)> func) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int totalSize = checkTotalSize(dimCount, shape);
 
 #pragma omp parallel
@@ -115,7 +115,7 @@ namespace deepx
     }
     void Shape::rangeParallel(int dimCount, std::function<void(const int idx_linear)> func) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int stride = checkStride(dimCount, shape);
 
         // 计算总循环次数
@@ -130,7 +130,7 @@ namespace deepx
 
     void Shape::rangeParallel(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices)> func) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int totalSize = checkTotalSize(dimCount, shape);
         int stride = checkStride(dimCount, shape);
 
@@ -154,7 +154,7 @@ namespace deepx
 
     void Shape::rangeParallel(int dimCount, std::function<void(const std::vector<int> &indices, ThreadLocalVectors &tlv)> func,const vector<int> tlv_sizes) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int totalSize = checkTotalSize(dimCount, shape);
 
 #pragma omp parallel
@@ -177,7 +177,7 @@ namespace deepx
     }
     void Shape::rangeParallel(int dimCount, std::function<void(const int idx_linear, ThreadLocalVectors &tlv)> func,const vector<int> tlv_sizes) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int stride = checkStride(dimCount, shape);
 
         // 计算总循环次数
@@ -196,7 +196,7 @@ namespace deepx
 
     void Shape::rangeParallel(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)> func,const vector<int> tlv_sizes) const
     {
-        dimCount = checkdim(dimCount, dim);
+        dimCount = checkdim(dimCount, dim());
         int totalSize = checkTotalSize(dimCount, shape);
         int stride = checkStride(dimCount, shape);
 
diff --git a/excuter/cpp-common/src/deepx/shape_reduce.cpp b/excuter/cpp-common/src/deepx/shape_reduce.cpp
index 2f017a9b..eeb427a2 100644
--- a/excuter/cpp-common/src/deepx/shape_reduce.cpp
+++ b/excuter/cpp-common/src/deepx/shape_reduce.cpp
@@ -73,14 +73,14 @@ namespace deepx
         return outputShape;
     }
 
+    // Build a 0/1 mask with one entry per dimension of `shape`; the entries indexed by `dims` are set to 1 (to be reduced).
     std::vector<int> reducedDim(const std::vector<int> &shape, const std::vector<int> &dims)
     {
-        // 创建一个映射数组，标记哪些维度需要求和
-        std::vector<int> sumMap(shape.size(), 0);
+        std::vector<int> reducedMask(shape.size(), 0);
         for (int dim : dims)
         {
-            sumMap[dim] = 1;
+            reducedMask[dim] = 1;
         }
-        return sumMap;
+        return reducedMask;
     }
 }
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/shape_tensorinit.cpp b/excuter/cpp-common/src/deepx/shape_tensorinit.cpp
index b2ae8e68..dd93e798 100644
--- a/excuter/cpp-common/src/deepx/shape_tensorinit.cpp
+++ b/excuter/cpp-common/src/deepx/shape_tensorinit.cpp
@@ -5,7 +5,7 @@ namespace deepx
      std::pair<int, int> calculateFanInAndFanOut(const Shape &shape)
     {
         int fanIn, fanOut;
-        if (shape.dim < 2)
+        if (shape.dim() < 2)
         {
             fanIn = 1;
             fanOut = 1;
@@ -15,9 +15,9 @@ namespace deepx
         int numInputFmaps = shape[1];  // 输入特征图数量
         int numOutputFmaps = shape[0]; // 输出特征图数量
         int receptiveFieldSize = 1;
-        if (shape.dim > 2)
+        if (shape.dim() > 2)
         {
-            for (int i = 2; i < shape.dim; ++i)
+            for (int i = 2; i < shape.dim(); ++i)
             {
                 receptiveFieldSize *= shape[i]; // 计算感受野大小
             }
diff --git a/excuter/cpp-common/src/deepx/shapeslice.cpp b/excuter/cpp-common/src/deepx/shapeslice.cpp
deleted file mode 100644
index 99874226..00000000
--- a/excuter/cpp-common/src/deepx/shapeslice.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "deepx/shapeslice.hpp"
- 
-namespace deepx
-{
-    ShapeSlice::ShapeSlice(const std::vector<int> &start, const std::vector<int> &shape, Shape *parent)
-    {
-        this->shape = Shape(shape);
-        this->start = start;
-        this->parent = parent;
-    }
-
-    ShapeSlice::~ShapeSlice()
-    {
-        parent = nullptr;
-    }
-    const std::vector<int> ShapeSlice::toParentIndices(const std::vector<int> &indices) const
-    {
-        std::vector<int> parentindices = indices;
-        for (int i = 0; i < parentindices.size(); i++)
-        {
-            parentindices[i] = parentindices[i] + start[i];
-        }
-        return parentindices;
-    }
-    const std::vector<int> ShapeSlice::fromParentIndices(const std::vector<int> &parentIndices) const
-    {
-        std::vector<int> indices = parentIndices;
-        for (int i = 0; i < indices.size(); i++)
-        {
-            indices[i]=parentIndices[i]-start[i];
-        }
-        return indices;
-    };
-}
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/shapeslice.hpp b/excuter/cpp-common/src/deepx/shapeslice.hpp
deleted file mode 100644
index 428041da..00000000
--- a/excuter/cpp-common/src/deepx/shapeslice.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef SHAPE_SLICE_HPP
-#define SHAPE_SLICE_HPP
-
-#include "deepx/tensor.hpp"
-
-namespace deepx
-{
-    struct ShapeSlice
-    {
-        std::vector<int> start;
-        Shape shape;
-        Shape *parent;
-        ShapeSlice() = default;
-        ShapeSlice(const std::vector<int> &start, const std::vector<int> &shape, Shape *parent);
-        ~ShapeSlice();
-        const std::vector<int> toParentIndices(const std::vector<int> &indices) const;
-        const std::vector<int> fromParentIndices(const std::vector<int> &parentIndices) const;
-    };
-
-}
-#endif
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp
index c5a6ca55..d49f12f8 100644
--- a/excuter/cpp-common/src/deepx/tensor.hpp
+++ b/excuter/cpp-common/src/deepx/tensor.hpp
@@ -156,6 +156,17 @@ namespace deepx
             tensor.saver = nullptr;
             return *this;
         }
+
+        //io
+
+        // Save the shape via shape.saveShape(path) and the elements via saver(..., path + ".data"); does nothing when saver is unset.
+        void save(const string &path)
+        {
+            if (!saver)
+                return;
+            shape.saveShape(path);
+            saver(data, shape.size, path + ".data");
+        }
     }; 
 }
 #endif
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
index 73542616..59a3606e 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp
@@ -17,30 +17,7 @@ namespace deepx::tensorfunc{
     }
     
  
-    inline void saveShape(const Shape &shape,const std::string &tensorPath){
-        std::string shapepath = tensorPath + ".shape";
-        std::string shapedata = shape.toYaml();
-        std::ofstream shape_fs(shapepath, std::ios::binary);
-        shape_fs.write(shapedata.c_str(), shapedata.size());
-        shape_fs.close();
-    }
-    
- 
-    inline pair<std::string,Shape> loadShape(const std::string &path)
-    {
-        std::string shapepath = path + ".shape";
-        std::ifstream shape_fs(shapepath, std::ios::binary);
-        if (!shape_fs.is_open())
-        {
-            throw std::runtime_error("Failed to open shape file: " + shapepath);
-        }
-        std::string shapedata((std::istreambuf_iterator<char>(shape_fs)), std::istreambuf_iterator<char>());
-        Shape shape;
-        shape.fromYaml(shapedata);
-        std::string filename = stdutil::filename(path);
-        std::string tensor_name = filename.substr(0, filename.find_last_of('.'));
-        return std::make_pair(tensor_name, shape);
-    }
+   
  
 }
 
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp b/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp
index 2e099aba..35a50114 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp
@@ -12,11 +12,11 @@ namespace deepx::tensorfunc
         {
             return false;
         }
-        if (a.dim != b.dim)
+        if (a.dim() != b.dim())
         {
             return false;
         }
-        for (int i = 0; i < a.dim - 2; ++i)
+        for (int i = 0; i < a.dim() - 2; ++i)
         {
             if (a[i] != b[i])
             {
diff --git a/excuter/cpp-common/src/deepx/tensorslice.hpp b/excuter/cpp-common/src/deepx/tensorslice.hpp
deleted file mode 100644
index 11a791a6..00000000
--- a/excuter/cpp-common/src/deepx/tensorslice.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef TENSORSLICE_HPP
-#define TENSORSLICE_HPP
-
-#include "deepx/shape.hpp"
-namespace deepx
-{
-    //主要支持CNN的slice操作
-    template <typename T>
-    struct TensorSlice
-    {
-        Tensor<T> *parent;
-        SliceShape sliceShape;
-
-        TensorSlice(Tensor<T> *parent, SliceShape sliceShape)
-        {
-            this->parent = parent;
-            this->sliceShape = sliceShape;
-        }
-        ~TensorSlice()
-        {
-            parent = nullptr;
-            sliceShape.parent = nullptr;
-        }
-    };
-} // namespace deepx
-#endif
\ No newline at end of file
diff --git a/excuter/cpp-common/src/stdutil/print.hpp b/excuter/cpp-common/src/stdutil/print.hpp
index 046a4c83..e7ea4c9d 100644
--- a/excuter/cpp-common/src/stdutil/print.hpp
+++ b/excuter/cpp-common/src/stdutil/print.hpp
@@ -90,7 +90,7 @@ namespace stdutil
         shape.dtype = dtype;
 
         shape.print();
-        if (shape.dim == 1)
+        if (shape.dim() == 1)
         {
             std::cout << "[";
             for (int i = 0; i < shape[0]; ++i)
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
index a49439b9..922c8ddd 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
@@ -50,14 +50,14 @@ namespace deepx::tensorfunc
     {
         static void transpose(const Tensor<T> &tensor, const std::vector<int> &dim_order, Tensor<T> &output)
         {
-            if (dim_order.size() != tensor.shape.dim)
+            if (dim_order.size() != tensor.shape.dim())
             {
                 throw std::runtime_error("Dimension order size must match tensor dimension size for transpose");
             }
            
             launch_transpose<T>(tensor.data, tensor.shape.strides.data(),
                                 output.data, output.shape.strides.data(),
-                                tensor.shape.dim, tensor.shape.size, dim_order.data());
+                                tensor.shape.dim(), tensor.shape.size, dim_order.data());
         }
     };
 
@@ -82,7 +82,7 @@ namespace deepx::tensorfunc
             vector<int> inputStrides;
             for (int i = 0; i < tensors.size(); i++)
             {
-                std::copy(tensors[i]->shape.strides.data(), tensors[i]->shape.strides.data() + tensors[i]->shape.dim, std::back_inserter(inputStrides));
+                std::copy(tensors[i]->shape.strides.data(), tensors[i]->shape.strides.data() + tensors[i]->shape.dim(), std::back_inserter(inputStrides));
             }
 
             vector<int> shapeAtAxis(tensors.size());
@@ -93,7 +93,7 @@ namespace deepx::tensorfunc
 
             launch_concat<T>(tensorsData.data(), inputStrides.data(),
                              C.data, C.shape.strides.data(),
-                             C.shape.dim,
+                             C.shape.dim(),
                              C.shape.size,
                              axis, tensors.size(), shapeAtAxis.data());
         };
@@ -111,9 +111,9 @@ namespace deepx::tensorfunc
                 throw TensorShapeError("Broadcast shape mismatch");
             }
             auto bmap = broadcastMap(A.shape.shape, new_shape);
-            launch_broadcastTo<T>(A.data, A.shape.strides.data(), A.shape.dim,
+            launch_broadcastTo<T>(A.data, A.shape.strides.data(), A.shape.dim(),
             bmap.data(),
-            B.data, B.shape.strides.data(), B.shape.dim, B.shape.size);
+            B.data, B.shape.strides.data(), B.shape.dim(), B.shape.size);
         }
     };
 
@@ -122,17 +122,17 @@ namespace deepx::tensorfunc
     struct indexselectDispatcher<miaobyte, T,GatherAxisT>
     {
         static void indexselect(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output){
-            int gatherAxis = axis < 0 ? input.shape.dim + axis : axis;
+            int gatherAxis = axis < 0 ? input.shape.dim() + axis : axis;
             vector<int> gatherShape = indexselectShape(input.shape.shape, indices.shape.shape, gatherAxis);
             if (gatherShape.empty()||gatherShape!=output.shape.shape)
             {
                 throw TensorShapeError("Indexselect shape mismatch");
             }
             
-            launch_indexselect<T,GatherAxisT>(input.data, input.shape.strides.data(), input.shape.dim,
-                            indices.data, indices.shape.strides.data(), indices.shape.dim,
+            launch_indexselect<T,GatherAxisT>(input.data, input.shape.strides.data(), input.shape.dim(),
+                            indices.data, indices.shape.strides.data(), indices.shape.dim(),
                             gatherAxis,
-                            output.data,output.shape.strides.data(),output.shape.dim,output.shape.size);
+                            output.data,output.shape.strides.data(),output.shape.dim(),output.shape.size);
         }
     };
 }
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
index 0967736f..065923e7 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -80,7 +80,7 @@ namespace deepx::tensorfunc
     pair<std::string,shared_ptr<Tensor<T>>> load(const std::string &path)
     {
         // 加载shape
-        pair<std::string,Shape> shape_name=loadShape(path);
+        pair<std::string,Shape> shape_name=Shape::loadShape(path);
         Shape shape=shape_name.second;
         std::string tensor_name=shape_name.first;
  
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp
index ce4c7223..2acacf06 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp
@@ -22,9 +22,9 @@ namespace deepx::tensorfunc
             constant<miaobyte, T>(result, T(0));
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
-            launch_sum<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+            launch_sum<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim(), tensor.shape.size,
                           reduced_dims.data(), keepdims,
-                          result.data, result.shape.strides.data(), result.shape.dim);
+                          result.data, result.shape.strides.data(), result.shape.dim()) ;
         }
     };
 
@@ -36,9 +36,9 @@ namespace deepx::tensorfunc
             constant<miaobyte, T>(result, T(1));
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
-            launch_prod<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+            launch_prod<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim(), tensor.shape.size,
                            reduced_dims.data(), keepdims,
-                           result.data, result.shape.strides.data(), result.shape.dim);
+                           result.data, result.shape.strides.data(), result.shape.dim()) ;
         }
     };
     template <typename T>
@@ -49,9 +49,9 @@ namespace deepx::tensorfunc
             constant<miaobyte, T>(result, std::numeric_limits<T>::lowest());
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
-            launch_reducemax<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+            launch_reducemax<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim(), tensor.shape.size,
                                 reduced_dims.data(), keepdims,
-                                result.data, result.shape.strides.data(), result.shape.dim);
+                                result.data, result.shape.strides.data(), result.shape.dim()) ;
         }
     };
 
@@ -63,9 +63,9 @@ namespace deepx::tensorfunc
             constant<miaobyte, T>(result, std::numeric_limits<T>::max());
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
-            launch_reducemin<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+            launch_reducemin<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim(), tensor.shape.size,
                                 reduced_dims.data(), keepdims,
-                                result.data, result.shape.strides.data(), result.shape.dim);
+                                result.data, result.shape.strides.data(), result.shape.dim()) ;
         }
     };
 }
diff --git a/excuter/op-mem-cuda/src/deepx/tf/io.hpp b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
index 81d739f2..d676c743 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
@@ -86,7 +86,7 @@ namespace deepx::tf
                 return 1;
             }
             Precision dtype = mem->gettensor(name)->shape.dtype;
-            tensorfunc::saveShape(mem->gettensor(name)->shape,path);
+            mem->gettensor(name)->shape.saveShape(path);
             path+=".data";
             switch (dtype)
             {   
@@ -165,7 +165,7 @@ namespace deepx::tf
         {
             string path = this->args[0].textvalue;
             
-            pair<std::string,Shape> shape_name=tensorfunc::loadShape(path);
+            pair<std::string,Shape> shape_name=Shape::loadShape(path);
             std::string tensor_name=shape_name.first;
             Shape shape=shape_name.second;
 
@@ -252,7 +252,7 @@ namespace deepx::tf
             string path = this->args[0].textvalue;  
             string tensorname = this->returns[0].textvalue;
             
-            pair<std::string,Shape> shape_name=tensorfunc::loadShape(path);
+            pair<std::string,Shape> shape_name=Shape::loadShape(path);
             std::string tensor_name=shape_name.first;
             Shape shape=shape_name.second;
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp
index f9466111..a92a9571 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp
@@ -48,7 +48,7 @@ namespace deepx::tensorfunc
         static void transpose(const Tensor<T> &tensor, const std::vector<int> &dim_order, Tensor<T> &output)
         {
 
-            if (dim_order.size() != tensor.shape.dim)
+            if (dim_order.size() != tensor.shape.dim()) 
             {
                 throw std::invalid_argument("dimOrder size does not match the number of dimensions in the TensorCPU.");
             }
@@ -62,7 +62,7 @@ namespace deepx::tensorfunc
                             for (size_t i = 0; i < dim_order.size(); ++i) {
                                 tlv.get(0)[dim_order[i]] = indices[i];
                             }
-                            output.data[idx_linear]= tensor.data[tensor.shape.linearat(tlv.get(0))]; }, {tensor.shape.dim});
+                            output.data[idx_linear]= tensor.data[tensor.shape.linearat(tlv.get(0))]; }, {tensor.shape.dim()});
         }
     };
     // concat
@@ -131,7 +131,7 @@ namespace deepx::tensorfunc
             }
             auto bmap = broadcastMap(A.shape.shape, new_shape);
 
-            B.shape.rangeParallel(B.shape.dim, [&](const int idx, const std::vector<int> &bindices)
+            B.shape.rangeParallel(B.shape.dim(), [&](const int idx, const std::vector<int> &bindices)
                                   {
                         vector<int> aindices=fromBroadcastIndices(bmap, bindices);
                         B.data[idx] = A.data[A.shape.linearat(aindices)]; });
@@ -157,8 +157,8 @@ namespace deepx::tensorfunc
     {
         static void indexselect(const Tensor<T> &input, const Tensor<GatherAxisT> &index, const int axis, Tensor<T> &output)
         {
-            int gatherAxis = axis < 0 ? input.shape.dim + axis : axis;
-            if (gatherAxis < 0 || gatherAxis >= input.shape.dim)
+            int gatherAxis = axis < 0 ? input.shape.dim() + axis : axis;
+            if (gatherAxis < 0 || gatherAxis >= input.shape.dim()) 
             {
                 throw std::invalid_argument("Axis is out of bounds");
             }
@@ -168,12 +168,12 @@ namespace deepx::tensorfunc
             {
                 throw TensorShapeError("Indexselect shape mismatch");
             }
-            output.shape.rangeParallel(output.shape.dim, [&](const int idx, const std::vector<int> &output_indices, ThreadLocalVectors &tlv)
+            output.shape.rangeParallel(output.shape.dim(), [&](const int idx, const std::vector<int> &output_indices, ThreadLocalVectors &tlv)
                                        {  
                             fromIndexselectIndices(output_indices, index,tlv.get(1), gatherAxis, tlv.get(0));
                             output.data[idx] = input.data[input.shape.linearat(tlv.get(0))]; 
                         },
-                    {input.shape.dim,index.shape.dim});
+                    {input.shape.dim(),index.shape.dim()});
         }
     };
 
@@ -205,12 +205,12 @@ namespace deepx::tensorfunc
     // void expand(const Tensor<T> &input, Tensor<T> &output)
     // {
     //     // 检查输入和目标形状的兼容性
-    //     if (input.shape.dim != output.shape.dim)
+    //     if (input.shape.dim() != output.shape.dim()) 
     //     {
     //         throw std::invalid_argument("expand维度不匹配: 输入维度 " +
-    //                                     std::to_string(input.shape.dim) +
+    //                                     std::to_string(input.shape.dim())  +
     //                                     ", 目标维度 " +
-    //                                     std::to_string(output.shape.dim) +
+    //                                     std::to_string(output.shape.dim())  +
     //                                     "请先前dim补1的方式reshape");
     //     }
 
@@ -231,7 +231,7 @@ namespace deepx::tensorfunc
 
     //     // 找到最后一个需要扩展的维度
     //     int last_expand_dim = -1;
-    //     for (int i = input.shape.dim - 1; i >= 0; --i)
+    //     for (int i = input.shape.dim() - 1; i >= 0; --i)
     //     {
     //         if (input.shape[i] != output.shape.shape[i])
     //         {
@@ -241,7 +241,7 @@ namespace deepx::tensorfunc
     //     }
 
     //     // 如果最后几个维度不需要扩展，可以连续复制
-    //     if (last_expand_dim < output.shape.dim - 1)
+    //     if (last_expand_dim < output.shape.dim() - 1)
     //     {
     //         int copy_len = output.shape.strides[last_expand_dim + 1];
     //         output.shape.rangeParallel(last_expand_dim + 1, [&bm, &output, &input, copy_len](int idx_linear, const std::vector<int> &indices, std::vector<int> &oldIndices)
@@ -250,15 +250,15 @@ namespace deepx::tensorfunc
     //                 int idx_old = input.shape.linearat(oldIndices);
     //                 std::copy(input.data + idx_old,
     //                          input.data + idx_old + copy_len,
-    //                          output.data + idx_linear); }, input.shape.dim);
+    //                          output.data + idx_linear); }, input.shape.dim()) ;
     //     }
     //     else
     //     {
-    //         output.shape.rangeParallel(output.shape.dim, [&bm, &output, &input](int idx_linear, const std::vector<int> &indices, std::vector<int> &oldIndices)
+    //         output.shape.rangeParallel(output.shape.dim(), [&bm, &output, &input](int idx_linear, const std::vector<int> &indices, std::vector<int> &oldIndices)
     //                                    {
     //                 fromBroadcastIndices(bm, indices, oldIndices);
     //                 int idx_old = input.shape.linearat(oldIndices);
-    //                 output.data[idx_linear] = input.data[idx_old]; }, input.shape.dim);
+    //                 output.data[idx_linear] = input.data[idx_old]; }, input.shape.dim()) ;
     //     }
     // }
 }
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
index 1e863ae3..5acf3bd7 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
@@ -18,7 +18,7 @@ namespace deepx::tensorfunc
     {
         if (A.shape == B.shape && A.shape == C.shape)
         {
-            C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C, &scalar_op, &simd_op](int i)
+            C.shape.rangeParallel(C.shape.dim() - 1, [&A, &B, &C, &scalar_op, &simd_op](int i)
                                   {
                                       int shape_last = C.shape[-1];
                                       const ScalableTag<T> tag;
@@ -62,7 +62,7 @@ namespace deepx::tensorfunc
     {
         if (A.shape == C.shape)
         {
-            C.shape.rangeParallel(C.shape.dim - 1, [&A, &b, &C, &scalar_op, &simd_op](int i)
+            C.shape.rangeParallel(C.shape.dim() - 1, [&A, &b, &C, &scalar_op, &simd_op](int i)
                                   {
                                       int shape_last = C.shape[-1];
                                       const ScalableTag<T> tag;
@@ -292,7 +292,7 @@ namespace deepx::tensorfunc
         {   
             if (A.shape == C.shape)
             {
-                A.shape.rangeParallel(A.shape.dim-1, [&A, &C](int idx)
+                A.shape.rangeParallel(A.shape.dim()-1, [&A, &C](int idx)
                                       {
                                            for (int j=0;j<A.shape[-1];j++)
                                            {
@@ -314,7 +314,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output](int i)
                                            {
                 int shape_last=output.shape[-1];
                 const ScalableTag<T> tag;
@@ -355,7 +355,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output](int i)
                                            {
                                                int shape_last = output.shape[-1];
 
@@ -382,7 +382,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && A.shape == C.shape)
             {
-                C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C](int i)
+                C.shape.rangeParallel(C.shape.dim() - 1, [&A, &B, &C](int i)
                                       {
                                          for (int j = 0; j < C.shape[-1]; j++)
                                          C.data[i+j] = std::pow(A.data[i+j], B.data[i+j]); });
@@ -403,7 +403,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output, &value](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output, &value](int i)
                                            {
                                              for (int j = 0; j < output.shape[-1]; j++)
                                                 output.data[i+j] = std::pow(input.data[i+j], value); });
@@ -423,7 +423,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output, &value](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output, &value](int i)
                                            {
                                                 for (int j = 0; j < output.shape[-1]; j++)
                                                 output.data[i+j] = std::pow(value, input.data[i+j]); });
@@ -443,7 +443,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output](int i)
                                            { for (int j = 0; j < output.shape[-1]; j++)
                                                 output.data[i+j] = std::log(input.data[i+j]); });
             }
@@ -462,7 +462,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output](int i)
                                            { for (int j = 0; j < output.shape[-1]; j++)
                                                 output.data[i+j] = std::exp(input.data[i+j]); });
             }
@@ -481,7 +481,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output](int i)
                                            {
                 int shape_last=output.shape[-1];
                 const ScalableTag<T> tag;
@@ -524,7 +524,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output](int i)
                                            {
                 int shape_last=output.shape[-1];
                 const ScalableTag<T> tag;
@@ -567,7 +567,7 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeParallel(output.shape.dim - 1, [&input, &output](int i)
+                output.shape.rangeParallel(output.shape.dim() - 1, [&input, &output](int i)
                                            {
                 int shape_last=output.shape[-1];
                 const ScalableTag<T> tag;
@@ -609,7 +609,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && A.shape == C.shape)
             {
-                C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C](int idx)
+                C.shape.rangeParallel(C.shape.dim() - 1, [&A, &B, &C](int idx)
                                       {
                 int shape_last=C.shape[-1];
                 const ScalableTag<T> tag;
@@ -652,7 +652,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == C.shape)
             {
-                C.shape.rangeParallel(C.shape.dim - 1, [&A, b, &C](int idx)
+                C.shape.rangeParallel(C.shape.dim() - 1, [&A, b, &C](int idx)
                                       {
                 int shape_last=C.shape[-1];
                 const ScalableTag<T> tag;
@@ -695,7 +695,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && A.shape == C.shape)
             {
-                C.shape.rangeParallel(C.shape.dim - 1, [&A, &B, &C](int idx)
+                C.shape.rangeParallel(C.shape.dim() - 1, [&A, &B, &C](int idx)
                                       {
                 int shape_last=C.shape[-1];
                 const ScalableTag<T> tag;
@@ -738,7 +738,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == C.shape)
             {
-                C.shape.rangeParallel(C.shape.dim - 1, [&A, b, &C](int idx)
+                C.shape.rangeParallel(C.shape.dim() - 1, [&A, b, &C](int idx)
                                       {   
                 int shape_last=C.shape[-1];
                 const ScalableTag<T> tag;
@@ -781,7 +781,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && mask.shape == A.shape)
             {   
-                A.shape.rangeParallel(A.shape.dim-1, [&A, &B, &mask,epsilon](int idx)
+                A.shape.rangeParallel(A.shape.dim()-1, [&A, &B, &mask,epsilon](int idx)
                                       {
                                             for (int i = 0; i < A.shape[-1]; i++)
                                             {
@@ -810,7 +810,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == mask.shape)
             {
-                A.shape.rangeParallel(A.shape.dim-1, [&A, &mask, &scalar,epsilon](int idx)
+                A.shape.rangeParallel(A.shape.dim()-1, [&A, &mask, &scalar,epsilon](int idx)
                                       {
                 for (int i = 0; i < A.shape[-1]; i++)
                 {
@@ -839,7 +839,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && mask.shape == A.shape)
             {
-                A.shape.rangeParallel(A.shape.dim-1, [&A, &B, &mask](int idx)
+                A.shape.rangeParallel(A.shape.dim()-1, [&A, &B, &mask](int idx)
                                       {
                 for (int i = 0; i < A.shape[-1]; i++)
                 {
@@ -862,7 +862,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == mask.shape)
             {
-                A.shape.rangeParallel(A.shape.dim-1, [&A, &mask, &scalar](int idx)
+                A.shape.rangeParallel(A.shape.dim()-1, [&A, &mask, &scalar](int idx)
                                       {
                 for (int i = 0; i < A.shape[-1]; i++)
                 {
@@ -885,7 +885,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && mask.shape == A.shape)
             {
-                A.shape.rangeParallel(A.shape.dim-1, [&A, &B, &mask](int idx)
+                A.shape.rangeParallel(A.shape.dim()-1, [&A, &B, &mask](int idx)
                                       {
                 for (int i = 0; i < A.shape[-1]; i++)
                 {
@@ -908,7 +908,7 @@ namespace deepx::tensorfunc
         {
             if (A.shape == mask.shape)
             {
-                A.shape.rangeParallel(A.shape.dim-1, [&A, &mask, &scalar](int idx)
+                A.shape.rangeParallel(A.shape.dim()-1, [&A, &mask, &scalar](int idx)
                                       {
                 for (int i = 0; i < A.shape[-1]; i++)
                 {
@@ -931,7 +931,7 @@ namespace deepx::tensorfunc
         {
             if (cases.shape == C.shape)
             {
-                C.shape.rangeParallel(C.shape.dim-1, [&tensors, &cases, &C](int idx)
+                C.shape.rangeParallel(C.shape.dim()-1, [&tensors, &cases, &C](int idx)
                                       {
                 for (int i = 0; i < C.shape[-1]; i++)
                 {   
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
index d7c26ccc..f5324b5e 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -42,7 +42,7 @@ namespace deepx::tensorfunc
     pair<std::string,shared_ptr<Tensor<T>>> load(const std::string &path)
     {
         // 加载shape
-        pair<std::string,Shape> shape_name=loadShape(path);
+        pair<std::string,Shape> shape_name=Shape::loadShape(path);
         Shape shape=shape_name.second;
         std::string tensor_name=shape_name.first;
  
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp
index 8656191b..f12d9301 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp
@@ -19,7 +19,7 @@ namespace deepx::tensorfunc
       }
       // 计算batch size (将除最后两维外的所有维度展平)
       int64_t batch_size = 1;
-      for (int i = 0; i < a.shape.dim - 2; ++i)
+      for (int i = 0; i < a.shape.dim() - 2; ++i)
       {
         batch_size *= a.shape[i];
       }
@@ -70,7 +70,7 @@ namespace deepx::tensorfunc
       }
       // 计算batch size (将除最后两维外的所有维度展平)
       int64_t batch_size = 1;
-      for (int i = 0; i < a.shape.dim - 2; ++i)
+      for (int i = 0; i < a.shape.dim() - 2; ++i)
       {
         batch_size *= a.shape[i];
       }
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
index e5dadce8..4e3d26ad 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
@@ -15,8 +15,8 @@ namespace deepx::tensorfunc
                 throw std::invalid_argument("A.shape could matmul with B.shape");
             }
             //TODO
-            //这里如果对二维矩阵运算，则omp并行不起来，因为C.shape.dim - 2刚好=0
-            C.shape.rangeParallel(C.shape.dim - 2, [&](const std::vector<int> &indices)
+            //这里如果对二维矩阵运算，则omp并行不起来，因为C.shape.dim() - 2刚好=0
+            C.shape.rangeParallel(C.shape.dim() - 2, [&](const std::vector<int> &indices)
                                   {
                         int aIdx=A.shape.linearat(indices);
                         int bIdx=B.shape.linearat(indices);
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
index b6ebea5c..6c5283e2 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
@@ -25,12 +25,12 @@ namespace deepx::tensorfunc
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
             const int minshape_1 = Lanes(ScalableTag<T>());
-            if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
+            if (checkeddims.rbegin()[0] == tensor.shape.dim() - 1 || tensor.shape.dim() > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
             {
-                tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim(), [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                     // 计算输出索引
-                    for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i)
+                    for (size_t i = 0, j = 0; i < tensor.shape.dim(); ++i)
                     {
                         if (reduced_dims[i] == 0)
                         {
@@ -41,15 +41,15 @@ namespace deepx::tensorfunc
                     }
                     int outputIdx = result.shape.linearat(tlv.get(0));
 #pragma omp atomic
-                    result.data[outputIdx] += tensor.data[idx_linear]; }, {result.shape.dim});
+                    result.data[outputIdx] += tensor.data[idx_linear]; }, {result.shape.dim()});
             }
             else
             {
                 // 如果数据连续（对齐），则可以simd
-                tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim() - 1, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                     // 计算输出索引
-                    for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i)
+                    for (size_t i = 0, j = 0; i < tensor.shape.dim(); ++i)
                     {
                         if (reduced_dims[i] == 0)
                         {
@@ -87,7 +87,7 @@ namespace deepx::tensorfunc
                     }
 #pragma omp atomic
                     result.data[outputIdx] += sum; },
-                     {result.shape.dim});
+                     {result.shape.dim()});
             }
         }
     };
@@ -103,13 +103,13 @@ namespace deepx::tensorfunc
             const int minshape_1 = Lanes(ScalableTag<T>());
             // 如果dims的最后一个元素是tensor.shape.dim-1，则说明reduceprod的数据不连续（不对齐），无法simd（需要不停跳跃）
             constant<miaobyte, T>(result, T(1));
-            if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
+            if (reduced_dims.rbegin()[0] == tensor.shape.dim() - 1 || tensor.shape.dim() > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
             {
-                tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim(), [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                             // 计算输出索引
                          
-                            for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) {
+                            for (size_t i = 0,j=0; i < tensor.shape.dim() ; ++i) {
                                 if (reduced_dims[i]==0) {
                                         tlv.get(0)[j++]=indices[i];
                                     }else if (keepdims && (reduced_dims[i] == 1)) {
@@ -120,16 +120,16 @@ namespace deepx::tensorfunc
                             int outputIdx=result.shape.linearat(tlv.get(0));
 #pragma omp atomic
                             result.data[outputIdx]*=tensor.data[idx_linear]; 
-                            }, {result.shape.dim});
+                            }, {result.shape.dim()});
             }
             else
             {
                 // 如果数据连续（对齐），则可以simd
-                tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim() - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                                                // 计算输出索引
 
-                                               for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i)
+                                               for (size_t i = 0, j = 0; i < tensor.shape.dim(); ++i)
                                                {
                                                    if (reduced_dims[i] == 0)
                                                    {
@@ -173,7 +173,7 @@ namespace deepx::tensorfunc
                                                }
 #pragma omp atomic
                                                result.data[outputIdx] *= product; 
-                                               }, {result.shape.dim});
+                                               }, {result.shape.dim()});
             }
         }
     };
@@ -188,13 +188,13 @@ namespace deepx::tensorfunc
             const int minshape_1 = Lanes(ScalableTag<T>());
             // 如果dims的最后一个元素是tensor.shape.dim-1，则说明reducemax的数据不连续（不对齐），无法simd（需要不停跳跃）
             constant<miaobyte, T>(result, std::numeric_limits<T>::lowest());
-            if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
+            if (reduced_dims.rbegin()[0] == tensor.shape.dim() - 1 || tensor.shape.dim() > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
             {
-                tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim(), [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                             // 计算输出索引
                          
-                            for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) {
+                            for (size_t i = 0,j=0; i < tensor.shape.dim() ; ++i) {
                                 if (reduced_dims[i]==0) {
                                         tlv.get(0)[j++]=indices[i];
                                     }else if (keepdims && (reduced_dims[i] == 1)) {
@@ -204,16 +204,16 @@ namespace deepx::tensorfunc
                             // 累加求和
                             int outputIdx=result.shape.linearat(tlv.get(0));
                             result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); 
-                            }, {result.shape.dim});
+                            }, {result.shape.dim()});
             }
             else
             {
                 // 如果数据连续（对齐），则可以simd
-                tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim() - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                                                // 计算输出索引
 
-                                               for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i)
+                                               for (size_t i = 0, j = 0; i < tensor.shape.dim(); ++i)
                                                {
                                                    if (reduced_dims[i] == 0)
                                                    {
@@ -256,7 +256,7 @@ namespace deepx::tensorfunc
                                                }
  
                                                result.data[outputIdx] = std::max(result.data[outputIdx],maxt); 
-                                               }, {result.shape.dim});
+                                               }, {result.shape.dim()});
             }
         }
     };
@@ -271,13 +271,13 @@ namespace deepx::tensorfunc
             const int minshape_1 = Lanes(ScalableTag<T>());
             // 如果dims的最后一个元素是tensor.shape.dim-1，则说明reducemin的数据不连续（不对齐），无法simd（需要不停跳跃）
             constant<miaobyte, T>(result, std::numeric_limits<T>::max());
-            if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
+            if (reduced_dims.rbegin()[0] == tensor.shape.dim() - 1 || tensor.shape.dim() > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
             {
-                tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim(), [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                             // 计算输出索引
                          
-                            for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) {
+                            for (size_t i = 0,j=0; i < tensor.shape.dim() ; ++i) {
                                 if (reduced_dims[i]==0) {
                                         tlv.get(0)[j++]=indices[i];
                                     }else if (keepdims && (reduced_dims[i] == 1)) {
@@ -288,16 +288,16 @@ namespace deepx::tensorfunc
                             int outputIdx=result.shape.linearat(tlv.get(0));
  
                             result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); 
-                            }, {result.shape.dim});
+                            }, {result.shape.dim()});
             }
             else
             {
                 // 如果数据连续（对齐），则可以simd
-                tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector<int> &indices, ThreadLocalVectors &tlv)
+                tensor.shape.rangeParallel(tensor.shape.dim() - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector<int> &indices, ThreadLocalVectors &tlv)
                                            {
                                                // 计算输出索引
 
-                                               for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i)
+                                               for (size_t i = 0, j = 0; i < tensor.shape.dim(); ++i)
                                                {
                                                    if (reduced_dims[i] == 0)
                                                    {
@@ -339,7 +339,7 @@ namespace deepx::tensorfunc
                                                    mint = std::min(mint,tensor.data[i + j]);
                                                }
  
-                                               result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, {result.shape.dim});
+                                               result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, {result.shape.dim()});
             }
         }
     };
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
index d0d9ae5d..a279b708 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
@@ -86,7 +86,7 @@ namespace deepx::tf
                 return 1;
             }
             Precision dtype = mem->gettensor(name)->shape.dtype;
-            tensorfunc::saveShape(mem->gettensor(name)->shape,path);
+            mem->gettensor(name)->shape.saveShape(path);
             path+=".data";
             switch (dtype)
             {   
@@ -156,7 +156,7 @@ namespace deepx::tf
         {
             string path = this->args[0].textvalue;
             
-            pair<std::string,Shape> shape_name=tensorfunc::loadShape(path);
+            pair<std::string,Shape> shape_name=Shape::loadShape(path);
             std::string tensor_name=shape_name.first;
             Shape shape=shape_name.second;
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
index ff483da6..f7eeeba2 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
@@ -187,7 +187,7 @@ namespace deepx::tf
         }
     };
 
-        template <typename Author>
+    template <typename Author>
     class ReduceMin : public TF
     {
     public:
diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/1_shape.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/1_shape.cpp
index 9f13efba..af393e9b 100644
--- a/excuter/op-mem-ompsimd/test/tensorfunc/1_shape.cpp
+++ b/excuter/op-mem-ompsimd/test/tensorfunc/1_shape.cpp
@@ -13,7 +13,7 @@ void test_tensor_shape() {
     
     Shape shape2;
     shape2.fromYaml(yaml);
-    std::cout<<"shape2: "<<shape2.size<<" "<<shape2.shape<<shape2.dim<<shape2.strides<<std::endl;
+    std::cout<<"shape2: "<<shape2.size<<" "<<shape2.shape<<" "<<shape2.dim()<<" "<<shape2.strides<<std::endl;
     
 }
 
diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp
index b50237c5..906ab399 100644
--- a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp
+++ b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp
@@ -14,11 +14,11 @@ void test_tensor_new(){
     Tensor<float> tensor=New<float>({2, 3});
     constant<miaobyte,float>(tensor,1);
     print<miaobyte>(tensor);
-    save(tensor,"tensor");
+    tensor.save("tensor");
     Tensor<float> tensor2=New<float>({2, 3});
     constant<miaobyte,float>(tensor2,2);
     print<miaobyte>(tensor2);
-    save(tensor2,"tensor2");
+    tensor2.save("tensor2");
 }
 
 void test_arange() {
diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp
index dc2ef698..71c8e395 100644
--- a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp
+++ b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp
@@ -22,9 +22,9 @@ void test_sum()
 {
     omp_set_num_threads(1); 
 
-    std::vector<int> shape={2, 3, 4};
+    std::vector<int> shape={3, 4,5};
     Tensor<float> tensor= New<float>(shape);
-    constant<miaobyte,float>(tensor,float(1));
+    arange<miaobyte,float>(tensor,float(0),float(1));
     print<miaobyte>(tensor,"%.0f");
     cout<<""<<endl;
     std::vector<std::vector<int>> result = combination(3);
diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp
index 74fd575b..538eba0c 100644
--- a/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp
+++ b/excuter/op-mem-ompsimd/test/tensorfunc/8_tensor_concat.cpp
@@ -37,7 +37,7 @@ void test_concat(){
  
      
     std::cout<<"================"<<std::endl;
-    for (int i=0;i<tensors[0]->shape.dim;i++){
+    for (int i=0;i<tensors[0]->shape.dim();i++){
         Shape shape=concatShape(tensors,i);
         Tensor<float> result=New<float>(shape.shape);
         concat<miaobyte,float>(tensors,i,result);
diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/transformer/models/llama/modeling_llama.py
index c8d9c403..c3b07fd3 100644
--- a/front/py/deepx/transformer/models/llama/modeling_llama.py
+++ b/front/py/deepx/transformer/models/llama/modeling_llama.py
@@ -1,6 +1,6 @@
-from deepx.nn.modules import Module
-from deepx import Tensor,ones,rsqrt
-
+from deepx.nn.modules import Module,Linear
+from deepx import Tensor,ones,rsqrt,concat
+from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS
 # RMSNorm
 # copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
 # 数学公式
@@ -21,58 +21,82 @@ def forward(self, hidden_states:Tensor):
     def extra_repr(self):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
     
-#
-# class LlamaRotaryEmbedding(Module):
-#     def __init__(self,rope_type:str="default",max_seq_len:int=1024,device=None):
-#         super().__init__()
-#         self.max_seq_len_cached = config.max_position_embeddings
-#         self.original_max_seq_len = config.max_position_embeddings
-#
-#         self.config = config
-#         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-#
-#         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
-#         self.register_buffer("inv_freq", inv_freq, persistent=False)
-#         self.original_inv_freq = self.inv_freq
-#
-#     def _dynamic_frequency_update(self, position_ids, device):
-#         """
-#         dynamic RoPE layers should recompute `inv_freq` in the following situations:
-#         1 - growing beyond the cached sequence length (allow scaling)
-#         2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-#         """
-#         seq_len = torch.max(position_ids) + 1
-#         if seq_len > self.max_seq_len_cached:  # growth
-#             inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
-#             self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
-#             self.max_seq_len_cached = seq_len
-#
-#         if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
-#             # This .to() is needed if the model has been moved to a device after being initialized (because
-#             # the buffer is automatically moved, but not the original copy)
-#             self.original_inv_freq = self.original_inv_freq.to(device)
-#             self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
-#             self.max_seq_len_cached = self.original_max_seq_len
-#
-#     @torch.no_grad()
-#     def forward(self, x, position_ids):
-#         if "dynamic" in self.rope_type:
-#             self._dynamic_frequency_update(position_ids, device=x.device)
-#
-#         # Core RoPE block
-#         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-#         position_ids_expanded = position_ids[:, None, :].float()
-#         # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
-#         device_type = x.device.type
-#         device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-#         with torch.autocast(device_type=device_type, enabled=False):
-#             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-#             emb = torch.cat((freqs, freqs), dim=-1)
-#             cos = emb.cos()
-#             sin = emb.sin()
-#
-#         # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
-#         cos = cos * self.attention_scaling
-#         sin = sin * self.attention_scaling
-#
-#         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+class LlamaRotaryEmbedding(Module):
+    def __init__(self,rope_type:str="default",max_seq_len:int=1024,device=None):
+        super().__init__()
+        # 最大序列长度
+        self.max_seq_len_cached = max_seq_len
+        # 原始最大序列长度
+        self.original_max_seq_len = max_seq_len
+        # 旋转类型
+        self.rope_type=rope_type
+        # 旋转初始化函数
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        # Compute the inverse frequencies and the attention scaling factor
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
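+        # TODO(review): self.config is not assigned in this __init__ (there is no config parameter), so the call above looks like it raises AttributeError - confirm and pass a config in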
+        # 注册缓存
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # 原始旋转频率
+        self.original_inv_freq = self.inv_freq
+
+    # def _dynamic_frequency_update(self, position_ids, device):
+    #     """
+    #     dynamic RoPE layers should recompute `inv_freq` in the following situations:
+    #     1 - growing beyond the cached sequence length (allow scaling)
+    #     2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+    #     """
+    #     seq_len = torch.max(position_ids) + 1
+    #     if seq_len > self.max_seq_len_cached:  # growth
+    #         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+    #         self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+    #         self.max_seq_len_cached = seq_len
+
+    #     if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+    #         # This .to() is needed if the model has been moved to a device after being initialized (because
+    #         # the buffer is automatically moved, but not the original copy)
+    #         self.original_inv_freq = self.original_inv_freq.to(device)
+    #         self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+    #         self.max_seq_len_cached = self.original_max_seq_len
+
+    def forward(self, x, position_ids):
+        # TODO
+        # if "dynamic" in self.rope_type:
+        #     self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+ 
+        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+        emb = concat((freqs, freqs), dim=-1)
+        cos = emb.cos()
+        sin = emb.sin()
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+class LlamaMLP(Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        # 输入层大小
+        self.hidden_size = config.hidden_size  
+        # 中间层大小
+        self.intermediate_size = config.intermediate_size  
+        #门控投影层
+        self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        #上投影层
+        self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        #下投影层
+        self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
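+        # Activation function looked up by name. NOTE(review): ACT2FN is not among the imports this patch adds - confirm it is in scope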
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
\ No newline at end of file
diff --git a/front/py/examples/2_ir/5_reduce_sum.py b/front/py/examples/2_ir/5_reduce_sum.py
index a6f0eb63..6932f4b2 100644
--- a/front/py/examples/2_ir/5_reduce_sum.py
+++ b/front/py/examples/2_ir/5_reduce_sum.py
@@ -27,7 +27,7 @@
 # p=prod(t,dim=(1,),out="p")
 # p.print()
 
-t1=ones((4,5,6),name="t1")
-t1.print()
-t2=sum(t1,dim=(0,1),out='t2')
-t2.print()
+# t1=ones((4,5,6),name="t1")
+# t1.print()
+# t2=sum(t1,dim=(0,1),out='t2')
+# t2.print()