From c28e243e17d83fa3bcb7ffbd46c71ed5f081d475 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 1 Jun 2025 21:57:10 +0800 Subject: [PATCH 1/3] =?UTF-8?q?repeat:ompsimd=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E5=AE=8C=E6=88=90=EF=BC=8C=E5=BE=85=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- excuter/cpp-common/src/deepx/shape.cpp | 12 +++- .../src/deepx/shape_changeshape.cpp | 14 ++++ .../src/deepx/tensorfunc/changeshape.hpp | 36 +++++++++++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 22 +++++++ .../src/deepx/tf/changeshape.hpp | 64 +++++++++++++++++++ front/py/deepx/nn/functional/__init__.py | 2 +- .../transformer/models/llama/attention.py | 11 +++- .../models/llama/groupedquery_attention.py | 12 ++++ front/py/examples/1_tensor/2_newbig.py | 6 +- front/py/examples/1_tensor/getitem.py | 14 ++++ .../examples/2_ir/4_changeshape_broadcast.py | 12 ++-- .../2_ir/4_changeshape_broadcast_add.py | 2 +- front/py/examples/2_ir/changeshape_repeat.py | 18 ++++++ .../3_functional/changeshape_broadcast.py | 23 +++++++ front/py/examples/4_transformer/llama/llama_ | 0 15 files changed, 236 insertions(+), 12 deletions(-) create mode 100644 front/py/deepx/transformer/models/llama/groupedquery_attention.py create mode 100644 front/py/examples/1_tensor/getitem.py create mode 100644 front/py/examples/2_ir/changeshape_repeat.py create mode 100644 front/py/examples/3_functional/changeshape_broadcast.py create mode 100644 front/py/examples/4_transformer/llama/llama_ diff --git a/excuter/cpp-common/src/deepx/shape.cpp b/excuter/cpp-common/src/deepx/shape.cpp index 9f51a2e2..cc802d90 100644 --- a/excuter/cpp-common/src/deepx/shape.cpp +++ b/excuter/cpp-common/src/deepx/shape.cpp @@ -66,11 +66,19 @@ namespace deepx } std::cout << "]" << std::endl; } + //linearat + //linearat 支持和strides不同dim的indice索引 + //当strides.size() < indices.size()时,indices的前面部分会默认都是unsqueeze的维度,从而被忽略。在repeat方法中,用到。 + //当strides.size() > indices.size()时,strides的后面部分会被忽略不计算 int Shape::linearat(const std::vector &indices) const{ int idx=0; - for(int i=0;i shape.size()) { + indices_i=indices.size()-shape.size(); } + for(;stride_i Shape::linearto(int idx_linear) const{ diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.cpp b/excuter/cpp-common/src/deepx/shape_changeshape.cpp index d3a89d90..c02d2f5c 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.cpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.cpp @@ -142,4 +142,18 @@ namespace deepx } return output_shape; } + + // repeat + std::vector repeatShape(const std::vector &src, const std::vector &repeats) + { + if (src.size() != repeats.size()) { + throw std::invalid_argument("Shape and repeats must have the same number of dimensions"); + } + + std::vector dest(src.size()); + for (size_t i = 0; i < src.size(); ++i) { + dest[i] = src[i] * repeats[i]; + } + return dest; + } } \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 100f408b..11c4b2b5 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -74,6 +74,42 @@ namespace deepx::tensorfunc indexselectDispatcher::indexselect(input, indices, axis, output); } + //repeat + template + struct repeatDispatcher + { + static void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) = delete; + }; + //Repeats:The number of times to repeat this tensor along each 
dimension + // https://docs.pytorch.org/docs/stable/generated/torch.Tensor.repeat.html#torch.Tensor.repeat + template + void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) + { + repeatDispatcher::repeat(A, repeats, B); + } + + //repeat_interleave + template + struct repeat_interleaveDispatcher + { + static void repeat_interleave(const Tensor &A, const int repeats, Tensor &B) = delete; + static void repeat_interleave(const Tensor &A, const Tensor &repeats, Tensor &B) = delete; + }; + template + void repeat_interleave(const Tensor &A, const int repeats, Tensor &B) + { + repeat_interleaveDispatcher::repeat_interleave(A, repeats, B); + } + template + void repeat_interleave(const Tensor &A, const Tensor &repeats, Tensor &B) + { + repeat_interleaveDispatcher::repeat_interleave(A, repeats, B); + } + + + + + // // split // // https://onnx.ai/onnx/operators/onnx__Split.html // template diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index a92a9571..0e04a63b 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -177,6 +177,28 @@ namespace deepx::tensorfunc } }; + // repeat + template + struct repeatDispatcher + { + static void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) + { + auto new_shape = repeatShape(A.shape.shape, repeats); + if (new_shape.empty() || new_shape != B.shape.shape) + { + throw TensorShapeError("Repeat shape mismatch"); + } + B.shape.rangeParallel(B.shape.dim(), [&A,B,&repeats](const int idx, const std::vector &indices, ThreadLocalVectors &tlv) + { + for (size_t i = 0; i < A.shape.dim(); ++i) { + tlv.get(0)[i] = indices[i] / repeats[i]; + } + B.data[idx] = A.data[A.shape.linearat(tlv.get(0))]; + },{B.shape.dim()}); + } + }; + + // template // void split(const Tensor &tensor, const int axis, std::vector *> &results) // { diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 0f64432c..8ffd5af4 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -446,5 +446,69 @@ namespace deepx::tf return 0; }; }; + + // repeat + template + class Repeat : public TF + { + public: + Repeat(const vector &args, const vector &returns) + { + this->name = "repeat"; + this->metadata.author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T1.repeat(repeats=[4,3,2])->T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) + { + return 1; + } + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector new_shape = this->getvector(1, true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + 
broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; } #endif // DEEPX_TF_CHANGESHAPE_HPP \ No newline at end of file diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index d8ac8f44..da04e664 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -30,7 +30,7 @@ "invert", "matmul", "reducemax","reducemin","sum","prod", - "reshape","permute","transpose","concat","broadcastTo","indexselect", + "reshape","permute","transpose","concat","broadcastTo","broadcast_to","indexselect", #functional "relu","sigmoid","swish","silu", diff --git a/front/py/deepx/transformer/models/llama/attention.py b/front/py/deepx/transformer/models/llama/attention.py index 490501e4..325d6a27 100644 --- a/front/py/deepx/transformer/models/llama/attention.py +++ b/front/py/deepx/transformer/models/llama/attention.py @@ -1,6 +1,6 @@ from typing import Optional,Tuple from deepx.nn.modules import Module,Linear -from deepx import Tensor,repeat_kv,matmul,softmax,concat,arange,dropout as dropout_func +from deepx import Tensor,matmul,softmax,concat,arange,dropout as dropout_func @@ -17,7 +17,14 @@ def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_d q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed - + +def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py # 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 diff --git a/front/py/deepx/transformer/models/llama/groupedquery_attention.py b/front/py/deepx/transformer/models/llama/groupedquery_attention.py new file mode 100644 index 00000000..58f56a00 --- /dev/null +++ b/front/py/deepx/transformer/models/llama/groupedquery_attention.py @@ -0,0 +1,12 @@ +from typing import Optional,Tuple +from deepx.nn.modules import Module,Linear +from deepx import Tensor,matmul,softmax,concat,arange,dropout as dropout_func + + +def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + diff --git a/front/py/examples/1_tensor/2_newbig.py 
b/front/py/examples/1_tensor/2_newbig.py index 2d0b4bbf..34c76c09 100644 --- a/front/py/examples/1_tensor/2_newbig.py +++ b/front/py/examples/1_tensor/2_newbig.py @@ -6,7 +6,11 @@ def newtensor(dtype): for i in range(0,20): t=newtensor((1,20,4096),dtype=dtype) # t.print() - +a=None +def f(): + a="hello" +f() +print(a) if __name__ == "__main__": args=sys.argv[1:] diff --git a/front/py/examples/1_tensor/getitem.py b/front/py/examples/1_tensor/getitem.py new file mode 100644 index 00000000..c0746762 --- /dev/null +++ b/front/py/examples/1_tensor/getitem.py @@ -0,0 +1,14 @@ + +def deepx_getitem(): + from deepx import newtensor + t=newtensor((2,3,4)).full_(1) + t2=t[None, :, None] + t2.print() +def torch_getitem(): + import torch + t=torch.full((2,3,4),1) + t2=t[None, :, None] + print(t2) +if __name__ == "__main__": + deepx_getitem() + torch_getitem() \ No newline at end of file diff --git a/front/py/examples/2_ir/4_changeshape_broadcast.py b/front/py/examples/2_ir/4_changeshape_broadcast.py index 15ceb34f..3c8f77cd 100644 --- a/front/py/examples/2_ir/4_changeshape_broadcast.py +++ b/front/py/examples/2_ir/4_changeshape_broadcast.py @@ -2,10 +2,13 @@ print() import torch -a=torch.arange(4*2*3).reshape(4,2,3) -b=torch.arange(2*1).reshape(2,1) -bb_torch = torch.broadcast_to(b, (4,2,3)) +torch_a=torch.arange(4*2*3).reshape(4,2,3) +torch_b=torch.arange(2*1).reshape(2,1) +bb_torch = torch.broadcast_to(torch_b, (4,2,3)) print(bb_torch) +torch_a[None:,] + + ########====DEEPX====######## from deepx import Tensor,arange,broadcastTo @@ -15,6 +18,5 @@ bb=b.broadcastTo( a.shape,out="b.broadcasted") bb.print() - - +c=a[None:,] diff --git a/front/py/examples/2_ir/4_changeshape_broadcast_add.py b/front/py/examples/2_ir/4_changeshape_broadcast_add.py index 229f1332..7ebd743b 100644 --- a/front/py/examples/2_ir/4_changeshape_broadcast_add.py +++ b/front/py/examples/2_ir/4_changeshape_broadcast_add.py @@ -1,5 +1,5 @@ ########====DEEPX====######## -from deepx import Tensor,ones +from deepx import ones a=ones( 4,2,3 ,name="a") b=ones( 2,1 ,name='b') diff --git a/front/py/examples/2_ir/changeshape_repeat.py b/front/py/examples/2_ir/changeshape_repeat.py new file mode 100644 index 00000000..a915ca7a --- /dev/null +++ b/front/py/examples/2_ir/changeshape_repeat.py @@ -0,0 +1,18 @@ +import torch + +# 正确:repeats为一维张量 +x = torch.tensor([[1, 2], [3, 4]]) +repeats = torch.tensor([1, 2]) # 一维张量 +torch.repeat_interleave(x, repeats, dim=0) +# 输出: +# tensor([[1, 2], +# [3, 4], +# [3, 4]]) + +# 错误:repeats为二维张量 +repeats_2d = torch.tensor([[1, 2], [3, 4]]) # 二维张量 +try: + torch.repeat_interleave(x, repeats_2d, dim=0) +except RuntimeError as e: + print(f"错误: {e}") +# 输出: \ No newline at end of file diff --git a/front/py/examples/3_functional/changeshape_broadcast.py b/front/py/examples/3_functional/changeshape_broadcast.py new file mode 100644 index 00000000..87210412 --- /dev/null +++ b/front/py/examples/3_functional/changeshape_broadcast.py @@ -0,0 +1,23 @@ + +print() +#######-----------------torch-----------------####### +import torch +torch_x = torch.arange(6).reshape(1,2,3) # shape=(2,3) +torch_y = torch_x.broadcast_to((3,2,3)) # 需要原维度为1 +print(torch_y) + +torch_x2=torch_x.repeat_interleave(dim=0, repeats=3) +print(torch_x2) + + +#######-----------------deepx-----------------####### +from deepx import Tensor,broadcast_to,arange +deepx_x = arange(0,6).reshape_((1,2,3)) # shape=(2,3) +deepx_y = broadcast_to(deepx_x, (3,2,3)) # 需要原维度为1 +deepx_y.print() + + + + + + diff --git a/front/py/examples/4_transformer/llama/llama_ 
b/front/py/examples/4_transformer/llama/llama_ new file mode 100644 index 00000000..e69de29b From 19adfc8c321463a286d78d22a8beecd43114b367 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 1 Jun 2025 22:56:16 +0800 Subject: [PATCH 2/3] =?UTF-8?q?repeat:ompsimd=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E5=AE=8C=E6=88=90=EF=BC=8C=E5=BE=85=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/excuter/op-mem-ompsimd/list.md | 127 +++++++++--------- .../src/deepx/shape_changeshape.hpp | 3 + excuter/op-mem-ompsimd/src/client/tfs.cpp | 10 ++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 5 +- .../src/deepx/tf/changeshape.hpp | 126 ++++++++--------- front/py/deepx/nn/functional/__init__.py | 2 +- front/py/deepx/nn/functional/authormap.py | 1 + .../nn/functional/leaffunc_changeshape.py | 13 ++ .../py/deepx/nn/functional/rtf_changeshape.py | 8 ++ front/py/deepx/tensor/changeshape.py | 6 + front/py/deepx/tensor/shape.py | 7 +- .../py/examples/2_ir/4_changeshape_repeat.py | 17 +++ 12 files changed, 195 insertions(+), 130 deletions(-) create mode 100644 front/py/examples/2_ir/4_changeshape_repeat.py diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index e259031c..3ec2089e 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -2,106 +2,107 @@ 本页面由 `excuter/op-mem-ompsimd 生成,请勿手动修改 -### arg +### matmul | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| matmul | cblas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | +| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | -### tensorlife +### init | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | -| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector:shape)->(tensor:t) | -| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var:shape)->(tensor:t) | -| deltensor | none | del->T1 | deltensor()->(tensor:t) | -| copytensor | none | T1.data->T2.data | copytensor(tensor:src)->(tensor:dst) | +| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | +| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:std, var:seed)->(tensor:t) | ### io | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| loadtensordata | none | loadtensordata(path)->tensor.data | loadtensordata(var:path)->(tensor:t) | -| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| load | none | mem.load(path) | load(var:path)->() | | print | miaobyte | print(T1) | print(tensor:t)->() | | print | miaobyte | print(T1) | print(tensor:t, var:format)->() | -| load | none | mem.load(path) | load(var:path)->() | +| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| loadtensordata | none | loadtensordata(path)->tensor.data | loadtensordata(var:path)->(tensor:t) | -### matmul 
+### arg | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| matmul | cblas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | -| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | +| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -### init +### tensorlife | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:std, var:seed)->(tensor:t) | -| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | -| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | -| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | -| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| copytensor | none | T1.data->T2.data | copytensor(tensor:src)->(tensor:dst) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector:shape)->(tensor:t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var:shape)->(tensor:t) | +| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | +| deltensor | none | del->T1 | deltensor()->(tensor:t) | ### elementwise | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| switch | miaobyte | C=switch([tensors],case) | switch(listtensor:tensors, tensor:cases)->(tensor:C) | -| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | -| notequal | miaobyte | notequal(T1,T2)->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | -| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor:A, var:scalar, var:eposilon)->(tensor:mask) | -| min | miaobyte | T3=min(T1,T2) | min(tensor:A, tensor:B)->(tensor:C) | -| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | -| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | -| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | -| less | miaobyte | mask=less(T1,T2) | less(tensor:A, tensor:B)->(tensor:mask) | -| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor:A, var:scalar)->(tensor:C) | -| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:a)->(tensor:c) | -| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor:A, var:scalar)->(tensor:C) | -| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | -| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:a, var:scalar)->(tensor:c) | +| pow | miaobyte | T3=T1^T2 | pow(tensor:A, tensor:B)->(tensor:C) | +| max | miaobyte | T3=max(T1,T2) | max(tensor:A, tensor:B)->(tensor:C) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | +| equal | miaobyte | equal(T1,T2)->mask | equal(tensor:A, tensor:B, var:eposilon)->(tensor:mask) | +| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | +| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:a, var:scalar)->(tensor:c) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | +| sub | miaobyte | T3=T1-T2 | sub(tensor:a, tensor:b)->(tensor:c) | +| add | cblas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| add | miaobyte | T3=T1+T2 | add(tensor:a, 
tensor:b)->(tensor:c) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:A)->(tensor:C) | +| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | +| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var:scalar, tensor:A)->(tensor:C) | +| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | | greater | miaobyte | mask=greater(T1,T2) | greater(tensor:A, tensor:B)->(tensor:mask) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:a, var:scalar)->(tensor:c) | +| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | +| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor:A, var:scalar)->(tensor:C) | | lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor:A, var:scalar)->(tensor:mask) | -| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | +| less | miaobyte | mask=less(T1,T2) | less(tensor:A, tensor:B)->(tensor:mask) | | notequalscalar | miaobyte | mask=notequal(T1,scalar) | notequalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:a)->(tensor:c) | +| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | | minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor:A, var:scalar)->(tensor:C) | -| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var:scalar, tensor:A)->(tensor:C) | -| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | -| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:A)->(tensor:C) | -| add | cblas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| sub | miaobyte | T3=T1-T2 | sub(tensor:a, tensor:b)->(tensor:c) | -| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | -| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:a, var:scalar)->(tensor:c) | -| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | -| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | -| equal | miaobyte | equal(T1,T2)->mask | equal(tensor:A, tensor:B, var:eposilon)->(tensor:mask) | -| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | -| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | -| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | -| max | miaobyte | T3=max(T1,T2) | max(tensor:A, tensor:B)->(tensor:C) | -| pow | miaobyte | T3=T1^T2 | pow(tensor:A, tensor:B)->(tensor:C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | +| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | +| min | miaobyte | T3=min(T1,T2) | min(tensor:A, tensor:B)->(tensor:C) | +| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | +| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor:A, var:scalar, var:eposilon)->(tensor:mask) | +| notequal | miaobyte | notequal(T1,T2)->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | +| switch | miaobyte | C=switch([tensors],case) | switch(listtensor:tensors, tensor:cases)->(tensor:C) | +| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor:A, var:scalar)->(tensor:C) | -### reduce +### changeshape | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) 
| prod(tensor:A, vector:axis, var:keepdims)->(tensor:B) | -| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:axis, var:keepdims)->(tensor:B) | -| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:axis, var:keepdims)->(tensor:B) | -| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor:A, vector:dim_order)->(tensor:C) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | +| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor:A, tensor:index, var:axis)->(tensor:B) | +| repeat | miaobyte | T1.repeat(repeats=[4,3,2])->T2 | repeat(tensor:A, vector:repeats)->(tensor:B) | -### changeshape +### reduce | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor:A, tensor:index, var:axis)->(tensor:B) | -| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | -| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | -| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor:A, vector:dim_order)->(tensor:C) | -| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:axis, var:keepdims)->(tensor:B) | diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.hpp b/excuter/cpp-common/src/deepx/shape_changeshape.hpp index 71cbcb63..65a36670 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.hpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.hpp @@ -73,5 +73,8 @@ namespace deepx //indexselect vector indexselectShape(const vector &input_shape, const vector &index_shape, const int axis); + + //repeat + std::vector repeatShape(const std::vector &src, const std::vector &repeats); } #endif // DEEPX_SHAPE_CHANGESHAPE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 04b09aeb..e0d9af05 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -616,6 +616,16 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // repeat author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("repeats", DataCategory::Vector, Precision::Int32) + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); } // // reduce void register_reduce(TfFactory 
&tffactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 0e04a63b..901ac913 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -188,12 +188,13 @@ namespace deepx::tensorfunc { throw TensorShapeError("Repeat shape mismatch"); } - B.shape.rangeParallel(B.shape.dim(), [&A,B,&repeats](const int idx, const std::vector &indices, ThreadLocalVectors &tlv) + B.shape.rangeParallel(B.shape.dim(), [&A,&B,&repeats](const int idx, const std::vector &indices, ThreadLocalVectors &tlv) { for (size_t i = 0; i < A.shape.dim(); ++i) { tlv.get(0)[i] = indices[i] / repeats[i]; } - B.data[idx] = A.data[A.shape.linearat(tlv.get(0))]; + int idx_A=A.shape.linearat(tlv.get(0)); + B.data[idx] = A.data[idx_A]; },{B.shape.dim()}); } }; diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 8ffd5af4..1fc44285 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -109,7 +109,7 @@ namespace deepx::tf return 1; } Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - vector dim_order = this->getvector(1,true); + vector dim_order = this->getvector(1, true); Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (input_type != output_type) { @@ -448,67 +448,67 @@ namespace deepx::tf }; // repeat - template - class Repeat : public TF - { - public: - Repeat(const vector &args, const vector &returns) - { - this->name = "repeat"; - this->metadata.author = Author::name(); - this->tftype = "changeshape"; - this->args = args; - this->returns = returns; - } - - string math_formula() const override - { - return "T1.repeat(repeats=[4,3,2])->T2"; - } - shared_ptr clone() const override - { - return make_shared>(*this); - } - int run(shared_ptr mem, string &error) override - { - if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) - { - return 1; - } - Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - vector new_shape = this->getvector(1, true); - Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (input_type != output_type) - { - error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); - return 1; - } - switch (input_type) - { - case Precision::Float64: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Float32: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int64: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - default: - error = "Unsupported type: " + precision_str(input_type); - 
return 1; - } - return 0; - } - }; + template + class Repeat : public TF + { + public: + Repeat(const vector &args, const vector &returns) + { + this->name = "repeat"; + this->metadata.author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T1.repeat(repeats=[4,3,2])->T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) + { + return 1; + } + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector repeats = this->getvector(1, true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; } #endif // DEEPX_TF_CHANGESHAPE_HPP \ No newline at end of file diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index da04e664..3f5d33cf 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -30,7 +30,7 @@ "invert", "matmul", "reducemax","reducemin","sum","prod", - "reshape","permute","transpose","concat","broadcastTo","broadcast_to","indexselect", + "reshape","permute","transpose","concat","broadcastTo","broadcast_to","indexselect",'repeat', #functional "relu","sigmoid","swish","silu", diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 696a8d5d..6e8bc602 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -54,6 +54,7 @@ 'broadcastTo':'miaobyte', 'concat':'miaobyte', 'indexselect':'miaobyte', + 'repeat':'miaobyte', #matmul # 'matmul':'miaobyte', 'matmul':'cublas', diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index a69f92d6..2da83802 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -93,6 +93,19 @@ def indexselect(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str] rtf_indexselect(input,indices,gatheraxis,outtensor,defaultauthor['indexselect']) return outtensor +def repeat(input:Tensor,repeats:tuple[int,...],out:Union[Tensor,str]=''): + assert isinstance(repeats,tuple) + assert 
input.Shape.ndim==len(repeats) + for i in repeats: + assert isinstance(i,int) and i>0 + outtensor=out + if isinstance(out,str) or out is None: + outshape=Shape.repeatshape(input.shape,repeats) + outtensor=newtensor(outshape,dtype=input.dtype,name=out) + from .rtf_changeshape import rtf_repeat + rtf_repeat(input,repeats,outtensor,defaultauthor['repeat']) + return outtensor + # def unsqueeze(t:Tensor,dim:int)->Tensor: # # 确保dim是有效的 # if dim < -t.ndim-1 or dim > t.ndim: diff --git a/front/py/deepx/nn/functional/rtf_changeshape.py b/front/py/deepx/nn/functional/rtf_changeshape.py index 37b38bad..7c0b144b 100644 --- a/front/py/deepx/nn/functional/rtf_changeshape.py +++ b/front/py/deepx/nn/functional/rtf_changeshape.py @@ -35,3 +35,11 @@ def rtf_indexselect(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miao ir=DeepxIR("indexselect", args, returns,author) send(ir) +def rtf_repeat(input:Tensor,repeats:tuple[int,...],out:Tensor,author='miaobyte'): + assert isinstance(repeats,tuple) + for i in repeats: + assert isinstance(i,int) and i>0 + args=[Param.tensor(input),Param.vector(repeats,'int32')] + returns=[Param.tensor(out)] + ir=DeepxIR("repeat", args, returns,author) + send(ir) \ No newline at end of file diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py index f359c2de..39b415ad 100644 --- a/front/py/deepx/tensor/changeshape.py +++ b/front/py/deepx/tensor/changeshape.py @@ -74,6 +74,12 @@ def unsqueeze(self,dim:int)->Tensor: result=unsqueeze_func(self,dim) return result +@tensor_method +def repeat(self,repeats:tuple[int,...])->Tensor: + from deepx.nn.functional import repeat as repeat_func + result=repeat_func(self,repeats) + return result + # @tensor_method # def expand(self,shape:tuple)->Tensor: # from deepx.nn.functional import expand as expand_func diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py index cb0d3c8e..1cd165c0 100644 --- a/front/py/deepx/tensor/shape.py +++ b/front/py/deepx/tensor/shape.py @@ -199,4 +199,9 @@ def save(self,path:str): with open(path, 'w') as f: yaml.dump({'shape': list(self.shape), 'dtype': self._dtype,'size':self.numel(),'dim':self.ndim,'stride':list(self.stride)}, f) else: - raise ValueError("文件名必须以.shape结尾") \ No newline at end of file + raise ValueError("文件名必须以.shape结尾") + + @classmethod + def repeatshape(cls,input_shape:tuple[int,...],repeat:tuple[int,...])->tuple[int,...]: + assert len(input_shape)== len(repeat) + return tuple(d * r for d, r in zip(input_shape, repeat)) diff --git a/front/py/examples/2_ir/4_changeshape_repeat.py b/front/py/examples/2_ir/4_changeshape_repeat.py new file mode 100644 index 00000000..6bdcbd8f --- /dev/null +++ b/front/py/examples/2_ir/4_changeshape_repeat.py @@ -0,0 +1,17 @@ +print() +############-------PyTorch-------################ + +import torch +torch_t1 = torch.arange(60, dtype=torch.float32).reshape(3, 4,5) +print(torch_t1) +torch_t2=torch_t1.repeat([1,2,3]) +print(torch_t2) + + +############-------Deepx-------################ + +from deepx import arange +t1 = arange(0,60).reshape_((3, 4,5)) +t1.print() +t2=t1.repeat((1,2,3)) +t2.print() \ No newline at end of file From d39e7fedb7fda5a150dcc04f5ae0b32c3dcb1133 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Mon, 2 Jun 2025 15:43:08 +0800 Subject: [PATCH 3/3] =?UTF-8?q?repeat:cuda=20=E5=AE=9E=E7=8E=B0=E5=AE=8C?= =?UTF-8?q?=E6=88=90=EF=BC=8C=E5=AE=8C=E6=88=90=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
doc/excuter/op-mem-cuda/list.md | 125 +++++++++--------- excuter/op-mem-cuda/src/client/tfs.cpp | 11 ++ .../deepx/tensorfunc/changeshape_miaobyte.cu | 117 +++++++++++++++- .../deepx/tensorfunc/changeshape_miaobyte.cuh | 16 +++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 19 ++- .../tensorfunc/elementwise_miaobyte_basic.cu | 1 + .../op-mem-cuda/src/deepx/tf/changeshape.hpp | 68 ++++++++++ .../src/deepx/tf/elementwise_basic.hpp | 1 - front/py/deepx/scheduler/client/udpconn.py | 2 +- 9 files changed, 294 insertions(+), 66 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index e33b2df3..69966acf 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -2,105 +2,106 @@ 本页面由 `excuter/op-mem-cuda 生成,请勿手动修改 -### arg +### matmul | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| matmul | cublas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | -### tensorlife +### init | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | -| newtensor | none | T1 = zeros(shape) | newtensor(vector:shape)->(tensor:tensor1) | -| newtensor | none | T1 = zeros(shape) | newtensor(var:shape)->(tensor:tensor1) | -| deltensor | none | del->T1 | deltensor()->(tensor:t) | -| copytensor | none | T2.data = T1.data | copytensor(tensor:src)->(tensor:dst) | +| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | +| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:stddev, var:seed)->(tensor:t) | ### io | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var:path)->(tensor:t) | -| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| load | none | load(path) | load(var:path)->() | | print | miaobyte | print(T1) | print(tensor:t)->() | | print | miaobyte | print(T1) | print(tensor:t, var:format)->() | -| load | none | load(path) | load(var:path)->() | +| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var:path)->(tensor:t) | -### matmul +### arg | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| matmul | cublas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | +| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -### init +### tensorlife | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:stddev, var:seed)->(tensor:t) | -| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | -| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, 
var:seed)->(tensor:t) | -| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | -| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| copytensor | none | T2.data = T1.data | copytensor(tensor:src)->(tensor:dst) | +| newtensor | none | T1 = zeros(shape) | newtensor(vector:shape)->(tensor:tensor1) | +| newtensor | none | T1 = zeros(shape) | newtensor(var:shape)->(tensor:tensor1) | +| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | +| deltensor | none | del->T1 | deltensor()->(tensor:t) | ### elementwise | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor:tensors, tensor:cases)->(tensor:result) | -| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | -| notequal | miaobyte | T1!=T2->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | -| equalscalar | miaobyte | T1==scalar->mask | equalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | -| min | miaobyte | T3=min(T1, T2) | min(tensor:A, tensor:B)->(tensor:C) | -| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | -| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | -| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | -| less | miaobyte | mask=compare(T1, T2) | less(tensor:A, tensor:B)->(tensor:mask) | -| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor:A, var:scalar)->(tensor:C) | -| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:A)->(tensor:C) | -| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor:A, var:scalar)->(tensor:C) | -| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | -| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:A, var:b)->(tensor:C) | +| pow | miaobyte | T3=pow(T1, T2) | pow(tensor:A, tensor:B)->(tensor:C) | +| max | miaobyte | T3=max(T1, T2) | max(tensor:A, tensor:B)->(tensor:C) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | +| equal | miaobyte | T1==T2->mask | equal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | +| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:A, var:b)->(tensor:C) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | +| sub | miaobyte | T3=T1-T2 | sub(tensor:A, tensor:B)->(tensor:C) | +| add | cublas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:a)->(tensor:b) | +| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | +| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var:scalar, tensor:A)->(tensor:C) | +| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | | greater | miaobyte | mask=compare(T1, T2) | greater(tensor:A, tensor:B)->(tensor:mask) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:A, var:b)->(tensor:C) | +| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | +| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor:A, var:scalar)->(tensor:C) | | lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor:A, 
var:scalar)->(tensor:mask) | -| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | +| less | miaobyte | mask=compare(T1, T2) | less(tensor:A, tensor:B)->(tensor:mask) | | notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:A)->(tensor:C) | +| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | | minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor:A, var:scalar)->(tensor:C) | -| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var:scalar, tensor:A)->(tensor:C) | -| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | -| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:a)->(tensor:b) | -| add | cublas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| sub | miaobyte | T3=T1-T2 | sub(tensor:A, tensor:B)->(tensor:C) | -| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | -| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:A, var:b)->(tensor:C) | -| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | -| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | -| equal | miaobyte | T1==T2->mask | equal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | -| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | -| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | -| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | -| max | miaobyte | T3=max(T1, T2) | max(tensor:A, tensor:B)->(tensor:C) | -| pow | miaobyte | T3=pow(T1, T2) | pow(tensor:A, tensor:B)->(tensor:C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | +| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | +| min | miaobyte | T3=min(T1, T2) | min(tensor:A, tensor:B)->(tensor:C) | +| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | +| equalscalar | miaobyte | T1==scalar->mask | equalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| notequal | miaobyte | T1!=T2->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | +| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor:tensors, tensor:cases)->(tensor:result) | +| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor:A, var:scalar)->(tensor:C) | -### reduce +### changeshape | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:dims, var:keepdims)->(tensor:B) | -| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:dims, var:keepdims)->(tensor:B) | -| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:dims, var:keepdims)->(tensor:B) | -| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor:A, vector:dim_order)->(tensor:C) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | +| broadcastTo | miaobyte | T2 = 
T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | +| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor:A, tensor:indices, var:axis)->(tensor:B) | +| repeat | miaobyte | T2 = T1.repeat(repeats=[3 4 5]) | repeat(tensor:A, vector:repeats)->(tensor:B) | -### changeshape +### reduce | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor:A, tensor:indices, var:axis)->(tensor:B) | -| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | -| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | -| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor:A, vector:dim_order)->(tensor:C) | -| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:dims, var:keepdims)->(tensor:B) | diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 85fd5be6..ef197b68 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -571,6 +571,17 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // repeat + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("repeats", DataCategory::Vector, Precision::Int32), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); + } // reduce void register_reduce(TfFactory &tffactory) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index 6cd97295..aeb871c6 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -103,6 +103,7 @@ namespace deepx::tensorfunc template void launch_transpose(const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose(const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose<__half>(const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose(const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose(const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, 
const int len, const int *dimOrder); @@ -546,6 +547,120 @@ namespace deepx::tensorfunc const int32_t *index, const int *indexStrides, const int indexDim, const int gatherAxis, int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); -} + + + //repeat + template + __global__ void repeat_kernel( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides,const int outputlen,const int dim){ + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < outputlen; thread_id += grid_stride) + { + // 输出索引 + int output_indices[DIM]; + linearTo(outputStrides, dim, output_indices, thread_id); + + // 输入索引 + int input_indices[DIM]; + for (int i = 0; i < dim; ++i) + { + input_indices[i] = output_indices[i] / repeats[i]; + } + int inputIdx = linearAt(inputStrides, dim, input_indices); + int outputIdx = linearAt(outputStrides, dim, output_indices); + output[outputIdx] = input[inputIdx]; + } + } + + + template + void launch_repeat( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides, const int outputlen,const int dim){ + + + auto [numBlocks, blockSize] = BestDims(outputlen); + // input + cudaVector inputStrides_d(inputStrides, dim, cudaMemcpyHostToDevice); + // output + cudaVector outputStrides_d(outputStrides, dim, cudaMemcpyHostToDevice); + // repeats + cudaVector repeats_d(repeats, dim, cudaMemcpyHostToDevice); + + switch (dim) + { + case 1: + repeat_kernel<1, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 2: + repeat_kernel<2, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 3: + repeat_kernel<3, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 4: + repeat_kernel<4, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 5: + repeat_kernel<5, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 6: + repeat_kernel<6, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 7: + repeat_kernel<7, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 8: + repeat_kernel<8, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 9: + repeat_kernel<9, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 10: + repeat_kernel<10, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 11: + repeat_kernel<11, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 12: + repeat_kernel<12, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + default: + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + } + + template void launch_repeat(const double *input, const int *inputStrides, + const int *repeats, + double *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const float *input, const int *inputStrides, + const int *repeats, + float 
*output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const nv_bfloat16 *input, const int *inputStrides, + const int *repeats, + nv_bfloat16 *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat<__half>(const __half *input, const int *inputStrides, + const int *repeats, + __half *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int64_t *input, const int *inputStrides, + const int *repeats, + int64_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int32_t *input, const int *inputStrides, + const int *repeats, + int32_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int16_t *input, const int *inputStrides, + const int *repeats, + int16_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int8_t *input, const int *inputStrides, + const int *repeats, + int8_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const bool *input, const int *inputStrides, + const int *repeats, + bool *output, const int *outputStrides, const int outputlen,const int dim); +}// namespace deepx #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh index 26c40851..d3845ee4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ -65,5 +65,21 @@ namespace deepx::tensorfunc const GatherAxisT *indices,const int *indicesStrides,const int indicesDim, const int gatherAxis, T *output,const int *outputStrides,const int outputDim,const int outputlen); + + + // repeat + template + __global__ void repeat_kernel( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides, const int outputlen, + const int dim); + + template + void launch_repeat( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides, const int outputlen, + const int dim); }; #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 922c8ddd..818e4bab 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -117,7 +117,7 @@ namespace deepx::tensorfunc } }; - //indexselect + //indexselectmoe_infer template struct indexselectDispatcher { @@ -135,5 +135,22 @@ namespace deepx::tensorfunc output.data,output.shape.strides.data(),output.shape.dim(),output.shape.size); } }; + + //repeat + template + struct repeatDispatcher + { + static void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) + { + auto new_shape = repeatShape(A.shape.shape, repeats); + if (new_shape.empty() || new_shape != B.shape.shape) + { + throw TensorShapeError("Repeat shape mismatch"); + } + launch_repeat(A.data, A.shape.strides.data(), + repeats.data(), + B.data, B.shape.strides.data(),B.shape.size, B.shape.dim()); + } + }; } #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end 
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
index f48ca7b2..9392608d 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -32,6 +32,7 @@ namespace deepx::tensorfunc
     template void launch_todtype(const double *a, float *c, const int size);
     template void launch_todtype(const double *a, half *c, const int size);
     template void launch_todtype(const double *a, nv_bfloat16 *c, const int size);
+    //template void launch_todtype(const double *a, int64_t *c, const int size);
     template void launch_todtype(const double *a, int64_t *c, const int size);
     template void launch_todtype(const double *a, int32_t *c, const int size);
     template void launch_todtype(const double *a, int16_t *c, const int size);
diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
index db447136..b1d9ef8b 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
@@ -490,5 +490,73 @@ namespace deepx::tf
         }
     };
 
+    //repeat
+    template <typename Author>
+    class Repeat : public TF
+    {
+    public:
+        Repeat(const vector &args, const vector &returns)
+        {
+            this->name = "repeat";
+            this->metadata.author = Author::name();
+            this->tftype = "changeshape";
+            this->args = args;
+            this->returns = returns;
+        }
+
+        string math_formula() const override
+        {
+            return "T2 = T1.repeat(repeats=[3 4 5])";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Repeat<Author>>(*this);
+        }
+        int run(shared_ptr mem, string &error) override
+        {
+            Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            vector<int> repeats = this->getvector<int>(1);
+            Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (input_type != output_type)
+            {
+                error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type);
+                return 1;
+            }
+            switch (input_type)
+            {
+            case Precision::Float64:
+                repeat<Author>(*mem->gettensor<double>(this->args[0].textvalue), repeats, *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                repeat<Author>(*mem->gettensor<float>(this->args[0].textvalue), repeats, *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Float16:
+                repeat<Author>(*mem->gettensor<half>(this->args[0].textvalue), repeats, *mem->gettensor<half>(this->returns[0].textvalue));
+                break;
+            case Precision::BFloat16:
+                repeat<Author>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), repeats, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                repeat<Author>(*mem->gettensor<int64_t>(this->args[0].textvalue), repeats, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                repeat<Author>(*mem->gettensor<int32_t>(this->args[0].textvalue), repeats, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                repeat<Author>(*mem->gettensor<int16_t>(this->args[0].textvalue), repeats, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                repeat<Author>(*mem->gettensor<int8_t>(this->args[0].textvalue), repeats, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Bool:
+                repeat<Author>(*mem->gettensor<bool>(this->args[0].textvalue), repeats, *mem->gettensor<bool>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported type: " + precision_str(input_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
 };
 #endif // DEEPX_TF_CHANGESHAPE_HPP
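`Repeat<Author>::run` follows the same dtype-dispatch idiom as the other changeshape ops: read the runtime `Precision` tag, check that input and output agree, then branch to a concrete template instantiation per type. A stripped-down sketch of that idiom in plain C++ follows; the enum values, `repeat_impl`, and `dispatch_repeat` are illustrative names, not the project's API.

```cpp
#include <cstdio>
#include <string>

enum class Precision { Float32, Int32 };

// Toy typed body: each input element is written r times consecutively.
template <typename T>
void repeat_impl(const T *in, T *out, int n, int r)
{
    for (int i = 0; i < n * r; ++i)
        out[i] = in[i / r];
}

// Resolve the runtime tag to a compile-time type, as Repeat::run does per Precision case.
int dispatch_repeat(Precision p, const void *in, void *out, int n, int r, std::string &error)
{
    switch (p)
    {
    case Precision::Float32:
        repeat_impl(static_cast<const float *>(in), static_cast<float *>(out), n, r);
        return 0;
    case Precision::Int32:
        repeat_impl(static_cast<const int *>(in), static_cast<int *>(out), n, r);
        return 0;
    default:
        error = "Unsupported type";
        return 1;
    }
}

int main()
{
    float in[3] = {1.f, 2.f, 3.f}, out[6];
    std::string err;
    dispatch_repeat(Precision::Float32, in, out, 3, 2, err);
    for (float v : out)
        std::printf("%g ", v); // 1 1 2 2 3 3
    std::printf("\n");
    return 0;
}
```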
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
index c0472001..fca52981 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
@@ -9,7 +9,6 @@ namespace deepx::tf
 {
-
     // todtype
     class Todtype : public TF
     {
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index a25b0963..6a12c26a 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@ import select
 
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # parse the endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)