From b7f038aff5fa84233dfcb8d9c3d1081c89548d25 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Tue, 22 Apr 2025 17:45:58 +0800
Subject: [PATCH 1/5] =?UTF-8?q?benchmark.IR:=E6=94=AF=E6=8C=81=E5=AF=B9?=
 =?UTF-8?q?=E7=AE=97=E5=AD=90=E8=BF=9B=E8=A1=8Cbenchmark?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/excuter/op-mem-ompsimd/list.md            |   2 +-
 excuter/cpp-common/src/client/udpserver.cpp   |   2 +-
 excuter/cpp-common/src/deepx/tf/tf.cpp        | 144 +++++++++++-------
 excuter/cpp-common/src/deepx/tf/tf.hpp        |  35 +++--
 excuter/cpp-common/src/deepx/tf/tffactory.cpp |  19 ++-
 excuter/cpp-common/src/deepx/tf/tffactory.hpp |   6 +-
 excuter/op-mem-cuda/src/client/main.cpp       |   4 +-
 excuter/op-mem-cuda/src/deepx/tf/arg.hpp      |   2 -
 .../op-mem-cuda/src/deepx/tf/changeshape.hpp  |  10 +-
 .../src/deepx/tf/elementwise_basic.hpp        |  20 +--
 .../src/deepx/tf/elementwise_compare.hpp      |  22 +--
 .../src/deepx/tf/elementwise_sin.hpp          |   6 +-
 .../src/deepx/tf/elementwise_sqrt.hpp         |  12 +-
 excuter/op-mem-cuda/src/deepx/tf/init.hpp     |   8 +-
 excuter/op-mem-cuda/src/deepx/tf/io.hpp       |   2 +-
 excuter/op-mem-cuda/src/deepx/tf/matmul.hpp   |   2 +-
 excuter/op-mem-cuda/src/deepx/tf/reduce.hpp   |   8 +-
 excuter/op-mem-ompsimd/src/client/main.cpp    |   4 +-
 .../src/deepx/tf/changeshape.hpp              |  10 +-
 .../src/deepx/tf/elementwise.hpp              |  60 ++++----
 excuter/op-mem-ompsimd/src/deepx/tf/init.hpp  |   8 +-
 excuter/op-mem-ompsimd/src/deepx/tf/io.hpp    |   2 +-
 .../op-mem-ompsimd/src/deepx/tf/matmul.hpp    |  37 +++--
 .../op-mem-ompsimd/src/deepx/tf/reduce.hpp    |   8 +-
 front/py/deepx/nn/deepxir.py                  |  56 +++++--
 .../py/deepx/nn/functional/leaffunc_matmul.py |   4 +-
 front/py/deepx/nn/functional/rtf_matmul.py    |   4 +-
 front/py/deepx/scheduler/client/udpconn.py    |   2 +-
 front/py/examples/2_ir/3_matmul.py            |  45 ++++--
 29 files changed, 333 insertions(+), 211 deletions(-)
diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
index 66a03786..8ca0e1d6 100644
--- a/doc/excuter/op-mem-ompsimd/list.md
+++ b/doc/excuter/op-mem-ompsimd/list.md
@@ -25,7 +25,7 @@
 | save |  none  | save(tensor<any> t, var<string> path)->() | save(T1,path) | save(tensor<any> t, var<string> path)->() |
 | print | miaobyte | print(tensor<any> t)->() | print(T1) | print(tensor<any> t)->() |
 | print | miaobyte | print(tensor<any> t, var<string> format)->() | print(T1) | print(tensor<any> t, var<string> format)->() |
-| load |  none  | load(var<string> path)->() | load(path) | load(var<string> path)->() |
+| load |  none  | load(var<string> path)->() | mem.load(path) | load(var<string> path)->() |
 
 ### init
 
diff --git a/excuter/cpp-common/src/client/udpserver.cpp b/excuter/cpp-common/src/client/udpserver.cpp
index 664083be..44759d42 100644
--- a/excuter/cpp-common/src/client/udpserver.cpp
+++ b/excuter/cpp-common/src/client/udpserver.cpp
@@ -51,7 +51,7 @@ namespace client
             while (getline(ss, line)) {
                 if (!line.empty()) {
                     deepx::tf::TF tf;
-                    tf.recv_at = chrono::system_clock::now();
+                    tf.metadata.recv_at = chrono::system_clock::now();
                     tf.parse(line);
                     queue.push(tf);
                 }
diff --git a/excuter/cpp-common/src/deepx/tf/tf.cpp b/excuter/cpp-common/src/deepx/tf/tf.cpp
index d8bce199..b6ffa66f 100644
--- a/excuter/cpp-common/src/deepx/tf/tf.cpp
+++ b/excuter/cpp-common/src/deepx/tf/tf.cpp
@@ -1,17 +1,17 @@
 #include <iomanip>
 #include <sstream>
 #include <ctime>
+#include <map>
 
 #include "deepx/tf/tf.hpp"
 #include "stdutil/time.hpp"
 #include "stdutil/string.hpp"
 namespace deepx::tf
-{   
-
+{
 
     void Param::parse(const string &param)
     {
-        //1. 按:分割类型和值
+        // 1. 按:分割类型和值
         size_t colon_pos = param.find(':');
         string type, textvalue;
         if (colon_pos != string::npos)
@@ -36,8 +36,88 @@ namespace deepx::tf
             this->dtype = deepx::dtype(textvalue);
             this->textvalue = textvalue;
         }
-        
-    }   
+    }
+
+    // Render the populated metadata fields as "key=value " tokens.
+    // Each emitted token is followed by a single space.
+    string TFMetadata::to_string() const
+    {
+        // A timestamp equal to time_point::min() is skipped.
+        const auto unset = system_clock::time_point::min();
+        // Milliseconds elapsed between the clock's epoch and tp.
+        const auto epoch_ms = [](const system_clock::time_point &tp)
+        {
+            return duration_cast<milliseconds>(tp.time_since_epoch()).count();
+        };
+
+        stringstream out;
+        if (!author.empty())
+            out << "author=" << author << " ";
+        if (id > 0) // non-positive ids are not printed
+            out << "id=" << id << " ";
+        if (created_at != unset)
+            out << "created_at=" << epoch_ms(created_at) << " ";
+        if (sent_at != unset)
+            out << "sent_at=" << epoch_ms(sent_at) << " ";
+        if (recv_at != unset)
+            out << "recv_at=" << epoch_ms(recv_at) << " ";
+        if (benchmark.repeat > 0)
+            out << "benchmark.repeat=" << benchmark.repeat << " ";
+
+        // Every token ends with a space, so a non-empty result has a trailing space.
+        return out.str();
+    }
+
+    // Split a whitespace-separated list of "key=value" tokens into a map.
+    // Tokens that contain no '=' are ignored.
+    std::unordered_map<string, string> parse_metadata_map(const string &meta)
+    {
+        std::unordered_map<string, string> fields;
+        stringstream tokens(meta);
+        for (string token; tokens >> token;)
+        {
+            const size_t sep = token.find('=');
+            // Only the first '=' splits; a repeated key overwrites the earlier value.
+            if (sep != string::npos)
+                fields[token.substr(0, sep)] = token.substr(sep + 1);
+        }
+
+        return fields;
+    }
+
+    // Populate the metadata fields from a whitespace-separated "key=value" string.
+    // An empty string is a no-op; keys missing from the input leave their field untouched.
+    void TFMetadata::parse(const string &meta)
+    {
+        if (meta.empty())
+            return;
+
+        const auto fields = parse_metadata_map(meta);
+        // Returns a pointer to the value stored under key, or nullptr when the key is absent.
+        const auto lookup = [&fields](const char *key) -> const string *
+        {
+            const auto it = fields.find(key);
+            return it == fields.end() ? nullptr : &it->second;
+        };
+
+        if (const string *v = lookup("id"))
+            id = stoi(*v);
+        if (const string *v = lookup("author"))
+            author = *v;
+
+        // Timestamps go through from_time_t, i.e. they are read as seconds since the epoch.
+        // NOTE(review): to_string() emits milliseconds for these keys - confirm the intended unit.
+        if (const string *v = lookup("created_at"))
+            created_at = system_clock::from_time_t(stod(*v));
+        if (const string *v = lookup("sent_at"))
+            sent_at = system_clock::from_time_t(stod(*v));
+        if (const string *v = lookup("recv_at"))
+            recv_at = system_clock::from_time_t(stod(*v));
+        if (const string *v = lookup("benchmark.repeat"))
+            benchmark.repeat = stoi(*v);
+        // stoi/stod exceptions on malformed values propagate to the caller.
+    }
+
     // 分割主体和元数据
     std::pair<string, string> split_body_metadata(const string &input)
     {
@@ -186,7 +266,6 @@ namespace deepx::tf
         return value_str; // 默认作为字符串处理
     }
 
-  
     // 解析参数列表
     vector<Param> parse_params(const string &params_str)
     {
@@ -211,51 +290,7 @@ namespace deepx::tf
         return params;
     }
 
-    // 解析元数据键值对
-    void parse_metadata_pair(const string &key_value, int &id, string &author,
-                             system_clock::time_point &created_at,
-                             system_clock::time_point &sent_at)
-    {
-        size_t eq_pos = key_value.find('=');
-        if (eq_pos == string::npos)
-            return;
-
-        string key = key_value.substr(0, eq_pos);
-        string value = key_value.substr(eq_pos + 1);
-
-        if (key == "id")
-        {
-            id = stoi(value);
-        }
-        else if (key == "author")
-        {
-            author = value;
-        }
-        else if (key == "created_at")
-        {
-            created_at = system_clock::from_time_t(stod(value));
-        }
-        else if (key == "sent_at")
-        {
-            sent_at = system_clock::from_time_t(stod(value));
-        }
-    }
-
-    // 解析元数据
-    void parse_metadata(const string &meta, int &id, string &author,
-                        system_clock::time_point &created_at,
-                        system_clock::time_point &sent_at)
-    {
-        if (meta.empty())
-            return;
-
-        stringstream meta_ss(meta);
-        string key_value;
-        while (meta_ss >> key_value)
-        {
-            parse_metadata_pair(key_value, id, author, created_at, sent_at);
-        }
-    }
+   
 
     // 主解析函数
     void TF::parse(const string &input)
@@ -274,7 +309,7 @@ namespace deepx::tf
         returns = parse_params(output_part);
 
         // 5. 解析元数据
-        parse_metadata(meta, id, author, created_at, sent_at);
+        metadata.parse(meta);
     }
 
     void TF::init(const string &opname,
@@ -282,7 +317,6 @@ namespace deepx::tf
                   const vector<Param> &returns)
     {
         this->name = opname;
-        this->author = "";
         this->args = args;
         this->returns = returns;
     }
@@ -330,9 +364,7 @@ namespace deepx::tf
 
         if (show_extra)
         {
-            ss << " //id=" << id
-               << " created_at=" << stdutil::format_time(created_at)
-               << " sent_at=" << stdutil::format_time(sent_at);
+            ss << " //" << metadata.to_string();
         }
 
         return ss.str();
diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp
index e123f10c..d08b85b5 100644
--- a/excuter/cpp-common/src/deepx/tf/tf.hpp
+++ b/excuter/cpp-common/src/deepx/tf/tf.hpp
@@ -26,27 +26,40 @@ namespace deepx::tf
     {
         TypeDef dtype;
         string textvalue;
- 
+
         Param(const string &textvalue = "", const DataCategory &dt = DataCategory::Unknown, const Precision &prec = Precision::Any)
             : textvalue(textvalue), dtype(make_dtype(dt, prec)) {}
-        
+
         void parse(const string &param);
     };
+    // Metadata: benchmark options (filled from the "benchmark.repeat" metadata key)
+    struct Benchmark
+    {
+        int repeat = 0; // repeat count; defaults to 0
+    };
+    struct TFMetadata // metadata attached to a TF: author, id, timestamps, benchmark options
+    {
+        string author;
+        int id = 0; // initialised so a default-constructed TF never exposes an indeterminate id
+        system_clock::time_point created_at = system_clock::time_point::min(); // min() marks "unset"; to_string() skips it
+        system_clock::time_point sent_at = system_clock::time_point::min();    // min() marks "unset"; to_string() skips it
+        system_clock::time_point recv_at = system_clock::time_point::min();    // min() marks "unset"; to_string() skips it
+        Benchmark benchmark;
+
+        string to_string() const;      // "key=value " tokens for the fields that are set
+        void parse(const string &str); // reads whitespace-separated "key=value" tokens into the fields
+    };
     // TF:Tensor Function的缩写
     class TF
     {
     public:
         string name;
-        string author;
-        string tftype; 
+
+        string tftype;
         vector<Param> args;
         vector<Param> returns;
-        //
-        int id;
-        system_clock::time_point created_at;
-        system_clock::time_point sent_at;
-        system_clock::time_point recv_at;
+        // metadata
+        TFMetadata metadata;
 
     public:
         TF() = default;
@@ -128,8 +141,8 @@ namespace deepx::tf
             }
 
             vector<T> result;
-            string textvalue =vars[idx].textvalue;
-            stdutil::trim(textvalue,"[]");
+            string textvalue = vars[idx].textvalue;
+            stdutil::trim(textvalue, "[]");
             if (textvalue.empty())
             {
                 throw std::invalid_argument("Invalid argument index");
diff --git a/excuter/cpp-common/src/deepx/tf/tffactory.cpp b/excuter/cpp-common/src/deepx/tf/tffactory.cpp
index 1936acfc..48dd52bb 100644
--- a/excuter/cpp-common/src/deepx/tf/tffactory.cpp
+++ b/excuter/cpp-common/src/deepx/tf/tffactory.cpp
@@ -22,11 +22,18 @@ namespace deepx::tf
         }
 
         // 检查作者是否存在
-        auto author_it = family_it->second->tf_authors.find(other.author);
+        auto author_it = family_it->second->tf_authors.find(other.metadata.author);
         if (author_it == family_it->second->tf_authors.end())
         {
-            cerr << "<op> " << other.name << " author:" << other.author << " not found" << endl;
-            return nullptr;
+            cerr << "<op> " << other.name << " author:" << other.metadata.author << " not found" << endl;
+            // Fall back to the first registered author; check for an empty map before dereferencing.
+            author_it = family_it->second->tf_authors.begin();
+            if (author_it == family_it->second->tf_authors.end())
+            {
+                cerr << "<op> " << other.name << " default author not found" << endl;
+                return nullptr;
+            }
+            cerr << "<op> " << other.name << " use first author:" << author_it->first << endl;
         }
 
         // 提取参数和返回值类型
@@ -71,7 +78,9 @@ namespace deepx::tf
         }
 
         // 使用clone()方法创建新实例，而不是直接复制构造
-        return tf->clone();
+        auto cloned = tf->clone();
+        cloned->metadata=other.metadata;
+        return cloned;
     }
     string TfFactory::print_markdown(string excuter_name) const
     {
@@ -99,7 +108,7 @@ namespace deepx::tf
             
             for (const auto &tf : tfs) {
                 ss << "| " << tf->name << " | ";
-                ss << (tf->author.empty() ? " none " : tf->author) << " | ";
+                ss << (tf->metadata.author.empty() ? " none " : tf->metadata.author) << " | ";
                 ss << tf->to_string(false, true) << " | ";
                 ss << tf->math_formula() << " | ";
                 ss << tf->to_string(false, true) << " |\n";
diff --git a/excuter/cpp-common/src/deepx/tf/tffactory.hpp b/excuter/cpp-common/src/deepx/tf/tffactory.hpp
index 3262ffaf..ebe836c0 100644
--- a/excuter/cpp-common/src/deepx/tf/tffactory.hpp
+++ b/excuter/cpp-common/src/deepx/tf/tffactory.hpp
@@ -85,14 +85,14 @@ namespace deepx::tf
             }
 
             // 检查是否存在该作者的实现，不存在则创建
-            if (tf_families[tf->name]->tf_authors.find(tf->author) ==
+            if (tf_families[tf->name]->tf_authors.find(tf->metadata.author) ==
                 tf_families[tf->name]->tf_authors.end())
             {
-                tf_families[tf->name]->tf_authors[tf->author] = std::make_shared<TFAuthor>();
+                tf_families[tf->name]->tf_authors[tf->metadata.author] = std::make_shared<TFAuthor>();
             }
 
             // 直接添加到vector中
-            tf_families[tf->name]->tf_authors[tf->author]->tfs.push_back(tf);
+            tf_families[tf->name]->tf_authors[tf->metadata.author]->tfs.push_back(tf);
         }
         shared_ptr<TF> get_tf(const TF &other) const;
         // 输出为markdown表格格式
diff --git a/excuter/op-mem-cuda/src/client/main.cpp b/excuter/op-mem-cuda/src/client/main.cpp
index 8c00a50a..b3cd40d6 100644
--- a/excuter/op-mem-cuda/src/client/main.cpp
+++ b/excuter/op-mem-cuda/src/client/main.cpp
@@ -60,8 +60,8 @@ int main()
             }
 
             deepx::tf::OpResp opresp;
-            opresp.id = op.id;
-            opresp.recv_at = op.recv_at;
+            opresp.id = op.metadata.id;
+            opresp.recv_at = op.metadata.recv_at;
 
             auto src = tf_factory.get_tf(op);
             if (src == nullptr)
diff --git a/excuter/op-mem-cuda/src/deepx/tf/arg.hpp b/excuter/op-mem-cuda/src/deepx/tf/arg.hpp
index 5c8fa93b..dcc845fd 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/arg.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/arg.hpp
@@ -14,7 +14,6 @@ namespace deepx::tf
         ArgSet(vector<Param> args, vector<Param> returns)
         {
             this->name = "argset";
-            this->author = "";
             this->tftype = "arg";
             this->args = args;
             this->returns = returns;
@@ -76,7 +75,6 @@ namespace deepx::tf
         VecSet(vector<Param> args, vector<Param> returns)
         {
             this->name = "vecset";
-            this->author = "";
             this->tftype = "arg";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
index 6320b7d4..d869a435 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
@@ -20,7 +20,7 @@ namespace deepx::tf
         Reshape(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "reshape";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
@@ -82,7 +82,7 @@ namespace deepx::tf
         Transpose(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "transpose";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
@@ -151,7 +151,7 @@ namespace deepx::tf
         Concat(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "concat";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
@@ -286,7 +286,7 @@ namespace deepx::tf
         BroadcastTo(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "broadcastTo";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
@@ -352,7 +352,7 @@ namespace deepx::tf
         Gather(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "gather";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
index d18eeb88..97e4b17e 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
@@ -16,7 +16,7 @@ namespace deepx::tf
         Add(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "add";
-            this->author = Author::name();  
+            this->metadata.author = Author::name();  
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -85,7 +85,7 @@ namespace deepx::tf
         AddScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "addscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -153,7 +153,7 @@ namespace deepx::tf
         Sub(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "sub";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -222,7 +222,7 @@ namespace deepx::tf
         SubScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "subscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -290,7 +290,7 @@ namespace deepx::tf
         Mul(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "mul";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -359,7 +359,7 @@ namespace deepx::tf
         MulScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "mulscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -427,7 +427,7 @@ namespace deepx::tf
         Div(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "div";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -496,7 +496,7 @@ namespace deepx::tf
         DivScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "divscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -564,7 +564,7 @@ namespace deepx::tf
         RDivScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "rdivscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -633,7 +633,7 @@ namespace deepx::tf
         Invert(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "invert";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp
index 0ee58de8..fe3734a1 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_compare.hpp
@@ -15,7 +15,7 @@ namespace deepx::tf
         Max(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "max";
-            this->author = Author::name();
+            this->metadata.author =Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -80,7 +80,7 @@ namespace deepx::tf
         MaxScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "maxscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -145,7 +145,7 @@ namespace deepx::tf
         Min(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "min";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -211,7 +211,7 @@ namespace deepx::tf
         MinScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "minscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -276,7 +276,7 @@ namespace deepx::tf
         Equal(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "equal";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -343,7 +343,7 @@ namespace deepx::tf
         EqualScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "equalscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -410,7 +410,7 @@ namespace deepx::tf
         Less(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "less";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -477,7 +477,7 @@ namespace deepx::tf
         LessScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "lessscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -543,7 +543,7 @@ namespace deepx::tf
         Greater(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "greater";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -610,7 +610,7 @@ namespace deepx::tf
         GreaterScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "greaterscalar";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -676,7 +676,7 @@ namespace deepx::tf
         Switch(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "switch";
-            this->author = Author::name();
+            this->metadata.author=Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp
index d5eae4e2..ec35e1f4 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp
@@ -15,7 +15,7 @@ namespace deepx::tf
         Sin(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "sin";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -68,7 +68,7 @@ namespace deepx::tf
         Cos(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "cos";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -123,7 +123,7 @@ namespace deepx::tf
         Tan(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "tan";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp
index d13b22f6..9f514264 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sqrt.hpp
@@ -15,7 +15,7 @@ namespace deepx::tf
         Pow(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "pow";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -70,7 +70,7 @@ namespace deepx::tf
         PowScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "powscalar";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -123,7 +123,7 @@ namespace deepx::tf
         RpowScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "rpowscalar";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -174,7 +174,7 @@ namespace deepx::tf
         Sqrt(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "sqrt";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -232,7 +232,7 @@ namespace deepx::tf
         Log(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "log";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -289,7 +289,7 @@ namespace deepx::tf
         Exp(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "exp";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/init.hpp b/excuter/op-mem-cuda/src/deepx/tf/init.hpp
index 480bf18c..839c83bf 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/init.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/init.hpp
@@ -18,7 +18,7 @@ namespace deepx::tf
         Constant(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "constant";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
@@ -98,7 +98,7 @@ namespace deepx::tf
         Arange(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "arange";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
@@ -183,7 +183,7 @@ namespace deepx::tf
         Uniform(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "uniform";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
@@ -269,7 +269,7 @@ namespace deepx::tf
         Normal(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "normal";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/io.hpp b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
index b5a5f4d1..8049fc81 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/io.hpp
@@ -15,7 +15,7 @@ namespace deepx::tf
         Print(vector<Param> args, vector<Param> returns)
         {
             this->name = "print";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "io";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp b/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp
index a61b5c04..3b18f939 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/matmul.hpp
@@ -18,7 +18,7 @@ namespace deepx::tf
         MatMul(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "matmul";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "matmul";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp
index a23319d7..2bfd36a2 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp
@@ -16,7 +16,7 @@ namespace deepx::tf
         Sum(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "sum";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
@@ -82,7 +82,7 @@ namespace deepx::tf
         Prod(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "prod";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
@@ -141,7 +141,7 @@ namespace deepx::tf
         ReduceMax(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "reducemax";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
@@ -200,7 +200,7 @@ namespace deepx::tf
         ReduceMin(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "reducemin";
-            this->author = Author::name();
+            this->metadata.author= Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-ompsimd/src/client/main.cpp b/excuter/op-mem-ompsimd/src/client/main.cpp
index 7f0378d9..96ca39bb 100644
--- a/excuter/op-mem-ompsimd/src/client/main.cpp
+++ b/excuter/op-mem-ompsimd/src/client/main.cpp
@@ -59,8 +59,8 @@ int main()
             }
 
             deepx::tf::OpResp opresp;
-            opresp.id = op.id;
-            opresp.recv_at = op.recv_at;
+            opresp.id = op.metadata.id;
+            opresp.recv_at = op.metadata.recv_at;
 
             auto src = tf_factory.get_tf(op);
             if (src == nullptr)
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
index fd4f0e07..8afa8e14 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
@@ -19,7 +19,7 @@ namespace deepx::tf
         Reshape(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "reshape";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->args = args;
             this->returns = returns;
             this->tftype = "changeshape";
@@ -86,7 +86,7 @@ namespace deepx::tf
         Transpose(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "transpose";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
@@ -153,7 +153,7 @@ namespace deepx::tf
         Concat(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "concat";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
@@ -266,7 +266,7 @@ namespace deepx::tf
         BroadcastTo(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "broadcastTo";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
@@ -330,7 +330,7 @@ namespace deepx::tf
         Gather(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "gather";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
index e1914688..1f754006 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
@@ -18,7 +18,7 @@ namespace deepx::tf
         Add(vector<Param> args, vector<Param> returns)
         {
             this->name = "add";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -80,7 +80,7 @@ namespace deepx::tf
         AddScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "addscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -140,7 +140,7 @@ namespace deepx::tf
         Sub(vector<Param> args, vector<Param> returns)
         {
             this->name = "sub";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -202,7 +202,7 @@ namespace deepx::tf
         SubScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "subscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -263,7 +263,7 @@ namespace deepx::tf
         Mul(vector<Param> args, vector<Param> returns)
         {   
             this->name = "mul";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -325,7 +325,7 @@ namespace deepx::tf
         MulScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "mulscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -386,7 +386,7 @@ namespace deepx::tf
         Div(vector<Param> args, vector<Param> returns)
         {   
             this->name = "div";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -448,7 +448,7 @@ namespace deepx::tf
         DivScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "divscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -505,7 +505,7 @@ namespace deepx::tf
         RDivScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "rdivscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -563,7 +563,7 @@ namespace deepx::tf
         Invert(vector<Param> args, vector<Param> returns)
         {
             this->name = "invert";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -618,7 +618,7 @@ namespace deepx::tf
         Sqrt(vector<Param> args, vector<Param> returns)
         {
             this->name = "sqrt";
-            this->author = Author::name();  
+            this->metadata.author = Author::name();  
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -663,7 +663,7 @@ namespace deepx::tf
         Pow(vector<Param> args, vector<Param> returns)
         {
             this->name = "pow";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -709,7 +709,7 @@ namespace deepx::tf
         PowScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "powscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -755,7 +755,7 @@ namespace deepx::tf
         RpowScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "rpowscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -800,7 +800,7 @@ namespace deepx::tf
         Log(vector<Param> args, vector<Param> returns)
         {
             this->name = "log";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -845,7 +845,7 @@ namespace deepx::tf
         Exp(vector<Param> args, vector<Param> returns)
         {   
             this->name = "exp";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -890,7 +890,7 @@ namespace deepx::tf
         Sin(vector<Param> args, vector<Param> returns)
         {
             this->name = "sin";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -935,7 +935,7 @@ namespace deepx::tf
         Cos(vector<Param> args, vector<Param> returns)
         {   
             this->name = "cos";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -980,7 +980,7 @@ namespace deepx::tf
         Tan(vector<Param> args, vector<Param> returns)
         {   
             this->name = "tan";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1025,7 +1025,7 @@ namespace deepx::tf
         Max(vector<Param> args, vector<Param> returns)
         {
             this->name = "max";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1083,7 +1083,7 @@ namespace deepx::tf
         MaxScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "maxscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1140,7 +1140,7 @@ namespace deepx::tf
         Min(vector<Param> args, vector<Param> returns)
         {   
             this->name = "min"; 
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1198,7 +1198,7 @@ namespace deepx::tf
         MinScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "minscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1257,7 +1257,7 @@ namespace deepx::tf
         Equal(vector<Param> args, vector<Param> returns)
         {   
             this->name = "equal";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1317,7 +1317,7 @@ namespace deepx::tf
         EqualScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "equalscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1376,7 +1376,7 @@ namespace deepx::tf
         Less(vector<Param> args, vector<Param> returns)
         {
             this->name = "less";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1435,7 +1435,7 @@ namespace deepx::tf
         LessScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "lessscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1493,7 +1493,7 @@ namespace deepx::tf
         Greater(vector<Param> args, vector<Param> returns)
         {
             this->name = "greater";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1552,7 +1552,7 @@ namespace deepx::tf
         GreaterScalar(vector<Param> args, vector<Param> returns)
         {
             this->name = "greaterscalar";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
@@ -1610,7 +1610,7 @@ namespace deepx::tf
         Switch(vector<Param> args, vector<Param> returns)
         {
             this->name = "switch";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "elementwise";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp
index c28b569e..54410653 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/init.hpp
@@ -15,7 +15,7 @@ namespace deepx::tf
         Constant(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "constant";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
@@ -96,7 +96,7 @@ namespace deepx::tf
         Arange(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "arange";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
@@ -171,7 +171,7 @@ namespace deepx::tf
         Uniform(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "uniform";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
@@ -246,7 +246,7 @@ namespace deepx::tf
         Normal(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "normal";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "init";
             this->args = args;
             this->returns = returns;
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
index a6fb83dc..8c6fdc56 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp
@@ -16,7 +16,7 @@ namespace deepx::tf
         {
             this->name = "print";
             this->tftype = "io";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->args = args;
             this->returns = returns;
         }
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
index 89804a18..f5cafd18 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
@@ -16,7 +16,7 @@ namespace deepx::tf
         MatMul(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "matmul";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "matmul";
             this->args = args;
             this->returns = returns;
@@ -30,16 +30,7 @@ namespace deepx::tf
         {
             return make_shared<MatMul<Author>>(*this);
         }
-        int run(shared_ptr<MemBase> mem, string &error) override
-        {
-            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
-            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
-            if (a_type != b_type || a_type != c_type)
-            {
-                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
-                return 1;
-            }
+        int compute(shared_ptr<MemBase> mem, Precision a_type,string &error){
             switch (a_type)
             {
             case Precision::Float64:
@@ -66,6 +57,30 @@ namespace deepx::tf
             }
             return 0;
         }
+        // Verifies that both inputs and the output share one dtype, then runs compute() once,
+        // or metadata.benchmark.repeat times when that count is positive (benchmark mode).
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            const Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            const Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            const Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (!(a_type == b_type && a_type == c_type))
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            const auto rounds = metadata.benchmark.repeat;
+            if (rounds <= 0)
+            {
+                // Not benchmarking: a single pass whose status is handed back unchanged.
+                return compute(mem, a_type, error);
+            }
+            for (int pass = 0; pass < rounds; ++pass)
+            {
+                if (compute(mem, a_type, error)) return 1; // a failing pass aborts the run
+            }
+            return 0;
+        }
     };
 }
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
index f8b43e53..ff483da6 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
@@ -16,7 +16,7 @@ namespace deepx::tf
         Sum(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "sum";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
@@ -76,7 +76,7 @@ namespace deepx::tf
         Prod(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "prod";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
@@ -135,7 +135,7 @@ namespace deepx::tf
         ReduceMax(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "reducemax";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
@@ -194,7 +194,7 @@ namespace deepx::tf
         ReduceMin(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "reducemin";
-            this->author = Author::name();
+            this->metadata.author = Author::name();
             this->tftype = "reduce";
             this->args = args;
             this->returns = returns;
diff --git a/front/py/deepx/nn/deepxir.py b/front/py/deepx/nn/deepxir.py
index 33b7e037..b1f6880e 100644
--- a/front/py/deepx/nn/deepxir.py
+++ b/front/py/deepx/nn/deepxir.py
@@ -19,7 +19,8 @@ def __str__(self):
                 return f"{self._category}:{self._textvalue}"
         else:
             return self._textvalue
-    
+ 
+
     @classmethod
     def tensorName(cls,name:str,dtype:str):
         return Param(name,category="tensor",precision=dtype)
@@ -72,6 +73,43 @@ def listtensor(cls,value:tuple[Tensor]):
 # newtensor ( [3 4 5]) -> ( tensor_136144420556608) 
 # // id=1 created_at=1744724799.0650852 sent_at=1744724799.0650952
 
+class Benchmark:
+    """Benchmark request carried in IR metadata; `repeat` is the requested run count."""
+    def __init__(self,repeat:int):
+        self._repeat=repeat
+    def __str__(self):
+        return "benchmark.repeat={}".format(self._repeat)
+
+class Metadata:
+    """Text after the `//` marker of an IR line: author, id, timestamps and optional benchmark."""
+    def __init__(self,author:str=None,id:str=None,created_at:float=None,sent_at:float=None):
+        # An empty author/id counts as "not set" (stored as None); DeepxIR passes time.time() for created_at.
+        self._author=None if author == "" else author
+        self._id=None if id == "" else id
+        self._created_at=created_at
+        self._sent_at=sent_at
+        self._benchmark=None  # stays None until openbench() is called
+
+    def __str__(self):
+        # Parts carry no padding of their own: ' '.join() supplies the single separator
+        # (pre-padded parts used to produce double spaces between fields).
+        parts=[]
+        if self._author is not None:
+            parts.append(f"author={self._author}")
+        if self._id is not None and self._id != "":
+            parts.append(f"id={self._id}")
+        if self._created_at is not None:
+            parts.append(f"created_at={self._created_at}")
+        if self._sent_at is not None:
+            parts.append(f"sent_at={self._sent_at}")
+        if self._benchmark is not None:
+            parts.append(str(self._benchmark))
+        return ' '.join(parts)
+
+    def openbench(self,repeat:int):
+        """Attach a Benchmark asking for `repeat` runs; it is then included in str(self)."""
+        self._benchmark=Benchmark(repeat)
+
 
 class DeepxIR:
     def __init__(self, 
@@ -90,11 +128,8 @@ def __init__(self,
         self._name = name
         self._args = [arg if isinstance(arg, Param) else Param(arg) for arg in args]
         self._returns = [ret if isinstance(ret, Param) else Param(ret) for ret in returns]
-        self._author = author
-        self._id=None
-        self._created_at=time.time()
-        self._sent_at=None
-
+        self._metadata=Metadata(author=author,id=None,created_at=time.time())
+ 
     def __str__(self):
         # 函数名部分
         parts = [self._name]
@@ -120,14 +155,7 @@ def __str__(self):
 
         # 添加元数据
         parts.append("//")
-        if self._id is not None:
-            parts.append(f"id={self._id}")
-        if self._author:
-            parts.append(f"author={self._author}")
-        parts.append(f"created_at={self._created_at}")
-        if self._sent_at is not None:
-            parts.append(f"sent_at={self._sent_at}")
-        
+        parts.append(str(self._metadata))
         return ' '.join(parts)
 
 class DeepxIRResp:
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py b/front/py/deepx/nn/functional/leaffunc_matmul.py
index bb69b838..8cad3127 100644
--- a/front/py/deepx/nn/functional/leaffunc_matmul.py
+++ b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -4,11 +4,11 @@
 from .leaffunc_life import newtensor
 from .authormap import defaultauthor
 
-def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='')->Tensor:
+def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor:  # bench: benchmark repeat count forwarded to rtf_matmul; None leaves benchmarking off
     outtensor=out
     if isinstance(out,str):
         outshape=Shape.matmul(a.shape,b.shape)
         outtensor=newtensor(outshape,dtype=a.dtype,name=out)
     from .rtf_matmul import rtf_matmul
-    rtf_matmul(a,b,outtensor,defaultauthor['matmul'])
+    rtf_matmul(a,b,outtensor,defaultauthor['matmul'],bench)
     return outtensor
diff --git a/front/py/deepx/nn/functional/rtf_matmul.py b/front/py/deepx/nn/functional/rtf_matmul.py
index a6a3af5d..bfe23cee 100644
--- a/front/py/deepx/nn/functional/rtf_matmul.py
+++ b/front/py/deepx/nn/functional/rtf_matmul.py
@@ -3,9 +3,11 @@
 from deepx.scheduler import send
 from .rtf import A_B_op_C
 
-def rtf_matmul(a:Tensor,b:Tensor,out: Tensor ,author='cublas'):
+def rtf_matmul(a:Tensor,b:Tensor,out: Tensor ,author='cublas',bench:int=None):
     args=[Param.tensor(a),Param.tensor(b)]
     returns=[Param.tensor(out)]
     ir=DeepxIR("matmul", args, returns, author)
+    if bench is not None:
+        ir._metadata.openbench(bench)
     send(ir)
     return out
\ No newline at end of file
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index 6a12c26a..a25b0963 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@
 import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:9090"):
+    def __init__(self, endpoint: str = "localhost:8080"):
         # 解析endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
index 144cbdf7..bf682a88 100644
--- a/front/py/examples/2_ir/3_matmul.py
+++ b/front/py/examples/2_ir/3_matmul.py
@@ -1,22 +1,47 @@
+benchcnt=1000
+
+from deepx.nn.functional import save_npy
+import numpy as np
+np_T1 = np.random.randn(1024, 1024).astype(np.float32)
+np_T2 = np.random.randn(1024, 1024).astype(np.float32)
+
+npy_path = '/home/lipeng/model/deepxmodel/tester/'
+save_npy(np_T1,npy_path+'t1')
+save_npy(np_T2,npy_path+'t2')
+
 ############-------PyTorch-------################
 
 import torch
-torch_t1 = torch.ones(3, 4, dtype=torch.float32)
-torch_t2 = torch.ones(4, 5, dtype=torch.float32)
-torch_t3 = torch_t1 @ torch_t2
-print()
+import time
+torch_t1 = torch.from_numpy(np_T1)
+torch_t2 = torch.from_numpy(np_T2)
+# warmup
+_=torch_t1 @ torch_t2
+
+torch_start = time.time()
+for i in range(benchcnt):
+    torch_t3 = torch_t1 @ torch_t2
+    
 print(torch_t3)
-
+torch_end = time.time()
+print(f"PyTorch time: {torch_end - torch_start} seconds")
 ############-------DEEPX-------################
 
-from deepx import zeros, ones, full, arange
-
+from deepx import uniform, matmul, zeros,load
+from deepx.nn.functional import save,load
 print()
 
-t1 = ones([3,4],dtype='float32',name="t1")
-t2 = ones([4,5],dtype='float32',name="t2")
-t3 = t1 @ t2
+t1 = load(npy_path+'t1')
+t2 = load(npy_path+'t2')
+t3= zeros(1024,1024,dtype='float32',name="t3")
+# warmup
+matmul(t1,t2,out=t3)
+
+deepx_start = time.time()
+matmul(t1,t2,out=t3,bench=(benchcnt))
 t3.print()
+deepx_end = time.time()
+print(f"DeepX time: {deepx_end - deepx_start} seconds")
 
 
 

From 116f9f5a51518f8e1a798d66619b9df98aeed15e Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Tue, 22 Apr 2025 19:46:41 +0800
Subject: [PATCH 2/5] =?UTF-8?q?gather,save,load:cuda=20=E9=AA=8C=E8=AF=81?=
 =?UTF-8?q?=E5=AE=8C=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp    | 6 ++++++
 .../op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp | 2 ++
 front/py/deepx/scheduler/client/udpconn.py                  | 2 +-
 front/py/examples/2_ir/3_matmul.py                          | 4 +++-
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
index 02fee22f..3a2a4032 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -30,6 +30,12 @@ namespace deepx::tensorfunc
             {
                 throw std::runtime_error("Failed to allocate host memory");
             }
+            cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost);
+            if (err != cudaSuccess)
+            {
+                delete[] host_data;
+                throw std::runtime_error("Failed to copy data from device to host");
+            }
 
             stdutil::print(t.shape.shape, host_data, t.shape.dtype, f);
             delete[] host_data;
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
index 92b16f13..e5dadce8 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
@@ -14,6 +14,8 @@ namespace deepx::tensorfunc
             {
                 throw std::invalid_argument("A.shape could matmul with B.shape");
             }
+            //TODO
+            //这里如果对二维矩阵运算，则omp并行不起来，因为C.shape.dim - 2刚好=0
             C.shape.rangeParallel(C.shape.dim - 2, [&](const std::vector<int> &indices)
                                   {
                         int aIdx=A.shape.linearat(indices);
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index a25b0963..6a12c26a 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@
 import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # 解析endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
index bf682a88..3c22593f 100644
--- a/front/py/examples/2_ir/3_matmul.py
+++ b/front/py/examples/2_ir/3_matmul.py
@@ -1,4 +1,4 @@
-benchcnt=1000
+benchcnt=100
 
 from deepx.nn.functional import save_npy
 import numpy as np
@@ -34,6 +34,8 @@
 t1 = load(npy_path+'t1')
 t2 = load(npy_path+'t2')
 t3= zeros(1024,1024,dtype='float32',name="t3")
+from deepx.nn.functional import defaultauthor
+defaultauthor['matmul']='miaobyte'
 # warmup
 matmul(t1,t2,out=t3)
 

From 33ae53b0d6640e725b2bcd4cf5307c373f4710a3 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Wed, 23 Apr 2025 02:52:10 +0800
Subject: [PATCH 3/5] =?UTF-8?q?IndexSelect:=E6=B5=8B=E8=AF=95=E9=AA=8C?=
 =?UTF-8?q?=E8=AF=81=20RenameTensor=EF=BC=9A=E6=B5=8B=E8=AF=95=E9=AA=8C?=
 =?UTF-8?q?=E8=AF=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

front:embedding ok
---
 doc/excuter/op-mem-ompsimd/list.md            | 33 +++++-----
 excuter/cpp-common/src/deepx/mem/mem.hpp      | 22 +++----
 .../src/deepx/shape_changeshape.cpp           | 23 ++++++-
 .../src/deepx/shape_changeshape.hpp           |  6 +-
 .../src/deepx/tensorfunc/changeshape.hpp      | 10 ++--
 .../src/deepx/tensorfunc/tensorlife.hpp       |  2 +
 excuter/op-mem-ompsimd/src/client/tfs.cpp     | 13 +++-
 .../deepx/tensorfunc/changeshape_miaobyte.hpp | 31 +++++-----
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |  2 +-
 .../src/deepx/tf/changeshape.hpp              | 54 ++++++++---------
 .../src/deepx/tf/tensorlife.hpp               | 32 ++++++++++
 front/py/deepx/nn/functional/__init__.py      |  4 +-
 front/py/deepx/nn/functional/authormap.py     |  2 +-
 front/py/deepx/nn/functional/leaffunc.py      |  2 +-
 .../nn/functional/leaffunc_changeshape.py     | 35 ++++++++---
 front/py/deepx/nn/functional/leaffunc_io.py   | 18 +++++-
 front/py/deepx/nn/functional/leaffunc_life.py |  7 +++
 front/py/deepx/nn/functional/reduce.py        |  9 +--
 .../py/deepx/nn/functional/rtf_changeshape.py |  5 +-
 front/py/deepx/nn/functional/rtf_io.py        |  2 +-
 front/py/deepx/nn/functional/rtf_life.py      |  6 ++
 front/py/deepx/nn/modules/__init__.py         |  5 +-
 front/py/deepx/nn/modules/module.py           | 39 ++++--------
 front/py/deepx/nn/modules/sparse.py           | 60 ++++++++++---------
 front/py/deepx/scheduler/client/udpconn.py    |  2 +-
 front/py/deepx/tensor/changeshape.py          | 30 +++++-----
 front/py/deepx/tensor/io.py                   |  2 +-
 front/py/deepx/tensor/shape.py                | 37 +++++++-----
 front/py/deepx/tensor/tensor.py               | 17 ++++--
 front/py/examples/3_module/1_embedding.py     | 54 +++++++++++++++++
 30 files changed, 364 insertions(+), 200 deletions(-)

diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
index 8ca0e1d6..9f43ccf4 100644
--- a/doc/excuter/op-mem-ompsimd/list.md
+++ b/doc/excuter/op-mem-ompsimd/list.md
@@ -13,10 +13,11 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| copytensor |  none  | copytensor(tensor<any> src, tensor<any> dst)->() | T2.data = T1.data | copytensor(tensor<any> src, tensor<any> dst)->() |
+| renametensor |  none  | renametensor(tensor<any> t, var<string> new_name)->() | rename T1 to T2 | renametensor(tensor<any> t, var<string> new_name)->() |
 | newtensor |  none  | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
 | newtensor |  none  | newtensor(var<string> shape)->(tensor<any> t) | T1 =Tensor(shape=[...]) | newtensor(var<string> shape)->(tensor<any> t) |
 | deltensor |  none  | deltensor(tensor<any> t)->() | del T1 | deltensor(tensor<any> t)->() |
+| copytensor |  none  | copytensor(tensor<any> src, tensor<any> dst)->() | T2.data = T1.data | copytensor(tensor<any> src, tensor<any> dst)->() |
 
 ### io
 
@@ -31,9 +32,9 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | normal | miaobyte | normal(tensor<any> t, var<any> mean, var<any> std, var<int32> seed)->() | normal(T1,mean,stddev,seed) | normal(tensor<any> t, var<any> mean, var<any> std, var<int32> seed)->() |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
 
 ### elementwise
@@ -46,8 +47,8 @@
 | min | miaobyte | min(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=min(T1,T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | maxscalar | miaobyte | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=max(T1,scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | divscalar | miaobyte | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
-| add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
-| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
+| log | miaobyte | log(tensor<any> A)->(tensor<any> C) | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
+| addscalar | miaobyte | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
 | greater | miaobyte | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) | mask=greater(T1,T2) | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
 | lessscalar | miaobyte | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) | mask=less(T1,scalar) | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
 | less | miaobyte | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
@@ -55,8 +56,8 @@
 | minscalar | miaobyte | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=min(T1,scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | rdivscalar | miaobyte | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
 | rpowscalar | miaobyte | rpowscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) | T3=scalar^T1 | rpowscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
-| log | miaobyte | log(tensor<any> A)->(tensor<any> C) | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
-| addscalar | miaobyte | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
+| add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
+| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | sub | miaobyte | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1-T2 | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | sqrt | miaobyte | sqrt(tensor<any> A)->(tensor<any> C) | T3=sqrt(T1) | sqrt(tensor<any> A)->(tensor<any> C) |
 | subscalar | miaobyte | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1-scalar | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
@@ -76,22 +77,22 @@
 | matmul | cblas | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=T1 @ T2 | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
 | matmul | miaobyte | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 
+### reduce
+
+| Operation | Author | Func Def | Math Formula | IR Instruction |
+|-----------|--------|------------|--------------|----------------|
+| prod | miaobyte | prod(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
+| reducemax | miaobyte | reducemax(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
+| sum | miaobyte | sum(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
+| reducemin | miaobyte | reducemin(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
+
 ### changeshape
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| gather | miaobyte | gather(tensor<any> A, tensor<int64|int32> indices, var<int32> axis)->(tensor<any> B) | T2 = T1.gather(indices=T3, axis=3) | gather(tensor<any> A, tensor<int64|int32> indices, var<int32> axis)->(tensor<any> B) |
+| indexselect | miaobyte | indexselect(tensor<any> A, tensor<int64|int32> index, var<int32> axis)->(tensor<any> B) | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor<any> A, tensor<int64|int32> index, var<int32> axis)->(tensor<any> B) |
 | broadcastTo | miaobyte | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) |
 | concat | miaobyte | concat(listtensor<any> tensors, var<int32> dim)->(tensor<any> result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor<any> tensors, var<int32> dim)->(tensor<any> result) |
 | transpose | miaobyte | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) |
 | reshape | miaobyte | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) | T1.reshape(shape)->T2 | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) |
 
-### reduce
-
-| Operation | Author | Func Def | Math Formula | IR Instruction |
-|-----------|--------|------------|--------------|----------------|
-| reducemax | miaobyte | reducemax(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
-| prod | miaobyte | prod(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
-| sum | miaobyte | sum(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
-| reducemin | miaobyte | reducemin(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor<any> A, vector<int32> axis, var<bool> keepdims)->(tensor<any> B) |
-
diff --git a/excuter/cpp-common/src/deepx/mem/mem.hpp b/excuter/cpp-common/src/deepx/mem/mem.hpp
index 9ea7ab70..710c3773 100644
--- a/excuter/cpp-common/src/deepx/mem/mem.hpp
+++ b/excuter/cpp-common/src/deepx/mem/mem.hpp
@@ -108,17 +108,7 @@ namespace deepx::mem
             }
             mem[name] = tensor;
         }
-
-        // template <typename T>
-        // shared_ptr<Tensor<T>> temptensor(vector<int> shape)
-        // {
-        //     // 直接构造到shared_ptr避免移动
-        //     auto temp = tensorfunc::New<T>(shape); // 临时对象
-        //     auto cloned = make_shared<Tensor<T>>(std::move(temp));
-        //     mem["temp" + to_string(tempidx)] = cloned;
-        //     tempidx++;
-        //     return cloned;
-        // }
+ 
 
         bool existstensor(const string &name) const
         {
@@ -172,6 +162,16 @@ namespace deepx::mem
         {
             args.erase(name);
         }
+        // Moves the tensor stored under old_name to new_name (replacing any entry already at new_name); throws std::runtime_error if old_name is absent.
+        void rename_tensor(const string &old_name, const string &new_name)
+        {
+            auto it = mem.find(old_name);
+            if (it == mem.end())
+                throw std::runtime_error("tensor not found: " + old_name);
+            auto tensor = it->second; // copy the entry first so a self-rename (old_name == new_name) is not lost by the erase
+            mem.erase(it);
+            mem[new_name] = tensor;
+        }
     };
 }
 #endif // DEEPX_MEM_MEMBASE_HPP
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.cpp b/excuter/cpp-common/src/deepx/shape_changeshape.cpp
index c0002617..f3a60bd1 100644
--- a/excuter/cpp-common/src/deepx/shape_changeshape.cpp
+++ b/excuter/cpp-common/src/deepx/shape_changeshape.cpp
@@ -120,5 +120,26 @@ namespace deepx
             }
         }
     }
- 
+    
+    // Output shape of indexselect: input_shape with dimension `axis` replaced by the dims of index_shape,
+    // e.g. input [4,5,6], index [2,3], axis 1 -> [4,2,3,6].
+    // A negative axis counts from the last dimension; std::invalid_argument is thrown when the
+    // axis does not name a dimension of input_shape (which includes an empty input_shape).
+    std::vector<int> indexselectShape(const std::vector<int> &input_shape, const std::vector<int> &index_shape, const int axis)
+    {
+        const int rank = static_cast<int>(input_shape.size());
+        const int pos = axis < 0 ? axis + rank : axis;
+        if (pos < 0 || pos >= rank)
+        {
+            throw std::invalid_argument("indexselectShape: axis is out of bounds");
+        }
+
+        std::vector<int> output_shape;
+        output_shape.reserve(input_shape.size() - 1 + index_shape.size());
+        // dims before the axis, then every index dim, then dims after the axis
+        output_shape.insert(output_shape.end(), input_shape.begin(), input_shape.begin() + pos);
+        output_shape.insert(output_shape.end(), index_shape.begin(), index_shape.end());
+        output_shape.insert(output_shape.end(), input_shape.begin() + pos + 1, input_shape.end());
+        return output_shape;
+    }
 }
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.hpp b/excuter/cpp-common/src/deepx/shape_changeshape.hpp
index ac2a588a..3f299885 100644
--- a/excuter/cpp-common/src/deepx/shape_changeshape.hpp
+++ b/excuter/cpp-common/src/deepx/shape_changeshape.hpp
@@ -70,8 +70,8 @@ namespace deepx
     };
     std::vector<BroadcastMap> broadcastMap(const std::vector<int> &a, const std::vector<int> &b);
 
-
-    //gather
-    //gather的out.shape=indices.shape,所以无需计算
+ 
+    //indexselect
+    vector<int> indexselectShape(const vector<int> &input_shape, const vector<int> &index_shape, const int axis);
 }
 #endif // DEEPX_SHAPE_CHANGESHAPE_HPP
\ No newline at end of file
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp
index 5e359dbc..100f408b 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp
@@ -61,17 +61,17 @@ namespace deepx::tensorfunc
         broadcastToDispatcher<Author, T>::broadcastTo(A, new_shape, B);
     }
 
-    // gather
+    // indexselect
     template <typename Author, typename T, typename GatherAxisT>
-    struct gatherDispatcher
+    struct indexselectDispatcher
     {
-        static void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output) = delete;
+        static void indexselect(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output) = delete;
     };
 
     template <typename Author, typename T, typename GatherAxisT>
-    void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output)
+    void indexselect(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output)
     {
-        gatherDispatcher<Author, T, GatherAxisT>::gather(input, indices, axis, output);
+        indexselectDispatcher<Author, T, GatherAxisT>::indexselect(input, indices, axis, output);
     }
 
     // // split
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/tensorlife.hpp b/excuter/cpp-common/src/deepx/tensorfunc/tensorlife.hpp
index 0dc884ac..cc06c69d 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/tensorlife.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/tensorlife.hpp
@@ -19,5 +19,7 @@ namespace deepx::tensorfunc
     template <typename T>
     void copy(const Tensor<T> &src,Tensor<T> &dst);
 
+    // rename
+    // implemented directly by the tf layer
 }
 #endif
\ No newline at end of file
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
index 6eed5e04..07df04bd 100644
--- a/excuter/op-mem-ompsimd/src/client/tfs.cpp
+++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ -68,6 +68,13 @@ namespace deepx::tf
                                                              Param("t", DataCategory::Tensor, Precision::Any),
                                                          }),
                                                      vector<Param>()));
+        //renametensor
+        tffactory.add_tf(std::make_shared<RenameTensor>(vector<Param>(
+                                                         {
+                                                             Param("t", DataCategory::Tensor, Precision::Any),
+                                                             Param("new_name", DataCategory::Var, Precision::String),
+                                                         }),
+                                                     vector<Param>()));
     }
 
     // init
@@ -488,11 +495,11 @@ namespace deepx::tf
                                                                      {
                                                                          Param("B", DataCategory::Tensor, Precision::Any),
                                                                      })));
-        // gather author=miaobyte
-        tffactory.add_tf(std::make_shared<Gather<miaobyte>>(vector<Param>(
+        // indexselect author=miaobyte
+        tffactory.add_tf(std::make_shared<IndexSelect<miaobyte>>(vector<Param>(
                                                                 {
                                                                     Param("A", DataCategory::Tensor, Precision::Any),
-                                                                    Param("indices", DataCategory::Tensor, Precision::Int32 | Precision::Int64),
+                                                                    Param("index", DataCategory::Tensor, Precision::Int32 | Precision::Int64),
                                                                     Param("axis", DataCategory::Var, Precision::Int32),
                                                                 }),
                                                             vector<Param>(
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp
index 7e2985fb..f9466111 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp
@@ -138,21 +138,24 @@ namespace deepx::tensorfunc
         }
     };
 
-    // gather
-    // 支持高维indices
-    // 结果写入input_indices
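+    // indexselect helper: maps an output coordinate to the input coordinate it reads from.
+    // index_indices is scratch space for the coordinate inside index; the result is written into input_indices.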
     template <typename GatherAxisT>
-    void fromGatherIndices(const vector<int> &output_indices, const Tensor<GatherAxisT> &indices, const int gatherAxis, vector<int> &input_indices)
+    void fromIndexselectIndices(const vector<int> &output_indices, const Tensor<GatherAxisT> &index,vector<int> &index_indices, const int gatherAxis, vector<int> &input_indices)
     {
-        std::copy(output_indices.begin(), output_indices.begin()+input_indices.size(), input_indices.begin());
-        int indices_idx = indices.shape.linearat(output_indices);
-        input_indices[gatherAxis] = indices.data[indices_idx];
+ 
+        std::copy(output_indices.begin(), output_indices.begin()+gatherAxis, input_indices.begin());
+        std::copy(output_indices.begin()+gatherAxis,output_indices.begin()+gatherAxis+index_indices.size(), index_indices.begin());
+        int index_idx=index.shape.linearat(index_indices);
+        input_indices[gatherAxis] = index.data[index_idx];
+        std::copy(output_indices.begin()+gatherAxis+index_indices.size(),output_indices.begin()+output_indices.size(), input_indices.begin()+gatherAxis+1);
+
     }
 
     template <typename T, typename GatherAxisT>
-    struct gatherDispatcher<miaobyte, T, GatherAxisT>
+    struct indexselectDispatcher<miaobyte, T, GatherAxisT>
     {
-        static void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output)
+        static void indexselect(const Tensor<T> &input, const Tensor<GatherAxisT> &index, const int axis, Tensor<T> &output)
         {
             int gatherAxis = axis < 0 ? input.shape.dim + axis : axis;
             if (gatherAxis < 0 || gatherAxis >= input.shape.dim)
@@ -160,17 +163,17 @@ namespace deepx::tensorfunc
                 throw std::invalid_argument("Axis is out of bounds");
             }
 
-            vector<int> input_gatherShape =  indices.shape.shape;
-            if (input_gatherShape.empty() || input_gatherShape != output.shape.shape)
+            vector<int>  gatherShape =  indexselectShape(input.shape.shape,index.shape.shape,gatherAxis);
+            if (gatherShape.empty() || gatherShape != output.shape.shape)
             {
-                throw TensorShapeError("Gather shape mismatch");
+                throw TensorShapeError("Indexselect shape mismatch");
             }
             output.shape.rangeParallel(output.shape.dim, [&](const int idx, const std::vector<int> &output_indices, ThreadLocalVectors &tlv)
                                        {  
-                            fromGatherIndices(output_indices, indices, gatherAxis, tlv.get(0));
+                            fromIndexselectIndices(output_indices, index,tlv.get(1), gatherAxis, tlv.get(0));
                             output.data[idx] = input.data[input.shape.linearat(tlv.get(0))]; 
                         },
-                    {input.shape.dim});
+                    {input.shape.dim,index.shape.dim});
         }
     };
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
index f219ca59..38c12f32 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -85,7 +85,7 @@ namespace deepx::tensorfunc
         std::ifstream data_fs(datapath, std::ios::binary);
         data_fs.seekg(0, std::ios::end);
         std::streamsize fileSize = data_fs.tellg();
-        std::streamsize expectedSize = shape.size * precision_bits(shape.dtype) / 8;
+        std::streamsize expectedSize = shape.size * (precision_bits(shape.dtype) / 8);
 
         if (fileSize != expectedSize)
         {
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
index 8afa8e14..e0d96019 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp
@@ -322,14 +322,14 @@ namespace deepx::tf
         }
     };
 
-    // gather
+    // indexselect
     template <typename Author>
-    class Gather : public TF
+    class IndexSelect : public TF
     {
     public:
-        Gather(const vector<Param> &args, const vector<Param> &returns)
+        IndexSelect(const vector<Param> &args, const vector<Param> &returns)
         {
-            this->name = "gather";
+            this->name = "indexselect";
             this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
@@ -338,11 +338,11 @@ namespace deepx::tf
 
         string math_formula() const override
         {
-            return "T2 = T1.gather(indices=T3, axis=3)";
+            return "T2 = T1.indexselect(index=T3, axis=3)";
         }
         shared_ptr<TF> clone() const override
         {
-            return make_shared<Gather<Author>>(*this);
+            return make_shared<IndexSelect<Author>>(*this);
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
@@ -358,10 +358,10 @@ namespace deepx::tf
                 error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type);
                 return 1;
             }
-            Precision indices_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
-            if (indices_type != Precision::Int32 && indices_type != Precision::Int64)
+            Precision index_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            if (index_type != Precision::Int32 && index_type != Precision::Int64)
             {
-                error = "indices only support int32 or int64";
+                error = "index only support int32 or int64";
                 return 1;
             }
             int axis = this->getvar<int>(2, mem, true);
@@ -369,73 +369,73 @@ namespace deepx::tf
             {
             case Precision::Float64:
             {
-                if (indices_type == Precision::Int32)
+                if (index_type == Precision::Int32)
                 {
-                    gather<Author, double, int32_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
+                    indexselect<Author, double, int32_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
                 }
                 else
                 {
-                    gather<Author, double, int64_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
+                    indexselect<Author, double, int64_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Float32:
             {
-                if (indices_type == Precision::Int32)
+                if (index_type == Precision::Int32)
                 {
-                    gather<Author, float, int32_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
+                    indexselect<Author, float, int32_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
                 }
                 else
                 {
-                    gather<Author, float, int64_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
+                    indexselect<Author, float, int64_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Int64:
             {
-                if (indices_type == Precision::Int32)
+                if (index_type == Precision::Int32)
                 {
-                    gather<Author, int64_t, int32_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                    indexselect<Author, int64_t, int32_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
                 }
                 else
                 {
-                    gather<Author, int64_t, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                    indexselect<Author, int64_t, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Int16:
             {
-                if (indices_type == Precision::Int32)
+                if (index_type == Precision::Int32)
                 {
-                    gather<Author, int16_t, int32_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                    indexselect<Author, int16_t, int32_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
                 }
                 else
                 {
-                    gather<Author, int16_t, int64_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                    indexselect<Author, int16_t, int64_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Int8:
             {
-                if (indices_type == Precision::Int32)
+                if (index_type == Precision::Int32)
                 {
-                    gather<Author, int8_t, int32_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                    indexselect<Author, int8_t, int32_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
                 }
                 else
                 {
-                    gather<Author, int8_t, int64_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                    indexselect<Author, int8_t, int64_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Bool:
             {
-                if (indices_type == Precision::Int32)
+                if (index_type == Precision::Int32)
                 {
-                    gather<Author, bool, int32_t>(*mem->gettensor<bool>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<bool>(this->returns[0].textvalue));
+                    indexselect<Author, bool, int32_t>(*mem->gettensor<bool>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<bool>(this->returns[0].textvalue));
                 }
                 else
                 {
-                    gather<Author, bool, int64_t>(*mem->gettensor<bool>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<bool>(this->returns[0].textvalue));
+                    indexselect<Author, bool, int64_t>(*mem->gettensor<bool>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<bool>(this->returns[0].textvalue));
                 }
                 break;
             }
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp
index d703355c..8d4d4f23 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/tensorlife.hpp
@@ -230,5 +230,37 @@ namespace deepx::tf
             return make_shared<DelTensor>(*this);
         }
     };
+
+    // Renames an existing tensor: args[0] names the tensor, args[1] carries the new name.
+    class RenameTensor : public TF
+    {
+    public:
+        RenameTensor(vector<Param> args, vector<Param> returns)
+        {
+            this->name = "renametensor";
+            this->args = args;
+            this->returns = returns;
+            this->tftype = "tensorlife";
+        }
+        // Returns 0 on success, 1 when an argument is missing or checktensors rejects args[0].
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            if (this->args.size() < 2)
+            {
+                error = "renametensor expects 2 args: tensor, new_name";
+                return 1;
+            }
+            if (!checktensors({this->args[0].textvalue}, mem, error))
+            {
+                return 1;
+            }
+            string old_name = this->args[0].textvalue;
+            string new_name = this->args[1].textvalue;
+            mem->rename_tensor(old_name, new_name);
+            return 0;
+        }
+        string math_formula() const override { return "rename T1 to T2"; }
+        shared_ptr<TF> clone() const override { return make_shared<RenameTensor>(*this); }
+    };
 }
 #endif
diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
index 0f729cfa..ff85363e 100644
--- a/front/py/deepx/nn/functional/__init__.py
+++ b/front/py/deepx/nn/functional/__init__.py
@@ -19,12 +19,12 @@
 
     #leaffunc
     "newtensor","printtensor","load", #life
-    "printtensor","save","save_npy",#io
+    "printtensor","save","save_npy","save_torch",#io
     "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out",
     "add","sub","mul","div","sqrt","pow","exp","log",
     "matmul",
     "reducemax","reducemin","sum","prod",
-    "reshape","permute","transpose","concat","broadcastTo",
+    "reshape","permute","transpose","concat","broadcastTo","indexselect",
 
     #functional
     "relu","sigmoid","swish",
diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py
index 4a241865..120db25b 100644
--- a/front/py/deepx/nn/functional/authormap.py
+++ b/front/py/deepx/nn/functional/authormap.py
@@ -34,7 +34,7 @@
         'transpose':'miaobyte',
         'broadcastTo':'miaobyte',
         'concat':'miaobyte',
-        'gather':'miaobyte',
+        'indexselect':'miaobyte',
         #matmul
         # 'matmul':'miaobyte',
         'matmul':'cublas',
diff --git a/front/py/deepx/nn/functional/leaffunc.py b/front/py/deepx/nn/functional/leaffunc.py
index 58d21105..566e2d48 100644
--- a/front/py/deepx/nn/functional/leaffunc.py
+++ b/front/py/deepx/nn/functional/leaffunc.py
@@ -61,7 +61,7 @@ def op_func(
 def create_A_dim_keepdim_tf_C(op_name):
     def op_func(
             a:Tensor,
-            dim:list[int],
+            dim:tuple[int,...],
             keepdim:bool=False,
             out:Union[Tensor,str]='',
             author:str='miaobyte',
diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py
index bd86edf6..cee587bf 100644
--- a/front/py/deepx/nn/functional/leaffunc_changeshape.py
+++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py
@@ -4,7 +4,11 @@
 from .leaffunc_life import newtensor
 from .authormap import defaultauthor
 
-def reshape(t:Tensor,shape:list[int],out:Union[Tensor,str]='')->Tensor:
+def reshape(t:Tensor,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(shape,tuple)
+    for i in shape:
+        assert isinstance(i,int) and i>0
+
     outtensor=out
     if isinstance(out,str):
         outshape=shape
@@ -18,8 +22,12 @@ def reshape(t:Tensor,shape:list[int],out:Union[Tensor,str]='')->Tensor:
  
 
 def permute(t:Tensor,
-            dimorder:list[int],
+            dimorder:tuple[int,...],
             out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(dimorder,tuple)
+    for i in dimorder:
+        assert isinstance(i,int)
+
     if t.ndim!=len(dimorder):
         raise ValueError(f"shape参数不合法,当前输入维度数：{len(dimorder)}，张量维度数：{t.ndim}")
     dimorder = [d % t.ndim for d in dimorder]
@@ -49,11 +57,15 @@ def concat(tensors:Union[list[Tensor],tuple[Tensor]],dim:int,out:Union[Tensor,st
     rtf_concat(tensors,dim,outtensor,defaultauthor['concat'])
     return outtensor
 
-def broadcastTo(t:Tensor,new_shape:tuple[int],out:Union[Tensor,str]='',requires_grad:bool=False,author='miaobyte')->Tensor:
+def broadcastTo(t:Tensor,new_shape:tuple[int,...],out:Union[Tensor,str]='',requires_grad:bool=False,author='miaobyte')->Tensor:
+    assert isinstance(new_shape,tuple)
+    for i in new_shape:
+        assert isinstance(i,int) and i>0
+    
     if t.shape==new_shape:
         return t
     bshape=Shape.broadcast_shape(t.shape,new_shape)
-    if bshape!=new_shape:
+    if bshape!=tuple(new_shape):
         raise ValueError(f"广播失败：{t.shape} 无法广播为 {new_shape} ")
     outtensor=out
     if isinstance(out,str):
@@ -64,12 +76,17 @@ def broadcastTo(t:Tensor,new_shape:tuple[int],out:Union[Tensor,str]='',requires_
     return outtensor
 broadcast_to = broadcastTo
 
-def gather(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str]='')->Tensor:
+def indexselect(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str]='')->Tensor:
+    assert gatheraxis>=0 and gatheraxis<input.ndim
+    outshape=Shape.indexselectshape(input.shape,indices.shape,gatheraxis)
     outtensor=out
     if isinstance(out,str):
-        outtensor=newtensor(indices.shape,dtype=input.dtype,name=out)
-    from .rtf_changeshape import rtf_gather
-    rtf_gather(input,indices,gatheraxis,outtensor,defaultauthor['gather'])
+        outtensor=newtensor(outshape,dtype=input.dtype,name=out)
+    # outshape is computed before the branch so a caller-supplied `out` Tensor is validated too
+    assert outtensor.shape==outshape
+    
+    from .rtf_changeshape import rtf_indexselect
+    rtf_indexselect(input,indices,gatheraxis,outtensor,defaultauthor['indexselect'])
     return outtensor
 
 # def unsqueeze(t:Tensor,dim:int)->Tensor:
@@ -87,7 +104,7 @@ def gather(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str]='')-
 #     return reshape(t, new_shape)
 
 # OpNode.register("expand")
-# def expand(t:Tensor,shape:list[int],out:Union[Tensor,str]='')->Tensor:
+# def expand(t:Tensor,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor:
 #     outtensor=None
 #     if isinstance(out,str):
 #         outtensor=Tensor(shape=shape, dtype=t.dtype, device=t.device)
diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py
index dd09abc0..d9551c6c 100644
--- a/front/py/deepx/nn/functional/leaffunc_io.py
+++ b/front/py/deepx/nn/functional/leaffunc_io.py
@@ -13,13 +13,25 @@ def save(t:Tensor,path:str):
 
 def save_npy(t,path:str):
     r'''
-    保存numpy.tensor为deepxtensor格式
+    保存numpy.ndarray为deepx.tensor格式
     '''
-    from numpy import save,ndarray,ascontiguousarray
+    from numpy import ascontiguousarray
     shape=Shape(t.shape)
     shape._dtype=str(t.dtype)
     saveShape(shape,path+".shape")
 
     array = ascontiguousarray(t)
     array.tofile(path+'.data')
-    return t
\ No newline at end of file
+    return t
+
+def save_torch(t,path:str):
+    r'''
+    Save a torch.Tensor in the deepx tensor format by delegating to save_npy.
+    '''
+    from torch import Tensor as torch_Tensor
+    if not isinstance(t,torch_Tensor):
+        raise ValueError("t must be a torch.Tensor")
+    # detach(), cpu() and numpy() are chained to obtain the array handed to save_npy
+    array=t.detach().cpu().numpy()
+    save_npy(array,path)
+    
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py
index 8952b9d6..8921f8e3 100644
--- a/front/py/deepx/nn/functional/leaffunc_life.py
+++ b/front/py/deepx/nn/functional/leaffunc_life.py
@@ -22,6 +22,13 @@ def copytensor(t:Tensor,out:Tensor):
 def deltensor(t:Tensor):
     from .rtf_life import rtf_deltensor
     rtf_deltensor(t)
+def renametensor(t:Tensor,new_name:str):
+    assert isinstance(t,Tensor)
+    assert isinstance(new_name,str) and new_name != ''
+    assert t.name is not None and t.name != ''
+
+    from .rtf_life import rtf_renametensor
+    rtf_renametensor(t,new_name)
 
 def load(path:str)->Tensor:
     from .rtf_io import rtf_load
diff --git a/front/py/deepx/nn/functional/reduce.py b/front/py/deepx/nn/functional/reduce.py
index c5998e6a..f2731fd4 100644
--- a/front/py/deepx/nn/functional/reduce.py
+++ b/front/py/deepx/nn/functional/reduce.py
@@ -4,14 +4,11 @@
 from .leaffunc_life import newtensor
 #mean
  
-def mean(a:Tensor,dim:tuple[int]=None,keepdim:bool=False)->Tensor:
+def mean(a:Tensor,dim:tuple[int,...]=None,keepdim:bool=False)->Tensor:
     # 如果dim为None,则对所有维度求平均
     if dim is None:
         dim = list(range(a.ndim))
-    elif isinstance(dim, int):
-        dim = [dim]
-    else:
-        dim = list(dim)
+    dim=list(dim)
     total = 1
     for i in dim:
         if i < 0:
@@ -19,6 +16,6 @@ def mean(a:Tensor,dim:tuple[int]=None,keepdim:bool=False)->Tensor:
         total *= a.shape[i]
     reduceshape=Shape.reduceshape(a.shape,dim,keepdim)
     out=newtensor(reduceshape,dtype=a.dtype)
-    sum(a, dim, keepdim, out)
+    sum(a, tuple(dim), keepdim, out)
     out.div_(total)
     return out
diff --git a/front/py/deepx/nn/functional/rtf_changeshape.py b/front/py/deepx/nn/functional/rtf_changeshape.py
index c7233ab0..37b38bad 100644
--- a/front/py/deepx/nn/functional/rtf_changeshape.py
+++ b/front/py/deepx/nn/functional/rtf_changeshape.py
@@ -28,9 +28,10 @@ def rtf_broadcastTo(t:Tensor,new_shape:tuple[int],out:Tensor,author='miaobyte'):
     ir=DeepxIR("broadcastTo", args, returns,author)
     send(ir)
  
-def rtf_gather(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miaobyte'):
+def rtf_indexselect(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miaobyte'):
+    assert axis>=0 and axis<input.ndim
     args=[Param.tensor(input),Param.tensor(indices),Param.varnum(axis)]
     returns=[Param.tensor(out)]
-    ir=DeepxIR("gather", args, returns,author)
+    ir=DeepxIR("indexselect", args, returns,author)
     send(ir)
  
diff --git a/front/py/deepx/nn/functional/rtf_io.py b/front/py/deepx/nn/functional/rtf_io.py
index 899b6f16..918f63de 100644
--- a/front/py/deepx/nn/functional/rtf_io.py
+++ b/front/py/deepx/nn/functional/rtf_io.py
@@ -23,4 +23,4 @@ def rtf_load(path:str)->Tensor:
     send(ir)
     shapefile=path+'.shape'
     tensor_name,shape,dtype=loadShape(shapefile)
-    return Tensor(shape,dtype,tensor_name)
+    return Tensor(shape.shape,dtype,tensor_name)
diff --git a/front/py/deepx/nn/functional/rtf_life.py b/front/py/deepx/nn/functional/rtf_life.py
index b233df70..014cd505 100644
--- a/front/py/deepx/nn/functional/rtf_life.py
+++ b/front/py/deepx/nn/functional/rtf_life.py
@@ -20,3 +20,9 @@ def rtf_deltensor(t:Tensor):
     returns=[]
     ir=DeepxIR("deltensor", args, returns,'')
     send(ir)
+
+def rtf_renametensor(t:Tensor,new_name:str):
+    args=[Param.tensor(t),Param.varstr(new_name)]
+    returns=[]
+    ir=DeepxIR("renametensor", args, returns,'')
+    send(ir)
diff --git a/front/py/deepx/nn/modules/__init__.py b/front/py/deepx/nn/modules/__init__.py
index 83ce046c..bf433622 100644
--- a/front/py/deepx/nn/modules/__init__.py
+++ b/front/py/deepx/nn/modules/__init__.py
@@ -1,8 +1,9 @@
 from .module import Module, Sequential
 from .linear import Linear
-
+from .sparse import Embedding
 __all__ = [
     "Module",
     "Linear",
-    "Sequential"
+    "Sequential",
+    "Embedding",
     ]
diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py
index 5c7be9b2..6966bebd 100644
--- a/front/py/deepx/nn/modules/module.py
+++ b/front/py/deepx/nn/modules/module.py
@@ -12,8 +12,7 @@ def __init__(self, name: Optional[str] = None):
 
     def _generate_default_name(self) -> str:
         class_name = self.__class__.__name__
-        # 修改正则表达式，保留连续大写字母为一个单词
-        base_name = re.sub(r'(?<![A-Z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', '_', class_name).lower()
+        base_name = class_name.lower()
         if not hasattr(self.__class__, '_instance_counter'):
             self.__class__._instance_counter = 0
         count = self.__class__._instance_counter
@@ -27,14 +26,14 @@ def full_name(self):
         else:
             return f"{self._parent.full_name}.{self._name}"
     
-    def __setattr__(self, name: str, value: Any) -> None:
-        if not name.startswith('_'):
-            if isinstance(value, Module):
-                self.register_module(name, value)
-            elif isinstance(value, Tensor):
-                self.register_parameter(name, value)
-            # 使用父类方法设置属性，避免递归
-        super().__setattr__(name, value)
+    # def __setattr__(self, name: str, value: Any) -> None:
+    #     if not name.startswith('_'):
+    #         if isinstance(value, Module):
+    #             self.register_module(name, value)
+    #         elif isinstance(value, Tensor):
+    #             self.register_parameter(name, value)
+    #         # 使用父类方法设置属性，避免递归
+    #     super().__setattr__(name, value)
         
     def register_module(self, name: str, module: Optional['Module']) -> None:
         if module is None:
@@ -89,25 +88,7 @@ def named_modules(self, memo: Optional[set] = None, prefix: str = ''
             for name, module in self._modules.items():
                 submodule_prefix = f"{prefix}.{name}" if prefix else name
                 yield from module.named_modules(memo, submodule_prefix)
-                
-    # def to(self, device: Union[Device, str]) -> 'Module':
-    #     """移动模块到指定设备"""
-    #     for param in self.parameters():
-    #         param.to(device)
-    #     for buf in self.buffers():
-    #         buf.to(device)
-    #     return self
-    
-    # def train(self, mode: bool = True) -> 'Module':
-    #     self.training = mode
-    #     for module in self.children():
-    #         module.train(mode)
-    #     return self
-    
-    # def eval(self) -> 'Module':
-    #     """设置评估模式"""
-    #     return self.train(False)
-    
+
     def state_dict(self) -> Dict[str, Tensor]:
         """返回模型状态字典"""
         state = {}
diff --git a/front/py/deepx/nn/modules/sparse.py b/front/py/deepx/nn/modules/sparse.py
index a7727a67..ca00f28c 100644
--- a/front/py/deepx/nn/modules/sparse.py
+++ b/front/py/deepx/nn/modules/sparse.py
@@ -93,51 +93,55 @@ class Embedding(Module):
     def __init__(self, 
                  num_embeddings:int, #嵌入字典的大小（词汇表大小）vocab_size，llama=128256
                  embedding_dim:int, #每个嵌入向量的维度,隐藏层大小hidden_size，llama=4096
-                 padding_idx:int=None,
-                 max_norm:float=None, 
-                 norm_type:float=2.0, 
-                 scale_grad_by_freq:bool=False, 
-                 _weight:Tensor=None,dtype=None,
-                 sparse:bool=False):
+                #  padding_idx:int=None,
+                #  max_norm:float=None, 
+                #  norm_type:float=2.0, 
+                #  scale_grad_by_freq:bool=False, 
+                 weight:Tensor=None,dtype='float32',
+                #  sparse:bool=False
+                 ):
         super(Embedding, self).__init__()
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim
        
-        if padding_idx is not None:
-            if padding_idx > 0:
-                assert (
-                    padding_idx < self.num_embeddings
-                ), "Padding_idx必须在num_embeddings范围内"
-            elif padding_idx < 0:
-                assert (
-                    padding_idx >= -self.num_embeddings
-                ), "Padding_idx必须在num_embeddings范围内"
-                padding_idx = self.num_embeddings + padding_idx
-        self.padding_idx = padding_idx
-        self.max_norm = max_norm
-        self.norm_type = norm_type
-        self.scale_grad_by_freq = scale_grad_by_freq
-        if _weight is None:
+        # if padding_idx is not None:
+        #     if padding_idx > 0:
+        #         assert (
+        #             padding_idx < self.num_embeddings
+        #         ), "Padding_idx必须在num_embeddings范围内"
+        #     elif padding_idx < 0:
+        #         assert (
+        #             padding_idx >= -self.num_embeddings
+        #         ), "Padding_idx必须在num_embeddings范围内"
+        #         padding_idx = self.num_embeddings + padding_idx
+        # self.padding_idx = padding_idx
+        # self.max_norm = max_norm
+        # self.norm_type = norm_type
+        # self.scale_grad_by_freq = scale_grad_by_freq
+        if weight is None:
             self.weight = Tensor(shape=(num_embeddings, embedding_dim),dtype=dtype)
+            self.register_parameter('weight', self.weight)
             self.reset_parameters()
         else:
-            assert list(_weight.shape) == [
+            assert list(weight.shape) == [
                 num_embeddings,
                 embedding_dim,
             ], "权重形状与num_embeddings和embedding_dim不匹配"
-            self.weight = _weight
+            self.weight = weight
         
-        self.sparse = sparse
+        # self.sparse = sparse
         
-        if padding_idx is not None:
-            self.weight[padding_idx] = 0
+        # if padding_idx is not None:
+        #     self.weight[padding_idx] = 0
     def reset_parameters(self) -> None:
         self.weight.normal_()  # 正态分布初始化权重
         self._fill_padding_idx_with_zero()  # 填充索引位置归零
 
     def _fill_padding_idx_with_zero(self) -> None:
-        if self.padding_idx is not None:
-           self.weight[self.padding_idx].fill_(0)
+        #TODO
+        pass
+        # if self.padding_idx is not None:
+        #    self.weight[self.padding_idx].fill_(0)
     def forward(self, input:Tensor)->Tensor:
         return self.weight[input]
     
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index 6a12c26a..a25b0963 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@
 import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:9090"):
+    def __init__(self, endpoint: str = "localhost:8080"):
         # 解析endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py
index f5534541..462fc9d3 100644
--- a/front/py/deepx/tensor/changeshape.py
+++ b/front/py/deepx/tensor/changeshape.py
@@ -2,31 +2,36 @@
 from  .tensor import Tensor,tensor_method
 
 @tensor_method
-def reshape(self,*shape,out:Union[Tensor,str]='')->Tensor:
+def reshape(self,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(shape,tuple)
     from deepx.nn.functional import reshape as reshape_func
     result=reshape_func(self,shape,out)
     return result
 
 @tensor_method
-def reshape_(self,*shape)->Tensor:
+def reshape_(self,shape:tuple[int,...])->Tensor:
+    assert isinstance(shape,tuple)
     from deepx.nn.functional import reshape as reshape_func
     result=reshape_func(self,shape,self)
     return result
 
 @tensor_method
-def permute(self,*dimorder,out:Union[Tensor,str]=''):
+def permute(self,dimorder:tuple[int,...],out:Union[Tensor,str]=''):
+    assert isinstance(dimorder,tuple)
     from deepx.nn.functional import permute as permute_func
     result=permute_func(self,dimorder,out)
     return result
 
 @tensor_method
-def permute_(self,*dimorder):
+def permute_(self,dimorder:tuple[int,...])->Tensor:
+    assert isinstance(dimorder,tuple)
     from deepx.nn.functional import permute as permute_func
     permute_func(self,dimorder,self)
     return self
 
 @tensor_method
 def transpose(self,out:Union[Tensor,str]=''):
+    assert isinstance(out,str) or isinstance(out,Tensor)
     from deepx.nn.functional import transpose as transpose_func
     result=transpose_func(self,out)
     return result
@@ -38,26 +43,23 @@ def transpose_(self):
     return self
 
 @tensor_method
-def broadcastshape(self,other:Tensor)->tuple[int]:
+def broadcastshape(self,other:Tensor)->tuple[int,...]:
     from deepx.nn.functional import broadcastshape as broadcastshape_func
     result=broadcastshape_func(self.shape,other.shape)
     return result
 
 @tensor_method
-def broadcastTo(self,shape:tuple[int],out:Union[Tensor,str]='')->Tensor:
+def broadcastTo(self,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor:
     from deepx.nn.functional import broadcastTo as broadcastTo_func
     result=broadcastTo_func(self,shape,out)
     return result
 
 @tensor_method
-def gather(self,indices:Tensor,dim:int,out:Union[Tensor,str]='')->Tensor:
-    final_indices=indices
-    #TODO 当indices不是tensor时，需要转换为tensor
-    if not isinstance(indices,Tensor):
-        raise ValueError("indices must be a Tensor")
-
-    from deepx.nn.functional import gather as gather_func
-    result=gather_func(self,final_indices,dim,out)
+def indexselect(self,index:Tensor,axis:int=0,out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(index,Tensor)
+    gatheraxis=axis%self.ndim
+    from deepx.nn.functional import indexselect as indexselect_func
+    result=indexselect_func(self,index,gatheraxis,out)
     return result
 
 
diff --git a/front/py/deepx/tensor/io.py b/front/py/deepx/tensor/io.py
index 23027ee4..35e3b0f7 100644
--- a/front/py/deepx/tensor/io.py
+++ b/front/py/deepx/tensor/io.py
@@ -11,7 +11,7 @@ def loadShape(path:str)->tuple[str,Shape,str]:
         raise ValueError("文件名必须以.shape结尾")
  
     tensor_name = filename[:-6]  # 移除'.shape'后缀
-    return (tensor_name,Shape(shape['shape']),shape['dtype'])
+    return (tensor_name,Shape(tuple(shape['shape'])),shape['dtype'])
 
 def saveShape(t:Shape,path:str):
     if path.endswith('.shape'):
diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py
index dfc4f5a0..4ce87937 100644
--- a/front/py/deepx/tensor/shape.py
+++ b/front/py/deepx/tensor/shape.py
@@ -1,10 +1,12 @@
 import numpy as np
 from typing import Optional,Union
 class Shape:
-    def __init__(self, 
-                 shape:Optional[Union[tuple[int],list[int],int]]=None):
+    def __init__(self, shape:tuple[int,...]=None):
         # 确保 shape 是元组类型
-        self._shape = tuple(shape)
+        assert isinstance(shape,tuple)
+        self._shape = shape
+        for i in self._shape:
+            assert isinstance(i,int) and i>0
         self._size = int(np.prod(self.shape)) if self.shape else 0
         # 计算 stride（步长）
         self._strides = self._compute_strides()
@@ -17,7 +19,7 @@ def shape(self,dim=None):
         else:
             return self._shape[dim]
         
-    def numel(self):
+    def numel(self)->int:
         """计算张量中所有元素的数量（与torch.Tensor.numel()行为一致）
         
         实现说明：
@@ -27,7 +29,7 @@ def numel(self):
         """
         return self._size  # 在__init__中已预先计算好
 
-    def dim(self):
+    def dim(self)->int:
         """返回张量的维度数（与torch.Tensor.dim()行为一致）
         
         实现说明：
@@ -38,7 +40,7 @@ def dim(self):
         return len(self._shape)
 
     @property
-    def ndim(self):
+    def ndim(self)->int:
         """返回张量的维度数（dim的别名，与PyTorch命名习惯保持一致）
         
         设计考虑：
@@ -48,7 +50,7 @@ def ndim(self):
         """
         return self.dim()
     
-    def ndimension(self):
+    def ndimension(self)->int:
         """返回张量的维度数（dim的别名，与PyTorch命名习惯保持一致）
         
         设计考虑：
@@ -59,7 +61,7 @@ def ndimension(self):
         return self.dim()
   
     @property
-    def stride(self):
+    def stride(self)->tuple[int,...]:
         """返回所有维度的步长元组"""
         return self._strides
 
@@ -81,13 +83,13 @@ def __repr__(self):
     def __getitem__(self, idx):
         return self.shape[idx]
     
-    def __len__(self):
+    def __len__(self)->int:
         return len(self.shape)
     
     def __iter__(self):
         return iter(self.shape)
         
-    def __eq__(self, other):
+    def __eq__(self, other)->bool:
         """比较两个形状是否相等"""
         if isinstance(other, Shape):
             return self.shape == other.shape
@@ -100,7 +102,7 @@ def __hash__(self):
         return hash(self.shape)
 
     @classmethod
-    def total_size(cls,other:tuple[int])->int:
+    def total_size(cls,other:tuple[int,...])->int:
         total_size=1
         for i in other:
             total_size*=i
@@ -108,9 +110,9 @@ def total_size(cls,other:tuple[int])->int:
     
 
     @classmethod
-    def transpose(cls,shape:tuple[int],dimorder:list[int]=None):
+    def transpose(cls,shape:tuple[int,...],dimorder:tuple[int,...]=None)->tuple[int,...]:
         if dimorder is None:
-            dimorder=list(range(len(shape)))
+            dimorder=tuple(range(len(shape)))
         return Shape(tuple(shape[i] for i in dimorder))
     
     @classmethod
@@ -126,7 +128,7 @@ def matmul(cls,shape:tuple[int],other:tuple[int])->tuple[int]:
         return tuple(resultshape)
     
     @classmethod
-    def broadcast_shape(cls,shape_a: tuple[int], shape_b: tuple[int]) -> tuple[int]:
+    def broadcast_shape(cls,shape_a: tuple[int,...], shape_b: tuple[int,...]) -> tuple[int,...]:
         """计算两个形状的广播后形状"""
         # 获取形状的长度
         len_a, len_b = len(shape_a), len(shape_b)
@@ -159,7 +161,7 @@ def broadcast_shape(cls,shape_a: tuple[int], shape_b: tuple[int]) -> tuple[int]:
 
  
     @classmethod
-    def reduceshape(cls,shape:tuple[int],dim:list[int],keepdim:bool)->tuple[int]:
+    def reduceshape(cls,shape:tuple[int,...],dim:tuple[int,...],keepdim:bool)->tuple[int,...]:
         ndim = len(shape)
         # 处理负数维度
         normalized_dim = [d % ndim for d in dim]
@@ -172,4 +174,9 @@ def reduceshape(cls,shape:tuple[int],dim:list[int],keepdim:bool)->tuple[int]:
         else:
             return tuple(s for i, s in enumerate(shape)
                         if i not in unique_dim)
+    
+    # Result shape: input dims before gatheraxis + index_shape + input dims after gatheraxis (cf. tf.gather, https://www.tensorflow.org/api_docs/python/tf/gather)
+    @classmethod
+    def indexselectshape(cls,input_shape:tuple[int,...],index_shape:tuple[int,...],gatheraxis:int)->tuple[int,...]:
+        return input_shape[:gatheraxis]+index_shape+input_shape[gatheraxis+1:]
     
\ No newline at end of file
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index 69a328e9..7929ba7f 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -9,8 +9,13 @@
 class Tensor:
 
     #life
-    def __init__(self,shape:Union[tuple[int],list[int],Shape],dtype:str='float32',name:str=None):
+    def __init__(self,shape:tuple[int,...],dtype:str='float32',name:str=None):
         # name
+        assert isinstance(name,str) or name is None
+        assert isinstance(shape,tuple)
+        for i in shape:
+            assert isinstance(i,int) and i>0
+        assert isinstance(dtype,str)
 
         self._name = name
         if name is None or name =='':
@@ -45,8 +50,12 @@ def name(self):
         return self._name
     @name.setter
     def name(self,name:str):
-        self._name=name
+        assert isinstance(name,str) and name != ''
+        assert self.name is not None and self.name != ''
 
+        from deepx.nn.functional import renametensor
+        renametensor(self,name)
+        self._name = name
     # shape
     @property
     def shape(self,dim:int=None):
@@ -120,8 +129,8 @@ def __matmul__(self, other:Union[Number,'Tensor']):
         return self.matmul(other)
 
     #gather
-    def __getitem__(self, indices:'Tensor'):
-        return self.gather(indices)
+    def __getitem__(self, index:'Tensor'):
+        return self.indexselect(index)
 
     #shape操作
     @property
diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py
index e69de29b..ddf52f44 100644
--- a/front/py/examples/3_module/1_embedding.py
+++ b/front/py/examples/3_module/1_embedding.py
@@ -0,0 +1,54 @@
+from transformers import AutoTokenizer
+print()
+def init_tokenizer(model_path):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer.pad_token = tokenizer.eos_token
+    return tokenizer
+
+tokenizer = init_tokenizer("/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+
+def tokenize_text(text, tokenizer):
+    tokens = tokenizer(text, return_tensors="pt").input_ids
+    import torch
+    # 处理超出词汇表范围的token
+    if torch.any(tokens >= tokenizer.vocab_size):
+        # 获取UNK token ID，如果没有则使用0
+        unk_token_id = tokenizer.unk_token_id if hasattr(tokenizer, 'unk_token_id') and tokenizer.unk_token_id is not None else 0
+        # 替换所有超出范围的token为UNK
+        tokens = torch.where(tokens < tokenizer.vocab_size, tokens, torch.tensor(unk_token_id, device=tokens.device))
+    return tokens
+
+dir="/home/lipeng/model/deepxmodel/embeddingtest/"
+ 
+############-------PyTorch-------################
+import torch.nn as nn
+
+# 创建输入
+text = "这是一个测试文本，用于演示嵌入层的使用。"
+torch_input = tokenize_text(text, tokenizer)
+from deepx.nn.functional import save_torch
+save_torch(torch_input,dir+'input')
+print(torch_input)
+# 创建网络
+torch_net = nn.Embedding(tokenizer.vocab_size, 4096)
+save_torch(torch_net.weight,dir+'weight')
+# 前向传播
+torch_output = torch_net(torch_input)
+print()
+print(torch_output.shape)
+print(torch_output)
+
+
+############-------DEEPX-------################
+from deepx.nn.modules import Embedding
+from deepx.nn.functional import load
+
+input=load(dir+'input')
+input.print()
+
+weight=load(dir+'weight')
+weight.name='embedding_0.weight'
+net = Embedding(tokenizer.vocab_size, 4096,weight=weight)
+out=net.forward(input)
+out.print()
+

From 33ea00f861abbf5c891223df84ea482b974d829c Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Wed, 23 Apr 2025 16:39:23 +0800
Subject: [PATCH 4/5] =?UTF-8?q?cuda:load,save,indexselect=EF=BC=8C?=
 =?UTF-8?q?=E5=92=8Cpytorch=E5=AF=B9=E9=BD=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/excuter/op-mem-cuda/list.md               |  35 ++---
 excuter/cpp-common/src/deepx/shape.cpp        |   3 +
 excuter/cpp-common/src/deepx/shape.hpp        |   3 +-
 excuter/op-mem-cuda/src/client/tfs.cpp        |  11 +-
 .../deepx/tensorfunc/changeshape_miaobyte.cu  | 140 ++++++++++--------
 .../deepx/tensorfunc/changeshape_miaobyte.cuh |  17 ++-
 .../deepx/tensorfunc/changeshape_miaobyte.hpp |  19 +--
 .../src/deepx/tensorfunc/io_miaobyte.hpp      |  14 +-
 .../deepx/tensorfunc/tensorlife_miaobyte.hpp  |   3 +
 .../op-mem-cuda/src/deepx/tf/changeshape.hpp  |  82 +++++-----
 .../op-mem-cuda/src/deepx/tf/tensorlife.hpp   |  34 +++++
 front/py/deepx/scheduler/client/udpconn.py    |   2 +-
 12 files changed, 212 insertions(+), 151 deletions(-)

diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index 2a73fd50..aef5e0a8 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -13,10 +13,11 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| copytensor |  none  | copytensor(tensor<any> src, tensor<any> dst)->() | T2.data = T1.data | copytensor(tensor<any> src, tensor<any> dst)->() |
+| renametensor |  none  | renametensor(tensor<any> t, var<string> new_name)->() | rename T1 to T2 | renametensor(tensor<any> t, var<string> new_name)->() |
 | newtensor |  none  | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
 | newtensor |  none  | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
 | deltensor |  none  | deltensor(tensor<any> t)->() | del T1 | deltensor(tensor<any> t)->() |
+| copytensor |  none  | copytensor(tensor<any> src, tensor<any> dst)->() | T2.data = T1.data | copytensor(tensor<any> src, tensor<any> dst)->() |
 
 ### io
 
@@ -31,9 +32,9 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | normal | miaobyte | normal(tensor<any> t, var<any> mean, var<any> stddev, var<int32> seed)->() | normal(T1,mean,stddev,seed) | normal(tensor<any> t, var<any> mean, var<any> stddev, var<int32> seed)->() |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
 
 ### elementwise
@@ -46,10 +47,10 @@
 | min | miaobyte | min(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=min(T1, T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | maxscalar | miaobyte | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=max(T1, scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | tan | miaobyte | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) | T3=tan(T1) | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) |
-| divscalar | miaobyte | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=scalar/T1 | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | sin | miaobyte | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=sin(T1) | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
-| add | cublas | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
-| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
+| divscalar | miaobyte | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=scalar/T1 | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
+| log | miaobyte | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
+| addscalar | miaobyte | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
 | greater | miaobyte | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) | mask=compare(T1, T2) | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
 | lessscalar | miaobyte | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) | mask=compare(T1, scalar) | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
 | cos | miaobyte | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
@@ -58,8 +59,8 @@
 | minscalar | miaobyte | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | rdivscalar | miaobyte | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
 | rpowscalar | miaobyte | rpowscalar(var<float64|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) | T3=pow(scalar, T1) | rpowscalar(var<float64|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
-| log | miaobyte | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
-| addscalar | miaobyte | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
+| add | cublas | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
+| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | sqrt | miaobyte | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=sqrt(T1) | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
 | subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
@@ -78,22 +79,22 @@
 |-----------|--------|------------|--------------|----------------|
 | matmul | cublas | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 
+### reduce
+
+| Operation | Author | Func Def | Math Formula | IR Instruction |
+|-----------|--------|------------|--------------|----------------|
+| prod | miaobyte | prod(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
+| reducemax | miaobyte | reducemax(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
+| sum | miaobyte | sum(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
+| reducemin | miaobyte | reducemin(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
+
 ### changeshape
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
-| gather | miaobyte | gather(tensor<any> A, tensor<int64|int32> indices, var<int32> axis)->(tensor<any> B) | T2 = T1.gather(indices=[1,2], axis=1) | gather(tensor<any> A, tensor<int64|int32> indices, var<int32> axis)->(tensor<any> B) |
+| indexselect | miaobyte | indexselect(tensor<any> A, tensor<int64|int32> indices, var<int32> axis)->(tensor<any> B) | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor<any> A, tensor<int64|int32> indices, var<int32> axis)->(tensor<any> B) |
 | broadcastTo | miaobyte | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) |
 | concat | miaobyte | concat(listtensor<any> tensors, var<int32> dim)->(tensor<any> result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor<any> tensors, var<int32> dim)->(tensor<any> result) |
 | transpose | miaobyte | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) |
 | reshape | miaobyte | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) | T1.reshape(shape)->T2 | reshape(tensor<any> A, vector<int32> shape)->(tensor<any> B) |
 
-### reduce
-
-| Operation | Author | Func Def | Math Formula | IR Instruction |
-|-----------|--------|------------|--------------|----------------|
-| reducemax | miaobyte | reducemax(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
-| prod | miaobyte | prod(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
-| sum | miaobyte | sum(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
-| reducemin | miaobyte | reducemin(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
-
diff --git a/excuter/cpp-common/src/deepx/shape.cpp b/excuter/cpp-common/src/deepx/shape.cpp
index cedca724..fa207e98 100644
--- a/excuter/cpp-common/src/deepx/shape.cpp
+++ b/excuter/cpp-common/src/deepx/shape.cpp
@@ -13,6 +13,9 @@ namespace deepx
     {
         setshape(shape, dim);
     }
+    int64_t Shape::bytes() const{ // size times (precision_bits(dtype) / 8); presumably element count times bytes per element
+        return size * (precision_bits(dtype) / 8); // NOTE(review): integer division, so a dtype narrower than 8 bits would yield 0 here; confirm no such dtype reaches this
+    }
     void Shape::setshape(const int *shape, int dim)
     {
         this->shape.resize(dim);
diff --git a/excuter/cpp-common/src/deepx/shape.hpp b/excuter/cpp-common/src/deepx/shape.hpp
index ff4fea69..655dce38 100644
--- a/excuter/cpp-common/src/deepx/shape.hpp
+++ b/excuter/cpp-common/src/deepx/shape.hpp
@@ -44,7 +44,8 @@ namespace deepx
         std::vector<int> shape;
         std::vector<int> strides;
         int dim;
-        int size;
+        int64_t size;
+        int64_t bytes() const;
 
         Shape() = default;
         Shape(const std::vector<int> &shape);
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index 44fbcfc6..4e9095ca 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -65,6 +65,13 @@ namespace deepx::tf
                                                              Param("t", DataCategory::Tensor, Precision::Any),
                                                          }),
                                                      vector<Param>()));
+        //renametensor
+        tffactory.add_tf(std::make_shared<RenameTensor>(vector<Param>(
+                                                         {
+                                                              Param("t", DataCategory::Tensor, Precision::Any),
+                                                             Param("new_name", DataCategory::Var, Precision::String),
+                                                        }),
+                                                     vector<Param>()));
     }
 
     // init
@@ -479,8 +486,8 @@ namespace deepx::tf
                 {
                     Param("B", DataCategory::Tensor, Precision::Any),
                 })));
-        // gather
-        tffactory.add_tf(std::make_shared<Gather<miaobyte>>(vector<Param>(
+        // indexselect
+        tffactory.add_tf(std::make_shared<IndexSelect<miaobyte>>(vector<Param>(
                 {
                     Param("A", DataCategory::Tensor, Precision::Any),
                     Param("indices", DataCategory::Tensor, Precision::Int64|Precision::Int32),
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
index 130fc80d..bc97ba5f 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
@@ -366,32 +366,38 @@ namespace deepx::tensorfunc
                                              const BroadcastMap *broadcastMap,
                                              int8_t *output, const int *outputStrides, const int outputDim, const int outputlen);
 
-    // gather
-
+    // indexselect
     template <typename GatherAxisT>
-    __host__ __device__ void fromGatherIndices(
-        const int *output_indices,                                                   // 输出张量的索引
-        const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, // indices是tensor
-        const int gatherAxis,                                                        // gather操作的轴
-        int *input_indices, const int inputDim)
-    {
-
-        for (int i = 0; i < inputDim; ++i)
+    __host__ __device__ void fromIndexselectIndices( // maps an output multi-index to the matching input multi-index
+    const int *output_indices,const int outputDim,  // multi-dim index into the output tensor and its rank
+    const GatherAxisT *indices,const int *indicesStrides,const int indicesDim, // indices is a tensor
+    int *index_indices,        // scratch: receives the multi-dim index into the indices tensor
+    const int gatherAxis,      // input axis that is selected along
+    int *input_indices,const int inputDim){ // out: multi-dim index into the input tensor
+
+        for (int i = 0; i < gatherAxis; ++i) // dims before the axis are copied unchanged
         {
             input_indices[i] = output_indices[i];
         }
-
+        for (int i = gatherAxis; i < gatherAxis + indicesDim; ++i) // output dims spanned by the indices tensor
+        {
+            index_indices[i - gatherAxis] = output_indices[i];
+        }
         // 使用indices张量中对应位置的值来替换gatherAxis维度的索引
-        int indices_idx = linearAt(indicesStrides, indicesDim, output_indices);
+        int indices_idx = linearAt(indicesStrides, indicesDim, index_indices);
         input_indices[gatherAxis] = indices[indices_idx];
+        for (int i = gatherAxis +indicesDim; i < outputDim; ++i) // trailing dims: output dim i is input dim i-indicesDim+1
+        {
+            input_indices[i - indicesDim + 1] = output_indices[i];
+        }
     }
 
     template <int DIM, typename T, typename GatherAxisT>
-    __global__ void gather_kernel(
+    __global__ void indexselect_kernel(
         const T *input, const int *inputStrides, const int inputDim,
         const GatherAxisT *indices, const int *indicesStrides, const int indicesDim,
         const int gatherAxis,
-        T *output, const int outputlen)
+        T *output, const int *outputStrides, const int outputDim, const int outputlen)
     {
         const int grid_stride = gridDim.x * blockDim.x;
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -399,26 +405,28 @@ namespace deepx::tensorfunc
         {
             // 输出索引
             int output_indices[DIM];
-            linearTo(indicesStrides, indicesDim, output_indices, thread_id);
+            linearTo(outputStrides, outputDim, output_indices, thread_id);
 
             // 输入索引
+            int index_indices[DIM];
             int input_indices[DIM];
-            fromGatherIndices(output_indices,
+            fromIndexselectIndices(output_indices,outputDim,
                               indices, indicesStrides, indicesDim,
+                              index_indices,
                               gatherAxis,
                               input_indices, inputDim);
             int inputIdx = linearAt(inputStrides, inputDim, input_indices);
-            int outputIdx = linearAt(indicesStrides, indicesDim, output_indices);
+            int outputIdx = linearAt(outputStrides, outputDim, output_indices);
             output[outputIdx] = input[inputIdx];
         }
     }
 
     template <typename T, typename GatherAxisT>
-    void launch_gather(
+    void launch_indexselect(
         const T *input, const int *inputStrides, const int inputDim,
         const GatherAxisT *indices, const int *indicesStrides, const int indicesDim,
         const int gatherAxis,
-        T *output, const int outputlen)
+        T *output, const int *outputStrides, const int outputDim, const int outputlen)
     {
 
         auto [numBlocks, blockSize] = BestDims(outputlen);
@@ -428,44 +436,50 @@ namespace deepx::tensorfunc
 
         // input
         cudaVector<int> inputStrides_d(inputStrides, inputDim, cudaMemcpyHostToDevice);
+
+        // output
+        cudaVector<int> outputStrides_d(outputStrides, outputDim, cudaMemcpyHostToDevice);
+
+        //TODO this may waste registers, but adding more template parameters would make the template instantiations hard to manage
         int dim=std::max(inputDim,indicesDim);
+        dim=std::max(dim,outputDim);
         switch (dim)
         {
         case 1:
-            gather_kernel<1, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<1, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 2:
-            gather_kernel<2, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<2, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 3:
-            gather_kernel<3, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<3, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 4:
-            gather_kernel<4, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<4, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 5:
-            gather_kernel<5, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<5, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 6:
-            gather_kernel<6, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<6, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 7:
-            gather_kernel<7, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<7, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 8:
-            gather_kernel<8, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<8, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 9:
-            gather_kernel<9, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<9, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 10:
-            gather_kernel<10, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<10, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 11:
-            gather_kernel<11, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<11, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         case 12:
-            gather_kernel<12, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen);
+            indexselect_kernel<12, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen);
             break;
         default:
             throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
@@ -476,71 +490,71 @@ namespace deepx::tensorfunc
             throw std::runtime_error("cuda error");
         }
     }
-    template void launch_gather<double, int64_t>(const double *input, const int *inputStrides, const int inputDim,
+    template void launch_indexselect<double, int64_t>(const double *input, const int *inputStrides, const int inputDim,
                                                  const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                  const int gatherAxis,
-                                                 double *output, const int outputlen);
-    template void launch_gather<float, int64_t>(const float *input, const int *inputStrides, const int inputDim,
+                                                 double *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<float, int64_t>(const float *input, const int *inputStrides, const int inputDim,
                                                 const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                 const int gatherAxis,
-                                                float *output, const int outputlen);
-    template void launch_gather<nv_bfloat16, int64_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim,
+                                                float *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<nv_bfloat16, int64_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim,
                                                       const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                       const int gatherAxis,
-                                                      nv_bfloat16 *output, const int outputlen);
-    template void launch_gather<__half, int64_t>(const __half *input, const int *inputStrides, const int inputDim,
+                                                      nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<__half, int64_t>(const __half *input, const int *inputStrides, const int inputDim,
                                                  const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                  const int gatherAxis,
-                                                 __half *output, const int outputlen);
-    template void launch_gather<int64_t, int64_t>(const int64_t *input, const int *inputStrides, const int inputDim,
+                                                 __half *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int64_t, int64_t>(const int64_t *input, const int *inputStrides, const int inputDim,
                                                   const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                   const int gatherAxis,
-                                                  int64_t *output, const int outputlen);
-    template void launch_gather<int32_t, int64_t>(const int32_t *input, const int *inputStrides, const int inputDim,
+                                                  int64_t *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int32_t, int64_t>(const int32_t *input, const int *inputStrides, const int inputDim,
                                                   const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                   const int gatherAxis,
-                                                  int32_t *output, const int outputlen);
-    template void launch_gather<int16_t, int64_t>(const int16_t *input, const int *inputStrides, const int inputDim,
+                                                  int32_t *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int16_t, int64_t>(const int16_t *input, const int *inputStrides, const int inputDim,
                                                   const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                   const int gatherAxis,
-                                                  int16_t *output, const int outputlen);
-    template void launch_gather<int8_t, int64_t>(const int8_t *input, const int *inputStrides, const int inputDim,
+                                                  int16_t *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int8_t, int64_t>(const int8_t *input, const int *inputStrides, const int inputDim,
                                                  const int64_t *indices, const int *indicesStrides, const int indicesDim,
                                                  const int gatherAxis,
-                                                 int8_t *output, const int outputlen);
+                                                 int8_t *output, const int *outputStrides, const int outputDim, const int outputlen);
 
-    template void launch_gather<double, int32_t>(const double *input, const int *inputStrides, const int inputDim,
+    template void launch_indexselect<double, int32_t>(const double *input, const int *inputStrides, const int inputDim,
                                                  const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                  const int gatherAxis,
-                                                 double *output, const int outputlen);
-    template void launch_gather<float, int32_t>(const float *input, const int *inputStrides, const int inputDim,
+                                                 double *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<float, int32_t>(const float *input, const int *inputStrides, const int inputDim,
                                                 const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                 const int gatherAxis,
-                                                float *output, const int outputlen);
-    template void launch_gather<nv_bfloat16, int32_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim,
+                                                float *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<nv_bfloat16, int32_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim,
                                                       const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                       const int gatherAxis,
-                                                      nv_bfloat16 *output, const int outputlen);
-    template void launch_gather<__half, int32_t>(const __half *input, const int *inputStrides, const int inputDim,
+                                                      nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<__half, int32_t>(const __half *input, const int *inputStrides, const int inputDim,
                                                  const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                  const int gatherAxis,
-                                                 __half *output, const int outputlen);
-    template void launch_gather<int64_t, int32_t>(const int64_t *input, const int *inputStrides, const int inputDim,
+                                                 __half *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int64_t, int32_t>(const int64_t *input, const int *inputStrides, const int inputDim,
                                                   const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                   const int gatherAxis,
-                                                  int64_t *output, const int outputlen);
-    template void launch_gather<int32_t, int32_t>(const int32_t *input, const int *inputStrides, const int inputDim,
+                                                  int64_t *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int32_t, int32_t>(const int32_t *input, const int *inputStrides, const int inputDim,
                                                   const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                   const int gatherAxis,
-                                                  int32_t *output, const int outputlen);
-    template void launch_gather<int16_t, int32_t>(const int16_t *input, const int *inputStrides, const int inputDim,
+                                                  int32_t *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int16_t, int32_t>(const int16_t *input, const int *inputStrides, const int inputDim,
                                                   const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                   const int gatherAxis,
-                                                  int16_t *output, const int outputlen);
-    template void launch_gather<int8_t, int32_t>(const int8_t *input, const int *inputStrides, const int inputDim,
+                                                  int16_t *output, const int *outputStrides, const int outputDim, const int outputlen);
+    template void launch_indexselect<int8_t, int32_t>(const int8_t *input, const int *inputStrides, const int inputDim,
                                                  const int32_t *indices, const int *indicesStrides, const int indicesDim,
                                                  const int gatherAxis,
-                                                 int8_t *output, const int outputlen);
+                                                 int8_t *output, const int *outputStrides, const int outputDim, const int outputlen);
 }
 
  
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
index 2047a636..26c40851 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
@@ -43,26 +43,27 @@ namespace deepx::tensorfunc
                             const BroadcastMap *broadcastMap,
                             T *output, const int *outputStrides, const int outputDim, const int outputlen);
 
-    // gather
+    // indexselect
      template <typename GatherAxisT>
-    __host__ __device__ void fromGatherIndices(
-    const int *output_indices,  // 输出张量的索引
+    __host__ __device__ void fromIndexselectIndices(
+    const int *output_indices,const int outputDim,  // 输出张量的索引
     const GatherAxisT *indices,const int *indicesStrides,const int indicesDim, //indices是tensor
+    int *index_indices,
     const int gatherAxis,      // gather操作的轴
     int *input_indices,const int inputDim);       // 计算出的输入张量索引  
 
     template <int DIM, typename T,typename GatherAxisT>
-    __global__ void gather_kernel(
+    __global__ void indexselect_kernel(
         const T *input, const int *inputStrides, const int inputDim,
-        const GatherAxisT *indices,const int *indicesStrides,const int indicesDim,
+        const GatherAxisT *index,const int *indexStrides,const int indexDim,
         const int gatherAxis,
-        T *output,const int outputlen);//output 和input的shape相同,所以共享strides,dim,len
+        T *output,const int *outputStrides,const int outputDim,const int outputlen);
 
     template <typename T,typename GatherAxisT>
-    void launch_gather(
+    void launch_indexselect(
         const T *input, const int *inputStrides, const int inputDim, 
         const GatherAxisT *indices,const int *indicesStrides,const int indicesDim,
         const int gatherAxis,
-        T *output,const  int outputlen);//output 和input的shape相同,所以共享strides,dim,len
+        T *output,const int *outputStrides,const int outputDim,const int outputlen);
 };
 #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
index 8fb43a76..a49439b9 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
@@ -117,21 +117,22 @@ namespace deepx::tensorfunc
         }
     };
 
-    //gather
+    //indexselect
     template <typename T,typename GatherAxisT>
-    struct gatherDispatcher<miaobyte, T,GatherAxisT>
+    struct indexselectDispatcher<miaobyte, T,GatherAxisT>
     {
-        static void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output){
-            vector<int> input_gatherShape = indices.shape.shape;
-            if (input_gatherShape.empty()||input_gatherShape!=output.shape.shape)
+        static void indexselect(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output){
+            int gatherAxis = axis < 0 ? input.shape.dim + axis : axis;
+            vector<int> gatherShape = indexselectShape(input.shape.shape, indices.shape.shape, gatherAxis);
+            if (gatherShape.empty()||gatherShape!=output.shape.shape)
             {
-                throw TensorShapeError("Gather shape mismatch");
+                throw TensorShapeError("Indexselect shape mismatch");
             }
-            int gatherAxis = axis < 0 ? input.shape.dim + axis : axis;
-            launch_gather<T,GatherAxisT>(input.data, input.shape.strides.data(), input.shape.dim,
+            
+            launch_indexselect<T,GatherAxisT>(input.data, input.shape.strides.data(), input.shape.dim,
                             indices.data, indices.shape.strides.data(), indices.shape.dim,
                             gatherAxis,
-                            output.data,output.shape.size);//output和indices的shape相同，共享strides等
+                            output.data,output.shape.strides.data(),output.shape.dim,output.shape.size);
         }
     };
 }
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
index 3a2a4032..00d338fe 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp
@@ -21,8 +21,7 @@ namespace deepx::tensorfunc
     {
         static void print(const Tensor<T> &t, const std::string &f = "")
         {
-            int bytes = precision_bits(t.shape.dtype) / 8;
-            size_t total_bytes = t.shape.size * bytes;
+            int64_t total_bytes = t.shape.bytes();
 
             // 统一分配CPU内存
             unsigned char *host_data = new unsigned char[total_bytes];
@@ -48,8 +47,7 @@ namespace deepx::tensorfunc
     {
         static void print(const Tensor<half> &t, const std::string &f = "")
         {
-            int bytes = precision_bits(t.shape.dtype) / 8;
-            size_t total_bytes = t.shape.size * bytes;
+            int64_t total_bytes = t.shape.bytes();
 
             // 统一分配CPU内存
             unsigned char *host_data = new unsigned char[total_bytes];
@@ -90,8 +88,7 @@ namespace deepx::tensorfunc
     {
         static void print(const Tensor<nv_bfloat16> &t, const std::string &f = "")
         {
-            int bytes = precision_bits(t.shape.dtype) / 8;
-            size_t total_bytes = t.shape.size * bytes;
+            int64_t total_bytes = t.shape.bytes();
 
             // 统一分配CPU内存
             unsigned char *host_data = new unsigned char[total_bytes];
@@ -137,8 +134,7 @@ namespace deepx::tensorfunc
         shape_fs.close();
 
         // 保存data
-        int bytes = precision_bits(tensor.shape.dtype) / 8;
-        size_t total_bytes = tensor.shape.size * bytes;
+        int64_t total_bytes = tensor.shape.bytes();
 
         // 统一分配CPU内存
         unsigned char *host_data = new unsigned char[total_bytes];
@@ -190,7 +186,7 @@ namespace deepx::tensorfunc
         std::ifstream data_fs(datapath, std::ios::binary);
         data_fs.seekg(0, std::ios::end);
         std::streamsize fileSize = data_fs.tellg();
-        std::streamsize expectedSize = shape.size * precision_bits(shape.dtype) / 8;
+        std::streamsize expectedSize = shape.bytes();
 
         if (fileSize != expectedSize)
         {
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
index fdcb0f17..8e776a14 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp
@@ -52,5 +52,8 @@ namespace deepx::tensorfunc
         dst.shape=src.shape;
         dst.copyer(src.data, dst.data, src.shape.size);
     }
+
+    //rename
+
 }
 #endif // DEEPX_TENSORFUNC_TENSORLIFE_MIAOBYTE_HPP
diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
index d869a435..8b4604f9 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
@@ -344,14 +344,14 @@ namespace deepx::tf
         }
     };
 
-    // gather
+    // indexselect
     template <typename Author>
-    class Gather : public TF
+    class IndexSelect : public TF
     {
     public:
-        Gather(const vector<Param> &args, const vector<Param> &returns)
+        IndexSelect(const vector<Param> &args, const vector<Param> &returns)
         {
-            this->name = "gather";
+            this->name = "indexselect";
             this->metadata.author = Author::name();
             this->tftype = "changeshape";
             this->args = args;
@@ -360,11 +360,11 @@ namespace deepx::tf
 
         string math_formula() const override
         {
-            return "T2 = T1.gather(indices=[1,2], axis=1)";
+            return "T2 = T1.indexselect(index=[1,2], axis=1)";
         }
         shared_ptr<TF> clone() const override
         {
-            return make_shared<Gather<Author>>(*this);
+            return make_shared<IndexSelect<Author>>(*this);
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
@@ -377,10 +377,10 @@ namespace deepx::tf
                 error = "output_type " + precision_str(output_type) + " or input_type " + precision_str(input_type) + " must be the same";
                 return 1;
             }
-            Precision indices_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
-            if (indices_type != Precision::Int64 && indices_type != Precision::Int32)
+            Precision index_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            if (index_type != Precision::Int64 && index_type != Precision::Int32)
             {
-                error = "indices_type " + precision_str(indices_type) + " only support " + precision_str(Precision::Int64) + " or " + precision_str(Precision::Int32);
+                error = "index_type " + precision_str(index_type) + " only support " + precision_str(Precision::Int64) + " or " + precision_str(Precision::Int32);
                 return 1;
             }
 
@@ -388,97 +388,97 @@ namespace deepx::tf
             {
             case Precision::Float64:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, double, int64_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
+                    indexselect<Author, double, int64_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, double, int32_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
+                    indexselect<Author, double, int32_t>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Float32:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, float, int64_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
+                    indexselect<Author, float, int64_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, float, int32_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
+                    indexselect<Author, float, int32_t>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Float16:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, half, int64_t>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue));
+                    indexselect<Author, half, int64_t>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, half, int32_t>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue));
+                    indexselect<Author, half, int32_t>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::BFloat16:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, nv_bfloat16, int64_t>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                    indexselect<Author, nv_bfloat16, int64_t>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, nv_bfloat16, int32_t>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                    indexselect<Author, nv_bfloat16, int32_t>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Int64:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, int64_t, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                    indexselect<Author, int64_t, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, int64_t, int32_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                    indexselect<Author, int64_t, int32_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Int32:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, int32_t, int64_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                    indexselect<Author, int32_t, int64_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, int32_t, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                    indexselect<Author, int32_t, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Int16:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, int16_t, int64_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                    indexselect<Author, int16_t, int64_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, int16_t, int32_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                    indexselect<Author, int16_t, int32_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue));
                 }
                 break;
             }
             case Precision::Int8:
             {
-                if (indices_type == Precision::Int64)
+                if (index_type == Precision::Int64)
                 {
-                    gather<Author, int8_t, int64_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                    indexselect<Author, int8_t, int64_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
                 }
-                else if (indices_type == Precision::Int32)
+                else if (index_type == Precision::Int32)
                 {
-                    gather<Author, int8_t, int32_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                    indexselect<Author, int8_t, int32_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue));
                 }
                 break;
             }
diff --git a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
index 43041188..620b81e1 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/tensorlife.hpp
@@ -248,5 +248,39 @@ namespace deepx::tf
             return make_shared<DelTensor>(*this);
         }
     };
+
+    // rename: tensorlife op that renames the tensor named args[0] to args[1] through mem->rename_tensor.
+    class RenameTensor : public TF
+    {
+    public:
+        RenameTensor(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "renametensor";
+            this->tftype = "tensorlife";
+            this->args = args;
+            this->returns = returns;
+        }
+        // Returns 0 on success; 1 if fewer than two args were given (error is set) or checktensors fails for args[0].
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            if (this->args.size() < 2)
+            {
+                error = "renametensor requires 2 args: old_name and new_name";
+                return 1;
+            }
+            string old_name = this->args[0].textvalue;
+            if (!checktensors({old_name}, mem, error)) return 1; // the tensor to rename must pass checktensors first
+            mem->rename_tensor(old_name, this->args[1].textvalue);
+            return 0;
+        }
+        string math_formula() const override
+        {
+            return "rename T1 to T2";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<RenameTensor>(*this);
+        }
+    };
 }
 #endif // DEEPX_TF_TENSORLIFE_HPP
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index a25b0963..6a12c26a 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@
 import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # 解析endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)

From ab1269ccccc4a524873f935f9f6efcc0915e3610 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Wed, 23 Apr 2025 18:19:45 +0800
Subject: [PATCH 5/5] =?UTF-8?q?nn.module:=E7=A7=BB=E9=99=A4=E8=87=AA?=
 =?UTF-8?q?=E5=8A=A8=E6=B3=A8=E5=86=8C=EF=BC=8C=E6=94=B9=E4=B8=BA=E6=89=8B?=
 =?UTF-8?q?=E5=8A=A8=E6=B3=A8=E5=86=8Ctensor=E3=80=81module?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/language.md                               |  52 ++
 .../nn/functional/leaffunc_changeshape.py     |   2 +-
 front/py/deepx/nn/modules/linear.py           |   6 +-
 front/py/deepx/nn/modules/module.py           |  14 +-
 front/py/deepx/nn/modules/sparse.torch.py     | 512 ------------------
 front/py/deepx/tensor/tensor.py               |  11 +-
 .../py/examples/4_transformer/llama/1_rope.py |   0
 todo/deepxpy.md                               |  10 -
 todo/excuter/metal/metal.md                   |   3 -
 todo/excuter/ompsimd/ompsimd.md               |   3 -
 todo/scheduler.md                             |   7 -
 11 files changed, 66 insertions(+), 554 deletions(-)
 create mode 100644 doc/language.md
 delete mode 100644 front/py/deepx/nn/modules/sparse.torch.py
 create mode 100644 front/py/examples/4_transformer/llama/1_rope.py
 delete mode 100644 todo/deepxpy.md
 delete mode 100644 todo/excuter/metal/metal.md
 delete mode 100644 todo/excuter/ompsimd/ompsimd.md
 delete mode 100644 todo/scheduler.md

diff --git a/doc/language.md b/doc/language.md
new file mode 100644
index 00000000..eefde765
--- /dev/null
+++ b/doc/language.md
@@ -0,0 +1,52 @@
+## c++:计算执行器(excuter)
+
+负责实现tensor的具体计算过程,对接硬件如GPU、CPU的simd指令
+
+除了c++，也就只有编译器能干这样的脏活累活了
+
+deepx用到了以下库，它们都是用c++实现的
+
+cblas
+openmp
+c++可以和汇编结合，从而最大程度发挥cpu、gpu寄存器的性能
+
+cuda（CUDA C++）是c++的语言扩展，也可以看作是c++
+
+
+## python:模型前端构建
+python提供了类似pytorch的库，便于调试和验证模型算法
+
+deepx/tensor/
+deepx/nn/deepxIR
+deepx/nn.module/
+deepx/nn.functional
+通过这些库，我们可以快速的搭建一个模型结构
+
+## golang:运维、监控、分布式，深度学习训推自动化的维护者
+
+与pytorch、tensorflow不同，deepx追求分布式过程自动化，因此python侧不参与分布式
+
+deepxctl:提供对deepx体系的所有工具、库、模型、镜像的统一纳管
+
+
+
+## deepxIR
+deepxIR虽然不是独立的编程语言，但它是deepx体系的程序格式标准
+
+excuter所执行的内容，就是deepxir的序列或deepxir计算图
+
+https://github.com/array2d/deepx/blob/main/doc/excuter/op-mem-cuda/list.md
+
+deepxir分为3类
+
+计算：tensor相关的elementwise、changeshape、tensorlife、io、reduce、init等系列
+指令结构:
+queue[deepxIR]，串行指令，有前后执行顺序
+parallel[deepxIR]，可并行的指令，无顺序依赖，可并行
+以上指令为静态图所需的指令，运行过程是确定的。
+
+分支：goto、ifelse
+分支指令会让计算图行为不可预测，也就是动态部分
+
+控制：parse、run等特殊自定义指令
+控制指令是deepx分布式系统内置的各个组件控制指令
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py
index cee587bf..c9501755 100644
--- a/front/py/deepx/nn/functional/leaffunc_changeshape.py
+++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py
@@ -43,7 +43,7 @@ def permute(t:Tensor,
 def transpose(t:Tensor,out:Union[Tensor,str]='')->Tensor:
     dimorder = list(range(t.ndim))
     dimorder[-1],dimorder[-2]=dimorder[-2],dimorder[-1]
-    return permute(t,dimorder,out)
+    return permute(t,tuple(dimorder),out)
 
  
 
diff --git a/front/py/deepx/nn/modules/linear.py b/front/py/deepx/nn/modules/linear.py
index f1eb86e3..c4f05194 100644
--- a/front/py/deepx/nn/modules/linear.py
+++ b/front/py/deepx/nn/modules/linear.py
@@ -21,8 +21,10 @@ def __init__(
         self.in_features = in_features
         self.out_features = out_features
         self.weight = Tensor(shape=(out_features, in_features),dtype=dtype)
+        self.register_parameter("weight",self.weight)
         if bias:
             self.bias = Tensor(shape=(out_features,),dtype=dtype)
+            self.register_parameter("bias",self.bias)
         else:
             self.register_parameter("bias", None)
         self.reset_parameters()
@@ -42,9 +44,9 @@ def forward(self, input: Tensor) -> Tensor:
         y=input @ self.weight.T
         oldshape=y.shape
         if self.bias is not None:
-            y.reshape_(y.shape[1])
+            y.reshape_(tuple(y.shape[1:]))
             y=y+self.bias
-        y.reshape_(*oldshape)
+        y.reshape_(oldshape)
         return y
 
     def extra_repr(self) -> str:
diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py
index 6966bebd..003223ad 100644
--- a/front/py/deepx/nn/modules/module.py
+++ b/front/py/deepx/nn/modules/module.py
@@ -9,7 +9,7 @@ def __init__(self, name: Optional[str] = None):
         self._parent: Optional[Module] = None
         self._modules: OrderedDict[str, Module] = OrderedDict()
         self._parameters: OrderedDict[str, Tensor] = OrderedDict()
-
+ 
     def _generate_default_name(self) -> str:
         class_name = self.__class__.__name__
         base_name = class_name.lower()
@@ -17,6 +17,7 @@ def _generate_default_name(self) -> str:
             self.__class__._instance_counter = 0
         count = self.__class__._instance_counter
         self.__class__._instance_counter += 1
+        return count
         return f"{base_name}_{count}"
  
     @property
@@ -25,16 +26,7 @@ def full_name(self):
             return self._name
         else:
             return f"{self._parent.full_name}.{self._name}"
-    
-    # def __setattr__(self, name: str, value: Any) -> None:
-    #     if not name.startswith('_'):
-    #         if isinstance(value, Module):
-    #             self.register_module(name, value)
-    #         elif isinstance(value, Tensor):
-    #             self.register_parameter(name, value)
-    #         # 使用父类方法设置属性，避免递归
-    #     super().__setattr__(name, value)
-        
+
     def register_module(self, name: str, module: Optional['Module']) -> None:
         if module is None:
             self._modules.pop(name, None)
diff --git a/front/py/deepx/nn/modules/sparse.torch.py b/front/py/deepx/nn/modules/sparse.torch.py
deleted file mode 100644
index a0621543..00000000
--- a/front/py/deepx/nn/modules/sparse.torch.py
+++ /dev/null
@@ -1,512 +0,0 @@
-# mypy: 允许无类型定义的函数
-from typing import Optional
-
-import torch
-from torch import Tensor
-from torch.nn import functional as F, init
-from torch.nn.parameter import Parameter
-
-from .module import Module
-
-
-__all__ = ["Embedding", "EmbeddingBag"]
-
-
-class Embedding(Module):
-    r"""一个存储固定字典和大小的嵌入向量的简单查找表。
-
-    该模块常用于存储词嵌入并通过索引检索它们。
-    模块的输入是索引列表，输出是对应的词嵌入向量。
-
-    参数:
-        num_embeddings (int): 嵌入字典的大小（词汇表大小）
-        embedding_dim (int): 每个嵌入向量的维度
-        padding_idx (int, 可选): 如果指定，该索引位置的条目不参与梯度计算；
-                                  因此，该位置的嵌入向量在训练中不会更新，保持为固定的"填充"向量。
-                                  对于新创建的嵌入层，该位置的嵌入向量默认全零，但可更新为其他值作为填充向量。
-        max_norm (float, 可选): 如果指定，范数超过此值的嵌入向量会被重新归一化到该范数
-        norm_type (float, 可选): 计算max_norm时使用的p范数（默认L2范数，p=2）
-        scale_grad_by_freq (bool, 可选): 如果为True，梯度会按mini-batch中词的频率倒数缩放（默认False）
-        sparse (bool, 可选): 如果为True，权重矩阵的梯度将是稀疏张量（详见注释）
-
-    属性:
-        weight (Tensor): 模块的可学习权重，形状为(num_embeddings, embedding_dim)，
-                         初始化为正态分布N(0, 1)
-
-    形状:
-        - 输入: :math:`(*)`, 任意形状的IntTensor或LongTensor，包含要提取的索引
-        - 输出: :math:`(*, H)`, 其中*是输入形状，H=embedding_dim
-
-    .. 注意::
-        注意只有部分优化器支持稀疏梯度：目前支持的有SGD（CPU和CUDA）、SparseAdam（CPU和CUDA）、Adagrad（CPU）
-
-    .. 注意::
-        当max_norm不为None时，嵌入层的前向传播会原地修改weight张量。
-        由于梯度计算所需的张量不能被原地修改，因此在调用前向传播前对weight进行可微操作时，
-        若max_norm不为None则需要克隆weight。例如::
-
-            n, d, m = 3, 5, 7
-            embedding = nn.Embedding(n, d, max_norm=1.0)
-            W = torch.randn((m, d), requires_grad=True)
-            idx = torch.tensor([1, 2])
-            a = embedding.weight.clone() @ W.t()  # weight必须克隆以保证可微性
-            b = embedding(idx) @ W.t()  # 原地修改weight
-            out = (a.unsqueeze(0) + b.unsqueeze(1))
-            loss = out.sigmoid().prod()
-            loss.backward()
-
-    示例::
-
-        >>> # 包含10个3维张量的嵌入层
-        >>> embedding = nn.Embedding(10, 3)
-        >>> # 2个样本，每个包含4个索引的批次
-        >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
-        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
-        >>> embedding(input)
-        tensor([[[-0.0251, -1.6902,  0.7172],
-                 [-0.6431,  0.0748,  0.6969],
-                 [ 1.4970,  1.3448, -0.9685],
-                 [-0.3677, -2.7265, -0.1685]],
-
-                [[ 1.4970,  1.3448, -0.9685],
-                 [ 0.4362, -0.4004,  0.9400],
-                 [-0.6431,  0.0748,  0.6969],
-                 [ 0.9124, -2.3616,  1.1151]]])
-
-
-        >>> # 带padding_idx的示例
-        >>> embedding = nn.Embedding(10, 3, padding_idx=0)
-        >>> input = torch.LongTensor([[0, 2, 0, 5]])
-        >>> embedding(input)
-        tensor([[[ 0.0000,  0.0000,  0.0000],
-                 [ 0.1535, -2.0309,  0.9315],
-                 [ 0.0000,  0.0000,  0.0000],
-                 [-0.1655,  0.9897,  0.0635]]])
-
-        >>> # 修改填充向量的示例
-        >>> padding_idx = 0
-        >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx)
-        >>> embedding.weight
-        Parameter containing:
-        tensor([[ 0.0000,  0.0000,  0.0000],
-                [-0.7895, -0.7089, -0.0364],
-                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
-        >>> with torch.no_grad():
-        ...     embedding.weight[padding_idx] = torch.ones(3)
-        >>> embedding.weight
-        Parameter containing:
-        tensor([[ 1.0000,  1.0000,  1.0000],
-                [-0.7895, -0.7089, -0.0364],
-                [ 0.6778,  0.5803,  0.2678]], requires_grad=True)
-    """
-
-    __constants__ = [
-        "num_embeddings",
-        "embedding_dim",
-        "padding_idx",
-        "max_norm",
-        "norm_type",
-        "scale_grad_by_freq",
-        "sparse",
-    ]
-
-    num_embeddings: int
-    embedding_dim: int
-    padding_idx: Optional[int]
-    max_norm: Optional[float]
-    norm_type: float
-    scale_grad_by_freq: bool
-    weight: Tensor
-    freeze: bool
-    sparse: bool
-
-    def __init__(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        padding_idx: Optional[int] = None,
-        max_norm: Optional[float] = None,
-        norm_type: float = 2.0,
-        scale_grad_by_freq: bool = False,
-        sparse: bool = False,
-        _weight: Optional[Tensor] = None,
-        _freeze: bool = False,
-        device=None,
-        dtype=None,
-    ) -> None:
-        factory_kwargs = {"device": device, "dtype": dtype}
-        super().__init__()
-        self.num_embeddings = num_embeddings
-        self.embedding_dim = embedding_dim
-        if padding_idx is not None:
-            if padding_idx > 0:
-                assert (
-                    padding_idx < self.num_embeddings
-                ), "Padding_idx必须在num_embeddings范围内"
-            elif padding_idx < 0:
-                assert (
-                    padding_idx >= -self.num_embeddings
-                ), "Padding_idx必须在num_embeddings范围内"
-                padding_idx = self.num_embeddings + padding_idx
-        self.padding_idx = padding_idx
-        self.max_norm = max_norm
-        self.norm_type = norm_type
-        self.scale_grad_by_freq = scale_grad_by_freq
-        if _weight is None:
-            self.weight = Parameter(
-                torch.empty((num_embeddings, embedding_dim), **factory_kwargs),
-                requires_grad=not _freeze,
-            )
-            self.reset_parameters()
-        else:
-            assert list(_weight.shape) == [
-                num_embeddings,
-                embedding_dim,
-            ], "权重形状与num_embeddings和embedding_dim不匹配"
-            self.weight = Parameter(_weight, requires_grad=not _freeze)
-
-        self.sparse = sparse
-
-    def reset_parameters(self) -> None:
-        init.normal_(self.weight)  # 正态分布初始化权重
-        self._fill_padding_idx_with_zero()  # 填充索引位置归零
-
-    def _fill_padding_idx_with_zero(self) -> None:
-        if self.padding_idx is not None:
-            with torch.no_grad():  # 不计算梯度
-                self.weight[self.padding_idx].fill_(0)  # 填充位置设为0
-
-    def forward(self, input: Tensor) -> Tensor:
-        return F.embedding(
-            input,
-            self.weight,
-            self.padding_idx,
-            self.max_norm,
-            self.norm_type,
-            self.scale_grad_by_freq,
-            self.sparse,
-        )
-
-    def extra_repr(self) -> str:
-        s = "{num_embeddings}, {embedding_dim}"
-        if self.padding_idx is not None:
-            s += ", padding_idx={padding_idx}"
-        if self.max_norm is not None:
-            s += ", max_norm={max_norm}"
-            s += ", max_norm={max_norm}"
-        if self.norm_type != 2:
-            s += ", norm_type={norm_type}"
-        if self.scale_grad_by_freq is not False:
-            s += ", scale_grad_by_freq={scale_grad_by_freq}"
-        if self.sparse is not False:
-            s += ", sparse=True"
-        return s.format(**self.__dict__)
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        embeddings,
-        freeze=True,
-        padding_idx=None,
-        max_norm=None,
-        norm_type=2.0,
-        scale_grad_by_freq=False,
-        sparse=False,
-    ):
-        r"""从给定的2维FloatTensor创建Embedding实例。
-
-        参数:
-            embeddings (Tensor): 包含嵌入权重的FloatTensor，
-                第一维作为num_embeddings，第二维作为embedding_dim。
-            freeze (bool, 可选): 若为True，张量在学习过程中不更新，
-                相当于embedding.weight.requires_grad = False。默认True。
-            padding_idx (int, 可选): 同模块初始化文档说明。
-            max_norm (float, 可选): 同模块初始化文档说明。
-            norm_type (float, 可选): 同模块初始化文档说明，默认2。
-            scale_grad_by_freq (bool, 可选): 同模块初始化文档说明，默认False。
-            sparse (bool, 可选): 同模块初始化文档说明。
-
-        示例::
-
-            >>> # 包含预训练权重的FloatTensor
-            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
-            >>> embedding = nn.Embedding.from_pretrained(weight)
-            >>> # 获取索引1的嵌入
-            >>> input = torch.LongTensor([1])
-            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
-            >>> embedding(input)
-            tensor([[ 4.0000,  5.1000,  6.3000]])
-        """
-        assert (
-            embeddings.dim() == 2
-        ), "Embeddings参数应为2维张量"
-        rows, cols = embeddings.shape
-        embedding = cls(
-            num_embeddings=rows,
-            embedding_dim=cols,
-            _weight=embeddings,
-            _freeze=freeze,
-            padding_idx=padding_idx,
-            max_norm=max_norm,
-            norm_type=norm_type,
-            scale_grad_by_freq=scale_grad_by_freq,
-            sparse=sparse,
-        )
-        return embedding
-
-
-class EmbeddingBag(Module):
-    r"""计算嵌入"袋"的和或均值，无需实例化中间嵌入。
-
-    对于固定长度的袋、无per_sample_weights、无等于padding_idx的索引，且输入为2D时，
-    该类的行为如下：
-        * mode="sum"等价于Embedding层后接torch.sum(dim=1)
-        * mode="mean"等价于Embedding层后接torch.mean(dim=1)
-        * mode="max"等价于Embedding层后接torch.max(dim=1)
-
-    但EmbeddingBag比链式操作更节省时间和内存。
-
-    EmbeddingBag还支持在正向传播时传入样本权重，
-    这会在按mode指定的方式进行加权归约前缩放嵌入输出。
-    若传入per_sample_weights，仅支持mode="sum"，即按权重计算加权和。
-
-    参数:
-        num_embeddings (int): 嵌入字典的大小（词汇表大小）
-        embedding_dim (int): 每个嵌入向量的维度
-        max_norm (float, 可选): 若指定，范数超过此值的嵌入向量会被重新归一化到该范数
-        norm_type (float, 可选): 计算max_norm时使用的p范数（默认L2范数，p=2）
-        scale_grad_by_freq (bool, 可选): 若为True，梯度会按mini-batch中词的频率倒数缩放（默认False）。
-                                         注意：mode="max"时不支持此选项。
-        mode (str, 可选): "sum"、"mean"或"max"，指定袋的归约方式。
-                           "sum"计算加权和（考虑per_sample_weights），
-                           "mean"计算袋内平均值，"max"计算袋内最大值。默认"mean"。
-        sparse (bool, 可选): 若为True，权重矩阵的梯度将是稀疏张量（详见注释）。
-                             注意：mode="max"时不支持此选项。
-        include_last_offset (bool, 可选): 若为True，offsets包含一个额外元素，
-                                          其值等于indices的长度，符合CSR格式。
-        padding_idx (int, 可选): 若指定，该索引位置的条目不参与梯度计算；
-                                 因此，该位置的嵌入向量在训练中不会更新，保持为固定的"填充"向量。
-                                 对于新创建的EmbeddingBag，该位置的嵌入向量默认全零，
-                                 但可更新为其他值作为填充向量。注意该位置的嵌入向量会被排除在归约之外。
-
-    属性:
-        weight (Tensor): 模块的可学习权重，形状为(num_embeddings, embedding_dim)，
-                         初始化为正态分布N(0, 1)。
-
-    示例::
-
-        >>> # 包含10个3维张量的EmbeddingBag（求和模式）
-        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum')
-        >>> # 2个样本，每个包含4个索引的输入（展平为1D）
-        >>> input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long)
-        >>> offsets = torch.tensor([0, 4], dtype=torch.long)
-        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
-        >>> embedding_sum(input, offsets)
-        tensor([[-0.8861, -5.4350, -0.0523],
-                [ 1.1306, -2.5798, -1.0044]])
-
-        >>> # 带padding_idx的示例
-        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum', padding_idx=2)
-        >>> input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.long)
-        >>> offsets = torch.tensor([0, 4], dtype=torch.long)
-        >>> embedding_sum(input, offsets)
-        tensor([[ 0.0000,  0.0000,  0.0000],
-                [-0.7082,  3.2145, -2.6251]])
-
-        >>> # 从Embedding加载EmbeddingBag的示例
-        >>> embedding = nn.Embedding(10, 3, padding_idx=2)
-        >>> embedding_sum = nn.EmbeddingBag.from_pretrained(
-                embedding.weight,
-                padding_idx=embedding.padding_idx,
-                mode='sum')
-    """
-
-    __constants__ = [
-        "num_embeddings",
-        "embedding_dim",
-        "max_norm",
-        "norm_type",
-        "scale_grad_by_freq",
-        "mode",
-        "sparse",
-        "include_last_offset",
-        "padding_idx",
-    ]
-
-    num_embeddings: int
-    embedding_dim: int
-    max_norm: Optional[float]
-    norm_type: float
-    scale_grad_by_freq: bool
-    weight: Tensor
-    mode: str
-    sparse: bool
-    include_last_offset: bool
-    padding_idx: Optional[int]
-
-    def __init__(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        max_norm: Optional[float] = None,
-        norm_type: float = 2.0,
-        scale_grad_by_freq: bool = False,
-        mode: str = "mean",
-        sparse: bool = False,
-        _weight: Optional[Tensor] = None,
-        include_last_offset: bool = False,
-        padding_idx: Optional[int] = None,
-        device=None,
-        dtype=None,
-    ) -> None:
-        factory_kwargs = {"device": device, "dtype": dtype}
-        super().__init__()
-        self.num_embeddings = num_embeddings
-        self.embedding_dim = embedding_dim
-        self.max_norm = max_norm
-        self.norm_type = norm_type
-        self.scale_grad_by_freq = scale_grad_by_freq
-        if padding_idx is not None:
-            if padding_idx > 0:
-                assert (
-                    padding_idx < self.num_embeddings
-                ), "padding_idx必须在num_embeddings范围内"
-            elif padding_idx < 0:
-                assert (
-                    padding_idx >= -self.num_embeddings
-                ), "padding_idx必须在num_embeddings范围内"
-                padding_idx = self.num_embeddings + padding_idx
-        self.padding_idx = padding_idx
-        if _weight is None:
-            self.weight = Parameter(
-                torch.empty((num_embeddings, embedding_dim), **factory_kwargs)
-            )
-            self.reset_parameters()
-        else:
-            assert list(_weight.shape) == [
-                num_embeddings,
-                embedding_dim,
-            ], "权重形状与num_embeddings和embedding_dim不匹配"
-            self.weight = Parameter(_weight)
-        self.mode = mode
-        self.sparse = sparse
-        self.include_last_offset = include_last_offset
-
-    def reset_parameters(self) -> None:
-        init.normal_(self.weight)  # 正态分布初始化权重
-        self._fill_padding_idx_with_zero()  # 填充索引位置归零
-
-    def _fill_padding_idx_with_zero(self) -> None:
-        if self.padding_idx is not None:
-            with torch.no_grad():  # 不计算梯度
-                self.weight[self.padding_idx].fill_(0)  # 填充位置设为0
-
-    def forward(
-        self,
-        input: Tensor,
-        offsets: Optional[Tensor] = None,
-        per_sample_weights: Optional[Tensor] = None,
-    ) -> Tensor:
-        """EmbeddingBag的正向传播。
-
-        参数:
-            input (Tensor): 包含嵌入矩阵索引袋的张量。
-            offsets (Tensor, 可选): 仅当input为1D时使用，确定input中每个袋（序列）的起始索引位置。
-            per_sample_weights (Tensor, 可选): 浮点/双精度权重张量，None表示所有权重为1。
-                若指定，形状必须与input相同，且在offsets非None时使用相同的偏移量。仅支持mode='sum'。
-
-        返回:
-            形状为(B, embedding_dim)的张量。
-
-        .. 注意::
-
-            关于input和offsets的说明：
-            - input和offsets必须同类型（int或long）
-            - 若input为2D形状(B, N)，视为B个固定长度N的袋，返回B个按mode聚合的值，此时offsets被忽略且必须为None。
-            - 若input为1D形状(N)，视为多个袋（序列）的拼接，offsets必须为1D张量，包含每个袋在input中的起始索引位置。
-              因此，对于形状(B)的offsets，input视为B个袋，空袋（长度为0）返回全零向量。
-        """
-        return F.embedding_bag(
-            input,
-            self.weight,
-            offsets,
-            self.max_norm,
-            self.norm_type,
-            self.scale_grad_by_freq,
-            self.mode,
-            self.sparse,
-            per_sample_weights,
-            self.include_last_offset,
-            self.padding_idx,
-        )
-
-    def extra_repr(self) -> str:
-        s = "{num_embeddings}, {embedding_dim}"
-        if self.max_norm is not None:
-            s += ", max_norm={max_norm}"
-        if self.norm_type != 2:
-            s += ", norm_type={norm_type}"
-        if self.scale_grad_by_freq is not False:
-            s += ", scale_grad_by_freq={scale_grad_by_freq}"
-        s += ", mode={mode}"
-        if self.padding_idx is not None:
-            s += ", padding_idx={padding_idx}"
-        return s.format(**{k: repr(v) for k, v in self.__dict__.items()})
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        embeddings: Tensor,
-        freeze: bool = True,
-        max_norm: Optional[float] = None,
-        norm_type: float = 2.0,
-        scale_grad_by_freq: bool = False,
-        mode: str = "mean",
-        sparse: bool = False,
-        include_last_offset: bool = False,
-        padding_idx: Optional[int] = None,
-    ) -> "EmbeddingBag":
-        r"""从给定的2维FloatTensor创建EmbeddingBag实例。
-
-        参数:
-            embeddings (Tensor): 包含EmbeddingBag权重的FloatTensor，
-                第一维作为num_embeddings，第二维作为embedding_dim。
-            freeze (bool, 可选): 若为True，张量在学习过程中不更新，
-                相当于embeddingbag.weight.requires_grad = False。默认True。
-            max_norm (float, 可选): 同模块初始化文档说明，默认None。
-            norm_type (float, 可选): 同模块初始化文档说明，默认2。
-            scale_grad_by_freq (bool, 可选): 同模块初始化文档说明，默认False。
-            mode (str, 可选): 同模块初始化文档说明，默认"mean"。
-            sparse (bool, 可选): 同模块初始化文档说明，默认False。
-            include_last_offset (bool, 可选): 同模块初始化文档说明，默认False。
-            padding_idx (int, 可选): 同模块初始化文档说明，默认None。
-
-        示例::
-
-            >>> # 包含预训练权重的FloatTensor
-            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
-            >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight)
-            >>> # 获取索引1和0的嵌入袋（2D输入）
-            >>> input = torch.LongTensor([[1, 0]])
-            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
-            >>> embeddingbag(input)
-            tensor([[ 2.5000,  3.7000,  4.6500]])
-        """
-        assert (
-            embeddings.dim() == 2
-        ), "Embeddings参数应为2维张量"
-        rows, cols = embeddings.shape
-        embeddingbag = cls(
-            num_embeddings=rows,
-            embedding_dim=cols,
-            _weight=embeddings,
-            max_norm=max_norm,
-            norm_type=norm_type,
-            scale_grad_by_freq=scale_grad_by_freq,
-            mode=mode,
-            sparse=sparse,
-            include_last_offset=include_last_offset,
-            padding_idx=padding_idx,
-        )
-        embeddingbag.weight.requires_grad = not freeze
-        return embeddingbag
\ No newline at end of file
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index 7929ba7f..efe2255d 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -4,8 +4,7 @@
 
 Number: TypeAlias = Union[int, float, bool]
 
-tensorid=1
-
+ 
 class Tensor:
 
     #life
@@ -19,9 +18,11 @@ def __init__(self,shape:tuple[int,...],dtype:str='float32',name:str=None):
 
         self._name = name
         if name is None or name =='':
-            global tensorid
-            self._name =tensorid
-            tensorid+=1
+            if not hasattr(self.__class__, '_instance_counter'):
+                self.__class__._instance_counter = 0
+            count = self.__class__._instance_counter
+            self.__class__._instance_counter += 1
+            self._name = count
         # dtype
         self._dtype = dtype
 
diff --git a/front/py/examples/4_transformer/llama/1_rope.py b/front/py/examples/4_transformer/llama/1_rope.py
new file mode 100644
index 00000000..e69de29b
diff --git a/todo/deepxpy.md b/todo/deepxpy.md
deleted file mode 100644
index b784c004..00000000
--- a/todo/deepxpy.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# DeepX py部分
-
-1.scheduder和excuter的通信
-    + IR增加时间辍
-    + 实现 请求和返回，实现异步计算通信重叠
-
-2.
-
-3.
-
diff --git a/todo/excuter/metal/metal.md b/todo/excuter/metal/metal.md
deleted file mode 100644
index 2022e8f9..00000000
--- a/todo/excuter/metal/metal.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Metal
-
-考虑用macos的metal加速库，支持GPU计算
\ No newline at end of file
diff --git a/todo/excuter/ompsimd/ompsimd.md b/todo/excuter/ompsimd/ompsimd.md
deleted file mode 100644
index 6b97be76..00000000
--- a/todo/excuter/ompsimd/ompsimd.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# OMP SIMD
-
-当前开发中
\ No newline at end of file
diff --git a/todo/scheduler.md b/todo/scheduler.md
deleted file mode 100644
index 4f0cf0ec..00000000
--- a/todo/scheduler.md
+++ /dev/null
@@ -1,7 +0,0 @@
-
-## 性能监控
-
-通过IR消息的send_at,recv_at,done_at，计算IR的计算耗时
-
-## 算子注册和融合
-