diff --git a/deepxctl/.gitignore b/deepxctl/.gitignore
new file mode 100644
index 00000000..5ca0477f
--- /dev/null
+++ b/deepxctl/.gitignore
@@ -0,0 +1,2 @@
+.idea
+deepxctl
\ No newline at end of file
diff --git a/deepxctl/cmd/tensor/print.go b/deepxctl/cmd/tensor/print.go
new file mode 100644
index 00000000..44029497
--- /dev/null
+++ b/deepxctl/cmd/tensor/print.go
@@ -0,0 +1,55 @@
+package tensor
+
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	coretensor "github.com/array2d/deepx/deepxctl/tensor"
+)
+
+// loadAndPrint loads a tensor of the given element type and prints it.
+func loadAndPrint[T coretensor.Number](path string) {
+	t, err := coretensor.LoadTensor[T](path)
+	if err != nil {
+		fmt.Println("failed to read file:", err)
+		return
+	}
+	t.Print()
+}
+
+func PrintCmd() {
+	printCmd := flag.NewFlagSet("print", flag.ExitOnError)
+	if len(os.Args) < 1 || os.Args[0] == "" {
+		fmt.Println("please specify a file path")
+		printCmd.Usage()
+		return
+	}
+	tensorPath := os.Args[0]
+	shape, err := coretensor.LoadShape(tensorPath)
+	if err != nil {
+		fmt.Println("failed to read file:", err)
+		return
+	}
+	switch shape.Dtype {
+	case "bool":
+		loadAndPrint[bool](tensorPath)
+	case "int8":
+		loadAndPrint[int8](tensorPath)
+	case "int16":
+		loadAndPrint[int16](tensorPath)
+	case "int32":
+		loadAndPrint[int32](tensorPath)
+	case "int64":
+		loadAndPrint[int64](tensorPath)
+	case "float16":
+		// float16 has no native Go type; printing is not supported yet
+		fmt.Println("float16 printing is not supported yet")
+	case "float32":
+		loadAndPrint[float32](tensorPath)
+	case "float64":
+		loadAndPrint[float64](tensorPath)
+	default:
+		fmt.Println("unsupported dtype:", shape.Dtype)
+	}
+}
diff --git a/deepxctl/cmd/tensor/tensor.go b/deepxctl/cmd/tensor/tensor.go
new file mode 100644
index 00000000..eb11dba0
--- /dev/null
+++ b/deepxctl/cmd/tensor/tensor.go
@@ -0,0 +1,33 @@
+package tensor
+
+import (
+	"fmt"
+	"os"
+)
+
+func PrintUsage() {
+	fmt.Println("Usage:")
+	fmt.Println("  tensor print <file path>")
+	fmt.Println("  tensor help")
+}
+
+func Execute() {
+	if len(os.Args) == 0 {
+		PrintUsage()
+		os.Exit(1)
+	}
+	subCmd := os.Args[0]
+
+	switch subCmd {
+	case "print":
+		// drop the subcommand so PrintCmd sees only its own arguments
+		os.Args = os.Args[1:]
+		PrintCmd()
+	case "help":
+		PrintUsage()
+	default:
+		fmt.Printf("unknown tensor command: %s\n", subCmd)
+		PrintUsage()
+		os.Exit(1)
+	}
+}
diff --git a/deepxctl/go.mod b/deepxctl/go.mod
new file mode 100644
index 00000000..8cabf45f
--- /dev/null
+++ b/deepxctl/go.mod
@@ -0,0 +1,5 @@
+module github.com/array2d/deepx/deepxctl
+
+go 1.23.2
+
+require gopkg.in/yaml.v2 v2.4.0 // indirect
diff --git a/deepxctl/go.sum b/deepxctl/go.sum
new file mode 100644
index 00000000..75346616
--- /dev/null
+++ b/deepxctl/go.sum
@@ -0,0 +1,3 @@
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
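A note on the file layout the new command consumes: LoadShape and LoadTensor (later in this patch) expect a <path>.shape YAML sidecar plus a raw little-endian <path>.data blob. Below is a minimal sketch of the inverse writer, handy for producing test fixtures; saveFloat32Tensor and the shapeYAML helper are assumptions inferred from the loader and from the C++ Shape::toYaml keys, not code this patch adds.

package main

import (
	"encoding/binary"
	"os"

	"gopkg.in/yaml.v2"
)

// shapeYAML mirrors the keys LoadShape unmarshals (inferred, hypothetical).
type shapeYAML struct {
	Shape  []int  `yaml:"shape"`
	Stride []int  `yaml:"stride"`
	Dim    int    `yaml:"dim"`
	Size   int    `yaml:"size"`
	Dtype  string `yaml:"dtype"`
}

// saveFloat32Tensor writes <path>.shape (YAML) and <path>.data (little-endian),
// the on-disk layout deepxctl's loader expects.
func saveFloat32Tensor(path string, shape []int, data []float32) error {
	stride := make([]int, len(shape))
	size := 1
	for i := len(shape) - 1; i >= 0; i-- { // row-major strides
		stride[i] = size
		size *= shape[i]
	}
	meta, err := yaml.Marshal(shapeYAML{Shape: shape, Stride: stride, Dim: len(shape), Size: size, Dtype: "float32"})
	if err != nil {
		return err
	}
	if err := os.WriteFile(path+".shape", meta, 0o644); err != nil {
		return err
	}
	f, err := os.Create(path + ".data")
	if err != nil {
		return err
	}
	defer f.Close()
	return binary.Write(f, binary.LittleEndian, data)
}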
diff --git a/deepxctl/main.go b/deepxctl/main.go
new file mode 100644
index 00000000..de73e43c
--- /dev/null
+++ b/deepxctl/main.go
@@ -0,0 +1,64 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/array2d/deepx/deepxctl/cmd/tensor"
+)
+
+var version = "0.1.0"
+
+func printUsage() {
+	execName := filepath.Base(os.Args[0])
+	fmt.Printf("Usage: %s [command] [arguments]\n\n", execName)
+	fmt.Println("Available commands:")
+	fmt.Println("  tensor   tensor operations")
+	fmt.Println("  version  show version information")
+	fmt.Println("  help     show help")
+	fmt.Printf("\nUse '%s help [command]' for details on a command\n", execName)
+}
+
+func main() {
+	flag.Usage = printUsage
+
+	if len(os.Args) < 2 {
+		printUsage()
+		os.Exit(1)
+	}
+
+	// the subcommand
+	cmd := os.Args[1]
+
+	// dispatch on the subcommand
+	switch cmd {
+	case "tensor":
+		// strip the subcommand so the handler sees only its own arguments
+		os.Args = os.Args[2:]
+		tensor.Execute()
+
+	case "version":
+		fmt.Printf("deepxctl version %s\n", version)
+
+	case "help":
+		if len(os.Args) > 2 {
+			helpCmd := os.Args[2]
+			switch helpCmd {
+			case "tensor":
+				tensor.PrintUsage()
+			default:
+				fmt.Printf("unknown command: %s\n", helpCmd)
+				printUsage()
+			}
+		} else {
+			printUsage()
+		}
+
+	default:
+		fmt.Printf("unknown command: %s\n", cmd)
+		printUsage()
+		os.Exit(1)
+	}
+}
diff --git a/deepxctl/tensor/fp16.go b/deepxctl/tensor/fp16.go
new file mode 100644
index 00000000..18a7e0f8
--- /dev/null
+++ b/deepxctl/tensor/fp16.go
@@ -0,0 +1,29 @@
+package tensor
+
+import (
+	"encoding/binary"
+	"math"
+)
+
+// Byte2ToFloat16 decodes two bytes holding an IEEE 754 half-precision value
+// into a float32. Little-endian, matching the byte order of .data files.
+// Simplified implementation; a production version would be more complete.
+func Byte2ToFloat16(value []byte) float32 {
+	bits := binary.LittleEndian.Uint16(value)
+	sign := float32(1)
+	if bits&0x8000 != 0 {
+		sign = -1
+	}
+	exp := int((bits & 0x7C00) >> 10)
+	frac := float32(bits&0x03FF) / 1024.0
+
+	if exp == 0 {
+		return sign * frac * float32(1.0/16384.0) // subnormal: frac * 2^-14
+	} else if exp == 31 {
+		if frac == 0 {
+			return sign * float32(math.Inf(1)) // infinity
+		}
+		return float32(math.NaN()) // NaN
+	}
+	return sign * float32(math.Pow(2, float64(exp-15))) * (1.0 + frac) // normal
+}
diff --git a/deepxctl/tensor/io.go b/deepxctl/tensor/io.go
new file mode 100644
index 00000000..b0faf9d7
--- /dev/null
+++ b/deepxctl/tensor/io.go
@@ -0,0 +1,45 @@
+package tensor
+
+import (
+	"encoding/binary"
+	"os"
+
+	"gopkg.in/yaml.v2"
+)
+
+// LoadShape reads the YAML sidecar file <filePath>.shape.
+func LoadShape(filePath string) (shape Shape, err error) {
+	var shapeData []byte
+	shapeData, err = os.ReadFile(filePath + ".shape")
+	if err != nil {
+		return
+	}
+
+	err = yaml.Unmarshal(shapeData, &shape)
+	if err != nil {
+		return
+	}
+	return
+}
+
+// LoadTensor reads <filePath>.shape plus the little-endian <filePath>.data.
+func LoadTensor[T Number](filePath string) (tensor Tensor[T], err error) {
+	var shape Shape
+	shape, err = LoadShape(filePath)
+	if err != nil {
+		return
+	}
+	file, err := os.Open(filePath + ".data")
+	if err != nil {
+		return
+	}
+	defer file.Close()
+	data := make([]T, shape.Size)
+
+	err = binary.Read(file, binary.LittleEndian, data)
+	if err != nil {
+		return
+	}
+	tensor = Tensor[T]{Data: data, Shape: shape}
+	return
+}
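A quick sanity check of Byte2ToFloat16 against known IEEE 754 half-precision bit patterns, assuming the little-endian byte order used for .data files; this small harness is illustrative only and not part of the patch.

package main

import (
	"encoding/binary"
	"fmt"

	coretensor "github.com/array2d/deepx/deepxctl/tensor"
)

func main() {
	// Known half-precision encodings and their expected decodings.
	cases := []struct {
		bits uint16
		want string
	}{
		{0x3C00, "1"},           // exponent 15, fraction 0 -> 1.0
		{0xC000, "-2"},          // sign bit set, exponent 16 -> -2.0
		{0x7C00, "+Inf"},        // exponent 31, fraction 0 -> infinity
		{0x0001, "~5.96e-08"},   // smallest subnormal: 2^-24
	}
	buf := make([]byte, 2)
	for _, c := range cases {
		binary.LittleEndian.PutUint16(buf, c.bits) // same byte order the decoder reads
		fmt.Printf("0x%04X -> %g (expected %s)\n", c.bits, coretensor.Byte2ToFloat16(buf), c.want)
	}
}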
diff --git a/deepxctl/tensor/print.go b/deepxctl/tensor/print.go
new file mode 100644
index 00000000..89dd3535
--- /dev/null
+++ b/deepxctl/tensor/print.go
@@ -0,0 +1,110 @@
+package tensor
+
+import "fmt"
+
+// Range iterates over every index combination of the first dimCount
+// dimensions and calls f with each index vector.
+func (t *Tensor[T]) Range(dimCount int, f func(indices []int)) {
+	Shape := t.Shape
+	if dimCount > len(Shape.Shape) {
+		panic("dimCount exceeds the number of dimensions in the Tensor.")
+	}
+
+	totalSize := 1
+	// total number of iterations
+	for i := 0; i < dimCount; i++ {
+		totalSize *= Shape.At(i)
+	}
+	indices := make([]int, dimCount)
+	for idx := 0; idx < totalSize; idx++ {
+		// decode idx back into the indices vector
+		idx_ := idx
+		for dim := dimCount - 1; dim >= 0; dim-- {
+			indices[dim] = idx_ % Shape.At(dim) // index in this dimension
+			idx_ /= Shape.At(dim)               // carry to the next dimension
+		}
+		f(indices)
+	}
+}
+
+// AutoFormat picks a default printf verb for a dtype.
+func AutoFormat(dtype string) string {
+	switch dtype {
+	case "bool":
+		return "%v"
+	case "int8", "int16", "int32", "int64":
+		return "%d"
+	case "float16", "float32", "float64":
+		return "%f"
+	default:
+		return "%v"
+	}
+}
+
+// Print prints the tensor's values.
+func (t *Tensor[T]) Print(format_ ...string) {
+	Shape := t.Shape
+	format := AutoFormat(t.Dtype)
+	if len(format_) > 0 {
+		format = format_[0]
+	}
+	fmt.Print("shape:[")
+	for i := 0; i < Shape.Dim; i++ {
+		fmt.Print(Shape.At(i))
+		if i < Shape.Dim-1 {
+			fmt.Print(", ")
+		}
+	}
+	fmt.Println("]")
+	if Shape.Dim == 1 {
+		fmt.Print("[")
+		for i := 0; i < Shape.At(0); i++ {
+			if i > 0 {
+				fmt.Print(" ")
+			}
+			fmt.Printf(format, t.Get(i))
+		}
+		fmt.Println("]")
+	} else if Shape.Dim == 2 {
+		fmt.Println("[")
+		for i := 0; i < Shape.At(0); i++ {
+			fmt.Print(" [")
+			for j := 0; j < Shape.At(1); j++ {
+				if j > 0 {
+					fmt.Print(" ")
+				}
+				fmt.Printf(format, t.Get(i, j))
+			}
+
+			fmt.Print("]")
+			if i < Shape.At(0)-1 {
+				fmt.Print(",")
+			}
+			fmt.Println()
+		}
+		fmt.Println("]")
+	} else {
+		t.Range(Shape.Dim-2, func(indices []int) {
+			fmt.Print(indices)
+			m, n := Shape.At(Shape.Dim-2), Shape.At(Shape.Dim-1)
+			fmt.Print([]int{m, n})
+			fmt.Println("=")
+
+			fmt.Println("[")
+			for i := 0; i < m; i++ {
+				fmt.Print(" [")
+				for j := 0; j < n; j++ {
+					if j > 0 {
+						fmt.Print(" ")
+					}
+					fmt.Printf(format, t.Get(append(indices, i, j)...))
+				}
+
+				fmt.Print("]")
+				if i < m-1 {
+					fmt.Print(",")
+				}
+				fmt.Println()
+			}
+			fmt.Println("]")
+		})
+	}
+}
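Range above and the LinearAt/LinearTo methods in the next file all use the same row-major convention: stride[i] is the product of all later dimensions, a linear offset decodes from the most significant dimension down, and encoding is the dot product of indices and strides. A standalone round-trip for an assumed [2,3,4] shape:

package main

import "fmt"

func main() {
	shape := []int{2, 3, 4}
	stride := []int{12, 4, 1} // as NewTensorShape computes: stride[i] = shape[i+1] * stride[i+1]
	idx := 17                 // arbitrary linear offset

	// Decode, the same arithmetic as Shape.LinearTo.
	indices := make([]int, len(shape))
	rem := idx
	for i := range shape {
		indices[i] = rem / stride[i]
		rem %= stride[i]
	}
	fmt.Println(indices) // [1 1 1]

	// Encode back, the same arithmetic as Shape.LinearAt.
	back := 0
	for i, v := range indices {
		back += v * stride[i]
	}
	fmt.Println(back) // 17
}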
diff --git a/deepxctl/tensor/tensor.go b/deepxctl/tensor/tensor.go
new file mode 100644
index 00000000..0ef542b3
--- /dev/null
+++ b/deepxctl/tensor/tensor.go
@@ -0,0 +1,85 @@
+package tensor
+
+import (
+	"fmt"
+)
+
+// Shape mirrors the YAML written by the C++ executors' Shape::toYaml.
+type Shape struct {
+	Shape  []int  `yaml:"shape"`
+	Stride []int  `yaml:"stride"`
+	Dim    int    `yaml:"dim"`
+	Size   int    `yaml:"size"`
+	Dtype  string `yaml:"dtype"`
+}
+
+func NewTensorShape(shape []int) (s Shape) {
+	s.Dim = len(shape)
+	s.Shape = make([]int, len(shape))
+	copy(s.Shape, shape)
+	s.Stride = make([]int, len(shape))
+	s.Stride[len(shape)-1] = 1
+	for i := len(shape) - 2; i >= 0; i-- {
+		s.Stride[i] = s.Stride[i+1] * shape[i+1]
+	}
+	s.Size = s.Stride[0] * shape[0]
+	return s
+}
+
+func (s Shape) String() string {
+	return fmt.Sprintf("%v", s.Shape)
+}
+
+func (s Shape) At(i int) int {
+	return s.Shape[i]
+}
+
+func (s Shape) LinearAt(indices []int) int {
+	idx := 0
+	for i := 0; i < len(indices); i++ {
+		idx += indices[i] * s.Stride[i]
+	}
+	return idx
+}
+
+func (s Shape) LinearTo(idx int) (indices []int) {
+	linearIndex := idx
+	indices = make([]int, s.Dim)
+	for i := 0; i < s.Dim; i++ {
+		indices[i] = linearIndex / s.Stride[i]
+		linearIndex %= s.Stride[i]
+	}
+	return indices
+}
+
+// BitSize returns the storage width of a dtype in bits.
+func BitSize(Dtype string) int {
+	switch Dtype {
+	case "bool", "int8":
+		return 8
+	case "int16", "float16":
+		return 16
+	case "int32", "float32":
+		return 32
+	case "int64", "float64":
+		return 64
+	default:
+		return 0
+	}
+}
+
+type Number interface {
+	comparable
+	float64 | float32 | int64 | int32 | int16 | int8 | bool
+}
+
+type Tensor[T Number] struct {
+	Data []T
+	Shape
+}
+
+// Get returns the element at the given indices.
+func (t *Tensor[T]) Get(indices ...int) T {
+	idx := t.Shape.LinearAt(indices)
+	return t.Data[idx]
+}
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index 34de625c..2a73fd50 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -9,13 +9,6 @@
 | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) |
 | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) |
-### io
-
-| Operation | Author | Func Def | Math Formula | IR Instruction |
-|-----------|--------|------------|--------------|----------------|
-| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() |
-| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() |
-
 ### tensorlife
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | copytensor | none | copytensor(tensor src)->(tensor dst) | T2 = T1.clone() | copytensor(tensor src)->(tensor dst) |
 | renametensor | none | renametensor(var new_name)->(tensor t) | rename(T1, new_name) | renametensor(var new_name)->(tensor t) |
 | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) |
 | deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() |
+### io
+
+| Operation | Author | Func Def | Math Formula | IR Instruction |
+|-----------|--------|------------|--------------|----------------|
+| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() |
+| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() |
+| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() |
+| load | none | load(var path)->() | load(path) | load(var path)->() |
+
 ### init
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
+| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() |
 | normal | miaobyte | normal(tensor t, var mean, var stddev, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var stddev, var seed)->() |
 | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() |
-| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() |
 | constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() |
 
 ### elementwise
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) |
 | greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) |
 | equalscalar | miaobyte | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) |
 | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) |
 | maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) |
-| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) |
-| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) |
+| tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor A)->(tensor C) |
 | divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) |
 | sin | miaobyte | sin(tensor A)->(tensor C) | T3=sin(T1) | sin(tensor A)->(tensor C) |
-| tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor 
A)->(tensor C) | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | +| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | +| cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | | less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | | powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | @@ -67,8 +71,6 @@ | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | ### matmul @@ -80,6 +82,7 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| gather | miaobyte | gather(tensor A, tensor indices, var axis)->(tensor B) | T2 = T1.gather(indices=[1,2], axis=1) | gather(tensor A, tensor indices, var axis)->(tensor B) | | broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | | concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | | transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index b396a357..66a03786 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -9,13 +9,6 @@ | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector 
value)->(vector name) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | -### io - -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | - ### tensorlife | Operation | Author | Func Def | Math Formula | IR Instruction | @@ -25,13 +18,22 @@ | newtensor | none | newtensor(var shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | | deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | +### io + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() | +| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() | +| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() | +| load | none | load(var path)->() | load(path) | load(var path)->() | + ### init | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | normal | miaobyte | normal(tensor t, var mean, var std, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var std, var seed)->() | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | ### elementwise @@ -41,12 +43,11 @@ | switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | | greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | | equalscalar | miaobyte | equalscalar(tensor A, var scalar)->(tensor mask) | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | -| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | | greater | miaobyte | greater(tensor 
A, tensor B)->(tensor mask) | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | | lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | | less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | @@ -54,6 +55,8 @@ | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | | sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | @@ -63,7 +66,6 @@ | invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | | max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | @@ -78,6 +80,7 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| gather | miaobyte | gather(tensor A, tensor indices, var axis)->(tensor B) | T2 = T1.gather(indices=T3, axis=3) | gather(tensor A, tensor indices, var axis)->(tensor B) | | broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | | concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | | transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | diff --git a/excuter/cpp-common/src/deepx/dtype.hpp b/excuter/cpp-common/src/deepx/dtype.hpp index 5b9551a9..73b4ba5b 100644 --- a/excuter/cpp-common/src/deepx/dtype.hpp +++ b/excuter/cpp-common/src/deepx/dtype.hpp @@ -175,8 +175,8 @@ namespace deepx case Precision::Float8E4M3: return 8; //TODO 需要根据平台支持 - case Precision::Float4E2M1: - return 4; + // case Precision::Float4E2M1: + // return 4; case Precision::Int64: return 64; case Precision::Int32: @@ -186,8 +186,8 @@ namespace deepx case Precision::Int8: return 8; //TODO,int4 需要根据平台支持 - case Precision::Int4: - return 4; + // case Precision::Int4: + // return 4; case Precision::Bool: return 8; case Precision::String: diff --git 
a/excuter/cpp-common/src/deepx/mem/mem.hpp b/excuter/cpp-common/src/deepx/mem/mem.hpp index 504db01f..9ea7ab70 100644 --- a/excuter/cpp-common/src/deepx/mem/mem.hpp +++ b/excuter/cpp-common/src/deepx/mem/mem.hpp @@ -98,6 +98,17 @@ namespace deepx::mem mem[name] = ptr; } + template + void addtensor(const string &name, shared_ptr> tensor) + { + if (mem.find(name) != mem.end()) + { + cerr << "tensor already exists: " << name << endl; + return; + } + mem[name] = tensor; + } + // template // shared_ptr> temptensor(vector shape) // { diff --git a/excuter/cpp-common/src/deepx/shape.cpp b/excuter/cpp-common/src/deepx/shape.cpp index c126cc58..cedca724 100644 --- a/excuter/cpp-common/src/deepx/shape.cpp +++ b/excuter/cpp-common/src/deepx/shape.cpp @@ -83,7 +83,7 @@ namespace deepx node["dtype"] = precision_str(dtype); node["dim"] = dim; node["shape"] = shape; - node["strides"] = strides; + node["stride"] = strides; node["size"] = size; return YAML::Dump(node); } @@ -92,7 +92,7 @@ namespace deepx dtype = precision(node["dtype"].as()); dim = node["dim"].as(); shape = node["shape"].as>(); - strides=node["strides"].as>(); + strides=node["stride"].as>(); size=node["size"].as(); } } \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/shape.hpp b/excuter/cpp-common/src/deepx/shape.hpp index fdcd8dd2..ff4fea69 100644 --- a/excuter/cpp-common/src/deepx/shape.hpp +++ b/excuter/cpp-common/src/deepx/shape.hpp @@ -8,7 +8,36 @@ #include "deepx/dtype.hpp" namespace deepx { - + //omp内线程局部变量 + class ThreadLocalVectors + { + private: + std::vector> vectors; + + public: + // 构造函数接收向量大小数组 + explicit ThreadLocalVectors(const std::vector &sizes) + { + vectors.resize(sizes.size()); + for (size_t i = 0; i < sizes.size(); ++i) + { + vectors[i].resize(sizes[i], 0); + } + } + + // 获取指定索引的向量引用 + std::vector &get(size_t index) + { + return vectors[index]; + } + + // 获取所有向量 + std::vector> &getAll() + { + return vectors; + } + }; + struct Shape { Precision dtype; @@ -17,28 +46,29 @@ namespace deepx int dim; int size; - Shape()=default; + Shape() = default; Shape(const std::vector &shape); Shape(const std::initializer_list &shape); Shape(const int *shape, int dim); void setshape(const int *shape, int dim); int operator[](int index) const; int &operator[](int index); - bool operator==(const Shape &shape) const{return shape.shape==shape.shape;} + bool operator==(const Shape &shape) const { return shape.shape == shape.shape; } void print() const; - //range 不支持omp - void range(int dimCount, std::function &indices )> func ) const; - void range(int dimCount, std::function &indices )> func ) const; - void range(int dimCount, std::function func ) const; + // range 不支持omp + void range(int dimCount, std::function &indices)> func) const; + void range(int dimCount, std::function &indices)> func) const; + void range(int dimCount, std::function func) const; - //rangeParallel 支持omp,但omp内无需线程local变量 + // rangeParallel 支持omp,但omp内无需线程local变量 void rangeParallel(int dimCount, std::function &indices)> func) const; void rangeParallel(int dimCount, std::function func) const; - void rangeParallel(int dimCount, std::function &indices )> func) const; + void rangeParallel(int dimCount, std::function &indices)> func) const; - void rangeParallel(int dimCount, std::function &indices,std::vector &newIndices)> func,int newIndiceDim) const; - void rangeParallel(int dimCount, std::function &newIndices)> func,int newIndiceDim) const; - void rangeParallel(int dimCount, std::function &indices,std::vector &newIndices )> func,int newIndiceDim) 
const; + // 支持omp,但omp内需要线程local变量 + void rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const; + void rangeParallel(int dimCount, std::function func,const vector tlv_sizes) const; + void rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const; int linearat(const std::vector &indices) const; std::vector linearto(int idx_linear) const; diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.cpp b/excuter/cpp-common/src/deepx/shape_changeshape.cpp index 17a89461..c0002617 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.cpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.cpp @@ -3,18 +3,18 @@ #include "deepx/shape_changeshape.hpp" -namespace deepx +namespace deepx { - //transpose + // transpose - std::vector swaplastTwoDimOrder(const std::vector &shape) + std::vector swaplastTwoDimOrder(const std::vector &shape) { vector dimOrder = shape; std::iota(dimOrder.begin(), dimOrder.end(), 0); swap(dimOrder[dimOrder.size() - 1], dimOrder[dimOrder.size() - 2]); return dimOrder; } - std::vector transposeShape(const std::vector &shape, const std::vector &dimOrder) + std::vector transposeShape(const std::vector &shape, const std::vector &dimOrder) { if (dimOrder.size() != shape.size()) { @@ -23,16 +23,17 @@ namespace deepx std::vector newShape = shape; for (size_t i = 0; i < dimOrder.size(); ++i) { - newShape[i] =shape[dimOrder[i]]; + newShape[i] = shape[dimOrder[i]]; } return newShape; } - //concat + // concat - Shape concatShape(const std::vector &shapes,const int axis){ + Shape concatShape(const std::vector &shapes, const int axis) + { std::vector outputShape(shapes[0].dim); - outputShape=shapes[0].shape; + outputShape = shapes[0].shape; for (int i = 1; i < shapes.size(); ++i) { if (shapes[i].dim != outputShape.size()) @@ -54,7 +55,7 @@ namespace deepx return Shape(outputShape); } - //broadcast + // broadcast std::vector broadcastShape(const std::vector &a, const std::vector &b) { int len1 = a.size(); @@ -102,7 +103,7 @@ namespace deepx return broadcastMap; } - void fromBroadcastIndices(const std::vector &broadcastMap, const std::vector &broadcastIndices, std::vector &oldIndices ) + void fromBroadcastIndices(const std::vector &broadcastMap, const std::vector &broadcastIndices, std::vector &oldIndices) { for (int i = 0, j = 0; i < broadcastIndices.size(); ++i) { @@ -119,4 +120,5 @@ namespace deepx } } } + } \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.hpp b/excuter/cpp-common/src/deepx/shape_changeshape.hpp index 291c7291..ac2a588a 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.hpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.hpp @@ -19,7 +19,6 @@ namespace deepx std::vector transposeShape(const std::vector &shape, const std::vector &dimOrder); // concat - Shape concatShape(const std::vector &shapes, const int axis); template @@ -71,6 +70,8 @@ namespace deepx }; std::vector broadcastMap(const std::vector &a, const std::vector &b); -} + //gather + //gather的out.shape=indices.shape,所以无需计算 +} #endif // DEEPX_SHAPE_CHANGESHAPE_HPP \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/shape_range.cpp b/excuter/cpp-common/src/deepx/shape_range.cpp index c7fc6d5a..31a125e1 100644 --- a/excuter/cpp-common/src/deepx/shape_range.cpp +++ b/excuter/cpp-common/src/deepx/shape_range.cpp @@ -152,7 +152,7 @@ namespace deepx } } - void Shape::rangeParallel(int dimCount, std::function &indices, std::vector 
&newIndices)> func, int newIndiceDim) const + void Shape::rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const { dimCount = checkdim(dimCount, dim); int totalSize = checkTotalSize(dimCount, shape); @@ -160,7 +160,7 @@ namespace deepx #pragma omp parallel { std::vector indices(dimCount, 0); - std::vector newIndices(newIndiceDim, 0); + ThreadLocalVectors tlv(tlv_sizes); #pragma omp for for (int idx = 0; idx < totalSize; idx++) { @@ -171,11 +171,11 @@ namespace deepx indices[dim] = idx_ % shape[dim]; // 计算当前维度的索引 idx_ /= shape[dim]; // 更新 idx } - func(indices, newIndices); // 调用传入的函数 + func(indices, tlv); // 调用传入的函数 } } } - void Shape::rangeParallel(int dimCount, std::function &newIndices)> func, int newIndiceDim) const + void Shape::rangeParallel(int dimCount, std::function func,const vector tlv_sizes) const { dimCount = checkdim(dimCount, dim); int stride = checkStride(dimCount, shape); @@ -185,16 +185,16 @@ namespace deepx #pragma omp parallel { - std::vector newIndices(newIndiceDim, 0); + ThreadLocalVectors tlv(tlv_sizes); #pragma omp for for (int idx = 0; idx < total; idx++) { - func(idx * stride, newIndices); + func(idx * stride, tlv); } } } - void Shape::rangeParallel(int dimCount, std::function &indices, std::vector &newIndices)> func, int newIndiceDim) const + void Shape::rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const { dimCount = checkdim(dimCount, dim); int totalSize = checkTotalSize(dimCount, shape); @@ -203,7 +203,7 @@ namespace deepx #pragma omp parallel { std::vector indices(dimCount, 0); - std::vector newIndices(newIndiceDim, 0); + ThreadLocalVectors tlv(tlv_sizes); #pragma omp for for (int idx = 0; idx < totalSize; idx++) { @@ -214,7 +214,7 @@ namespace deepx indices[dim] = idx_ % shape[dim]; // 计算当前维度的索引 idx_ /= shape[dim]; // 更新 idx } - func(idx * stride, indices, newIndices); + func(idx * stride, indices, tlv); } } } diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp index af5f0dc6..463e42cc 100644 --- a/excuter/cpp-common/src/deepx/tensor.hpp +++ b/excuter/cpp-common/src/deepx/tensor.hpp @@ -54,7 +54,6 @@ namespace deepx Tensor(const Tensor &tensor) { shape = tensor.shape; - device = tensor.device; newer = tensor.newer; deleter = tensor.deleter; copyer = tensor.copyer; @@ -73,7 +72,6 @@ namespace deepx Tensor(Tensor &&other) noexcept { shape = std::move(other.shape); - device = other.device; deleter = other.deleter; copyer = other.copyer; @@ -101,7 +99,6 @@ namespace deepx return *this; shape = tensor.shape; - device = tensor.device; deleter = tensor.deleter; copyer = tensor.copyer; newer = tensor.newer; @@ -126,7 +123,6 @@ namespace deepx if (this == &tensor) return *this; shape = tensor.shape; - device = tensor.device; newer = tensor.newer; deleter = tensor.deleter; copyer = tensor.copyer; diff --git a/excuter/cpp-common/src/deepx/tensorbase.hpp b/excuter/cpp-common/src/deepx/tensorbase.hpp index d1b176f7..6e8806e0 100644 --- a/excuter/cpp-common/src/deepx/tensorbase.hpp +++ b/excuter/cpp-common/src/deepx/tensorbase.hpp @@ -5,29 +5,21 @@ namespace deepx { - enum DeviceType - { - CPU = 0, - CUDA = 1, - }; - + struct TensorBase { Shape shape; - DeviceType device; TensorBase() = default; // 拷贝构造函数 TensorBase(const TensorBase &other) { shape = other.shape; - device = other.device; } // 移动构造函数 TensorBase(TensorBase &&other) noexcept { shape = std::move(other.shape); - device = other.device; } // 拷贝赋值运算符 @@ 
-36,7 +28,6 @@ namespace deepx if (this != &other) { shape = other.shape; - device = other.device; } return *this; } @@ -47,7 +38,6 @@ namespace deepx if (this != &other) { shape = std::move(other.shape); - device = other.device; } return *this; } diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 42040543..5e359dbc 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -8,66 +8,47 @@ namespace deepx::tensorfunc { using namespace std; + + // reshape template struct reshapeDispatcher { - static void reshape(const Tensor &tensor, const std::vector &new_shape,Tensor &output) = delete; + static void reshape(const Tensor &tensor, const std::vector &new_shape, Tensor &output) = delete; }; - // A.reshape(new_shape) template - void reshape(const Tensor &tensor, const std::vector &new_shape,Tensor &output) + void reshape(const Tensor &tensor, const std::vector &new_shape, Tensor &output) { - reshapeDispatcher::reshape(tensor, new_shape,output); + reshapeDispatcher::reshape(tensor, new_shape, output); } + // transpose template struct transposeDispatcher { static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) = delete; }; - // transpose(A,dim_order)=>B template void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) { transposeDispatcher::transpose(tensor, dim_order, output); } + // concat template struct concatDispatcher { - static void concat(const vector*> tensors, const int axis, Tensor &C) = delete; + static void concat(const vector *> tensors, const int axis, Tensor &C) = delete; }; - // concat(tensors,axis)=>C - template - void concat(const vector*> tensors, const int axis, Tensor &C) - { - concatDispatcher::concat(tensors, axis, C); - } - // https://onnx.ai/onnx/operators/onnx__Split.html template - struct splitDispatcher + void concat(const vector *> tensors, const int axis, Tensor &C) { - static void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) = delete; - static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; - }; - // split(tensor,axis,splits)=>tensors - template - void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) - { - splitDispatcher::split(A, axis, splits, B); - - } - - // split(tensor,axis,num_outputs)=>tensors - template - void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) - { - splitDispatcher::split(A, axis, num_outputs, B); + concatDispatcher::concat(tensors, axis, C); } + // broadcastTo template struct broadcastToDispatcher { @@ -80,67 +61,99 @@ namespace deepx::tensorfunc broadcastToDispatcher::broadcastTo(A, new_shape, B); } - - - template - struct expandDispatcher + // gather + template + struct gatherDispatcher { - static void expand(const Tensor &A, const Shape &new_shape, Tensor &B) = delete; + static void gather(const Tensor &input, const Tensor &indices, const int axis, Tensor &output) = delete; }; - template - void expand(const Tensor &A, const Shape &new_shape, Tensor &B) + template + void gather(const Tensor &input, const Tensor &indices, const int axis, Tensor &output) { - expandDispatcher::expand(A, new_shape, B); + gatherDispatcher::gather(input, indices, axis, output); } - - template - struct squeezeDispatcher - { - static void squeeze(Tensor &tensor) = delete; - }; - template - void squeeze(Tensor &tensor) - { - 
squeezeDispatcher::squeeze(tensor); - } - - template - struct unsqueezeDispatcher - { - static void unsqueeze(Tensor &tensor, const int axis) = delete; - }; - - template - void unsqueeze(Tensor &tensor, const int axis) - { - unsqueezeDispatcher::unsqueeze(tensor, axis); - } - - template - struct flattenDispatcher - { - static void flatten(Tensor &tensor) = delete; - }; - - template - void flatten(Tensor &tensor) - { - flattenDispatcher::flatten(tensor); - } - - template - struct paddingDispatcher - { - static void padding(Tensor &tensor, const Shape &new_shape) = delete; - }; - - template - void padding(Tensor &tensor, const Shape &new_shape) - { - paddingDispatcher::padding(tensor, new_shape); - } + // // split + // // https://onnx.ai/onnx/operators/onnx__Split.html + // template + // struct splitDispatcher + // { + // static void split(const Tensor &A, const int axis, const std::vector &splits, Tensor *&B) = delete; + // static void split(const Tensor &A, const int axis, const int num_outputs, Tensor *&B) = delete; + // }; + // template + // void split(const Tensor &A, const int axis, const std::vector &splits, Tensor *&B) + // { + // splitDispatcher::split(A, axis, splits, B); + // } + + // // split(tensor,axis,num_outputs)=>tensors + // template + // void split(const Tensor &A, const int axis, const int num_outputs, Tensor *&B) + // { + // splitDispatcher::split(A, axis, num_outputs, B); + // } + + // template + // struct expandDispatcher + // { + // static void expand(const Tensor &A, const Shape &new_shape, Tensor &B) = delete; + // }; + + // template + // void expand(const Tensor &A, const Shape &new_shape, Tensor &B) + // { + // expandDispatcher::expand(A, new_shape, B); + // } + + // template + // struct squeezeDispatcher + // { + // static void squeeze(Tensor &tensor) = delete; + // }; + + // template + // void squeeze(Tensor &tensor) + // { + // squeezeDispatcher::squeeze(tensor); + // } + + // template + // struct unsqueezeDispatcher + // { + // static void unsqueeze(Tensor &tensor, const int axis) = delete; + // }; + + // template + // void unsqueeze(Tensor &tensor, const int axis) + // { + // unsqueezeDispatcher::unsqueeze(tensor, axis); + // } + + // template + // struct flattenDispatcher + // { + // static void flatten(Tensor &tensor) = delete; + // }; + + // template + // void flatten(Tensor &tensor) + // { + // flattenDispatcher::flatten(tensor); + // } + + // template + // struct paddingDispatcher + // { + // static void padding(Tensor &tensor, const Shape &new_shape) = delete; + // }; + + // template + // void padding(Tensor &tensor, const Shape &new_shape) + // { + // paddingDispatcher::padding(tensor, new_shape); + // } } #endif diff --git a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp index d9fdb47f..86a1c396 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp @@ -2,6 +2,7 @@ #define DEEPX_TENSORFUNC_IO_HPP #include "deepx/tensor.hpp" +#include "stdutil/fs.hpp" namespace deepx::tensorfunc{ @@ -15,25 +16,24 @@ namespace deepx::tensorfunc{ printDispatcher::print(t, f); } - template - struct saveDispatcher{ - static void save(Tensor &tensor,const std::string &path,int filebegin=0)=delete; - }; - - template - void save(Tensor &tensor,const std::string &path,int filebegin=0){ - saveDispatcher::save(tensor, path, filebegin); - } - - template - struct loadDispatcher{ - static Tensor load(const std::string &path,int filebegin=0)=delete; - }; - - template - Tensor 
load(const std::string &path,int filebegin=0){ - return loadDispatcher::load(path, filebegin); + template + void save(Tensor &tensor,const std::string &path); + + template + pair>> load(const std::string &path); + + inline pair loadShape(const std::string &path) + { + std::string shapepath = path + ".shape"; + std::ifstream shape_fs(shapepath, std::ios::binary); + std::string shapedata((std::istreambuf_iterator(shape_fs)), std::istreambuf_iterator()); + Shape shape; + shape.fromYaml(shapedata); + std::string filename = stdutil::filename(path); + std::string tensor_name = filename.substr(0, filename.find_last_of('.')); + return std::make_pair(tensor_name, shape); } + } #endif // DEEPX_TENSORFUNC_IO_HPP diff --git a/excuter/cpp-common/src/stdutil/fs.cpp b/excuter/cpp-common/src/stdutil/fs.cpp new file mode 100644 index 00000000..63d297e0 --- /dev/null +++ b/excuter/cpp-common/src/stdutil/fs.cpp @@ -0,0 +1,7 @@ +#include "fs.hpp" + +namespace stdutil{ + string filename(const string &path){ + return path.substr(path.find_last_of('/') + 1); + } +} \ No newline at end of file diff --git a/excuter/cpp-common/src/stdutil/fs.hpp b/excuter/cpp-common/src/stdutil/fs.hpp new file mode 100644 index 00000000..26826d9b --- /dev/null +++ b/excuter/cpp-common/src/stdutil/fs.hpp @@ -0,0 +1,11 @@ +#ifndef DEEPX_STDUTIL_FS_HPP +#define DEEPX_STDUTIL_FS_HPP + +#include + +namespace stdutil{ + using namespace std; + string filename(const string &path); +} + +#endif // DEEPX_STDUTIL_FS_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/CMakeLists.txt b/excuter/op-mem-cuda/CMakeLists.txt index c6d695d5..98471042 100644 --- a/excuter/op-mem-cuda/CMakeLists.txt +++ b/excuter/op-mem-cuda/CMakeLists.txt @@ -41,6 +41,9 @@ set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_ARCHITECTURES 75) # 根据您的 GPU 计算能力进行调整 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) #确保 CMake 能够正确识别 CUDA 文件并将其编译为目标 +# 设置 CUDA 编译选项 +# 是否开启PTX 汇编展示 +# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v") find_package(yaml-cpp REQUIRED) diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index f92bcfc4..44fbcfc6 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -103,20 +103,34 @@ namespace deepx::tf vector())); } // io - void register_util(TfFactory &opfactory) + void register_io(TfFactory &opfactory) { opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), + Param("t", DataCategory::Tensor, Precision::Any), }), vector())); opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), - Param("", DataCategory::Var, Precision::String), + Param("t", DataCategory::Tensor, Precision::Any), + Param("format", DataCategory::Var, Precision::String), }), vector())); + + opfactory.add_tf(std::make_shared(vector( + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("path", DataCategory::Var, Precision::String), + }), + vector())); + + opfactory.add_tf(std::make_shared(vector( + { + Param("path", DataCategory::Var, Precision::String), + }), + vector())); + } // elementwise @@ -422,7 +436,7 @@ namespace deepx::tf Param("C", DataCategory::Tensor, Precision::Any), }))); } - // // changeshape + // changeshape void register_changeshape(TfFactory &tffactory) { // reshape @@ -465,6 +479,17 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // gather + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + 
Param("indices", DataCategory::Tensor, Precision::Int64|Precision::Int32), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); } // reduce void register_reduce(TfFactory &tffactory) @@ -520,7 +545,7 @@ namespace deepx::tf { register_lifecycle(tffactory); register_init(tffactory); - register_util(tffactory); + register_io(tffactory); register_elementwise(tffactory); register_matmul(tffactory); register_changeshape(tffactory); diff --git a/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp b/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp index 38ec1dd0..556a4aac 100644 --- a/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp @@ -52,7 +52,7 @@ namespace deepx::mem auto ptr = mem.at(name); auto result = make_shared>(); result->shape = ptr->shape; - result->device = ptr->device; + result->deleter = nullptr; result->copyer = nullptr; result->newer = nullptr; diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index e5eb511d..130fc80d 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -12,7 +12,6 @@ namespace deepx::tensorfunc { // transpose - // DIM=2^n template __global__ void transpose_kernel(const T *inputData, const int *inputStrides, @@ -41,8 +40,6 @@ namespace deepx::tensorfunc } } - - template void launch_transpose(const T *input, const int *inputStrides, @@ -56,10 +53,8 @@ namespace deepx::tensorfunc cudaVector newStrides_d(outputStrides, dim); cudaVector dimOrder_d(dimOrder, dim); - int powDim = nextPowerOf2(dim); auto [numBlocks, blockSize] = BestDims(len); - // 根据计算出的2的幂次选择对应的模板实例 - switch (powDim) + switch (dim) { case 1: transpose_kernel<1, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); @@ -67,26 +62,44 @@ namespace deepx::tensorfunc case 2: transpose_kernel<2, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; + case 3: + transpose_kernel<3, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; case 4: transpose_kernel<4, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; + case 5: + transpose_kernel<5, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 6: + transpose_kernel<6, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 7: + transpose_kernel<7, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; case 8: transpose_kernel<8, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 16: - transpose_kernel<16, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 9: + transpose_kernel<9, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 32: - transpose_kernel<32, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 10: + transpose_kernel<10, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 64: - transpose_kernel<64, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 11: + transpose_kernel<11, T><<>>(input, 
strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 128: - transpose_kernel<128, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 12: + transpose_kernel<12, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; + default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("cuda error"); } } @@ -169,11 +182,7 @@ namespace deepx::tensorfunc // shapeAtAxis cudaVector shapeAtAxis_d(shapeAtAxis, numTensors, cudaMemcpyHostToDevice); - - int powDim = nextPowerOf2(dim); - - // 根据计算出的2的幂次选择对应的模板实例 - switch (powDim) + switch (dim) { case 1: concat_kernel<1, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); @@ -181,26 +190,44 @@ namespace deepx::tensorfunc case 2: concat_kernel<2, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; + case 3: + concat_kernel<3, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; case 4: concat_kernel<4, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; + case 5: + concat_kernel<5, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 6: + concat_kernel<6, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 7: + concat_kernel<7, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; case 8: concat_kernel<8, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 16: - concat_kernel<16, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 9: + concat_kernel<9, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 32: - concat_kernel<32, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 10: + concat_kernel<10, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 64: - concat_kernel<64, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 11: + concat_kernel<11, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 128: - concat_kernel<128, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 12: + concat_kernel<12, T><<>>(tensorsDataList.data, 
inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; + default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("cuda error"); } } template void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); @@ -253,7 +280,8 @@ namespace deepx::tensorfunc template void launch_broadcastTo(const T *input, const int *inputStrides, const int intputDim, const BroadcastMap *broadcastMap, - T *output, const int *outputStrides, const int outputDim, const int outputlen){ + T *output, const int *outputStrides, const int outputDim, const int outputlen) + { auto [numBlocks, blockSize] = BestDims(outputlen); @@ -266,37 +294,51 @@ namespace deepx::tensorfunc // input cudaVector inputStrides_d(inputStrides, intputDim, cudaMemcpyHostToDevice); - - int powDim = nextPowerOf2(outputDim); - // 根据计算出的2的幂次选择对应的模板实例 - switch (powDim) + switch (outputDim) { case 1: - broadcastTo_kernel<1, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<1, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; case 2: - broadcastTo_kernel<2, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<2, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 3: + broadcastTo_kernel<3, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; case 4: - broadcastTo_kernel<4, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<4, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 5: + broadcastTo_kernel<5, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 6: + broadcastTo_kernel<6, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 7: + broadcastTo_kernel<7, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; case 8: - broadcastTo_kernel<8, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<8, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 16: - broadcastTo_kernel<16, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + case 9: + broadcastTo_kernel<9, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 32: - broadcastTo_kernel<32, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, 
outputStrides_d.data, outputDim, outputlen); + case 10: + broadcastTo_kernel<10, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 64: - broadcastTo_kernel<64, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + case 11: + broadcastTo_kernel<11, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 128: - broadcastTo_kernel<128, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + case 12: + broadcastTo_kernel<12, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("cuda error"); } } template void launch_broadcastTo(const double *input, const int *inputStrides, const int inputDim, @@ -310,7 +352,7 @@ namespace deepx::tensorfunc nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_broadcastTo<__half>(const __half *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, - __half *output, const int *outputStrides, const int outputDim, const int outputlen); + __half *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_broadcastTo(const int64_t *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); @@ -323,5 +365,183 @@ namespace deepx::tensorfunc template void launch_broadcastTo(const int8_t *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); + + // gather + + template + __host__ __device__ void fromGatherIndices( + const int *output_indices, // 输出张量的索引 + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, // indices是tensor + const int gatherAxis, // gather操作的轴 + int *input_indices, const int inputDim) + { + + for (int i = 0; i < inputDim; ++i) + { + input_indices[i] = output_indices[i]; + } + + // 使用indices张量中对应位置的值来替换gatherAxis维度的索引 + int indices_idx = linearAt(indicesStrides, indicesDim, output_indices); + input_indices[gatherAxis] = indices[indices_idx]; + } + + template + __global__ void gather_kernel( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < outputlen; thread_id += grid_stride) + { + // 输出索引 + int output_indices[DIM]; + linearTo(indicesStrides, indicesDim, output_indices, thread_id); + + // 输入索引 + int input_indices[DIM]; + fromGatherIndices(output_indices, + indices, indicesStrides, indicesDim, + gatherAxis, + input_indices, inputDim); + int inputIdx = linearAt(inputStrides, inputDim, input_indices); + int outputIdx = linearAt(indicesStrides, indicesDim, output_indices); + output[outputIdx] = 
input[inputIdx]; + } + } + + template <typename T, typename GatherAxisT> + void launch_gather( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen) + { + + auto [numBlocks, blockSize] = BestDims(outputlen); + + // indices + cudaVector<int> indicesStrides_d(indicesStrides, indicesDim, cudaMemcpyHostToDevice); + + // input + cudaVector<int> inputStrides_d(inputStrides, inputDim, cudaMemcpyHostToDevice); + int dim = std::max(inputDim, indicesDim); + switch (dim) + { + case 1: + gather_kernel<1, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 2: + gather_kernel<2, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 3: + gather_kernel<3, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 4: + gather_kernel<4, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 5: + gather_kernel<5, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 6: + gather_kernel<6, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 7: + gather_kernel<7, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 8: + gather_kernel<8, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 9: + gather_kernel<9, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 10: + gather_kernel<10, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 11: + gather_kernel<11, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 12: + gather_kernel<12, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + default: + throw std::runtime_error("dimension larger than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error(std::string("cuda error: ") + cudaGetErrorString(err)); + } + } + template void launch_gather<double, int64_t>(const double *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + double *output, const int outputlen); + template void launch_gather<float, int64_t>(const float *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + float *output, const int outputlen); + template void launch_gather<nv_bfloat16, int64_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + nv_bfloat16 *output, const int
outputlen); + template void launch_gather<__half, int64_t>(const __half *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + __half *output, const int outputlen); + template void launch_gather<int64_t, int64_t>(const int64_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int64_t *output, const int outputlen); + template void launch_gather<int32_t, int64_t>(const int32_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int32_t *output, const int outputlen); + template void launch_gather<int16_t, int64_t>(const int16_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int16_t *output, const int outputlen); + template void launch_gather<int8_t, int64_t>(const int8_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int8_t *output, const int outputlen); + + template void launch_gather<double, int32_t>(const double *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + double *output, const int outputlen); + template void launch_gather<float, int32_t>(const float *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + float *output, const int outputlen); + template void launch_gather<nv_bfloat16, int32_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + nv_bfloat16 *output, const int outputlen); + template void launch_gather<__half, int32_t>(const __half *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + __half *output, const int outputlen); + template void launch_gather<int64_t, int32_t>(const int64_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int64_t *output, const int outputlen); + template void launch_gather<int32_t, int32_t>(const int32_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int32_t *output, const int outputlen); + template void launch_gather<int16_t, int32_t>(const int16_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int16_t *output, const int outputlen); + template void launch_gather<int8_t, int32_t>(const int8_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int8_t *output, const int outputlen); } + + #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file
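Reviewer note: every launch_* wrapper in this file now follows the same dispatch idiom — a runtime switch on the tensor rank selects a kernel instantiated with a compile-time DIM, so per-thread coordinate arrays like int idx[DIM] have a fixed size, and ranks above MAX_DIM (12) are rejected up front. A minimal, self-contained sketch of the idiom; the kernel and wrapper names here are illustrative, not part of this patch:

    #include <stdexcept>

    template <int DIM>
    __global__ void demo_kernel(const int *strides, int len)
    {
        int coords[DIM]; // fixed size: can live in registers/local memory
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < len)
        {
            int r = tid;
            for (int i = 0; i < DIM; ++i) // linear index -> coordinates
            {
                coords[i] = r / strides[i];
                r %= strides[i];
            }
        }
    }

    void launch_demo(const int *strides, int dim, int len, int numBlocks, int blockSize)
    {
        switch (dim) // one case per supported rank, 1..MAX_DIM
        {
        case 1: demo_kernel<1><<<numBlocks, blockSize>>>(strides, len); break;
        case 2: demo_kernel<2><<<numBlocks, blockSize>>>(strides, len); break;
        // ... cases 3..12 continue the same way ...
        default: throw std::runtime_error("dimension larger than 12");
        }
    }

Compared with the old nextPowerOf2 dispatch, the switch instantiates one kernel per actual rank, so a rank-5 tensor no longer pays for a DIM=8 instantiation.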
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh index 7b0f5d31..2047a636 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ -13,9 +13,8 @@ namespace deepx::tensorfunc __global__ void transpose_kernel(const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template <typename T> - void launch_transpose( const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + void launch_transpose(const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <typename T> __global__ void concat_kernel(const T **tensorsData, const int *inputStrides, @@ -30,21 +29,40 @@ namespace deepx::tensorfunc template <typename T> void launch_concat(const T **tensorsData, const int *inputStrides, T *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - - __host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices); - // broadcastTo + __host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices); + template <int DIM, typename T> __global__ void broadcastTo_kernel( - const T *input, const int *inputStrides,const int inputDim, + const T *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, - T *output, const int *outputStrides,const int outputDim,const int outputlen); + T *output, const int *outputStrides, const int outputDim, const int outputlen); template <typename T> - void launch_broadcastTo(const T *input, const int *inputStrides,const int intputDim, + void launch_broadcastTo(const T *input, const int *inputStrides, const int intputDim, const BroadcastMap *broadcastMap, - T *output, const int *outputStrides,const int outputDim,const int outputlen); - + T *output, const int *outputStrides, const int outputDim, const int outputlen); + + // gather + template <typename GatherAxisT> + __host__ __device__ void fromGatherIndices( + const int *output_indices, // coordinates in the output tensor + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, // indices is itself a tensor + const int gatherAxis, // axis the gather runs along + int *input_indices, const int inputDim); // computed input-tensor coordinates (out parameter) + + template <int DIM, typename T, typename GatherAxisT> + __global__ void gather_kernel( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen); // output has the same shape as indices, so they share strides/dim/len + + template <typename T, typename GatherAxisT> + void launch_gather( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen); // output has the same shape as indices, so they share strides/dim/len }; #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH \ No newline at end of file
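Reviewer note: the gather contract declared above is the usual one — the output tensor has the indices tensor's shape, and each output element copies the input element whose coordinates match the output's, except along gatherAxis, where the coordinate is taken from the indices tensor. A CPU reference for the axis-1, 2-D case, illustrative only and not part of this patch:

    #include <stdexcept>
    #include <vector>

    // out[i][j] = in[i][idx[i][j]], with in of shape {rows, cols}
    // and idx/out of shape {rows, k}, all row-major.
    std::vector<float> gather_axis1(const std::vector<float> &in, int rows, int cols,
                                    const std::vector<int> &idx, int k)
    {
        std::vector<float> out(static_cast<size_t>(rows) * k);
        for (int i = 0; i < rows; ++i)
            for (int j = 0; j < k; ++j)
            {
                int c = idx[i * k + j];
                if (c < 0 || c >= cols)
                    throw std::out_of_range("gather index out of range");
                out[i * k + j] = in[i * cols + c];
            }
        return out;
    }

Note the bounds check: the CUDA kernels above trust the indices tensor, so an out-of-range index reads out of bounds on the device.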
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp index d127e6a1..8fb43a76 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -13,6 +13,7 @@ namespace deepx::tensorfunc { + //reshape template <typename T> struct reshapeDispatcher<miaobyte, T> { @@ -43,6 +44,7 @@ namespace deepx::tensorfunc } }; + //transpose template <typename T> struct transposeDispatcher<miaobyte, T> { @@ -59,6 +61,7 @@ namespace deepx::tensorfunc } }; + //concat template <typename T> struct concatDispatcher<miaobyte, T> { @@ -96,7 +99,7 @@ namespace deepx::tensorfunc }; }; - + //broadcastTo template <typename T> struct broadcastToDispatcher<miaobyte, T> { @@ -113,5 +116,23 @@ namespace deepx::tensorfunc B.data, B.shape.strides.data(), B.shape.dim, B.shape.size); } }; + + //gather + template <typename T, typename GatherAxisT> + struct gatherDispatcher<miaobyte, T, GatherAxisT> + { + static void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output){ + vector<int> input_gatherShape = indices.shape.shape; + if (input_gatherShape.empty()||input_gatherShape!=output.shape.shape) + { + throw TensorShapeError("Gather shape mismatch"); + } + int gatherAxis = axis < 0 ? input.shape.dim + axis : axis; + if (gatherAxis < 0 || gatherAxis >= input.shape.dim) + { + throw std::invalid_argument("Axis is out of bounds"); + } + launch_gather(input.data, input.shape.strides.data(), input.shape.dim, + indices.data, indices.shape.strides.data(), indices.shape.dim, + gatherAxis, + output.data, output.shape.size); // output has indices' shape, so strides etc. are shared + } + }; } #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index 7dc0320e..603415b8 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -28,8 +28,9 @@ namespace deepx::tensorfunc private: cublasHandle_t handle_; }; - //TODO - inline int deviceblocksize(){ + // TODO + inline int deviceblocksize() + { int device_id; cudaGetDevice(&device_id); cudaDeviceProp props; @@ -59,6 +60,8 @@ namespace deepx::tensorfunc blocks = std::min(blocks, optimal_blocks); return {blocks, blocksize}; }; + + } #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu index 0e98773c..82497ee5 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu @@ -87,7 +87,6 @@ namespace deepx::tensorfunc // compute in float first, then cast to the target type float result = low + (high - low) * rand; - printf("threadIdx: %d, idx: %d, result: %f\n", threadIdx.x, idx, result); data[idx] = static_cast<T>(result); } } @@ -124,13 +123,10 @@ namespace deepx::tensorfunc for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) { // draw a random number - float rand = curand_uniform(&state); - + float rand = curand_normal(&state); // compute in float first, then cast to the target type - float result = rand; - // float result = mean + stddev * rand; - printf("threadIdx: %d, idx: %d, result: %f\n", threadIdx.x, idx, result); - data[idx] = static_cast<T>(rand); + float result = mean + stddev * rand; + data[idx] = static_cast<T>(result); } } template
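Reviewer note: the init_miaobyte.cu hunk above is a behavior fix, not a cleanup — the old normal_kernel sampled curand_uniform into data and ignored mean/stddev (and printf-ed from every thread). The intended transform, for reference:

    // curand_normal(&state) draws z ~ N(0, 1); affine scaling gives N(mean, stddev^2):
    float z = curand_normal(&state);
    float result = mean + stddev * z; // E[result] = mean, Var[result] = stddev^2

With the fix, normal(t, mean, std, seed) finally honors its mean and std arguments.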
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp index e4aa4080..02fee22f 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp @@ -3,6 +3,10 @@ #include #include + +#include <cuda_fp16.h> +#include <cuda_bf16.h> + #include #include #include @@ -27,6 +31,27 @@ namespace deepx::tensorfunc throw std::runtime_error("Failed to allocate host memory"); } + stdutil::print(t.shape.shape, host_data, t.shape.dtype, f); + delete[] host_data; + }; + }; + + // specializations for Float16 and BFloat16 + template <> + struct printDispatcher<miaobyte, half> + { + static void print(const Tensor<half> &t, const std::string &f = "") + { + int bytes = precision_bits(t.shape.dtype) / 8; + size_t total_bytes = t.shape.size * bytes; + + // allocate host memory + unsigned char *host_data = new unsigned char[total_bytes]; + if (host_data == nullptr) + { + throw std::runtime_error("Failed to allocate host memory"); + } + // copy the data to the host cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) @@ -35,103 +60,158 @@ namespace deepx::tensorfunc throw std::runtime_error("Failed to copy data from device to host"); } - // half and bf16 must be converted to float - if (t.shape.dtype == Precision::Float16 || t.shape.dtype == Precision::BFloat16) + float *host_float = new float[t.shape.size]; + if (host_float == nullptr) { - float *host_float = new float[t.shape.size]; - if (host_float == nullptr) - { - delete[] host_data; - throw std::runtime_error("Failed to allocate host memory for float conversion"); - } - - // convert on the CPU - if (t.shape.dtype == Precision::Float16) - { - for (size_t i = 0; i < t.shape.size; i++) - { - host_float[i] = __half2float(((half *)host_data)[i]); - } - } - else - { // BFloat16 - for (size_t i = 0; i < t.shape.size; i++) - { - host_float[i] = __bfloat162float(((nv_bfloat16 *)host_data)[i]); - } - } - - // print the converted float data - stdutil::print(t.shape.shape, host_float, Precision::Float32, f.empty() ? "%.4f" : f); - delete[] host_float; + delete[] host_data; + throw std::runtime_error("Failed to allocate host memory for float conversion"); } - else + + for (size_t i = 0; i < t.shape.size; i++) { - // other dtypes are printed directly - stdutil::print(t.shape.shape, host_data, t.shape.dtype, f); + host_float[i] = __half2float(((half *)host_data)[i]); } delete[] host_data; + // print the converted float data + stdutil::print(t.shape.shape, host_float, Precision::Float32, f); + delete[] host_float; } }; - template <typename T> - struct saveDispatcher + template <> + struct printDispatcher<miaobyte, nv_bfloat16> - { - static void save(Tensor<T> &tensor, const std::string &path, int filebegin = 0) + { + static void print(const Tensor<nv_bfloat16> &t, const std::string &f = "") { - // save the shape - std::string shapepath = path + ".shape"; - std::string shapedata = tensor.shape.toYaml(); - std::ofstream shape_fs(shapepath, std::ios::binary); - shape_fs.write(shapedata.c_str(), shapedata.size()); - shape_fs.close(); - - // save the data - std::string datapath = path + ".data"; - std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); - - if (!data_fs.is_open()) + int bytes = precision_bits(t.shape.dtype) / 8; + size_t total_bytes = t.shape.size * bytes; + + // allocate host memory + unsigned char *host_data = new unsigned char[total_bytes]; + if (host_data == nullptr) + { + throw std::runtime_error("Failed to allocate host memory"); + } + + // copy the data to the host + cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + delete[] host_data; + throw std::runtime_error("Failed to copy data from device to host"); + } + + float *host_float = new float[t.shape.size]; + if (host_float == nullptr) { - // create the file if it does not exist - data_fs.open(datapath, std::ios::binary | std::ios::out); + delete[] host_data; + throw std::runtime_error("Failed to allocate host memory for float conversion"); } - data_fs.seekp(filebegin); - data_fs.write(reinterpret_cast<const char *>(tensor.data), tensor.shape.size * sizeof(T)); - data_fs.close(); + + for (size_t i = 0; i < t.shape.size; i++) + { + host_float[i] = __bfloat162float(((nv_bfloat16 *)host_data)[i]); + } + delete[] host_data; + // print the converted float data + stdutil::print(t.shape.shape, host_float, Precision::Float32, f); + delete[] host_float; } }; + template <typename T> - struct loadDispatcher + void save(Tensor<T> &tensor, const std::string &path) { - static Tensor<T> load(const std::string &path, int filebegin = 0) + // save the shape + std::string shapepath = path + ".shape"; + std::string shapedata = tensor.shape.toYaml(); + std::ofstream
shape_fs(shapepath, std::ios::binary); + shape_fs.write(shapedata.c_str(), shapedata.size()); + shape_fs.close(); + + // save the data + int bytes = precision_bits(tensor.shape.dtype) / 8; + size_t total_bytes = tensor.shape.size * bytes; + + // allocate host memory + unsigned char *host_data = new unsigned char[total_bytes]; + if (host_data == nullptr) { - // load the shape - std::string shapepath = path + ".shape"; - std::ifstream shape_fs(shapepath, std::ios::binary); - std::string shapedata((std::istreambuf_iterator<char>(shape_fs)), std::istreambuf_iterator<char>()); + throw std::runtime_error("Failed to allocate host memory"); + } - Shape shape; - shape.fromYaml(shapedata); - shape_fs.close(); + // copy the data to the host + cudaError_t err = cudaMemcpy(host_data, tensor.data, total_bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + delete[] host_data; + throw std::runtime_error("Failed to copy data from device to host"); + } - // load the data - Tensor<T> tensor = New<T>(shape); - std::string datapath = path + ".data"; - std::ifstream data_fs(datapath, std::ios::binary); + std::string datapath = path + ".data"; + std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); - if (!data_fs.is_open()) - { - throw std::runtime_error("cannot open data file: " + datapath); - } + if (!data_fs.is_open()) + { + // create the file if it does not exist + data_fs.open(datapath, std::ios::binary | std::ios::out); + } + data_fs.seekp(0); + data_fs.write(reinterpret_cast<const char *>(host_data), total_bytes); + data_fs.close(); - // set the read position - data_fs.seekg(filebegin); - data_fs.read(reinterpret_cast<char *>(tensor.data), shape.size * sizeof(T)); - data_fs.close(); + delete[] host_data; + }; - return tensor; + template <typename T> + pair<string, shared_ptr<Tensor<T>>> load(const std::string &path) + { + // load the shape + pair<string, Shape> shape_name = loadShape(path); + Shape shape = shape_name.second; + std::string tensor_name = shape_name.first; + + // check that T matches shape.dtype + if (shape.dtype != precision<T>()) + { + throw std::runtime_error("load<" + precision_str(precision<T>()) + "> mismatch: the file's dtype is " + precision_str(shape.dtype)); } - }; + + // check that the file size equals shape.size elements + std::string datapath = path + ".data"; + std::ifstream data_fs(datapath, std::ios::binary); + data_fs.seekg(0, std::ios::end); + std::streamsize fileSize = data_fs.tellg(); + std::streamsize expectedSize = shape.size * precision_bits(shape.dtype) / 8; + + if (fileSize != expectedSize) + { + throw std::runtime_error("data file size mismatch: expected " + std::to_string(expectedSize) + + " bytes, but the file has " + std::to_string(fileSize) + " bytes"); + } + data_fs.seekg(0); + + // TODO stream straight from the file into CUDA memory (possibly device memory) + + shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape)); + unsigned char *host_data = new unsigned char[fileSize]; + if (host_data == nullptr) + { + throw std::runtime_error("Failed to allocate host memory"); + } + data_fs.read(reinterpret_cast<char *>(host_data), fileSize); + data_fs.close(); + + cudaError_t err = cudaMemcpy(tensor->data, host_data, fileSize, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + delete[] host_data; + throw std::runtime_error("Failed to copy data from host to device"); + } + delete[] host_data; + return std::make_pair(tensor_name, tensor); + } } #endif // DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP \ No newline at end of file
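Reviewer note: on both executors a tensor is now persisted as two sibling files — path + ".shape" (the YAML emitted by Shape::toYaml and read back by loadShape, which also carries the tensor's name and dtype) and path + ".data" (raw element bytes, element size = precision_bits(dtype)/8). A hedged usage sketch of the CUDA-side round trip; the tensor name and path are illustrative:

    // Assumes a float32 tensor named "w" is already registered in mem.
    auto w = mem->gettensor<float>("w");
    deepx::tensorfunc::save(*w, "/tmp/w");                 // writes /tmp/w.shape and /tmp/w.data
    auto named = deepx::tensorfunc::load<float>("/tmp/w"); // pair<name, shared_ptr<Tensor<float>>>
    mem->addtensor(named.first, named.second);             // re-register under the saved name

load<T> throws if T's dtype differs from the dtype recorded in the .shape file, or if the .data size does not match shape.size elements.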
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu index c9e185c8..c6047ac8 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu @@ -58,8 +58,8 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int powDim = nextPowerOf2(tensor_dim); - switch (powDim) + + switch (tensor_dim) { case 1: sum_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -67,26 +67,38 @@ namespace deepx::tensorfunc case 2: sum_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + sum_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: sum_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 5: + sum_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 6: + sum_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 7: + sum_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 8: sum_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - sum_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 32: - sum_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 9: + sum_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - sum_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 10: + sum_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 128: - sum_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 11: + sum_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 12: + sum_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } } @@ -155,8 +167,7 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int
powDim = nextPowerOf2(tensor_dim); - switch (powDim) + switch (tensor_dim) { case 1: prod_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -164,26 +175,26 @@ namespace deepx::tensorfunc case 2: prod_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + prod_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: prod_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 8: - prod_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 5: + prod_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - prod_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 6: + prod_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 32: - prod_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 7: + prod_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - prod_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 128: - prod_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 8: + prod_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 9: + prod_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 10: + prod_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 11: + prod_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 12: + prod_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } } @@ -252,8 +263,7 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int powDim = nextPowerOf2(tensor_dim); - switch (powDim) + switch (tensor_dim) { case 1: max_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -261,26 +271,38 @@ namespace deepx::tensorfunc case 2: max_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + max_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: max_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim,
tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 5: + max_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 6: + max_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 7: + max_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 8: max_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - max_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 32: - max_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 9: + max_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - max_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 10: + max_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 11: + max_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 128: - max_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 12: + max_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } }; @@ -349,8 +371,7 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int powDim = nextPowerOf2(tensor_dim); - switch (powDim) + switch (tensor_dim) { case 1: min_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -358,26 +379,38 @@ namespace deepx::tensorfunc case 2: min_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + min_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: min_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 5: + min_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim,
tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 6: + min_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 7: + min_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 8: min_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - min_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 32: - min_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 9: + min_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - min_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 10: + min_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 128: - min_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 11: + min_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 12: + min_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh index b60ab3f8..ef3538a4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh @@ -16,7 +16,7 @@ namespace deepx::tensorfunc } } - inline __host__ __device__ int linearAt(const int *strides, const int dim, int *indices) + inline __host__ __device__ int linearAt(const int *strides, const int dim, const int *indices) { int idx = 0; for (int i = 0; i < dim; i++) @@ -34,22 +34,8 @@ namespace deepx::tensorfunc neworder[i] = order[dimOrder[i]]; } } - - inline int nextPowerOf2(int n) - { - if (n <= 0) - return 1; - if ((n & (n - 1)) == 0) - return n; // n is already a power of two - - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n + 1; - } + + // maximum supported tensor rank; the launch switches instantiate kernels for ranks 1..MAX_DIM + const int MAX_DIM = 12; } #endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH
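Reviewer note: linearTo/linearAt in tensor_cuda.cuh are the inverse pair all of the kernels above lean on; strides here are row-major element strides, so a linear index and a coordinate tuple round-trip exactly. A worked example (values illustrative):

    // shape {2, 3, 4} -> strides {12, 4, 1}
    int strides[3] = {12, 4, 1};
    // linearTo: 17 -> {1, 1, 1}, because 17 = 1*12 + 1*4 + 1*1
    int indices[3];
    int r = 17;
    for (int i = 0; i < 3; ++i) { indices[i] = r / strides[i]; r %= strides[i]; }
    // linearAt: {1, 1, 1} -> 17 (sum of indices[i] * strides[i])
    int idx = 0;
    for (int i = 0; i < 3; ++i) idx += indices[i] * strides[i];

Making linearAt take const int *indices lets callers pass the same buffers to both functions without casts.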
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp index acf28e9f..fdcb0f17 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp @@ -38,7 +38,6 @@ namespace deepx::tensorfunc Shape shape(shapedata); shape.dtype = precision<T>(); Tensor<T> tensor(shape); - tensor.device = CUDA; // use the CUDA device tensor.deleter = dataFree; tensor.copyer = dataCopy; tensor.newer = dataNew; diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp index 99186de5..6320b7d4 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp @@ -11,6 +11,8 @@ namespace deepx::tf { using namespace deepx::tensorfunc; using namespace std; + + // reshape template <typename Author> class Reshape : public TF { @@ -72,6 +74,7 @@ } }; + // transpose template <typename Author> class Transpose : public TF { @@ -140,6 +143,7 @@ } }; + // concat template <typename Author> class Concat : public TF { @@ -163,13 +167,13 @@ } int run(shared_ptr mem, string &error) override { - if (!checktensors({ this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->returns[0].textvalue}, mem, error) != 0) { return 1; } vector<string> tensor_names = this->getvector<string>(0, true); - if (!checktensors(tensor_names, mem, error)!=0) + if (!checktensors(tensor_names, mem, error) != 0) { return 1; } @@ -274,6 +278,7 @@ }; }; + // broadcastTo template <typename Author> class BroadcastTo : public TF { @@ -338,5 +343,152 @@ return 0; } }; -} + + // gather + template <typename Author> + class Gather : public TF + { + public: + Gather(const vector<Param> &args, const vector<Param> &returns) + { + this->name = "gather"; + this->author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T2 = T1.gather(indices=[1,2], axis=1)"; + } + shared_ptr<TF> clone() const override + { + return make_shared<Gather<Author>>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + + int axis = this->getvar<int>(2, mem, true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "input_type " + precision_str(input_type) + " and output_type " + precision_str(output_type) + " must be the same"; + return 1; + } + Precision indices_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + if (indices_type != Precision::Int64 && indices_type != Precision::Int32) + { + error = "indices_type " + precision_str(indices_type) + " is not supported; use " + precision_str(Precision::Int64) + " or " + precision_str(Precision::Int32); + return 1; + } + + switch (input_type) + { + case Precision::Float64: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue)); + } + break; + } + case Precision::Float32: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue)); + } + break; + } + case Precision::Float16: + { + if (indices_type == Precision::Int64) + { +
gather<Author>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue)); + } + break; + } + case Precision::BFloat16: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int64: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int32: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int16: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int8: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue)); + } + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + +}; #endif // DEEPX_TF_CHANGESHAPE_HPP
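Reviewer note: Gather::run above hand-writes the 8-dtype x 2-index-type product — sixteen nearly identical calls. One hedged way to halve it is to fold the index-type branch into a helper so each dtype case becomes a single line; the helper name and the exact mem type below are illustrative, not part of this patch:

    // Sketch only: dispatch the indices dtype once per input dtype T.
    template <typename Author, typename T, typename MemT>
    static void gather_any_indices(MemT &mem, const string &a, const string &idx,
                                   int axis, const string &out, Precision idxType)
    {
        if (idxType == Precision::Int64)
            gather<Author>(*mem.gettensor<T>(a), *mem.gettensor<int64_t>(idx), axis, *mem.gettensor<T>(out));
        else // Int32, already validated by the caller
            gather<Author>(*mem.gettensor<T>(a), *mem.gettensor<int32_t>(idx), axis, *mem.gettensor<T>(out));
    }

The same shape would also suit the Load switch below, which repeats the dtype-to-template dispatch a ninth time for bool.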
std::cerr << "save " << name << " not found" << std::endl; + error = "save " + name + " not found"; + return 1; + } + return 0; + } + }; + + //load + class Load : public TF + { + public: + Load(vector args, vector returns) + { + this->name = "load"; + this->tftype = "io"; + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "load(path)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + string path = this->args[0].textvalue; + + pair shape_name=tensorfunc::loadShape(path); + std::string tensor_name=shape_name.first; + Shape shape=shape_name.second; + + if(mem->existstensor(tensor_name)) + { + cout<<"warning: "<delete_tensor(tensor_name); + } + switch (shape.dtype) + { + case Precision::Float64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Float32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Float16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::BFloat16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int8:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Bool:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + default: + break; + } + return 0; + } + }; } #endif diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index defad8c6..6eed5e04 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -99,30 +99,44 @@ namespace deepx::tf vector())); // normal author=miaobyte tffactory.add_tf(std::make_shared>(vector( - { - Param("t", DataCategory::Tensor, Precision::Any), - Param("mean", DataCategory::Var, Precision::Any), - Param("std", DataCategory::Var, Precision::Any), - Param("seed", DataCategory::Var, Precision::Int32), - }), - vector())); + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("mean", DataCategory::Var, Precision::Any), + Param("std", DataCategory::Var, Precision::Any), + Param("seed", DataCategory::Var, Precision::Int32), + }), + vector())); } // io - void register_util(TfFactory &opfactory) + void register_io(TfFactory &opfactory) { // print author=miaobyte opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), + Param("t", DataCategory::Tensor, Precision::Any), }), vector())); // print author=miaobyte opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), - Param("", DataCategory::Var, Precision::String), + Param("t", DataCategory::Tensor, Precision::Any), + Param("format", DataCategory::Var, Precision::String), + }), + vector())); + //save + opfactory.add_tf(std::make_shared(vector( + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("path", DataCategory::Var, Precision::String), }), vector())); + + //load + 
opfactory.add_tf(std::make_shared<Load>(vector<Param>( + { + Param("path", DataCategory::Var, Precision::String), + }), + vector<Param>())); } // elementwise @@ -474,6 +488,17 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // gather author=miaobyte + tffactory.add_tf(std::make_shared<Gather<miaobyte>>(vector<Param>( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("indices", DataCategory::Tensor, Precision::Int32 | Precision::Int64), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector<Param>( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); } // // reduce void register_reduce(TfFactory &tffactory) { @@ -528,7 +553,7 @@ namespace deepx::tf { register_lifecycle(tffactory); register_init(tffactory); - register_util(tffactory); + register_io(tffactory); register_elementwise(tffactory); register_matmul(tffactory); register_changeshape(tffactory); diff --git a/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp b/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp index b8f8d538..8e4710b0 100644 --- a/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp @@ -50,7 +50,6 @@ namespace deepx::mem auto ptr = mem.at(name); auto result = make_shared<Tensor<T>>(); result->shape = ptr->shape; - result->device = ptr->device; result->deleter = nullptr; result->copyer = nullptr; result->newer = nullptr;
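Reviewer note: the ompsimd hunks below migrate rangeParallel callbacks from a single std::vector<int> newIndices scratch argument to a ThreadLocalVectors bundle; the trailing {n} argument declares the sizes of the per-thread scratch vectors, and the lambda fetches them with tlv.get(0). A stand-in sketch of the pattern, assuming OpenMP — the real ThreadLocalVectors and rangeParallel live elsewhere in the repo:

    #include <initializer_list>
    #include <vector>

    struct ThreadLocalVectorsSketch
    {
        std::vector<std::vector<int>> v;
        explicit ThreadLocalVectorsSketch(std::initializer_list<int> sizes)
        {
            for (int s : sizes) v.emplace_back(s); // one scratch vector per requested size
        }
        std::vector<int> &get(size_t i) { return v[i]; }
    };

    template <typename F>
    void rangeParallelSketch(int n, F &&body, std::initializer_list<int> scratch)
    {
        #pragma omp parallel
        {
            ThreadLocalVectorsSketch tlv(scratch); // one instance per thread, no sharing
            #pragma omp for
            for (int i = 0; i < n; ++i)
                body(i, tlv);
        }
    }

The payoff is allocation-free inner loops: each thread reuses its own scratch vector instead of allocating per element or racing on a shared one.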
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 1644c045..7e2985fb 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -10,12 +10,13 @@ #include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc { + // reshape template <typename T> struct reshapeDispatcher<miaobyte, T> { - static void reshape(const Tensor<T> &tensor, const std::vector<int> &shape,Tensor<T> &output) + static void reshape(const Tensor<T> &tensor, const std::vector<int> &shape, Tensor<T> &output) { // the output is passed as a single tensor reference - + int new_prod = 1; for (int dim : shape) { @@ -29,18 +30,18 @@ namespace deepx::tensorfunc Shape newshape(shape); if (tensor.data == output.data) { - output.shape.shape=newshape.shape; - output.shape.strides=newshape.strides; + output.shape.shape = newshape.shape; + output.shape.strides = newshape.strides; } else { - output.shape.shape=newshape.shape; - output.shape.strides=newshape.strides; + output.shape.shape = newshape.shape; + output.shape.strides = newshape.strides; output.copyer(tensor.data, output.data, tensor.shape.size); } } }; - + // transpose template <typename T> struct transposeDispatcher<miaobyte, T> { @@ -55,26 +56,26 @@ namespace deepx::tensorfunc { throw std::runtime_error("transpose error!shape"); } - output.shape.rangeParallel(dim_order.size(), [&tensor, &output, &dim_order](int idx_linear, const std::vector<int> &indices, std::vector<int> &newIndices) + output.shape.rangeParallel(dim_order.size(), [&tensor, &output, &dim_order](int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv) { for (size_t i = 0; i < dim_order.size(); ++i) { - newIndices[dim_order[i]] = indices[i]; + tlv.get(0)[dim_order[i]] = indices[i]; } - output.data[idx_linear]= tensor.data[tensor.shape.linearat(newIndices)]; }, tensor.shape.dim); + output.data[idx_linear]= tensor.data[tensor.shape.linearat(tlv.get(0))]; }, {tensor.shape.dim}); } }; - + // concat template <typename T> struct concatDispatcher<miaobyte, T> { static void concat(const vector<Tensor<T> *> tensors, const int axis, Tensor<T> &result) { - // check shapes if (!checkShapeConcat(tensors, axis, result)) { throw TensorShapeError("Output tensor shape size must match the sum of input tensor shape sizes for concat"); - } + } int dimC = axis + 1; result.shape.rangeParallel(dimC, [&](const int idx, const std::vector<int> &indices) { @@ -124,17 +125,52 @@ namespace deepx::tensorfunc static void broadcastTo(const Tensor<T> &A, const vector<int> &new_shape, Tensor<T> &B) { auto A_broadcastShape = broadcastShape(A.shape.shape, new_shape); - if (A_broadcastShape.empty()||A_broadcastShape!=new_shape) + if (A_broadcastShape.empty() || A_broadcastShape != new_shape) { throw TensorShapeError("Broadcast shape mismatch"); } auto bmap = broadcastMap(A.shape.shape, new_shape); B.shape.rangeParallel(B.shape.dim, [&](const int idx, const std::vector<int> &bindices) - { + { vector<int> aindices=fromBroadcastIndices(bmap, bindices); - B.data[idx] = A.data[A.shape.linearat(aindices)]; - }); + B.data[idx] = A.data[A.shape.linearat(aindices)]; }); + } + }; + + // gather + // supports high-dimensional indices + // the result is written into input_indices + template <typename GatherAxisT> + void fromGatherIndices(const vector<int> &output_indices, const Tensor<GatherAxisT> &indices, const int gatherAxis, vector<int> &input_indices) + { + std::copy(output_indices.begin(), output_indices.begin()+input_indices.size(), input_indices.begin()); + int indices_idx = indices.shape.linearat(output_indices); + input_indices[gatherAxis] = indices.data[indices_idx]; + } + + template <typename T, typename GatherAxisT> + struct gatherDispatcher<miaobyte, T, GatherAxisT> + { + static void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output) + { + int gatherAxis = axis < 0 ? input.shape.dim + axis : axis; + if (gatherAxis < 0 || gatherAxis >= input.shape.dim) + { + throw std::invalid_argument("Axis is out of bounds"); + } + + vector<int> input_gatherShape = indices.shape.shape; + if (input_gatherShape.empty() || input_gatherShape != output.shape.shape) + { + throw TensorShapeError("Gather shape mismatch"); + } + output.shape.rangeParallel(output.shape.dim, [&](const int idx, const std::vector<int> &output_indices, ThreadLocalVectors &tlv) + { + fromGatherIndices(output_indices, indices, gatherAxis, tlv.get(0)); + output.data[idx] = input.data[input.shape.linearat(tlv.get(0))]; + }, + {input.shape.dim}); } };
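Reviewer note: both gather dispatchers normalize a negative axis Python-style before validating it; worked through once (values illustrative):

    int dim = 3, axis = -1;
    int gatherAxis = axis < 0 ? dim + axis : axis; // -1 -> 2, i.e. the last axis

Both executors then reject an out-of-range result with std::invalid_argument, so a bad axis fails identically on CPU and CUDA.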
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp index f65166ca..f219ca59 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp @@ -6,6 +6,7 @@ #include "deepx/tensor.hpp" #include "stdutil/vector.hpp" #include "stdutil/print.hpp" +#include "stdutil/fs.hpp" #include "deepx/tensorfunc/authors.hpp" #include "deepx/tensorfunc/io.hpp" #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" @@ -36,62 +37,69 @@ namespace deepx::tensorfunc }; template <typename T> - struct saveDispatcher + void save(Tensor<T> &tensor, const std::string &path) { - static void save(Tensor<T> &tensor, const std::string &path, int filebegin = 0) - { - // save the shape - std::string shapepath = path + ".shape"; - std::string shapedata = tensor.shape.toYaml(); - std::ofstream shape_fs(shapepath, std::ios::binary); - shape_fs.write(shapedata.c_str(), shapedata.size()); - shape_fs.close(); - // save the data - std::string datapath = path + ".data"; - std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); + // save the shape + std::string shapepath = path + ".shape"; + std::string shapedata = tensor.shape.toYaml(); + std::ofstream shape_fs(shapepath, std::ios::binary); + shape_fs.write(shapedata.c_str(), shapedata.size()); + shape_fs.close(); - if (!data_fs.is_open()) - { - // create the file if it does not exist - data_fs.open(datapath, std::ios::binary | std::ios::out); - } - data_fs.seekp(filebegin); - data_fs.write(reinterpret_cast<const char *>(tensor.data), tensor.shape.size * sizeof(T)); - data_fs.close(); - } - }; - template <typename T> - struct loadDispatcher - { - static Tensor<T> load(const std::string &path, int filebegin = 0) + // save the data + std::string datapath = path + ".data"; + std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); + + if (!data_fs.is_open()) { - // load the shape - std::string shapepath = path + ".shape"; - std::ifstream shape_fs(shapepath, std::ios::binary); - std::string shapedata((std::istreambuf_iterator<char>(shape_fs)), std::istreambuf_iterator<char>()); + // create the file if it does not exist + data_fs.open(datapath, std::ios::binary | std::ios::out); + } + int data_size = tensor.shape.size * precision_bits(tensor.shape.dtype) / 8; + data_fs.write(reinterpret_cast<const char *>(tensor.data), data_size); + data_fs.close(); + } - Shape shape; - shape.fromYaml(shapedata); - shape_fs.close(); + //load - // load the data - Tensor<T> tensor = New<T>(shape); - std::string datapath = path + ".data"; - std::ifstream data_fs(datapath, std::ios::binary); - if (!data_fs.is_open()) - { - throw std::runtime_error("cannot open data file: " + datapath); - } + template <typename T> + pair<string, shared_ptr<Tensor<T>>> load(const std::string &path) + { + // load the shape + pair<string, Shape> shape_name = loadShape(path); + Shape shape = shape_name.second; + std::string tensor_name = shape_name.first; + + + // check that T matches shape.dtype + if (shape.dtype != precision<T>()) + { + throw std::runtime_error("load<" + precision_str(precision<T>()) + "> mismatch: the file's dtype is " + precision_str(shape.dtype)); + } - // set the read position - data_fs.seekg(filebegin); - data_fs.read(reinterpret_cast<char *>(tensor.data), shape.size * sizeof(T)); - data_fs.close(); + // check that the file size equals shape.size elements + std::string datapath = path + ".data"; + std::ifstream data_fs(datapath, std::ios::binary); + data_fs.seekg(0, std::ios::end); + std::streamsize fileSize = data_fs.tellg(); + std::streamsize expectedSize = shape.size * precision_bits(shape.dtype) / 8; - return tensor; + if (fileSize != expectedSize) + { + throw std::runtime_error("data file size mismatch: expected " + std::to_string(expectedSize) + + " bytes, but the file has " + std::to_string(fileSize) + " bytes"); } + data_fs.seekg(0); + + // create the tensor + shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape)); + data_fs.read(reinterpret_cast<char *>(tensor->data), fileSize); + data_fs.close(); + return std::make_pair(tensor_name, tensor); }; + } #endif // DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp index aaf3f86e..b6ebea5c 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp @@ -27,38 +27,38 @@ namespace deepx::tensorfunc const int minshape_1 = Lanes(ScalableTag<T>()); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, std::vector<int> &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv) { // compute the output index for (size_t i = 0, j = 0; i < tensor.shape.dim;
++i) { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); #pragma omp atomic - result.data[outputIdx] += tensor.data[idx_linear]; }, result.shape.dim); + result.data[outputIdx] += tensor.data[idx_linear]; }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i) { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; const size_t lanes = Lanes(tag); @@ -86,7 +86,8 @@ namespace deepx::tensorfunc sum += tensor.data[idx_linear + j]; } #pragma omp atomic - result.data[outputIdx] += sum; }, result.shape.dim); + result.data[outputIdx] += sum; }, + {result.shape.dim}); } } }; @@ -104,26 +105,27 @@ namespace deepx::tensorfunc constant(result, T(1)); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { if (reduced_dims[i]==0) { - newIndices[j++]=indices[i]; + tlv.get(0)[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=0; + tlv.get(0)[j++]=0; } } // 累加求和 - int outputIdx=result.shape.linearat(newIndices); + int outputIdx=result.shape.linearat(tlv.get(0)); #pragma omp atomic - result.data[outputIdx]*=tensor.data[idx_linear]; }, result.shape.dim); + result.data[outputIdx]*=tensor.data[idx_linear]; + }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 @@ -131,13 +133,13 @@ namespace deepx::tensorfunc { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } // 累加求和 - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; @@ -170,7 +172,8 @@ namespace deepx::tensorfunc product *= tensor.data[i + j]; } #pragma omp atomic - result.data[outputIdx] *= product; }, result.shape.dim); + result.data[outputIdx] *= 
product; + }, {result.shape.dim}); } } }; @@ -187,25 +190,26 @@ namespace deepx::tensorfunc constant(result, std::numeric_limits::lowest()); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { if (reduced_dims[i]==0) { - newIndices[j++]=indices[i]; + tlv.get(0)[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=0; + tlv.get(0)[j++]=0; } } // 累加求和 - int outputIdx=result.shape.linearat(newIndices); - result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim); + int outputIdx=result.shape.linearat(tlv.get(0)); + result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); + }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 @@ -213,13 +217,13 @@ namespace deepx::tensorfunc { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] =0; + tlv.get(0)[j++] =0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; @@ -251,7 +255,8 @@ namespace deepx::tensorfunc maxt = std::max(maxt,tensor.data[i + j]); } - result.data[outputIdx] = std::max(result.data[outputIdx],maxt); }, result.shape.dim); + result.data[outputIdx] = std::max(result.data[outputIdx],maxt); + }, {result.shape.dim}); } } }; @@ -268,26 +273,27 @@ namespace deepx::tensorfunc constant(result, std::numeric_limits::max()); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { if (reduced_dims[i]==0) { - newIndices[j++]=indices[i]; + tlv.get(0)[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=0; + tlv.get(0)[j++]=0; } } // 累加求和 - int outputIdx=result.shape.linearat(newIndices); + int outputIdx=result.shape.linearat(tlv.get(0)); - result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim); + result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); + }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, std::vector &newIndices) + 
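These hunks also change the rangeParallel callback signature: the per-call `std::vector &newIndices` scratch argument becomes `ThreadLocalVectors &tlv`, and each call site now passes the scratch sizes as an initializer list (`{result.shape.dim}`). The class itself is not shown in this diff; the sketch below is only a guess at the idea it presumably implements, namely pre-sized scratch buffers that each OpenMP worker thread reuses without reallocation or sharing:

```python
import threading

class ThreadLocalVectors:
    """Per-thread scratch vectors, sized once and reused across iterations.

    A Python stand-in for the C++ helper assumed by the kernels above;
    tlv.get(0) returns the calling thread's first scratch vector.
    """
    def __init__(self, sizes):
        self._sizes = list(sizes)
        self._local = threading.local()

    def get(self, i):
        if not hasattr(self._local, "vecs"):
            # First use on this thread: allocate every scratch vector once.
            self._local.vecs = [[0] * n for n in self._sizes]
        return self._local.vecs[i]

tlv = ThreadLocalVectors([3])
tlv.get(0)[0] = 7   # safe: each thread mutates its own copy
```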
tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 @@ -295,13 +301,13 @@ namespace deepx::tensorfunc { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; @@ -333,7 +339,7 @@ namespace deepx::tensorfunc mint = std::min(mint,tensor.data[i + j]); } - result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, result.shape.dim); + result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, {result.shape.dim}); } } }; diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp index 1f9c9cda..c514faa8 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp @@ -37,7 +37,6 @@ namespace deepx::tensorfunc shape.dtype = precision(); Tensor tensor(shape); - tensor.device = CPU; tensor.deleter = dataFree; tensor.copyer = dataCopy; tensor.newer = dataNew; diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index e9c534b7..fd4f0e07 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -11,6 +11,7 @@ namespace deepx::tf using namespace deepx::tensorfunc; using namespace std; + // reshape template class Reshape : public TF { @@ -37,7 +38,7 @@ namespace deepx::tf int run(shared_ptr mem, string &error) override { - if (!checktensors({ this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->returns[0].textvalue}, mem, error) != 0) { return 1; } @@ -77,6 +78,7 @@ namespace deepx::tf } }; + // transpose template class Transpose : public TF { @@ -102,7 +104,7 @@ namespace deepx::tf int run(shared_ptr mem, string &error) override { - if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) { return 1; } @@ -143,6 +145,7 @@ namespace deepx::tf } }; + // concat template class Concat : public TF { @@ -165,14 +168,14 @@ namespace deepx::tf return make_shared(*this); } int run(shared_ptr mem, string &error) override - { - if (!checktensors({ this->returns[0].textvalue}, mem, error)!=0) + { + if (!checktensors({this->returns[0].textvalue}, mem, error) != 0) { return 1; } vector tensor_names = this->getvector(0, true); - if (!checktensors(tensor_names, mem, error)!=0) + if (!checktensors(tensor_names, mem, error) != 0) { return 1; } @@ -255,7 +258,8 @@ namespace deepx::tf }; }; - template + // broadcastTo + template class BroadcastTo : public TF { public: @@ -278,7 +282,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) { return 1; } @@ -317,124 +321,130 @@ namespace deepx::tf return 0; } }; - - // class Split : public TF - // { - // public: - // Split() - // { - // this->name="split"; - // } - // Split(string text) - // { 
- // this->parse(text); - // if (this->name!="split"){ - // throw std::runtime_error("Invalid name: "+this->name); - // } - // } - // void funcdef() override - // { - // this->parse("split(float32 T1,int32 3)->(float32 T2,T3)"); - // } - // string math_formula() const override - // { - // return "T2,T3 = split(T1, axis=3)"; - // } - // void run(mem::Mem &mem) override - // { - // std::vector *> input; - // for (int i = 0; i < this->args.size() - 1; i++) - // { - // input.push_back(mem.gettensor(this->args[i]).get()); - // } - // int axis = mem.getarg(this->args.back()); - // auto output = mem.gettensor(this->returns[0]).get(); - // tensorfunc::split(*output, axis, input); - // } - // }; - - - // template - // class Expand : public Op - // { - // public: - // Expand() - // { - // this->init("expand", "any", {}, {}, false, {}, {}); - // } - // void forward(mem::Mem &mem) override - // { - // auto input = mem.gettensor(this->args[0]).get(); - // auto output = mem.gettensor(this->returns[0]).get(); - // tensorfunc::expand(*input, *output); - // } - // vector sumaxis(const vector shape,const vector target_shape) - // { - // vector axis; - - // // 检查当前形状(this->shape)与目标形状的差异 - // int current_dim = shape.size(); - // int target_dim = target_shape.size(); - - // // 如果目标维度小于当前维度,需要在一些轴上求和来减少维度 - // if (target_dim < current_dim) - // { - // // 检查每个当前维度,看是否需要在目标形状中保留 - // for (int i = 0; i < current_dim; i++) - // { - // bool keep_dim = false; + // gather + template + class Gather : public TF + { + public: + Gather(const vector &args, const vector &returns) + { + this->name = "gather"; + this->author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } - // // 找出当前维度是否与目标形状中的任何维度对应 - // if (i < current_dim - target_dim) - // { - // // 如果当前维度索引小于两者维度差,肯定需要被求和 - // axis.push_back(i); - // } - // else - // { - // // 检查该维度是否与目标形状匹配 - // int target_idx = i - (current_dim - target_dim); - // if (target_shape[target_idx] == 1 && shape[i] > 1) - // { - // // 如果目标形状在这个维度上是1,但当前形状不是1,需要求和 - // axis.push_back(i); - // } - // } - // } - // } - // else if (target_dim == current_dim) - // { - // // 维度数量相同,检查哪些维度需要被压缩为1 - // for (int i = 0; i < current_dim; i++) - // { - // if (target_shape[i] == 1 && shape[i] > 1) - // { - // axis.push_back(i); - // } - // } - // } - // // 如果目标维度大于当前维度,可能需要扩展维度(通常通过其他操作如expand_dims) + string math_formula() const override + { + return "T2 = T1.gather(indices=T3, axis=3)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error) != 0) + { + return 1; + } + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - // return axis; - // } - // void backward(mem::Mem &mem) override - // { - // auto input_grad = mem.gettensor(this->args_grad[0]).get(); - // auto output_grad = mem.gettensor(this->returns_grad[0]).get(); - // vector target_shape = this->getvector( 1); - // vector axis = this->sumaxis(input_grad->shape.shape,target_shape); - // // sum,按指定维度求和 - // tensorfunc::sum(*output_grad, axis,*input_grad); - // } - // void funcdef() override - // { - // this->init("expand", "float32", {"T1", "4", "6", "12"}, {"T2"}, false, {}, {}); - // } - // string math_formula() const override - // { - // return "T2 = expand(T1, axis=[4,6,12])"; - // } - // }; + Precision output_type = 
mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + Precision indices_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + if (indices_type != Precision::Int32 && indices_type != Precision::Int64) + { + error = "indices only support int32 or int64"; + return 1; + } + int axis = this->getvar(2, mem, true); + switch (input_type) + { + case Precision::Float64: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Float32: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Int64: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Int16: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Int8: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Bool: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + }; + }; } -#endif // DEEPX_OP_CONCAT_HPP \ No newline at end of file +#endif // DEEPX_TF_CHANGESHAPE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp index 0fd86a18..a6fb83dc 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp @@ -7,7 +7,7 @@ #include "deepx/tensorfunc/authors.hpp" namespace deepx::tf { - + //print template class Print : public TF { @@ -20,7 +20,14 @@ namespace deepx::tf this->args = args; this->returns = returns; 
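Gather::run above is a double dispatch: an outer switch on the input dtype and an inner branch on the index dtype, with explicit failures for a dtype mismatch between input and output and for non-integer indices. Condensed into Python, with numpy's take_along_axis standing in for the miaobyte gather kernel (whose exact index-shape rules this diff does not show):

```python
import numpy as np

def gather(t: np.ndarray, idx: np.ndarray, axis: int, out: np.ndarray) -> int:
    # Mirrors the guard clauses in Gather::run.
    if out.dtype != t.dtype:
        raise TypeError(f"Type mismatch: {t.dtype} != {out.dtype}")
    if idx.dtype not in (np.int32, np.int64):
        raise TypeError("indices only support int32 or int64")
    # C++ instantiates one kernel per (tensor dtype, index dtype) pair;
    # numpy resolves the same dispatch dynamically.
    out[...] = np.take_along_axis(t, idx, axis=axis)
    return 0
```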
} - + string math_formula() const override + { + return "print(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } int run(shared_ptr mem, string &error) override { string name = this->args[0].textvalue; @@ -45,13 +52,120 @@ namespace deepx::tf return 0; } + + }; + + //save + class Save : public TF + { + public: + Save(vector args, vector returns) + { + this->name = "save"; + this->tftype = "io"; + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "save(T1,path)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + string name = this->args[0].textvalue; + string path = this->args[1].textvalue; + if (mem->existstensor(name)) + { + auto t = mem->gettensor(name); + tensorfunc::save(*t, path); + } + else + { + std::cerr << "save " << name << " not found" << std::endl; + error = "save " + name + " not found"; + return 1; + } + return 0; + } + }; + + //load + class Load : public TF + { + public: + Load(vector args, vector returns) + { + this->name = "load"; + this->tftype = "io"; + this->args = args; + this->returns = returns; + } string math_formula() const override { - return "print(T1)"; + return "mem.load(path)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + string path = this->args[0].textvalue; + + pair shape_name=tensorfunc::loadShape(path); + std::string tensor_name=shape_name.first; + Shape shape=shape_name.second; + + if(mem->existstensor(tensor_name)) + { + cout<<"warning: "<delete_tensor(tensor_name); + } + switch (shape.dtype) + { + case Precision::Float64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Float32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int8:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Bool:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + default: + break; + } + return 0; } }; } diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp index 5b62463d..b50237c5 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp @@ -14,11 +14,11 @@ void test_tensor_new(){ Tensor tensor=New({2, 3}); constant(tensor,1); print(tensor); - save(tensor,"tensor"); + save(tensor,"tensor"); Tensor tensor2=New({2, 3}); constant(tensor2,2); print(tensor2); - save(tensor2,"tensor2"); + save(tensor2,"tensor2"); } void test_arange() { @@ -30,6 +30,7 @@ void test_arange() { int main(int argc,char **argv){ int i=0; if (argc>1){ + i=std::atoi(argv[1]); } switch (i) { diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp index a7f3eeec..9a4b0aaf 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp +++ 
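The Save/Load ops above round-trip through the project's two-file tensor format: Load reads the `.shape` YAML sidecar first to learn the tensor's name and dtype, evicts any same-named tensor from Mem, then dispatches tensorfunc::load<T> on the dtype. The C++ loader also checks that the data file holds exactly `shape.size * precision_bits(dtype) / 8` bytes; here is the same check in Python (byte widths and YAML keys are assumptions, consistent with the Python saveShape later in this diff):

```python
import os
import yaml

# Assumed byte widths, i.e. precision_bits(dtype) / 8.
DTYPE_BYTES = {"bool": 1, "int8": 1, "int16": 2, "int32": 4,
               "int64": 8, "float16": 2, "float32": 4, "float64": 8}

def check_tensor_files(path: str) -> dict:
    """Validate a <path>.shape / <path>.data pair the way the C++ loader does."""
    with open(path + ".shape") as f:
        meta = yaml.safe_load(f)   # e.g. {'shape': [3, 4, 5], 'dtype': 'float32', 'size': 60, ...}
    expected = meta["size"] * DTYPE_BYTES[meta["dtype"]]
    actual = os.path.getsize(path + ".data")
    if actual != expected:
        raise RuntimeError(f"data file holds {actual} bytes, expected {expected}")
    return meta
```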
b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp @@ -14,11 +14,11 @@ void test_tensor_range(){ Tensor tensor=New({2, 3}); constant(tensor,1); print(tensor); - save(tensor,"2_tensor_range.1"); + Tensor tensor2=New({2, 3}); constant(tensor2,2); print(tensor2); - save(tensor2,"2_tensor_range.2"); + } int main(){ diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp index d965ec77..bdf3a492 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp @@ -10,6 +10,5 @@ int main(){ deepx::Tensor t=New({2, 3,4}); std::iota(t.data, t.data+t.shape.size, 0); print(t); - save(t,"3_tensor_print"); return 0; } \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp index 60027015..008b5550 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp @@ -49,10 +49,8 @@ void test_tensor_matmul(){ void bench_tensor_matmul(int i) { Tensor tensor= New({i,i}); uniform(tensor,0,1); - save(tensor,"4_tensor_matmul"+std::to_string(i)+"tensor"); Tensor tensor2= New({i,i}); uniform(tensor2,0,1); - save(tensor2,"4_tensor_matmul"+std::to_string(i)+"tensor2"); Tensor tensor3= New(matmul_shape(tensor.shape, tensor2.shape).shape); std::cout<<("matmul ", i, "x", i); auto start = std::chrono::high_resolution_clock::now(); @@ -60,7 +58,6 @@ void bench_tensor_matmul(int i) { matmul(tensor, tensor2, tensor3); auto end=std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - save(tensor3,"4_tensor_matmul"+std::to_string(i)+"result"); std::cout << "time:" << duration.count() << " seconds" << std::endl; } diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp index e902e717..dc2ef698 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp @@ -68,7 +68,6 @@ void benchmark_sum(int i){ { combstr+=std::to_string(c)+"_"; } - save(r,"5_tensor_sum."+ combstr); print(r,"%.0f"); } auto end=std::chrono::high_resolution_clock::now(); diff --git a/front/go/deepx/tensor.go b/front/go/deepx/tensor.go deleted file mode 100644 index 61223786..00000000 --- a/front/go/deepx/tensor.go +++ /dev/null @@ -1,48 +0,0 @@ -package deepx - -import "fmt" - -type Shape struct { - shape []int - stride []int - ndim int - size int -} - -func NewTensorShape(shape []int) (s Shape) { - s.ndim = len(shape) - s.shape = make([]int, len(shape)) - copy(s.shape, shape) - s.stride = make([]int, len(shape)) - s.stride[len(shape)-1] = 1 - for i := len(shape) - 2; i >= 0; i-- { - s.stride[i] = s.stride[i+1] * shape[i+1] - } - s.size = s.stride[0] * shape[0] - return s -} -func (s Shape) String() string { - return fmt.Sprintf("%v", s.shape) -} - -type Dtype int - -const ( - DtypeInt8 Dtype = iota - DtypeInt16 - DtypeInt32 - DtypeInt64 - DtypeUint8 - DtypeFloat16 - DtypeFloat32 - DtypeFloat64 -) - -type Tensor struct { - Data []byte - Dtype Dtype - Shape Shape - graph *Graph // 所属计算图 - node *TensorNode // 对应的计算图节点 - requiresGrad bool -} diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index 1e215d7e..0f729cfa 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ 
-1,6 +1,6 @@ -from .leaffunc_life import newtensor,copytensor,deltensor -from .leaffunc_io import printtensor +from .leaffunc_life import * +from .leaffunc_io import * from .leaffunc_init import * from .leaffunc_changeshape import * from .leaffunc_elementwise import * @@ -16,9 +16,10 @@ from .elementwise import * from .normalization import * __all__ = [ + #leaffunc - "newtensor", - "printtensor", + "newtensor","printtensor","load", #life + "printtensor","save","save_npy",#io "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out", "add","sub","mul","div","sqrt","pow","exp","log", "matmul", diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 6c32a4d6..4a241865 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -34,6 +34,7 @@ 'transpose':'miaobyte', 'broadcastTo':'miaobyte', 'concat':'miaobyte', + 'gather':'miaobyte', #matmul # 'matmul':'miaobyte', 'matmul':'cublas', diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index 25599498..bd86edf6 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -64,6 +64,14 @@ def broadcastTo(t:Tensor,new_shape:tuple[int],out:Union[Tensor,str]='',requires_ return outtensor broadcast_to = broadcastTo +def gather(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str]='')->Tensor: + outtensor=out + if isinstance(out,str): + outtensor=newtensor(indices.shape,dtype=input.dtype,name=out) + from .rtf_changeshape import rtf_gather + rtf_gather(input,indices,gatheraxis,outtensor,defaultauthor['gather']) + return outtensor + # def unsqueeze(t:Tensor,dim:int)->Tensor: # # 确保dim是有效的 # if dim < -t.ndim-1 or dim > t.ndim: diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py index b4490803..dd09abc0 100644 --- a/front/py/deepx/nn/functional/leaffunc_io.py +++ b/front/py/deepx/nn/functional/leaffunc_io.py @@ -1,4 +1,4 @@ -from deepx.tensor import Tensor +from deepx.tensor import Tensor,Shape,saveShape from .authormap import defaultauthor def printtensor(t:Tensor,format=''): @@ -6,3 +6,20 @@ def printtensor(t:Tensor,format=''): rtf_printtensor(t,format,defaultauthor['print']) return '' +def save(t:Tensor,path:str): + from .rtf_io import rtf_save + rtf_save(t,path) + return t + +def save_npy(t,path:str): + r''' + 保存numpy.tensor为deepxtensor格式 + ''' + from numpy import save,ndarray,ascontiguousarray + shape=Shape(t.shape) + shape._dtype=str(t.dtype) + saveShape(shape,path+".shape") + + array = ascontiguousarray(t) + array.tofile(path+'.data') + return t \ No newline at end of file diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py index abf6a530..8952b9d6 100644 --- a/front/py/deepx/nn/functional/leaffunc_life.py +++ b/front/py/deepx/nn/functional/leaffunc_life.py @@ -23,3 +23,6 @@ def deltensor(t:Tensor): from .rtf_life import rtf_deltensor rtf_deltensor(t) +def load(path:str)->Tensor: + from .rtf_io import rtf_load + return rtf_load(path) diff --git a/front/py/deepx/nn/functional/rtf_changeshape.py b/front/py/deepx/nn/functional/rtf_changeshape.py index 2f9e7e2d..c7233ab0 100644 --- a/front/py/deepx/nn/functional/rtf_changeshape.py +++ b/front/py/deepx/nn/functional/rtf_changeshape.py @@ -28,3 +28,9 @@ def 
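save_npy above writes an in-memory numpy array straight into the same on-disk pair (saveShape for the YAML sidecar, ndarray.tofile for the raw bytes), so data can be handed to an executor without going through a deepx Tensor first. A usage sketch; the path is hypothetical, and load assumes a running executor to receive the IR:

```python
import numpy as np
from deepx.nn.functional import save_npy, load

path = "/tmp/deepx_demo/indices"   # hypothetical scratch location
arr = np.arange(12, dtype=np.int64).reshape(3, 4)

save_npy(arr, path)   # writes indices.shape (YAML) + indices.data (raw bytes)
t = load(path)        # sends the load IR, names the handle from the .shape file
t.print()
```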
rtf_broadcastTo(t:Tensor,new_shape:tuple[int],out:Tensor,author='miaobyte'): ir=DeepxIR("broadcastTo", args, returns,author) send(ir) +def rtf_gather(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miaobyte'): + args=[Param.tensor(input),Param.tensor(indices),Param.varnum(axis)] + returns=[Param.tensor(out)] + ir=DeepxIR("gather", args, returns,author) + send(ir) + diff --git a/front/py/deepx/nn/functional/rtf_io.py b/front/py/deepx/nn/functional/rtf_io.py index 02569603..899b6f16 100644 --- a/front/py/deepx/nn/functional/rtf_io.py +++ b/front/py/deepx/nn/functional/rtf_io.py @@ -1,4 +1,4 @@ -from deepx.tensor import Tensor +from deepx.tensor import Tensor,loadShape from deepx.nn import DeepxIR,Param from deepx.scheduler import send @@ -9,16 +9,18 @@ def rtf_printtensor(t:Tensor,format='',author='miaobyte'): send(ir) return '' -def rtf_load(t:Tensor,path:str,author='miaobyte'): +def rtf_save(t:Tensor,path:str): args=[Param.tensor(t),Param.varstr(path)] returns=[] - ir=DeepxIR("load", args, returns,author) + ir=DeepxIR("save", args, returns) send(ir) return t -def rtf_save(t:Tensor,path:str,author='miaobyte'): - args=[Param.tensor(t),Param.varstr(path)] +def rtf_load(path:str)->Tensor: + args=[Param.varstr(path)] returns=[] - ir=DeepxIR("save", args, returns,author) + ir=DeepxIR("load", args, returns) send(ir) - return t \ No newline at end of file + shapefile=path+'.shape' + tensor_name,shape,dtype=loadShape(shapefile) + return Tensor(shape,dtype,tensor_name) diff --git a/front/py/deepx/nn/modules/sparse.py b/front/py/deepx/nn/modules/sparse.py index cab749e0..a7727a67 100644 --- a/front/py/deepx/nn/modules/sparse.py +++ b/front/py/deepx/nn/modules/sparse.py @@ -2,27 +2,142 @@ from deepx.tensor import Tensor class Embedding(Module): + r"""一个存储固定字典和大小的嵌入向量的简单查找表。 + + 该模块常用于存储词嵌入并通过索引检索它们。 + 模块的输入是索引列表,输出是对应的词嵌入向量。 + + 参数: + num_embeddings (int): 嵌入字典的大小(词汇表大小) + embedding_dim (int): 每个嵌入向量的维度 + padding_idx (int, 可选): 如果指定,该索引位置的条目不参与梯度计算; + 因此,该位置的嵌入向量在训练中不会更新,保持为固定的"填充"向量。 + 对于新创建的嵌入层,该位置的嵌入向量默认全零,但可更新为其他值作为填充向量。 + max_norm (float, 可选): 如果指定,范数超过此值的嵌入向量会被重新归一化到该范数 + norm_type (float, 可选): 计算max_norm时使用的p范数(默认L2范数,p=2) + scale_grad_by_freq (bool, 可选): 如果为True,梯度会按mini-batch中词的频率倒数缩放(默认False) + sparse (bool, 可选): 如果为True,权重矩阵的梯度将是稀疏张量(详见注释) + + 属性: + weight (Tensor): 模块的可学习权重,形状为(num_embeddings, embedding_dim), + 初始化为正态分布N(0, 1) + + 形状: + - 输入: :math:`(*)`, 任意形状的IntTensor或LongTensor,包含要提取的索引 + - 输出: :math:`(*, H)`, 其中*是输入形状,H=embedding_dim + + .. 注意:: + 注意只有部分优化器支持稀疏梯度:目前支持的有SGD(CPU和CUDA)、SparseAdam(CPU和CUDA)、Adagrad(CPU) + + .. 
注意:: + 当max_norm不为None时,嵌入层的前向传播会原地修改weight张量。 + 由于梯度计算所需的张量不能被原地修改,因此在调用前向传播前对weight进行可微操作时, + 若max_norm不为None则需要克隆weight。例如:: + + n, d, m = 3, 5, 7 + embedding = nn.Embedding(n, d, max_norm=1.0) + W = torch.randn((m, d), requires_grad=True) + idx = torch.tensor([1, 2]) + a = embedding.weight.clone() @ W.t() # weight必须克隆以保证可微性 + b = embedding(idx) @ W.t() # 原地修改weight + out = (a.unsqueeze(0) + b.unsqueeze(1)) + loss = out.sigmoid().prod() + loss.backward() + + 示例:: + + >>> # 包含10个3维张量的嵌入层 + >>> embedding = nn.Embedding(10, 3) + >>> # 2个样本,每个包含4个索引的批次 + >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[[-0.0251, -1.6902, 0.7172], + [-0.6431, 0.0748, 0.6969], + [ 1.4970, 1.3448, -0.9685], + [-0.3677, -2.7265, -0.1685]], + + [[ 1.4970, 1.3448, -0.9685], + [ 0.4362, -0.4004, 0.9400], + [-0.6431, 0.0748, 0.6969], + [ 0.9124, -2.3616, 1.1151]]]) + + + >>> # 带padding_idx的示例 + >>> embedding = nn.Embedding(10, 3, padding_idx=0) + >>> input = torch.LongTensor([[0, 2, 0, 5]]) + >>> embedding(input) + tensor([[[ 0.0000, 0.0000, 0.0000], + [ 0.1535, -2.0309, 0.9315], + [ 0.0000, 0.0000, 0.0000], + [-0.1655, 0.9897, 0.0635]]]) + + >>> # 修改填充向量的示例 + >>> padding_idx = 0 + >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx) + >>> embedding.weight + Parameter containing: + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + >>> with torch.no_grad(): + ... embedding.weight[padding_idx] = torch.ones(3) + >>> embedding.weight + Parameter containing: + tensor([[ 1.0000, 1.0000, 1.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + """ + + def __init__(self, - num_embeddings:int, - embedding_dim:int, - padding_idx:int=None, + num_embeddings:int, #嵌入字典的大小(词汇表大小)vocab_size,llama=128256 + embedding_dim:int, #每个嵌入向量的维度,隐藏层大小hidden_size,llama=4096 + padding_idx:int=None, max_norm:float=None, norm_type:float=2.0, scale_grad_by_freq:bool=False, + _weight:Tensor=None,dtype=None, sparse:bool=False): super(Embedding, self).__init__() self.num_embeddings = num_embeddings self.embedding_dim = embedding_dim + + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + padding_idx = self.num_embeddings + padding_idx self.padding_idx = padding_idx self.max_norm = max_norm self.norm_type = norm_type self.scale_grad_by_freq = scale_grad_by_freq + if _weight is None: + self.weight = Tensor(shape=(num_embeddings, embedding_dim),dtype=dtype) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "权重形状与num_embeddings和embedding_dim不匹配" + self.weight = _weight + self.sparse = sparse - self.weight = Tensor(num_embeddings, embedding_dim) - self.weight.uniform_(-0.01, 0.01) + if padding_idx is not None: self.weight[padding_idx] = 0 + def reset_parameters(self) -> None: + self.weight.normal_() # 正态分布初始化权重 + self._fill_padding_idx_with_zero() # 填充索引位置归零 + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + self.weight[self.padding_idx].fill_(0) def forward(self, input:Tensor)->Tensor: return self.weight[input] diff --git a/front/py/deepx/nn/modules/sparse.torch.py b/front/py/deepx/nn/modules/sparse.torch.py new file mode 100644 index 00000000..a0621543 --- 
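deepx's Embedding.forward is just `self.weight[input]`, which Tensor.__getitem__ (added later in this diff) routes to the new gather op. Conceptually the lookup is a row gather over the weight table, as this plain-numpy illustration shows:

```python
import numpy as np

vocab_size, hidden = 10, 3
weight = np.random.randn(vocab_size, hidden).astype(np.float32)

ids = np.array([[1, 2, 4, 5], [4, 3, 2, 9]])   # (batch, seq) token ids
out = weight[ids]                              # (batch, seq, hidden) vectors
assert out.shape == (2, 4, 3)
```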
/dev/null +++ b/front/py/deepx/nn/modules/sparse.torch.py @@ -0,0 +1,512 @@ +# mypy: 允许无类型定义的函数 +from typing import Optional + +import torch +from torch import Tensor +from torch.nn import functional as F, init +from torch.nn.parameter import Parameter + +from .module import Module + + +__all__ = ["Embedding", "EmbeddingBag"] + + +class Embedding(Module): + r"""一个存储固定字典和大小的嵌入向量的简单查找表。 + + 该模块常用于存储词嵌入并通过索引检索它们。 + 模块的输入是索引列表,输出是对应的词嵌入向量。 + + 参数: + num_embeddings (int): 嵌入字典的大小(词汇表大小) + embedding_dim (int): 每个嵌入向量的维度 + padding_idx (int, 可选): 如果指定,该索引位置的条目不参与梯度计算; + 因此,该位置的嵌入向量在训练中不会更新,保持为固定的"填充"向量。 + 对于新创建的嵌入层,该位置的嵌入向量默认全零,但可更新为其他值作为填充向量。 + max_norm (float, 可选): 如果指定,范数超过此值的嵌入向量会被重新归一化到该范数 + norm_type (float, 可选): 计算max_norm时使用的p范数(默认L2范数,p=2) + scale_grad_by_freq (bool, 可选): 如果为True,梯度会按mini-batch中词的频率倒数缩放(默认False) + sparse (bool, 可选): 如果为True,权重矩阵的梯度将是稀疏张量(详见注释) + + 属性: + weight (Tensor): 模块的可学习权重,形状为(num_embeddings, embedding_dim), + 初始化为正态分布N(0, 1) + + 形状: + - 输入: :math:`(*)`, 任意形状的IntTensor或LongTensor,包含要提取的索引 + - 输出: :math:`(*, H)`, 其中*是输入形状,H=embedding_dim + + .. 注意:: + 注意只有部分优化器支持稀疏梯度:目前支持的有SGD(CPU和CUDA)、SparseAdam(CPU和CUDA)、Adagrad(CPU) + + .. 注意:: + 当max_norm不为None时,嵌入层的前向传播会原地修改weight张量。 + 由于梯度计算所需的张量不能被原地修改,因此在调用前向传播前对weight进行可微操作时, + 若max_norm不为None则需要克隆weight。例如:: + + n, d, m = 3, 5, 7 + embedding = nn.Embedding(n, d, max_norm=1.0) + W = torch.randn((m, d), requires_grad=True) + idx = torch.tensor([1, 2]) + a = embedding.weight.clone() @ W.t() # weight必须克隆以保证可微性 + b = embedding(idx) @ W.t() # 原地修改weight + out = (a.unsqueeze(0) + b.unsqueeze(1)) + loss = out.sigmoid().prod() + loss.backward() + + 示例:: + + >>> # 包含10个3维张量的嵌入层 + >>> embedding = nn.Embedding(10, 3) + >>> # 2个样本,每个包含4个索引的批次 + >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[[-0.0251, -1.6902, 0.7172], + [-0.6431, 0.0748, 0.6969], + [ 1.4970, 1.3448, -0.9685], + [-0.3677, -2.7265, -0.1685]], + + [[ 1.4970, 1.3448, -0.9685], + [ 0.4362, -0.4004, 0.9400], + [-0.6431, 0.0748, 0.6969], + [ 0.9124, -2.3616, 1.1151]]]) + + + >>> # 带padding_idx的示例 + >>> embedding = nn.Embedding(10, 3, padding_idx=0) + >>> input = torch.LongTensor([[0, 2, 0, 5]]) + >>> embedding(input) + tensor([[[ 0.0000, 0.0000, 0.0000], + [ 0.1535, -2.0309, 0.9315], + [ 0.0000, 0.0000, 0.0000], + [-0.1655, 0.9897, 0.0635]]]) + + >>> # 修改填充向量的示例 + >>> padding_idx = 0 + >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx) + >>> embedding.weight + Parameter containing: + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + >>> with torch.no_grad(): + ... 
embedding.weight[padding_idx] = torch.ones(3) + >>> embedding.weight + Parameter containing: + tensor([[ 1.0000, 1.0000, 1.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + """ + + __constants__ = [ + "num_embeddings", + "embedding_dim", + "padding_idx", + "max_norm", + "norm_type", + "scale_grad_by_freq", + "sparse", + ] + + num_embeddings: int + embedding_dim: int + padding_idx: Optional[int] + max_norm: Optional[float] + norm_type: float + scale_grad_by_freq: bool + weight: Tensor + freeze: bool + sparse: bool + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + _weight: Optional[Tensor] = None, + _freeze: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + if _weight is None: + self.weight = Parameter( + torch.empty((num_embeddings, embedding_dim), **factory_kwargs), + requires_grad=not _freeze, + ) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "权重形状与num_embeddings和embedding_dim不匹配" + self.weight = Parameter(_weight, requires_grad=not _freeze) + + self.sparse = sparse + + def reset_parameters(self) -> None: + init.normal_(self.weight) # 正态分布初始化权重 + self._fill_padding_idx_with_zero() # 填充索引位置归零 + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): # 不计算梯度 + self.weight[self.padding_idx].fill_(0) # 填充位置设为0 + + def forward(self, input: Tensor) -> Tensor: + return F.embedding( + input, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + + def extra_repr(self) -> str: + s = "{num_embeddings}, {embedding_dim}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + if self.sparse is not False: + s += ", sparse=True" + return s.format(**self.__dict__) + + @classmethod + def from_pretrained( + cls, + embeddings, + freeze=True, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + ): + r"""从给定的2维FloatTensor创建Embedding实例。 + + 参数: + embeddings (Tensor): 包含嵌入权重的FloatTensor, + 第一维作为num_embeddings,第二维作为embedding_dim。 + freeze (bool, 可选): 若为True,张量在学习过程中不更新, + 相当于embedding.weight.requires_grad = False。默认True。 + padding_idx (int, 可选): 同模块初始化文档说明。 + max_norm (float, 可选): 同模块初始化文档说明。 + norm_type (float, 可选): 同模块初始化文档说明,默认2。 + scale_grad_by_freq (bool, 可选): 同模块初始化文档说明,默认False。 + sparse (bool, 可选): 同模块初始化文档说明。 + + 示例:: + + >>> # 包含预训练权重的FloatTensor + >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>>
embedding = nn.Embedding.from_pretrained(weight) + >>> # 获取索引1的嵌入 + >>> input = torch.LongTensor([1]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[ 4.0000, 5.1000, 6.3000]]) + """ + assert ( + embeddings.dim() == 2 + ), "Embeddings参数应为2维张量" + rows, cols = embeddings.shape + embedding = cls( + num_embeddings=rows, + embedding_dim=cols, + _weight=embeddings, + _freeze=freeze, + padding_idx=padding_idx, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse, + ) + return embedding + + +class EmbeddingBag(Module): + r"""计算嵌入"袋"的和或均值,无需实例化中间嵌入。 + + 对于固定长度的袋、无per_sample_weights、无等于padding_idx的索引,且输入为2D时, + 该类的行为如下: + * mode="sum"等价于Embedding层后接torch.sum(dim=1) + * mode="mean"等价于Embedding层后接torch.mean(dim=1) + * mode="max"等价于Embedding层后接torch.max(dim=1) + + 但EmbeddingBag比链式操作更节省时间和内存。 + + EmbeddingBag还支持在正向传播时传入样本权重, + 这会在按mode指定的方式进行加权归约前缩放嵌入输出。 + 若传入per_sample_weights,仅支持mode="sum",即按权重计算加权和。 + + 参数: + num_embeddings (int): 嵌入字典的大小(词汇表大小) + embedding_dim (int): 每个嵌入向量的维度 + max_norm (float, 可选): 若指定,范数超过此值的嵌入向量会被重新归一化到该范数 + norm_type (float, 可选): 计算max_norm时使用的p范数(默认L2范数,p=2) + scale_grad_by_freq (bool, 可选): 若为True,梯度会按mini-batch中词的频率倒数缩放(默认False)。 + 注意:mode="max"时不支持此选项。 + mode (str, 可选): "sum"、"mean"或"max",指定袋的归约方式。 + "sum"计算加权和(考虑per_sample_weights), + "mean"计算袋内平均值,"max"计算袋内最大值。默认"mean"。 + sparse (bool, 可选): 若为True,权重矩阵的梯度将是稀疏张量(详见注释)。 + 注意:mode="max"时不支持此选项。 + include_last_offset (bool, 可选): 若为True,offsets包含一个额外元素, + 其值等于indices的长度,符合CSR格式。 + padding_idx (int, 可选): 若指定,该索引位置的条目不参与梯度计算; + 因此,该位置的嵌入向量在训练中不会更新,保持为固定的"填充"向量。 + 对于新创建的EmbeddingBag,该位置的嵌入向量默认全零, + 但可更新为其他值作为填充向量。注意该位置的嵌入向量会被排除在归约之外。 + + 属性: + weight (Tensor): 模块的可学习权重,形状为(num_embeddings, embedding_dim), + 初始化为正态分布N(0, 1)。 + + 示例:: + + >>> # 包含10个3维张量的EmbeddingBag(求和模式) + >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum') + >>> # 2个样本,每个包含4个索引的输入(展平为1D) + >>> input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) + >>> offsets = torch.tensor([0, 4], dtype=torch.long) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding_sum(input, offsets) + tensor([[-0.8861, -5.4350, -0.0523], + [ 1.1306, -2.5798, -1.0044]]) + + >>> # 带padding_idx的示例 + >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum', padding_idx=2) + >>> input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.long) + >>> offsets = torch.tensor([0, 4], dtype=torch.long) + >>> embedding_sum(input, offsets) + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7082, 3.2145, -2.6251]]) + + >>> # 从Embedding加载EmbeddingBag的示例 + >>> embedding = nn.Embedding(10, 3, padding_idx=2) + >>> embedding_sum = nn.EmbeddingBag.from_pretrained( + embedding.weight, + padding_idx=embedding.padding_idx, + mode='sum') + """ + + __constants__ = [ + "num_embeddings", + "embedding_dim", + "max_norm", + "norm_type", + "scale_grad_by_freq", + "mode", + "sparse", + "include_last_offset", + "padding_idx", + ] + + num_embeddings: int + embedding_dim: int + max_norm: Optional[float] + norm_type: float + scale_grad_by_freq: bool + weight: Tensor + mode: str + sparse: bool + include_last_offset: bool + padding_idx: Optional[int] + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + _weight: Optional[Tensor] = None, + include_last_offset: bool = False, + padding_idx: Optional[int] = None, + device=None, + dtype=None, + ) -> None: + 
factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "padding_idx必须在num_embeddings范围内" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "padding_idx必须在num_embeddings范围内" + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + if _weight is None: + self.weight = Parameter( + torch.empty((num_embeddings, embedding_dim), **factory_kwargs) + ) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "权重形状与num_embeddings和embedding_dim不匹配" + self.weight = Parameter(_weight) + self.mode = mode + self.sparse = sparse + self.include_last_offset = include_last_offset + + def reset_parameters(self) -> None: + init.normal_(self.weight) # 正态分布初始化权重 + self._fill_padding_idx_with_zero() # 填充索引位置归零 + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): # 不计算梯度 + self.weight[self.padding_idx].fill_(0) # 填充位置设为0 + + def forward( + self, + input: Tensor, + offsets: Optional[Tensor] = None, + per_sample_weights: Optional[Tensor] = None, + ) -> Tensor: + """EmbeddingBag的正向传播。 + + 参数: + input (Tensor): 包含嵌入矩阵索引袋的张量。 + offsets (Tensor, 可选): 仅当input为1D时使用,确定input中每个袋(序列)的起始索引位置。 + per_sample_weights (Tensor, 可选): 浮点/双精度权重张量,None表示所有权重为1。 + 若指定,形状必须与input相同,且在offsets非None时使用相同的偏移量。仅支持mode='sum'。 + + 返回: + 形状为(B, embedding_dim)的张量。 + + .. 注意:: + + 关于input和offsets的说明: + - input和offsets必须同类型(int或long) + - 若input为2D形状(B, N),视为B个固定长度N的袋,返回B个按mode聚合的值,此时offsets被忽略且必须为None。 + - 若input为1D形状(N),视为多个袋(序列)的拼接,offsets必须为1D张量,包含每个袋在input中的起始索引位置。 + 因此,对于形状(B)的offsets,input视为B个袋,空袋(长度为0)返回全零向量。 + """ + return F.embedding_bag( + input, + self.weight, + offsets, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.mode, + self.sparse, + per_sample_weights, + self.include_last_offset, + self.padding_idx, + ) + + def extra_repr(self) -> str: + s = "{num_embeddings}, {embedding_dim}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + s += ", mode={mode}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + return s.format(**{k: repr(v) for k, v in self.__dict__.items()}) + + @classmethod + def from_pretrained( + cls, + embeddings: Tensor, + freeze: bool = True, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + include_last_offset: bool = False, + padding_idx: Optional[int] = None, + ) -> "EmbeddingBag": + r"""从给定的2维FloatTensor创建EmbeddingBag实例。 + + 参数: + embeddings (Tensor): 包含EmbeddingBag权重的FloatTensor, + 第一维作为num_embeddings,第二维作为embedding_dim。 + freeze (bool, 可选): 若为True,张量在学习过程中不更新, + 相当于embeddingbag.weight.requires_grad = False。默认True。 + max_norm (float, 可选): 同模块初始化文档说明,默认None。 + norm_type (float, 可选): 同模块初始化文档说明,默认2。 + scale_grad_by_freq (bool, 可选): 同模块初始化文档说明,默认False。 + mode (str, 可选): 同模块初始化文档说明,默认"mean"。 + sparse (bool, 可选): 同模块初始化文档说明,默认False。 + include_last_offset (bool, 可选): 同模块初始化文档说明,默认False。 + padding_idx (int, 可选): 同模块初始化文档说明,默认None。 + + 示例:: + + >>> # 
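EmbeddingBag's offsets mark where each variable-length bag starts in the flattened index list, and the reduction then runs per bag. The docstring's mode='sum' layout corresponds to this numpy computation:

```python
import numpy as np

weight = np.random.randn(10, 3).astype(np.float32)
flat_ids = np.array([1, 2, 4, 5, 4, 3, 2, 9])
offsets = np.array([0, 4])              # bag 0 = ids[0:4], bag 1 = ids[4:8]

bags = np.split(flat_ids, offsets[1:])  # split at every offset after the first
out = np.stack([weight[b].sum(axis=0) for b in bags])   # mode="sum"
assert out.shape == (2, 3)
```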
包含预训练权重的FloatTensor + >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight) + >>> # 获取索引1和0的嵌入袋(2D输入) + >>> input = torch.LongTensor([[1, 0]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embeddingbag(input) + tensor([[ 2.5000, 3.7000, 4.6500]]) + """ + assert ( + embeddings.dim() == 2 + ), "Embeddings参数应为2维张量" + rows, cols = embeddings.shape + embeddingbag = cls( + num_embeddings=rows, + embedding_dim=cols, + _weight=embeddings, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + mode=mode, + sparse=sparse, + include_last_offset=include_last_offset, + padding_idx=padding_idx, + ) + embeddingbag.weight.requires_grad = not freeze + return embeddingbag \ No newline at end of file diff --git a/front/py/deepx/tensor/__init__.py b/front/py/deepx/tensor/__init__.py index b46990e9..25fdac77 100644 --- a/front/py/deepx/tensor/__init__.py +++ b/front/py/deepx/tensor/__init__.py @@ -5,12 +5,13 @@ from .changeshape import * # 导入转置方法 from .init import * from .reduce import * - +from .io import * __all__ = [ 'Shape', 'Tensor', 'tensor_method', 'Number', + 'loadShape', # 'lt', 'gt', 'eq', # 'sin', 'cos', 'tan', # 'DType', diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py index a9776f3e..f5534541 100644 --- a/front/py/deepx/tensor/changeshape.py +++ b/front/py/deepx/tensor/changeshape.py @@ -49,6 +49,17 @@ def broadcastTo(self,shape:tuple[int],out:Union[Tensor,str]='')->Tensor: result=broadcastTo_func(self,shape,out) return result +@tensor_method +def gather(self,indices:Tensor,dim:int,out:Union[Tensor,str]='')->Tensor: + final_indices=indices + #TODO 当indices不是tensor时,需要转换为tensor + if not isinstance(indices,Tensor): + raise ValueError("indices must be a Tensor") + + from deepx.nn.functional import gather as gather_func + result=gather_func(self,final_indices,dim,out) + return result + # @tensor_method diff --git a/front/py/deepx/tensor/io.py b/front/py/deepx/tensor/io.py new file mode 100644 index 00000000..23027ee4 --- /dev/null +++ b/front/py/deepx/tensor/io.py @@ -0,0 +1,22 @@ +import yaml +import os +from deepx.tensor import Shape + +def loadShape(path:str)->tuple[str,Shape,str]: + filename = os.path.basename(path) + if filename.endswith('.shape'): + with open(path, 'r') as f: + shape = yaml.safe_load(f) + else: + raise ValueError("文件名必须以.shape结尾") + + tensor_name = filename[:-6] # 移除'.shape'后缀 + return (tensor_name,Shape(shape['shape']),shape['dtype']) + +def saveShape(t:Shape,path:str): + if path.endswith('.shape'): + with open(path, 'w') as f: + yaml.dump({'shape': list(t.shape), 'dtype': t._dtype,'size':t.numel(),'dim':t.ndim,'stride':list(t.stride)}, f) + else: + raise ValueError("文件名必须以.shape结尾") + diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py index 436f6bc6..dfc4f5a0 100644 --- a/front/py/deepx/tensor/shape.py +++ b/front/py/deepx/tensor/shape.py @@ -8,6 +8,7 @@ def __init__(self, self._size = int(np.prod(self.shape)) if self.shape else 0 # 计算 stride(步长) self._strides = self._compute_strides() + self._dtype=None @property def shape(self,dim=None): @@ -156,6 +157,7 @@ def broadcast_shape(cls,shape_a: tuple[int], shape_b: tuple[int]) -> tuple[int]: return tuple(result_shape) + @classmethod def reduceshape(cls,shape:tuple[int],dim:list[int],keepdim:bool)->tuple[int]: ndim = len(shape) @@ -169,4 +171,5 @@ def reduceshape(cls,shape:tuple[int],dim:list[int],keepdim:bool)->tuple[int]: for i, s in enumerate(shape)) 
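loadShape/saveShape above pin down the sidecar contract shared by the front end and the executor: the tensor's name comes from the filename minus the `.shape` suffix, and the YAML body carries shape, dtype, size, dim and stride. A round-trip sketch; the path is hypothetical, and `_dtype` is assigned directly because Shape does not take a dtype in its constructor:

```python
from deepx.tensor import Shape, saveShape, loadShape

s = Shape((3, 4, 5))
s._dtype = "float32"
saveShape(s, "/tmp/t1.shape")      # rejects any path not ending in .shape

name, shape, dtype = loadShape("/tmp/t1.shape")
print(name, shape.shape, dtype)    # t1 (3, 4, 5) float32
```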
else: return tuple(s for i, s in enumerate(shape) - if i not in unique_dim) \ No newline at end of file + if i not in unique_dim) + \ No newline at end of file diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index 5b909d88..69a328e9 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -119,6 +119,10 @@ def __invert__(self): def __matmul__(self, other:Union[Number,'Tensor']): return self.matmul(other) + #gather + def __getitem__(self, indices:'Tensor'): + return self.gather(indices) + #shape操作 @property def T(self) -> str: diff --git a/front/py/examples/1_tensor/2_saveload.py b/front/py/examples/1_tensor/2_saveload.py new file mode 100644 index 00000000..0b88d544 --- /dev/null +++ b/front/py/examples/1_tensor/2_saveload.py @@ -0,0 +1,21 @@ +from deepx.tensor import Tensor +from deepx.nn.functional import arange,save,load + +def saveloadfloat32(): + t1=arange(start=0,end=60 ,dtype='float32',name='t1').reshape_(3,4,5) + dir='/home/lipeng/model/deepxmodel/tester/' + + t2=load(dir+t1.name) + t2.print() + +def saveloadint8(): + t=arange(start=0,end=60 ,dtype='int8',name='t.int8').reshape_(3,4,5) + dir='/home/lipeng/model/deepxmodel/tester/' + + t2=load(dir+t.name) + t2.print() + + +if __name__ == "__main__": + saveloadfloat32() + saveloadint8() \ No newline at end of file diff --git a/front/py/examples/2_ir/4_changeshape_gather.py b/front/py/examples/2_ir/4_changeshape_gather.py new file mode 100644 index 00000000..e05013bd --- /dev/null +++ b/front/py/examples/2_ir/4_changeshape_gather.py @@ -0,0 +1,28 @@ +############-------PyTorch-------################ +import numpy as np +print() +indices_np = np.array([[0, 1, 2], [0, 1, 2]]) + +print(indices_np) + +import torch +torch_t = torch.arange(10*5, dtype=torch.float32).reshape(10,5) +torch_indices = torch.tensor(indices_np) +torch_t = torch.gather(torch_t, 1,torch_indices) +print(torch_t.shape) +print(torch_t) + + +############-------DEEPX-------################ + +from deepx import Tensor,arange,Shape +from deepx.nn.functional import load,save_npy + + +save_npy(indices_np,'/home/lipeng/model/deepxmodel/tester/testindices') + +t = arange(start=0,end=10*5,dtype='float32',name='t').reshape(10,5) +indices = load('/home/lipeng/model/deepxmodel/tester/testindices') +indices.print() +t = t.gather(indices,dim=1) +t.print() \ No newline at end of file diff --git a/front/py/examples/3_module/0_hg_tokenizer.py b/front/py/examples/3_module/0_hg_tokenizer.py new file mode 100644 index 00000000..3a54d6f4 --- /dev/null +++ b/front/py/examples/3_module/0_hg_tokenizer.py @@ -0,0 +1,43 @@ +from transformers import AutoTokenizer + +def init_tokenizer(model_path): + tokenizer = AutoTokenizer.from_pretrained(model_path) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + +tokenizer = init_tokenizer("/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Llama-8B") + +def test_tokenizer(): + # 测试编码功能 + text = "这是一个测试文本 aaa bbb" + tokens = tokenizer(text, return_tensors="np") + print(f"{text}==>{tokens.input_ids.shape} {tokens}") + + # 测试解码功能 + for i in range(tokens.input_ids.shape[0]): + for j in range(tokens.input_ids.shape[1]): + decoded_text = tokenizer.decode(tokens.input_ids[i][j]) + print(f"{i,j}->{decoded_text}") + + # 验证特殊tokens + print(f"PAD token:{tokenizer.pad_token_id}= {tokenizer.pad_token}") + print(f"EOS token:{tokenizer.eos_token_id}= {tokenizer.eos_token}") + print(f"Vocabulary size: {len(tokenizer)}") + + # 测试批处理 + batch_texts = ["测试文本一", "另一个测试文本", "第三个测试文本"] + batch_tokens = 
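For reference, the expected output of 4_changeshape_gather.py can be reproduced in plain numpy. torch.gather with dim=1 reads only the leading rows covered by the index tensor, which take_along_axis expresses as an explicit slice:

```python
import numpy as np

t = np.arange(10 * 5, dtype=np.float32).reshape(10, 5)
idx = np.array([[0, 1, 2], [0, 1, 2]])

# torch.gather(t, 1, idx): out[i, j] = t[i, idx[i, j]] for i < idx.shape[0]
out = np.take_along_axis(t[: idx.shape[0]], idx, axis=1)
print(out)   # [[0. 1. 2.]
             #  [5. 6. 7.]]
```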
tokenizer(batch_texts, padding=True, truncation=True, return_tensors='np') + print(f"批处理tokens shape: {batch_tokens.input_ids.shape}") + + # 测试最大长度限制 + long_text = "这是一个" * 100 + tokens_truncated = tokenizer(long_text, max_length=20, truncation=True, return_tensors="np") + print(f"截断后的tokens长度: {tokens_truncated.input_ids.shape[1]}") + + return True + +if __name__ == "__main__": + print() + test_result = test_tokenizer() + + print(f"Tokenizer测试完成: {'成功' if test_result else '失败'}") \ No newline at end of file diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py new file mode 100644 index 00000000..e69de29b diff --git a/front/py/examples/3_module/1_linear.dot b/front/py/examples/3_module/1_linear.dot deleted file mode 100644 index c52491f6..00000000 --- a/front/py/examples/3_module/1_linear.dot +++ /dev/null @@ -1,66 +0,0 @@ -// Computational Graph -digraph { - rankdir=TB - node [shape=record] - 126533329176464 [label="linear_0.weight -(4, 64)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635937968 [label="linear_0.bias -(4,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635951984 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635938016 [label="var_1 --0.12499999999999999" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635938976 [label="var_2 -0.12499999999999999" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635937488 [label="var_3 -0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939072 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635937536 [label="var_4 --0.125" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939216 [label="var_5 -0.125" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939168 [label="var_6 -0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939120 [label="input -(1, 64)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635940896 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635940944 [label="var_7 -1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635941280 [label="linear_0.weight.T -(64, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635941424 [label="vector_1 -[1, 0]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635944736 [label=transpose color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635944640 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635944784 [label="tensor_5 -(1, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635944976 [label="tensor_6 -(1, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635945216 [label=reshape 
color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635945168 [label="vector_2 -[1, 4]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635945360 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635945504 [label="tensor_7 -(1, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635951984 -> 126533329176464 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939072 -> 126531635937968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635938016 -> 126531635951984 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635938976 -> 126531635951984 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635937488 -> 126531635951984 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635937536 -> 126531635939072 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939216 -> 126531635939072 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939168 -> 126531635939072 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635940896 -> 126531635939120 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635940944 -> 126531635940896 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944736 -> 126531635941280 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126533329176464 -> 126531635944736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635941424 -> 126531635944736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939120 -> 126531635944640 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635941280 -> 126531635944640 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944640 -> 126531635944784 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635945216 -> 126531635944976 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635937968 -> 126531635945216 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635945168 -> 126531635945216 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944784 -> 126531635945360 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944976 -> 126531635945360 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635945360 -> 126531635945504 [arrowsize=0.8 color=gray40 penwidth=1.2] -} diff --git a/front/py/examples/3_module/1_linear.dot.svg b/front/py/examples/3_module/1_linear.dot.svg deleted file mode 100644 index 7b0806f8..00000000 --- a/front/py/examples/3_module/1_linear.dot.svg +++ /dev/null @@ -1,299 +0,0 @@ - - - - - - -%3 - - - -126533329176464 - -linear_0.weight -(4, 64) - - - -126531635944736 - -transpose - - - -126533329176464->126531635944736 - - - - - -126531635937968 - -linear_0.bias -(4,) - - - -126531635945216 - -reshape - - - -126531635937968->126531635945216 - - - - - -126531635951984 - -uniform - - - -126531635951984->126533329176464 - - - - - -126531635938016 - -var_1 --0.12499999999999999 - - - -126531635938016->126531635951984 - - - - - -126531635938976 - -var_2 -0.12499999999999999 - - - -126531635938976->126531635951984 - - - - - -126531635937488 - -var_3 -0 - - - -126531635937488->126531635951984 - - - - - -126531635939072 - -uniform - - - -126531635939072->126531635937968 - - - - - -126531635937536 - -var_4 --0.125 - - - -126531635937536->126531635939072 - - - - - -126531635939216 - -var_5 -0.125 - - - -126531635939216->126531635939072 - - - - - -126531635939168 - -var_6 -0 - - - -126531635939168->126531635939072 - - - - - -126531635939120 - -input -(1, 64) - - - -126531635944640 - -matmul - - - -126531635939120->126531635944640 - - - - - 
-126531635940896 - -constant - - - -126531635940896->126531635939120 - - - - - -126531635940944 - -var_7 -1 - - - -126531635940944->126531635940896 - - - - - -126531635941280 - -linear_0.weight.T -(64, 4) - - - -126531635941280->126531635944640 - - - - - -126531635941424 - -vector_1 -[1, 0] - - - -126531635941424->126531635944736 - - - - - -126531635944736->126531635941280 - - - - - -126531635944784 - -tensor_5 -(1, 4) - - - -126531635944640->126531635944784 - - - - - -126531635945360 - -add - - - -126531635944784->126531635945360 - - - - - -126531635944976 - -tensor_6 -(1, 4) - - - -126531635944976->126531635945360 - - - - - -126531635945216->126531635944976 - - - - - -126531635945168 - -vector_2 -[1, 4] - - - -126531635945168->126531635945216 - - - - - -126531635945504 - -tensor_7 -(1, 4) - - - -126531635945360->126531635945504 - - - - -