diff --git a/deepxctl/.gitignore b/deepxctl/.gitignore
new file mode 100644
index 00000000..5ca0477f
--- /dev/null
+++ b/deepxctl/.gitignore
@@ -0,0 +1,2 @@
+.idea
+deepxctl
\ No newline at end of file
diff --git a/deepxctl/cmd/tensor/print.go b/deepxctl/cmd/tensor/print.go
new file mode 100644
index 00000000..44029497
--- /dev/null
+++ b/deepxctl/cmd/tensor/print.go
@@ -0,0 +1,55 @@
+package tensor
+
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	coretensor "github.com/array2d/deepx/deepxctl/tensor"
+)
+
+// loadAndPrint loads a tensor of the given element type and prints it.
+func loadAndPrint[T coretensor.Number](path string) {
+	t, err := coretensor.LoadTensor[T](path)
+	if err != nil {
+		fmt.Println("failed to read file:", err)
+		return
+	}
+	t.Print()
+}
+
+func PrintCmd() {
+	printCmd := flag.NewFlagSet("print", flag.ExitOnError)
+	if len(os.Args) < 1 || os.Args[0] == "" {
+		fmt.Println("please specify a file path")
+		printCmd.Usage()
+		return
+	}
+	tensorPath := os.Args[0]
+	shape, err := coretensor.LoadShape(tensorPath)
+	if err != nil {
+		fmt.Println("failed to read file:", err)
+		return
+	}
+	switch shape.Dtype {
+	case "bool":
+		loadAndPrint[bool](tensorPath)
+	case "int8":
+		loadAndPrint[int8](tensorPath)
+	case "int16":
+		loadAndPrint[int16](tensorPath)
+	case "int32":
+		loadAndPrint[int32](tensorPath)
+	case "int64":
+		loadAndPrint[int64](tensorPath)
+	case "float16":
+		// float16 has no native Go type; printing is not supported yet
+		fmt.Println("float16 printing is not supported yet")
+	case "float32":
+		loadAndPrint[float32](tensorPath)
+	case "float64":
+		loadAndPrint[float64](tensorPath)
+	default:
+		fmt.Println("unsupported dtype:", shape.Dtype)
+	}
+}
diff --git a/deepxctl/cmd/tensor/tensor.go b/deepxctl/cmd/tensor/tensor.go
new file mode 100644
index 00000000..eb11dba0
--- /dev/null
+++ b/deepxctl/cmd/tensor/tensor.go
@@ -0,0 +1,33 @@
+package tensor
+
+import (
+	"fmt"
+	"os"
+)
+
+func PrintUsage() {
+	fmt.Println("Usage:")
+	fmt.Println("  tensor print <file path>")
+	fmt.Println("  tensor help")
+}
+
+func Execute() {
+	if len(os.Args) == 0 {
+		PrintUsage()
+		os.Exit(1)
+	}
+	subCmd := os.Args[0]
+
+	switch subCmd {
+	case "print":
+		// drop the subcommand so PrintCmd sees only its own arguments
+		os.Args = os.Args[1:]
+		PrintCmd()
+	case "help":
+		PrintUsage()
+	default:
+		fmt.Printf("unknown tensor command: %s\n", subCmd)
+		PrintUsage()
+		os.Exit(1)
+	}
+}
diff --git a/deepxctl/go.mod b/deepxctl/go.mod
new file mode 100644
index 00000000..8cabf45f
--- /dev/null
+++ b/deepxctl/go.mod
@@ -0,0 +1,5 @@
+module github.com/array2d/deepx/deepxctl
+
+go 1.23.2
+
+require gopkg.in/yaml.v2 v2.4.0 // indirect
diff --git a/deepxctl/go.sum b/deepxctl/go.sum
new file mode 100644
index 00000000..75346616
--- /dev/null
+++ b/deepxctl/go.sum
@@ -0,0 +1,3 @@
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
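A note on the file layout the new command consumes: LoadShape and LoadTensor (later in this patch) expect a <path>.shape YAML sidecar plus a raw little-endian <path>.data blob. Below is a minimal sketch of the inverse writer, handy for producing test fixtures; saveFloat32Tensor and the shapeYAML helper are assumptions inferred from the loader and from the C++ Shape::toYaml keys, not code this patch adds.

package main

import (
	"encoding/binary"
	"os"

	"gopkg.in/yaml.v2"
)

// shapeYAML mirrors the keys LoadShape unmarshals (inferred, hypothetical).
type shapeYAML struct {
	Shape  []int  `yaml:"shape"`
	Stride []int  `yaml:"stride"`
	Dim    int    `yaml:"dim"`
	Size   int    `yaml:"size"`
	Dtype  string `yaml:"dtype"`
}

// saveFloat32Tensor writes <path>.shape (YAML) and <path>.data (little-endian),
// the on-disk layout deepxctl's loader expects.
func saveFloat32Tensor(path string, shape []int, data []float32) error {
	stride := make([]int, len(shape))
	size := 1
	for i := len(shape) - 1; i >= 0; i-- { // row-major strides
		stride[i] = size
		size *= shape[i]
	}
	meta, err := yaml.Marshal(shapeYAML{Shape: shape, Stride: stride, Dim: len(shape), Size: size, Dtype: "float32"})
	if err != nil {
		return err
	}
	if err := os.WriteFile(path+".shape", meta, 0o644); err != nil {
		return err
	}
	f, err := os.Create(path + ".data")
	if err != nil {
		return err
	}
	defer f.Close()
	return binary.Write(f, binary.LittleEndian, data)
}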
diff --git a/deepxctl/main.go b/deepxctl/main.go
new file mode 100644
index 00000000..de73e43c
--- /dev/null
+++ b/deepxctl/main.go
@@ -0,0 +1,64 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/array2d/deepx/deepxctl/cmd/tensor"
+)
+
+var version = "0.1.0"
+
+func printUsage() {
+	execName := filepath.Base(os.Args[0])
+	fmt.Printf("Usage: %s [command] [arguments]\n\n", execName)
+	fmt.Println("Available commands:")
+	fmt.Println("  tensor   tensor operations")
+	fmt.Println("  version  show version information")
+	fmt.Println("  help     show help")
+	fmt.Printf("\nUse '%s help [command]' for details on a command\n", execName)
+}
+
+func main() {
+	flag.Usage = printUsage
+
+	if len(os.Args) < 2 {
+		printUsage()
+		os.Exit(1)
+	}
+
+	// the subcommand
+	cmd := os.Args[1]
+
+	// dispatch on the subcommand
+	switch cmd {
+	case "tensor":
+		// strip the subcommand so the handler sees only its own arguments
+		os.Args = os.Args[2:]
+		tensor.Execute()
+
+	case "version":
+		fmt.Printf("deepxctl version %s\n", version)
+
+	case "help":
+		if len(os.Args) > 2 {
+			helpCmd := os.Args[2]
+			switch helpCmd {
+			case "tensor":
+				tensor.PrintUsage()
+			default:
+				fmt.Printf("unknown command: %s\n", helpCmd)
+				printUsage()
+			}
+		} else {
+			printUsage()
+		}
+
+	default:
+		fmt.Printf("unknown command: %s\n", cmd)
+		printUsage()
+		os.Exit(1)
+	}
+}
diff --git a/deepxctl/tensor/fp16.go b/deepxctl/tensor/fp16.go
new file mode 100644
index 00000000..18a7e0f8
--- /dev/null
+++ b/deepxctl/tensor/fp16.go
@@ -0,0 +1,29 @@
+package tensor
+
+import (
+	"encoding/binary"
+	"math"
+)
+
+// Byte2ToFloat16 decodes two bytes holding an IEEE 754 half-precision value
+// into a float32. Little-endian, matching the byte order of .data files.
+// Simplified implementation; a production version would be more complete.
+func Byte2ToFloat16(value []byte) float32 {
+	bits := binary.LittleEndian.Uint16(value)
+	sign := float32(1)
+	if bits&0x8000 != 0 {
+		sign = -1
+	}
+	exp := int((bits & 0x7C00) >> 10)
+	frac := float32(bits&0x03FF) / 1024.0
+
+	if exp == 0 {
+		return sign * frac * float32(1.0/16384.0) // subnormal: frac * 2^-14
+	} else if exp == 31 {
+		if frac == 0 {
+			return sign * float32(math.Inf(1)) // infinity
+		}
+		return float32(math.NaN()) // NaN
+	}
+	return sign * float32(math.Pow(2, float64(exp-15))) * (1.0 + frac) // normal
+}
diff --git a/deepxctl/tensor/io.go b/deepxctl/tensor/io.go
new file mode 100644
index 00000000..b0faf9d7
--- /dev/null
+++ b/deepxctl/tensor/io.go
@@ -0,0 +1,45 @@
+package tensor
+
+import (
+	"encoding/binary"
+	"os"
+
+	"gopkg.in/yaml.v2"
+)
+
+// LoadShape reads the YAML sidecar file <filePath>.shape.
+func LoadShape(filePath string) (shape Shape, err error) {
+	var shapeData []byte
+	shapeData, err = os.ReadFile(filePath + ".shape")
+	if err != nil {
+		return
+	}
+
+	err = yaml.Unmarshal(shapeData, &shape)
+	if err != nil {
+		return
+	}
+	return
+}
+
+// LoadTensor reads <filePath>.shape plus the little-endian <filePath>.data.
+func LoadTensor[T Number](filePath string) (tensor Tensor[T], err error) {
+	var shape Shape
+	shape, err = LoadShape(filePath)
+	if err != nil {
+		return
+	}
+	file, err := os.Open(filePath + ".data")
+	if err != nil {
+		return
+	}
+	defer file.Close()
+	data := make([]T, shape.Size)
+
+	err = binary.Read(file, binary.LittleEndian, data)
+	if err != nil {
+		return
+	}
+	tensor = Tensor[T]{Data: data, Shape: shape}
+	return
+}
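A quick sanity check of Byte2ToFloat16 against known IEEE 754 half-precision bit patterns, assuming the little-endian byte order used for .data files; this small harness is illustrative only and not part of the patch.

package main

import (
	"encoding/binary"
	"fmt"

	coretensor "github.com/array2d/deepx/deepxctl/tensor"
)

func main() {
	// Known half-precision encodings and their expected decodings.
	cases := []struct {
		bits uint16
		want string
	}{
		{0x3C00, "1"},           // exponent 15, fraction 0 -> 1.0
		{0xC000, "-2"},          // sign bit set, exponent 16 -> -2.0
		{0x7C00, "+Inf"},        // exponent 31, fraction 0 -> infinity
		{0x0001, "~5.96e-08"},   // smallest subnormal: 2^-24
	}
	buf := make([]byte, 2)
	for _, c := range cases {
		binary.LittleEndian.PutUint16(buf, c.bits) // same byte order the decoder reads
		fmt.Printf("0x%04X -> %g (expected %s)\n", c.bits, coretensor.Byte2ToFloat16(buf), c.want)
	}
}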
diff --git a/deepxctl/tensor/print.go b/deepxctl/tensor/print.go
new file mode 100644
index 00000000..89dd3535
--- /dev/null
+++ b/deepxctl/tensor/print.go
@@ -0,0 +1,110 @@
+package tensor
+
+import "fmt"
+
+// Range iterates over every index combination of the first dimCount
+// dimensions and calls f with each index vector.
+func (t *Tensor[T]) Range(dimCount int, f func(indices []int)) {
+	Shape := t.Shape
+	if dimCount > len(Shape.Shape) {
+		panic("dimCount exceeds the number of dimensions in the Tensor.")
+	}
+
+	totalSize := 1
+	// total number of iterations
+	for i := 0; i < dimCount; i++ {
+		totalSize *= Shape.At(i)
+	}
+	indices := make([]int, dimCount)
+	for idx := 0; idx < totalSize; idx++ {
+		// decode idx back into the indices vector
+		idx_ := idx
+		for dim := dimCount - 1; dim >= 0; dim-- {
+			indices[dim] = idx_ % Shape.At(dim) // index in this dimension
+			idx_ /= Shape.At(dim)               // carry to the next dimension
+		}
+		f(indices)
+	}
+}
+
+// AutoFormat picks a default printf verb for a dtype.
+func AutoFormat(dtype string) string {
+	switch dtype {
+	case "bool":
+		return "%v"
+	case "int8", "int16", "int32", "int64":
+		return "%d"
+	case "float16", "float32", "float64":
+		return "%f"
+	default:
+		return "%v"
+	}
+}
+
+// Print prints the tensor's values.
+func (t *Tensor[T]) Print(format_ ...string) {
+	Shape := t.Shape
+	format := AutoFormat(t.Dtype)
+	if len(format_) > 0 {
+		format = format_[0]
+	}
+	fmt.Print("shape:[")
+	for i := 0; i < Shape.Dim; i++ {
+		fmt.Print(Shape.At(i))
+		if i < Shape.Dim-1 {
+			fmt.Print(", ")
+		}
+	}
+	fmt.Println("]")
+	if Shape.Dim == 1 {
+		fmt.Print("[")
+		for i := 0; i < Shape.At(0); i++ {
+			if i > 0 {
+				fmt.Print(" ")
+			}
+			fmt.Printf(format, t.Get(i))
+		}
+		fmt.Println("]")
+	} else if Shape.Dim == 2 {
+		fmt.Println("[")
+		for i := 0; i < Shape.At(0); i++ {
+			fmt.Print(" [")
+			for j := 0; j < Shape.At(1); j++ {
+				if j > 0 {
+					fmt.Print(" ")
+				}
+				fmt.Printf(format, t.Get(i, j))
+			}
+
+			fmt.Print("]")
+			if i < Shape.At(0)-1 {
+				fmt.Print(",")
+			}
+			fmt.Println()
+		}
+		fmt.Println("]")
+	} else {
+		t.Range(Shape.Dim-2, func(indices []int) {
+			fmt.Print(indices)
+			m, n := Shape.At(Shape.Dim-2), Shape.At(Shape.Dim-1)
+			fmt.Print([]int{m, n})
+			fmt.Println("=")
+
+			fmt.Println("[")
+			for i := 0; i < m; i++ {
+				fmt.Print(" [")
+				for j := 0; j < n; j++ {
+					if j > 0 {
+						fmt.Print(" ")
+					}
+					fmt.Printf(format, t.Get(append(indices, i, j)...))
+				}
+
+				fmt.Print("]")
+				if i < m-1 {
+					fmt.Print(",")
+				}
+				fmt.Println()
+			}
+			fmt.Println("]")
+		})
+	}
+}
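Range above and the LinearAt/LinearTo methods in the next file all use the same row-major convention: stride[i] is the product of all later dimensions, a linear offset decodes from the most significant dimension down, and encoding is the dot product of indices and strides. A standalone round-trip for an assumed [2,3,4] shape:

package main

import "fmt"

func main() {
	shape := []int{2, 3, 4}
	stride := []int{12, 4, 1} // as NewTensorShape computes: stride[i] = shape[i+1] * stride[i+1]
	idx := 17                 // arbitrary linear offset

	// Decode, the same arithmetic as Shape.LinearTo.
	indices := make([]int, len(shape))
	rem := idx
	for i := range shape {
		indices[i] = rem / stride[i]
		rem %= stride[i]
	}
	fmt.Println(indices) // [1 1 1]

	// Encode back, the same arithmetic as Shape.LinearAt.
	back := 0
	for i, v := range indices {
		back += v * stride[i]
	}
	fmt.Println(back) // 17
}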
diff --git a/deepxctl/tensor/tensor.go b/deepxctl/tensor/tensor.go
new file mode 100644
index 00000000..0ef542b3
--- /dev/null
+++ b/deepxctl/tensor/tensor.go
@@ -0,0 +1,85 @@
+package tensor
+
+import (
+	"fmt"
+)
+
+// Shape mirrors the YAML written by the C++ executors' Shape::toYaml.
+type Shape struct {
+	Shape  []int  `yaml:"shape"`
+	Stride []int  `yaml:"stride"`
+	Dim    int    `yaml:"dim"`
+	Size   int    `yaml:"size"`
+	Dtype  string `yaml:"dtype"`
+}
+
+func NewTensorShape(shape []int) (s Shape) {
+	s.Dim = len(shape)
+	s.Shape = make([]int, len(shape))
+	copy(s.Shape, shape)
+	s.Stride = make([]int, len(shape))
+	s.Stride[len(shape)-1] = 1
+	for i := len(shape) - 2; i >= 0; i-- {
+		s.Stride[i] = s.Stride[i+1] * shape[i+1]
+	}
+	s.Size = s.Stride[0] * shape[0]
+	return s
+}
+
+func (s Shape) String() string {
+	return fmt.Sprintf("%v", s.Shape)
+}
+
+func (s Shape) At(i int) int {
+	return s.Shape[i]
+}
+
+func (s Shape) LinearAt(indices []int) int {
+	idx := 0
+	for i := 0; i < len(indices); i++ {
+		idx += indices[i] * s.Stride[i]
+	}
+	return idx
+}
+
+func (s Shape) LinearTo(idx int) (indices []int) {
+	linearIndex := idx
+	indices = make([]int, s.Dim)
+	for i := 0; i < s.Dim; i++ {
+		indices[i] = linearIndex / s.Stride[i]
+		linearIndex %= s.Stride[i]
+	}
+	return indices
+}
+
+// BitSize returns the storage width of a dtype in bits.
+func BitSize(Dtype string) int {
+	switch Dtype {
+	case "bool", "int8":
+		return 8
+	case "int16", "float16":
+		return 16
+	case "int32", "float32":
+		return 32
+	case "int64", "float64":
+		return 64
+	default:
+		return 0
+	}
+}
+
+type Number interface {
+	comparable
+	float64 | float32 | int64 | int32 | int16 | int8 | bool
+}
+
+type Tensor[T Number] struct {
+	Data []T
+	Shape
+}
+
+// Get returns the element at the given indices.
+func (t *Tensor[T]) Get(indices ...int) T {
+	idx := t.Shape.LinearAt(indices)
+	return t.Data[idx]
+}
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index 34de625c..2a73fd50 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -9,13 +9,6 @@
 | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) |
 | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) |
-### io
-
-| Operation | Author | Func Def | Math Formula | IR Instruction |
-|-----------|--------|------------|--------------|----------------|
-| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() |
-| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() |
-
 ### tensorlife
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | copytensor | none | copytensor(tensor src)->(tensor dst) | T2 = T1.clone() | copytensor(tensor src)->(tensor dst) |
 | renametensor | none | renametensor(var new_name)->(tensor t) | rename(T1, new_name) | renametensor(var new_name)->(tensor t) |
 | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) |
 | deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() |
+### io
+
+| Operation | Author | Func Def | Math Formula | IR Instruction |
+|-----------|--------|------------|--------------|----------------|
+| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() |
+| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() |
+| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() |
+| load | none | load(var path)->() | load(path) | load(var path)->() |
+
 ### init
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
+| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() |
 | normal | miaobyte | normal(tensor t, var mean, var stddev, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var stddev, var seed)->() |
 | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() |
-| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() |
 | constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() |
 
 ### elementwise
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) |
 | greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) |
 | equalscalar | miaobyte | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) |
 | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) |
 | maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) |
-| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) |
-| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) |
+| tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor A)->(tensor C) |
 | divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) |
 | sin | miaobyte | sin(tensor A)->(tensor C) | T3=sin(T1) | sin(tensor A)->(tensor C) |
-| tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor 
A)->(tensor C) | | add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | +| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | +| cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | | less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | | powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | @@ -67,8 +71,6 @@ | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | ### matmul @@ -80,6 +82,7 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| gather | miaobyte | gather(tensor A, tensor indices, var axis)->(tensor B) | T2 = T1.gather(indices=[1,2], axis=1) | gather(tensor A, tensor indices, var axis)->(tensor B) | | broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | | concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | | transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index b396a357..66a03786 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -9,13 +9,6 @@ | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector 
value)->(vector name) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | -### io - -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() | -| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() | - ### tensorlife | Operation | Author | Func Def | Math Formula | IR Instruction | @@ -25,13 +18,22 @@ | newtensor | none | newtensor(var shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | | deltensor | none | deltensor(tensor t)->() | del T1 | deltensor(tensor t)->() | +### io + +| Operation | Author | Func Def | Math Formula | IR Instruction | +|-----------|--------|------------|--------------|----------------| +| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() | +| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() | +| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() | +| load | none | load(var path)->() | load(path) | load(var path)->() | + ### init | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | normal | miaobyte | normal(tensor t, var mean, var std, var seed)->() | normal(T1,mean,stddev,seed) | normal(tensor t, var mean, var std, var seed)->() | | uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() | -| arange | miaobyte | arange(tensor t, var start, var step)->() | arange(T1,start,step) | arange(tensor t, var start, var step)->() | | constant | miaobyte | constant(tensor t, var value)->() | constant(T1,value) | constant(tensor t, var value)->() | ### elementwise @@ -41,12 +43,11 @@ | switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | | greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | | equalscalar | miaobyte | equalscalar(tensor A, var scalar)->(tensor mask) | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | -| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | | add | cblas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | | add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | | greater | miaobyte | greater(tensor 
A, tensor B)->(tensor mask) | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | | lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | | less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | @@ -54,6 +55,8 @@ | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | +| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | | sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | | subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | @@ -63,7 +66,6 @@ | invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | | max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | | pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | | mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | | exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | @@ -78,6 +80,7 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| gather | miaobyte | gather(tensor A, tensor indices, var axis)->(tensor B) | T2 = T1.gather(indices=T3, axis=3) | gather(tensor A, tensor indices, var axis)->(tensor B) | | broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | | concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | | transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | diff --git a/excuter/cpp-common/src/deepx/dtype.hpp b/excuter/cpp-common/src/deepx/dtype.hpp index 5b9551a9..73b4ba5b 100644 --- a/excuter/cpp-common/src/deepx/dtype.hpp +++ b/excuter/cpp-common/src/deepx/dtype.hpp @@ -175,8 +175,8 @@ namespace deepx case Precision::Float8E4M3: return 8; //TODO 需要根据平台支持 - case Precision::Float4E2M1: - return 4; + // case Precision::Float4E2M1: + // return 4; case Precision::Int64: return 64; case Precision::Int32: @@ -186,8 +186,8 @@ namespace deepx case Precision::Int8: return 8; //TODO,int4 需要根据平台支持 - case Precision::Int4: - return 4; + // case Precision::Int4: + // return 4; case Precision::Bool: return 8; case Precision::String: diff --git 
a/excuter/cpp-common/src/deepx/mem/mem.hpp b/excuter/cpp-common/src/deepx/mem/mem.hpp index 504db01f..9ea7ab70 100644 --- a/excuter/cpp-common/src/deepx/mem/mem.hpp +++ b/excuter/cpp-common/src/deepx/mem/mem.hpp @@ -98,6 +98,17 @@ namespace deepx::mem mem[name] = ptr; } + template + void addtensor(const string &name, shared_ptr> tensor) + { + if (mem.find(name) != mem.end()) + { + cerr << "tensor already exists: " << name << endl; + return; + } + mem[name] = tensor; + } + // template // shared_ptr> temptensor(vector shape) // { diff --git a/excuter/cpp-common/src/deepx/shape.cpp b/excuter/cpp-common/src/deepx/shape.cpp index c126cc58..cedca724 100644 --- a/excuter/cpp-common/src/deepx/shape.cpp +++ b/excuter/cpp-common/src/deepx/shape.cpp @@ -83,7 +83,7 @@ namespace deepx node["dtype"] = precision_str(dtype); node["dim"] = dim; node["shape"] = shape; - node["strides"] = strides; + node["stride"] = strides; node["size"] = size; return YAML::Dump(node); } @@ -92,7 +92,7 @@ namespace deepx dtype = precision(node["dtype"].as()); dim = node["dim"].as(); shape = node["shape"].as>(); - strides=node["strides"].as>(); + strides=node["stride"].as>(); size=node["size"].as(); } } \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/shape.hpp b/excuter/cpp-common/src/deepx/shape.hpp index fdcd8dd2..ff4fea69 100644 --- a/excuter/cpp-common/src/deepx/shape.hpp +++ b/excuter/cpp-common/src/deepx/shape.hpp @@ -8,7 +8,36 @@ #include "deepx/dtype.hpp" namespace deepx { - + //omp内线程局部变量 + class ThreadLocalVectors + { + private: + std::vector> vectors; + + public: + // 构造函数接收向量大小数组 + explicit ThreadLocalVectors(const std::vector &sizes) + { + vectors.resize(sizes.size()); + for (size_t i = 0; i < sizes.size(); ++i) + { + vectors[i].resize(sizes[i], 0); + } + } + + // 获取指定索引的向量引用 + std::vector &get(size_t index) + { + return vectors[index]; + } + + // 获取所有向量 + std::vector> &getAll() + { + return vectors; + } + }; + struct Shape { Precision dtype; @@ -17,28 +46,29 @@ namespace deepx int dim; int size; - Shape()=default; + Shape() = default; Shape(const std::vector &shape); Shape(const std::initializer_list &shape); Shape(const int *shape, int dim); void setshape(const int *shape, int dim); int operator[](int index) const; int &operator[](int index); - bool operator==(const Shape &shape) const{return shape.shape==shape.shape;} + bool operator==(const Shape &shape) const { return shape.shape == shape.shape; } void print() const; - //range 不支持omp - void range(int dimCount, std::function &indices )> func ) const; - void range(int dimCount, std::function &indices )> func ) const; - void range(int dimCount, std::function func ) const; + // range 不支持omp + void range(int dimCount, std::function &indices)> func) const; + void range(int dimCount, std::function &indices)> func) const; + void range(int dimCount, std::function func) const; - //rangeParallel 支持omp,但omp内无需线程local变量 + // rangeParallel 支持omp,但omp内无需线程local变量 void rangeParallel(int dimCount, std::function &indices)> func) const; void rangeParallel(int dimCount, std::function func) const; - void rangeParallel(int dimCount, std::function &indices )> func) const; + void rangeParallel(int dimCount, std::function &indices)> func) const; - void rangeParallel(int dimCount, std::function &indices,std::vector &newIndices)> func,int newIndiceDim) const; - void rangeParallel(int dimCount, std::function &newIndices)> func,int newIndiceDim) const; - void rangeParallel(int dimCount, std::function &indices,std::vector &newIndices )> func,int newIndiceDim) 
const; + // 支持omp,但omp内需要线程local变量 + void rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const; + void rangeParallel(int dimCount, std::function func,const vector tlv_sizes) const; + void rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const; int linearat(const std::vector &indices) const; std::vector linearto(int idx_linear) const; diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.cpp b/excuter/cpp-common/src/deepx/shape_changeshape.cpp index 17a89461..c0002617 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.cpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.cpp @@ -3,18 +3,18 @@ #include "deepx/shape_changeshape.hpp" -namespace deepx +namespace deepx { - //transpose + // transpose - std::vector swaplastTwoDimOrder(const std::vector &shape) + std::vector swaplastTwoDimOrder(const std::vector &shape) { vector dimOrder = shape; std::iota(dimOrder.begin(), dimOrder.end(), 0); swap(dimOrder[dimOrder.size() - 1], dimOrder[dimOrder.size() - 2]); return dimOrder; } - std::vector transposeShape(const std::vector &shape, const std::vector &dimOrder) + std::vector transposeShape(const std::vector &shape, const std::vector &dimOrder) { if (dimOrder.size() != shape.size()) { @@ -23,16 +23,17 @@ namespace deepx std::vector newShape = shape; for (size_t i = 0; i < dimOrder.size(); ++i) { - newShape[i] =shape[dimOrder[i]]; + newShape[i] = shape[dimOrder[i]]; } return newShape; } - //concat + // concat - Shape concatShape(const std::vector &shapes,const int axis){ + Shape concatShape(const std::vector &shapes, const int axis) + { std::vector outputShape(shapes[0].dim); - outputShape=shapes[0].shape; + outputShape = shapes[0].shape; for (int i = 1; i < shapes.size(); ++i) { if (shapes[i].dim != outputShape.size()) @@ -54,7 +55,7 @@ namespace deepx return Shape(outputShape); } - //broadcast + // broadcast std::vector broadcastShape(const std::vector &a, const std::vector &b) { int len1 = a.size(); @@ -102,7 +103,7 @@ namespace deepx return broadcastMap; } - void fromBroadcastIndices(const std::vector &broadcastMap, const std::vector &broadcastIndices, std::vector &oldIndices ) + void fromBroadcastIndices(const std::vector &broadcastMap, const std::vector &broadcastIndices, std::vector &oldIndices) { for (int i = 0, j = 0; i < broadcastIndices.size(); ++i) { @@ -119,4 +120,5 @@ namespace deepx } } } + } \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.hpp b/excuter/cpp-common/src/deepx/shape_changeshape.hpp index 291c7291..ac2a588a 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.hpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.hpp @@ -19,7 +19,6 @@ namespace deepx std::vector transposeShape(const std::vector &shape, const std::vector &dimOrder); // concat - Shape concatShape(const std::vector &shapes, const int axis); template @@ -71,6 +70,8 @@ namespace deepx }; std::vector broadcastMap(const std::vector &a, const std::vector &b); -} + //gather + //gather的out.shape=indices.shape,所以无需计算 +} #endif // DEEPX_SHAPE_CHANGESHAPE_HPP \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/shape_range.cpp b/excuter/cpp-common/src/deepx/shape_range.cpp index c7fc6d5a..31a125e1 100644 --- a/excuter/cpp-common/src/deepx/shape_range.cpp +++ b/excuter/cpp-common/src/deepx/shape_range.cpp @@ -152,7 +152,7 @@ namespace deepx } } - void Shape::rangeParallel(int dimCount, std::function &indices, std::vector 
&newIndices)> func, int newIndiceDim) const + void Shape::rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const { dimCount = checkdim(dimCount, dim); int totalSize = checkTotalSize(dimCount, shape); @@ -160,7 +160,7 @@ namespace deepx #pragma omp parallel { std::vector indices(dimCount, 0); - std::vector newIndices(newIndiceDim, 0); + ThreadLocalVectors tlv(tlv_sizes); #pragma omp for for (int idx = 0; idx < totalSize; idx++) { @@ -171,11 +171,11 @@ namespace deepx indices[dim] = idx_ % shape[dim]; // 计算当前维度的索引 idx_ /= shape[dim]; // 更新 idx } - func(indices, newIndices); // 调用传入的函数 + func(indices, tlv); // 调用传入的函数 } } } - void Shape::rangeParallel(int dimCount, std::function &newIndices)> func, int newIndiceDim) const + void Shape::rangeParallel(int dimCount, std::function func,const vector tlv_sizes) const { dimCount = checkdim(dimCount, dim); int stride = checkStride(dimCount, shape); @@ -185,16 +185,16 @@ namespace deepx #pragma omp parallel { - std::vector newIndices(newIndiceDim, 0); + ThreadLocalVectors tlv(tlv_sizes); #pragma omp for for (int idx = 0; idx < total; idx++) { - func(idx * stride, newIndices); + func(idx * stride, tlv); } } } - void Shape::rangeParallel(int dimCount, std::function &indices, std::vector &newIndices)> func, int newIndiceDim) const + void Shape::rangeParallel(int dimCount, std::function &indices, ThreadLocalVectors &tlv)> func,const vector tlv_sizes) const { dimCount = checkdim(dimCount, dim); int totalSize = checkTotalSize(dimCount, shape); @@ -203,7 +203,7 @@ namespace deepx #pragma omp parallel { std::vector indices(dimCount, 0); - std::vector newIndices(newIndiceDim, 0); + ThreadLocalVectors tlv(tlv_sizes); #pragma omp for for (int idx = 0; idx < totalSize; idx++) { @@ -214,7 +214,7 @@ namespace deepx indices[dim] = idx_ % shape[dim]; // 计算当前维度的索引 idx_ /= shape[dim]; // 更新 idx } - func(idx * stride, indices, newIndices); + func(idx * stride, indices, tlv); } } } diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp index af5f0dc6..463e42cc 100644 --- a/excuter/cpp-common/src/deepx/tensor.hpp +++ b/excuter/cpp-common/src/deepx/tensor.hpp @@ -54,7 +54,6 @@ namespace deepx Tensor(const Tensor &tensor) { shape = tensor.shape; - device = tensor.device; newer = tensor.newer; deleter = tensor.deleter; copyer = tensor.copyer; @@ -73,7 +72,6 @@ namespace deepx Tensor(Tensor &&other) noexcept { shape = std::move(other.shape); - device = other.device; deleter = other.deleter; copyer = other.copyer; @@ -101,7 +99,6 @@ namespace deepx return *this; shape = tensor.shape; - device = tensor.device; deleter = tensor.deleter; copyer = tensor.copyer; newer = tensor.newer; @@ -126,7 +123,6 @@ namespace deepx if (this == &tensor) return *this; shape = tensor.shape; - device = tensor.device; newer = tensor.newer; deleter = tensor.deleter; copyer = tensor.copyer; diff --git a/excuter/cpp-common/src/deepx/tensorbase.hpp b/excuter/cpp-common/src/deepx/tensorbase.hpp index d1b176f7..6e8806e0 100644 --- a/excuter/cpp-common/src/deepx/tensorbase.hpp +++ b/excuter/cpp-common/src/deepx/tensorbase.hpp @@ -5,29 +5,21 @@ namespace deepx { - enum DeviceType - { - CPU = 0, - CUDA = 1, - }; - + struct TensorBase { Shape shape; - DeviceType device; TensorBase() = default; // 拷贝构造函数 TensorBase(const TensorBase &other) { shape = other.shape; - device = other.device; } // 移动构造函数 TensorBase(TensorBase &&other) noexcept { shape = std::move(other.shape); - device = other.device; } // 拷贝赋值运算符 @@ 
-36,7 +28,6 @@ namespace deepx if (this != &other) { shape = other.shape; - device = other.device; } return *this; } @@ -47,7 +38,6 @@ namespace deepx if (this != &other) { shape = std::move(other.shape); - device = other.device; } return *this; } diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 42040543..5e359dbc 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -8,66 +8,47 @@ namespace deepx::tensorfunc { using namespace std; + + // reshape template struct reshapeDispatcher { - static void reshape(const Tensor &tensor, const std::vector &new_shape,Tensor &output) = delete; + static void reshape(const Tensor &tensor, const std::vector &new_shape, Tensor &output) = delete; }; - // A.reshape(new_shape) template - void reshape(const Tensor &tensor, const std::vector &new_shape,Tensor &output) + void reshape(const Tensor &tensor, const std::vector &new_shape, Tensor &output) { - reshapeDispatcher::reshape(tensor, new_shape,output); + reshapeDispatcher::reshape(tensor, new_shape, output); } + // transpose template struct transposeDispatcher { static void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) = delete; }; - // transpose(A,dim_order)=>B template void transpose(const Tensor &tensor, const std::vector &dim_order, Tensor &output) { transposeDispatcher::transpose(tensor, dim_order, output); } + // concat template struct concatDispatcher { - static void concat(const vector*> tensors, const int axis, Tensor &C) = delete; + static void concat(const vector *> tensors, const int axis, Tensor &C) = delete; }; - // concat(tensors,axis)=>C - template - void concat(const vector*> tensors, const int axis, Tensor &C) - { - concatDispatcher::concat(tensors, axis, C); - } - // https://onnx.ai/onnx/operators/onnx__Split.html template - struct splitDispatcher + void concat(const vector *> tensors, const int axis, Tensor &C) { - static void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) = delete; - static void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) = delete; - }; - // split(tensor,axis,splits)=>tensors - template - void split(const Tensor &A, const int axis,const std::vector &splits, Tensor *&B) - { - splitDispatcher::split(A, axis, splits, B); - - } - - // split(tensor,axis,num_outputs)=>tensors - template - void split(const Tensor &A, const int axis,const int num_outputs, Tensor *&B) - { - splitDispatcher::split(A, axis, num_outputs, B); + concatDispatcher::concat(tensors, axis, C); } + // broadcastTo template struct broadcastToDispatcher { @@ -80,67 +61,99 @@ namespace deepx::tensorfunc broadcastToDispatcher::broadcastTo(A, new_shape, B); } - - - template - struct expandDispatcher + // gather + template + struct gatherDispatcher { - static void expand(const Tensor &A, const Shape &new_shape, Tensor &B) = delete; + static void gather(const Tensor &input, const Tensor &indices, const int axis, Tensor &output) = delete; }; - template - void expand(const Tensor &A, const Shape &new_shape, Tensor &B) + template + void gather(const Tensor &input, const Tensor &indices, const int axis, Tensor &output) { - expandDispatcher::expand(A, new_shape, B); + gatherDispatcher::gather(input, indices, axis, output); } - - template - struct squeezeDispatcher - { - static void squeeze(Tensor &tensor) = delete; - }; - template - void squeeze(Tensor &tensor) - { - 
squeezeDispatcher::squeeze(tensor); - } - - template - struct unsqueezeDispatcher - { - static void unsqueeze(Tensor &tensor, const int axis) = delete; - }; - - template - void unsqueeze(Tensor &tensor, const int axis) - { - unsqueezeDispatcher::unsqueeze(tensor, axis); - } - - template - struct flattenDispatcher - { - static void flatten(Tensor &tensor) = delete; - }; - - template - void flatten(Tensor &tensor) - { - flattenDispatcher::flatten(tensor); - } - - template - struct paddingDispatcher - { - static void padding(Tensor &tensor, const Shape &new_shape) = delete; - }; - - template - void padding(Tensor &tensor, const Shape &new_shape) - { - paddingDispatcher::padding(tensor, new_shape); - } + // // split + // // https://onnx.ai/onnx/operators/onnx__Split.html + // template + // struct splitDispatcher + // { + // static void split(const Tensor &A, const int axis, const std::vector &splits, Tensor *&B) = delete; + // static void split(const Tensor &A, const int axis, const int num_outputs, Tensor *&B) = delete; + // }; + // template + // void split(const Tensor &A, const int axis, const std::vector &splits, Tensor *&B) + // { + // splitDispatcher::split(A, axis, splits, B); + // } + + // // split(tensor,axis,num_outputs)=>tensors + // template + // void split(const Tensor &A, const int axis, const int num_outputs, Tensor *&B) + // { + // splitDispatcher::split(A, axis, num_outputs, B); + // } + + // template + // struct expandDispatcher + // { + // static void expand(const Tensor &A, const Shape &new_shape, Tensor &B) = delete; + // }; + + // template + // void expand(const Tensor &A, const Shape &new_shape, Tensor &B) + // { + // expandDispatcher::expand(A, new_shape, B); + // } + + // template + // struct squeezeDispatcher + // { + // static void squeeze(Tensor &tensor) = delete; + // }; + + // template + // void squeeze(Tensor &tensor) + // { + // squeezeDispatcher::squeeze(tensor); + // } + + // template + // struct unsqueezeDispatcher + // { + // static void unsqueeze(Tensor &tensor, const int axis) = delete; + // }; + + // template + // void unsqueeze(Tensor &tensor, const int axis) + // { + // unsqueezeDispatcher::unsqueeze(tensor, axis); + // } + + // template + // struct flattenDispatcher + // { + // static void flatten(Tensor &tensor) = delete; + // }; + + // template + // void flatten(Tensor &tensor) + // { + // flattenDispatcher::flatten(tensor); + // } + + // template + // struct paddingDispatcher + // { + // static void padding(Tensor &tensor, const Shape &new_shape) = delete; + // }; + + // template + // void padding(Tensor &tensor, const Shape &new_shape) + // { + // paddingDispatcher::padding(tensor, new_shape); + // } } #endif diff --git a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp index d9fdb47f..86a1c396 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/io.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/io.hpp @@ -2,6 +2,7 @@ #define DEEPX_TENSORFUNC_IO_HPP #include "deepx/tensor.hpp" +#include "stdutil/fs.hpp" namespace deepx::tensorfunc{ @@ -15,25 +16,24 @@ namespace deepx::tensorfunc{ printDispatcher::print(t, f); } - template - struct saveDispatcher{ - static void save(Tensor &tensor,const std::string &path,int filebegin=0)=delete; - }; - - template - void save(Tensor &tensor,const std::string &path,int filebegin=0){ - saveDispatcher::save(tensor, path, filebegin); - } - - template - struct loadDispatcher{ - static Tensor load(const std::string &path,int filebegin=0)=delete; - }; - - template - Tensor 
load(const std::string &path,int filebegin=0){ - return loadDispatcher::load(path, filebegin); + template + void save(Tensor &tensor,const std::string &path); + + template + pair>> load(const std::string &path); + + inline pair loadShape(const std::string &path) + { + std::string shapepath = path + ".shape"; + std::ifstream shape_fs(shapepath, std::ios::binary); + std::string shapedata((std::istreambuf_iterator(shape_fs)), std::istreambuf_iterator()); + Shape shape; + shape.fromYaml(shapedata); + std::string filename = stdutil::filename(path); + std::string tensor_name = filename.substr(0, filename.find_last_of('.')); + return std::make_pair(tensor_name, shape); } + } #endif // DEEPX_TENSORFUNC_IO_HPP diff --git a/excuter/cpp-common/src/stdutil/fs.cpp b/excuter/cpp-common/src/stdutil/fs.cpp new file mode 100644 index 00000000..63d297e0 --- /dev/null +++ b/excuter/cpp-common/src/stdutil/fs.cpp @@ -0,0 +1,7 @@ +#include "fs.hpp" + +namespace stdutil{ + string filename(const string &path){ + return path.substr(path.find_last_of('/') + 1); + } +} \ No newline at end of file diff --git a/excuter/cpp-common/src/stdutil/fs.hpp b/excuter/cpp-common/src/stdutil/fs.hpp new file mode 100644 index 00000000..26826d9b --- /dev/null +++ b/excuter/cpp-common/src/stdutil/fs.hpp @@ -0,0 +1,11 @@ +#ifndef DEEPX_STDUTIL_FS_HPP +#define DEEPX_STDUTIL_FS_HPP + +#include + +namespace stdutil{ + using namespace std; + string filename(const string &path); +} + +#endif // DEEPX_STDUTIL_FS_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/CMakeLists.txt b/excuter/op-mem-cuda/CMakeLists.txt index c6d695d5..98471042 100644 --- a/excuter/op-mem-cuda/CMakeLists.txt +++ b/excuter/op-mem-cuda/CMakeLists.txt @@ -41,6 +41,9 @@ set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_ARCHITECTURES 75) # 根据您的 GPU 计算能力进行调整 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) #确保 CMake 能够正确识别 CUDA 文件并将其编译为目标 +# 设置 CUDA 编译选项 +# 是否开启PTX 汇编展示 +# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v") find_package(yaml-cpp REQUIRED) diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index f92bcfc4..44fbcfc6 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -103,20 +103,34 @@ namespace deepx::tf vector())); } // io - void register_util(TfFactory &opfactory) + void register_io(TfFactory &opfactory) { opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), + Param("t", DataCategory::Tensor, Precision::Any), }), vector())); opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), - Param("", DataCategory::Var, Precision::String), + Param("t", DataCategory::Tensor, Precision::Any), + Param("format", DataCategory::Var, Precision::String), }), vector())); + + opfactory.add_tf(std::make_shared(vector( + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("path", DataCategory::Var, Precision::String), + }), + vector())); + + opfactory.add_tf(std::make_shared(vector( + { + Param("path", DataCategory::Var, Precision::String), + }), + vector())); + } // elementwise @@ -422,7 +436,7 @@ namespace deepx::tf Param("C", DataCategory::Tensor, Precision::Any), }))); } - // // changeshape + // changeshape void register_changeshape(TfFactory &tffactory) { // reshape @@ -465,6 +479,17 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // gather + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + 
Param("indices", DataCategory::Tensor, Precision::Int64|Precision::Int32), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); } // reduce void register_reduce(TfFactory &tffactory) @@ -520,7 +545,7 @@ namespace deepx::tf { register_lifecycle(tffactory); register_init(tffactory); - register_util(tffactory); + register_io(tffactory); register_elementwise(tffactory); register_matmul(tffactory); register_changeshape(tffactory); diff --git a/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp b/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp index 38ec1dd0..556a4aac 100644 --- a/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/mem/mem_cuda.hpp @@ -52,7 +52,7 @@ namespace deepx::mem auto ptr = mem.at(name); auto result = make_shared>(); result->shape = ptr->shape; - result->device = ptr->device; + result->deleter = nullptr; result->copyer = nullptr; result->newer = nullptr; diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index e5eb511d..130fc80d 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -12,7 +12,6 @@ namespace deepx::tensorfunc { // transpose - // DIM=2^n template __global__ void transpose_kernel(const T *inputData, const int *inputStrides, @@ -41,8 +40,6 @@ namespace deepx::tensorfunc } } - - template void launch_transpose(const T *input, const int *inputStrides, @@ -56,10 +53,8 @@ namespace deepx::tensorfunc cudaVector newStrides_d(outputStrides, dim); cudaVector dimOrder_d(dimOrder, dim); - int powDim = nextPowerOf2(dim); auto [numBlocks, blockSize] = BestDims(len); - // 根据计算出的2的幂次选择对应的模板实例 - switch (powDim) + switch (dim) { case 1: transpose_kernel<1, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); @@ -67,26 +62,44 @@ namespace deepx::tensorfunc case 2: transpose_kernel<2, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; + case 3: + transpose_kernel<3, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; case 4: transpose_kernel<4, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; + case 5: + transpose_kernel<5, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 6: + transpose_kernel<6, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; + case 7: + transpose_kernel<7, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + break; case 8: transpose_kernel<8, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 16: - transpose_kernel<16, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 9: + transpose_kernel<9, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 32: - transpose_kernel<32, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 10: + transpose_kernel<10, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 64: - transpose_kernel<64, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 11: + transpose_kernel<11, T><<>>(input, 
strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; - case 128: - transpose_kernel<128, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); + case 12: + transpose_kernel<12, T><<>>(input, strides_d.data, output, newStrides_d.data, dim, len, dimOrder_d.data); break; + default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("cuda error"); } } @@ -169,11 +182,7 @@ namespace deepx::tensorfunc // shapeAtAxis cudaVector shapeAtAxis_d(shapeAtAxis, numTensors, cudaMemcpyHostToDevice); - - int powDim = nextPowerOf2(dim); - - // 根据计算出的2的幂次选择对应的模板实例 - switch (powDim) + switch (dim) { case 1: concat_kernel<1, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); @@ -181,26 +190,44 @@ namespace deepx::tensorfunc case 2: concat_kernel<2, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; + case 3: + concat_kernel<3, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; case 4: concat_kernel<4, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; + case 5: + concat_kernel<5, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 6: + concat_kernel<6, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; + case 7: + concat_kernel<7, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + break; case 8: concat_kernel<8, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 16: - concat_kernel<16, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 9: + concat_kernel<9, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 32: - concat_kernel<32, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 10: + concat_kernel<10, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 64: - concat_kernel<64, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 11: + concat_kernel<11, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; - case 128: - concat_kernel<128, T><<>>(tensorsDataList.data, inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); + case 12: + concat_kernel<12, T><<>>(tensorsDataList.data, 
inputStrides_d.data, outputData, outputStrides_d.data, dim, outputLen, axis, numTensors, shapeAtAxis_d.data); break; + default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("cuda error"); } } template void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); @@ -253,7 +280,8 @@ namespace deepx::tensorfunc template void launch_broadcastTo(const T *input, const int *inputStrides, const int intputDim, const BroadcastMap *broadcastMap, - T *output, const int *outputStrides, const int outputDim, const int outputlen){ + T *output, const int *outputStrides, const int outputDim, const int outputlen) + { auto [numBlocks, blockSize] = BestDims(outputlen); @@ -266,37 +294,51 @@ namespace deepx::tensorfunc // input cudaVector inputStrides_d(inputStrides, intputDim, cudaMemcpyHostToDevice); - - int powDim = nextPowerOf2(outputDim); - // 根据计算出的2的幂次选择对应的模板实例 - switch (powDim) + switch (outputDim) { case 1: - broadcastTo_kernel<1, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<1, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; case 2: - broadcastTo_kernel<2, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<2, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 3: + broadcastTo_kernel<3, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; case 4: - broadcastTo_kernel<4, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<4, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 5: + broadcastTo_kernel<5, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 6: + broadcastTo_kernel<6, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + break; + case 7: + broadcastTo_kernel<7, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; case 8: - broadcastTo_kernel<8, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + broadcastTo_kernel<8, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 16: - broadcastTo_kernel<16, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + case 9: + broadcastTo_kernel<9, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 32: - broadcastTo_kernel<32, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, 
outputStrides_d.data, outputDim, outputlen); + case 10: + broadcastTo_kernel<10, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 64: - broadcastTo_kernel<64, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + case 11: + broadcastTo_kernel<11, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; - case 128: - broadcastTo_kernel<128, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); + case 12: + broadcastTo_kernel<12, T><<>>(input, inputStrides_d.data, intputDim, broadcastMap_d.data, output, outputStrides_d.data, outputDim, outputlen); break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("cuda error"); } } template void launch_broadcastTo(const double *input, const int *inputStrides, const int inputDim, @@ -310,7 +352,7 @@ namespace deepx::tensorfunc nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_broadcastTo<__half>(const __half *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, - __half *output, const int *outputStrides, const int outputDim, const int outputlen); + __half *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_broadcastTo(const int64_t *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); @@ -323,5 +365,183 @@ namespace deepx::tensorfunc template void launch_broadcastTo(const int8_t *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); + + // gather + + template + __host__ __device__ void fromGatherIndices( + const int *output_indices, // 输出张量的索引 + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, // indices是tensor + const int gatherAxis, // gather操作的轴 + int *input_indices, const int inputDim) + { + + for (int i = 0; i < inputDim; ++i) + { + input_indices[i] = output_indices[i]; + } + + // 使用indices张量中对应位置的值来替换gatherAxis维度的索引 + int indices_idx = linearAt(indicesStrides, indicesDim, output_indices); + input_indices[gatherAxis] = indices[indices_idx]; + } + + template + __global__ void gather_kernel( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < outputlen; thread_id += grid_stride) + { + // 输出索引 + int output_indices[DIM]; + linearTo(indicesStrides, indicesDim, output_indices, thread_id); + + // 输入索引 + int input_indices[DIM]; + fromGatherIndices(output_indices, + indices, indicesStrides, indicesDim, + gatherAxis, + input_indices, inputDim); + int inputIdx = linearAt(inputStrides, inputDim, input_indices); + int outputIdx = linearAt(indicesStrides, indicesDim, output_indices); + output[outputIdx] = 
input[inputIdx]; + } + } + + template <typename T, typename GatherAxisT> + void launch_gather( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen) + { + + auto [numBlocks, blockSize] = BestDims(outputlen); + + // indices + cudaVector<int> indicesStrides_d(indicesStrides, indicesDim, cudaMemcpyHostToDevice); + + // input + cudaVector<int> inputStrides_d(inputStrides, inputDim, cudaMemcpyHostToDevice); + int dim = std::max(inputDim, indicesDim); + switch (dim) + { + case 1: + gather_kernel<1, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 2: + gather_kernel<2, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 3: + gather_kernel<3, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 4: + gather_kernel<4, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 5: + gather_kernel<5, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 6: + gather_kernel<6, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 7: + gather_kernel<7, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 8: + gather_kernel<8, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 9: + gather_kernel<9, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 10: + gather_kernel<10, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 11: + gather_kernel<11, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + case 12: + gather_kernel<12, T, GatherAxisT><<<numBlocks, blockSize>>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputlen); + break; + default: + throw std::runtime_error("dimension larger than " + std::to_string(MAX_DIM)); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error(std::string("cuda error: ") + cudaGetErrorString(err)); + } + } + template void launch_gather<double, int64_t>(const double *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + double *output, const int outputlen); + template void launch_gather<float, int64_t>(const float *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + float *output, const int outputlen); + template void launch_gather<nv_bfloat16, int64_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + nv_bfloat16 *output, const int
outputlen); + template void launch_gather<__half, int64_t>(const __half *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + __half *output, const int outputlen); + template void launch_gather<int64_t, int64_t>(const int64_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int64_t *output, const int outputlen); + template void launch_gather<int32_t, int64_t>(const int32_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int32_t *output, const int outputlen); + template void launch_gather<int16_t, int64_t>(const int16_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int16_t *output, const int outputlen); + template void launch_gather<int8_t, int64_t>(const int8_t *input, const int *inputStrides, const int inputDim, + const int64_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int8_t *output, const int outputlen); + + template void launch_gather<double, int32_t>(const double *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + double *output, const int outputlen); + template void launch_gather<float, int32_t>(const float *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + float *output, const int outputlen); + template void launch_gather<nv_bfloat16, int32_t>(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + nv_bfloat16 *output, const int outputlen); + template void launch_gather<__half, int32_t>(const __half *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + __half *output, const int outputlen); + template void launch_gather<int64_t, int32_t>(const int64_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int64_t *output, const int outputlen); + template void launch_gather<int32_t, int32_t>(const int32_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int32_t *output, const int outputlen); + template void launch_gather<int16_t, int32_t>(const int16_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int16_t *output, const int outputlen); + template void launch_gather<int8_t, int32_t>(const int8_t *input, const int *inputStrides, const int inputDim, + const int32_t *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + int8_t *output, const int outputlen); } + + #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file
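Reviewer note: every launch_* wrapper in this file now follows the same dispatch idiom — a runtime switch on the tensor rank selects a kernel instantiated with a compile-time DIM, so per-thread coordinate arrays like int idx[DIM] have a fixed size, and ranks above MAX_DIM (12) are rejected up front. A minimal, self-contained sketch of the idiom; the kernel and wrapper names here are illustrative, not part of this patch:

    #include <stdexcept>

    template <int DIM>
    __global__ void demo_kernel(const int *strides, int len)
    {
        int coords[DIM]; // fixed size: can live in registers/local memory
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < len)
        {
            int r = tid;
            for (int i = 0; i < DIM; ++i) // linear index -> coordinates
            {
                coords[i] = r / strides[i];
                r %= strides[i];
            }
        }
    }

    void launch_demo(const int *strides, int dim, int len, int numBlocks, int blockSize)
    {
        switch (dim) // one case per supported rank, 1..MAX_DIM
        {
        case 1: demo_kernel<1><<<numBlocks, blockSize>>>(strides, len); break;
        case 2: demo_kernel<2><<<numBlocks, blockSize>>>(strides, len); break;
        // ... cases 3..12 continue the same way ...
        default: throw std::runtime_error("dimension larger than 12");
        }
    }

Compared with the old nextPowerOf2 dispatch, the switch instantiates one kernel per actual rank, so a rank-5 tensor no longer pays for a DIM=8 instantiation.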
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh index 7b0f5d31..2047a636 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ -13,9 +13,8 @@ namespace deepx::tensorfunc __global__ void transpose_kernel(const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template <typename T> - void launch_transpose( const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + void launch_transpose(const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <typename T> __global__ void concat_kernel(const T **tensorsData, const int *inputStrides, @@ -30,21 +29,40 @@ namespace deepx::tensorfunc template <typename T> void launch_concat(const T **tensorsData, const int *inputStrides, T *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - - - __host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices); - // broadcastTo + __host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices); + template <int DIM, typename T> __global__ void broadcastTo_kernel( - const T *input, const int *inputStrides,const int inputDim, + const T *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, - T *output, const int *outputStrides,const int outputDim,const int outputlen); + T *output, const int *outputStrides, const int outputDim, const int outputlen); template <typename T> - void launch_broadcastTo(const T *input, const int *inputStrides,const int intputDim, + void launch_broadcastTo(const T *input, const int *inputStrides, const int intputDim, const BroadcastMap *broadcastMap, - T *output, const int *outputStrides,const int outputDim,const int outputlen); - + T *output, const int *outputStrides, const int outputDim, const int outputlen); + + // gather + template <typename GatherAxisT> + __host__ __device__ void fromGatherIndices( + const int *output_indices, // coordinates in the output tensor + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, // indices is itself a tensor + const int gatherAxis, // axis the gather runs along + int *input_indices, const int inputDim); // computed input-tensor coordinates (out parameter) + + template <int DIM, typename T, typename GatherAxisT> + __global__ void gather_kernel( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen); // output has the same shape as indices, so they share strides/dim/len + + template <typename T, typename GatherAxisT> + void launch_gather( + const T *input, const int *inputStrides, const int inputDim, + const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const int gatherAxis, + T *output, const int outputlen); // output has the same shape as indices, so they share strides/dim/len }; #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH \ No newline at end of file
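Reviewer note: the gather contract declared above is the usual one — the output tensor has the indices tensor's shape, and each output element copies the input element whose coordinates match the output's, except along gatherAxis, where the coordinate is taken from the indices tensor. A CPU reference for the axis-1, 2-D case, illustrative only and not part of this patch:

    #include <stdexcept>
    #include <vector>

    // out[i][j] = in[i][idx[i][j]], with in of shape {rows, cols}
    // and idx/out of shape {rows, k}, all row-major.
    std::vector<float> gather_axis1(const std::vector<float> &in, int rows, int cols,
                                    const std::vector<int> &idx, int k)
    {
        std::vector<float> out(static_cast<size_t>(rows) * k);
        for (int i = 0; i < rows; ++i)
            for (int j = 0; j < k; ++j)
            {
                int c = idx[i * k + j];
                if (c < 0 || c >= cols)
                    throw std::out_of_range("gather index out of range");
                out[i * k + j] = in[i * cols + c];
            }
        return out;
    }

Note the bounds check: the CUDA kernels above trust the indices tensor, so an out-of-range index reads out of bounds on the device.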
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp index d127e6a1..8fb43a76 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -13,6 +13,7 @@ namespace deepx::tensorfunc { + //reshape template <typename T> struct reshapeDispatcher<miaobyte, T> { @@ -43,6 +44,7 @@ namespace deepx::tensorfunc } }; + //transpose template <typename T> struct transposeDispatcher<miaobyte, T> { @@ -59,6 +61,7 @@ namespace deepx::tensorfunc } }; + //concat template <typename T> struct concatDispatcher<miaobyte, T> { @@ -96,7 +99,7 @@ namespace deepx::tensorfunc }; }; - + //broadcastTo template <typename T> struct broadcastToDispatcher<miaobyte, T> { @@ -113,5 +116,23 @@ namespace deepx::tensorfunc B.data, B.shape.strides.data(), B.shape.dim, B.shape.size); } }; + + //gather + template <typename T, typename GatherAxisT> + struct gatherDispatcher<miaobyte, T, GatherAxisT> + { + static void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output){ + vector<int> input_gatherShape = indices.shape.shape; + if (input_gatherShape.empty()||input_gatherShape!=output.shape.shape) + { + throw TensorShapeError("Gather shape mismatch"); + } + int gatherAxis = axis < 0 ? input.shape.dim + axis : axis; + if (gatherAxis < 0 || gatherAxis >= input.shape.dim) + { + throw std::invalid_argument("Axis is out of bounds"); + } + launch_gather(input.data, input.shape.strides.data(), input.shape.dim, + indices.data, indices.shape.strides.data(), indices.shape.dim, + gatherAxis, + output.data, output.shape.size); // output has indices' shape, so strides etc. are shared + } + }; } #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index 7dc0320e..603415b8 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -28,8 +28,9 @@ namespace deepx::tensorfunc private: cublasHandle_t handle_; }; - //TODO - inline int deviceblocksize(){ + // TODO + inline int deviceblocksize() + { int device_id; cudaGetDevice(&device_id); cudaDeviceProp props; @@ -59,6 +60,8 @@ namespace deepx::tensorfunc blocks = std::min(blocks, optimal_blocks); return {blocks, blocksize}; }; + + } #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu index 0e98773c..82497ee5 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu @@ -87,7 +87,6 @@ namespace deepx::tensorfunc // compute in float first, then cast to the target type float result = low + (high - low) * rand; - printf("threadIdx: %d, idx: %d, result: %f\n", threadIdx.x, idx, result); data[idx] = static_cast<T>(result); } } @@ -124,13 +123,10 @@ namespace deepx::tensorfunc for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) { // draw a random number - float rand = curand_uniform(&state); - + float rand = curand_normal(&state); // compute in float first, then cast to the target type - float result = rand; - // float result = mean + stddev * rand; - printf("threadIdx: %d, idx: %d, result: %f\n", threadIdx.x, idx, result); - data[idx] = static_cast<T>(rand); + float result = mean + stddev * rand; + data[idx] = static_cast<T>(result); } } template
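Reviewer note: the init_miaobyte.cu hunk above is a behavior fix, not a cleanup — the old normal_kernel sampled curand_uniform into data and ignored mean/stddev (and printf-ed from every thread). The intended transform, for reference:

    // curand_normal(&state) draws z ~ N(0, 1); affine scaling gives N(mean, stddev^2):
    float z = curand_normal(&state);
    float result = mean + stddev * z; // E[result] = mean, Var[result] = stddev^2

With the fix, normal(t, mean, std, seed) finally honors its mean and std arguments.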
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp index e4aa4080..02fee22f 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp @@ -3,6 +3,10 @@ #include #include + +#include <cuda_fp16.h> +#include <cuda_bf16.h> + #include #include #include @@ -27,6 +31,27 @@ namespace deepx::tensorfunc throw std::runtime_error("Failed to allocate host memory"); } + stdutil::print(t.shape.shape, host_data, t.shape.dtype, f); + delete[] host_data; + }; + }; + + // specializations for Float16 and BFloat16 + template <> + struct printDispatcher<miaobyte, half> + { + static void print(const Tensor<half> &t, const std::string &f = "") + { + int bytes = precision_bits(t.shape.dtype) / 8; + size_t total_bytes = t.shape.size * bytes; + + // allocate host memory + unsigned char *host_data = new unsigned char[total_bytes]; + if (host_data == nullptr) + { + throw std::runtime_error("Failed to allocate host memory"); + } + // copy the data to the host cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) @@ -35,103 +60,158 @@ namespace deepx::tensorfunc throw std::runtime_error("Failed to copy data from device to host"); } - // half and bf16 must be converted to float - if (t.shape.dtype == Precision::Float16 || t.shape.dtype == Precision::BFloat16) + float *host_float = new float[t.shape.size]; + if (host_float == nullptr) { - float *host_float = new float[t.shape.size]; - if (host_float == nullptr) - { - delete[] host_data; - throw std::runtime_error("Failed to allocate host memory for float conversion"); - } - - // convert on the CPU - if (t.shape.dtype == Precision::Float16) - { - for (size_t i = 0; i < t.shape.size; i++) - { - host_float[i] = __half2float(((half *)host_data)[i]); - } - } - else - { // BFloat16 - for (size_t i = 0; i < t.shape.size; i++) - { - host_float[i] = __bfloat162float(((nv_bfloat16 *)host_data)[i]); - } - } - - // print the converted float data - stdutil::print(t.shape.shape, host_float, Precision::Float32, f.empty() ? "%.4f" : f); - delete[] host_float; + delete[] host_data; + throw std::runtime_error("Failed to allocate host memory for float conversion"); } - else + + for (size_t i = 0; i < t.shape.size; i++) { - // other dtypes are printed directly - stdutil::print(t.shape.shape, host_data, t.shape.dtype, f); + host_float[i] = __half2float(((half *)host_data)[i]); } delete[] host_data; + // print the converted float data + stdutil::print(t.shape.shape, host_float, Precision::Float32, f); + delete[] host_float; } }; - template <typename T> - struct saveDispatcher + template <> + struct printDispatcher<miaobyte, nv_bfloat16> - { - static void save(Tensor<T> &tensor, const std::string &path, int filebegin = 0) + { + static void print(const Tensor<nv_bfloat16> &t, const std::string &f = "") { - // save the shape - std::string shapepath = path + ".shape"; - std::string shapedata = tensor.shape.toYaml(); - std::ofstream shape_fs(shapepath, std::ios::binary); - shape_fs.write(shapedata.c_str(), shapedata.size()); - shape_fs.close(); - - // save the data - std::string datapath = path + ".data"; - std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); - - if (!data_fs.is_open()) + int bytes = precision_bits(t.shape.dtype) / 8; + size_t total_bytes = t.shape.size * bytes; + + // allocate host memory + unsigned char *host_data = new unsigned char[total_bytes]; + if (host_data == nullptr) + { + throw std::runtime_error("Failed to allocate host memory"); + } + + // copy the data to the host + cudaError_t err = cudaMemcpy(host_data, t.data, total_bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + delete[] host_data; + throw std::runtime_error("Failed to copy data from device to host"); + } + + float *host_float = new float[t.shape.size]; + if (host_float == nullptr) { - // create the file if it does not exist - data_fs.open(datapath, std::ios::binary | std::ios::out); + delete[] host_data; + throw std::runtime_error("Failed to allocate host memory for float conversion"); } - data_fs.seekp(filebegin); - data_fs.write(reinterpret_cast<const char *>(tensor.data), tensor.shape.size * sizeof(T)); - data_fs.close(); + + for (size_t i = 0; i < t.shape.size; i++) + { + host_float[i] = __bfloat162float(((nv_bfloat16 *)host_data)[i]); + } + delete[] host_data; + // print the converted float data + stdutil::print(t.shape.shape, host_float, Precision::Float32, f); + delete[] host_float; } }; + template <typename T> - struct loadDispatcher + void save(Tensor<T> &tensor, const std::string &path) { - static Tensor<T> load(const std::string &path, int filebegin = 0) + // save the shape + std::string shapepath = path + ".shape"; + std::string shapedata = tensor.shape.toYaml(); + std::ofstream
shape_fs(shapepath, std::ios::binary); + shape_fs.write(shapedata.c_str(), shapedata.size()); + shape_fs.close(); + + // save the data + int bytes = precision_bits(tensor.shape.dtype) / 8; + size_t total_bytes = tensor.shape.size * bytes; + + // allocate host memory + unsigned char *host_data = new unsigned char[total_bytes]; + if (host_data == nullptr) { - // load the shape - std::string shapepath = path + ".shape"; - std::ifstream shape_fs(shapepath, std::ios::binary); - std::string shapedata((std::istreambuf_iterator<char>(shape_fs)), std::istreambuf_iterator<char>()); + throw std::runtime_error("Failed to allocate host memory"); + } - Shape shape; - shape.fromYaml(shapedata); - shape_fs.close(); + // copy the data to the host + cudaError_t err = cudaMemcpy(host_data, tensor.data, total_bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + delete[] host_data; + throw std::runtime_error("Failed to copy data from device to host"); + } - // load the data - Tensor<T> tensor = New<T>(shape); - std::string datapath = path + ".data"; - std::ifstream data_fs(datapath, std::ios::binary); + std::string datapath = path + ".data"; + std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); - if (!data_fs.is_open()) - { - throw std::runtime_error("cannot open data file: " + datapath); - } + if (!data_fs.is_open()) + { + // create the file if it does not exist + data_fs.open(datapath, std::ios::binary | std::ios::out); + } + data_fs.seekp(0); + data_fs.write(reinterpret_cast<const char *>(host_data), total_bytes); + data_fs.close(); - // set the read position - data_fs.seekg(filebegin); - data_fs.read(reinterpret_cast<char *>(tensor.data), shape.size * sizeof(T)); - data_fs.close(); + delete[] host_data; + }; - return tensor; + template <typename T> + pair<string, shared_ptr<Tensor<T>>> load(const std::string &path) + { + // load the shape + pair<string, Shape> shape_name = loadShape(path); + Shape shape = shape_name.second; + std::string tensor_name = shape_name.first; + + // check that T matches shape.dtype + if (shape.dtype != precision<T>()) + { + throw std::runtime_error("load<" + precision_str(precision<T>()) + "> mismatch: the file's dtype is " + precision_str(shape.dtype)); } - }; + + // check that the file size equals shape.size elements + std::string datapath = path + ".data"; + std::ifstream data_fs(datapath, std::ios::binary); + data_fs.seekg(0, std::ios::end); + std::streamsize fileSize = data_fs.tellg(); + std::streamsize expectedSize = shape.size * precision_bits(shape.dtype) / 8; + + if (fileSize != expectedSize) + { + throw std::runtime_error("data file size mismatch: expected " + std::to_string(expectedSize) + + " bytes, but the file has " + std::to_string(fileSize) + " bytes"); + } + data_fs.seekg(0); + + // TODO stream straight from the file into CUDA memory (possibly device memory) + + shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape)); + unsigned char *host_data = new unsigned char[fileSize]; + if (host_data == nullptr) + { + throw std::runtime_error("Failed to allocate host memory"); + } + data_fs.read(reinterpret_cast<char *>(host_data), fileSize); + data_fs.close(); + + cudaError_t err = cudaMemcpy(tensor->data, host_data, fileSize, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + delete[] host_data; + throw std::runtime_error("Failed to copy data from host to device"); + } + delete[] host_data; + return std::make_pair(tensor_name, tensor); + } } #endif // DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP \ No newline at end of file
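Reviewer note: on both executors a tensor is now persisted as two sibling files — path + ".shape" (the YAML emitted by Shape::toYaml and read back by loadShape, which also carries the tensor's name and dtype) and path + ".data" (raw element bytes, element size = precision_bits(dtype)/8). A hedged usage sketch of the CUDA-side round trip; the tensor name and path are illustrative:

    // Assumes a float32 tensor named "w" is already registered in mem.
    auto w = mem->gettensor<float>("w");
    deepx::tensorfunc::save(*w, "/tmp/w");                 // writes /tmp/w.shape and /tmp/w.data
    auto named = deepx::tensorfunc::load<float>("/tmp/w"); // pair<name, shared_ptr<Tensor<float>>>
    mem->addtensor(named.first, named.second);             // re-register under the saved name

load<T> throws if T's dtype differs from the dtype recorded in the .shape file, or if the .data size does not match shape.size elements.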
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu index c9e185c8..c6047ac8 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu @@ -58,8 +58,8 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int powDim = nextPowerOf2(tensor_dim); - switch (powDim) + + switch (tensor_dim) { case 1: sum_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -67,26 +67,38 @@ namespace deepx::tensorfunc case 2: sum_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + sum_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: sum_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 5: + sum_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 6: + sum_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 7: + sum_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 8: sum_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - sum_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 32: - sum_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 9: + sum_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - sum_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 10: + sum_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 128: - sum_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 11: + sum_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 12: + sum_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } } @@ -155,8 +167,7 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int
powDim = nextPowerOf2(tensor_dim); - switch (powDim) + switch (tensor_dim) { case 1: prod_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -164,26 +175,26 @@ namespace deepx::tensorfunc case 2: prod_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + prod_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: prod_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 8: - prod_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 5: + prod_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - prod_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 6: + prod_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 32: - prod_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 7: + prod_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - prod_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 128: - prod_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 8: + prod_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 9: + prod_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 10: + prod_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 11: + prod_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 12: + prod_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } } @@ -252,8 +263,7 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int powDim = nextPowerOf2(tensor_dim); - switch (powDim) + switch (tensor_dim) { case 1: max_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -261,26 +271,38 @@ namespace deepx::tensorfunc case 2: max_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + max_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: max_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim,
tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 5: + max_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 6: + max_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 7: + max_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 8: max_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - max_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 32: - max_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 9: + max_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - max_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 10: + max_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 11: + max_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 128: - max_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 12: + max_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } }; @@ -349,8 +371,7 @@ namespace deepx::tensorfunc cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); - int powDim = nextPowerOf2(tensor_dim); - switch (powDim) + switch (tensor_dim) { case 1: min_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -358,26 +379,38 @@ namespace deepx::tensorfunc case 2: min_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 3: + min_kernel<3, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 4: min_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 5: + min_kernel<5, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim,
tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 6: + min_kernel<6, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 7: + min_kernel<7, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; case 8: min_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 16: - min_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; - case 32: - min_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 9: + min_kernel<9, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 64: - min_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 10: + min_kernel<10, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; - case 128: - min_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + case 11: + min_kernel<11, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; + case 12: + min_kernel<12, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; default: - throw std::runtime_error("dim too large, max support 128"); + throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh index b60ab3f8..ef3538a4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh @@ -16,7 +16,7 @@ namespace deepx::tensorfunc } } - inline __host__ __device__ int linearAt(const int *strides, const int dim, int *indices) + inline __host__ __device__ int linearAt(const int *strides, const int dim, const int *indices) { int idx = 0; for (int i = 0; i < dim; i++) @@ -34,22 +34,8 @@ namespace deepx::tensorfunc neworder[i] = order[dimOrder[i]]; } } - - inline int nextPowerOf2(int n) - { - if (n <= 0) - return 1; - if ((n & (n - 1)) == 0) - return n; // n is already a power of two - - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n + 1; - } + + // maximum supported tensor rank; the launch switches instantiate kernels for ranks 1..MAX_DIM + const int MAX_DIM = 12; } #endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH
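Reviewer note: linearTo/linearAt in tensor_cuda.cuh are the inverse pair all of the kernels above lean on; strides here are row-major element strides, so a linear index and a coordinate tuple round-trip exactly. A worked example (values illustrative):

    // shape {2, 3, 4} -> strides {12, 4, 1}
    int strides[3] = {12, 4, 1};
    // linearTo: 17 -> {1, 1, 1}, because 17 = 1*12 + 1*4 + 1*1
    int indices[3];
    int r = 17;
    for (int i = 0; i < 3; ++i) { indices[i] = r / strides[i]; r %= strides[i]; }
    // linearAt: {1, 1, 1} -> 17 (sum of indices[i] * strides[i])
    int idx = 0;
    for (int i = 0; i < 3; ++i) idx += indices[i] * strides[i];

Making linearAt take const int *indices lets callers pass the same buffers to both functions without casts.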
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp index acf28e9f..fdcb0f17 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensorlife_miaobyte.hpp @@ -38,7 +38,6 @@ namespace deepx::tensorfunc Shape shape(shapedata); shape.dtype = precision<T>(); Tensor<T> tensor(shape); - tensor.device = CUDA; // use the CUDA device tensor.deleter = dataFree; tensor.copyer = dataCopy; tensor.newer = dataNew; diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp index 99186de5..6320b7d4 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp @@ -11,6 +11,8 @@ namespace deepx::tf { using namespace deepx::tensorfunc; using namespace std; + + // reshape template <typename Author> class Reshape : public TF { @@ -72,6 +74,7 @@ } }; + // transpose template <typename Author> class Transpose : public TF { @@ -140,6 +143,7 @@ } }; + // concat template <typename Author> class Concat : public TF { @@ -163,13 +167,13 @@ } int run(shared_ptr mem, string &error) override { - if (!checktensors({ this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->returns[0].textvalue}, mem, error) != 0) { return 1; } vector<string> tensor_names = this->getvector<string>(0, true); - if (!checktensors(tensor_names, mem, error)!=0) + if (!checktensors(tensor_names, mem, error) != 0) { return 1; } @@ -274,6 +278,7 @@ }; }; + // broadcastTo template <typename Author> class BroadcastTo : public TF { @@ -338,5 +343,152 @@ return 0; } }; -} + + // gather + template <typename Author> + class Gather : public TF + { + public: + Gather(const vector<Param> &args, const vector<Param> &returns) + { + this->name = "gather"; + this->author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T2 = T1.gather(indices=[1,2], axis=1)"; + } + shared_ptr<TF> clone() const override + { + return make_shared<Gather<Author>>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + + int axis = this->getvar<int>(2, mem, true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "input_type " + precision_str(input_type) + " and output_type " + precision_str(output_type) + " must be the same"; + return 1; + } + Precision indices_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + if (indices_type != Precision::Int64 && indices_type != Precision::Int32) + { + error = "indices_type " + precision_str(indices_type) + " is not supported; use " + precision_str(Precision::Int64) + " or " + precision_str(Precision::Int32); + return 1; + } + + switch (input_type) + { + case Precision::Float64: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<double>(this->returns[0].textvalue)); + } + break; + } + case Precision::Float32: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<float>(this->returns[0].textvalue)); + } + break; + } + case Precision::Float16: + { + if (indices_type == Precision::Int64) + { +
gather<Author>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<half>(this->returns[0].textvalue)); + } + break; + } + case Precision::BFloat16: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int64: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int64_t>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int32: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int32_t>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int16: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int16_t>(this->returns[0].textvalue)); + } + break; + } + case Precision::Int8: + { + if (indices_type == Precision::Int64) + { + gather<Author>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue)); + } + else if (indices_type == Precision::Int32) + { + gather<Author>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), axis, *mem->gettensor<int8_t>(this->returns[0].textvalue)); + } + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + +}; #endif // DEEPX_TF_CHANGESHAPE_HPP
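Reviewer note: Gather::run above hand-writes the 8-dtype x 2-index-type product — sixteen nearly identical calls. One hedged way to halve it is to fold the index-type branch into a helper so each dtype case becomes a single line; the helper name and the exact mem type below are illustrative, not part of this patch:

    // Sketch only: dispatch the indices dtype once per input dtype T.
    template <typename Author, typename T, typename MemT>
    static void gather_any_indices(MemT &mem, const string &a, const string &idx,
                                   int axis, const string &out, Precision idxType)
    {
        if (idxType == Precision::Int64)
            gather<Author>(*mem.gettensor<T>(a), *mem.gettensor<int64_t>(idx), axis, *mem.gettensor<T>(out));
        else // Int32, already validated by the caller
            gather<Author>(*mem.gettensor<T>(a), *mem.gettensor<int32_t>(idx), axis, *mem.gettensor<T>(out));
    }

The same shape would also suit the Load switch below, which repeats the dtype-to-template dispatch a ninth time for bool.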
std::cerr << "save " << name << " not found" << std::endl; + error = "save " + name + " not found"; + return 1; + } + return 0; + } + }; + + //load + class Load : public TF + { + public: + Load(vector args, vector returns) + { + this->name = "load"; + this->tftype = "io"; + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "load(path)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + string path = this->args[0].textvalue; + + pair shape_name=tensorfunc::loadShape(path); + std::string tensor_name=shape_name.first; + Shape shape=shape_name.second; + + if(mem->existstensor(tensor_name)) + { + cout<<"warning: "<delete_tensor(tensor_name); + } + switch (shape.dtype) + { + case Precision::Float64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Float32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Float16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::BFloat16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int8:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Bool:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + default: + break; + } + return 0; + } + }; } #endif diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index defad8c6..6eed5e04 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -99,30 +99,44 @@ namespace deepx::tf vector())); // normal author=miaobyte tffactory.add_tf(std::make_shared>(vector( - { - Param("t", DataCategory::Tensor, Precision::Any), - Param("mean", DataCategory::Var, Precision::Any), - Param("std", DataCategory::Var, Precision::Any), - Param("seed", DataCategory::Var, Precision::Int32), - }), - vector())); + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("mean", DataCategory::Var, Precision::Any), + Param("std", DataCategory::Var, Precision::Any), + Param("seed", DataCategory::Var, Precision::Int32), + }), + vector())); } // io - void register_util(TfFactory &opfactory) + void register_io(TfFactory &opfactory) { // print author=miaobyte opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), + Param("t", DataCategory::Tensor, Precision::Any), }), vector())); // print author=miaobyte opfactory.add_tf(std::make_shared>(vector( { - Param("", DataCategory::Tensor, Precision::Any), - Param("", DataCategory::Var, Precision::String), + Param("t", DataCategory::Tensor, Precision::Any), + Param("format", DataCategory::Var, Precision::String), + }), + vector())); + //save + opfactory.add_tf(std::make_shared(vector( + { + Param("t", DataCategory::Tensor, Precision::Any), + Param("path", DataCategory::Var, Precision::String), }), vector())); + + //load + 
opfactory.add_tf(std::make_shared<Load>(vector<Param>( + { + Param("path", DataCategory::Var, Precision::String), + }), + vector<Param>())); } // elementwise @@ -474,6 +488,17 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // gather author=miaobyte + tffactory.add_tf(std::make_shared<Gather<miaobyte>>(vector<Param>( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("indices", DataCategory::Tensor, Precision::Int32 | Precision::Int64), + Param("axis", DataCategory::Var, Precision::Int32), + }), + vector<Param>( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); } // // reduce void register_reduce(TfFactory &tffactory) { @@ -528,7 +553,7 @@ namespace deepx::tf { register_lifecycle(tffactory); register_init(tffactory); - register_util(tffactory); + register_io(tffactory); register_elementwise(tffactory); register_matmul(tffactory); register_changeshape(tffactory); diff --git a/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp b/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp index b8f8d538..8e4710b0 100644 --- a/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/mem/mem_ompsimd.hpp @@ -50,7 +50,6 @@ namespace deepx::mem auto ptr = mem.at(name); auto result = make_shared<Tensor<T>>(); result->shape = ptr->shape; - result->device = ptr->device; result->deleter = nullptr; result->copyer = nullptr; result->newer = nullptr;
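Reviewer note: the ompsimd hunks below migrate rangeParallel callbacks from a single std::vector<int> newIndices scratch argument to a ThreadLocalVectors bundle; the trailing {n} argument declares the sizes of the per-thread scratch vectors, and the lambda fetches them with tlv.get(0). A stand-in sketch of the pattern, assuming OpenMP — the real ThreadLocalVectors and rangeParallel live elsewhere in the repo:

    #include <initializer_list>
    #include <vector>

    struct ThreadLocalVectorsSketch
    {
        std::vector<std::vector<int>> v;
        explicit ThreadLocalVectorsSketch(std::initializer_list<int> sizes)
        {
            for (int s : sizes) v.emplace_back(s); // one scratch vector per requested size
        }
        std::vector<int> &get(size_t i) { return v[i]; }
    };

    template <typename F>
    void rangeParallelSketch(int n, F &&body, std::initializer_list<int> scratch)
    {
        #pragma omp parallel
        {
            ThreadLocalVectorsSketch tlv(scratch); // one instance per thread, no sharing
            #pragma omp for
            for (int i = 0; i < n; ++i)
                body(i, tlv);
        }
    }

The payoff is allocation-free inner loops: each thread reuses its own scratch vector instead of allocating per element or racing on a shared one.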
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 1644c045..7e2985fb 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -10,12 +10,13 @@ #include "deepx/tensorfunc/authors.hpp" namespace deepx::tensorfunc { + // reshape template <typename T> struct reshapeDispatcher<miaobyte, T> { - static void reshape(const Tensor<T> &tensor, const std::vector<int> &shape,Tensor<T> &output) + static void reshape(const Tensor<T> &tensor, const std::vector<int> &shape, Tensor<T> &output) { // the output is passed as a single tensor reference - + int new_prod = 1; for (int dim : shape) { @@ -29,18 +30,18 @@ namespace deepx::tensorfunc Shape newshape(shape); if (tensor.data == output.data) { - output.shape.shape=newshape.shape; - output.shape.strides=newshape.strides; + output.shape.shape = newshape.shape; + output.shape.strides = newshape.strides; } else { - output.shape.shape=newshape.shape; - output.shape.strides=newshape.strides; + output.shape.shape = newshape.shape; + output.shape.strides = newshape.strides; output.copyer(tensor.data, output.data, tensor.shape.size); } } }; - + // transpose template <typename T> struct transposeDispatcher<miaobyte, T> { @@ -55,26 +56,26 @@ namespace deepx::tensorfunc { throw std::runtime_error("transpose error!shape"); } - output.shape.rangeParallel(dim_order.size(), [&tensor, &output, &dim_order](int idx_linear, const std::vector<int> &indices, std::vector<int> &newIndices) + output.shape.rangeParallel(dim_order.size(), [&tensor, &output, &dim_order](int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv) { for (size_t i = 0; i < dim_order.size(); ++i) { - newIndices[dim_order[i]] = indices[i]; + tlv.get(0)[dim_order[i]] = indices[i]; } - output.data[idx_linear]= tensor.data[tensor.shape.linearat(newIndices)]; }, tensor.shape.dim); + output.data[idx_linear]= tensor.data[tensor.shape.linearat(tlv.get(0))]; }, {tensor.shape.dim}); } }; - + // concat template <typename T> struct concatDispatcher<miaobyte, T> { static void concat(const vector<Tensor<T> *> tensors, const int axis, Tensor<T> &result) { - // check shapes if (!checkShapeConcat(tensors, axis, result)) { throw TensorShapeError("Output tensor shape size must match the sum of input tensor shape sizes for concat"); - } + } int dimC = axis + 1; result.shape.rangeParallel(dimC, [&](const int idx, const std::vector<int> &indices) { @@ -124,17 +125,52 @@ namespace deepx::tensorfunc static void broadcastTo(const Tensor<T> &A, const vector<int> &new_shape, Tensor<T> &B) { auto A_broadcastShape = broadcastShape(A.shape.shape, new_shape); - if (A_broadcastShape.empty()||A_broadcastShape!=new_shape) + if (A_broadcastShape.empty() || A_broadcastShape != new_shape) { throw TensorShapeError("Broadcast shape mismatch"); } auto bmap = broadcastMap(A.shape.shape, new_shape); B.shape.rangeParallel(B.shape.dim, [&](const int idx, const std::vector<int> &bindices) - { + { vector<int> aindices=fromBroadcastIndices(bmap, bindices); - B.data[idx] = A.data[A.shape.linearat(aindices)]; - }); + B.data[idx] = A.data[A.shape.linearat(aindices)]; }); + } + }; + + // gather + // supports high-dimensional indices + // the result is written into input_indices + template <typename GatherAxisT> + void fromGatherIndices(const vector<int> &output_indices, const Tensor<GatherAxisT> &indices, const int gatherAxis, vector<int> &input_indices) + { + std::copy(output_indices.begin(), output_indices.begin()+input_indices.size(), input_indices.begin()); + int indices_idx = indices.shape.linearat(output_indices); + input_indices[gatherAxis] = indices.data[indices_idx]; + } + + template <typename T, typename GatherAxisT> + struct gatherDispatcher<miaobyte, T, GatherAxisT> + { + static void gather(const Tensor<T> &input, const Tensor<GatherAxisT> &indices, const int axis, Tensor<T> &output) + { + int gatherAxis = axis < 0 ? input.shape.dim + axis : axis; + if (gatherAxis < 0 || gatherAxis >= input.shape.dim) + { + throw std::invalid_argument("Axis is out of bounds"); + } + + vector<int> input_gatherShape = indices.shape.shape; + if (input_gatherShape.empty() || input_gatherShape != output.shape.shape) + { + throw TensorShapeError("Gather shape mismatch"); + } + output.shape.rangeParallel(output.shape.dim, [&](const int idx, const std::vector<int> &output_indices, ThreadLocalVectors &tlv) + { + fromGatherIndices(output_indices, indices, gatherAxis, tlv.get(0)); + output.data[idx] = input.data[input.shape.linearat(tlv.get(0))]; + }, + {input.shape.dim}); } };
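Reviewer note: both gather dispatchers normalize a negative axis Python-style before validating it; worked through once (values illustrative):

    int dim = 3, axis = -1;
    int gatherAxis = axis < 0 ? dim + axis : axis; // -1 -> 2, i.e. the last axis

Both executors then reject an out-of-range result with std::invalid_argument, so a bad axis fails identically on CPU and CUDA.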
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp index f65166ca..f219ca59 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/io_miaobyte.hpp @@ -6,6 +6,7 @@ #include "deepx/tensor.hpp" #include "stdutil/vector.hpp" #include "stdutil/print.hpp" +#include "stdutil/fs.hpp" #include "deepx/tensorfunc/authors.hpp" #include "deepx/tensorfunc/io.hpp" #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" @@ -36,62 +37,69 @@ namespace deepx::tensorfunc }; template <typename T> - struct saveDispatcher + void save(Tensor<T> &tensor, const std::string &path) { - static void save(Tensor<T> &tensor, const std::string &path, int filebegin = 0) - { - // save the shape - std::string shapepath = path + ".shape"; - std::string shapedata = tensor.shape.toYaml(); - std::ofstream shape_fs(shapepath, std::ios::binary); - shape_fs.write(shapedata.c_str(), shapedata.size()); - shape_fs.close(); - // save the data - std::string datapath = path + ".data"; - std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); + // save the shape + std::string shapepath = path + ".shape"; + std::string shapedata = tensor.shape.toYaml(); + std::ofstream shape_fs(shapepath, std::ios::binary); + shape_fs.write(shapedata.c_str(), shapedata.size()); + shape_fs.close(); - if (!data_fs.is_open()) - { - // create the file if it does not exist - data_fs.open(datapath, std::ios::binary | std::ios::out); - } - data_fs.seekp(filebegin); - data_fs.write(reinterpret_cast<const char *>(tensor.data), tensor.shape.size * sizeof(T)); - data_fs.close(); - } - }; - template <typename T> - struct loadDispatcher - { - static Tensor<T> load(const std::string &path, int filebegin = 0) + // save the data + std::string datapath = path + ".data"; + std::ofstream data_fs(datapath, std::ios::binary | std::ios::in | std::ios::out); + + if (!data_fs.is_open()) { - // load the shape - std::string shapepath = path + ".shape"; - std::ifstream shape_fs(shapepath, std::ios::binary); - std::string shapedata((std::istreambuf_iterator<char>(shape_fs)), std::istreambuf_iterator<char>()); + // create the file if it does not exist + data_fs.open(datapath, std::ios::binary | std::ios::out); + } + int data_size = tensor.shape.size * precision_bits(tensor.shape.dtype) / 8; + data_fs.write(reinterpret_cast<const char *>(tensor.data), data_size); + data_fs.close(); + } - Shape shape; - shape.fromYaml(shapedata); - shape_fs.close(); + //load - // load the data - Tensor<T> tensor = New<T>(shape); - std::string datapath = path + ".data"; - std::ifstream data_fs(datapath, std::ios::binary); - if (!data_fs.is_open()) - { - throw std::runtime_error("cannot open data file: " + datapath); - } + template <typename T> + pair<string, shared_ptr<Tensor<T>>> load(const std::string &path) + { + // load the shape + pair<string, Shape> shape_name = loadShape(path); + Shape shape = shape_name.second; + std::string tensor_name = shape_name.first; + + + // check that T matches shape.dtype + if (shape.dtype != precision<T>()) + { + throw std::runtime_error("load<" + precision_str(precision<T>()) + "> mismatch: the file's dtype is " + precision_str(shape.dtype)); + } - // set the read position - data_fs.seekg(filebegin); - data_fs.read(reinterpret_cast<char *>(tensor.data), shape.size * sizeof(T)); - data_fs.close(); + // check that the file size equals shape.size elements + std::string datapath = path + ".data"; + std::ifstream data_fs(datapath, std::ios::binary); + data_fs.seekg(0, std::ios::end); + std::streamsize fileSize = data_fs.tellg(); + std::streamsize expectedSize = shape.size * precision_bits(shape.dtype) / 8; - return tensor; + if (fileSize != expectedSize) + { + throw std::runtime_error("data file size mismatch: expected " + std::to_string(expectedSize) + + " bytes, but the file has " + std::to_string(fileSize) + " bytes"); } + data_fs.seekg(0); + + // create the tensor + shared_ptr<Tensor<T>> tensor = make_shared<Tensor<T>>(New<T>(shape.shape)); + data_fs.read(reinterpret_cast<char *>(tensor->data), fileSize); + data_fs.close(); + return std::make_pair(tensor_name, tensor); }; + } #endif // DEEPX_TENSORFUNC_IO_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp index aaf3f86e..b6ebea5c 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp @@ -27,38 +27,38 @@ namespace deepx::tensorfunc const int minshape_1 = Lanes(ScalableTag<T>()); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, std::vector<int> &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, ThreadLocalVectors &tlv) { // compute the output index for (size_t i = 0, j = 0; i < tensor.shape.dim;
++i) { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); #pragma omp atomic - result.data[outputIdx] += tensor.data[idx_linear]; }, result.shape.dim); + result.data[outputIdx] += tensor.data[idx_linear]; }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0, j = 0; i < tensor.shape.dim; ++i) { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; const size_t lanes = Lanes(tag); @@ -86,7 +86,8 @@ namespace deepx::tensorfunc sum += tensor.data[idx_linear + j]; } #pragma omp atomic - result.data[outputIdx] += sum; }, result.shape.dim); + result.data[outputIdx] += sum; }, + {result.shape.dim}); } } }; @@ -104,26 +105,27 @@ namespace deepx::tensorfunc constant(result, T(1)); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { if (reduced_dims[i]==0) { - newIndices[j++]=indices[i]; + tlv.get(0)[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=0; + tlv.get(0)[j++]=0; } } // 累加求和 - int outputIdx=result.shape.linearat(newIndices); + int outputIdx=result.shape.linearat(tlv.get(0)); #pragma omp atomic - result.data[outputIdx]*=tensor.data[idx_linear]; }, result.shape.dim); + result.data[outputIdx]*=tensor.data[idx_linear]; + }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 @@ -131,13 +133,13 @@ namespace deepx::tensorfunc { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } // 累加求和 - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; @@ -170,7 +172,8 @@ namespace deepx::tensorfunc product *= tensor.data[i + j]; } #pragma omp atomic - result.data[outputIdx] *= product; }, result.shape.dim); + result.data[outputIdx] *= 
product; + }, {result.shape.dim}); } } }; @@ -187,25 +190,26 @@ namespace deepx::tensorfunc constant(result, std::numeric_limits::lowest()); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { if (reduced_dims[i]==0) { - newIndices[j++]=indices[i]; + tlv.get(0)[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=0; + tlv.get(0)[j++]=0; } } // 累加求和 - int outputIdx=result.shape.linearat(newIndices); - result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim); + int outputIdx=result.shape.linearat(tlv.get(0)); + result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); + }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 @@ -213,13 +217,13 @@ namespace deepx::tensorfunc { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] =0; + tlv.get(0)[j++] =0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; @@ -251,7 +255,8 @@ namespace deepx::tensorfunc maxt = std::max(maxt,tensor.data[i + j]); } - result.data[outputIdx] = std::max(result.data[outputIdx],maxt); }, result.shape.dim); + result.data[outputIdx] = std::max(result.data[outputIdx],maxt); + }, {result.shape.dim}); } } }; @@ -268,26 +273,27 @@ namespace deepx::tensorfunc constant(result, std::numeric_limits::max()); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { - tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) + tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 for (size_t i = 0,j=0; i < tensor.shape.dim ; ++i) { if (reduced_dims[i]==0) { - newIndices[j++]=indices[i]; + tlv.get(0)[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=0; + tlv.get(0)[j++]=0; } } // 累加求和 - int outputIdx=result.shape.linearat(newIndices); + int outputIdx=result.shape.linearat(tlv.get(0)); - result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim); + result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); + }, {result.shape.dim}); } else { // 如果数据连续(对齐),则可以simd - tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, std::vector &newIndices) + 
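These hunks also change the rangeParallel callback signature: the per-call `std::vector &newIndices` scratch argument becomes `ThreadLocalVectors &tlv`, and each call site now passes the scratch sizes as an initializer list (`{result.shape.dim}`). The class itself is not shown in this diff; the sketch below is only a guess at the idea it presumably implements, namely pre-sized scratch buffers that each OpenMP worker thread reuses without reallocation or sharing:

```python
import threading

class ThreadLocalVectors:
    """Per-thread scratch vectors, sized once and reused across iterations.

    A Python stand-in for the C++ helper assumed by the kernels above;
    tlv.get(0) returns the calling thread's first scratch vector.
    """
    def __init__(self, sizes):
        self._sizes = list(sizes)
        self._local = threading.local()

    def get(self, i):
        if not hasattr(self._local, "vecs"):
            # First use on this thread: allocate every scratch vector once.
            self._local.vecs = [[0] * n for n in self._sizes]
        return self._local.vecs[i]

tlv = ThreadLocalVectors([3])
tlv.get(0)[0] = 7   # safe: each thread mutates its own copy
```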
tensor.shape.rangeParallel(tensor.shape.dim - 1, [&tensor, &result, &reduced_dims, keepdims](const int i, const std::vector &indices, ThreadLocalVectors &tlv) { // 计算输出索引 @@ -295,13 +301,13 @@ namespace deepx::tensorfunc { if (reduced_dims[i] == 0) { - newIndices[j++] = indices[i]; + tlv.get(0)[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = 0; + tlv.get(0)[j++] = 0; } } - int outputIdx = result.shape.linearat(newIndices); + int outputIdx = result.shape.linearat(tlv.get(0)); int shape_last = tensor.shape[-1]; const ScalableTag tag; @@ -333,7 +339,7 @@ namespace deepx::tensorfunc mint = std::min(mint,tensor.data[i + j]); } - result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, result.shape.dim); + result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, {result.shape.dim}); } } }; diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp index 1f9c9cda..c514faa8 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/tensorlife_miaobyte.hpp @@ -37,7 +37,6 @@ namespace deepx::tensorfunc shape.dtype = precision(); Tensor tensor(shape); - tensor.device = CPU; tensor.deleter = dataFree; tensor.copyer = dataCopy; tensor.newer = dataNew; diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index e9c534b7..fd4f0e07 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -11,6 +11,7 @@ namespace deepx::tf using namespace deepx::tensorfunc; using namespace std; + // reshape template class Reshape : public TF { @@ -37,7 +38,7 @@ namespace deepx::tf int run(shared_ptr mem, string &error) override { - if (!checktensors({ this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->returns[0].textvalue}, mem, error) != 0) { return 1; } @@ -77,6 +78,7 @@ namespace deepx::tf } }; + // transpose template class Transpose : public TF { @@ -102,7 +104,7 @@ namespace deepx::tf int run(shared_ptr mem, string &error) override { - if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) { return 1; } @@ -143,6 +145,7 @@ namespace deepx::tf } }; + // concat template class Concat : public TF { @@ -165,14 +168,14 @@ namespace deepx::tf return make_shared(*this); } int run(shared_ptr mem, string &error) override - { - if (!checktensors({ this->returns[0].textvalue}, mem, error)!=0) + { + if (!checktensors({this->returns[0].textvalue}, mem, error) != 0) { return 1; } vector tensor_names = this->getvector(0, true); - if (!checktensors(tensor_names, mem, error)!=0) + if (!checktensors(tensor_names, mem, error) != 0) { return 1; } @@ -255,7 +258,8 @@ namespace deepx::tf }; }; - template + // broadcastTo + template class BroadcastTo : public TF { public: @@ -278,7 +282,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)!=0) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) { return 1; } @@ -317,124 +321,130 @@ namespace deepx::tf return 0; } }; - - // class Split : public TF - // { - // public: - // Split() - // { - // this->name="split"; - // } - // Split(string text) - // { 
- // this->parse(text); - // if (this->name!="split"){ - // throw std::runtime_error("Invalid name: "+this->name); - // } - // } - // void funcdef() override - // { - // this->parse("split(float32 T1,int32 3)->(float32 T2,T3)"); - // } - // string math_formula() const override - // { - // return "T2,T3 = split(T1, axis=3)"; - // } - // void run(mem::Mem &mem) override - // { - // std::vector *> input; - // for (int i = 0; i < this->args.size() - 1; i++) - // { - // input.push_back(mem.gettensor(this->args[i]).get()); - // } - // int axis = mem.getarg(this->args.back()); - // auto output = mem.gettensor(this->returns[0]).get(); - // tensorfunc::split(*output, axis, input); - // } - // }; - - - // template - // class Expand : public Op - // { - // public: - // Expand() - // { - // this->init("expand", "any", {}, {}, false, {}, {}); - // } - // void forward(mem::Mem &mem) override - // { - // auto input = mem.gettensor(this->args[0]).get(); - // auto output = mem.gettensor(this->returns[0]).get(); - // tensorfunc::expand(*input, *output); - // } - // vector sumaxis(const vector shape,const vector target_shape) - // { - // vector axis; - - // // 检查当前形状(this->shape)与目标形状的差异 - // int current_dim = shape.size(); - // int target_dim = target_shape.size(); - - // // 如果目标维度小于当前维度,需要在一些轴上求和来减少维度 - // if (target_dim < current_dim) - // { - // // 检查每个当前维度,看是否需要在目标形状中保留 - // for (int i = 0; i < current_dim; i++) - // { - // bool keep_dim = false; + // gather + template + class Gather : public TF + { + public: + Gather(const vector &args, const vector &returns) + { + this->name = "gather"; + this->author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } - // // 找出当前维度是否与目标形状中的任何维度对应 - // if (i < current_dim - target_dim) - // { - // // 如果当前维度索引小于两者维度差,肯定需要被求和 - // axis.push_back(i); - // } - // else - // { - // // 检查该维度是否与目标形状匹配 - // int target_idx = i - (current_dim - target_dim); - // if (target_shape[target_idx] == 1 && shape[i] > 1) - // { - // // 如果目标形状在这个维度上是1,但当前形状不是1,需要求和 - // axis.push_back(i); - // } - // } - // } - // } - // else if (target_dim == current_dim) - // { - // // 维度数量相同,检查哪些维度需要被压缩为1 - // for (int i = 0; i < current_dim; i++) - // { - // if (target_shape[i] == 1 && shape[i] > 1) - // { - // axis.push_back(i); - // } - // } - // } - // // 如果目标维度大于当前维度,可能需要扩展维度(通常通过其他操作如expand_dims) + string math_formula() const override + { + return "T2 = T1.gather(indices=T3, axis=3)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error) != 0) + { + return 1; + } + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - // return axis; - // } - // void backward(mem::Mem &mem) override - // { - // auto input_grad = mem.gettensor(this->args_grad[0]).get(); - // auto output_grad = mem.gettensor(this->returns_grad[0]).get(); - // vector target_shape = this->getvector( 1); - // vector axis = this->sumaxis(input_grad->shape.shape,target_shape); - // // sum,按指定维度求和 - // tensorfunc::sum(*output_grad, axis,*input_grad); - // } - // void funcdef() override - // { - // this->init("expand", "float32", {"T1", "4", "6", "12"}, {"T2"}, false, {}, {}); - // } - // string math_formula() const override - // { - // return "T2 = expand(T1, axis=[4,6,12])"; - // } - // }; + Precision output_type = 
mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + Precision indices_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype; + if (indices_type != Precision::Int32 && indices_type != Precision::Int64) + { + error = "indices only support int32 or int64"; + return 1; + } + int axis = this->getvar(2, mem, true); + switch (input_type) + { + case Precision::Float64: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Float32: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Int64: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Int16: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Int8: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + case Precision::Bool: + { + if (indices_type == Precision::Int32) + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + else + { + gather(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->args[1].textvalue), axis, *mem->gettensor(this->returns[0].textvalue)); + } + break; + } + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + }; + }; } -#endif // DEEPX_OP_CONCAT_HPP \ No newline at end of file +#endif // DEEPX_TF_CHANGESHAPE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp index 0fd86a18..a6fb83dc 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/io.hpp @@ -7,7 +7,7 @@ #include "deepx/tensorfunc/authors.hpp" namespace deepx::tf { - + //print template class Print : public TF { @@ -20,7 +20,14 @@ namespace deepx::tf this->args = args; this->returns = returns; 
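Gather::run above is a double dispatch: an outer switch on the input dtype and an inner branch on the index dtype, with explicit failures for a dtype mismatch between input and output and for non-integer indices. Condensed into Python, with numpy's take_along_axis standing in for the miaobyte gather kernel (whose exact index-shape rules this diff does not show):

```python
import numpy as np

def gather(t: np.ndarray, idx: np.ndarray, axis: int, out: np.ndarray) -> int:
    # Mirrors the guard clauses in Gather::run.
    if out.dtype != t.dtype:
        raise TypeError(f"Type mismatch: {t.dtype} != {out.dtype}")
    if idx.dtype not in (np.int32, np.int64):
        raise TypeError("indices only support int32 or int64")
    # C++ instantiates one kernel per (tensor dtype, index dtype) pair;
    # numpy resolves the same dispatch dynamically.
    out[...] = np.take_along_axis(t, idx, axis=axis)
    return 0
```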
} - + string math_formula() const override + { + return "print(T1)"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } int run(shared_ptr mem, string &error) override { string name = this->args[0].textvalue; @@ -45,13 +52,120 @@ namespace deepx::tf return 0; } + + }; + + //save + class Save : public TF + { + public: + Save(vector args, vector returns) + { + this->name = "save"; + this->tftype = "io"; + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "save(T1,path)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + string name = this->args[0].textvalue; + string path = this->args[1].textvalue; + if (mem->existstensor(name)) + { + auto t = mem->gettensor(name); + tensorfunc::save(*t, path); + } + else + { + std::cerr << "save " << name << " not found" << std::endl; + error = "save " + name + " not found"; + return 1; + } + return 0; + } + }; + + //load + class Load : public TF + { + public: + Load(vector args, vector returns) + { + this->name = "load"; + this->tftype = "io"; + this->args = args; + this->returns = returns; + } string math_formula() const override { - return "print(T1)"; + return "mem.load(path)"; } shared_ptr clone() const override { - return make_shared>(*this); + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + string path = this->args[0].textvalue; + + pair shape_name=tensorfunc::loadShape(path); + std::string tensor_name=shape_name.first; + Shape shape=shape_name.second; + + if(mem->existstensor(tensor_name)) + { + cout<<"warning: "<delete_tensor(tensor_name); + } + switch (shape.dtype) + { + case Precision::Float64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Float32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int64:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int32:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int16:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Int8:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + case Precision::Bool:{ + pair>> t = tensorfunc::load(path); + mem->addtensor(tensor_name, t.second); + break; + } + default: + break; + } + return 0; } }; } diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp index 5b62463d..b50237c5 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_new.cpp @@ -14,11 +14,11 @@ void test_tensor_new(){ Tensor tensor=New({2, 3}); constant(tensor,1); print(tensor); - save(tensor,"tensor"); + save(tensor,"tensor"); Tensor tensor2=New({2, 3}); constant(tensor2,2); print(tensor2); - save(tensor2,"tensor2"); + save(tensor2,"tensor2"); } void test_arange() { @@ -30,6 +30,7 @@ void test_arange() { int main(int argc,char **argv){ int i=0; if (argc>1){ + i=std::atoi(argv[1]); } switch (i) { diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp index a7f3eeec..9a4b0aaf 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp +++ 
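The Save/Load ops above round-trip through the project's two-file tensor format: Load reads the `.shape` YAML sidecar first to learn the tensor's name and dtype, evicts any same-named tensor from Mem, then dispatches tensorfunc::load<T> on the dtype. The C++ loader also checks that the data file holds exactly `shape.size * precision_bits(dtype) / 8` bytes; here is the same check in Python (byte widths and YAML keys are assumptions, consistent with the Python saveShape later in this diff):

```python
import os
import yaml

# Assumed byte widths, i.e. precision_bits(dtype) / 8.
DTYPE_BYTES = {"bool": 1, "int8": 1, "int16": 2, "int32": 4,
               "int64": 8, "float16": 2, "float32": 4, "float64": 8}

def check_tensor_files(path: str) -> dict:
    """Validate a <path>.shape / <path>.data pair the way the C++ loader does."""
    with open(path + ".shape") as f:
        meta = yaml.safe_load(f)   # e.g. {'shape': [3, 4, 5], 'dtype': 'float32', 'size': 60, ...}
    expected = meta["size"] * DTYPE_BYTES[meta["dtype"]]
    actual = os.path.getsize(path + ".data")
    if actual != expected:
        raise RuntimeError(f"data file holds {actual} bytes, expected {expected}")
    return meta
```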
b/excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp @@ -14,11 +14,11 @@ void test_tensor_range(){ Tensor tensor=New({2, 3}); constant(tensor,1); print(tensor); - save(tensor,"2_tensor_range.1"); + Tensor tensor2=New({2, 3}); constant(tensor2,2); print(tensor2); - save(tensor2,"2_tensor_range.2"); + } int main(){ diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp index d965ec77..bdf3a492 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp @@ -10,6 +10,5 @@ int main(){ deepx::Tensor t=New({2, 3,4}); std::iota(t.data, t.data+t.shape.size, 0); print(t); - save(t,"3_tensor_print"); return 0; } \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp index 60027015..008b5550 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_matmul.cpp @@ -49,10 +49,8 @@ void test_tensor_matmul(){ void bench_tensor_matmul(int i) { Tensor tensor= New({i,i}); uniform(tensor,0,1); - save(tensor,"4_tensor_matmul"+std::to_string(i)+"tensor"); Tensor tensor2= New({i,i}); uniform(tensor2,0,1); - save(tensor2,"4_tensor_matmul"+std::to_string(i)+"tensor2"); Tensor tensor3= New(matmul_shape(tensor.shape, tensor2.shape).shape); std::cout<<("matmul ", i, "x", i); auto start = std::chrono::high_resolution_clock::now(); @@ -60,7 +58,6 @@ void bench_tensor_matmul(int i) { matmul(tensor, tensor2, tensor3); auto end=std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - save(tensor3,"4_tensor_matmul"+std::to_string(i)+"result"); std::cout << "time:" << duration.count() << " seconds" << std::endl; } diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp index e902e717..dc2ef698 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp @@ -68,7 +68,6 @@ void benchmark_sum(int i){ { combstr+=std::to_string(c)+"_"; } - save(r,"5_tensor_sum."+ combstr); print(r,"%.0f"); } auto end=std::chrono::high_resolution_clock::now(); diff --git a/front/go/deepx/tensor.go b/front/go/deepx/tensor.go deleted file mode 100644 index 61223786..00000000 --- a/front/go/deepx/tensor.go +++ /dev/null @@ -1,48 +0,0 @@ -package deepx - -import "fmt" - -type Shape struct { - shape []int - stride []int - ndim int - size int -} - -func NewTensorShape(shape []int) (s Shape) { - s.ndim = len(shape) - s.shape = make([]int, len(shape)) - copy(s.shape, shape) - s.stride = make([]int, len(shape)) - s.stride[len(shape)-1] = 1 - for i := len(shape) - 2; i >= 0; i-- { - s.stride[i] = s.stride[i+1] * shape[i+1] - } - s.size = s.stride[0] * shape[0] - return s -} -func (s Shape) String() string { - return fmt.Sprintf("%v", s.shape) -} - -type Dtype int - -const ( - DtypeInt8 Dtype = iota - DtypeInt16 - DtypeInt32 - DtypeInt64 - DtypeUint8 - DtypeFloat16 - DtypeFloat32 - DtypeFloat64 -) - -type Tensor struct { - Data []byte - Dtype Dtype - Shape Shape - graph *Graph // 所属计算图 - node *TensorNode // 对应的计算图节点 - requiresGrad bool -} diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index 1e215d7e..0f729cfa 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ 
-1,6 +1,6 @@ -from .leaffunc_life import newtensor,copytensor,deltensor -from .leaffunc_io import printtensor +from .leaffunc_life import * +from .leaffunc_io import * from .leaffunc_init import * from .leaffunc_changeshape import * from .leaffunc_elementwise import * @@ -16,9 +16,10 @@ from .elementwise import * from .normalization import * __all__ = [ + #leaffunc - "newtensor", - "printtensor", + "newtensor","printtensor","load", #life + "printtensor","save","save_npy",#io "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out", "add","sub","mul","div","sqrt","pow","exp","log", "matmul", diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 6c32a4d6..4a241865 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -34,6 +34,7 @@ 'transpose':'miaobyte', 'broadcastTo':'miaobyte', 'concat':'miaobyte', + 'gather':'miaobyte', #matmul # 'matmul':'miaobyte', 'matmul':'cublas', diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index 25599498..bd86edf6 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -64,6 +64,14 @@ def broadcastTo(t:Tensor,new_shape:tuple[int],out:Union[Tensor,str]='',requires_ return outtensor broadcast_to = broadcastTo +def gather(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str]='')->Tensor: + outtensor=out + if isinstance(out,str): + outtensor=newtensor(indices.shape,dtype=input.dtype,name=out) + from .rtf_changeshape import rtf_gather + rtf_gather(input,indices,gatheraxis,outtensor,defaultauthor['gather']) + return outtensor + # def unsqueeze(t:Tensor,dim:int)->Tensor: # # 确保dim是有效的 # if dim < -t.ndim-1 or dim > t.ndim: diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py index b4490803..dd09abc0 100644 --- a/front/py/deepx/nn/functional/leaffunc_io.py +++ b/front/py/deepx/nn/functional/leaffunc_io.py @@ -1,4 +1,4 @@ -from deepx.tensor import Tensor +from deepx.tensor import Tensor,Shape,saveShape from .authormap import defaultauthor def printtensor(t:Tensor,format=''): @@ -6,3 +6,20 @@ def printtensor(t:Tensor,format=''): rtf_printtensor(t,format,defaultauthor['print']) return '' +def save(t:Tensor,path:str): + from .rtf_io import rtf_save + rtf_save(t,path) + return t + +def save_npy(t,path:str): + r''' + 保存numpy.tensor为deepxtensor格式 + ''' + from numpy import save,ndarray,ascontiguousarray + shape=Shape(t.shape) + shape._dtype=str(t.dtype) + saveShape(shape,path+".shape") + + array = ascontiguousarray(t) + array.tofile(path+'.data') + return t \ No newline at end of file diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py index abf6a530..8952b9d6 100644 --- a/front/py/deepx/nn/functional/leaffunc_life.py +++ b/front/py/deepx/nn/functional/leaffunc_life.py @@ -23,3 +23,6 @@ def deltensor(t:Tensor): from .rtf_life import rtf_deltensor rtf_deltensor(t) +def load(path:str)->Tensor: + from .rtf_io import rtf_load + return rtf_load(path) diff --git a/front/py/deepx/nn/functional/rtf_changeshape.py b/front/py/deepx/nn/functional/rtf_changeshape.py index 2f9e7e2d..c7233ab0 100644 --- a/front/py/deepx/nn/functional/rtf_changeshape.py +++ b/front/py/deepx/nn/functional/rtf_changeshape.py @@ -28,3 +28,9 @@ def 
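save_npy above writes an in-memory numpy array straight into the same on-disk pair (saveShape for the YAML sidecar, ndarray.tofile for the raw bytes), so data can be handed to an executor without going through a deepx Tensor first. A usage sketch; the path is hypothetical, and load assumes a running executor to receive the IR:

```python
import numpy as np
from deepx.nn.functional import save_npy, load

path = "/tmp/deepx_demo/indices"   # hypothetical scratch location
arr = np.arange(12, dtype=np.int64).reshape(3, 4)

save_npy(arr, path)   # writes indices.shape (YAML) + indices.data (raw bytes)
t = load(path)        # sends the load IR, names the handle from the .shape file
t.print()
```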
rtf_broadcastTo(t:Tensor,new_shape:tuple[int],out:Tensor,author='miaobyte'): ir=DeepxIR("broadcastTo", args, returns,author) send(ir) +def rtf_gather(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miaobyte'): + args=[Param.tensor(input),Param.tensor(indices),Param.varnum(axis)] + returns=[Param.tensor(out)] + ir=DeepxIR("gather", args, returns,author) + send(ir) + diff --git a/front/py/deepx/nn/functional/rtf_io.py b/front/py/deepx/nn/functional/rtf_io.py index 02569603..899b6f16 100644 --- a/front/py/deepx/nn/functional/rtf_io.py +++ b/front/py/deepx/nn/functional/rtf_io.py @@ -1,4 +1,4 @@ -from deepx.tensor import Tensor +from deepx.tensor import Tensor,loadShape from deepx.nn import DeepxIR,Param from deepx.scheduler import send @@ -9,16 +9,18 @@ def rtf_printtensor(t:Tensor,format='',author='miaobyte'): send(ir) return '' -def rtf_load(t:Tensor,path:str,author='miaobyte'): +def rtf_save(t:Tensor,path:str): args=[Param.tensor(t),Param.varstr(path)] returns=[] - ir=DeepxIR("load", args, returns,author) + ir=DeepxIR("save", args, returns) send(ir) return t -def rtf_save(t:Tensor,path:str,author='miaobyte'): - args=[Param.tensor(t),Param.varstr(path)] +def rtf_load(path:str)->Tensor: + args=[Param.varstr(path)] returns=[] - ir=DeepxIR("save", args, returns,author) + ir=DeepxIR("load", args, returns) send(ir) - return t \ No newline at end of file + shapefile=path+'.shape' + tensor_name,shape,dtype=loadShape(shapefile) + return Tensor(shape,dtype,tensor_name) diff --git a/front/py/deepx/nn/modules/sparse.py b/front/py/deepx/nn/modules/sparse.py index cab749e0..a7727a67 100644 --- a/front/py/deepx/nn/modules/sparse.py +++ b/front/py/deepx/nn/modules/sparse.py @@ -2,27 +2,142 @@ from deepx.tensor import Tensor class Embedding(Module): + r"""一个存储固定字典和大小的嵌入向量的简单查找表。 + + 该模块常用于存储词嵌入并通过索引检索它们。 + 模块的输入是索引列表,输出是对应的词嵌入向量。 + + 参数: + num_embeddings (int): 嵌入字典的大小(词汇表大小) + embedding_dim (int): 每个嵌入向量的维度 + padding_idx (int, 可选): 如果指定,该索引位置的条目不参与梯度计算; + 因此,该位置的嵌入向量在训练中不会更新,保持为固定的"填充"向量。 + 对于新创建的嵌入层,该位置的嵌入向量默认全零,但可更新为其他值作为填充向量。 + max_norm (float, 可选): 如果指定,范数超过此值的嵌入向量会被重新归一化到该范数 + norm_type (float, 可选): 计算max_norm时使用的p范数(默认L2范数,p=2) + scale_grad_by_freq (bool, 可选): 如果为True,梯度会按mini-batch中词的频率倒数缩放(默认False) + sparse (bool, 可选): 如果为True,权重矩阵的梯度将是稀疏张量(详见注释) + + 属性: + weight (Tensor): 模块的可学习权重,形状为(num_embeddings, embedding_dim), + 初始化为正态分布N(0, 1) + + 形状: + - 输入: :math:`(*)`, 任意形状的IntTensor或LongTensor,包含要提取的索引 + - 输出: :math:`(*, H)`, 其中*是输入形状,H=embedding_dim + + .. 注意:: + 注意只有部分优化器支持稀疏梯度:目前支持的有SGD(CPU和CUDA)、SparseAdam(CPU和CUDA)、Adagrad(CPU) + + .. 
注意:: + 当max_norm不为None时,嵌入层的前向传播会原地修改weight张量。 + 由于梯度计算所需的张量不能被原地修改,因此在调用前向传播前对weight进行可微操作时, + 若max_norm不为None则需要克隆weight。例如:: + + n, d, m = 3, 5, 7 + embedding = nn.Embedding(n, d, max_norm=1.0) + W = torch.randn((m, d), requires_grad=True) + idx = torch.tensor([1, 2]) + a = embedding.weight.clone() @ W.t() # weight必须克隆以保证可微性 + b = embedding(idx) @ W.t() # 原地修改weight + out = (a.unsqueeze(0) + b.unsqueeze(1)) + loss = out.sigmoid().prod() + loss.backward() + + 示例:: + + >>> # 包含10个3维张量的嵌入层 + >>> embedding = nn.Embedding(10, 3) + >>> # 2个样本,每个包含4个索引的批次 + >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[[-0.0251, -1.6902, 0.7172], + [-0.6431, 0.0748, 0.6969], + [ 1.4970, 1.3448, -0.9685], + [-0.3677, -2.7265, -0.1685]], + + [[ 1.4970, 1.3448, -0.9685], + [ 0.4362, -0.4004, 0.9400], + [-0.6431, 0.0748, 0.6969], + [ 0.9124, -2.3616, 1.1151]]]) + + + >>> # 带padding_idx的示例 + >>> embedding = nn.Embedding(10, 3, padding_idx=0) + >>> input = torch.LongTensor([[0, 2, 0, 5]]) + >>> embedding(input) + tensor([[[ 0.0000, 0.0000, 0.0000], + [ 0.1535, -2.0309, 0.9315], + [ 0.0000, 0.0000, 0.0000], + [-0.1655, 0.9897, 0.0635]]]) + + >>> # 修改填充向量的示例 + >>> padding_idx = 0 + >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx) + >>> embedding.weight + Parameter containing: + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + >>> with torch.no_grad(): + ... embedding.weight[padding_idx] = torch.ones(3) + >>> embedding.weight + Parameter containing: + tensor([[ 1.0000, 1.0000, 1.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + """ + + def __init__(self, - num_embeddings:int, - embedding_dim:int, - padding_idx:int=None, + num_embeddings:int, #嵌入字典的大小(词汇表大小)vocab_size,llama=128256 + embedding_dim:int, #每个嵌入向量的维度,隐藏层大小hidden_size,llama=4096 + padding_idx:int=None, max_norm:float=None, norm_type:float=2.0, scale_grad_by_freq:bool=False, + _weight:Tensor=None,dtype=None, sparse:bool=False): super(Embedding, self).__init__() self.num_embeddings = num_embeddings self.embedding_dim = embedding_dim + + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + padding_idx = self.num_embeddings + padding_idx self.padding_idx = padding_idx self.max_norm = max_norm self.norm_type = norm_type self.scale_grad_by_freq = scale_grad_by_freq + if _weight is None: + self.weight = Tensor(shape=(num_embeddings, embedding_dim),dtype=dtype) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "权重形状与num_embeddings和embedding_dim不匹配" + self.weight = _weight + self.sparse = sparse - self.weight = Tensor(num_embeddings, embedding_dim) - self.weight.uniform_(-0.01, 0.01) + if padding_idx is not None: self.weight[padding_idx] = 0 + def reset_parameters(self) -> None: + self.weight.normal_() # 正态分布初始化权重 + self._fill_padding_idx_with_zero() # 填充索引位置归零 + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + self.weight[self.padding_idx].fill_(0) def forward(self, input:Tensor)->Tensor: return self.weight[input] diff --git a/front/py/deepx/nn/modules/sparse.torch.py b/front/py/deepx/nn/modules/sparse.torch.py new file mode 100644 index 00000000..a0621543 --- 
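deepx's Embedding.forward is just `self.weight[input]`, which Tensor.__getitem__ (added later in this diff) routes to the new gather op. Conceptually the lookup is a row gather over the weight table, as this plain-numpy illustration shows:

```python
import numpy as np

vocab_size, hidden = 10, 3
weight = np.random.randn(vocab_size, hidden).astype(np.float32)

ids = np.array([[1, 2, 4, 5], [4, 3, 2, 9]])   # (batch, seq) token ids
out = weight[ids]                              # (batch, seq, hidden) vectors
assert out.shape == (2, 4, 3)
```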
/dev/null +++ b/front/py/deepx/nn/modules/sparse.torch.py @@ -0,0 +1,512 @@ +# mypy: 允许无类型定义的函数 +from typing import Optional + +import torch +from torch import Tensor +from torch.nn import functional as F, init +from torch.nn.parameter import Parameter + +from .module import Module + + +__all__ = ["Embedding", "EmbeddingBag"] + + +class Embedding(Module): + r"""一个存储固定字典和大小的嵌入向量的简单查找表。 + + 该模块常用于存储词嵌入并通过索引检索它们。 + 模块的输入是索引列表,输出是对应的词嵌入向量。 + + 参数: + num_embeddings (int): 嵌入字典的大小(词汇表大小) + embedding_dim (int): 每个嵌入向量的维度 + padding_idx (int, 可选): 如果指定,该索引位置的条目不参与梯度计算; + 因此,该位置的嵌入向量在训练中不会更新,保持为固定的"填充"向量。 + 对于新创建的嵌入层,该位置的嵌入向量默认全零,但可更新为其他值作为填充向量。 + max_norm (float, 可选): 如果指定,范数超过此值的嵌入向量会被重新归一化到该范数 + norm_type (float, 可选): 计算max_norm时使用的p范数(默认L2范数,p=2) + scale_grad_by_freq (bool, 可选): 如果为True,梯度会按mini-batch中词的频率倒数缩放(默认False) + sparse (bool, 可选): 如果为True,权重矩阵的梯度将是稀疏张量(详见注释) + + 属性: + weight (Tensor): 模块的可学习权重,形状为(num_embeddings, embedding_dim), + 初始化为正态分布N(0, 1) + + 形状: + - 输入: :math:`(*)`, 任意形状的IntTensor或LongTensor,包含要提取的索引 + - 输出: :math:`(*, H)`, 其中*是输入形状,H=embedding_dim + + .. 注意:: + 注意只有部分优化器支持稀疏梯度:目前支持的有SGD(CPU和CUDA)、SparseAdam(CPU和CUDA)、Adagrad(CPU) + + .. 注意:: + 当max_norm不为None时,嵌入层的前向传播会原地修改weight张量。 + 由于梯度计算所需的张量不能被原地修改,因此在调用前向传播前对weight进行可微操作时, + 若max_norm不为None则需要克隆weight。例如:: + + n, d, m = 3, 5, 7 + embedding = nn.Embedding(n, d, max_norm=1.0) + W = torch.randn((m, d), requires_grad=True) + idx = torch.tensor([1, 2]) + a = embedding.weight.clone() @ W.t() # weight必须克隆以保证可微性 + b = embedding(idx) @ W.t() # 原地修改weight + out = (a.unsqueeze(0) + b.unsqueeze(1)) + loss = out.sigmoid().prod() + loss.backward() + + 示例:: + + >>> # 包含10个3维张量的嵌入层 + >>> embedding = nn.Embedding(10, 3) + >>> # 2个样本,每个包含4个索引的批次 + >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[[-0.0251, -1.6902, 0.7172], + [-0.6431, 0.0748, 0.6969], + [ 1.4970, 1.3448, -0.9685], + [-0.3677, -2.7265, -0.1685]], + + [[ 1.4970, 1.3448, -0.9685], + [ 0.4362, -0.4004, 0.9400], + [-0.6431, 0.0748, 0.6969], + [ 0.9124, -2.3616, 1.1151]]]) + + + >>> # 带padding_idx的示例 + >>> embedding = nn.Embedding(10, 3, padding_idx=0) + >>> input = torch.LongTensor([[0, 2, 0, 5]]) + >>> embedding(input) + tensor([[[ 0.0000, 0.0000, 0.0000], + [ 0.1535, -2.0309, 0.9315], + [ 0.0000, 0.0000, 0.0000], + [-0.1655, 0.9897, 0.0635]]]) + + >>> # 修改填充向量的示例 + >>> padding_idx = 0 + >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx) + >>> embedding.weight + Parameter containing: + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + >>> with torch.no_grad(): + ... 
embedding.weight[padding_idx] = torch.ones(3) + >>> embedding.weight + Parameter containing: + tensor([[ 1.0000, 1.0000, 1.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + """ + + __constants__ = [ + "num_embeddings", + "embedding_dim", + "padding_idx", + "max_norm", + "norm_type", + "scale_grad_by_freq", + "sparse", + ] + + num_embeddings: int + embedding_dim: int + padding_idx: Optional[int] + max_norm: Optional[float] + norm_type: float + scale_grad_by_freq: bool + weight: Tensor + freeze: bool + sparse: bool + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + _weight: Optional[Tensor] = None, + _freeze: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "Padding_idx必须在num_embeddings范围内" + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + if _weight is None: + self.weight = Parameter( + torch.empty((num_embeddings, embedding_dim), **factory_kwargs), + requires_grad=not _freeze, + ) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "权重形状与num_embeddings和embedding_dim不匹配" + self.weight = Parameter(_weight, requires_grad=not _freeze) + + self.sparse = sparse + + def reset_parameters(self) -> None: + init.normal_(self.weight) # 正态分布初始化权重 + self._fill_padding_idx_with_zero() # 填充索引位置归零 + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): # 不计算梯度 + self.weight[self.padding_idx].fill_(0) # 填充位置设为0 + + def forward(self, input: Tensor) -> Tensor: + return F.embedding( + input, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + + def extra_repr(self) -> str: + s = "{num_embeddings}, {embedding_dim}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + if self.sparse is not False: + s += ", sparse=True" + return s.format(**self.__dict__) + + @classmethod + def from_pretrained( + cls, + embeddings, + freeze=True, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + ): + r"""从给定的2维FloatTensor创建Embedding实例。 + + 参数: + embeddings (Tensor): 包含嵌入权重的FloatTensor, + 第一维作为num_embeddings,第二维作为embedding_dim。 + freeze (bool, 可选): 若为True,张量在学习过程中不更新, + 相当于embedding.weight.requires_grad = False。默认True。 + padding_idx (int, 可选): 同模块初始化文档说明。 + max_norm (float, 可选): 同模块初始化文档说明。 + norm_type (float, 可选): 同模块初始化文档说明,默认2。 + scale_grad_by_freq (bool, 可选): 同模块初始化文档说明,默认False。 + sparse (bool, 可选): 同模块初始化文档说明。 + + 示例:: + + >>> # 包含预训练权重的FloatTensor + >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>>
embedding = nn.Embedding.from_pretrained(weight) + >>> # 获取索引1的嵌入 + >>> input = torch.LongTensor([1]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[ 4.0000, 5.1000, 6.3000]]) + """ + assert ( + embeddings.dim() == 2 + ), "Embeddings参数应为2维张量" + rows, cols = embeddings.shape + embedding = cls( + num_embeddings=rows, + embedding_dim=cols, + _weight=embeddings, + _freeze=freeze, + padding_idx=padding_idx, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse, + ) + return embedding + + +class EmbeddingBag(Module): + r"""计算嵌入"袋"的和或均值,无需实例化中间嵌入。 + + 对于固定长度的袋、无per_sample_weights、无等于padding_idx的索引,且输入为2D时, + 该类的行为如下: + * mode="sum"等价于Embedding层后接torch.sum(dim=1) + * mode="mean"等价于Embedding层后接torch.mean(dim=1) + * mode="max"等价于Embedding层后接torch.max(dim=1) + + 但EmbeddingBag比链式操作更节省时间和内存。 + + EmbeddingBag还支持在正向传播时传入样本权重, + 这会在按mode指定的方式进行加权归约前缩放嵌入输出。 + 若传入per_sample_weights,仅支持mode="sum",即按权重计算加权和。 + + 参数: + num_embeddings (int): 嵌入字典的大小(词汇表大小) + embedding_dim (int): 每个嵌入向量的维度 + max_norm (float, 可选): 若指定,范数超过此值的嵌入向量会被重新归一化到该范数 + norm_type (float, 可选): 计算max_norm时使用的p范数(默认L2范数,p=2) + scale_grad_by_freq (bool, 可选): 若为True,梯度会按mini-batch中词的频率倒数缩放(默认False)。 + 注意:mode="max"时不支持此选项。 + mode (str, 可选): "sum"、"mean"或"max",指定袋的归约方式。 + "sum"计算加权和(考虑per_sample_weights), + "mean"计算袋内平均值,"max"计算袋内最大值。默认"mean"。 + sparse (bool, 可选): 若为True,权重矩阵的梯度将是稀疏张量(详见注释)。 + 注意:mode="max"时不支持此选项。 + include_last_offset (bool, 可选): 若为True,offsets包含一个额外元素, + 其值等于indices的长度,符合CSR格式。 + padding_idx (int, 可选): 若指定,该索引位置的条目不参与梯度计算; + 因此,该位置的嵌入向量在训练中不会更新,保持为固定的"填充"向量。 + 对于新创建的EmbeddingBag,该位置的嵌入向量默认全零, + 但可更新为其他值作为填充向量。注意该位置的嵌入向量会被排除在归约之外。 + + 属性: + weight (Tensor): 模块的可学习权重,形状为(num_embeddings, embedding_dim), + 初始化为正态分布N(0, 1)。 + + 示例:: + + >>> # 包含10个3维张量的EmbeddingBag(求和模式) + >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum') + >>> # 2个样本,每个包含4个索引的输入(展平为1D) + >>> input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) + >>> offsets = torch.tensor([0, 4], dtype=torch.long) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding_sum(input, offsets) + tensor([[-0.8861, -5.4350, -0.0523], + [ 1.1306, -2.5798, -1.0044]]) + + >>> # 带padding_idx的示例 + >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum', padding_idx=2) + >>> input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.long) + >>> offsets = torch.tensor([0, 4], dtype=torch.long) + >>> embedding_sum(input, offsets) + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7082, 3.2145, -2.6251]]) + + >>> # 从Embedding加载EmbeddingBag的示例 + >>> embedding = nn.Embedding(10, 3, padding_idx=2) + >>> embedding_sum = nn.EmbeddingBag.from_pretrained( + embedding.weight, + padding_idx=embedding.padding_idx, + mode='sum') + """ + + __constants__ = [ + "num_embeddings", + "embedding_dim", + "max_norm", + "norm_type", + "scale_grad_by_freq", + "mode", + "sparse", + "include_last_offset", + "padding_idx", + ] + + num_embeddings: int + embedding_dim: int + max_norm: Optional[float] + norm_type: float + scale_grad_by_freq: bool + weight: Tensor + mode: str + sparse: bool + include_last_offset: bool + padding_idx: Optional[int] + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + _weight: Optional[Tensor] = None, + include_last_offset: bool = False, + padding_idx: Optional[int] = None, + device=None, + dtype=None, + ) -> None: + 
factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "padding_idx必须在num_embeddings范围内" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "padding_idx必须在num_embeddings范围内" + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + if _weight is None: + self.weight = Parameter( + torch.empty((num_embeddings, embedding_dim), **factory_kwargs) + ) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "权重形状与num_embeddings和embedding_dim不匹配" + self.weight = Parameter(_weight) + self.mode = mode + self.sparse = sparse + self.include_last_offset = include_last_offset + + def reset_parameters(self) -> None: + init.normal_(self.weight) # 正态分布初始化权重 + self._fill_padding_idx_with_zero() # 填充索引位置归零 + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): # 不计算梯度 + self.weight[self.padding_idx].fill_(0) # 填充位置设为0 + + def forward( + self, + input: Tensor, + offsets: Optional[Tensor] = None, + per_sample_weights: Optional[Tensor] = None, + ) -> Tensor: + """EmbeddingBag的正向传播。 + + 参数: + input (Tensor): 包含嵌入矩阵索引袋的张量。 + offsets (Tensor, 可选): 仅当input为1D时使用,确定input中每个袋(序列)的起始索引位置。 + per_sample_weights (Tensor, 可选): 浮点/双精度权重张量,None表示所有权重为1。 + 若指定,形状必须与input相同,且在offsets非None时使用相同的偏移量。仅支持mode='sum'。 + + 返回: + 形状为(B, embedding_dim)的张量。 + + .. 注意:: + + 关于input和offsets的说明: + - input和offsets必须同类型(int或long) + - 若input为2D形状(B, N),视为B个固定长度N的袋,返回B个按mode聚合的值,此时offsets被忽略且必须为None。 + - 若input为1D形状(N),视为多个袋(序列)的拼接,offsets必须为1D张量,包含每个袋在input中的起始索引位置。 + 因此,对于形状(B)的offsets,input视为B个袋,空袋(长度为0)返回全零向量。 + """ + return F.embedding_bag( + input, + self.weight, + offsets, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.mode, + self.sparse, + per_sample_weights, + self.include_last_offset, + self.padding_idx, + ) + + def extra_repr(self) -> str: + s = "{num_embeddings}, {embedding_dim}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + s += ", mode={mode}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + return s.format(**{k: repr(v) for k, v in self.__dict__.items()}) + + @classmethod + def from_pretrained( + cls, + embeddings: Tensor, + freeze: bool = True, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + include_last_offset: bool = False, + padding_idx: Optional[int] = None, + ) -> "EmbeddingBag": + r"""从给定的2维FloatTensor创建EmbeddingBag实例。 + + 参数: + embeddings (Tensor): 包含EmbeddingBag权重的FloatTensor, + 第一维作为num_embeddings,第二维作为embedding_dim。 + freeze (bool, 可选): 若为True,张量在学习过程中不更新, + 相当于embeddingbag.weight.requires_grad = False。默认True。 + max_norm (float, 可选): 同模块初始化文档说明,默认None。 + norm_type (float, 可选): 同模块初始化文档说明,默认2。 + scale_grad_by_freq (bool, 可选): 同模块初始化文档说明,默认False。 + mode (str, 可选): 同模块初始化文档说明,默认"mean"。 + sparse (bool, 可选): 同模块初始化文档说明,默认False。 + include_last_offset (bool, 可选): 同模块初始化文档说明,默认False。 + padding_idx (int, 可选): 同模块初始化文档说明,默认None。 + + 示例:: + + >>> # 
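EmbeddingBag's offsets mark where each variable-length bag starts in the flattened index list, and the reduction then runs per bag. The docstring's mode='sum' layout corresponds to this numpy computation:

```python
import numpy as np

weight = np.random.randn(10, 3).astype(np.float32)
flat_ids = np.array([1, 2, 4, 5, 4, 3, 2, 9])
offsets = np.array([0, 4])              # bag 0 = ids[0:4], bag 1 = ids[4:8]

bags = np.split(flat_ids, offsets[1:])  # split at every offset after the first
out = np.stack([weight[b].sum(axis=0) for b in bags])   # mode="sum"
assert out.shape == (2, 3)
```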
包含预训练权重的FloatTensor + >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight) + >>> # 获取索引1和0的嵌入袋(2D输入) + >>> input = torch.LongTensor([[1, 0]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embeddingbag(input) + tensor([[ 2.5000, 3.7000, 4.6500]]) + """ + assert ( + embeddings.dim() == 2 + ), "Embeddings参数应为2维张量" + rows, cols = embeddings.shape + embeddingbag = cls( + num_embeddings=rows, + embedding_dim=cols, + _weight=embeddings, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + mode=mode, + sparse=sparse, + include_last_offset=include_last_offset, + padding_idx=padding_idx, + ) + embeddingbag.weight.requires_grad = not freeze + return embeddingbag \ No newline at end of file diff --git a/front/py/deepx/tensor/__init__.py b/front/py/deepx/tensor/__init__.py index b46990e9..25fdac77 100644 --- a/front/py/deepx/tensor/__init__.py +++ b/front/py/deepx/tensor/__init__.py @@ -5,12 +5,13 @@ from .changeshape import * # 导入转置方法 from .init import * from .reduce import * - +from .io import * __all__ = [ 'Shape', 'Tensor', 'tensor_method', 'Number', + 'loadShape', # 'lt', 'gt', 'eq', # 'sin', 'cos', 'tan', # 'DType', diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py index a9776f3e..f5534541 100644 --- a/front/py/deepx/tensor/changeshape.py +++ b/front/py/deepx/tensor/changeshape.py @@ -49,6 +49,17 @@ def broadcastTo(self,shape:tuple[int],out:Union[Tensor,str]='')->Tensor: result=broadcastTo_func(self,shape,out) return result +@tensor_method +def gather(self,indices:Tensor,dim:int,out:Union[Tensor,str]='')->Tensor: + final_indices=indices + #TODO 当indices不是tensor时,需要转换为tensor + if not isinstance(indices,Tensor): + raise ValueError("indices must be a Tensor") + + from deepx.nn.functional import gather as gather_func + result=gather_func(self,final_indices,dim,out) + return result + # @tensor_method diff --git a/front/py/deepx/tensor/io.py b/front/py/deepx/tensor/io.py new file mode 100644 index 00000000..23027ee4 --- /dev/null +++ b/front/py/deepx/tensor/io.py @@ -0,0 +1,22 @@ +import yaml +import os +from deepx.tensor import Shape + +def loadShape(path:str)->tuple[str,Shape,str]: + filename = os.path.basename(path) + if filename.endswith('.shape'): + with open(path, 'r') as f: + shape = yaml.safe_load(f) + else: + raise ValueError("文件名必须以.shape结尾") + + tensor_name = filename[:-6] # 移除'.shape'后缀 + return (tensor_name,Shape(shape['shape']),shape['dtype']) + +def saveShape(t:Shape,path:str): + if path.endswith('.shape'): + with open(path, 'w') as f: + yaml.dump({'shape': list(t.shape), 'dtype': t._dtype,'size':t.numel(),'dim':t.ndim,'stride':list(t.stride)}, f) + else: + raise ValueError("文件名必须以.shape结尾") + diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py index 436f6bc6..dfc4f5a0 100644 --- a/front/py/deepx/tensor/shape.py +++ b/front/py/deepx/tensor/shape.py @@ -8,6 +8,7 @@ def __init__(self, self._size = int(np.prod(self.shape)) if self.shape else 0 # 计算 stride(步长) self._strides = self._compute_strides() + self._dtype=None @property def shape(self,dim=None): @@ -156,6 +157,7 @@ def broadcast_shape(cls,shape_a: tuple[int], shape_b: tuple[int]) -> tuple[int]: return tuple(result_shape) + @classmethod def reduceshape(cls,shape:tuple[int],dim:list[int],keepdim:bool)->tuple[int]: ndim = len(shape) @@ -169,4 +171,5 @@ def reduceshape(cls,shape:tuple[int],dim:list[int],keepdim:bool)->tuple[int]: for i, s in enumerate(shape)) 
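loadShape/saveShape above pin down the sidecar contract shared by the front end and the executor: the tensor's name comes from the filename minus the `.shape` suffix, and the YAML body carries shape, dtype, size, dim and stride. A round-trip sketch; the path is hypothetical, and `_dtype` is assigned directly because Shape does not take a dtype in its constructor:

```python
from deepx.tensor import Shape, saveShape, loadShape

s = Shape((3, 4, 5))
s._dtype = "float32"
saveShape(s, "/tmp/t1.shape")      # rejects any path not ending in .shape

name, shape, dtype = loadShape("/tmp/t1.shape")
print(name, shape.shape, dtype)    # t1 (3, 4, 5) float32
```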
else: return tuple(s for i, s in enumerate(shape) - if i not in unique_dim) \ No newline at end of file + if i not in unique_dim) + \ No newline at end of file diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index 5b909d88..69a328e9 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -119,6 +119,10 @@ def __invert__(self): def __matmul__(self, other:Union[Number,'Tensor']): return self.matmul(other) + #gather + def __getitem__(self, indices:'Tensor'): + return self.gather(indices) + #shape操作 @property def T(self) -> str: diff --git a/front/py/examples/1_tensor/2_saveload.py b/front/py/examples/1_tensor/2_saveload.py new file mode 100644 index 00000000..0b88d544 --- /dev/null +++ b/front/py/examples/1_tensor/2_saveload.py @@ -0,0 +1,21 @@ +from deepx.tensor import Tensor +from deepx.nn.functional import arange,save,load + +def saveloadfloat32(): + t1=arange(start=0,end=60 ,dtype='float32',name='t1').reshape_(3,4,5) + dir='/home/lipeng/model/deepxmodel/tester/' + + t2=load(dir+t1.name) + t2.print() + +def saveloadint8(): + t=arange(start=0,end=60 ,dtype='int8',name='t.int8').reshape_(3,4,5) + dir='/home/lipeng/model/deepxmodel/tester/' + + t2=load(dir+t.name) + t2.print() + + +if __name__ == "__main__": + saveloadfloat32() + saveloadint8() \ No newline at end of file diff --git a/front/py/examples/2_ir/4_changeshape_gather.py b/front/py/examples/2_ir/4_changeshape_gather.py new file mode 100644 index 00000000..e05013bd --- /dev/null +++ b/front/py/examples/2_ir/4_changeshape_gather.py @@ -0,0 +1,28 @@ +############-------PyTorch-------################ +import numpy as np +print() +indices_np = np.array([[0, 1, 2], [0, 1, 2]]) + +print(indices_np) + +import torch +torch_t = torch.arange(10*5, dtype=torch.float32).reshape(10,5) +torch_indices = torch.tensor(indices_np) +torch_t = torch.gather(torch_t, 1,torch_indices) +print(torch_t.shape) +print(torch_t) + + +############-------DEEPX-------################ + +from deepx import Tensor,arange,Shape +from deepx.nn.functional import load,save_npy + + +save_npy(indices_np,'/home/lipeng/model/deepxmodel/tester/testindices') + +t = arange(start=0,end=10*5,dtype='float32',name='t').reshape(10,5) +indices = load('/home/lipeng/model/deepxmodel/tester/testindices') +indices.print() +t = t.gather(indices,dim=1) +t.print() \ No newline at end of file diff --git a/front/py/examples/3_module/0_hg_tokenizer.py b/front/py/examples/3_module/0_hg_tokenizer.py new file mode 100644 index 00000000..3a54d6f4 --- /dev/null +++ b/front/py/examples/3_module/0_hg_tokenizer.py @@ -0,0 +1,43 @@ +from transformers import AutoTokenizer + +def init_tokenizer(model_path): + tokenizer = AutoTokenizer.from_pretrained(model_path) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + +tokenizer = init_tokenizer("/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Llama-8B") + +def test_tokenizer(): + # 测试编码功能 + text = "这是一个测试文本 aaa bbb" + tokens = tokenizer(text, return_tensors="np") + print(f"{text}==>{tokens.input_ids.shape} {tokens}") + + # 测试解码功能 + for i in range(tokens.input_ids.shape[0]): + for j in range(tokens.input_ids.shape[1]): + decoded_text = tokenizer.decode(tokens.input_ids[i][j]) + print(f"{i,j}->{decoded_text}") + + # 验证特殊tokens + print(f"PAD token:{tokenizer.pad_token_id}= {tokenizer.pad_token}") + print(f"EOS token:{tokenizer.eos_token_id}= {tokenizer.eos_token}") + print(f"Vocabulary size: {len(tokenizer)}") + + # 测试批处理 + batch_texts = ["测试文本一", "另一个测试文本", "第三个测试文本"] + batch_tokens = 
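For reference, the expected output of 4_changeshape_gather.py can be reproduced in plain numpy. torch.gather with dim=1 reads only the leading rows covered by the index tensor, which take_along_axis expresses as an explicit slice:

```python
import numpy as np

t = np.arange(10 * 5, dtype=np.float32).reshape(10, 5)
idx = np.array([[0, 1, 2], [0, 1, 2]])

# torch.gather(t, 1, idx): out[i, j] = t[i, idx[i, j]] for i < idx.shape[0]
out = np.take_along_axis(t[: idx.shape[0]], idx, axis=1)
print(out)   # [[0. 1. 2.]
             #  [5. 6. 7.]]
```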
tokenizer(batch_texts, padding=True, truncation=True, return_tensors='np') + print(f"批处理tokens shape: {batch_tokens.input_ids.shape}") + + # 测试最大长度限制 + long_text = "这是一个" * 100 + tokens_truncated = tokenizer(long_text, max_length=20, truncation=True, return_tensors="np") + print(f"截断后的tokens长度: {tokens_truncated.input_ids.shape[1]}") + + return True + +if __name__ == "__main__": + print() + test_result = test_tokenizer() + + print(f"Tokenizer测试完成: {'成功' if test_result else '失败'}") \ No newline at end of file diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py new file mode 100644 index 00000000..e69de29b diff --git a/front/py/examples/3_module/1_linear.dot b/front/py/examples/3_module/1_linear.dot deleted file mode 100644 index c52491f6..00000000 --- a/front/py/examples/3_module/1_linear.dot +++ /dev/null @@ -1,66 +0,0 @@ -// Computational Graph -digraph { - rankdir=TB - node [shape=record] - 126533329176464 [label="linear_0.weight -(4, 64)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635937968 [label="linear_0.bias -(4,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635951984 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635938016 [label="var_1 --0.12499999999999999" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635938976 [label="var_2 -0.12499999999999999" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635937488 [label="var_3 -0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939072 [label=uniform color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635937536 [label="var_4 --0.125" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939216 [label="var_5 -0.125" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939168 [label="var_6 -0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635939120 [label="input -(1, 64)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635940896 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635940944 [label="var_7 -1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635941280 [label="linear_0.weight.T -(64, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635941424 [label="vector_1 -[1, 0]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635944736 [label=transpose color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635944640 [label=matmul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635944784 [label="tensor_5 -(1, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635944976 [label="tensor_6 -(1, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635945216 [label=reshape 
color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635945168 [label="vector_2 -[1, 4]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635945360 [label=add color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 126531635945504 [label="tensor_7 -(1, 4)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 126531635951984 -> 126533329176464 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939072 -> 126531635937968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635938016 -> 126531635951984 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635938976 -> 126531635951984 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635937488 -> 126531635951984 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635937536 -> 126531635939072 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939216 -> 126531635939072 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939168 -> 126531635939072 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635940896 -> 126531635939120 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635940944 -> 126531635940896 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944736 -> 126531635941280 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126533329176464 -> 126531635944736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635941424 -> 126531635944736 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635939120 -> 126531635944640 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635941280 -> 126531635944640 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944640 -> 126531635944784 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635945216 -> 126531635944976 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635937968 -> 126531635945216 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635945168 -> 126531635945216 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944784 -> 126531635945360 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635944976 -> 126531635945360 [arrowsize=0.8 color=gray40 penwidth=1.2] - 126531635945360 -> 126531635945504 [arrowsize=0.8 color=gray40 penwidth=1.2] -} diff --git a/front/py/examples/3_module/1_linear.dot.svg b/front/py/examples/3_module/1_linear.dot.svg deleted file mode 100644 index 7b0806f8..00000000 --- a/front/py/examples/3_module/1_linear.dot.svg +++ /dev/null @@ -1,299 +0,0 @@ - - - - - - -%3 - - - -126533329176464 - -linear_0.weight -(4, 64) - - - -126531635944736 - -transpose - - - -126533329176464->126531635944736 - - - - - -126531635937968 - -linear_0.bias -(4,) - - - -126531635945216 - -reshape - - - -126531635937968->126531635945216 - - - - - -126531635951984 - -uniform - - - -126531635951984->126533329176464 - - - - - -126531635938016 - -var_1 --0.12499999999999999 - - - -126531635938016->126531635951984 - - - - - -126531635938976 - -var_2 -0.12499999999999999 - - - -126531635938976->126531635951984 - - - - - -126531635937488 - -var_3 -0 - - - -126531635937488->126531635951984 - - - - - -126531635939072 - -uniform - - - -126531635939072->126531635937968 - - - - - -126531635937536 - -var_4 --0.125 - - - -126531635937536->126531635939072 - - - - - -126531635939216 - -var_5 -0.125 - - - -126531635939216->126531635939072 - - - - - -126531635939168 - -var_6 -0 - - - -126531635939168->126531635939072 - - - - - -126531635939120 - -input -(1, 64) - - - -126531635944640 - -matmul - - - -126531635939120->126531635944640 - - - - - 
-126531635940896 - -constant - - - -126531635940896->126531635939120 - - - - - -126531635940944 - -var_7 -1 - - - -126531635940944->126531635940896 - - - - - -126531635941280 - -linear_0.weight.T -(64, 4) - - - -126531635941280->126531635944640 - - - - - -126531635941424 - -vector_1 -[1, 0] - - - -126531635941424->126531635944736 - - - - - -126531635944736->126531635941280 - - - - - -126531635944784 - -tensor_5 -(1, 4) - - - -126531635944640->126531635944784 - - - - - -126531635945360 - -add - - - -126531635944784->126531635945360 - - - - - -126531635944976 - -tensor_6 -(1, 4) - - - -126531635944976->126531635945360 - - - - - -126531635945216->126531635944976 - - - - - -126531635945168 - -vector_2 -[1, 4] - - - -126531635945168->126531635945216 - - - - - -126531635945504 - -tensor_7 -(1, 4) - - - -126531635945360->126531635945504 - - - - -