From c28e243e17d83fa3bcb7ffbd46c71ed5f081d475 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 1 Jun 2025 21:57:10 +0800 Subject: [PATCH 1/3] =?UTF-8?q?repeat:ompsimd=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E5=AE=8C=E6=88=90=EF=BC=8C=E5=BE=85=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- excuter/cpp-common/src/deepx/shape.cpp | 12 +++- .../src/deepx/shape_changeshape.cpp | 14 ++++ .../src/deepx/tensorfunc/changeshape.hpp | 36 +++++++++++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 22 +++++++ .../src/deepx/tf/changeshape.hpp | 64 +++++++++++++++++++ front/py/deepx/nn/functional/__init__.py | 2 +- .../transformer/models/llama/attention.py | 11 +++- .../models/llama/groupedquery_attention.py | 12 ++++ front/py/examples/1_tensor/2_newbig.py | 6 +- front/py/examples/1_tensor/getitem.py | 14 ++++ .../examples/2_ir/4_changeshape_broadcast.py | 12 ++-- .../2_ir/4_changeshape_broadcast_add.py | 2 +- front/py/examples/2_ir/changeshape_repeat.py | 18 ++++++ .../3_functional/changeshape_broadcast.py | 23 +++++++ front/py/examples/4_transformer/llama/llama_ | 0 15 files changed, 236 insertions(+), 12 deletions(-) create mode 100644 front/py/deepx/transformer/models/llama/groupedquery_attention.py create mode 100644 front/py/examples/1_tensor/getitem.py create mode 100644 front/py/examples/2_ir/changeshape_repeat.py create mode 100644 front/py/examples/3_functional/changeshape_broadcast.py create mode 100644 front/py/examples/4_transformer/llama/llama_ diff --git a/excuter/cpp-common/src/deepx/shape.cpp b/excuter/cpp-common/src/deepx/shape.cpp index 9f51a2e2..cc802d90 100644 --- a/excuter/cpp-common/src/deepx/shape.cpp +++ b/excuter/cpp-common/src/deepx/shape.cpp @@ -66,11 +66,19 @@ namespace deepx } std::cout << "]" << std::endl; } + //linearat + //linearat 支持和strides不同dim的indice索引 + //当strides.size() < indices.size()时,indices的前面部分会默认都是unsqueeze的维度,从而被忽略。在repeat方法中,用到。 + //当strides.size() > indices.size()时,strides的后面部分会被忽略不计算 int Shape::linearat(const std::vector &indices) const{ int idx=0; - for(int i=0;i shape.size()) { + indices_i=indices.size()-shape.size(); } + for(;stride_i Shape::linearto(int idx_linear) const{ diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.cpp b/excuter/cpp-common/src/deepx/shape_changeshape.cpp index d3a89d90..c02d2f5c 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.cpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.cpp @@ -142,4 +142,18 @@ namespace deepx } return output_shape; } + + // repeat + std::vector repeatShape(const std::vector &src, const std::vector &repeats) + { + if (src.size() != repeats.size()) { + throw std::invalid_argument("Shape and repeats must have the same number of dimensions"); + } + + std::vector dest(src.size()); + for (size_t i = 0; i < src.size(); ++i) { + dest[i] = src[i] * repeats[i]; + } + return dest; + } } \ No newline at end of file diff --git a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp index 100f408b..11c4b2b5 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/changeshape.hpp @@ -74,6 +74,42 @@ namespace deepx::tensorfunc indexselectDispatcher::indexselect(input, indices, axis, output); } + //repeat + template + struct repeatDispatcher + { + static void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) = delete; + }; + //Repeats:The number of times to repeat this tensor along each 
dimension + // https://docs.pytorch.org/docs/stable/generated/torch.Tensor.repeat.html#torch.Tensor.repeat + template + void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) + { + repeatDispatcher::repeat(A, repeats, B); + } + + //repeat_interleave + template + struct repeat_interleaveDispatcher + { + static void repeat_interleave(const Tensor &A, const int repeats, Tensor &B) = delete; + static void repeat_interleave(const Tensor &A, const Tensor &repeats, Tensor &B) = delete; + }; + template + void repeat_interleave(const Tensor &A, const int repeats, Tensor &B) + { + repeat_interleaveDispatcher::repeat_interleave(A, repeats, B); + } + template + void repeat_interleave(const Tensor &A, const Tensor &repeats, Tensor &B) + { + repeat_interleaveDispatcher::repeat_interleave(A, repeats, B); + } + + + + + // // split // // https://onnx.ai/onnx/operators/onnx__Split.html // template diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index a92a9571..0e04a63b 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -177,6 +177,28 @@ namespace deepx::tensorfunc } }; + // repeat + template + struct repeatDispatcher + { + static void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) + { + auto new_shape = repeatShape(A.shape.shape, repeats); + if (new_shape.empty() || new_shape != B.shape.shape) + { + throw TensorShapeError("Repeat shape mismatch"); + } + B.shape.rangeParallel(B.shape.dim(), [&A,B,&repeats](const int idx, const std::vector &indices, ThreadLocalVectors &tlv) + { + for (size_t i = 0; i < A.shape.dim(); ++i) { + tlv.get(0)[i] = indices[i] / repeats[i]; + } + B.data[idx] = A.data[A.shape.linearat(tlv.get(0))]; + },{B.shape.dim()}); + } + }; + + // template // void split(const Tensor &tensor, const int axis, std::vector *> &results) // { diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 0f64432c..8ffd5af4 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -446,5 +446,69 @@ namespace deepx::tf return 0; }; }; + + // repeat + template + class Repeat : public TF + { + public: + Repeat(const vector &args, const vector &returns) + { + this->name = "repeat"; + this->metadata.author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T1.repeat(repeats=[4,3,2])->T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) + { + return 1; + } + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector new_shape = this->getvector(1, true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + 
broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; } #endif // DEEPX_TF_CHANGESHAPE_HPP \ No newline at end of file diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index d8ac8f44..da04e664 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -30,7 +30,7 @@ "invert", "matmul", "reducemax","reducemin","sum","prod", - "reshape","permute","transpose","concat","broadcastTo","indexselect", + "reshape","permute","transpose","concat","broadcastTo","broadcast_to","indexselect", #functional "relu","sigmoid","swish","silu", diff --git a/front/py/deepx/transformer/models/llama/attention.py b/front/py/deepx/transformer/models/llama/attention.py index 490501e4..325d6a27 100644 --- a/front/py/deepx/transformer/models/llama/attention.py +++ b/front/py/deepx/transformer/models/llama/attention.py @@ -1,6 +1,6 @@ from typing import Optional,Tuple from deepx.nn.modules import Module,Linear -from deepx import Tensor,repeat_kv,matmul,softmax,concat,arange,dropout as dropout_func +from deepx import Tensor,matmul,softmax,concat,arange,dropout as dropout_func @@ -17,7 +17,14 @@ def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_d q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed - + +def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py # 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 diff --git a/front/py/deepx/transformer/models/llama/groupedquery_attention.py b/front/py/deepx/transformer/models/llama/groupedquery_attention.py new file mode 100644 index 00000000..58f56a00 --- /dev/null +++ b/front/py/deepx/transformer/models/llama/groupedquery_attention.py @@ -0,0 +1,12 @@ +from typing import Optional,Tuple +from deepx.nn.modules import Module,Linear +from deepx import Tensor,matmul,softmax,concat,arange,dropout as dropout_func + + +def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + diff --git a/front/py/examples/1_tensor/2_newbig.py 
b/front/py/examples/1_tensor/2_newbig.py index 2d0b4bbf..34c76c09 100644 --- a/front/py/examples/1_tensor/2_newbig.py +++ b/front/py/examples/1_tensor/2_newbig.py @@ -6,7 +6,11 @@ def newtensor(dtype): for i in range(0,20): t=newtensor((1,20,4096),dtype=dtype) # t.print() - +a=None +def f(): + a="hello" +f() +print(a) if __name__ == "__main__": args=sys.argv[1:] diff --git a/front/py/examples/1_tensor/getitem.py b/front/py/examples/1_tensor/getitem.py new file mode 100644 index 00000000..c0746762 --- /dev/null +++ b/front/py/examples/1_tensor/getitem.py @@ -0,0 +1,14 @@ + +def deepx_getitem(): + from deepx import newtensor + t=newtensor((2,3,4)).full_(1) + t2=t[None, :, None] + t2.print() +def torch_getitem(): + import torch + t=torch.full((2,3,4),1) + t2=t[None, :, None] + print(t2) +if __name__ == "__main__": + deepx_getitem() + torch_getitem() \ No newline at end of file diff --git a/front/py/examples/2_ir/4_changeshape_broadcast.py b/front/py/examples/2_ir/4_changeshape_broadcast.py index 15ceb34f..3c8f77cd 100644 --- a/front/py/examples/2_ir/4_changeshape_broadcast.py +++ b/front/py/examples/2_ir/4_changeshape_broadcast.py @@ -2,10 +2,13 @@ print() import torch -a=torch.arange(4*2*3).reshape(4,2,3) -b=torch.arange(2*1).reshape(2,1) -bb_torch = torch.broadcast_to(b, (4,2,3)) +torch_a=torch.arange(4*2*3).reshape(4,2,3) +torch_b=torch.arange(2*1).reshape(2,1) +bb_torch = torch.broadcast_to(torch_b, (4,2,3)) print(bb_torch) +torch_a[None:,] + + ########====DEEPX====######## from deepx import Tensor,arange,broadcastTo @@ -15,6 +18,5 @@ bb=b.broadcastTo( a.shape,out="b.broadcasted") bb.print() - - +c=a[None:,] diff --git a/front/py/examples/2_ir/4_changeshape_broadcast_add.py b/front/py/examples/2_ir/4_changeshape_broadcast_add.py index 229f1332..7ebd743b 100644 --- a/front/py/examples/2_ir/4_changeshape_broadcast_add.py +++ b/front/py/examples/2_ir/4_changeshape_broadcast_add.py @@ -1,5 +1,5 @@ ########====DEEPX====######## -from deepx import Tensor,ones +from deepx import ones a=ones( 4,2,3 ,name="a") b=ones( 2,1 ,name='b') diff --git a/front/py/examples/2_ir/changeshape_repeat.py b/front/py/examples/2_ir/changeshape_repeat.py new file mode 100644 index 00000000..a915ca7a --- /dev/null +++ b/front/py/examples/2_ir/changeshape_repeat.py @@ -0,0 +1,18 @@ +import torch + +# 正确:repeats为一维张量 +x = torch.tensor([[1, 2], [3, 4]]) +repeats = torch.tensor([1, 2]) # 一维张量 +torch.repeat_interleave(x, repeats, dim=0) +# 输出: +# tensor([[1, 2], +# [3, 4], +# [3, 4]]) + +# 错误:repeats为二维张量 +repeats_2d = torch.tensor([[1, 2], [3, 4]]) # 二维张量 +try: + torch.repeat_interleave(x, repeats_2d, dim=0) +except RuntimeError as e: + print(f"错误: {e}") +# 输出: \ No newline at end of file diff --git a/front/py/examples/3_functional/changeshape_broadcast.py b/front/py/examples/3_functional/changeshape_broadcast.py new file mode 100644 index 00000000..87210412 --- /dev/null +++ b/front/py/examples/3_functional/changeshape_broadcast.py @@ -0,0 +1,23 @@ + +print() +#######-----------------torch-----------------####### +import torch +torch_x = torch.arange(6).reshape(1,2,3) # shape=(2,3) +torch_y = torch_x.broadcast_to((3,2,3)) # 需要原维度为1 +print(torch_y) + +torch_x2=torch_x.repeat_interleave(dim=0, repeats=3) +print(torch_x2) + + +#######-----------------deepx-----------------####### +from deepx import Tensor,broadcast_to,arange +deepx_x = arange(0,6).reshape_((1,2,3)) # shape=(2,3) +deepx_y = broadcast_to(deepx_x, (3,2,3)) # 需要原维度为1 +deepx_y.print() + + + + + + diff --git a/front/py/examples/4_transformer/llama/llama_ 
b/front/py/examples/4_transformer/llama/llama_ new file mode 100644 index 00000000..e69de29b From 19adfc8c321463a286d78d22a8beecd43114b367 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 1 Jun 2025 22:56:16 +0800 Subject: [PATCH 2/3] =?UTF-8?q?repeat:ompsimd=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E5=AE=8C=E6=88=90=EF=BC=8C=E5=BE=85=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/excuter/op-mem-ompsimd/list.md | 127 +++++++++--------- .../src/deepx/shape_changeshape.hpp | 3 + excuter/op-mem-ompsimd/src/client/tfs.cpp | 10 ++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 5 +- .../src/deepx/tf/changeshape.hpp | 126 ++++++++--------- front/py/deepx/nn/functional/__init__.py | 2 +- front/py/deepx/nn/functional/authormap.py | 1 + .../nn/functional/leaffunc_changeshape.py | 13 ++ .../py/deepx/nn/functional/rtf_changeshape.py | 8 ++ front/py/deepx/tensor/changeshape.py | 6 + front/py/deepx/tensor/shape.py | 7 +- .../py/examples/2_ir/4_changeshape_repeat.py | 17 +++ 12 files changed, 195 insertions(+), 130 deletions(-) create mode 100644 front/py/examples/2_ir/4_changeshape_repeat.py diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index e259031c..3ec2089e 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -2,106 +2,107 @@ 本页面由 `excuter/op-mem-ompsimd 生成,请勿手动修改 -### arg +### matmul | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| matmul | cblas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | +| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | -### tensorlife +### init | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | -| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector:shape)->(tensor:t) | -| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var:shape)->(tensor:t) | -| deltensor | none | del->T1 | deltensor()->(tensor:t) | -| copytensor | none | T1.data->T2.data | copytensor(tensor:src)->(tensor:dst) | +| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | +| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:std, var:seed)->(tensor:t) | ### io | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| loadtensordata | none | loadtensordata(path)->tensor.data | loadtensordata(var:path)->(tensor:t) | -| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| load | none | mem.load(path) | load(var:path)->() | | print | miaobyte | print(T1) | print(tensor:t)->() | | print | miaobyte | print(T1) | print(tensor:t, var:format)->() | -| load | none | mem.load(path) | load(var:path)->() | +| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| loadtensordata | none | loadtensordata(path)->tensor.data | loadtensordata(var:path)->(tensor:t) | -### matmul 
+### arg | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| matmul | cblas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | -| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | +| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -### init +### tensorlife | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:std, var:seed)->(tensor:t) | -| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | -| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | -| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | -| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| copytensor | none | T1.data->T2.data | copytensor(tensor:src)->(tensor:dst) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector:shape)->(tensor:t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var:shape)->(tensor:t) | +| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | +| deltensor | none | del->T1 | deltensor()->(tensor:t) | ### elementwise | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| switch | miaobyte | C=switch([tensors],case) | switch(listtensor:tensors, tensor:cases)->(tensor:C) | -| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | -| notequal | miaobyte | notequal(T1,T2)->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | -| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor:A, var:scalar, var:eposilon)->(tensor:mask) | -| min | miaobyte | T3=min(T1,T2) | min(tensor:A, tensor:B)->(tensor:C) | -| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | -| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | -| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | -| less | miaobyte | mask=less(T1,T2) | less(tensor:A, tensor:B)->(tensor:mask) | -| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor:A, var:scalar)->(tensor:C) | -| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:a)->(tensor:c) | -| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor:A, var:scalar)->(tensor:C) | -| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | -| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:a, var:scalar)->(tensor:c) | +| pow | miaobyte | T3=T1^T2 | pow(tensor:A, tensor:B)->(tensor:C) | +| max | miaobyte | T3=max(T1,T2) | max(tensor:A, tensor:B)->(tensor:C) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | +| equal | miaobyte | equal(T1,T2)->mask | equal(tensor:A, tensor:B, var:eposilon)->(tensor:mask) | +| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | +| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:a, var:scalar)->(tensor:c) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | +| sub | miaobyte | T3=T1-T2 | sub(tensor:a, tensor:b)->(tensor:c) | +| add | cblas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| add | miaobyte | T3=T1+T2 | add(tensor:a, 
tensor:b)->(tensor:c) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:A)->(tensor:C) | +| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | +| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var:scalar, tensor:A)->(tensor:C) | +| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | | greater | miaobyte | mask=greater(T1,T2) | greater(tensor:A, tensor:B)->(tensor:mask) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:a, var:scalar)->(tensor:c) | +| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | +| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor:A, var:scalar)->(tensor:C) | | lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor:A, var:scalar)->(tensor:mask) | -| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | +| less | miaobyte | mask=less(T1,T2) | less(tensor:A, tensor:B)->(tensor:mask) | | notequalscalar | miaobyte | mask=notequal(T1,scalar) | notequalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:a)->(tensor:c) | +| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | | minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor:A, var:scalar)->(tensor:C) | -| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var:scalar, tensor:A)->(tensor:C) | -| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | -| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:A)->(tensor:C) | -| add | cblas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| sub | miaobyte | T3=T1-T2 | sub(tensor:a, tensor:b)->(tensor:c) | -| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | -| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:a, var:scalar)->(tensor:c) | -| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | -| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | -| equal | miaobyte | equal(T1,T2)->mask | equal(tensor:A, tensor:B, var:eposilon)->(tensor:mask) | -| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | -| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | -| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | -| max | miaobyte | T3=max(T1,T2) | max(tensor:A, tensor:B)->(tensor:C) | -| pow | miaobyte | T3=T1^T2 | pow(tensor:A, tensor:B)->(tensor:C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | +| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | +| min | miaobyte | T3=min(T1,T2) | min(tensor:A, tensor:B)->(tensor:C) | +| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | +| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor:A, var:scalar, var:eposilon)->(tensor:mask) | +| notequal | miaobyte | notequal(T1,T2)->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | +| switch | miaobyte | C=switch([tensors],case) | switch(listtensor:tensors, tensor:cases)->(tensor:C) | +| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor:A, var:scalar)->(tensor:C) | -### reduce +### changeshape | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) 
| prod(tensor:A, vector:axis, var:keepdims)->(tensor:B) | -| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:axis, var:keepdims)->(tensor:B) | -| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:axis, var:keepdims)->(tensor:B) | -| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor:A, vector:dim_order)->(tensor:C) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | +| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor:A, tensor:index, var:axis)->(tensor:B) | +| repeat | miaobyte | T1.repeat(repeats=[4,3,2])->T2 | repeat(tensor:A, vector:repeats)->(tensor:B) | -### changeshape +### reduce | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor:A, tensor:index, var:axis)->(tensor:B) | -| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | -| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | -| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor:A, vector:dim_order)->(tensor:C) | -| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:axis, var:keepdims)->(tensor:B) | diff --git a/excuter/cpp-common/src/deepx/shape_changeshape.hpp b/excuter/cpp-common/src/deepx/shape_changeshape.hpp index 71cbcb63..65a36670 100644 --- a/excuter/cpp-common/src/deepx/shape_changeshape.hpp +++ b/excuter/cpp-common/src/deepx/shape_changeshape.hpp @@ -73,5 +73,8 @@ namespace deepx //indexselect vector indexselectShape(const vector &input_shape, const vector &index_shape, const int axis); + + //repeat + std::vector repeatShape(const std::vector &src, const std::vector &repeats); } #endif // DEEPX_SHAPE_CHANGESHAPE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 04b09aeb..e0d9af05 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -616,6 +616,16 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // repeat author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("repeats", DataCategory::Vector, Precision::Int32) + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); } // // reduce void register_reduce(TfFactory 
&tffactory) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 0e04a63b..901ac913 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -188,12 +188,13 @@ namespace deepx::tensorfunc { throw TensorShapeError("Repeat shape mismatch"); } - B.shape.rangeParallel(B.shape.dim(), [&A,B,&repeats](const int idx, const std::vector &indices, ThreadLocalVectors &tlv) + B.shape.rangeParallel(B.shape.dim(), [&A,&B,&repeats](const int idx, const std::vector &indices, ThreadLocalVectors &tlv) { for (size_t i = 0; i < A.shape.dim(); ++i) { tlv.get(0)[i] = indices[i] / repeats[i]; } - B.data[idx] = A.data[A.shape.linearat(tlv.get(0))]; + int idx_A=A.shape.linearat(tlv.get(0)); + B.data[idx] = A.data[idx_A]; },{B.shape.dim()}); } }; diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp index 8ffd5af4..1fc44285 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/changeshape.hpp @@ -109,7 +109,7 @@ namespace deepx::tf return 1; } Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - vector dim_order = this->getvector(1,true); + vector dim_order = this->getvector(1, true); Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; if (input_type != output_type) { @@ -448,67 +448,67 @@ namespace deepx::tf }; // repeat - template - class Repeat : public TF - { - public: - Repeat(const vector &args, const vector &returns) - { - this->name = "repeat"; - this->metadata.author = Author::name(); - this->tftype = "changeshape"; - this->args = args; - this->returns = returns; - } - - string math_formula() const override - { - return "T1.repeat(repeats=[4,3,2])->T2"; - } - shared_ptr clone() const override - { - return make_shared>(*this); - } - int run(shared_ptr mem, string &error) override - { - if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) - { - return 1; - } - Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; - vector new_shape = this->getvector(1, true); - Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; - if (input_type != output_type) - { - error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); - return 1; - } - switch (input_type) - { - case Precision::Float64: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Float32: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int64: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int32: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int16: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - case Precision::Int8: - broadcastTo(*mem->gettensor(this->args[0].textvalue), new_shape, *mem->gettensor(this->returns[0].textvalue)); - break; - default: - error = "Unsupported type: " + precision_str(input_type); - 
return 1; - } - return 0; - } - }; + template + class Repeat : public TF + { + public: + Repeat(const vector &args, const vector &returns) + { + this->name = "repeat"; + this->metadata.author = Author::name(); + this->tftype = "changeshape"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T1.repeat(repeats=[4,3,2])->T2"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error) != 0) + { + return 1; + } + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector repeats = this->getvector(1, true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + repeat(*mem->gettensor(this->args[0].textvalue), repeats, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; } #endif // DEEPX_TF_CHANGESHAPE_HPP \ No newline at end of file diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index da04e664..3f5d33cf 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -30,7 +30,7 @@ "invert", "matmul", "reducemax","reducemin","sum","prod", - "reshape","permute","transpose","concat","broadcastTo","broadcast_to","indexselect", + "reshape","permute","transpose","concat","broadcastTo","broadcast_to","indexselect",'repeat', #functional "relu","sigmoid","swish","silu", diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 696a8d5d..6e8bc602 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -54,6 +54,7 @@ 'broadcastTo':'miaobyte', 'concat':'miaobyte', 'indexselect':'miaobyte', + 'repeat':'miaobyte', #matmul # 'matmul':'miaobyte', 'matmul':'cublas', diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index a69f92d6..2da83802 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -93,6 +93,19 @@ def indexselect(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str] rtf_indexselect(input,indices,gatheraxis,outtensor,defaultauthor['indexselect']) return outtensor +def repeat(input:Tensor,repeats:tuple[int,...],out:Union[Tensor,str]=''): + assert isinstance(repeats,tuple) + assert 
input.Shape.ndim==len(repeats) + for i in repeats: + assert isinstance(i,int) and i>0 + outtensor=out + if isinstance(out,str) or out is None: + outshape=Shape.repeatshape(input.shape,repeats) + outtensor=newtensor(outshape,dtype=input.dtype,name=out) + from .rtf_changeshape import rtf_repeat + rtf_repeat(input,repeats,outtensor,defaultauthor['repeat']) + return outtensor + # def unsqueeze(t:Tensor,dim:int)->Tensor: # # 确保dim是有效的 # if dim < -t.ndim-1 or dim > t.ndim: diff --git a/front/py/deepx/nn/functional/rtf_changeshape.py b/front/py/deepx/nn/functional/rtf_changeshape.py index 37b38bad..7c0b144b 100644 --- a/front/py/deepx/nn/functional/rtf_changeshape.py +++ b/front/py/deepx/nn/functional/rtf_changeshape.py @@ -35,3 +35,11 @@ def rtf_indexselect(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miao ir=DeepxIR("indexselect", args, returns,author) send(ir) +def rtf_repeat(input:Tensor,repeats:tuple[int,...],out:Tensor,author='miaobyte'): + assert isinstance(repeats,tuple) + for i in repeats: + assert isinstance(i,int) and i>0 + args=[Param.tensor(input),Param.vector(repeats,'int32')] + returns=[Param.tensor(out)] + ir=DeepxIR("repeat", args, returns,author) + send(ir) \ No newline at end of file diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py index f359c2de..39b415ad 100644 --- a/front/py/deepx/tensor/changeshape.py +++ b/front/py/deepx/tensor/changeshape.py @@ -74,6 +74,12 @@ def unsqueeze(self,dim:int)->Tensor: result=unsqueeze_func(self,dim) return result +@tensor_method +def repeat(self,repeats:tuple[int,...])->Tensor: + from deepx.nn.functional import repeat as repeat_func + result=repeat_func(self,repeats) + return result + # @tensor_method # def expand(self,shape:tuple)->Tensor: # from deepx.nn.functional import expand as expand_func diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py index cb0d3c8e..1cd165c0 100644 --- a/front/py/deepx/tensor/shape.py +++ b/front/py/deepx/tensor/shape.py @@ -199,4 +199,9 @@ def save(self,path:str): with open(path, 'w') as f: yaml.dump({'shape': list(self.shape), 'dtype': self._dtype,'size':self.numel(),'dim':self.ndim,'stride':list(self.stride)}, f) else: - raise ValueError("文件名必须以.shape结尾") \ No newline at end of file + raise ValueError("文件名必须以.shape结尾") + + @classmethod + def repeatshape(cls,input_shape:tuple[int,...],repeat:tuple[int,...])->tuple[int,...]: + assert len(input_shape)== len(repeat) + return tuple(d * r for d, r in zip(input_shape, repeat)) diff --git a/front/py/examples/2_ir/4_changeshape_repeat.py b/front/py/examples/2_ir/4_changeshape_repeat.py new file mode 100644 index 00000000..6bdcbd8f --- /dev/null +++ b/front/py/examples/2_ir/4_changeshape_repeat.py @@ -0,0 +1,17 @@ +print() +############-------PyTorch-------################ + +import torch +torch_t1 = torch.arange(60, dtype=torch.float32).reshape(3, 4,5) +print(torch_t1) +torch_t2=torch_t1.repeat([1,2,3]) +print(torch_t2) + + +############-------Deepx-------################ + +from deepx import arange +t1 = arange(0,60).reshape_((3, 4,5)) +t1.print() +t2=t1.repeat((1,2,3)) +t2.print() \ No newline at end of file From d39e7fedb7fda5a150dcc04f5ae0b32c3dcb1133 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Mon, 2 Jun 2025 15:43:08 +0800 Subject: [PATCH 3/3] =?UTF-8?q?repeat:cuda=20=E5=AE=9E=E7=8E=B0=E5=AE=8C?= =?UTF-8?q?=E6=88=90=EF=BC=8C=E5=AE=8C=E6=88=90=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
doc/excuter/op-mem-cuda/list.md | 125 +++++++++--------- excuter/op-mem-cuda/src/client/tfs.cpp | 11 ++ .../deepx/tensorfunc/changeshape_miaobyte.cu | 117 +++++++++++++++- .../deepx/tensorfunc/changeshape_miaobyte.cuh | 16 +++ .../deepx/tensorfunc/changeshape_miaobyte.hpp | 19 ++- .../tensorfunc/elementwise_miaobyte_basic.cu | 1 + .../op-mem-cuda/src/deepx/tf/changeshape.hpp | 68 ++++++++++ .../src/deepx/tf/elementwise_basic.hpp | 1 - front/py/deepx/scheduler/client/udpconn.py | 2 +- 9 files changed, 294 insertions(+), 66 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index e33b2df3..69966acf 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -2,105 +2,106 @@ 本页面由 `excuter/op-mem-cuda 生成,请勿手动修改 -### arg +### matmul | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| matmul | cublas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | -### tensorlife +### init | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | -| newtensor | none | T1 = zeros(shape) | newtensor(vector:shape)->(tensor:tensor1) | -| newtensor | none | T1 = zeros(shape) | newtensor(var:shape)->(tensor:tensor1) | -| deltensor | none | del->T1 | deltensor()->(tensor:t) | -| copytensor | none | T2.data = T1.data | copytensor(tensor:src)->(tensor:dst) | +| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | +| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:stddev, var:seed)->(tensor:t) | ### io | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var:path)->(tensor:t) | -| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| load | none | load(path) | load(var:path)->() | | print | miaobyte | print(T1) | print(tensor:t)->() | | print | miaobyte | print(T1) | print(tensor:t, var:format)->() | -| load | none | load(path) | load(var:path)->() | +| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var:path)->(tensor:t) | -### matmul +### arg | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| matmul | cublas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | +| argset | none | argvalue->argname | argset(var:value)->(var:name) | +| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | -### init +### tensorlife | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:stddev, var:seed)->(tensor:t) | -| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | -| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, 
var:seed)->(tensor:t) | -| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | -| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | +| copytensor | none | T2.data = T1.data | copytensor(tensor:src)->(tensor:dst) | +| newtensor | none | T1 = zeros(shape) | newtensor(vector:shape)->(tensor:tensor1) | +| newtensor | none | T1 = zeros(shape) | newtensor(var:shape)->(tensor:tensor1) | +| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | +| deltensor | none | del->T1 | deltensor()->(tensor:t) | ### elementwise | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor:tensors, tensor:cases)->(tensor:result) | -| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | -| notequal | miaobyte | T1!=T2->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | -| equalscalar | miaobyte | T1==scalar->mask | equalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | -| min | miaobyte | T3=min(T1, T2) | min(tensor:A, tensor:B)->(tensor:C) | -| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | -| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | -| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | -| less | miaobyte | mask=compare(T1, T2) | less(tensor:A, tensor:B)->(tensor:mask) | -| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor:A, var:scalar)->(tensor:C) | -| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:A)->(tensor:C) | -| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor:A, var:scalar)->(tensor:C) | -| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | -| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:A, var:b)->(tensor:C) | +| pow | miaobyte | T3=pow(T1, T2) | pow(tensor:A, tensor:B)->(tensor:C) | +| max | miaobyte | T3=max(T1, T2) | max(tensor:A, tensor:B)->(tensor:C) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | +| equal | miaobyte | T1==T2->mask | equal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | +| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:A, var:b)->(tensor:C) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | +| sub | miaobyte | T3=T1-T2 | sub(tensor:A, tensor:B)->(tensor:C) | +| add | cublas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:a)->(tensor:b) | +| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | +| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var:scalar, tensor:A)->(tensor:C) | +| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | | greater | miaobyte | mask=compare(T1, T2) | greater(tensor:A, tensor:B)->(tensor:mask) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:A, var:b)->(tensor:C) | +| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | +| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor:A, var:scalar)->(tensor:C) | | lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor:A, 
var:scalar)->(tensor:mask) | -| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | +| less | miaobyte | mask=compare(T1, T2) | less(tensor:A, tensor:B)->(tensor:mask) | | notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:A)->(tensor:C) | +| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | | minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor:A, var:scalar)->(tensor:C) | -| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var:scalar, tensor:A)->(tensor:C) | -| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | -| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:a)->(tensor:b) | -| add | cublas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | -| sub | miaobyte | T3=T1-T2 | sub(tensor:A, tensor:B)->(tensor:C) | -| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | -| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:A, var:b)->(tensor:C) | -| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | -| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | -| equal | miaobyte | T1==T2->mask | equal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | -| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | -| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | -| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | -| max | miaobyte | T3=max(T1, T2) | max(tensor:A, tensor:B)->(tensor:C) | -| pow | miaobyte | T3=pow(T1, T2) | pow(tensor:A, tensor:B)->(tensor:C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | +| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | +| min | miaobyte | T3=min(T1, T2) | min(tensor:A, tensor:B)->(tensor:C) | +| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | +| equalscalar | miaobyte | T1==scalar->mask | equalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| notequal | miaobyte | T1!=T2->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | +| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor:tensors, tensor:cases)->(tensor:result) | +| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor:A, var:scalar)->(tensor:C) | -### reduce +### changeshape | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:dims, var:keepdims)->(tensor:B) | -| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:dims, var:keepdims)->(tensor:B) | -| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:dims, var:keepdims)->(tensor:B) | -| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor:A, vector:dim_order)->(tensor:C) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | +| broadcastTo | miaobyte | T2 = 
T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | +| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor:A, tensor:indices, var:axis)->(tensor:B) | +| repeat | miaobyte | T2 = T1.repeat(repeats=[3 4 5]) | repeat(tensor:A, vector:repeats)->(tensor:B) | -### changeshape +### reduce | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor:A, tensor:indices, var:axis)->(tensor:B) | -| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | -| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | -| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor:A, vector:dim_order)->(tensor:C) | -| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:dims, var:keepdims)->(tensor:B) | diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 85fd5be6..ef197b68 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -571,6 +571,17 @@ namespace deepx::tf { Param("B", DataCategory::Tensor, Precision::Any), }))); + // repeat + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("repeats", DataCategory::Vector, Precision::Int32), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); + } // reduce void register_reduce(TfFactory &tffactory) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index 6cd97295..aeb871c6 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -103,6 +103,7 @@ namespace deepx::tensorfunc template void launch_transpose(const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose(const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose<__half>(const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose(const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template void launch_transpose(const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, 
const int len, const int *dimOrder); @@ -546,6 +547,120 @@ namespace deepx::tensorfunc const int32_t *index, const int *indexStrides, const int indexDim, const int gatherAxis, int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); -} + + + //repeat + template + __global__ void repeat_kernel( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides,const int outputlen,const int dim){ + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < outputlen; thread_id += grid_stride) + { + // 输出索引 + int output_indices[DIM]; + linearTo(outputStrides, dim, output_indices, thread_id); + + // 输入索引 + int input_indices[DIM]; + for (int i = 0; i < dim; ++i) + { + input_indices[i] = output_indices[i] / repeats[i]; + } + int inputIdx = linearAt(inputStrides, dim, input_indices); + int outputIdx = linearAt(outputStrides, dim, output_indices); + output[outputIdx] = input[inputIdx]; + } + } + + + template + void launch_repeat( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides, const int outputlen,const int dim){ + + + auto [numBlocks, blockSize] = BestDims(outputlen); + // input + cudaVector inputStrides_d(inputStrides, dim, cudaMemcpyHostToDevice); + // output + cudaVector outputStrides_d(outputStrides, dim, cudaMemcpyHostToDevice); + // repeats + cudaVector repeats_d(repeats, dim, cudaMemcpyHostToDevice); + + switch (dim) + { + case 1: + repeat_kernel<1, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 2: + repeat_kernel<2, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 3: + repeat_kernel<3, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 4: + repeat_kernel<4, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 5: + repeat_kernel<5, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 6: + repeat_kernel<6, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 7: + repeat_kernel<7, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 8: + repeat_kernel<8, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 9: + repeat_kernel<9, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 10: + repeat_kernel<10, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 11: + repeat_kernel<11, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + case 12: + repeat_kernel<12, T><<>>(input, inputStrides_d.data, repeats_d.data, output, outputStrides_d.data, outputlen, dim); + break; + default: + throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); + } + } + + template void launch_repeat(const double *input, const int *inputStrides, + const int *repeats, + double *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const float *input, const int *inputStrides, + const int *repeats, + float 
*output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const nv_bfloat16 *input, const int *inputStrides, + const int *repeats, + nv_bfloat16 *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat<__half>(const __half *input, const int *inputStrides, + const int *repeats, + __half *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int64_t *input, const int *inputStrides, + const int *repeats, + int64_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int32_t *input, const int *inputStrides, + const int *repeats, + int32_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int16_t *input, const int *inputStrides, + const int *repeats, + int16_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const int8_t *input, const int *inputStrides, + const int *repeats, + int8_t *output, const int *outputStrides, const int outputlen,const int dim); + template void launch_repeat(const bool *input, const int *inputStrides, + const int *repeats, + bool *output, const int *outputStrides, const int outputlen,const int dim); +}// namespace deepx #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh index 26c40851..d3845ee4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ -65,5 +65,21 @@ namespace deepx::tensorfunc const GatherAxisT *indices,const int *indicesStrides,const int indicesDim, const int gatherAxis, T *output,const int *outputStrides,const int outputDim,const int outputlen); + + + // repeat + template + __global__ void repeat_kernel( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides, const int outputlen, + const int dim); + + template + void launch_repeat( + const T *input, const int *inputStrides, + const int *repeats, + T *output, const int *outputStrides, const int outputlen, + const int dim); }; #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 922c8ddd..818e4bab 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -117,7 +117,7 @@ namespace deepx::tensorfunc } }; - //indexselect + //indexselectmoe_infer template struct indexselectDispatcher { @@ -135,5 +135,22 @@ namespace deepx::tensorfunc output.data,output.shape.strides.data(),output.shape.dim(),output.shape.size); } }; + + //repeat + template + struct repeatDispatcher + { + static void repeat(const Tensor &A, const std::vector &repeats, Tensor &B) + { + auto new_shape = repeatShape(A.shape.shape, repeats); + if (new_shape.empty() || new_shape != B.shape.shape) + { + throw TensorShapeError("Repeat shape mismatch"); + } + launch_repeat(A.data, A.shape.strides.data(), + repeats.data(), + B.data, B.shape.strides.data(),B.shape.size, B.shape.dim()); + } + }; } #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end 
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
index f48ca7b2..9392608d 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -32,6 +32,7 @@ namespace deepx::tensorfunc
     template void launch_todtype(const double *a, float *c, const int size);
     template void launch_todtype(const double *a, half *c, const int size);
     template void launch_todtype(const double *a, nv_bfloat16 *c, const int size);
+    //template void launch_todtype(const double *a, int64_t *c, const int size);
     template void launch_todtype(const double *a, int64_t *c, const int size);
     template void launch_todtype(const double *a, int32_t *c, const int size);
     template void launch_todtype(const double *a, int16_t *c, const int size);
diff --git a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
index db447136..b1d9ef8b 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/changeshape.hpp
@@ -490,5 +490,73 @@ namespace deepx::tf
         }
     };
 
+    //repeat
+    template <typename Author>
+    class Repeat : public TF
+    {
+    public:
+        Repeat(const vector &args, const vector &returns)
+        {
+            this->name = "repeat";
+            this->metadata.author = Author::name();
+            this->tftype = "changeshape";
+            this->args = args;
+            this->returns = returns;
+        }
+
+        string math_formula() const override
+        {
+            return "T2 = T1.repeat(repeats=[3 4 5])";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Repeat<Author>>(*this);
+        }
+        int run(shared_ptr mem, string &error) override
+        {
+            Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            vector<int> repeats = this->getvector<int>(1);
+            Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (input_type != output_type)
+            {
+                error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type);
+                return 1;
+            }
+            switch (input_type)
+            {
+            case Precision::Float64:
+                repeat<Author>(*mem->gettensor<double>(this->args[0].textvalue), repeats, *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                repeat<Author>(*mem->gettensor<float>(this->args[0].textvalue), repeats, *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Float16:
+                repeat<Author>(*mem->gettensor<half>(this->args[0].textvalue), repeats, *mem->gettensor<half>(this->returns[0].textvalue));
+                break;
+            case Precision::BFloat16:
+                repeat<Author>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), repeats, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                repeat<Author>(*mem->gettensor<int64_t>(this->args[0].textvalue), repeats, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                repeat<Author>(*mem->gettensor<int32_t>(this->args[0].textvalue), repeats, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                repeat<Author>(*mem->gettensor<int16_t>(this->args[0].textvalue), repeats, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                repeat<Author>(*mem->gettensor<int8_t>(this->args[0].textvalue), repeats, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Bool:
+                repeat<Author>(*mem->gettensor<bool>(this->args[0].textvalue), repeats, *mem->gettensor<bool>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported type: " + precision_str(input_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
 };
 #endif // DEEPX_TF_CHANGESHAPE_HPP
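`Repeat<Author>::run` follows the same dtype-dispatch idiom as the other changeshape ops: read the runtime `Precision` tag, check that input and output agree, then branch to a concrete template instantiation per type. A stripped-down sketch of that idiom in plain C++ follows; the enum values, `repeat_impl`, and `dispatch_repeat` are illustrative names, not the project's API.

```cpp
#include <cstdio>
#include <string>

enum class Precision { Float32, Int32 };

// Toy typed body: each input element is written r times consecutively.
template <typename T>
void repeat_impl(const T *in, T *out, int n, int r)
{
    for (int i = 0; i < n * r; ++i)
        out[i] = in[i / r];
}

// Resolve the runtime tag to a compile-time type, as Repeat::run does per Precision case.
int dispatch_repeat(Precision p, const void *in, void *out, int n, int r, std::string &error)
{
    switch (p)
    {
    case Precision::Float32:
        repeat_impl(static_cast<const float *>(in), static_cast<float *>(out), n, r);
        return 0;
    case Precision::Int32:
        repeat_impl(static_cast<const int *>(in), static_cast<int *>(out), n, r);
        return 0;
    default:
        error = "Unsupported type";
        return 1;
    }
}

int main()
{
    float in[3] = {1.f, 2.f, 3.f}, out[6];
    std::string err;
    dispatch_repeat(Precision::Float32, in, out, 3, 2, err);
    for (float v : out)
        std::printf("%g ", v); // 1 1 2 2 3 3
    std::printf("\n");
    return 0;
}
```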
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
index c0472001..fca52981 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
@@ -9,7 +9,6 @@ namespace deepx::tf
 {
-
     // todtype
     class Todtype : public TF
     {
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index a25b0963..6a12c26a 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@ import select
 
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # parse the endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)