From 3677fb43c8b32c2a5c502df4a217435eae0724d7 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 18 May 2025 22:50:12 +0800 Subject: [PATCH 1/3] cudaerror: --- doc/excuter/op-mem-cuda/list.md | 124 ++++++++--------- doc/excuter/op-mem-ompsimd/list.md | 126 +++++++++--------- excuter/cpp-common/src/deepx/tf/tf.cpp | 18 +-- excuter/cpp-common/src/deepx/tf/tf.hpp | 1 + .../tensorfunc/elementwise_miaobyte_basic.cu | 35 +++-- .../elementwise_miaobyte_compare.cu | 29 ++-- .../deepx/transformer/modeling_rope_utils.py | 7 +- 7 files changed, 174 insertions(+), 166 deletions(-) diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index e5995829..e33b2df3 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -6,101 +6,101 @@ | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| vecset | none | [3 4 5]->shape | vecset(vector value)->(vector name) | -| argset | none | argvalue->argname | argset(var value)->(var name) | +| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | +| argset | none | argvalue->argname | argset(var:value)->(var:name) | ### tensorlife | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| renametensor | none | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | -| newtensor | none | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | -| deltensor | none | del->T1 | deltensor()->(tensor t) | -| copytensor | none | T2.data = T1.data | copytensor(tensor src)->(tensor dst) | +| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | +| newtensor | none | T1 = zeros(shape) | newtensor(vector:shape)->(tensor:tensor1) | +| newtensor | none | T1 = zeros(shape) | newtensor(var:shape)->(tensor:tensor1) | +| deltensor | none | del->T1 | deltensor()->(tensor:t) | +| copytensor | none | T2.data = T1.data | copytensor(tensor:src)->(tensor:dst) | ### io | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var path)->(tensor t) | -| save | none | save(T1,path) | save(tensor t, var path)->() | -| print | miaobyte | print(T1) | print(tensor t)->() | -| print | miaobyte | print(T1) | print(tensor t, var format)->() | -| load | none | load(path) | load(var path)->() | +| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var:path)->(tensor:t) | +| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| print | miaobyte | print(T1) | print(tensor:t)->() | +| print | miaobyte | print(T1) | print(tensor:t, var:format)->() | +| load | none | load(path) | load(var:path)->() | ### matmul | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| matmul | cublas | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| matmul | cublas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | ### init | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var mean, var stddev, var seed)->(tensor t) | -| dropout | miaobyte | dropout(p,seed)->A | dropout(var p, var seed)->(tensor A) | -| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var 
low, var high, var seed)->(tensor t) | -| arange | miaobyte | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | -| constant | miaobyte | constant(value)->T1 | constant(var value)->(tensor t) | +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:stddev, var:seed)->(tensor:t) | +| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | +| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | ### elementwise | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | -| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | -| notequal | miaobyte | T1!=T2->mask | notequal(tensor A, tensor B, var epsilon)->(tensor mask) | -| equalscalar | miaobyte | T1==scalar->mask | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | -| min | miaobyte | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| tan | miaobyte | T3=tan(T1) | tan(tensor A)->(tensor C) | -| sin | miaobyte | T3=sin(T1) | sin(tensor A)->(tensor C) | -| less | miaobyte | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | -| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | -| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var scalar, tensor A)->(tensor C) | -| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | -| log | miaobyte | T3=log(T1) | log(tensor A)->(tensor C) | -| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | -| greater | miaobyte | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | -| lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| cos | miaobyte | T3=cos(T1) | cos(tensor A)->(tensor C) | -| notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | -| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | -| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor a)->(tensor b) | -| add | cublas | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| add | miaobyte | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| sub | miaobyte | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | -| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | -| exp | miaobyte | T3=exp(T1) | exp(tensor A)->(tensor C) | -| mul | miaobyte | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | -| equal | miaobyte | T1==T2->mask | equal(tensor A, tensor B, var epsilon)->(tensor mask) | -| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| div | miaobyte | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| invert | miaobyte | T3=~T1 | invert(tensor A)->(tensor C) | -| max | miaobyte | 
T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) | -| pow | miaobyte | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | +| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor:tensors, tensor:cases)->(tensor:result) | +| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | +| notequal | miaobyte | T1!=T2->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| equalscalar | miaobyte | T1==scalar->mask | equalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| min | miaobyte | T3=min(T1, T2) | min(tensor:A, tensor:B)->(tensor:C) | +| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | +| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | +| less | miaobyte | mask=compare(T1, T2) | less(tensor:A, tensor:B)->(tensor:mask) | +| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor:A, var:scalar)->(tensor:C) | +| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:A)->(tensor:C) | +| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor:A, var:scalar)->(tensor:C) | +| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:A, var:b)->(tensor:C) | +| greater | miaobyte | mask=compare(T1, T2) | greater(tensor:A, tensor:B)->(tensor:mask) | +| lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor:A, var:scalar)->(tensor:mask) | +| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | +| notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor:A, var:scalar)->(tensor:C) | +| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var:scalar, tensor:A)->(tensor:C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:a)->(tensor:b) | +| add | cublas | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| sub | miaobyte | T3=T1-T2 | sub(tensor:A, tensor:B)->(tensor:C) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:A, var:b)->(tensor:C) | +| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | +| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | +| equal | miaobyte | T1==T2->mask | equal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | +| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | +| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | +| max | miaobyte | T3=max(T1, T2) | max(tensor:A, tensor:B)->(tensor:C) | +| pow | miaobyte | T3=pow(T1, T2) | pow(tensor:A, tensor:B)->(tensor:C) | ### reduce | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | -| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | -| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | -| reducemin | miaobyte | B = reducemin(A, axis=[1 
2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:dims, var:keepdims)->(tensor:B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:dims, var:keepdims)->(tensor:B) | ### changeshape | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor A, tensor indices, var axis)->(tensor B) | -| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | -| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | +| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor:A, tensor:indices, var:axis)->(tensor:B) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | +| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor:A, vector:dim_order)->(tensor:C) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index fe44dd52..e259031c 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -6,102 +6,102 @@ | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| vecset | none | [3 4 5]->shape | vecset(vector value)->(vector name) | -| argset | none | argvalue->argname | argset(var value)->(var name) | +| vecset | none | [3 4 5]->shape | vecset(vector:value)->(vector:name) | +| argset | none | argvalue->argname | argset(var:value)->(var:name) | ### tensorlife | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| renametensor | none | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | -| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor t) | -| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | -| deltensor | none | del->T1 | deltensor()->(tensor t) | -| copytensor | none | T1.data->T2.data | copytensor(tensor src)->(tensor dst) | +| renametensor | none | rename(newname)->T1 | renametensor(var:new_name)->(tensor:t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector:shape)->(tensor:t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var:shape)->(tensor:t) | +| deltensor | none | del->T1 | deltensor()->(tensor:t) | +| copytensor | none | T1.data->T2.data | copytensor(tensor:src)->(tensor:dst) | ### io | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| loadtensordata | none 
| loadtensordata(path)->tensor.data | loadtensordata(var path)->(tensor t) | -| save | none | save(T1,path) | save(tensor t, var path)->() | -| print | miaobyte | print(T1) | print(tensor t)->() | -| print | miaobyte | print(T1) | print(tensor t, var format)->() | -| load | none | mem.load(path) | load(var path)->() | +| loadtensordata | none | loadtensordata(path)->tensor.data | loadtensordata(var:path)->(tensor:t) | +| save | none | save(T1,path) | save(tensor:t, var:path)->() | +| print | miaobyte | print(T1) | print(tensor:t)->() | +| print | miaobyte | print(T1) | print(tensor:t, var:format)->() | +| load | none | mem.load(path) | load(var:path)->() | ### matmul | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| matmul | cblas | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| matmul | cblas | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | +| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor:A, tensor:B)->(tensor:C) | ### init | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var mean, var std, var seed)->(tensor t) | -| dropout | miaobyte | dropout(p,seed)->A | dropout(var p, var seed)->(tensor A) | -| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | -| arange | miaobyte | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | -| constant | miaobyte | constant(value)->T1 | constant(var value)->(tensor t) | +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var:mean, var:std, var:seed)->(tensor:t) | +| dropout | miaobyte | dropout(p,seed)->A | dropout(var:p, var:seed)->(tensor:A) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var:low, var:high, var:seed)->(tensor:t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var:start, var:step)->(tensor:t) | +| constant | miaobyte | constant(value)->T1 | constant(var:value)->(tensor:t) | ### elementwise | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| switch | miaobyte | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | -| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | -| notequal | miaobyte | notequal(T1,T2)->mask | notequal(tensor A, tensor B, var epsilon)->(tensor mask) | -| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar, var eposilon)->(tensor mask) | -| min | miaobyte | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| tan | miaobyte | T3=tan(T1) | tan(tensor A)->(tensor C) | -| sin | miaobyte | T3=sin(T1) | sin(tensor A)->(tensor C) | -| less | miaobyte | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | -| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | -| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var scalar, tensor a)->(tensor c) | -| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | -| log | miaobyte | T3=log(T1) | log(tensor A)->(tensor C) | -| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | -| greater | miaobyte | mask=greater(T1,T2) | greater(tensor A, tensor 
B)->(tensor mask) | -| lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| cos | miaobyte | T3=cos(T1) | cos(tensor A)->(tensor C) | -| notequalscalar | miaobyte | mask=notequal(T1,scalar) | notequalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | -| minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | -| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor A)->(tensor C) | -| add | cblas | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| add | miaobyte | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| sub | miaobyte | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | -| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | -| exp | miaobyte | T3=exp(T1) | exp(tensor A)->(tensor C) | -| mul | miaobyte | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | -| equal | miaobyte | equal(T1,T2)->mask | equal(tensor A, tensor B, var eposilon)->(tensor mask) | -| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| div | miaobyte | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| invert | miaobyte | T3=~T1 | invert(tensor A)->(tensor C) | -| max | miaobyte | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | -| pow | miaobyte | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | +| switch | miaobyte | C=switch([tensors],case) | switch(listtensor:tensors, tensor:cases)->(tensor:C) | +| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor:A, var:scalar)->(tensor:mask) | +| notequal | miaobyte | notequal(T1,T2)->mask | notequal(tensor:A, tensor:B, var:epsilon)->(tensor:mask) | +| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor:A, var:scalar, var:eposilon)->(tensor:mask) | +| min | miaobyte | T3=min(T1,T2) | min(tensor:A, tensor:B)->(tensor:C) | +| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor:A, var:scalar)->(tensor:C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor:A)->(tensor:C) | +| sin | miaobyte | T3=sin(T1) | sin(tensor:A)->(tensor:C) | +| less | miaobyte | mask=less(T1,T2) | less(tensor:A, tensor:B)->(tensor:mask) | +| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor:A, var:scalar)->(tensor:C) | +| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var:scalar, tensor:a)->(tensor:c) | +| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor:A, var:scalar)->(tensor:C) | +| log | miaobyte | T3=log(T1) | log(tensor:A)->(tensor:C) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor:a, var:scalar)->(tensor:c) | +| greater | miaobyte | mask=greater(T1,T2) | greater(tensor:A, tensor:B)->(tensor:mask) | +| lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor:A, var:scalar)->(tensor:mask) | +| cos | miaobyte | T3=cos(T1) | cos(tensor:A)->(tensor:C) | +| notequalscalar | miaobyte | mask=notequal(T1,scalar) | notequalscalar(tensor:A, var:scalar, var:epsilon)->(tensor:mask) | +| minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor:A, var:scalar)->(tensor:C) | +| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var:scalar, tensor:A)->(tensor:C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var:scalar, tensor:A)->(tensor:C) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor:A)->(tensor:C) | +| add | cblas 
| T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| add | miaobyte | T3=T1+T2 | add(tensor:a, tensor:b)->(tensor:c) | +| sub | miaobyte | T3=T1-T2 | sub(tensor:a, tensor:b)->(tensor:c) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor:A)->(tensor:C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor:a, var:scalar)->(tensor:c) | +| exp | miaobyte | T3=exp(T1) | exp(tensor:A)->(tensor:C) | +| mul | miaobyte | T3=T1*T2 | mul(tensor:A, tensor:B)->(tensor:C) | +| equal | miaobyte | equal(T1,T2)->mask | equal(tensor:A, tensor:B, var:eposilon)->(tensor:mask) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor:A, var:b)->(tensor:C) | +| div | miaobyte | T3=T1/T2 | div(tensor:A, tensor:B)->(tensor:C) | +| invert | miaobyte | T3=~T1 | invert(tensor:A)->(tensor:C) | +| max | miaobyte | T3=max(T1,T2) | max(tensor:A, tensor:B)->(tensor:C) | +| pow | miaobyte | T3=T1^T2 | pow(tensor:A, tensor:B)->(tensor:C) | ### reduce | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | -| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | -| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | -| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor:A, vector:axis, var:keepdims)->(tensor:B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor:A, vector:axis, var:keepdims)->(tensor:B) | ### changeshape | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| -| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor A, tensor index, var axis)->(tensor B) | -| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | -| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | +| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor:A, tensor:index, var:axis)->(tensor:B) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor:A, vector:new_shape)->(tensor:B) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor:tensors, var:dim)->(tensor:result) | +| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor:A, vector:dim_order)->(tensor:C) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor:A, vector:shape)->(tensor:B) | diff --git a/excuter/cpp-common/src/deepx/tf/tf.cpp b/excuter/cpp-common/src/deepx/tf/tf.cpp index b6ffa66f..6c52b3e4 100644 --- a/excuter/cpp-common/src/deepx/tf/tf.cpp +++ b/excuter/cpp-common/src/deepx/tf/tf.cpp @@ 
-37,7 +37,10 @@ namespace deepx::tf
             this->textvalue = textvalue;
         }
     }
-
+    string Param::to_string() const
+    {
+        return dtype_str(dtype) + ":" + textvalue;
+    }
     string TFMetadata::to_string() const
     {
         stringstream ss;
@@ -335,11 +338,7 @@ namespace deepx::tf
             }
             // Output the type; show_name decides whether the parameter name is printed too
-            ss << dtype_str(args[i].dtype);
-            if (show_name)
-            {
-                ss << " " << args[i].textvalue;
-            }
+            ss << args[i].to_string();
         }
 
         ss << ")->(";
@@ -351,13 +350,8 @@ namespace deepx::tf
             {
                 ss << ", "; // Always separate return values with a comma
             }
-            // Output the type; show_name decides whether the return-value name is printed too
-            ss << dtype_str(returns[i].dtype);
-            if (show_name)
-            {
-                ss << " " << returns[i].textvalue;
-            }
+            ss << returns[i].to_string();
         }
 
         ss << ")";
diff --git a/excuter/cpp-common/src/deepx/tf/tf.hpp b/excuter/cpp-common/src/deepx/tf/tf.hpp
index d08b85b5..f5cf55c1 100644
--- a/excuter/cpp-common/src/deepx/tf/tf.hpp
+++ b/excuter/cpp-common/src/deepx/tf/tf.hpp
@@ -31,6 +31,7 @@ namespace deepx::tf
             : textvalue(textvalue), dtype(make_dtype(dt, prec)) {}
 
         void parse(const string &param);
+        string to_string() const;
     };
     // Metadata
     struct Benchmark
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
index a9b1ca2a..f48ca7b2 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -97,7 +97,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void add_kernel(const T *A, const T *B, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] + B[idx];
         }
@@ -125,7 +126,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void addscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] + scalar;
         }
@@ -151,7 +153,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void sub_kernel(const T *A, const T *B, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] - B[idx];
         }
@@ -177,7 +180,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void subscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] - scalar;
         }
@@ -231,7 +235,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void mul_kernel(const T *A, const T *B, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] * B[idx];
         }
@@ -258,7 +263,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void mulscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] * scalar;
         }
@@ -284,7 +290,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void div_kernel(const T *A, const T *B, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] / B[idx];
         }
@@ -311,7 +318,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void divscalar_kernel(const T *A, const T scalar, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = A[idx] / scalar;
         }
@@ -338,7 +346,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void rdivscalar_kernel(const T scalar, const T *A, T *C, const int size)
     {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = scalar / A[idx];
         }
@@ -365,7 +374,8 @@ namespace deepx::tensorfunc
     template <typename T>
     __global__ void invert_kernel(const T *A, T *C, const int size)
    {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = ~A[idx];
         }
@@ -373,8 +383,9 @@ namespace deepx::tensorfunc
     template <>
     __global__ void invert_kernel(const bool *A, bool *C, const int size)
-    {
-        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
         {
             C[idx] = !A[idx];
         }
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
index a514755b..c09b8257 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
@@ -21,7 +21,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         max_kernel<<<numBlocks, blockSize>>>(A, B, C, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch max kernel",cudaGetLastError());
     }
     template void launch_max(const double *A, const double *B, double *C, const int size);
@@ -48,7 +48,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         maxscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, C, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch maxscalar kernel",cudaGetLastError());
     }
     template void launch_maxscalar(const double *A, const double scalar, double *C, const int size);
@@ -75,7 +75,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         min_kernel<<<numBlocks, blockSize>>>(A, B, C, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch min kernel",cudaGetLastError());
     }
     template void launch_min(const double *A, const double *B, double *C, const int size);
@@ -102,7 +102,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         minscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, C, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch minscalar kernel",cudaGetLastError());
     }
     template void launch_minscalar(const double *A, const double scalar, double *C, const int size);
@@ -155,7 +155,7 @@ namespace deepx::tensorfunc
         {
             equalwithepsilon_kernel<<<numBlocks, blockSize>>>(A, B, epsilon, mask, size);
         }
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch equal kernel",cudaGetLastError());
     }
     template void launch_equal(const double *A, const double *B, const float epsilon, bool *mask, const int size);
@@ -208,7 +208,7 @@ namespace deepx::tensorfunc
         {
             equalscalarwithepsilon_kernel<<<numBlocks, blockSize>>>(A, scalar, epsilon, mask, size);
         }
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch equalscalar kernel",cudaGetLastError());
     }
     template void launch_equalscalar(const double *A, const double scalar, const float epsilon, bool *mask, const int size);
@@ -261,7 +261,7 @@ namespace deepx::tensorfunc
         {
             notequalwithepsilon_kernel<<<numBlocks, blockSize>>>(A, B, epsilon, mask, size);
         }
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch notequal kernel",cudaGetLastError());
     }
     template void launch_notequal(const double *A, const double *B, const float epsilon, bool *mask, const int size);
@@ -314,7 +314,7 @@ namespace deepx::tensorfunc
         {
             notequalscalarwithepsilon_kernel<<<numBlocks, blockSize>>>(A, scalar, epsilon, mask, size);
         }
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch notequalscalar kernel",cudaGetLastError());
     }
     template void launch_notequalscalar(const double *A, const double scalar, const float epsilon, bool *mask, const int size);
@@ -342,7 +342,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         less_kernel<<<numBlocks, blockSize>>>(A, B, mask, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch less kernel",cudaGetLastError());
     }
     template void launch_less(const double *A, const double *B, bool *mask, const int size);
@@ -371,7 +371,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         lessscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, mask, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch lessscalar kernel",cudaGetLastError());
     }
     template void launch_lessscalar(const double *A, const double scalar, bool *mask, const int size);
@@ -399,7 +399,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         greater_kernel<<<numBlocks, blockSize>>>(A, B, mask, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch greater kernel",cudaGetLastError());
     }
     template void launch_greater(const double *A, const double *B, bool *mask, const int size);
@@ -427,7 +427,7 @@ namespace deepx::tensorfunc
     {
         auto [numBlocks, blockSize] = BestDims(size);
         greaterscalar_kernel<<<numBlocks, blockSize>>>(A, scalar, mask, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch greaterscalar kernel",cudaGetLastError());
    }
     template void launch_greaterscalar(const double *A, const double scalar, bool *mask, const int size);
@@ -449,17 +449,16 @@ namespace deepx::tensorfunc
             C[idx] = tensorsdata[cases[idx]][idx];
         }
     }
-    
+
     template <typename T, typename casesT>
     void launch_switch(const T **tensorsdata, const int numTensors, const casesT *cases, T *C, const int size)
     {
         auto [numBlocks, blockSize] = BestDims(size);
         cudaVector<const T *> tensorsdataList(tensorsdata, numTensors, cudaMemcpyHostToDevice);
         switch_kernel<<<numBlocks, blockSize>>>(tensorsdataList.data, numTensors, cases, C, size);
-        throwcudaerror("Failed to launch add kernel",cudaGetLastError());
+        throwcudaerror("Failed to launch switch kernel",cudaGetLastError());
     }
-
     template void launch_switch(const double **tensorsdata, const int numTensors, const int32_t *cases, double *C, const int size);
     template void launch_switch(const float **tensorsdata, const int numTensors, const int32_t *cases, float *C, const int size);
     template void launch_switch(const nv_bfloat16 **tensorsdata, const int numTensors, const int32_t *cases, nv_bfloat16 *C, const int size);
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
index a0c7d323..ca27caa7 100644
--- a/front/py/deepx/transformer/modeling_rope_utils.py
+++ b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -36,7 +36,7 @@ def _compute_llama3_parameters(config:dict={
     high_freq_wavelen = old_context_len / high_freq_factor
 
     wavelen = 2 * math.pi / inv_freq
-
+    wavelen.print()
     # wavelen < high_freq_wavelen: do nothing
     # wavelen > low_freq_wavelen: divide by factor
     inv_freq_llama = where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
@@ -44,8 +44,11 @@ def _compute_llama3_parameters(config:dict={
     smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
     smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
+    is_medium_freq.print()
+    # TODO: after this step runs, "an illegal memory access was encountered" is raised
     inv_freq_llama = where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
-
+    is_medium_freq.print()
+    inv_freq_llama.print()
     return inv_freq_llama, attention_factor
 
 ROPE_INIT_FUNCTIONS = {
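Note on PATCH 1: every kernel hunk above applies the same refactor, hoisting the
grid-stride expression blockDim.x * gridDim.x into a named stride variable so the loop
header stays readable. A minimal self-contained sketch of that launch pattern follows.
It is illustrative only: BestDims and throwcudaerror are this repo's helpers, so the
sketch falls back to plain CUDA runtime calls and a hand-rolled block count.

#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
__global__ void add_kernel(const T *A, const T *B, T *C, const int size)
{
    // One thread handles elements idx, idx + stride, idx + 2*stride, ...
    int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
    {
        C[idx] = A[idx] + B[idx];
    }
}

int main()
{
    const int size = 1 << 20;
    float *A, *B, *C;
    cudaMallocManaged(&A, size * sizeof(float));
    cudaMallocManaged(&B, size * sizeof(float));
    cudaMallocManaged(&C, size * sizeof(float));
    for (int i = 0; i < size; ++i) { A[i] = 1.0f; B[i] = 2.0f; }

    int blockSize = 256;                                 // stand-in for BestDims(size)
    int numBlocks = (size + blockSize - 1) / blockSize;
    add_kernel<<<numBlocks, blockSize>>>(A, B, C, size);
    if (cudaGetLastError() != cudaSuccess) { std::printf("launch failed\n"); return 1; }
    cudaDeviceSynchronize();
    std::printf("C[0] = %f\n", C[0]);                    // expect 3.0
    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}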
From 52f754aa0f5b315486c7cdf8021407e296adaaf9 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sun, 18 May 2025 23:47:18 +0800
Subject: [PATCH 2/3] llama_rope: CUDA validation succeeded; fix switch for bool
 cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../deepx/tensorfunc/elementwise_miaobyte_compare.cu  | 10 +++++++++-
 .../deepx/tensorfunc/elementwise_miaobyte_compare.cuh |  3 +++
 .../src/deepx/tensorfunc/elementwise_miaobyte_sin.cu  |  2 +-
 excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp  | 11 +++++------
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
index c09b8257..1eaa3dcb 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu
@@ -449,7 +449,15 @@ namespace deepx::tensorfunc
             C[idx] = tensorsdata[cases[idx]][idx];
         }
     }
-
+    template <typename T>
+    __global__ void switch_kernel(const T** tensorsdata, const int numTensors, const bool* cases, T* C, const int size)
+    {
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            C[idx] = cases[idx] ? tensorsdata[1][idx] : tensorsdata[0][idx];
+        }
+    }
     template <typename T, typename casesT>
     void launch_switch(const T **tensorsdata, const int numTensors, const casesT *cases, T *C, const int size)
     {
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh
index c813acb2..d4efe1a5 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cuh
@@ -109,6 +109,9 @@ namespace deepx::tensorfunc
     template <typename T, typename casesT>
     __global__ void switch_kernel(const T** tensorsdata, const int numTensors, const casesT* cases, T* C, const int size);
 
+    template <typename T>
+    __global__ void switch_kernel(const T** tensorsdata, const int numTensors, const bool* cases, T* C, const int size);
+
     template <typename T, typename casesT>
     void launch_switch(const T** tensorsdata, const int numTensors, const casesT* cases, T* C, const int size);
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu
index 18ca1a12..e0610ebd 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu
@@ -114,7 +114,7 @@ namespace deepx::tensorfunc
             C[idx] = tanf(A[idx]);
         }
     }
-    
+
     template <typename T>
     void launch_tan(const T* a, T* c, const int size){
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp
index ec35e1f4..c6de7102 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_sin.hpp
@@ -86,11 +86,10 @@ namespace deepx::tf
         int run(shared_ptr<MemBase> mem, string &error) override
         {
             Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
             Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
-            if (a_type != c_type || b_type != c_type)
+            if (a_type != c_type)
             {
-                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type);
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
                 return 1;
             }
             switch (a_type)
@@ -141,11 +140,10 @@ namespace deepx::tf
         int run(shared_ptr<MemBase> mem, string &error) override
         {
             Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
             Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
-            if (a_type != c_type || b_type != c_type)
+            if (a_type != c_type)
             {
-                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type) + " or " + precision_str(b_type) + " != " + precision_str(c_type);
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
                 return 1;
             }
             switch (a_type)
@@ -156,6 +154,7 @@ namespace deepx::tf
             case Precision::Float32:
                 tensorfunc::tan(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue));
                 break;
+            default:
                 error = "Unsupported type: " + precision_str(a_type);
                 return 1;
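Note on PATCH 2: the new switch_kernel overload gives bool case tensors a fixed meaning:
cases[idx] == false selects tensors[0] and true selects tensors[1], the same selection
rule the where(...) calls in _compute_llama3_parameters rely on. The standalone sketch
below restates those semantics outside the repo; the real launch_switch marshals the
host pointer table through cudaVector and sizes the grid with BestDims, while the sketch
uses managed memory and a fixed launch shape, and switch_bool_kernel is a name invented
here for illustration.

#include <cstdio>
#include <cuda_runtime.h>

// false -> first tensor, true -> second tensor, matching the new bool overload.
__global__ void switch_bool_kernel(const float **tensorsdata, const bool *cases, float *C, const int size)
{
    int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
    {
        C[idx] = cases[idx] ? tensorsdata[1][idx] : tensorsdata[0][idx];
    }
}

int main()
{
    const int n = 8;
    float *a, *b, *c;
    bool *cases;
    const float **tensors;
    cudaMallocManaged(&a, n * sizeof(float));
    cudaMallocManaged(&b, n * sizeof(float));
    cudaMallocManaged(&c, n * sizeof(float));
    cudaMallocManaged(&cases, n * sizeof(bool));
    cudaMallocManaged(&tensors, 2 * sizeof(float *)); // device-visible pointer table
    for (int i = 0; i < n; ++i) { a[i] = 0.0f; b[i] = 1.0f; cases[i] = (i % 2 == 1); }
    tensors[0] = a;
    tensors[1] = b;

    switch_bool_kernel<<<1, 32>>>(tensors, cases, c, n);
    cudaDeviceSynchronize();
    for (int i = 0; i < n; ++i)
        std::printf("c[%d] = %.0f\n", i, c[i]); // expect 0 1 0 1 ...
    return 0;
}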
From c5bd17652759d6e02e696ee10ea1147d38513afb Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sun, 18 May 2025 23:51:26 +0800
Subject: [PATCH 3/3] switch: fix the ompsimd bool specialization to avoid
 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../deepx/tensorfunc/elementwise_miaobyte.hpp | 32 ++++++++++++++++++-
 front/py/deepx/scheduler/client/udpconn.py   |  2 +-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
index 864334ce..a125761f 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
@@ -987,7 +987,37 @@ namespace deepx::tensorfunc
                 {
                     casesT which_tensor=cases.data[i+j];
                     C.data[i+j]=tensors[which_tensor]->data[i+j];
-                } });
+                }
+            });
+        }
+        else
+        {
+            throw std::invalid_argument("shape mismatch");
+        }
+    }
+};
+
+template <typename T>
+struct switchDispatcher<miaobyte, T, bool>
+{
+    static void Switch(const vector<Tensor<T> *> tensors, const Tensor<bool> &cases, Tensor<T> &C)
+    {
+        if (cases.shape == C.shape)
+        {
+            // For bool cases, tensors must contain exactly 2 elements
+            if (tensors.size() != 2)
+            {
+                throw std::invalid_argument("For bool cases, tensors size must be 2");
+            }
+
+            C.shape.rangeElementwiseParallel([&tensors, &cases, &C](int i, int i_end)
+            {
+                for (int j = 0; j < i_end; j++)
+                {
+                    // bool specialization: false selects the first tensor, true the second
+                    C.data[i+j] = cases.data[i+j] ? tensors[1]->data[i+j] : tensors[0]->data[i+j];
+                }
+            });
         }
         else
         {
             throw std::invalid_argument("shape mismatch");
         }
     }
 };
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index 6a12c26a..a25b0963 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@ import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:9090"):
+    def __init__(self, endpoint: str = "localhost:8080"):
        # Parse the endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
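Note on the series: patches 2 and 3 exist to make _compute_llama3_parameters run on both
executers. As a host-side cross-check of the math being debugged, here is a small C++
restatement of the llama3 frequency smoothing from the Python diff above. The
rope_scaling constants are assumptions (the commonly published llama3 defaults), not
values taken from this patch series, and the inv_freq samples are toy inputs.

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    // Assumed llama3 defaults; not read from this patch series.
    const double factor = 8.0, low_freq_factor = 1.0, high_freq_factor = 4.0;
    const double old_context_len = 8192.0;
    const double low_freq_wavelen = old_context_len / low_freq_factor;   // 8192
    const double high_freq_wavelen = old_context_len / high_freq_factor; // 2048

    // Toy inv_freq samples; the real schedule comes from the base RoPE config.
    const std::vector<double> inv_freq = {1.0, 1e-3, 1e-4};

    for (double f : inv_freq)
    {
        const double wavelen = 2.0 * M_PI / f;
        double out;
        if (wavelen < high_freq_wavelen)      // high frequency: keep as-is
            out = f;
        else if (wavelen > low_freq_wavelen)  // low frequency: divide by factor
            out = f / factor;
        else                                  // medium band: interpolate between the two
        {
            const double smooth = (old_context_len / wavelen - low_freq_factor) /
                                  (high_freq_factor - low_freq_factor);
            out = (1.0 - smooth) * (f / factor) + smooth * f;
        }
        std::printf("inv_freq %g -> %g\n", f, out);
    }
    return 0;
}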