From 917b2cb683d8caa6ca75338c896637148ce58451 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Thu, 10 Apr 2025 21:47:36 +0800 Subject: [PATCH 1/6] =?UTF-8?q?fixbug:keepkdim=E9=97=AE=E9=A2=98=E8=A7=A3?= =?UTF-8?q?=E5=86=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/deepx/tensorfunc/reduce_miaobyte.hpp | 30 ++- .../py/examples/2_ir/5_reduce_sum_keepdim.dot | 43 ++-- .../2_ir/5_reduce_sum_keepdim.dot.svg | 197 ++++++++++++------ .../py/examples/2_ir/5_reduce_sum_keepdim.py | 2 +- front/py/examples/2_ir/5_reduce_sumprod.dot | 48 ++--- .../py/examples/2_ir/5_reduce_sumprod.dot.svg | 96 ++++----- 6 files changed, 253 insertions(+), 163 deletions(-) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp index 9d7e37c8..424cacd6 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp @@ -39,7 +39,7 @@ namespace deepx::tensorfunc { newIndices[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = indices[i]; + newIndices[j++] = 0; } } int outputIdx = result.shape.linearat(newIndices); @@ -58,7 +58,7 @@ namespace deepx::tensorfunc { newIndices[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = indices[i]; + newIndices[j++] = 0; } } int outputIdx = result.shape.linearat(newIndices); @@ -104,7 +104,7 @@ namespace deepx::tensorfunc std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); const int minshape_1 = Lanes(ScalableTag()); // 如果dims的最后一个元素是tensor.shape.dim-1,则说明reduceprod的数据不连续(不对齐),无法simd(需要不停跳跃) - constant(result, T(1)); + constant(result, T(1)); if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1) { tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector &indices, std::vector &newIndices) @@ -115,7 +115,7 @@ namespace deepx::tensorfunc if (reduced_dims[i]==0) { newIndices[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=indices[i]; + newIndices[j++]=0; } } // 累加求和 @@ -136,7 +136,7 @@ namespace deepx::tensorfunc { newIndices[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = indices[i]; + newIndices[j++] = 0; } } // 累加求和 @@ -198,13 +198,12 @@ namespace deepx::tensorfunc if (reduced_dims[i]==0) { newIndices[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=indices[i]; + newIndices[j++]=0; } } // 累加求和 int outputIdx=result.shape.linearat(newIndices); - result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); - }, result.shape.dim); + result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim); } else { @@ -219,7 +218,7 @@ namespace deepx::tensorfunc { newIndices[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = indices[i]; + newIndices[j++] =0; } } @@ -255,8 +254,7 @@ namespace deepx::tensorfunc maxt = std::max(maxt,tensor.data[i + j]); } - result.data[outputIdx] = std::max(result.data[outputIdx],maxt); - }, result.shape.dim); + result.data[outputIdx] = std::max(result.data[outputIdx],maxt); }, result.shape.dim); } } }; @@ -281,14 +279,13 @@ namespace deepx::tensorfunc if (reduced_dims[i]==0) { newIndices[j++]=indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++]=indices[i]; + newIndices[j++]=0; } } // 累加求和 int outputIdx=result.shape.linearat(newIndices); - result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); - }, result.shape.dim); + result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim); } else { @@ -303,7 +300,7 @@ namespace deepx::tensorfunc { newIndices[j++] = indices[i]; }else if (keepdims && (reduced_dims[i] == 1)) { - newIndices[j++] = indices[i]; + newIndices[j++] = 0; } } @@ -339,8 +336,7 @@ namespace deepx::tensorfunc mint = std::min(mint,tensor.data[i + j]); } - result.data[outputIdx] = std::min(result.data[outputIdx],mint); - }, result.shape.dim); + result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, result.shape.dim); } } }; diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot index 60e69f3d..93785caa 100644 --- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot +++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot @@ -2,20 +2,37 @@ digraph { rankdir=TB node [shape=record] - 138334896593968 [label="tensor_1 + 134049762132704 [label="t (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 138336870303440 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 138336870304208 [label="vector_1 -(3, 4, 5)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 138334610649632 [label="s1 + 134047466295776 [label="s (1, 4, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 138334610649680 [label="vector_2 + 134047464790368 [label="vector_1 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 138334610649536 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 138336870303440 -> 138334896593968 [arrowsize=0.8 color=gray40 penwidth=1.2] - 138334896593968 -> 138336870303440 [arrowsize=0.8 color=gray40 penwidth=1.2] - 138336870304208 -> 138336870303440 [arrowsize=0.8 color=gray40 penwidth=1.2] - 138334610649536 -> 138334610649632 [arrowsize=0.8 color=gray40 penwidth=1.2] - 138334896593968 -> 138334610649536 [arrowsize=0.8 color=gray40 penwidth=1.2] - 138334610649680 -> 138334610649536 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464790512 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134047464790272 [label="p +(3, 1, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134047464790608 [label="vector_2 +[1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134047464790656 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134049762132560 [label="t1 +(4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134047464791088 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134047464790848 [label="var_1 +1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134047464790800 [label="t2 +(1, 1, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134047464790464 [label="vector_3 +[0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] + 134047464791328 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134047464790512 -> 134047466295776 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134049762132704 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464790368 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464790656 -> 134047464790272 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134049762132704 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464790608 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464791088 -> 134049762132560 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464790848 -> 134047464791088 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464791328 -> 134047464790800 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134049762132560 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134047464790464 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg index f0d4ee5c..bf6cc407 100644 --- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg +++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg @@ -4,86 +4,163 @@ - - + + %3 - - + + -138334896593968 - -tensor_1 -(3, 4, 5) +134049762132704 + +t +(3, 4, 5) - - -138336870303440 - -reshape + + +134047464790512 + +sum - + -138334896593968->138336870303440 - - +134049762132704->134047464790512 + + - - -138334610649536 - -sum + + +134047464790656 + +prod - + -138334896593968->138334610649536 - - +134049762132704->134047464790656 + + - - -138336870303440->138334896593968 - - + + +134047466295776 + +s +(1, 4, 1) - + -138336870304208 - -vector_1 -(3, 4, 5) +134047464790368 + +vector_1 +[0, 2] - + -138336870304208->138336870303440 - - +134047464790368->134047464790512 + + - - -138334610649632 - -s1 -(1, 4, 1) + + +134047464790512->134047466295776 + + - + -138334610649680 - -vector_2 -[0, 2] +134047464790272 + +p +(3, 1, 5) - + + +134047464790608 + +vector_2 +[1] + + -138334610649680->138334610649536 - - +134047464790608->134047464790656 + + - + -138334610649536->138334610649632 - - +134047464790656->134047464790272 + + + + + +134049762132560 + +t1 +(4, 5, 6) + + + +134047464791328 + +sum + + + +134049762132560->134047464791328 + + + + + +134047464791088 + +constant + + + +134047464791088->134049762132560 + + + + + +134047464790848 + +var_1 +1 + + + +134047464790848->134047464791088 + + + + + +134047464790800 + +t2 +(1, 1, 6) + + + +134047464790464 + +vector_3 +[0, 1] + + + +134047464790464->134047464791328 + + + + + +134047464791328->134047464790800 + + diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.py b/front/py/examples/2_ir/5_reduce_sum_keepdim.py index e6f1c5e4..902904c7 100644 --- a/front/py/examples/2_ir/5_reduce_sum_keepdim.py +++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.py @@ -29,7 +29,7 @@ print(s) p=prod(t,dim=[1],out="p",keepdim=True) p.set_format("%.0f") -# print(p) +print(p) t1=ones(4,5,6,name="t1") t1.set_format("%.0f") diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot b/front/py/examples/2_ir/5_reduce_sumprod.dot index da812dbb..5dfc772d 100644 --- a/front/py/examples/2_ir/5_reduce_sumprod.dot +++ b/front/py/examples/2_ir/5_reduce_sumprod.dot @@ -2,37 +2,37 @@ digraph { rankdir=TB node [shape=record] - 133977343199552 [label="t + 124690717936416 [label="t (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975085212112 [label="s + 124689026191840 [label="s (4,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083685904 [label="vector_1 + 124688742603248 [label="vector_1 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083685520 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 133975083685760 [label="p + 124688742602864 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 124688742603104 [label="p (3, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083686048 [label="vector_2 + 124688742603392 [label="vector_2 [1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083686096 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 133977343199120 [label="t1 + 124688742603440 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 124690717935984 [label="t1 (4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083686528 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 133975083686288 [label="var_1 + 124688742603872 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 124688742603632 [label="var_1 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083686240 [label="t2 + 124688742603584 [label="t2 (6,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083686768 [label="vector_3 + 124688742604112 [label="vector_3 [0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 133975083686576 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 133975083685520 -> 133975085212112 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133977343199552 -> 133975083685520 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133975083685904 -> 133975083685520 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133975083686096 -> 133975083685760 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133977343199552 -> 133975083686096 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133975083686048 -> 133975083686096 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133975083686528 -> 133977343199120 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133975083686288 -> 133975083686528 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133975083686576 -> 133975083686240 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133977343199120 -> 133975083686576 [arrowsize=0.8 color=gray40 penwidth=1.2] - 133975083686768 -> 133975083686576 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742603920 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 124688742602864 -> 124689026191840 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124690717936416 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742603248 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742603440 -> 124688742603104 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124690717936416 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742603392 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742603872 -> 124690717935984 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742603632 -> 124688742603872 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742603920 -> 124688742603584 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124690717935984 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2] + 124688742604112 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg index 2b180653..dc32bc57 100644 --- a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg +++ b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg @@ -9,156 +9,156 @@ %3 - + -133977343199552 +124690717936416 t (3, 4, 5) - + -133975083685520 +124688742602864 sum - + -133977343199552->133975083685520 +124690717936416->124688742602864 - + -133975083686096 +124688742603440 prod - + -133977343199552->133975083686096 +124690717936416->124688742603440 - + -133975085212112 +124689026191840 s (4,) - + -133975083685904 +124688742603248 vector_1 [0, 2] - + -133975083685904->133975083685520 +124688742603248->124688742602864 - + -133975083685520->133975085212112 +124688742602864->124689026191840 - + -133975083685760 +124688742603104 p (3, 5) - + -133975083686048 +124688742603392 vector_2 [1] - + -133975083686048->133975083686096 +124688742603392->124688742603440 - + -133975083686096->133975083685760 +124688742603440->124688742603104 - + -133977343199120 +124690717935984 t1 (4, 5, 6) - + -133975083686576 +124688742603920 sum - + -133977343199120->133975083686576 +124690717935984->124688742603920 - + -133975083686528 +124688742603872 constant - + -133975083686528->133977343199120 +124688742603872->124690717935984 - + -133975083686288 +124688742603632 var_1 1 - + -133975083686288->133975083686528 +124688742603632->124688742603872 - + -133975083686240 +124688742603584 t2 (6,) - + -133975083686768 +124688742604112 vector_3 [0, 1] - + -133975083686768->133975083686576 +124688742604112->124688742603920 - + -133975083686576->133975083686240 +124688742603920->124688742603584 From 0f5d5d1e73bdeac6bcda1f0c38ec57762d024042 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sat, 12 Apr 2025 15:42:03 +0800 Subject: [PATCH 2/6] =?UTF-8?q?cuda&changeshape:=E6=A0=BC=E5=BC=8F?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../deepx/tensorfunc/changeshape_miaobyte.cu | 16 +-- .../deepx/tensorfunc/changeshape_miaobyte.cuh | 2 +- .../deepx/tensorfunc/changeshape_miaobyte.hpp | 4 +- .../src/deepx/tensorfunc/cuda_math.hpp | 119 ++++++++++++++++++ .../tensorfunc/elementwise_miaobyte_basic.cu | 6 +- .../src/deepx/tensorfunc/tensor_cuda.cuh | 20 ++- 6 files changed, 145 insertions(+), 22 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index ac395c3f..bc46a818 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -41,21 +41,7 @@ namespace deepx::tensorfunc } } - inline int nextPowerOf2(int n) - { - if (n <= 0) - return 1; - if ((n & (n - 1)) == 0) - return n; // 如果n已经是2的幂 - - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n + 1; - } + template void launch_transpose(const int numBlocks, const int blockSize, diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh index f9f47f7b..1039641a 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ -134,4 +134,4 @@ namespace deepx::tensorfunc const BroadcastMap *broadcastMap, int8_t *output, const int *outputStrides,const int outputDim,const int outputlen); } -#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file +#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 6f76fa23..153214b2 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -3,13 +3,15 @@ #include #include + #include "deepx/tensor.hpp" #include "deepx/tensorfunc/changeshape.hpp" -#include "deepx/tensorfunc/authors.hpp" #include "deepx/tensorfunc/changeshape_miaobyte.cuh" #include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/authors.hpp" #include "deepx/shape_changeshape.hpp" #include "stdutil/error.hpp" + namespace deepx::tensorfunc { template diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp new file mode 100644 index 00000000..f0a3c578 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp @@ -0,0 +1,119 @@ +#ifndef DEEPX_TENSORFUNC_CUDA_MATH_HPP +#define DEEPX_TENSORFUNC_CUDA_MATH_HPP + +#include +#include +#include +#include + +namespace deepx::tensorfunc +{ + + // max + template + __device__ void deepx_max(const T *a, const T *b, T *out); + + template <> + __device__ void deepx_max(const double *a, const double *b, double *out) + { + *out = fmax(*a, *b); + } + + template <> + __device__ void deepx_max(const float *a, const float *b, float *out) + { + *out = fmaxf(*a, *b); + } + + template <> + __device__ void deepx_max(const half *a, const half *b, half *out) + { + *out = __hmax(*a, *b); + } + + template <> + __device__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + { + *out = __hmax(*a, *b); + } + template <> + __device__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) + { + *out = *a>*b?*a:*b; + } + template <> + __device__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) + { + *out = *a>*b?*a:*b; + } + template <> + __device__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) + { + *out = *a>*b?*a:*b; + } + template <> + __device__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) + { + *out = *a>*b?*a:*b; + } + + + // min + template + __device__ void deepx_min(const T *a, const T *b, T *out); + + + template <> + __device__ void deepx_min(const double *a, const double *b, double *out) + { + *out = fmin(*a, *b); + } + + template <> + __device__ void deepx_min(const float *a, const float *b, float *out) + { + *out = fminf(*a, *b); + } + + + template <> + __device__ void deepx_min(const half *a, const half *b, half *out) + { + *out = __hmin(*a, *b); + } + + template <> + __device__ void deepx_min(const nv_bfloat16 *a,const nv_bfloat16 *b, nv_bfloat16 *out) + { + *out = __hmin(*a, *b); + } + + template <> + __device__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) + { + *out = *a<*b?*a:*b; + } + + template <> + __device__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) + { + *out = *a<*b?*a:*b; + } + + template <> + __device__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) + { + *out = *a<*b?*a:*b; + } + + template <> + __device__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) + { + *out = *a<*b?*a:*b; + } + + + +} + +#endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index a5371005..3f54e08e 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -1,5 +1,5 @@ -#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU -#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU +#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU +#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -281,4 +281,4 @@ namespace deepx::tensorfunc } -#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU +#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh index a042d6d1..b60ab3f8 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh @@ -1,11 +1,12 @@ #ifndef DEEPX_TENSORFUNC_TENSOR_CUDA_CUH #define DEEPX_TENSORFUNC_TENSOR_CUDA_CUH +#include #include "deepx/tensor.hpp" namespace deepx::tensorfunc { - __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id) + inline __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id) { int linearIndex = id; for (int i = 0; i < dim; i++) @@ -15,7 +16,7 @@ namespace deepx::tensorfunc } } - __host__ __device__ int linearAt(const int *strides, const int dim, int *indices) + inline __host__ __device__ int linearAt(const int *strides, const int dim, int *indices) { int idx = 0; for (int i = 0; i < dim; i++) @@ -34,6 +35,21 @@ namespace deepx::tensorfunc } } + inline int nextPowerOf2(int n) + { + if (n <= 0) + return 1; + if ((n & (n - 1)) == 0) + return n; // 如果n已经是2的幂 + + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + return n + 1; + } } #endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH From 3207ccc7174179799cf5d5fe39c419cc8ea0564d Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sat, 12 Apr 2025 16:43:39 +0800 Subject: [PATCH 3/6] =?UTF-8?q?ompsimd&reduce:=E5=8F=82=E6=95=B0=E9=A1=BA?= =?UTF-8?q?=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/deepx/tensorfunc/reduce_miaobyte.hpp | 13 ++--- .../op-mem-ompsimd/src/deepx/tf/reduce.hpp | 48 +++++++++---------- .../test/tensorfunc/5_tensor_sum.cpp | 4 +- 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp index 424cacd6..aaf3f86e 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp @@ -2,15 +2,12 @@ #define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP #include -#include #include #include -#include -#include "deepx/tensor.hpp" - #include "deepx/tensorfunc/highway.hpp" #include "deepx/shape_reduce.hpp" +#include "deepx/tensor.hpp" #include "deepx/tensorfunc/reduce.hpp" #include "deepx/tensorfunc/init_miaobyte.hpp" @@ -22,7 +19,7 @@ namespace deepx::tensorfunc template struct sumDispatcher { - static void sum(const Tensor &tensor, const std::vector &dims, Tensor &result, const bool keepdims) + static void sum(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) { constant(result, T(0)); std::vector checkeddims = checkedDims(tensor.shape.shape, dims); @@ -98,7 +95,7 @@ namespace deepx::tensorfunc template struct prodDispatcher { - static void prod(const Tensor &tensor, const std::vector &dims, Tensor &result, const bool keepdims) + static void prod(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) { std::vector checkeddims = checkedDims(tensor.shape.shape, dims); std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); @@ -181,7 +178,7 @@ namespace deepx::tensorfunc template struct reducemaxDispatcher { - static void reducemax(const Tensor &tensor, const std::vector &dims, Tensor &result, const bool keepdims) + static void reducemax(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) { std::vector checkeddims = checkedDims(tensor.shape.shape, dims); std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); @@ -262,7 +259,7 @@ namespace deepx::tensorfunc template struct reduceminDispatcher { - static void reducemin(const Tensor &tensor, const std::vector &dims, Tensor &result, const bool keepdims) + static void reducemin(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) { std::vector checkeddims = checkedDims(tensor.shape.shape, dims); std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp index 5e5a873b..4e2bd1b9 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp @@ -43,22 +43,22 @@ namespace deepx::tf switch (input_type) { case Precision::Float64: - sum(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - sum(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims,*mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - sum(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims,*mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - sum(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims,*mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - sum(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims,*mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - sum(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims,*mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(input_type); @@ -101,22 +101,22 @@ namespace deepx::tf switch (input_type) { case Precision::Float64: - prod(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - prod(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - prod(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - prod(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - prod(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - prod(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(input_type); @@ -159,22 +159,22 @@ namespace deepx::tf switch (input_type) { case Precision::Float64: - reducemax(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - reducemax(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - reducemax(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - reducemax(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - reducemax(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - reducemax(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(input_type); @@ -217,22 +217,22 @@ namespace deepx::tf switch (input_type) { case Precision::Float64: - reducemin(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Float32: - reducemin(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int64: - reducemin(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int32: - reducemin(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int16: - reducemin(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; case Precision::Int8: - reducemin(*mem->gettensor(this->args[0].textvalue), dims, *mem->gettensor(this->returns[0].textvalue), keepdims); + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); break; default: error = "Unsupported type: " + precision_str(input_type); diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp index c68e2b8e..79530b6b 100644 --- a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp +++ b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp @@ -35,7 +35,7 @@ void test_sum() std::vector checkeddims=checkedDims(shape,comb); std::vector sumshape=reducedShape(shape,checkeddims); Tensor r = New(sumshape); - sum(tensor, checkeddims,r); + sum(tensor, checkeddims,false,r); print(r,"%.0f"); } /* @@ -63,7 +63,7 @@ void benchmark_sum(int i){ std::vector checkeddims=checkedDims(shape,comb); std::vector sumshape=reducedShape(shape,checkeddims); Tensor r=New(sumshape); - sum(tensor, checkeddims,r); + sum(tensor, checkeddims,false,r); string combstr=""; for (const auto &c : comb) { From 055444142b6fa897f7188f028ffba643fb54fcf0 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sat, 12 Apr 2025 16:43:54 +0800 Subject: [PATCH 4/6] =?UTF-8?q?ompsimd&reduce:=E5=8F=82=E6=95=B0=E9=A1=BA?= =?UTF-8?q?=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/deepx/tensorfunc/reduce.hpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp index f1570693..a94c1908 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp @@ -12,46 +12,46 @@ namespace deepx::tensorfunc template struct reducemaxDispatcher { - static void reducemax(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; + static void reducemax(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) = delete; }; template - void reducemax(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) + void reducemax(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) { - reducemaxDispatcher::reducemax(A, dims, B, keepdims); + reducemaxDispatcher::reducemax(A, dims, keepdims, B); } template struct reduceminDispatcher { - static void reducemin(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; + static void reducemin(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) = delete; }; template - void reducemin(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) + void reducemin(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) { - reduceminDispatcher::reducemin(A, dims, B, keepdims); + reduceminDispatcher::reducemin(A, dims, keepdims, B); } template struct sumDispatcher { - static void reducesum(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; + static void sum(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) = delete; }; template - void sum(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) + void sum(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) { - sumDispatcher::sum(A, dims, B, keepdims); + sumDispatcher::sum(A, dims, keepdims, B); } template struct prodDispatcher { - static void prod(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) = delete; + static void prod(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) = delete; }; template - void prod(const Tensor &A, const std::vector &dims,Tensor &B,const bool keepdims=false) + void prod(const Tensor &A, const std::vector &dims,const bool keepdims,Tensor &B) { - prodDispatcher::prod(A, dims, B, keepdims); + prodDispatcher::prod(A, dims, keepdims, B); } } #endif // DEEPX_TENSORFUNC_REDUCE_HPP From 7181d1cead88b61ae320cae471220a8c64fd43df Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sat, 12 Apr 2025 16:44:19 +0800 Subject: [PATCH 5/6] =?UTF-8?q?cuda&reduce:=E7=BC=96=E8=AF=91=E5=AE=8C?= =?UTF-8?q?=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- excuter/op-mem-cuda/src/client/tfs.cpp | 65 ++- .../deepx/tensorfunc/changeshape_miaobyte.cu | 21 +- .../deepx/tensorfunc/changeshape_miaobyte.cuh | 151 ++++--- .../deepx/tensorfunc/changeshape_miaobyte.hpp | 6 +- .../src/deepx/tensorfunc/reduce_miaobyte.cu | 404 ++++++++++++++++++ .../src/deepx/tensorfunc/reduce_miaobyte.cuh | 58 +++ .../src/deepx/tensorfunc/reduce_miaobyte.hpp | 81 ++-- excuter/op-mem-cuda/src/deepx/tf/reduce.hpp | 252 +++++++++++ 8 files changed, 895 insertions(+), 143 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh create mode 100644 excuter/op-mem-cuda/src/deepx/tf/reduce.hpp diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 44560d7f..2ef18928 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -9,6 +9,7 @@ #include "deepx/tf/elementwise_compare.hpp" #include "deepx/tf/matmul.hpp" #include "deepx/tf/changeshape.hpp" +#include "deepx/tf/reduce.hpp" #include "deepx/dtype.hpp" #include "deepx/tf/tffactory.hpp" #include "deepx/tensorfunc/authors.hpp" @@ -371,20 +372,56 @@ namespace deepx::tf Param("B", DataCategory::Tensor, Precision::Any), }))); } - // // reduce - // void register_reduce(OpFactory &opfactory) - // { - // opfactory.add_op(Max()); - // opfactory.add_op(Max()); - // opfactory.add_op(Maxscalar()); - // opfactory.add_op(Maxscalar()); - // opfactory.add_op(Min()); - // opfactory.add_op(Min()); - // opfactory.add_op(Minscalar()); - // opfactory.add_op(Minscalar()); - // opfactory.add_op(Sum()); - // opfactory.add_op(Sum()); - // } + // reduce + void register_reduce(TfFactory &tffactory) + { + // sum + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dims", DataCategory::Vector, Precision::Int32), + Param("keepdims", DataCategory::Var, Precision::Bool), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); + // prod + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dims", DataCategory::Vector, Precision::Int32), + Param("keepdims", DataCategory::Var, Precision::Bool), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); + + // max + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dims", DataCategory::Vector, Precision::Int32), + Param("keepdims", DataCategory::Var, Precision::Bool), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); + // min + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("dims", DataCategory::Vector, Precision::Int32), + Param("keepdims", DataCategory::Var, Precision::Bool), + }), + vector( + { + Param("B", DataCategory::Tensor, Precision::Any), + }))); + } + int register_all(TfFactory &tffactory) { register_lifecycle(tffactory); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index bc46a818..e5eb511d 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -44,8 +44,7 @@ namespace deepx::tensorfunc template - void launch_transpose(const int numBlocks, const int blockSize, - const T *input, + void launch_transpose(const T *input, const int *inputStrides, T *output, const int *outputStrides, @@ -58,7 +57,7 @@ namespace deepx::tensorfunc cudaVector dimOrder_d(dimOrder, dim); int powDim = nextPowerOf2(dim); - + auto [numBlocks, blockSize] = BestDims(len); // 根据计算出的2的幂次选择对应的模板实例 switch (powDim) { @@ -91,14 +90,14 @@ namespace deepx::tensorfunc } } - template void launch_transpose(const int numBlocks, const int blockSize, const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template void launch_transpose(const int numBlocks, const int blockSize, const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template void launch_transpose(const int numBlocks, const int blockSize, const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template void launch_transpose(const int numBlocks, const int blockSize, const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template void launch_transpose(const int numBlocks, const int blockSize, const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template void launch_transpose(const int numBlocks, const int blockSize, const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template void launch_transpose(const int numBlocks, const int blockSize, const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose<__half>(const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + template void launch_transpose(const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); // concat template diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh index 1039641a..a1fcf9fa 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh @@ -4,9 +4,7 @@ #include #include -#include "deepx/shape_changeshape.hpp" -#include "deepx/tensorfunc/cuda.hpp" -#include "deepx/tensorfunc/authors.hpp" +#include "deepx/shape_changeshape.hpp" //BroadcastMap类型 namespace deepx::tensorfunc { @@ -15,31 +13,31 @@ namespace deepx::tensorfunc __global__ void transpose_kernel(const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template - void launch_transpose(const int numBlocks, const int blockSize, const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + void launch_transpose( const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose(const int numBlocks, const int blockSize, const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose( const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose(const int numBlocks, const int blockSize, const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose( const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose(const int numBlocks, const int blockSize, const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose(const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose<__half>(const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose(const int numBlocks, const int blockSize, const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose(const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose(const int numBlocks, const int blockSize, const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose(const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose(const int numBlocks, const int blockSize, const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose(const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); - template <> - void launch_transpose(const int numBlocks, const int blockSize, const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); + // template <> + // void launch_transpose(const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); template __global__ void concat_kernel(const T **tensorsData, @@ -55,29 +53,29 @@ namespace deepx::tensorfunc template void launch_concat(const T **tensorsData, const int *inputStrides, T *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); - template <> - void launch_concat(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); + // template <> + // void launch_concat(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); __host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices); @@ -94,44 +92,45 @@ namespace deepx::tensorfunc const BroadcastMap *broadcastMap, T *output, const int *outputStrides,const int outputDim,const int outputlen); - template <> - void launch_broadcastTo(const double *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - double *output, const int *outputStrides,const int outputDim,const int outputlen); - - template <> - void launch_broadcastTo(const float *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - float *output, const int *outputStrides,const int outputDim,const int outputlen); - - template <> - void launch_broadcastTo(const nv_bfloat16 *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - nv_bfloat16 *output, const int *outputStrides,const int outputDim,const int outputlen); - - template <> - void launch_broadcastTo<__half>(const __half *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - __half *output, const int *outputStrides,const int outputDim,const int outputlen); - - template <> - void launch_broadcastTo(const int64_t *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - int64_t *output, const int *outputStrides,const int outputDim,const int outputlen); - - template <> - void launch_broadcastTo(const int32_t *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - int32_t *output, const int *outputStrides,const int outputDim,const int outputlen); - - template <> - void launch_broadcastTo(const int16_t *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - int16_t *output, const int *outputStrides,const int outputDim,const int outputlen); - - template <> - void launch_broadcastTo(const int8_t *input, const int *inputStrides,const int inputDim, - const BroadcastMap *broadcastMap, - int8_t *output, const int *outputStrides,const int outputDim,const int outputlen); -} +// template <> +// void launch_broadcastTo(const double *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// double *output, const int *outputStrides,const int outputDim,const int outputlen); + +// template <> +// void launch_broadcastTo(const float *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// float *output, const int *outputStrides,const int outputDim,const int outputlen); + +// template <> +// void launch_broadcastTo(const nv_bfloat16 *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// nv_bfloat16 *output, const int *outputStrides,const int outputDim,const int outputlen); + +// template <> +// void launch_broadcastTo<__half>(const __half *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// __half *output, const int *outputStrides,const int outputDim,const int outputlen); + +// template <> +// void launch_broadcastTo(const int64_t *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// int64_t *output, const int *outputStrides,const int outputDim,const int outputlen); + +// template <> +// void launch_broadcastTo(const int32_t *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// int32_t *output, const int *outputStrides,const int outputDim,const int outputlen); + +// template <> +// void launch_broadcastTo(const int16_t *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// int16_t *output, const int *outputStrides,const int outputDim,const int outputlen); + +// template <> +// void launch_broadcastTo(const int8_t *input, const int *inputStrides,const int inputDim, +// const BroadcastMap *broadcastMap, +// int8_t *output, const int *outputStrides,const int outputDim,const int outputlen); +// } +}; #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp index 153214b2..d127e6a1 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp @@ -7,7 +7,6 @@ #include "deepx/tensor.hpp" #include "deepx/tensorfunc/changeshape.hpp" #include "deepx/tensorfunc/changeshape_miaobyte.cuh" -#include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" #include "deepx/shape_changeshape.hpp" #include "stdutil/error.hpp" @@ -53,9 +52,8 @@ namespace deepx::tensorfunc { throw std::runtime_error("Dimension order size must match tensor dimension size for transpose"); } - auto [actual_blocks, optimal_block_size] = BestDims(tensor.shape.size); - launch_transpose(actual_blocks, optimal_block_size, - tensor.data, tensor.shape.strides.data(), + + launch_transpose(tensor.data, tensor.shape.strides.data(), output.data, output.shape.strides.data(), tensor.shape.dim, tensor.shape.size, dim_order.data()); } diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu new file mode 100644 index 00000000..c7df6cf6 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu @@ -0,0 +1,404 @@ +#ifndef DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU +#define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU + +#include +#include +#include + +#include "deepx/tensorfunc/cuda.hpp" +#include "deepx/tensorfunc/reduce_miaobyte.cuh" +#include "deepx/tensorfunc/tensor_cuda.cuh" +#include "deepx/tensorfunc/vector_cuda.cuh" +#include "deepx/tensorfunc/cuda_math.hpp" + +namespace deepx::tensorfunc +{ + + template + __global__ void sum_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < tensor_len; thread_id += grid_stride) + { + int input_indices[DIM]; + linearTo(tensor_strides, tensor_dim, input_indices, thread_id); + int output_indices[DIM]; + for (size_t i = 0, j = 0; i < tensor_dim; ++i) + { + if (reduced_dims[i] == 0) + { + output_indices[j++] = input_indices[i]; + } + else if (keepdims && (reduced_dims[i] == 1)) + { + output_indices[j++] = 0; + } + } + int outputIdx = linearAt(result_strides, result_dim, output_indices); + int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); + result_data[outputIdx] += tensor_data[inputIdx]; + } + } + + template + __host__ void launch_sum(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) + { + auto [numBlocks, blockSize] = BestDims(tensor_len); + cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); + cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice); + + int powDim = nextPowerOf2(tensor_dim); + switch (powDim) + { + case 1: + sum_kernel<1, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 2: + sum_kernel<2, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 4: + sum_kernel<4, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 8: + sum_kernel<8, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len,reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 16: + sum_kernel<16, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 32: + sum_kernel<32, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 64: + sum_kernel<64, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 128: + sum_kernel<128, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + } + + template void launch_sum(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); + template void launch_sum(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); + template void launch_sum(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + template void launch_sum<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); + template void launch_sum(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); + template void launch_sum(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); + template void launch_sum(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); + template void launch_sum(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); + + //prod + template + __global__ void prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) + { + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < tensor_len; thread_id += grid_stride) + { + int input_indices[DIM]; + linearTo(tensor_strides, tensor_dim, input_indices, thread_id); + int output_indices[DIM]; + for (size_t i = 0, j = 0; i < tensor_dim; ++i) + { + if (reduced_dims[i] == 0) + { + output_indices[j++] = input_indices[i]; + } + else if (keepdims && (reduced_dims[i] == 1)) + { + output_indices[j++] = 0; + } + } + int outputIdx = linearAt(result_strides, result_dim, output_indices); + int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); + result_data[outputIdx] *= tensor_data[inputIdx]; + } + } + + template + void launch_prod(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) + { + auto [numBlocks, blockSize] = BestDims(tensor_len); + cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); + cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice); + + int powDim = nextPowerOf2(tensor_dim); + switch (powDim) + { + case 1: + prod_kernel<1, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 2: + prod_kernel<2, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 4: + prod_kernel<4, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 8: + prod_kernel<8, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 16: + prod_kernel<16, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 32: + prod_kernel<32, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 64: + prod_kernel<64, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 128: + prod_kernel<128, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + } + + template void launch_prod(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); + template void launch_prod(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); + template void launch_prod(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + template void launch_prod<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); + template void launch_prod(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); + template void launch_prod(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); + template void launch_prod(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); + template void launch_prod(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); + + //max + template + __global__ void max_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim){ + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < tensor_len; thread_id += grid_stride) + { + int input_indices[DIM]; + linearTo(tensor_strides, tensor_dim, input_indices, thread_id); + int output_indices[DIM]; + for (size_t i = 0, j = 0; i < tensor_dim; ++i) + { + if (reduced_dims[i] == 0) + { + output_indices[j++] = input_indices[i]; + } + else if (keepdims && (reduced_dims[i] == 1)) + { + output_indices[j++] = 0; + } + } + int outputIdx = linearAt(result_strides, result_dim, output_indices); + int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); + deepx_max(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx); + } + } + + template + void launch_reducemax(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) + { + auto [numBlocks, blockSize] = BestDims(tensor_len); + cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); + cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice); + + int powDim = nextPowerOf2(tensor_dim); + switch (powDim) + { + case 1: + max_kernel<1, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 2: + max_kernel<2, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 4: + max_kernel<4, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 8: + max_kernel<8, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 16: + max_kernel<16, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 32: + max_kernel<32, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 64: + max_kernel<64, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 128: + max_kernel<128, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + }; + + template void launch_reducemax(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); + template void launch_reducemax(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); + template void launch_reducemax(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + template void launch_reducemax<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); + template void launch_reducemax(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); + template void launch_reducemax(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); + template void launch_reducemax(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); + template void launch_reducemax(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); + + //min + template + __global__ void min_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim){ + const int grid_stride = gridDim.x * blockDim.x; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + for (; thread_id < tensor_len; thread_id += grid_stride) + { + int input_indices[DIM]; + linearTo(tensor_strides, tensor_dim, input_indices, thread_id); + int output_indices[DIM]; + for (size_t i = 0, j = 0; i < tensor_dim; ++i) + { + if (reduced_dims[i] == 0) + { + output_indices[j++] = input_indices[i]; + } + else if (keepdims && (reduced_dims[i] == 1)) + { + output_indices[j++] = 0; + } + } + int outputIdx = linearAt(result_strides, result_dim, output_indices); + int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); + deepx_min(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx); + } + } + + template + void launch_reducemin(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) + { + auto [numBlocks, blockSize] = BestDims(tensor_len); + cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); + cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims , tensor_dim, cudaMemcpyHostToDevice); + + int powDim = nextPowerOf2(tensor_dim); + switch (powDim) + { + case 1: + min_kernel<1, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 2: + min_kernel<2, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 4: + min_kernel<4, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 8: + min_kernel<8, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 16: + min_kernel<16, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 32: + min_kernel<32, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 64: + min_kernel<64, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + case 128: + min_kernel<128, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + break; + default: + throw std::runtime_error("dim too large, max support 128"); + } + } + + template void launch_reducemin(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); + template void launch_reducemin(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); + template void launch_reducemin(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + template void launch_reducemin<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); + template void launch_reducemin(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); + template void launch_reducemin(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); + template void launch_reducemin(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); + template void launch_reducemin(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); + +} + +#endif // DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh new file mode 100644 index 00000000..de1dbf4c --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh @@ -0,0 +1,58 @@ +#ifndef DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CUH +#define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CUH + +#include +#include + +namespace deepx::tensorfunc +{ + // sum + template + __global__ void sum_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + + template + void launch_sum(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + + + //prod + + template + __global__ void prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + + template + void launch_prod(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + + //max + template + __global__ void reducemax_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + + template + void launch_reducemax(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + + + //min + template + __global__ void reducemin_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + + template + void launch_reducemin(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim); + +} + +#endif //DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp index 7b2a9f58..ce4c7223 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp @@ -2,67 +2,72 @@ #define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP #include -#include #include #include "deepx/tensor.hpp" -#include "deepx/shape_reduce.hpp" -#include "deepx/tensorfunc/authors.hpp" -#include - #include "deepx/tensorfunc/reduce.hpp" +#include "deepx/tensorfunc/reduce_miaobyte.cuh" +#include "deepx/shape_reduce.hpp" +#include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/init_miaobyte.hpp" namespace deepx::tensorfunc { - - template < typename T> - struct reducemaxDispatcher + + template + struct sumDispatcher { - static void reducemax(const Tensor &A, const std::vector &dims, Tensor &B,const bool keepdims) { - if (axis < 0) { - axis += A.shape.dim; - } - if (axis >= A.shape.dim) { - throw std::invalid_argument("Invalid axis for reducemax"); - } - + static void sum(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) + { + constant(result, T(0)); + std::vector checkeddims = checkedDims(tensor.shape.shape, dims); + std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); + launch_sum(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size, + reduced_dims.data(), keepdims, + result.data, result.shape.strides.data(), result.shape.dim); } }; - - template < typename T> - struct reduceminDispatcher + template + struct prodDispatcher { - static void reducemin(const Tensor &A, const std::vector &dims, Tensor &B,const bool keepdims) { - if (axis < 0) { - axis += A.shape.dim; - } - if (axis >= A.shape.dim) { - throw std::invalid_argument("Invalid axis for reducemin"); - } - + static void prod(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) + { + constant(result, T(1)); + std::vector checkeddims = checkedDims(tensor.shape.shape, dims); + std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); + launch_prod(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size, + reduced_dims.data(), keepdims, + result.data, result.shape.strides.data(), result.shape.dim); } }; - - template - struct sumDispatcher + struct reducemaxDispatcher { - static void sum(const Tensor &tensor, const std::vector &dims, Tensor &result,const bool keepdims) + static void reducemax(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) { - - + constant(result, std::numeric_limits::lowest()); + std::vector checkeddims = checkedDims(tensor.shape.shape, dims); + std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); + launch_reducemax(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size, + reduced_dims.data(), keepdims, + result.data, result.shape.strides.data(), result.shape.dim); } }; - template - struct prodDispatcher + struct reduceminDispatcher { - static void prod(const Tensor &tensor, const std::vector &dims, Tensor &result,const bool keepdims) + static void reducemin(const Tensor &tensor, const std::vector &dims, const bool keepdims, Tensor &result) { - + constant(result, std::numeric_limits::max()); + std::vector checkeddims = checkedDims(tensor.shape.shape, dims); + std::vector reduced_dims = reducedDim(tensor.shape.shape, checkeddims); + launch_reducemin(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size, + reduced_dims.data(), keepdims, + result.data, result.shape.strides.data(), result.shape.dim); } }; } -#endif DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP \ No newline at end of file + +#endif //DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp new file mode 100644 index 00000000..4a1643ea --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp @@ -0,0 +1,252 @@ +#ifndef DEEPX_TF_REDUCE_HPP +#define DEEPX_TF_REDUCE_HPP + +#include +#include +#include + +#include "deepx/tensorfunc/reduce_miaobyte.hpp" + +namespace deepx::tf +{ + template + class Sum : public TF + { + public: + Sum(const vector &args, const vector &returns) + { + this->name = "sum"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "B = sum(A, axis=[1 2], keepdims=false)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dims = this->getvector(1, true); + bool keepdims = this->getvar(2,mem,true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + sum(*mem->gettensor<__half>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<__half>(this->returns[0].textvalue)); + break; + case Precision::Int64: + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + sum(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + + template + class Prod : public TF + { + public: + Prod(const vector &args, const vector &returns) + { + this->name = "prod"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "B = prod(A, axis=[1 2], keepdims=false)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dims = this->getvector(1, true); + bool keepdims = this->getvar(2,mem,true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + prod(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + + template + class ReduceMax : public TF + { + public: + ReduceMax(const vector &args, const vector &returns) + { + this->name = "reducemax"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "B = reducemax(A, axis=[1 2], keepdims=false)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dims = this->getvector(1, true); + bool keepdims = this->getvar(2,mem,true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + reducemax(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; + + template + class ReduceMin : public TF + { + public: + ReduceMin(const vector &args, const vector &returns) + { + this->name = "reducemin"; + this->author = Author::name(); + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "B = reducemin(A, axis=[1 2], keepdims=false)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + vector dims = this->getvector(1, true); + bool keepdims = this->getvar(2,mem,true); + Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (input_type != output_type) + { + error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type); + return 1; + } + switch (input_type) + { + case Precision::Float64: + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + reducemin(*mem->gettensor(this->args[0].textvalue), dims, keepdims, *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported type: " + precision_str(input_type); + return 1; + } + return 0; + } + }; +} + +#endif // DEEPX_TF_REDUCE_HPP From b2c9d8360974423275bf3f3034a7fe2b5c4c16a5 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Sun, 13 Apr 2025 20:41:18 +0800 Subject: [PATCH 6/6] front&excuter/reduce:sum,prod --- doc/excuter/op-mem-cuda/list.md | 4 + excuter/op-mem-cuda/src/client/tfs.cpp | 2 +- .../src/deepx/tensorfunc/cuda_math.cuh | 363 ++++++++++++++++++ .../src/deepx/tensorfunc/cuda_math.hpp | 119 ------ .../src/deepx/tensorfunc/reduce_miaobyte.cu | 207 +++++----- front/py/deepx/scheduler/client/udpconn.py | 2 +- .../py/examples/2_ir/5_reduce_sum_keepdim.dot | 48 +-- .../2_ir/5_reduce_sum_keepdim.dot.svg | 96 ++--- front/py/examples/2_ir/5_reduce_sumprod.dot | 48 +-- .../py/examples/2_ir/5_reduce_sumprod.dot.svg | 96 ++--- 10 files changed, 619 insertions(+), 366 deletions(-) create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index eab757a0..36d05762 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -4,6 +4,7 @@ | Operation | Author | Func Def | Math Formula | IR Instruction | |-----------|--------|------------|--------------|----------------| +| reducemax | miaobyte | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | | broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | | concat | miaobyte | concat(listtensor tensors, var axis)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var axis)->(tensor result) | | transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | @@ -24,8 +25,10 @@ | newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | | newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | | vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) | +| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | | subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | | sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | | argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) | | sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | | mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | @@ -40,5 +43,6 @@ | rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | | minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | | cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | +| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | | min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | | compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | compare(tensor A, tensor B)->(tensor mask) | diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 2ef18928..9cd9a9f3 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -430,7 +430,7 @@ namespace deepx::tf register_elementwise(tffactory); register_matmul(tffactory); register_changeshape(tffactory); - // register_reduce(opfactory); + register_reduce(tffactory); return 0; } } \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh new file mode 100644 index 00000000..d1828724 --- /dev/null +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh @@ -0,0 +1,363 @@ +#ifndef DEEPX_TENSORFUNC_CUDA_MATH_CUH +#define DEEPX_TENSORFUNC_CUDA_MATH_CUH + +#include +#include +#include +#include + +namespace deepx::tensorfunc +{ + + // max + template + __device__ void deepx_max(const T *a, const T *b, T *out); + + template <> + __device__ void deepx_max(const double *a, const double *b, double *out) + { + *out = fmax(*a, *b); + } + + template <> + __device__ void deepx_max(const float *a, const float *b, float *out) + { + *out = fmaxf(*a, *b); + } + + template <> + __device__ void deepx_max(const half *a, const half *b, half *out) + { + *out = __hmax(*a, *b); + } + + template <> + __device__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + { + *out = __hmax(*a, *b); + } + template <> + __device__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) + { + *out = *a > *b ? *a : *b; + } + template <> + __device__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) + { + *out = *a > *b ? *a : *b; + } + template <> + __device__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) + { + *out = *a > *b ? *a : *b; + } + template <> + __device__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) + { + *out = *a > *b ? *a : *b; + } + + // min + template + __device__ void deepx_min(const T *a, const T *b, T *out); + + template <> + __device__ void deepx_min(const double *a, const double *b, double *out) + { + *out = fmin(*a, *b); + } + + template <> + __device__ void deepx_min(const float *a, const float *b, float *out) + { + *out = fminf(*a, *b); + } + + template <> + __device__ void deepx_min(const half *a, const half *b, half *out) + { + *out = __hmin(*a, *b); + } + + template <> + __device__ void deepx_min(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + { + *out = __hmin(*a, *b); + } + + template <> + __device__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) + { + *out = *a < *b ? *a : *b; + } + + template <> + __device__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) + { + *out = *a < *b ? *a : *b; + } + + template <> + __device__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) + { + *out = *a < *b ? *a : *b; + } + + template <> + __device__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) + { + *out = *a < *b ? *a : *b; + } + + // atomicAdd + template + __device__ void deepx_atomicAdd(T *a, T b); + + template <> + __device__ void deepx_atomicAdd(double *a, double b) + { + atomicAdd(a, b); + } + + template <> + __device__ void deepx_atomicAdd(float *a, float b) + { + atomicAdd(a, b); + } + + template <> + __device__ void deepx_atomicAdd(half *a, half b) + { + atomicAdd(a, b); + } + + template <> + __device__ void deepx_atomicAdd(nv_bfloat16 *a, nv_bfloat16 b) + { + atomicAdd(a, b); + } + + template <> + __device__ void deepx_atomicAdd(int64_t *a, int64_t b) + { + int64_t old = *a; + int64_t assumed; + do + { + assumed = old; + old = atomicCAS((unsigned long long *)a, (unsigned long long)assumed, (unsigned long long)(assumed + b)); + } while (assumed != old); + *a = old + b; + } + + template <> + __device__ void deepx_atomicAdd(int32_t *a, int32_t b) + { + atomicAdd(a, b); + } + + template <> + __device__ void deepx_atomicAdd(int16_t *a, int16_t b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + unsigned int new_val; + if ((size_t)a & 2) + { + new_val = (old & 0x0000FFFF) | (((unsigned short)(((old >> 16) & 0xFFFF) + b)) << 16); + } + else + { + new_val = (old & 0xFFFF0000) | ((unsigned short)((old & 0xFFFF) + b)); + } + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicAdd(int8_t *a, int8_t b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 3)); + unsigned int old = *address_as_uint; + unsigned int assumed; + unsigned int byte_offset = ((size_t)a & 3) * 8; + unsigned int mask = 0xFF << byte_offset; + + do + { + assumed = old; + unsigned char byte_val = (old >> byte_offset) & 0xFF; + byte_val += b; + unsigned int new_val = (old & ~mask) | (byte_val << byte_offset); + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } + + + // atomicMul + // atomicMul + template + __device__ void deepx_atomicMul(T *a, T b); + + template <> + __device__ void deepx_atomicMul(double *a, double b) + { + double old = *a; + double assumed; + do + { + assumed = old; + old = __longlong_as_double(atomicCAS((unsigned long long int*)a, + __double_as_longlong(assumed), + __double_as_longlong(assumed * b))); + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicMul(float *a, float b) + { + float old = *a; + float assumed; + do + { + assumed = old; + old = __int_as_float(atomicCAS((int*)a, + __float_as_int(assumed), + __float_as_int(assumed * b))); + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicMul(half *a, half b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + half assumed_half; + if ((size_t)a & 2) + { + assumed_half = __ushort_as_half((unsigned short)(old >> 16)); + half new_half = __hmul(assumed_half, b); + unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__half_as_ushort(new_half) << 16); + old = atomicCAS(address_as_uint, assumed, new_val); + } + else + { + assumed_half = __ushort_as_half((unsigned short)(old & 0xFFFF)); + half new_half = __hmul(assumed_half, b); + unsigned int new_val = (old & 0xFFFF0000) | __half_as_ushort(new_half); + old = atomicCAS(address_as_uint, assumed, new_val); + } + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicMul(nv_bfloat16 *a, nv_bfloat16 b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + nv_bfloat16 assumed_bf16; + if ((size_t)a & 2) + { + assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old >> 16)); + nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b); + unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__bfloat16_as_ushort(new_bf16) << 16); + old = atomicCAS(address_as_uint, assumed, new_val); + } + else + { + assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old & 0xFFFF)); + nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b); + unsigned int new_val = (old & 0xFFFF0000) | __bfloat16_as_ushort(new_bf16); + old = atomicCAS(address_as_uint, assumed, new_val); + } + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicMul(int64_t *a, int64_t b) + { + int64_t old = *a; + int64_t assumed; + do + { + assumed = old; + old = atomicCAS((unsigned long long *)a, + (unsigned long long)assumed, + (unsigned long long)(assumed * b)); + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicMul(int32_t *a, int32_t b) + { + int32_t old = *a; + int32_t assumed; + do + { + assumed = old; + old = atomicCAS((int32_t *)a, assumed, assumed * b); + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicMul(int16_t *a, int16_t b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2)); + unsigned int old = *address_as_uint; + unsigned int assumed; + + do + { + assumed = old; + unsigned int new_val; + if ((size_t)a & 2) + { + int16_t assumed_short = (int16_t)(old >> 16); + new_val = (old & 0x0000FFFF) | (((unsigned short)(assumed_short * b)) << 16); + } + else + { + int16_t assumed_short = (int16_t)(old & 0xFFFF); + new_val = (old & 0xFFFF0000) | ((unsigned short)(assumed_short * b)); + } + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } + + template <> + __device__ void deepx_atomicMul(int8_t *a, int8_t b) + { + unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 3)); + unsigned int old = *address_as_uint; + unsigned int assumed; + unsigned int byte_offset = ((size_t)a & 3) * 8; + unsigned int mask = 0xFF << byte_offset; + + do + { + assumed = old; + int8_t byte_val = (old >> byte_offset) & 0xFF; + byte_val *= b; + unsigned int new_val = (old & ~mask) | ((byte_val & 0xFF) << byte_offset); + old = atomicCAS(address_as_uint, assumed, new_val); + } while (assumed != old); + } + +} + +#endif // DEEPX_TENSORFUNC_CUDA_MATH_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp deleted file mode 100644 index f0a3c578..00000000 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef DEEPX_TENSORFUNC_CUDA_MATH_HPP -#define DEEPX_TENSORFUNC_CUDA_MATH_HPP - -#include -#include -#include -#include - -namespace deepx::tensorfunc -{ - - // max - template - __device__ void deepx_max(const T *a, const T *b, T *out); - - template <> - __device__ void deepx_max(const double *a, const double *b, double *out) - { - *out = fmax(*a, *b); - } - - template <> - __device__ void deepx_max(const float *a, const float *b, float *out) - { - *out = fmaxf(*a, *b); - } - - template <> - __device__ void deepx_max(const half *a, const half *b, half *out) - { - *out = __hmax(*a, *b); - } - - template <> - __device__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) - { - *out = __hmax(*a, *b); - } - template <> - __device__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) - { - *out = *a>*b?*a:*b; - } - template <> - __device__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) - { - *out = *a>*b?*a:*b; - } - template <> - __device__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) - { - *out = *a>*b?*a:*b; - } - template <> - __device__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) - { - *out = *a>*b?*a:*b; - } - - - // min - template - __device__ void deepx_min(const T *a, const T *b, T *out); - - - template <> - __device__ void deepx_min(const double *a, const double *b, double *out) - { - *out = fmin(*a, *b); - } - - template <> - __device__ void deepx_min(const float *a, const float *b, float *out) - { - *out = fminf(*a, *b); - } - - - template <> - __device__ void deepx_min(const half *a, const half *b, half *out) - { - *out = __hmin(*a, *b); - } - - template <> - __device__ void deepx_min(const nv_bfloat16 *a,const nv_bfloat16 *b, nv_bfloat16 *out) - { - *out = __hmin(*a, *b); - } - - template <> - __device__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) - { - *out = *a<*b?*a:*b; - } - - template <> - __device__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) - { - *out = *a<*b?*a:*b; - } - - template <> - __device__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) - { - *out = *a<*b?*a:*b; - } - - template <> - __device__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) - { - *out = *a<*b?*a:*b; - } - - - -} - -#endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu index c7df6cf6..43717698 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu @@ -9,15 +9,18 @@ #include "deepx/tensorfunc/reduce_miaobyte.cuh" #include "deepx/tensorfunc/tensor_cuda.cuh" #include "deepx/tensorfunc/vector_cuda.cuh" -#include "deepx/tensorfunc/cuda_math.hpp" +#include "deepx/tensorfunc/cuda_math.cuh" namespace deepx::tensorfunc { + + // sum + //DIM是希望申请寄存器中存放索引数组的长度 template __global__ void sum_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - T *result_data, const int *result_strides, const int result_dim) + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) { const int grid_stride = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -39,19 +42,20 @@ namespace deepx::tensorfunc } int outputIdx = linearAt(result_strides, result_dim, output_indices); int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); - result_data[outputIdx] += tensor_data[inputIdx]; + deepx_atomicAdd(result_data + outputIdx, tensor_data[inputIdx]); } } - + template __host__ void launch_sum(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - T *result_data, const int *result_strides, const int result_dim) + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) { auto [numBlocks, blockSize] = BestDims(tensor_len); + // int shared_mem_size = blockSize * sizeof(T) + sizeof(int) * tensor_dim; cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); - cudaVector reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); int powDim = nextPowerOf2(tensor_dim); switch (powDim) @@ -66,13 +70,13 @@ namespace deepx::tensorfunc sum_kernel<4, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; case 8: - sum_kernel<8, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len,reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + sum_kernel<8, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; case 16: sum_kernel<16, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; case 32: - sum_kernel<32, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); + sum_kernel<32, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; case 64: sum_kernel<64, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); @@ -86,35 +90,35 @@ namespace deepx::tensorfunc } template void launch_sum(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - double *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); template void launch_sum(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - float *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); template void launch_sum(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); template void launch_sum<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - __half *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); template void launch_sum(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int64_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); template void launch_sum(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int32_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); template void launch_sum(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int16_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); template void launch_sum(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int8_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); - //prod + // prod template - __global__ void prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - T *result_data, const int *result_strides, const int result_dim) + __global__ void prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) { const int grid_stride = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -136,19 +140,19 @@ namespace deepx::tensorfunc } int outputIdx = linearAt(result_strides, result_dim, output_indices); int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); - result_data[outputIdx] *= tensor_data[inputIdx]; + deepx_atomicMul(result_data + outputIdx, tensor_data[inputIdx]); } } template void launch_prod(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - T *result_data, const int *result_strides, const int result_dim) + const int *reduced_dims, const bool keepdims, + T *result_data, const int *result_strides, const int result_dim) { auto [numBlocks, blockSize] = BestDims(tensor_len); cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); - cudaVector reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); int powDim = nextPowerOf2(tensor_dim); switch (powDim) @@ -183,36 +187,37 @@ namespace deepx::tensorfunc } template void launch_prod(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - double *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); template void launch_prod(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - float *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); template void launch_prod(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); template void launch_prod<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - __half *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); template void launch_prod(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int64_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); template void launch_prod(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int32_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); template void launch_prod(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int16_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); template void launch_prod(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int8_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); - //max + // max template __global__ void max_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, const int *reduced_dims, const bool keepdims, - T *result_data, const int *result_strides, const int result_dim){ - const int grid_stride = gridDim.x * blockDim.x; + T *result_data, const int *result_strides, const int result_dim) + { + const int grid_stride = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; for (; thread_id < tensor_len; thread_id += grid_stride) { @@ -229,10 +234,10 @@ namespace deepx::tensorfunc { output_indices[j++] = 0; } - } + } int outputIdx = linearAt(result_strides, result_dim, output_indices); int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); - deepx_max(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx); + deepx_max(result_data + outputIdx, tensor_data + inputIdx, result_data + outputIdx); } } @@ -244,7 +249,7 @@ namespace deepx::tensorfunc auto [numBlocks, blockSize] = BestDims(tensor_len); cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); - cudaVector reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); int powDim = nextPowerOf2(tensor_dim); switch (powDim) @@ -279,36 +284,37 @@ namespace deepx::tensorfunc }; template void launch_reducemax(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - double *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); template void launch_reducemax(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - float *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); template void launch_reducemax(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); template void launch_reducemax<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - __half *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); template void launch_reducemax(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int64_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); template void launch_reducemax(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int32_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); template void launch_reducemax(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int16_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); template void launch_reducemax(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int8_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); - //min + // min template __global__ void min_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, const int *reduced_dims, const bool keepdims, - T *result_data, const int *result_strides, const int result_dim){ - const int grid_stride = gridDim.x * blockDim.x; + T *result_data, const int *result_strides, const int result_dim) + { + const int grid_stride = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; for (; thread_id < tensor_len; thread_id += grid_stride) { @@ -325,11 +331,11 @@ namespace deepx::tensorfunc { output_indices[j++] = 0; } - } + } int outputIdx = linearAt(result_strides, result_dim, output_indices); int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices); - deepx_min(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx); - } + deepx_min(result_data + outputIdx, tensor_data + inputIdx, result_data + outputIdx); + } } template @@ -337,10 +343,10 @@ namespace deepx::tensorfunc const int *reduced_dims, const bool keepdims, T *result_data, const int *result_strides, const int result_dim) { - auto [numBlocks, blockSize] = BestDims(tensor_len); + auto [numBlocks, blockSize] = BestDims(tensor_len); cudaVector tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice); cudaVector result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice); - cudaVector reduced_dims_d(reduced_dims , tensor_dim, cudaMemcpyHostToDevice); + cudaVector reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice); int powDim = nextPowerOf2(tensor_dim); switch (powDim) @@ -353,13 +359,13 @@ namespace deepx::tensorfunc break; case 4: min_kernel<4, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; + break; case 8: min_kernel<8, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; case 16: min_kernel<16, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); - break; + break; case 32: min_kernel<32, T><<>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim); break; @@ -372,33 +378,32 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max support 128"); } - } - + } + template void launch_reducemin(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - double *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + double *result_data, const int *result_strides, const int result_dim); template void launch_reducemin(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - float *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + float *result_data, const int *result_strides, const int result_dim); template void launch_reducemin(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - nv_bfloat16 *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + nv_bfloat16 *result_data, const int *result_strides, const int result_dim); template void launch_reducemin<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - __half *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + __half *result_data, const int *result_strides, const int result_dim); template void launch_reducemin(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int64_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int64_t *result_data, const int *result_strides, const int result_dim); template void launch_reducemin(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int32_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int32_t *result_data, const int *result_strides, const int result_dim); template void launch_reducemin(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int16_t *result_data, const int *result_strides, const int result_dim); + const int *reduced_dims, const bool keepdims, + int16_t *result_data, const int *result_strides, const int result_dim); template void launch_reducemin(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, - const int *reduced_dims, const bool keepdims, - int8_t *result_data, const int *result_strides, const int result_dim); - + const int *reduced_dims, const bool keepdims, + int8_t *result_data, const int *result_strides, const int result_dim); } #endif // DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py index a25b0963..6a12c26a 100644 --- a/front/py/deepx/scheduler/client/udpconn.py +++ b/front/py/deepx/scheduler/client/udpconn.py @@ -3,7 +3,7 @@ import select class UDPConn: - def __init__(self, endpoint: str = "localhost:8080"): + def __init__(self, endpoint: str = "localhost:9090"): # 解析endpoint self._host, port_str = endpoint.split(':') self._port = int(port_str) diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot index 93785caa..fdc348fe 100644 --- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot +++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot @@ -2,37 +2,37 @@ digraph { rankdir=TB node [shape=record] - 134049762132704 [label="t + 134483478251840 [label="t (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047466295776 [label="s + 134481220276224 [label="s (1, 4, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464790368 [label="vector_1 + 134481218754432 [label="vector_1 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464790512 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134047464790272 [label="p + 134481218754576 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134481218754336 [label="p (3, 1, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464790608 [label="vector_2 + 134481218754672 [label="vector_2 [1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464790656 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134049762132560 [label="t1 + 134481218754720 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134483478251408 [label="t1 (4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464791088 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134047464790848 [label="var_1 + 134481218755152 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134481218754912 [label="var_1 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464790800 [label="t2 + 134481218754864 [label="t2 (1, 1, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464790464 [label="vector_3 + 134481218754528 [label="vector_3 [0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 134047464791328 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 134047464790512 -> 134047466295776 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134049762132704 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134047464790368 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134047464790656 -> 134047464790272 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134049762132704 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134047464790608 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134047464791088 -> 134049762132560 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134047464790848 -> 134047464791088 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134047464791328 -> 134047464790800 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134049762132560 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2] - 134047464790464 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218755392 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 134481218754576 -> 134481220276224 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134483478251840 -> 134481218754576 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218754432 -> 134481218754576 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218754720 -> 134481218754336 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134483478251840 -> 134481218754720 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218754672 -> 134481218754720 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218755152 -> 134483478251408 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218754912 -> 134481218755152 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218755392 -> 134481218754864 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134483478251408 -> 134481218755392 [arrowsize=0.8 color=gray40 penwidth=1.2] + 134481218754528 -> 134481218755392 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg index bf6cc407..3786069a 100644 --- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg +++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg @@ -9,156 +9,156 @@ %3 - + -134049762132704 +134483478251840 t (3, 4, 5) - + -134047464790512 +134481218754576 sum - + -134049762132704->134047464790512 +134483478251840->134481218754576 - + -134047464790656 +134481218754720 prod - + -134049762132704->134047464790656 +134483478251840->134481218754720 - + -134047466295776 +134481220276224 s (1, 4, 1) - + -134047464790368 +134481218754432 vector_1 [0, 2] - + -134047464790368->134047464790512 +134481218754432->134481218754576 - + -134047464790512->134047466295776 +134481218754576->134481220276224 - + -134047464790272 +134481218754336 p (3, 1, 5) - + -134047464790608 +134481218754672 vector_2 [1] - + -134047464790608->134047464790656 +134481218754672->134481218754720 - + -134047464790656->134047464790272 +134481218754720->134481218754336 - + -134049762132560 +134483478251408 t1 (4, 5, 6) - + -134047464791328 +134481218755392 sum - + -134049762132560->134047464791328 +134483478251408->134481218755392 - + -134047464791088 +134481218755152 constant - + -134047464791088->134049762132560 +134481218755152->134483478251408 - + -134047464790848 +134481218754912 var_1 1 - + -134047464790848->134047464791088 +134481218754912->134481218755152 - + -134047464790800 +134481218754864 t2 (1, 1, 6) - + -134047464790464 +134481218754528 vector_3 [0, 1] - + -134047464790464->134047464791328 +134481218754528->134481218755392 - + -134047464791328->134047464790800 +134481218755392->134481218754864 diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot b/front/py/examples/2_ir/5_reduce_sumprod.dot index 5dfc772d..ecaf7883 100644 --- a/front/py/examples/2_ir/5_reduce_sumprod.dot +++ b/front/py/examples/2_ir/5_reduce_sumprod.dot @@ -2,37 +2,37 @@ digraph { rankdir=TB node [shape=record] - 124690717936416 [label="t + 137168675850560 [label="t (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124689026191840 [label="s + 137168368280016 [label="s (4,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742603248 [label="vector_1 + 137168309032976 [label="vector_1 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742602864 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124688742603104 [label="p + 137168309032592 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137168309032832 [label="p (3, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742603392 [label="vector_2 + 137168309033120 [label="vector_2 [1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742603440 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124690717935984 [label="t1 + 137168309033168 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137168675850128 [label="t1 (4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742603872 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124688742603632 [label="var_1 + 137168309033600 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137168309033360 [label="var_1 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742603584 [label="t2 + 137168309033312 [label="t2 (6,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742604112 [label="vector_3 + 137168309033840 [label="vector_3 [0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled] - 124688742603920 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] - 124688742602864 -> 124689026191840 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124690717936416 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124688742603248 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124688742603440 -> 124688742603104 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124690717936416 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124688742603392 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124688742603872 -> 124690717935984 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124688742603632 -> 124688742603872 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124688742603920 -> 124688742603584 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124690717935984 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2] - 124688742604112 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309033648 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled] + 137168309032592 -> 137168368280016 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168675850560 -> 137168309032592 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309032976 -> 137168309032592 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309033168 -> 137168309032832 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168675850560 -> 137168309033168 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309033120 -> 137168309033168 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309033600 -> 137168675850128 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309033360 -> 137168309033600 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309033648 -> 137168309033312 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168675850128 -> 137168309033648 [arrowsize=0.8 color=gray40 penwidth=1.2] + 137168309033840 -> 137168309033648 [arrowsize=0.8 color=gray40 penwidth=1.2] } diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg index dc32bc57..e98414d2 100644 --- a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg +++ b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg @@ -9,156 +9,156 @@ %3 - + -124690717936416 +137168675850560 t (3, 4, 5) - + -124688742602864 +137168309032592 sum - + -124690717936416->124688742602864 +137168675850560->137168309032592 - + -124688742603440 +137168309033168 prod - + -124690717936416->124688742603440 +137168675850560->137168309033168 - + -124689026191840 +137168368280016 s (4,) - + -124688742603248 +137168309032976 vector_1 [0, 2] - + -124688742603248->124688742602864 +137168309032976->137168309032592 - + -124688742602864->124689026191840 +137168309032592->137168368280016 - + -124688742603104 +137168309032832 p (3, 5) - + -124688742603392 +137168309033120 vector_2 [1] - + -124688742603392->124688742603440 +137168309033120->137168309033168 - + -124688742603440->124688742603104 +137168309033168->137168309032832 - + -124690717935984 +137168675850128 t1 (4, 5, 6) - + -124688742603920 +137168309033648 sum - + -124690717935984->124688742603920 +137168675850128->137168309033648 - + -124688742603872 +137168309033600 constant - + -124688742603872->124690717935984 +137168309033600->137168675850128 - + -124688742603632 +137168309033360 var_1 1 - + -124688742603632->124688742603872 +137168309033360->137168309033600 - + -124688742603584 +137168309033312 t2 (6,) - + -124688742604112 +137168309033840 vector_3 [0, 1] - + -124688742604112->124688742603920 +137168309033840->137168309033648 - + -124688742603920->124688742603584 +137168309033648->137168309033312