From 917b2cb683d8caa6ca75338c896637148ce58451 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Thu, 10 Apr 2025 21:47:36 +0800
Subject: [PATCH 1/6] =?UTF-8?q?fixbug:keepkdim=E9=97=AE=E9=A2=98=E8=A7=A3?=
 =?UTF-8?q?=E5=86=B3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/deepx/tensorfunc/reduce_miaobyte.hpp  |  30 ++-
 .../py/examples/2_ir/5_reduce_sum_keepdim.dot |  43 ++--
 .../2_ir/5_reduce_sum_keepdim.dot.svg         | 197 ++++++++++++------
 .../py/examples/2_ir/5_reduce_sum_keepdim.py  |   2 +-
 front/py/examples/2_ir/5_reduce_sumprod.dot   |  48 ++---
 .../py/examples/2_ir/5_reduce_sumprod.dot.svg |  96 ++++-----
 6 files changed, 253 insertions(+), 163 deletions(-)
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
index 9d7e37c8..424cacd6 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
@@ -39,7 +39,7 @@ namespace deepx::tensorfunc
                         {
                             newIndices[j++] = indices[i];
                         }else if (keepdims && (reduced_dims[i] == 1)) {
-                            newIndices[j++] = indices[i];
+                            newIndices[j++] = 0;
                         }
                     }
                     int outputIdx = result.shape.linearat(newIndices);
@@ -58,7 +58,7 @@ namespace deepx::tensorfunc
                         {
                             newIndices[j++] = indices[i];
                         }else if (keepdims && (reduced_dims[i] == 1)) {
-                            newIndices[j++] = indices[i];
+                            newIndices[j++] = 0;
                         }
                     }
                     int outputIdx = result.shape.linearat(newIndices);
@@ -104,7 +104,7 @@ namespace deepx::tensorfunc
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
             const int minshape_1 = Lanes(ScalableTag<T>());
             // 如果dims的最后一个元素是tensor.shape.dim-1，则说明reduceprod的数据不连续（不对齐），无法simd（需要不停跳跃）
-            constant<miaobyte,T>(result, T(1));
+            constant<miaobyte, T>(result, T(1));
             if (reduced_dims.rbegin()[0] == tensor.shape.dim - 1 || tensor.shape.dim > reduced_dims.size() || tensor.shape[-1] >= minshape_1)
             {
                 tensor.shape.rangeParallel(tensor.shape.dim, [&tensor, &result, &reduced_dims, keepdims](const int idx_linear, const std::vector<int> &indices, std::vector<int> &newIndices)
@@ -115,7 +115,7 @@ namespace deepx::tensorfunc
                                 if (reduced_dims[i]==0) {
                                         newIndices[j++]=indices[i];
                                     }else if (keepdims && (reduced_dims[i] == 1)) {
-                                        newIndices[j++]=indices[i];
+                                        newIndices[j++]=0;
                                     }
                                 }
                             // 累加求和
@@ -136,7 +136,7 @@ namespace deepx::tensorfunc
                                                    {
                                                        newIndices[j++] = indices[i];
                                                    }else if (keepdims && (reduced_dims[i] == 1)) {
-                                                       newIndices[j++] = indices[i];
+                                                       newIndices[j++] = 0;
                                                    }
                                                }
                                                // 累加求和
@@ -198,13 +198,12 @@ namespace deepx::tensorfunc
                                 if (reduced_dims[i]==0) {
                                         newIndices[j++]=indices[i];
                                     }else if (keepdims && (reduced_dims[i] == 1)) {
-                                        newIndices[j++]=indices[i];
+                                        newIndices[j++]=0;
                                     }
                                 }
                             // 累加求和
                             int outputIdx=result.shape.linearat(newIndices);
-                            result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]);
-                            }, result.shape.dim);
+                            result.data[outputIdx]=std::max(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim);
             }
             else
             {
@@ -219,7 +218,7 @@ namespace deepx::tensorfunc
                                                    {
                                                        newIndices[j++] = indices[i];
                                                    }else if (keepdims && (reduced_dims[i] == 1)) {
-                                                       newIndices[j++] = indices[i];
+                                                       newIndices[j++] =0;
                                                    }
                                                }
                                                
@@ -255,8 +254,7 @@ namespace deepx::tensorfunc
                                                    maxt = std::max(maxt,tensor.data[i + j]);
                                                }
  
-                                               result.data[outputIdx] = std::max(result.data[outputIdx],maxt);
-                                                }, result.shape.dim);
+                                               result.data[outputIdx] = std::max(result.data[outputIdx],maxt); }, result.shape.dim);
             }
         }
     };
@@ -281,14 +279,13 @@ namespace deepx::tensorfunc
                                 if (reduced_dims[i]==0) {
                                         newIndices[j++]=indices[i];
                                     }else if (keepdims && (reduced_dims[i] == 1)) {
-                                        newIndices[j++]=indices[i];
+                                        newIndices[j++]=0;
                                     }
                                 }
                             // 累加求和
                             int outputIdx=result.shape.linearat(newIndices);
  
-                            result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]);
-                            }, result.shape.dim);
+                            result.data[outputIdx]=std::min(result.data[outputIdx],tensor.data[idx_linear]); }, result.shape.dim);
             }
             else
             {
@@ -303,7 +300,7 @@ namespace deepx::tensorfunc
                                                    {
                                                        newIndices[j++] = indices[i];
                                                    }else if (keepdims && (reduced_dims[i] == 1)) {
-                                                       newIndices[j++] = indices[i];
+                                                       newIndices[j++] = 0;
                                                    }
                                                }
                                                
@@ -339,8 +336,7 @@ namespace deepx::tensorfunc
                                                    mint = std::min(mint,tensor.data[i + j]);
                                                }
  
-                                               result.data[outputIdx] = std::min(result.data[outputIdx],mint); 
-                                               }, result.shape.dim);
+                                               result.data[outputIdx] = std::min(result.data[outputIdx],mint); }, result.shape.dim);
             }
         }
     };
diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot
index 60e69f3d..93785caa 100644
--- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot
+++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot
@@ -2,20 +2,37 @@
 digraph {
 	rankdir=TB
 	node [shape=record]
-	138334896593968 [label="tensor_1
+	134049762132704 [label="t
 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	138336870303440 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	138336870304208 [label="vector_1
-(3, 4, 5)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	138334610649632 [label="s1
+	134047466295776 [label="s
 (1, 4, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	138334610649680 [label="vector_2
+	134047464790368 [label="vector_1
 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	138334610649536 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	138336870303440 -> 138334896593968 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	138334896593968 -> 138336870303440 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	138336870304208 -> 138336870303440 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	138334610649536 -> 138334610649632 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	138334896593968 -> 138334610649536 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	138334610649680 -> 138334610649536 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464790512 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134047464790272 [label="p
+(3, 1, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
+	134047464790608 [label="vector_2
+[1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
+	134047464790656 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134049762132560 [label="t1
+(4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
+	134047464791088 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134047464790848 [label="var_1
+1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
+	134047464790800 [label="t2
+(1, 1, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
+	134047464790464 [label="vector_3
+[0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
+	134047464791328 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134047464790512 -> 134047466295776 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134049762132704 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464790368 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464790656 -> 134047464790272 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134049762132704 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464790608 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464791088 -> 134049762132560 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464790848 -> 134047464791088 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464791328 -> 134047464790800 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134049762132560 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134047464790464 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2]
 }
diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg
index f0d4ee5c..bf6cc407 100644
--- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg
+++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg
@@ -4,86 +4,163 @@
 <!-- Generated by graphviz version 2.43.0 (0)
  -->
 <!-- Title: %3 Pages: 1 -->
-<svg width="262pt" height="192pt"
- viewBox="0.00 0.00 262.00 192.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 188)">
+<svg width="382pt" height="338pt"
+ viewBox="0.00 0.00 381.50 338.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 334)">
 <title>%3</title>
-<polygon fill="white" stroke="transparent" points="-4,4 -4,-188 258,-188 258,4 -4,4"/>
-<!-- 138334896593968 -->
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-334 377.5,-334 377.5,4 -4,4"/>
+<!-- 134049762132704 -->
 <g id="node1" class="node">
-<title>138334896593968</title>
-<polygon fill="aliceblue" stroke="skyblue" points="163.5,-184 90.5,-184 90.5,-146 163.5,-146 163.5,-184"/>
-<text text-anchor="middle" x="127" y="-168.8" font-family="Sans-Serif" font-size="14.00">tensor_1</text>
-<text text-anchor="middle" x="127" y="-153.8" font-family="Sans-Serif" font-size="14.00">(3, 4, 5)</text>
+<title>134049762132704</title>
+<polygon fill="aliceblue" stroke="skyblue" points="152,-330 90,-330 90,-292 152,-292 152,-330"/>
+<text text-anchor="middle" x="121" y="-314.8" font-family="Sans-Serif" font-size="14.00">t</text>
+<text text-anchor="middle" x="121" y="-299.8" font-family="Sans-Serif" font-size="14.00">(3, 4, 5)</text>
 </g>
-<!-- 138336870303440 -->
-<g id="node2" class="node">
-<title>138336870303440</title>
-<polygon fill="lightgray" stroke="darkslategray" points="164,-110 90,-110 90,-74 164,-74 164,-110"/>
-<text text-anchor="middle" x="127" y="-88.3" font-family="Courier Bold" font-size="14.00">reshape</text>
+<!-- 134047464790512 -->
+<g id="node4" class="node">
+<title>134047464790512</title>
+<polygon fill="lightgray" stroke="darkslategray" points="107,-256 53,-256 53,-220 107,-220 107,-256"/>
+<text text-anchor="middle" x="80" y="-234.3" font-family="Courier Bold" font-size="14.00">sum</text>
 </g>
-<!-- 138334896593968&#45;&gt;138336870303440 -->
+<!-- 134049762132704&#45;&gt;134047464790512 -->
 <g id="edge2" class="edge">
-<title>138334896593968&#45;&gt;138336870303440</title>
-<path fill="none" stroke="#666666" stroke-width="1.2" d="M121.01,-145.72C120.2,-137.29 120.04,-127.15 120.52,-118.02"/>
-<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="123.31,-118.2 121.14,-110 117.73,-117.76 123.31,-118.2"/>
+<title>134049762132704&#45;&gt;134047464790512</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M110.44,-291.72C105.42,-283.03 99.36,-272.52 93.97,-263.19"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="96.24,-261.53 89.82,-256 91.39,-264.33 96.24,-261.53"/>
 </g>
-<!-- 138334610649536 -->
-<g id="node6" class="node">
-<title>138334610649536</title>
-<polygon fill="lightgray" stroke="darkslategray" points="240,-110 186,-110 186,-74 240,-74 240,-110"/>
-<text text-anchor="middle" x="213" y="-88.3" font-family="Courier Bold" font-size="14.00">sum</text>
+<!-- 134047464790656 -->
+<g id="node7" class="node">
+<title>134047464790656</title>
+<polygon fill="lightgray" stroke="darkslategray" points="189,-256 135,-256 135,-220 189,-220 189,-256"/>
+<text text-anchor="middle" x="162" y="-234.3" font-family="Courier Bold" font-size="14.00">prod</text>
 </g>
-<!-- 138334896593968&#45;&gt;138334610649536 -->
+<!-- 134049762132704&#45;&gt;134047464790656 -->
 <g id="edge5" class="edge">
-<title>138334896593968&#45;&gt;138334610649536</title>
-<path fill="none" stroke="#666666" stroke-width="1.2" d="M149.15,-145.72C160.4,-136.42 174.17,-125.06 186.04,-115.26"/>
-<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="188.02,-117.26 192.4,-110 184.45,-112.94 188.02,-117.26"/>
+<title>134049762132704&#45;&gt;134047464790656</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M131.56,-291.72C136.58,-283.03 142.64,-272.52 148.03,-263.19"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="150.61,-264.33 152.18,-256 145.76,-261.53 150.61,-264.33"/>
 </g>
-<!-- 138336870303440&#45;&gt;138334896593968 -->
-<g id="edge1" class="edge">
-<title>138336870303440&#45;&gt;138334896593968</title>
-<path fill="none" stroke="#666666" stroke-width="1.2" d="M132.86,-110C133.73,-118.25 133.96,-128.35 133.55,-137.57"/>
-<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="130.75,-137.54 132.99,-145.72 136.33,-137.93 130.75,-137.54"/>
+<!-- 134047466295776 -->
+<g id="node2" class="node">
+<title>134047466295776</title>
+<polygon fill="aliceblue" stroke="skyblue" points="111,-184 49,-184 49,-146 111,-146 111,-184"/>
+<text text-anchor="middle" x="80" y="-168.8" font-family="Sans-Serif" font-size="14.00">s</text>
+<text text-anchor="middle" x="80" y="-153.8" font-family="Sans-Serif" font-size="14.00">(1, 4, 1)</text>
 </g>
-<!-- 138336870304208 -->
+<!-- 134047464790368 -->
 <g id="node3" class="node">
-<title>138336870304208</title>
-<polygon fill="honeydew" stroke="darkseagreen" points="72,-184 0,-184 0,-146 72,-146 72,-184"/>
-<text text-anchor="middle" x="36" y="-168.8" font-family="Sans-Serif" font-size="14.00">vector_1</text>
-<text text-anchor="middle" x="36" y="-153.8" font-family="Sans-Serif" font-size="14.00">(3, 4, 5)</text>
+<title>134047464790368</title>
+<polygon fill="honeydew" stroke="darkseagreen" points="72,-330 0,-330 0,-292 72,-292 72,-330"/>
+<text text-anchor="middle" x="36" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_1</text>
+<text text-anchor="middle" x="36" y="-299.8" font-family="Sans-Serif" font-size="14.00">[0, 2]</text>
 </g>
-<!-- 138336870304208&#45;&gt;138336870303440 -->
+<!-- 134047464790368&#45;&gt;134047464790512 -->
 <g id="edge3" class="edge">
-<title>138336870304208&#45;&gt;138336870303440</title>
-<path fill="none" stroke="#666666" stroke-width="1.2" d="M59.43,-145.72C71.45,-136.34 86.18,-124.85 98.82,-114.99"/>
-<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="100.62,-117.13 105.21,-110 97.18,-112.72 100.62,-117.13"/>
+<title>134047464790368&#45;&gt;134047464790512</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M47.33,-291.72C52.77,-282.94 59.35,-272.31 65.18,-262.91"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="67.63,-264.28 69.46,-256 62.87,-261.33 67.63,-264.28"/>
 </g>
-<!-- 138334610649632 -->
-<g id="node4" class="node">
-<title>138334610649632</title>
-<polygon fill="aliceblue" stroke="skyblue" points="244,-38 182,-38 182,0 244,0 244,-38"/>
-<text text-anchor="middle" x="213" y="-22.8" font-family="Sans-Serif" font-size="14.00">s1</text>
-<text text-anchor="middle" x="213" y="-7.8" font-family="Sans-Serif" font-size="14.00">(1, 4, 1)</text>
+<!-- 134047464790512&#45;&gt;134047466295776 -->
+<g id="edge1" class="edge">
+<title>134047464790512&#45;&gt;134047466295776</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M80,-219.81C80,-211.52 80,-201.39 80,-192.16"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="82.8,-192.02 80,-184.02 77.2,-192.02 82.8,-192.02"/>
 </g>
-<!-- 138334610649680 -->
+<!-- 134047464790272 -->
 <g id="node5" class="node">
-<title>138334610649680</title>
-<polygon fill="honeydew" stroke="darkseagreen" points="254,-184 182,-184 182,-146 254,-146 254,-184"/>
-<text text-anchor="middle" x="218" y="-168.8" font-family="Sans-Serif" font-size="14.00">vector_2</text>
-<text text-anchor="middle" x="218" y="-153.8" font-family="Sans-Serif" font-size="14.00">[0, 2]</text>
+<title>134047464790272</title>
+<polygon fill="aliceblue" stroke="skyblue" points="193,-184 131,-184 131,-146 193,-146 193,-184"/>
+<text text-anchor="middle" x="162" y="-168.8" font-family="Sans-Serif" font-size="14.00">p</text>
+<text text-anchor="middle" x="162" y="-153.8" font-family="Sans-Serif" font-size="14.00">(3, 1, 5)</text>
 </g>
-<!-- 138334610649680&#45;&gt;138334610649536 -->
+<!-- 134047464790608 -->
+<g id="node6" class="node">
+<title>134047464790608</title>
+<polygon fill="honeydew" stroke="darkseagreen" points="242,-330 170,-330 170,-292 242,-292 242,-330"/>
+<text text-anchor="middle" x="206" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_2</text>
+<text text-anchor="middle" x="206" y="-299.8" font-family="Sans-Serif" font-size="14.00">[1]</text>
+</g>
+<!-- 134047464790608&#45;&gt;134047464790656 -->
 <g id="edge6" class="edge">
-<title>138334610649680&#45;&gt;138334610649536</title>
-<path fill="none" stroke="#666666" stroke-width="1.2" d="M216.71,-145.72C216.12,-137.29 215.4,-127.15 214.76,-118.02"/>
-<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="217.55,-117.79 214.2,-110 211.97,-118.18 217.55,-117.79"/>
+<title>134047464790608&#45;&gt;134047464790656</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M194.67,-291.72C189.23,-282.94 182.65,-272.31 176.82,-262.91"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="179.13,-261.33 172.54,-256 174.37,-264.28 179.13,-261.33"/>
 </g>
-<!-- 138334610649536&#45;&gt;138334610649632 -->
+<!-- 134047464790656&#45;&gt;134047464790272 -->
 <g id="edge4" class="edge">
-<title>138334610649536&#45;&gt;138334610649632</title>
-<path fill="none" stroke="#666666" stroke-width="1.2" d="M213,-73.81C213,-65.52 213,-55.39 213,-46.16"/>
-<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="215.8,-46.02 213,-38.02 210.2,-46.02 215.8,-46.02"/>
+<title>134047464790656&#45;&gt;134047464790272</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M162,-219.81C162,-211.52 162,-201.39 162,-192.16"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="164.8,-192.02 162,-184.02 159.2,-192.02 164.8,-192.02"/>
+</g>
+<!-- 134049762132560 -->
+<g id="node8" class="node">
+<title>134049762132560</title>
+<polygon fill="aliceblue" stroke="skyblue" points="363,-184 301,-184 301,-146 363,-146 363,-184"/>
+<text text-anchor="middle" x="332" y="-168.8" font-family="Sans-Serif" font-size="14.00">t1</text>
+<text text-anchor="middle" x="332" y="-153.8" font-family="Sans-Serif" font-size="14.00">(4, 5, 6)</text>
+</g>
+<!-- 134047464791328 -->
+<g id="node13" class="node">
+<title>134047464791328</title>
+<polygon fill="lightgray" stroke="darkslategray" points="316,-110 262,-110 262,-74 316,-74 316,-110"/>
+<text text-anchor="middle" x="289" y="-88.3" font-family="Courier Bold" font-size="14.00">sum</text>
+</g>
+<!-- 134049762132560&#45;&gt;134047464791328 -->
+<g id="edge10" class="edge">
+<title>134049762132560&#45;&gt;134047464791328</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M320.93,-145.72C315.61,-136.94 309.18,-126.31 303.48,-116.91"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="305.84,-115.4 299.3,-110 301.05,-118.3 305.84,-115.4"/>
+</g>
+<!-- 134047464791088 -->
+<g id="node9" class="node">
+<title>134047464791088</title>
+<polygon fill="lightgray" stroke="darkslategray" points="373.5,-256 290.5,-256 290.5,-220 373.5,-220 373.5,-256"/>
+<text text-anchor="middle" x="332" y="-234.3" font-family="Courier Bold" font-size="14.00">constant</text>
+</g>
+<!-- 134047464791088&#45;&gt;134049762132560 -->
+<g id="edge7" class="edge">
+<title>134047464791088&#45;&gt;134049762132560</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M332,-219.81C332,-211.52 332,-201.39 332,-192.16"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="334.8,-192.02 332,-184.02 329.2,-192.02 334.8,-192.02"/>
+</g>
+<!-- 134047464790848 -->
+<g id="node10" class="node">
+<title>134047464790848</title>
+<polygon fill="moccasin" stroke="orange" points="359,-330 305,-330 305,-292 359,-292 359,-330"/>
+<text text-anchor="middle" x="332" y="-314.8" font-family="Sans-Serif" font-size="14.00">var_1</text>
+<text text-anchor="middle" x="332" y="-299.8" font-family="Sans-Serif" font-size="14.00">1</text>
+</g>
+<!-- 134047464790848&#45;&gt;134047464791088 -->
+<g id="edge8" class="edge">
+<title>134047464790848&#45;&gt;134047464791088</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M332,-291.72C332,-283.29 332,-273.15 332,-264.02"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="334.8,-264 332,-256 329.2,-264 334.8,-264"/>
+</g>
+<!-- 134047464790800 -->
+<g id="node11" class="node">
+<title>134047464790800</title>
+<polygon fill="aliceblue" stroke="skyblue" points="320,-38 258,-38 258,0 320,0 320,-38"/>
+<text text-anchor="middle" x="289" y="-22.8" font-family="Sans-Serif" font-size="14.00">t2</text>
+<text text-anchor="middle" x="289" y="-7.8" font-family="Sans-Serif" font-size="14.00">(1, 1, 6)</text>
+</g>
+<!-- 134047464790464 -->
+<g id="node12" class="node">
+<title>134047464790464</title>
+<polygon fill="honeydew" stroke="darkseagreen" points="283,-184 211,-184 211,-146 283,-146 283,-184"/>
+<text text-anchor="middle" x="247" y="-168.8" font-family="Sans-Serif" font-size="14.00">vector_3</text>
+<text text-anchor="middle" x="247" y="-153.8" font-family="Sans-Serif" font-size="14.00">[0, 1]</text>
+</g>
+<!-- 134047464790464&#45;&gt;134047464791328 -->
+<g id="edge11" class="edge">
+<title>134047464790464&#45;&gt;134047464791328</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M257.82,-145.72C263.01,-136.94 269.29,-126.31 274.86,-116.91"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="277.28,-118.31 278.94,-110 272.46,-115.46 277.28,-118.31"/>
+</g>
+<!-- 134047464791328&#45;&gt;134047464790800 -->
+<g id="edge9" class="edge">
+<title>134047464791328&#45;&gt;134047464790800</title>
+<path fill="none" stroke="#666666" stroke-width="1.2" d="M289,-73.81C289,-65.52 289,-55.39 289,-46.16"/>
+<polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="291.8,-46.02 289,-38.02 286.2,-46.02 291.8,-46.02"/>
 </g>
 </g>
 </svg>
diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.py b/front/py/examples/2_ir/5_reduce_sum_keepdim.py
index e6f1c5e4..902904c7 100644
--- a/front/py/examples/2_ir/5_reduce_sum_keepdim.py
+++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.py
@@ -29,7 +29,7 @@
 print(s)
 p=prod(t,dim=[1],out="p",keepdim=True)
 p.set_format("%.0f")
-# print(p)
+print(p)
 
 t1=ones(4,5,6,name="t1")
 t1.set_format("%.0f")
diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot b/front/py/examples/2_ir/5_reduce_sumprod.dot
index da812dbb..5dfc772d 100644
--- a/front/py/examples/2_ir/5_reduce_sumprod.dot
+++ b/front/py/examples/2_ir/5_reduce_sumprod.dot
@@ -2,37 +2,37 @@
 digraph {
 	rankdir=TB
 	node [shape=record]
-	133977343199552 [label="t
+	124690717936416 [label="t
 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975085212112 [label="s
+	124689026191840 [label="s
 (4,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083685904 [label="vector_1
+	124688742603248 [label="vector_1
 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083685520 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	133975083685760 [label="p
+	124688742602864 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	124688742603104 [label="p
 (3, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083686048 [label="vector_2
+	124688742603392 [label="vector_2
 [1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083686096 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	133977343199120 [label="t1
+	124688742603440 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	124690717935984 [label="t1
 (4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083686528 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	133975083686288 [label="var_1
+	124688742603872 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	124688742603632 [label="var_1
 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083686240 [label="t2
+	124688742603584 [label="t2
 (6,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083686768 [label="vector_3
+	124688742604112 [label="vector_3
 [0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	133975083686576 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	133975083685520 -> 133975085212112 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133977343199552 -> 133975083685520 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133975083685904 -> 133975083685520 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133975083686096 -> 133975083685760 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133977343199552 -> 133975083686096 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133975083686048 -> 133975083686096 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133975083686528 -> 133977343199120 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133975083686288 -> 133975083686528 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133975083686576 -> 133975083686240 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133977343199120 -> 133975083686576 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	133975083686768 -> 133975083686576 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742603920 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	124688742602864 -> 124689026191840 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124690717936416 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742603248 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742603440 -> 124688742603104 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124690717936416 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742603392 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742603872 -> 124690717935984 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742603632 -> 124688742603872 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742603920 -> 124688742603584 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124690717935984 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	124688742604112 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2]
 }
diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg
index 2b180653..dc32bc57 100644
--- a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg
+++ b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg
@@ -9,156 +9,156 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 334)">
 <title>%3</title>
 <polygon fill="white" stroke="transparent" points="-4,4 -4,-334 371.5,-334 371.5,4 -4,4"/>
-<!-- 133977343199552 -->
+<!-- 124690717936416 -->
 <g id="node1" class="node">
-<title>133977343199552</title>
+<title>124690717936416</title>
 <polygon fill="aliceblue" stroke="skyblue" points="152,-330 90,-330 90,-292 152,-292 152,-330"/>
 <text text-anchor="middle" x="121" y="-314.8" font-family="Sans-Serif" font-size="14.00">t</text>
 <text text-anchor="middle" x="121" y="-299.8" font-family="Sans-Serif" font-size="14.00">(3, 4, 5)</text>
 </g>
-<!-- 133975083685520 -->
+<!-- 124688742602864 -->
 <g id="node4" class="node">
-<title>133975083685520</title>
+<title>124688742602864</title>
 <polygon fill="lightgray" stroke="darkslategray" points="109,-256 55,-256 55,-220 109,-220 109,-256"/>
 <text text-anchor="middle" x="82" y="-234.3" font-family="Courier Bold" font-size="14.00">sum</text>
 </g>
-<!-- 133977343199552&#45;&gt;133975083685520 -->
+<!-- 124690717936416&#45;&gt;124688742602864 -->
 <g id="edge2" class="edge">
-<title>133977343199552&#45;&gt;133975083685520</title>
+<title>124690717936416&#45;&gt;124688742602864</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M110.96,-291.72C106.18,-283.03 100.41,-272.52 95.28,-263.19"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="97.65,-261.67 91.34,-256 92.74,-264.36 97.65,-261.67"/>
 </g>
-<!-- 133975083686096 -->
+<!-- 124688742603440 -->
 <g id="node7" class="node">
-<title>133975083686096</title>
+<title>124688742603440</title>
 <polygon fill="lightgray" stroke="darkslategray" points="187,-256 133,-256 133,-220 187,-220 187,-256"/>
 <text text-anchor="middle" x="160" y="-234.3" font-family="Courier Bold" font-size="14.00">prod</text>
 </g>
-<!-- 133977343199552&#45;&gt;133975083686096 -->
+<!-- 124690717936416&#45;&gt;124688742603440 -->
 <g id="edge5" class="edge">
-<title>133977343199552&#45;&gt;133975083686096</title>
+<title>124690717936416&#45;&gt;124688742603440</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M131.04,-291.72C135.82,-283.03 141.59,-272.52 146.72,-263.19"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="149.26,-264.36 150.66,-256 144.35,-261.67 149.26,-264.36"/>
 </g>
-<!-- 133975085212112 -->
+<!-- 124689026191840 -->
 <g id="node2" class="node">
-<title>133975085212112</title>
+<title>124689026191840</title>
 <polygon fill="aliceblue" stroke="skyblue" points="109,-184 55,-184 55,-146 109,-146 109,-184"/>
 <text text-anchor="middle" x="82" y="-168.8" font-family="Sans-Serif" font-size="14.00">s</text>
 <text text-anchor="middle" x="82" y="-153.8" font-family="Sans-Serif" font-size="14.00">(4,)</text>
 </g>
-<!-- 133975083685904 -->
+<!-- 124688742603248 -->
 <g id="node3" class="node">
-<title>133975083685904</title>
+<title>124688742603248</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="72,-330 0,-330 0,-292 72,-292 72,-330"/>
 <text text-anchor="middle" x="36" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_1</text>
 <text text-anchor="middle" x="36" y="-299.8" font-family="Sans-Serif" font-size="14.00">[0, 2]</text>
 </g>
-<!-- 133975083685904&#45;&gt;133975083685520 -->
+<!-- 124688742603248&#45;&gt;124688742602864 -->
 <g id="edge3" class="edge">
-<title>133975083685904&#45;&gt;133975083685520</title>
+<title>124688742603248&#45;&gt;124688742602864</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M47.85,-291.72C53.53,-282.94 60.42,-272.31 66.51,-262.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="68.98,-264.24 70.98,-256 64.28,-261.19 68.98,-264.24"/>
 </g>
-<!-- 133975083685520&#45;&gt;133975085212112 -->
+<!-- 124688742602864&#45;&gt;124689026191840 -->
 <g id="edge1" class="edge">
-<title>133975083685520&#45;&gt;133975085212112</title>
+<title>124688742602864&#45;&gt;124689026191840</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M82,-219.81C82,-211.52 82,-201.39 82,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="84.8,-192.02 82,-184.02 79.2,-192.02 84.8,-192.02"/>
 </g>
-<!-- 133975083685760 -->
+<!-- 124688742603104 -->
 <g id="node5" class="node">
-<title>133975083685760</title>
+<title>124688742603104</title>
 <polygon fill="aliceblue" stroke="skyblue" points="187,-184 133,-184 133,-146 187,-146 187,-184"/>
 <text text-anchor="middle" x="160" y="-168.8" font-family="Sans-Serif" font-size="14.00">p</text>
 <text text-anchor="middle" x="160" y="-153.8" font-family="Sans-Serif" font-size="14.00">(3, 5)</text>
 </g>
-<!-- 133975083686048 -->
+<!-- 124688742603392 -->
 <g id="node6" class="node">
-<title>133975083686048</title>
+<title>124688742603392</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="242,-330 170,-330 170,-292 242,-292 242,-330"/>
 <text text-anchor="middle" x="206" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_2</text>
 <text text-anchor="middle" x="206" y="-299.8" font-family="Sans-Serif" font-size="14.00">[1]</text>
 </g>
-<!-- 133975083686048&#45;&gt;133975083686096 -->
+<!-- 124688742603392&#45;&gt;124688742603440 -->
 <g id="edge6" class="edge">
-<title>133975083686048&#45;&gt;133975083686096</title>
+<title>124688742603392&#45;&gt;124688742603440</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M194.15,-291.72C188.47,-282.94 181.58,-272.31 175.49,-262.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="177.72,-261.19 171.02,-256 173.02,-264.24 177.72,-261.19"/>
 </g>
-<!-- 133975083686096&#45;&gt;133975083685760 -->
+<!-- 124688742603440&#45;&gt;124688742603104 -->
 <g id="edge4" class="edge">
-<title>133975083686096&#45;&gt;133975083685760</title>
+<title>124688742603440&#45;&gt;124688742603104</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M160,-219.81C160,-211.52 160,-201.39 160,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="162.8,-192.02 160,-184.02 157.2,-192.02 162.8,-192.02"/>
 </g>
-<!-- 133977343199120 -->
+<!-- 124690717935984 -->
 <g id="node8" class="node">
-<title>133977343199120</title>
+<title>124690717935984</title>
 <polygon fill="aliceblue" stroke="skyblue" points="357,-184 295,-184 295,-146 357,-146 357,-184"/>
 <text text-anchor="middle" x="326" y="-168.8" font-family="Sans-Serif" font-size="14.00">t1</text>
 <text text-anchor="middle" x="326" y="-153.8" font-family="Sans-Serif" font-size="14.00">(4, 5, 6)</text>
 </g>
-<!-- 133975083686576 -->
+<!-- 124688742603920 -->
 <g id="node13" class="node">
-<title>133975083686576</title>
+<title>124688742603920</title>
 <polygon fill="lightgray" stroke="darkslategray" points="310,-110 256,-110 256,-74 310,-74 310,-110"/>
 <text text-anchor="middle" x="283" y="-88.3" font-family="Courier Bold" font-size="14.00">sum</text>
 </g>
-<!-- 133977343199120&#45;&gt;133975083686576 -->
+<!-- 124690717935984&#45;&gt;124688742603920 -->
 <g id="edge10" class="edge">
-<title>133977343199120&#45;&gt;133975083686576</title>
+<title>124690717935984&#45;&gt;124688742603920</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M314.93,-145.72C309.61,-136.94 303.18,-126.31 297.48,-116.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="299.84,-115.4 293.3,-110 295.05,-118.3 299.84,-115.4"/>
 </g>
-<!-- 133975083686528 -->
+<!-- 124688742603872 -->
 <g id="node9" class="node">
-<title>133975083686528</title>
+<title>124688742603872</title>
 <polygon fill="lightgray" stroke="darkslategray" points="367.5,-256 284.5,-256 284.5,-220 367.5,-220 367.5,-256"/>
 <text text-anchor="middle" x="326" y="-234.3" font-family="Courier Bold" font-size="14.00">constant</text>
 </g>
-<!-- 133975083686528&#45;&gt;133977343199120 -->
+<!-- 124688742603872&#45;&gt;124690717935984 -->
 <g id="edge7" class="edge">
-<title>133975083686528&#45;&gt;133977343199120</title>
+<title>124688742603872&#45;&gt;124690717935984</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M326,-219.81C326,-211.52 326,-201.39 326,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="328.8,-192.02 326,-184.02 323.2,-192.02 328.8,-192.02"/>
 </g>
-<!-- 133975083686288 -->
+<!-- 124688742603632 -->
 <g id="node10" class="node">
-<title>133975083686288</title>
+<title>124688742603632</title>
 <polygon fill="moccasin" stroke="orange" points="353,-330 299,-330 299,-292 353,-292 353,-330"/>
 <text text-anchor="middle" x="326" y="-314.8" font-family="Sans-Serif" font-size="14.00">var_1</text>
 <text text-anchor="middle" x="326" y="-299.8" font-family="Sans-Serif" font-size="14.00">1</text>
 </g>
-<!-- 133975083686288&#45;&gt;133975083686528 -->
+<!-- 124688742603632&#45;&gt;124688742603872 -->
 <g id="edge8" class="edge">
-<title>133975083686288&#45;&gt;133975083686528</title>
+<title>124688742603632&#45;&gt;124688742603872</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M326,-291.72C326,-283.29 326,-273.15 326,-264.02"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="328.8,-264 326,-256 323.2,-264 328.8,-264"/>
 </g>
-<!-- 133975083686240 -->
+<!-- 124688742603584 -->
 <g id="node11" class="node">
-<title>133975083686240</title>
+<title>124688742603584</title>
 <polygon fill="aliceblue" stroke="skyblue" points="310,-38 256,-38 256,0 310,0 310,-38"/>
 <text text-anchor="middle" x="283" y="-22.8" font-family="Sans-Serif" font-size="14.00">t2</text>
 <text text-anchor="middle" x="283" y="-7.8" font-family="Sans-Serif" font-size="14.00">(6,)</text>
 </g>
-<!-- 133975083686768 -->
+<!-- 124688742604112 -->
 <g id="node12" class="node">
-<title>133975083686768</title>
+<title>124688742604112</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="277,-184 205,-184 205,-146 277,-146 277,-184"/>
 <text text-anchor="middle" x="241" y="-168.8" font-family="Sans-Serif" font-size="14.00">vector_3</text>
 <text text-anchor="middle" x="241" y="-153.8" font-family="Sans-Serif" font-size="14.00">[0, 1]</text>
 </g>
-<!-- 133975083686768&#45;&gt;133975083686576 -->
+<!-- 124688742604112&#45;&gt;124688742603920 -->
 <g id="edge11" class="edge">
-<title>133975083686768&#45;&gt;133975083686576</title>
+<title>124688742604112&#45;&gt;124688742603920</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M251.82,-145.72C257.01,-136.94 263.29,-126.31 268.86,-116.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="271.28,-118.31 272.94,-110 266.46,-115.46 271.28,-118.31"/>
 </g>
-<!-- 133975083686576&#45;&gt;133975083686240 -->
+<!-- 124688742603920&#45;&gt;124688742603584 -->
 <g id="edge9" class="edge">
-<title>133975083686576&#45;&gt;133975083686240</title>
+<title>124688742603920&#45;&gt;124688742603584</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M283,-73.81C283,-65.52 283,-55.39 283,-46.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="285.8,-46.02 283,-38.02 280.2,-46.02 285.8,-46.02"/>
 </g>

From 0f5d5d1e73bdeac6bcda1f0c38ec57762d024042 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sat, 12 Apr 2025 15:42:03 +0800
Subject: [PATCH 2/6] =?UTF-8?q?cuda&changeshape:=E6=A0=BC=E5=BC=8F?=
 =?UTF-8?q?=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../deepx/tensorfunc/changeshape_miaobyte.cu  |  16 +--
 .../deepx/tensorfunc/changeshape_miaobyte.cuh |   2 +-
 .../deepx/tensorfunc/changeshape_miaobyte.hpp |   4 +-
 .../src/deepx/tensorfunc/cuda_math.hpp        | 119 ++++++++++++++++++
 .../tensorfunc/elementwise_miaobyte_basic.cu  |   6 +-
 .../src/deepx/tensorfunc/tensor_cuda.cuh      |  20 ++-
 6 files changed, 145 insertions(+), 22 deletions(-)
 create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp

diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
index ac395c3f..bc46a818 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
@@ -41,21 +41,7 @@ namespace deepx::tensorfunc
         }
     }
 
-    inline int nextPowerOf2(int n)
-    {
-        if (n <= 0)
-            return 1;
-        if ((n & (n - 1)) == 0)
-            return n; // 如果n已经是2的幂
-
-        n--;
-        n |= n >> 1;
-        n |= n >> 2;
-        n |= n >> 4;
-        n |= n >> 8;
-        n |= n >> 16;
-        return n + 1;
-    }
+   
 
     template <typename T>
     void launch_transpose(const int numBlocks, const int blockSize,
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
index f9f47f7b..1039641a 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
@@ -134,4 +134,4 @@ namespace deepx::tensorfunc
                                     const BroadcastMap *broadcastMap,
                                     int8_t *output, const int *outputStrides,const int outputDim,const int outputlen);
 }
-#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP
\ No newline at end of file
+#endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
index 6f76fa23..153214b2 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
@@ -3,13 +3,15 @@
 
 #include <vector>
 #include <stdexcept>
+
 #include "deepx/tensor.hpp"
 #include "deepx/tensorfunc/changeshape.hpp"
-#include "deepx/tensorfunc/authors.hpp"
 #include "deepx/tensorfunc/changeshape_miaobyte.cuh"
 #include "deepx/tensorfunc/cuda.hpp"
+#include "deepx/tensorfunc/authors.hpp"
 #include "deepx/shape_changeshape.hpp"
 #include "stdutil/error.hpp"
+
 namespace deepx::tensorfunc
 {
     template <typename T>
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp
new file mode 100644
index 00000000..f0a3c578
--- /dev/null
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp
@@ -0,0 +1,119 @@
+#ifndef DEEPX_TENSORFUNC_CUDA_MATH_HPP
+#define DEEPX_TENSORFUNC_CUDA_MATH_HPP
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cublas_v2.h>
+
+namespace deepx::tensorfunc
+{
+
+    // max
+    template <typename T>
+    __device__ void deepx_max(const T *a, const T *b, T *out);
+
+    template <>
+    __device__ void deepx_max<double>(const double *a, const double *b, double *out)
+    {
+        *out = fmax(*a, *b);
+    }
+    
+    template <>
+    __device__ void deepx_max<float>(const float *a, const float *b, float *out)
+    {
+        *out = fmaxf(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_max<half>(const half *a, const half *b, half *out)
+    {
+        *out = __hmax(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_max<nv_bfloat16>(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out)
+    {
+        *out = __hmax(*a, *b);
+    }
+    template <>
+    __device__ void deepx_max<int64_t>(const int64_t *a, const int64_t *b, int64_t *out)
+    {
+        *out = *a>*b?*a:*b;
+    }
+    template <>
+    __device__ void deepx_max<int32_t>(const int32_t *a, const int32_t *b, int32_t *out)
+    {
+        *out = *a>*b?*a:*b;
+    }
+    template <>
+    __device__ void deepx_max<int16_t>(const int16_t *a, const int16_t *b, int16_t *out)
+    {
+        *out = *a>*b?*a:*b;
+    }
+    template <>
+    __device__ void deepx_max<int8_t>(const int8_t *a, const int8_t *b, int8_t *out)
+    {
+        *out = *a>*b?*a:*b;
+    }
+    
+ 
+    // min
+    template <typename T>
+    __device__ void deepx_min(const T *a, const T *b, T *out);
+ 
+
+    template <>
+    __device__ void deepx_min<double>(const double *a, const double *b, double *out)
+    {
+        *out = fmin(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_min<float>(const float *a, const float *b, float *out)
+    {
+        *out = fminf(*a, *b);
+    }
+
+
+    template <>
+    __device__ void deepx_min<half>(const half *a, const half *b, half *out)
+    {
+        *out = __hmin(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_min<nv_bfloat16>(const nv_bfloat16 *a,const nv_bfloat16 *b, nv_bfloat16 *out)
+    {
+        *out = __hmin(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_min<int64_t>(const int64_t *a, const int64_t *b, int64_t *out)
+    {
+        *out = *a<*b?*a:*b;
+    }
+
+    template <>
+    __device__ void deepx_min<int32_t>(const int32_t *a, const int32_t *b, int32_t *out)
+    {
+        *out = *a<*b?*a:*b;
+    }
+
+    template <>
+    __device__ void deepx_min<int16_t>(const int16_t *a, const int16_t *b, int16_t *out)
+    {
+        *out = *a<*b?*a:*b;
+    }
+
+    template <>
+    __device__ void deepx_min<int8_t>(const int8_t *a, const int8_t *b, int8_t *out)
+    {
+        *out =  *a<*b?*a:*b;
+    }   
+
+ 
+
+}
+
+#endif
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
index a5371005..3f54e08e 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -1,5 +1,5 @@
-#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU
-#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU
+#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU
+#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU
 
 #include "deepx/tensorfunc/cuda.hpp"
 #include "deepx/tensorfunc/authors.hpp"
@@ -281,4 +281,4 @@ namespace deepx::tensorfunc
  
 }
 
-#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CU
+#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh
index a042d6d1..b60ab3f8 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh
@@ -1,11 +1,12 @@
 #ifndef DEEPX_TENSORFUNC_TENSOR_CUDA_CUH
 #define DEEPX_TENSORFUNC_TENSOR_CUDA_CUH
 
+#include <cuda_runtime.h>
 #include "deepx/tensor.hpp"
 
 namespace deepx::tensorfunc
 {
-    __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id)
+    inline __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id)
     {
         int linearIndex = id;
         for (int i = 0; i < dim; i++)
@@ -15,7 +16,7 @@ namespace deepx::tensorfunc
         }
     }
 
-    __host__ __device__ int linearAt(const int *strides, const int dim, int *indices)
+    inline __host__ __device__ int linearAt(const int *strides, const int dim, int *indices)
     {
         int idx = 0;
         for (int i = 0; i < dim; i++)
@@ -34,6 +35,21 @@ namespace deepx::tensorfunc
         }
     }
    
+    inline int nextPowerOf2(int n)
+    {
+        if (n <= 0)
+            return 1;
+        if ((n & (n - 1)) == 0)
+            return n; // 如果n已经是2的幂
+
+        n--;
+        n |= n >> 1;
+        n |= n >> 2;
+        n |= n >> 4;
+        n |= n >> 8;
+        n |= n >> 16;
+        return n + 1;
+    }
 }
 
 #endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH

From 3207ccc7174179799cf5d5fe39c419cc8ea0564d Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sat, 12 Apr 2025 16:43:39 +0800
Subject: [PATCH 3/6] =?UTF-8?q?ompsimd&reduce:=E5=8F=82=E6=95=B0=E9=A1=BA?=
 =?UTF-8?q?=E5=BA=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/deepx/tensorfunc/reduce_miaobyte.hpp  | 13 ++---
 .../op-mem-ompsimd/src/deepx/tf/reduce.hpp    | 48 +++++++++----------
 .../test/tensorfunc/5_tensor_sum.cpp          |  4 +-
 3 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
index 424cacd6..aaf3f86e 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/reduce_miaobyte.hpp
@@ -2,15 +2,12 @@
 #define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP
 
 #include <vector>
-#include <algorithm>
 #include <stdexcept>
 #include <hwy/highway.h>
 
-#include <deepx/vector_combination.hpp>
-#include "deepx/tensor.hpp"
-
 #include "deepx/tensorfunc/highway.hpp"
 #include "deepx/shape_reduce.hpp"
+#include "deepx/tensor.hpp"
 #include "deepx/tensorfunc/reduce.hpp"
 #include "deepx/tensorfunc/init_miaobyte.hpp"
 
@@ -22,7 +19,7 @@ namespace deepx::tensorfunc
     template <typename T>
     struct sumDispatcher<miaobyte, T>
     {
-        static void sum(const Tensor<T> &tensor, const std::vector<int> &dims, Tensor<T> &result, const bool keepdims)
+        static void sum(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
         {
             constant<miaobyte, T>(result, T(0));
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
@@ -98,7 +95,7 @@ namespace deepx::tensorfunc
     template <typename T>
     struct prodDispatcher<miaobyte, T>
     {
-        static void prod(const Tensor<T> &tensor, const std::vector<int> &dims, Tensor<T> &result, const bool keepdims)
+        static void prod(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
         {
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
@@ -181,7 +178,7 @@ namespace deepx::tensorfunc
     template <typename T>
     struct reducemaxDispatcher<miaobyte, T>
     {
-        static void reducemax(const Tensor<T> &tensor, const std::vector<int> &dims, Tensor<T> &result, const bool keepdims)
+        static void reducemax(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
         {
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
@@ -262,7 +259,7 @@ namespace deepx::tensorfunc
     template <typename T>
     struct reduceminDispatcher<miaobyte, T>
     {
-        static void reducemin(const Tensor<T> &tensor, const std::vector<int> &dims, Tensor<T> &result, const bool keepdims)
+        static void reducemin(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
         {
             std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
             std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
index 5e5a873b..4e2bd1b9 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/reduce.hpp
@@ -43,22 +43,22 @@ namespace deepx::tf
             switch (input_type)
             {
             case Precision::Float64:
-                sum<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, *mem->gettensor<double>(this->returns[0].textvalue), keepdims);
+                sum<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
                 break;
             case Precision::Float32:
-                sum<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, *mem->gettensor<float>(this->returns[0].textvalue), keepdims);
+                sum<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims,  keepdims,*mem->gettensor<float>(this->returns[0].textvalue));
                 break;
             case Precision::Int64:
-                sum<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, *mem->gettensor<int64_t>(this->returns[0].textvalue), keepdims);
+                sum<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims,  keepdims,*mem->gettensor<int64_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int32:
-                sum<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, *mem->gettensor<int32_t>(this->returns[0].textvalue), keepdims);
+                sum<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims,  keepdims,*mem->gettensor<int32_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int16:
-                sum<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, *mem->gettensor<int16_t>(this->returns[0].textvalue), keepdims);
+                sum<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims,  keepdims,*mem->gettensor<int16_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int8:
-                sum<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, *mem->gettensor<int8_t>(this->returns[0].textvalue), keepdims);
+                sum<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims,  keepdims,*mem->gettensor<int8_t>(this->returns[0].textvalue));
                 break;
             default:
                 error = "Unsupported type: " + precision_str(input_type);
@@ -101,22 +101,22 @@ namespace deepx::tf
             switch (input_type)
             {
             case Precision::Float64:
-                prod<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, *mem->gettensor<double>(this->returns[0].textvalue), keepdims);
+                prod<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
                 break;
             case Precision::Float32:
-                prod<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, *mem->gettensor<float>(this->returns[0].textvalue), keepdims);
+                prod<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<float>(this->returns[0].textvalue));
                 break;
             case Precision::Int64:
-                prod<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, *mem->gettensor<int64_t>(this->returns[0].textvalue), keepdims);
+                prod<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int64_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int32:
-                prod<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, *mem->gettensor<int32_t>(this->returns[0].textvalue), keepdims);
+                prod<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int32_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int16:
-                prod<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, *mem->gettensor<int16_t>(this->returns[0].textvalue), keepdims);
+                prod<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int16_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int8:
-                prod<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, *mem->gettensor<int8_t>(this->returns[0].textvalue), keepdims);
+                prod<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int8_t>(this->returns[0].textvalue));
                 break;
             default:
                 error = "Unsupported type: " + precision_str(input_type);
@@ -159,22 +159,22 @@ namespace deepx::tf
             switch (input_type)
             {
             case Precision::Float64:
-                reducemax<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, *mem->gettensor<double>(this->returns[0].textvalue), keepdims);
+                reducemax<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
                 break;
             case Precision::Float32:
-                reducemax<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, *mem->gettensor<float>(this->returns[0].textvalue), keepdims);
+                reducemax<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<float>(this->returns[0].textvalue));
                 break;
             case Precision::Int64:
-                reducemax<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, *mem->gettensor<int64_t>(this->returns[0].textvalue), keepdims);
+                reducemax<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int64_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int32:
-                reducemax<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, *mem->gettensor<int32_t>(this->returns[0].textvalue), keepdims);
+                reducemax<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int32_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int16:
-                reducemax<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, *mem->gettensor<int16_t>(this->returns[0].textvalue), keepdims);
+                reducemax<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int16_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int8:
-                reducemax<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, *mem->gettensor<int8_t>(this->returns[0].textvalue), keepdims);
+                reducemax<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int8_t>(this->returns[0].textvalue));
                 break;
             default:
                 error = "Unsupported type: " + precision_str(input_type);
@@ -217,22 +217,22 @@ namespace deepx::tf
             switch (input_type)
             {
             case Precision::Float64:
-                reducemin<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, *mem->gettensor<double>(this->returns[0].textvalue), keepdims);
+                reducemin<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
                 break;
             case Precision::Float32:
-                reducemin<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, *mem->gettensor<float>(this->returns[0].textvalue), keepdims);
+                reducemin<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<float>(this->returns[0].textvalue));
                 break;
             case Precision::Int64:
-                reducemin<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, *mem->gettensor<int64_t>(this->returns[0].textvalue), keepdims);
+                reducemin<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int64_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int32:
-                reducemin<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, *mem->gettensor<int32_t>(this->returns[0].textvalue), keepdims);
+                reducemin<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int32_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int16:
-                reducemin<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, *mem->gettensor<int16_t>(this->returns[0].textvalue), keepdims);
+                reducemin<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int16_t>(this->returns[0].textvalue));
                 break;
             case Precision::Int8:
-                reducemin<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, *mem->gettensor<int8_t>(this->returns[0].textvalue), keepdims);
+                reducemin<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int8_t>(this->returns[0].textvalue));
                 break;
             default:
                 error = "Unsupported type: " + precision_str(input_type);
diff --git a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp
index c68e2b8e..79530b6b 100644
--- a/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp
+++ b/excuter/op-mem-ompsimd/test/tensorfunc/5_tensor_sum.cpp
@@ -35,7 +35,7 @@ void test_sum()
         std::vector<int> checkeddims=checkedDims(shape,comb);
         std::vector<int> sumshape=reducedShape(shape,checkeddims);
         Tensor<float> r = New<float>(sumshape);
-        sum<miaobyte,float>(tensor, checkeddims,r);
+        sum<miaobyte,float>(tensor, checkeddims,false,r);
         print<miaobyte>(r,"%.0f");
     }
 /*
@@ -63,7 +63,7 @@ void benchmark_sum(int i){
         std::vector<int> checkeddims=checkedDims(shape,comb);
         std::vector<int> sumshape=reducedShape(shape,checkeddims);
         Tensor<float> r=New<float>(sumshape);
-        sum<miaobyte,float>(tensor, checkeddims,r);
+        sum<miaobyte,float>(tensor, checkeddims,false,r);
         string combstr="";
         for (const auto &c : comb)
         {

From 055444142b6fa897f7188f028ffba643fb54fcf0 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sat, 12 Apr 2025 16:43:54 +0800
Subject: [PATCH 4/6] =?UTF-8?q?ompsimd&reduce:=E5=8F=82=E6=95=B0=E9=A1=BA?=
 =?UTF-8?q?=E5=BA=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/deepx/tensorfunc/reduce.hpp           | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp
index f1570693..a94c1908 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/reduce.hpp
@@ -12,46 +12,46 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct reducemaxDispatcher
     {
-        static void reducemax(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false) = delete;
+        static void reducemax(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B) = delete;
     };
     template <typename Author, typename T>
-    void reducemax(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false)
+    void reducemax(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B)
     {
-        reducemaxDispatcher<Author, T>::reducemax(A, dims, B, keepdims);
+        reducemaxDispatcher<Author, T>::reducemax(A, dims, keepdims, B);
     }
     
     template <typename Author, typename T>
     struct reduceminDispatcher
     {
-        static void reducemin(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false) = delete;
+        static void reducemin(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B) = delete;
     };
     template <typename Author, typename T>
-    void reducemin(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false)
+    void reducemin(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B)
     {
-        reduceminDispatcher<Author, T>::reducemin(A, dims, B, keepdims);
+        reduceminDispatcher<Author, T>::reducemin(A, dims, keepdims, B);
     }
     
     template <typename Author, typename T>
     struct  sumDispatcher
     {
-        static void reducesum(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false) = delete;
+        static void  sum(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B) = delete;
     };
     template <typename Author, typename T>
-    void sum(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false)
+    void sum(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B)
     {
-        sumDispatcher<Author, T>::sum(A, dims, B, keepdims);
+        sumDispatcher<Author, T>::sum(A, dims, keepdims, B);
     }
     
     template <typename Author, typename T>
     struct  prodDispatcher
     {
-        static void prod(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false) = delete;
+        static void prod(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B) = delete;
     };
     
     template <typename Author, typename T>
-    void prod(const Tensor<T> &A, const std::vector<int> &dims,Tensor<T> &B,const bool keepdims=false)
+    void prod(const Tensor<T> &A, const std::vector<int> &dims,const bool keepdims,Tensor<T> &B)
     {
-        prodDispatcher<Author, T>::prod(A, dims, B, keepdims);
+        prodDispatcher<Author, T>::prod(A, dims, keepdims, B);
     }
 }
 #endif // DEEPX_TENSORFUNC_REDUCE_HPP

From 7181d1cead88b61ae320cae471220a8c64fd43df Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sat, 12 Apr 2025 16:44:19 +0800
Subject: [PATCH 5/6] =?UTF-8?q?cuda&reduce:=E7=BC=96=E8=AF=91=E5=AE=8C?=
 =?UTF-8?q?=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 excuter/op-mem-cuda/src/client/tfs.cpp        |  65 ++-
 .../deepx/tensorfunc/changeshape_miaobyte.cu  |  21 +-
 .../deepx/tensorfunc/changeshape_miaobyte.cuh | 151 ++++---
 .../deepx/tensorfunc/changeshape_miaobyte.hpp |   6 +-
 .../src/deepx/tensorfunc/reduce_miaobyte.cu   | 404 ++++++++++++++++++
 .../src/deepx/tensorfunc/reduce_miaobyte.cuh  |  58 +++
 .../src/deepx/tensorfunc/reduce_miaobyte.hpp  |  81 ++--
 excuter/op-mem-cuda/src/deepx/tf/reduce.hpp   | 252 +++++++++++
 8 files changed, 895 insertions(+), 143 deletions(-)
 create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu
 create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh
 create mode 100644 excuter/op-mem-cuda/src/deepx/tf/reduce.hpp

diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index 44560d7f..2ef18928 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -9,6 +9,7 @@
 #include "deepx/tf/elementwise_compare.hpp"
 #include "deepx/tf/matmul.hpp"
 #include "deepx/tf/changeshape.hpp"
+#include "deepx/tf/reduce.hpp"
 #include "deepx/dtype.hpp"
 #include "deepx/tf/tffactory.hpp"
 #include "deepx/tensorfunc/authors.hpp"
@@ -371,20 +372,56 @@ namespace deepx::tf
                     Param("B", DataCategory::Tensor, Precision::Any),
                 })));
     }
-    // // reduce
-    // void register_reduce(OpFactory &opfactory)
-    // {
-    //     opfactory.add_op(Max<float>());
-    //     opfactory.add_op(Max<double>());
-    //     opfactory.add_op(Maxscalar<float>());
-    //     opfactory.add_op(Maxscalar<double>());
-    //     opfactory.add_op(Min<float>());
-    //     opfactory.add_op(Min<double>());
-    //     opfactory.add_op(Minscalar<float>());
-    //     opfactory.add_op(Minscalar<double>());
-    //     opfactory.add_op(Sum<float>());
-    //     opfactory.add_op(Sum<double>());
-    // }
+   // reduce
+     void register_reduce(TfFactory &tffactory)
+    {   
+        // sum
+        tffactory.add_tf(std::make_shared<Sum<miaobyte>>(vector<Param>(
+            {
+                Param("A", DataCategory::Tensor, Precision::Any),
+                Param("dims", DataCategory::Vector, Precision::Int32),
+                Param("keepdims", DataCategory::Var, Precision::Bool),
+            }),
+            vector<Param>(
+                {
+                    Param("B", DataCategory::Tensor, Precision::Any),
+                })));   
+        // prod
+        tffactory.add_tf(std::make_shared<Prod<miaobyte>>(vector<Param>(
+            {
+                Param("A", DataCategory::Tensor, Precision::Any),
+                Param("dims", DataCategory::Vector, Precision::Int32),
+                Param("keepdims", DataCategory::Var, Precision::Bool),
+            }), 
+            vector<Param>(
+                {
+                    Param("B", DataCategory::Tensor, Precision::Any),
+                })));  
+
+        // max
+        tffactory.add_tf(std::make_shared<ReduceMax<miaobyte>>(vector<Param>(
+            {
+                Param("A", DataCategory::Tensor, Precision::Any),
+                Param("dims", DataCategory::Vector, Precision::Int32),
+                Param("keepdims", DataCategory::Var, Precision::Bool),
+            }),
+            vector<Param>(
+                {
+                    Param("B", DataCategory::Tensor, Precision::Any),
+                })));
+        // min
+        tffactory.add_tf(std::make_shared<ReduceMin<miaobyte>>(vector<Param>(
+            {
+                Param("A", DataCategory::Tensor, Precision::Any),
+                Param("dims", DataCategory::Vector, Precision::Int32),
+                Param("keepdims", DataCategory::Var, Precision::Bool),
+            }), 
+            vector<Param>(
+                {
+                    Param("B", DataCategory::Tensor, Precision::Any),
+                })));
+    }
+ 
     int register_all(TfFactory &tffactory)
     {
         register_lifecycle(tffactory);
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
index bc46a818..e5eb511d 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu
@@ -44,8 +44,7 @@ namespace deepx::tensorfunc
    
 
     template <typename T>
-    void launch_transpose(const int numBlocks, const int blockSize,
-                          const T *input,
+    void launch_transpose(const T *input,
                           const int *inputStrides,
                           T *output,
                           const int *outputStrides,
@@ -58,7 +57,7 @@ namespace deepx::tensorfunc
         cudaVector<int> dimOrder_d(dimOrder, dim);
 
         int powDim = nextPowerOf2(dim);
-
+        auto [numBlocks, blockSize] = BestDims(len);
         // 根据计算出的2的幂次选择对应的模板实例
         switch (powDim)
         {
@@ -91,14 +90,14 @@ namespace deepx::tensorfunc
         }
     }
 
-    template void launch_transpose<double>(const int numBlocks, const int blockSize, const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
-    template void launch_transpose<float>(const int numBlocks, const int blockSize, const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
-    template void launch_transpose<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
-    template void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
-    template void launch_transpose<int64_t>(const int numBlocks, const int blockSize, const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
-    template void launch_transpose<int32_t>(const int numBlocks, const int blockSize, const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
-    template void launch_transpose<int16_t>(const int numBlocks, const int blockSize, const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
-    template void launch_transpose<int8_t>(const int numBlocks, const int blockSize, const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<double>(const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<float>(const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<nv_bfloat16>(const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<__half>(const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<int64_t>(const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<int32_t>(const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<int16_t>(const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    template void launch_transpose<int8_t>(const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
     // concat
     template <int DIM, typename T>
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
index 1039641a..a1fcf9fa 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cuh
@@ -4,9 +4,7 @@
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 
-#include "deepx/shape_changeshape.hpp"
-#include "deepx/tensorfunc/cuda.hpp"
-#include "deepx/tensorfunc/authors.hpp"
+#include "deepx/shape_changeshape.hpp" //BroadcastMap类型
 
 namespace deepx::tensorfunc
 {
@@ -15,31 +13,31 @@ namespace deepx::tensorfunc
     __global__ void transpose_kernel(const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
     template <typename T>
-    void launch_transpose(const int numBlocks, const int blockSize, const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    void launch_transpose(  const T *input, const int *inputStrides, T *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<double>(const int numBlocks, const int blockSize, const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<double>( const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<float>(const int numBlocks, const int blockSize, const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<float>(  const float *input, const int *inputStrides, float *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<nv_bfloat16>(const nv_bfloat16 *input, const int *inputStrides, nv_bfloat16 *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<__half>(const int numBlocks, const int blockSize, const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<__half>(const __half *input, const int *inputStrides, __half *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<int64_t>(const int numBlocks, const int blockSize, const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<int64_t>(const int64_t *input, const int *inputStrides, int64_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<int32_t>(const int numBlocks, const int blockSize, const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<int32_t>(const int32_t *input, const int *inputStrides, int32_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<int16_t>(const int numBlocks, const int blockSize, const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<int16_t>(const int16_t *input, const int *inputStrides, int16_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
-    template <>
-    void launch_transpose<int8_t>(const int numBlocks, const int blockSize, const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
+    // template <>
+    // void launch_transpose<int8_t>(const int8_t *input, const int *inputStrides, int8_t *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
 
     template <typename DIM, typename T>
     __global__ void concat_kernel(const T **tensorsData,
@@ -55,29 +53,29 @@ namespace deepx::tensorfunc
     template <typename T>
     void launch_concat(const T **tensorsData, const int *inputStrides, T *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<double>(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<double>(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<float>(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<float>(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<nv_bfloat16>(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<nv_bfloat16>(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<__half>(const __half **tensorsData, const int *inputStrides, __half *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<int64_t>(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<int64_t>(const int64_t **tensorsData, const int *inputStrides, int64_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<int32_t>(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<int32_t>(const int32_t **tensorsData, const int *inputStrides, int32_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<int16_t>(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<int16_t>(const int16_t **tensorsData, const int *inputStrides, int16_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
-    template <>
-    void launch_concat<int8_t>(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
+    // template <>
+    // void launch_concat<int8_t>(const int8_t **tensorsData, const int *inputStrides, int8_t *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
 
 
     __host__ __device__ void fromBroadcastIndices(const BroadcastMap *broadcastMap, const int *broadcastIndices, const int broadcastIndicesDim, int *indices);
@@ -94,44 +92,45 @@ namespace deepx::tensorfunc
                             const BroadcastMap *broadcastMap,
                             T *output, const int *outputStrides,const int outputDim,const int outputlen);
 
-    template <>
-    void launch_broadcastTo<double>(const double *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    double *output, const int *outputStrides,const int outputDim,const int outputlen);
-
-    template <>
-    void launch_broadcastTo<float>(const float *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    float *output, const int *outputStrides,const int outputDim,const int outputlen);
-
-    template <>
-    void launch_broadcastTo<nv_bfloat16>(const nv_bfloat16 *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    nv_bfloat16 *output, const int *outputStrides,const int outputDim,const int outputlen);
-
-    template <>
-    void launch_broadcastTo<__half>(const __half *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    __half *output, const int *outputStrides,const int outputDim,const int outputlen);
-
-    template <>
-    void launch_broadcastTo<int64_t>(const int64_t *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    int64_t *output, const int *outputStrides,const int outputDim,const int outputlen);
-
-    template <>
-    void launch_broadcastTo<int32_t>(const int32_t *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    int32_t *output, const int *outputStrides,const int outputDim,const int outputlen);
-
-    template <>
-    void launch_broadcastTo<int16_t>(const int16_t *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    int16_t *output, const int *outputStrides,const int outputDim,const int outputlen);
-
-    template <>
-    void launch_broadcastTo<int8_t>(const int8_t *input, const int *inputStrides,const int inputDim,
-                                    const BroadcastMap *broadcastMap,
-                                    int8_t *output, const int *outputStrides,const int outputDim,const int outputlen);
-}
+//     template <>
+//     void launch_broadcastTo<double>(const double *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     double *output, const int *outputStrides,const int outputDim,const int outputlen);
+
+//     template <>
+//     void launch_broadcastTo<float>(const float *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     float *output, const int *outputStrides,const int outputDim,const int outputlen);
+
+//     template <>
+//     void launch_broadcastTo<nv_bfloat16>(const nv_bfloat16 *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     nv_bfloat16 *output, const int *outputStrides,const int outputDim,const int outputlen);
+
+//     template <>
+//     void launch_broadcastTo<__half>(const __half *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     __half *output, const int *outputStrides,const int outputDim,const int outputlen);
+
+//     template <>
+//     void launch_broadcastTo<int64_t>(const int64_t *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     int64_t *output, const int *outputStrides,const int outputDim,const int outputlen);
+
+//     template <>
+//     void launch_broadcastTo<int32_t>(const int32_t *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     int32_t *output, const int *outputStrides,const int outputDim,const int outputlen);
+
+//     template <>
+//     void launch_broadcastTo<int16_t>(const int16_t *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     int16_t *output, const int *outputStrides,const int outputDim,const int outputlen);
+
+//     template <>
+//     void launch_broadcastTo<int8_t>(const int8_t *input, const int *inputStrides,const int inputDim,
+//                                     const BroadcastMap *broadcastMap,
+//                                     int8_t *output, const int *outputStrides,const int outputDim,const int outputlen);
+// }
+};
 #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_CUH
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
index 153214b2..d127e6a1 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.hpp
@@ -7,7 +7,6 @@
 #include "deepx/tensor.hpp"
 #include "deepx/tensorfunc/changeshape.hpp"
 #include "deepx/tensorfunc/changeshape_miaobyte.cuh"
-#include "deepx/tensorfunc/cuda.hpp"
 #include "deepx/tensorfunc/authors.hpp"
 #include "deepx/shape_changeshape.hpp"
 #include "stdutil/error.hpp"
@@ -53,9 +52,8 @@ namespace deepx::tensorfunc
             {
                 throw std::runtime_error("Dimension order size must match tensor dimension size for transpose");
             }
-            auto [actual_blocks, optimal_block_size] = BestDims(tensor.shape.size);
-            launch_transpose<T>(actual_blocks, optimal_block_size,
-                                tensor.data, tensor.shape.strides.data(),
+           
+            launch_transpose<T>(tensor.data, tensor.shape.strides.data(),
                                 output.data, output.shape.strides.data(),
                                 tensor.shape.dim, tensor.shape.size, dim_order.data());
         }
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu
new file mode 100644
index 00000000..c7df6cf6
--- /dev/null
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu
@@ -0,0 +1,404 @@
+#ifndef DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU
+#define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#include "deepx/tensorfunc/cuda.hpp"
+#include "deepx/tensorfunc/reduce_miaobyte.cuh"
+#include "deepx/tensorfunc/tensor_cuda.cuh"
+#include "deepx/tensorfunc/vector_cuda.cuh"
+#include "deepx/tensorfunc/cuda_math.hpp"
+
+namespace deepx::tensorfunc
+{
+
+    template <int DIM, typename T>
+    __global__ void sum_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                                     const int *reduced_dims, const bool keepdims,
+                                     T *result_data, const int *result_strides, const int result_dim)
+    {
+        const int grid_stride = gridDim.x * blockDim.x;
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        for (; thread_id < tensor_len; thread_id += grid_stride)
+        {
+            int input_indices[DIM];
+            linearTo(tensor_strides, tensor_dim, input_indices, thread_id);
+            int output_indices[DIM];
+            for (size_t i = 0, j = 0; i < tensor_dim; ++i)
+            {
+                if (reduced_dims[i] == 0)
+                {
+                    output_indices[j++] = input_indices[i];
+                }
+                else if (keepdims && (reduced_dims[i] == 1))
+                {
+                    output_indices[j++] = 0;
+                }
+            }
+            int outputIdx = linearAt(result_strides, result_dim, output_indices);
+            int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
+            result_data[outputIdx] += tensor_data[inputIdx];
+        }
+    }
+ 
+    template <typename T>
+    __host__ void launch_sum(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          T *result_data, const int *result_strides, const int result_dim)
+    {
+        auto [numBlocks, blockSize] = BestDims(tensor_len);
+        cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice);
+
+        int powDim = nextPowerOf2(tensor_dim);
+        switch (powDim)
+        {
+        case 1:
+            sum_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 2:
+            sum_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 4:
+            sum_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 8:
+            sum_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len,reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 16:
+            sum_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 32:
+            sum_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 64:
+            sum_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 128:
+            sum_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        default:
+            throw std::runtime_error("dim too large, max support 128");
+        }
+    }
+
+    template void launch_sum<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          double *result_data, const int *result_strides, const int result_dim);
+    template void launch_sum<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          float *result_data, const int *result_strides, const int result_dim);
+    template void launch_sum<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+    template void launch_sum<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          __half *result_data, const int *result_strides, const int result_dim);
+    template void launch_sum<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int64_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_sum<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int32_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_sum<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int16_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_sum<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int8_t *result_data, const int *result_strides, const int result_dim);
+
+    //prod
+    template <int DIM, typename T>
+    __global__ void  prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                                      const int *reduced_dims, const bool keepdims,
+                                      T *result_data, const int *result_strides, const int result_dim)
+    {
+        const int grid_stride = gridDim.x * blockDim.x;
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        for (; thread_id < tensor_len; thread_id += grid_stride)
+        {
+            int input_indices[DIM];
+            linearTo(tensor_strides, tensor_dim, input_indices, thread_id);
+            int output_indices[DIM];
+            for (size_t i = 0, j = 0; i < tensor_dim; ++i)
+            {
+                if (reduced_dims[i] == 0)
+                {
+                    output_indices[j++] = input_indices[i];
+                }
+                else if (keepdims && (reduced_dims[i] == 1))
+                {
+                    output_indices[j++] = 0;
+                }
+            }
+            int outputIdx = linearAt(result_strides, result_dim, output_indices);
+            int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
+            result_data[outputIdx] *= tensor_data[inputIdx];
+        }
+    }
+
+    template <typename T>
+    void launch_prod(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          T *result_data, const int *result_strides, const int result_dim)
+    {
+        auto [numBlocks, blockSize] = BestDims(tensor_len);
+        cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice);
+
+        int powDim = nextPowerOf2(tensor_dim);
+        switch (powDim)
+        {
+        case 1:
+            prod_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 2:
+            prod_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 4:
+            prod_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 8:
+            prod_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 16:
+            prod_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 32:
+            prod_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 64:
+            prod_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 128:
+            prod_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        default:
+            throw std::runtime_error("dim too large, max support 128");
+        }
+    }
+
+    template void launch_prod<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          double *result_data, const int *result_strides, const int result_dim);
+    template void launch_prod<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          float *result_data, const int *result_strides, const int result_dim);
+    template void launch_prod<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+    template void launch_prod<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          __half *result_data, const int *result_strides, const int result_dim);
+    template void launch_prod<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int64_t *result_data, const int *result_strides, const int result_dim);   
+    template void launch_prod<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int32_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_prod<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int16_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_prod<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int8_t *result_data, const int *result_strides, const int result_dim);
+
+    //max
+    template <int DIM, typename T>
+    __global__ void max_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                               const int *reduced_dims, const bool keepdims,
+                               T *result_data, const int *result_strides, const int result_dim){
+        const int grid_stride = gridDim.x * blockDim.x; 
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        for (; thread_id < tensor_len; thread_id += grid_stride)
+        {
+            int input_indices[DIM];
+            linearTo(tensor_strides, tensor_dim, input_indices, thread_id);
+            int output_indices[DIM];
+            for (size_t i = 0, j = 0; i < tensor_dim; ++i)
+            {
+                if (reduced_dims[i] == 0)
+                {
+                    output_indices[j++] = input_indices[i];
+                }
+                else if (keepdims && (reduced_dims[i] == 1))
+                {
+                    output_indices[j++] = 0;
+                }
+            }   
+            int outputIdx = linearAt(result_strides, result_dim, output_indices);
+            int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
+            deepx_max(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx);   
+        }
+    }
+
+    template <typename T>
+    void launch_reducemax(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          T *result_data, const int *result_strides, const int result_dim)
+    {
+        auto [numBlocks, blockSize] = BestDims(tensor_len);
+        cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice);
+
+        int powDim = nextPowerOf2(tensor_dim);
+        switch (powDim)
+        {
+        case 1:
+            max_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 2:
+            max_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 4:
+            max_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 8:
+            max_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 16:
+            max_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 32:
+            max_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 64:
+            max_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 128:
+            max_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        default:
+            throw std::runtime_error("dim too large, max support 128");
+        }
+    };
+
+    template void launch_reducemax<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          double *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemax<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          float *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemax<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemax<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          __half *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemax<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int64_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemax<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int32_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemax<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int16_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemax<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int8_t *result_data, const int *result_strides, const int result_dim);
+
+    //min
+    template <int DIM, typename T>
+    __global__ void min_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                               const int *reduced_dims, const bool keepdims,
+                               T *result_data, const int *result_strides, const int result_dim){
+        const int grid_stride = gridDim.x * blockDim.x; 
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        for (; thread_id < tensor_len; thread_id += grid_stride)
+        {
+            int input_indices[DIM];
+            linearTo(tensor_strides, tensor_dim, input_indices, thread_id);
+            int output_indices[DIM];
+            for (size_t i = 0, j = 0; i < tensor_dim; ++i)
+            {
+                if (reduced_dims[i] == 0)
+                {
+                    output_indices[j++] = input_indices[i];
+                }
+                else if (keepdims && (reduced_dims[i] == 1))
+                {
+                    output_indices[j++] = 0;
+                }
+            }   
+            int outputIdx = linearAt(result_strides, result_dim, output_indices);
+            int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
+            deepx_min(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx);   
+        }   
+    }
+
+    template <typename T>
+    void launch_reducemin(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          T *result_data, const int *result_strides, const int result_dim)
+    {
+        auto [numBlocks, blockSize]  = BestDims(tensor_len);
+        cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims , tensor_dim, cudaMemcpyHostToDevice);
+
+        int powDim = nextPowerOf2(tensor_dim);
+        switch (powDim)
+        {
+        case 1:
+            min_kernel<1, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 2:
+            min_kernel<2, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 4:
+            min_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;  
+        case 8:
+            min_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 16:
+            min_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;  
+        case 32:
+            min_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 64:
+            min_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        case 128:
+            min_kernel<128, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            break;
+        default:
+            throw std::runtime_error("dim too large, max support 128");
+        }
+    }   
+    
+    template void launch_reducemin<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          double *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemin<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          float *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemin<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemin<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          __half *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemin<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int64_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemin<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int32_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemin<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int16_t *result_data, const int *result_strides, const int result_dim);
+    template void launch_reducemin<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          int8_t *result_data, const int *result_strides, const int result_dim);
+
+}
+
+#endif // DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh
new file mode 100644
index 00000000..de1dbf4c
--- /dev/null
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cuh
@@ -0,0 +1,58 @@
+#ifndef DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CUH
+#define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CUH
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace deepx::tensorfunc
+{
+    // sum
+    template <int DIM, typename T>
+    __global__ void  sum_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                                     const int *reduced_dims, const bool keepdims,
+                                     T *result_data, const int *result_strides, const int result_dim);
+  
+    template <typename T>
+    void launch_sum(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          T *result_data, const int *result_strides, const int result_dim);
+
+  
+    //prod
+
+    template <int DIM, typename T>
+    __global__ void  prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                                      const int *reduced_dims, const bool keepdims,
+                                      T *result_data, const int *result_strides, const int result_dim);
+
+    template <typename T>
+    void launch_prod(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                           const int *reduced_dims, const bool keepdims,
+                           T *result_data, const int *result_strides, const int result_dim);
+   
+    //max
+    template <int DIM, typename T>
+    __global__ void reducemax_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                                      const int *reduced_dims, const bool keepdims,
+                                      T *result_data, const int *result_strides, const int result_dim);
+                                      
+    template <typename T>
+    void launch_reducemax(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          T *result_data, const int *result_strides, const int result_dim);
+
+   
+    //min
+    template <int DIM, typename T>
+    __global__ void reducemin_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                                      const int *reduced_dims, const bool keepdims,
+                                      T *result_data, const int *result_strides, const int result_dim);
+
+    template <typename T>
+    void launch_reducemin(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                          const int *reduced_dims, const bool keepdims,
+                          T *result_data, const int *result_strides, const int result_dim);
+ 
+}
+
+#endif //DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CUH
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp
index 7b2a9f58..ce4c7223 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.hpp
@@ -2,67 +2,72 @@
 #define DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP
 
 #include <vector>
-#include <algorithm>
 #include <stdexcept>
 
 #include "deepx/tensor.hpp"
-#include "deepx/shape_reduce.hpp"
-#include "deepx/tensorfunc/authors.hpp"
-#include <deepx/vector_combination.hpp>
-
 #include "deepx/tensorfunc/reduce.hpp"
+#include "deepx/tensorfunc/reduce_miaobyte.cuh"
+#include "deepx/shape_reduce.hpp"
+#include "deepx/tensorfunc/authors.hpp" 
+#include "deepx/tensorfunc/init_miaobyte.hpp"
 
 namespace deepx::tensorfunc
 {
-    
-    template < typename T>
-    struct reducemaxDispatcher<miaobyte, T>
+
+    template <typename T>
+    struct sumDispatcher<miaobyte, T>
     {
-        static void reducemax(const Tensor<T> &A, const std::vector<int> &dims, Tensor<T> &B,const bool keepdims) {
-            if (axis < 0) {
-                axis += A.shape.dim;
-            }
-            if (axis >= A.shape.dim) {
-                throw std::invalid_argument("Invalid axis for reducemax");
-            }
-            
+        static void sum(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
+        {
+            constant<miaobyte, T>(result, T(0));
+            std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
+            std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
+            launch_sum<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+                          reduced_dims.data(), keepdims,
+                          result.data, result.shape.strides.data(), result.shape.dim);
         }
     };
 
-
-    template < typename T>
-    struct reduceminDispatcher<miaobyte, T>
+    template <typename T>
+    struct prodDispatcher<miaobyte, T>
     {
-        static void reducemin(const Tensor<T> &A, const std::vector<int> &dims, Tensor<T> &B,const bool keepdims) {
-            if (axis < 0) {
-                axis += A.shape.dim;
-            }
-            if (axis >= A.shape.dim) {
-                throw std::invalid_argument("Invalid axis for reducemin");
-            }
-            
+        static void prod(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
+        {
+            constant<miaobyte, T>(result, T(1));
+            std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
+            std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
+            launch_prod<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+                           reduced_dims.data(), keepdims,
+                           result.data, result.shape.strides.data(), result.shape.dim);
         }
     };
-
-
     template <typename T>
-    struct sumDispatcher<miaobyte, T>
+    struct reducemaxDispatcher<miaobyte, T>
     {
-        static void sum(const Tensor<T> &tensor, const std::vector<int> &dims, Tensor<T> &result,const bool keepdims)
+        static void reducemax(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
         {
-             
-            
+            constant<miaobyte, T>(result, std::numeric_limits<T>::lowest());
+            std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
+            std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
+            launch_reducemax<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+                                reduced_dims.data(), keepdims,
+                                result.data, result.shape.strides.data(), result.shape.dim);
         }
     };
-     
 
     template <typename T>
-    struct prodDispatcher<miaobyte, T>
+    struct reduceminDispatcher<miaobyte, T>
     {
-        static void prod(const Tensor<T> &tensor, const std::vector<int> &dims, Tensor<T> &result,const bool keepdims)
+        static void reducemin(const Tensor<T> &tensor, const std::vector<int> &dims, const bool keepdims, Tensor<T> &result)
         {
-
+            constant<miaobyte, T>(result, std::numeric_limits<T>::max());
+            std::vector<int> checkeddims = checkedDims(tensor.shape.shape, dims);
+            std::vector<int> reduced_dims = reducedDim(tensor.shape.shape, checkeddims);
+            launch_reducemin<T>(tensor.data, tensor.shape.strides.data(), tensor.shape.dim, tensor.shape.size,
+                                reduced_dims.data(), keepdims,
+                                result.data, result.shape.strides.data(), result.shape.dim);
         }
     };
 }
-#endif DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP
\ No newline at end of file
+
+#endif //DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_HPP
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp
new file mode 100644
index 00000000..4a1643ea
--- /dev/null
+++ b/excuter/op-mem-cuda/src/deepx/tf/reduce.hpp
@@ -0,0 +1,252 @@
+#ifndef DEEPX_TF_REDUCE_HPP
+#define DEEPX_TF_REDUCE_HPP
+
+#include <vector>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#include "deepx/tensorfunc/reduce_miaobyte.hpp"
+
+namespace deepx::tf
+{
+    template <typename Author>
+    class Sum : public TF
+    {
+    public:
+        Sum(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "sum";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+
+        string math_formula() const override
+        {
+            return "B = sum(A, axis=[1 2], keepdims=false)";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Sum>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            vector<int> dims = this->getvector<int>(1, true);
+            bool keepdims = this->getvar<bool>(2,mem,true);
+            Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (input_type != output_type)
+            {
+                error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type);
+                return 1;
+            }
+            switch (input_type)
+            {
+            case Precision::Float64:
+                sum<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                sum<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::BFloat16:
+                sum<Author, nv_bfloat16>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                break;
+            case Precision::Float16:
+                sum<Author, __half>(*mem->gettensor<__half>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<__half>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                sum<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                sum<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                sum<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                sum<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported type: " + precision_str(input_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+
+    template <typename Author>
+    class Prod : public TF
+    {
+    public:
+        Prod(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "prod";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override
+        {
+            return "B = prod(A, axis=[1 2], keepdims=false)";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Prod>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            vector<int> dims = this->getvector<int>(1, true);
+            bool keepdims = this->getvar<bool>(2,mem,true);
+            Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (input_type != output_type)
+            {
+                error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type);
+                return 1;
+            }
+            switch (input_type)
+            {
+            case Precision::Float64:
+                prod<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                prod<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                prod<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                prod<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                prod<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                prod<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported type: " + precision_str(input_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+
+        template <typename Author>
+    class ReduceMax : public TF
+    {
+    public:
+        ReduceMax(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "reducemax";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override
+        {
+            return "B = reducemax(A, axis=[1 2], keepdims=false)";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<ReduceMax>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            vector<int> dims = this->getvector<int>(1, true);
+            bool keepdims = this->getvar<bool>(2,mem,true);
+            Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (input_type != output_type)
+            {
+                error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type);
+                return 1;
+            }
+            switch (input_type)
+            {
+            case Precision::Float64:
+                reducemax<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                reducemax<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                reducemax<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                reducemax<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                reducemax<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                reducemax<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported type: " + precision_str(input_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+
+        template <typename Author>
+    class ReduceMin : public TF
+    {
+    public:
+        ReduceMin(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "reducemin";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override
+        {
+            return "B = reducemin(A, axis=[1 2], keepdims=false)";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<ReduceMin>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision input_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            vector<int> dims = this->getvector<int>(1, true);
+            bool keepdims = this->getvar<bool>(2,mem,true);
+            Precision output_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (input_type != output_type)
+            {
+                error = "Type mismatch: " + precision_str(input_type) + " != " + precision_str(output_type);
+                return 1;
+            }
+            switch (input_type)
+            {
+            case Precision::Float64:
+                reducemin<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                reducemin<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                reducemin<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                reducemin<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                reducemin<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                reducemin<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), dims, keepdims, *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported type: " + precision_str(input_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+}
+
+#endif // DEEPX_TF_REDUCE_HPP

From b2c9d8360974423275bf3f3034a7fe2b5c4c16a5 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Sun, 13 Apr 2025 20:41:18 +0800
Subject: [PATCH 6/6] front&excuter/reduce:sum,prod

---
 doc/excuter/op-mem-cuda/list.md               |   4 +
 excuter/op-mem-cuda/src/client/tfs.cpp        |   2 +-
 .../src/deepx/tensorfunc/cuda_math.cuh        | 363 ++++++++++++++++++
 .../src/deepx/tensorfunc/cuda_math.hpp        | 119 ------
 .../src/deepx/tensorfunc/reduce_miaobyte.cu   | 207 +++++-----
 front/py/deepx/scheduler/client/udpconn.py    |   2 +-
 .../py/examples/2_ir/5_reduce_sum_keepdim.dot |  48 +--
 .../2_ir/5_reduce_sum_keepdim.dot.svg         |  96 ++---
 front/py/examples/2_ir/5_reduce_sumprod.dot   |  48 +--
 .../py/examples/2_ir/5_reduce_sumprod.dot.svg |  96 ++---
 10 files changed, 619 insertions(+), 366 deletions(-)
 create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh
 delete mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp

diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index eab757a0..36d05762 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -4,6 +4,7 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
+| reducemax | miaobyte | reducemax(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
 | broadcastTo | miaobyte | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor<any> A, vector<int32> new_shape)->(tensor<any> B) |
 | concat | miaobyte | concat(listtensor<any> tensors, var<int32> axis)->(tensor<any> result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor<any> tensors, var<int32> axis)->(tensor<any> result) |
 | transpose | miaobyte | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor<any> A, vector<int32> dim_order)->(tensor<any> C) |
@@ -24,8 +25,10 @@
 | newtensor |  none  | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
 | newtensor |  none  | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
 | vecset |  none  | vecset(vector<any> value)->(vector<any> name) | shape = [3  4  5] | vecset(vector<any> value)->(vector<any> name) |
+| reducemin | miaobyte | reducemin(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
 | subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
 | sqrt | miaobyte | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=sqrt(T1) | sqrt(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
+| sum | miaobyte | sum(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
 | argset |  none  | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |
 | sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | mulscalar | miaobyte | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1*scalar | mulscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
@@ -40,5 +43,6 @@
 | rdivscalar | miaobyte | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
 | minscalar | miaobyte | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | cos | miaobyte | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
+| prod | miaobyte | prod(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor<any> A, vector<int32> dims, var<bool> keepdims)->(tensor<any> B) |
 | min | miaobyte | min(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=min(T1, T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
 | compare | miaobyte | compare(tensor<any> A, tensor<any> B)->(tensor<int8> mask) | mask=compare(T1, T2) | compare(tensor<any> A, tensor<any> B)->(tensor<int8> mask) |
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index 2ef18928..9cd9a9f3 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -430,7 +430,7 @@ namespace deepx::tf
         register_elementwise(tffactory);
         register_matmul(tffactory);
         register_changeshape(tffactory);
-        // register_reduce(opfactory);
+        register_reduce(tffactory);
         return 0;
     }
 }
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh
new file mode 100644
index 00000000..d1828724
--- /dev/null
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh
@@ -0,0 +1,363 @@
+#ifndef DEEPX_TENSORFUNC_CUDA_MATH_CUH
+#define DEEPX_TENSORFUNC_CUDA_MATH_CUH
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cublas_v2.h>
+
+namespace deepx::tensorfunc
+{
+
+    // max
+    template <typename T>
+    __device__ void deepx_max(const T *a, const T *b, T *out);
+
+    template <>
+    __device__ void deepx_max<double>(const double *a, const double *b, double *out)
+    {
+        *out = fmax(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_max<float>(const float *a, const float *b, float *out)
+    {
+        *out = fmaxf(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_max<half>(const half *a, const half *b, half *out)
+    {
+        *out = __hmax(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_max<nv_bfloat16>(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out)
+    {
+        *out = __hmax(*a, *b);
+    }
+    template <>
+    __device__ void deepx_max<int64_t>(const int64_t *a, const int64_t *b, int64_t *out)
+    {
+        *out = *a > *b ? *a : *b;
+    }
+    template <>
+    __device__ void deepx_max<int32_t>(const int32_t *a, const int32_t *b, int32_t *out)
+    {
+        *out = *a > *b ? *a : *b;
+    }
+    template <>
+    __device__ void deepx_max<int16_t>(const int16_t *a, const int16_t *b, int16_t *out)
+    {
+        *out = *a > *b ? *a : *b;
+    }
+    template <>
+    __device__ void deepx_max<int8_t>(const int8_t *a, const int8_t *b, int8_t *out)
+    {
+        *out = *a > *b ? *a : *b;
+    }
+
+    // min
+    template <typename T>
+    __device__ void deepx_min(const T *a, const T *b, T *out);
+
+    template <>
+    __device__ void deepx_min<double>(const double *a, const double *b, double *out)
+    {
+        *out = fmin(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_min<float>(const float *a, const float *b, float *out)
+    {
+        *out = fminf(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_min<half>(const half *a, const half *b, half *out)
+    {
+        *out = __hmin(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_min<nv_bfloat16>(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out)
+    {
+        *out = __hmin(*a, *b);
+    }
+
+    template <>
+    __device__ void deepx_min<int64_t>(const int64_t *a, const int64_t *b, int64_t *out)
+    {
+        *out = *a < *b ? *a : *b;
+    }
+
+    template <>
+    __device__ void deepx_min<int32_t>(const int32_t *a, const int32_t *b, int32_t *out)
+    {
+        *out = *a < *b ? *a : *b;
+    }
+
+    template <>
+    __device__ void deepx_min<int16_t>(const int16_t *a, const int16_t *b, int16_t *out)
+    {
+        *out = *a < *b ? *a : *b;
+    }
+
+    template <>
+    __device__ void deepx_min<int8_t>(const int8_t *a, const int8_t *b, int8_t *out)
+    {
+        *out = *a < *b ? *a : *b;
+    }
+
+    // atomicAdd
+    template <typename T>
+    __device__ void deepx_atomicAdd(T *a, T b);
+
+    template <>
+    __device__ void deepx_atomicAdd<double>(double *a, double b)
+    {
+        atomicAdd(a, b);
+    }
+
+    template <>
+    __device__ void deepx_atomicAdd<float>(float *a, float b)
+    {
+        atomicAdd(a, b);
+    }
+
+    template <>
+    __device__ void deepx_atomicAdd<half>(half *a, half b)
+    {
+        atomicAdd(a, b);
+    }
+
+    template <>
+    __device__ void deepx_atomicAdd<nv_bfloat16>(nv_bfloat16 *a, nv_bfloat16 b)
+    {
+        atomicAdd(a, b);
+    }
+
+    template <>
+    __device__ void deepx_atomicAdd<int64_t>(int64_t *a, int64_t b)
+    {
+        int64_t old = *a;
+        int64_t assumed;
+        do
+        {
+            assumed = old;
+            old = atomicCAS((unsigned long long *)a, (unsigned long long)assumed, (unsigned long long)(assumed + b));
+        } while (assumed != old);
+        *a = old + b;
+    }
+
+    template <>
+    __device__ void deepx_atomicAdd<int32_t>(int32_t *a, int32_t b)
+    {
+        atomicAdd(a, b);
+    }
+
+    template <>
+    __device__ void deepx_atomicAdd<int16_t>(int16_t *a, int16_t b)
+    {
+        unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2));
+        unsigned int old = *address_as_uint;
+        unsigned int assumed;
+
+        do
+        {
+            assumed = old;
+            unsigned int new_val;
+            if ((size_t)a & 2)
+            {
+                new_val = (old & 0x0000FFFF) | (((unsigned short)(((old >> 16) & 0xFFFF) + b)) << 16);
+            }
+            else
+            {
+                new_val = (old & 0xFFFF0000) | ((unsigned short)((old & 0xFFFF) + b));
+            }
+            old = atomicCAS(address_as_uint, assumed, new_val);
+        } while (assumed != old);
+    }
+
+    template <>
+    __device__ void deepx_atomicAdd<int8_t>(int8_t *a, int8_t b)
+    {
+        unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 3));
+        unsigned int old = *address_as_uint;
+        unsigned int assumed;
+        unsigned int byte_offset = ((size_t)a & 3) * 8;
+        unsigned int mask = 0xFF << byte_offset;
+
+        do
+        {
+            assumed = old;
+            unsigned char byte_val = (old >> byte_offset) & 0xFF;
+            byte_val += b;
+            unsigned int new_val = (old & ~mask) | (byte_val << byte_offset);
+            old = atomicCAS(address_as_uint, assumed, new_val);
+        } while (assumed != old);
+    }
+
+
+    // atomicMul
+       // atomicMul
+    template <typename T>
+    __device__ void deepx_atomicMul(T *a, T b);
+
+    template <>
+    __device__ void deepx_atomicMul<double>(double *a, double b)
+    {
+        double old = *a;
+        double assumed;
+        do
+        {
+            assumed = old;
+            old = __longlong_as_double(atomicCAS((unsigned long long int*)a, 
+                                               __double_as_longlong(assumed),
+                                               __double_as_longlong(assumed * b)));
+        } while (assumed != old);
+    }
+
+    template <>
+    __device__ void deepx_atomicMul<float>(float *a, float b)
+    {
+        float old = *a;
+        float assumed;
+        do
+        {
+            assumed = old;
+            old = __int_as_float(atomicCAS((int*)a, 
+                                          __float_as_int(assumed),
+                                          __float_as_int(assumed * b)));
+        } while (assumed != old);
+    }
+    
+    template <>
+    __device__ void deepx_atomicMul<half>(half *a, half b)
+    {
+        unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2));
+        unsigned int old = *address_as_uint;
+        unsigned int assumed;
+
+        do
+        {
+            assumed = old;
+            half assumed_half;
+            if ((size_t)a & 2)
+            {
+                assumed_half = __ushort_as_half((unsigned short)(old >> 16));
+                half new_half = __hmul(assumed_half, b);
+                unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__half_as_ushort(new_half) << 16);
+                old = atomicCAS(address_as_uint, assumed, new_val);
+            }
+            else
+            {
+                assumed_half = __ushort_as_half((unsigned short)(old & 0xFFFF));
+                half new_half = __hmul(assumed_half, b);
+                unsigned int new_val = (old & 0xFFFF0000) | __half_as_ushort(new_half);
+                old = atomicCAS(address_as_uint, assumed, new_val);
+            }
+        } while (assumed != old);
+    }
+
+    template <>
+    __device__ void deepx_atomicMul<nv_bfloat16>(nv_bfloat16 *a, nv_bfloat16 b)
+    {
+        unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2));
+        unsigned int old = *address_as_uint;
+        unsigned int assumed;
+
+        do
+        {
+            assumed = old;
+            nv_bfloat16 assumed_bf16;
+            if ((size_t)a & 2)
+            {
+                assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old >> 16));
+                nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b);
+                unsigned int new_val = (old & 0x0000FFFF) | ((unsigned int)__bfloat16_as_ushort(new_bf16) << 16);
+                old = atomicCAS(address_as_uint, assumed, new_val);
+            }
+            else
+            {
+                assumed_bf16 = __ushort_as_bfloat16((unsigned short)(old & 0xFFFF));
+                nv_bfloat16 new_bf16 = __hmul(assumed_bf16, b);
+                unsigned int new_val = (old & 0xFFFF0000) | __bfloat16_as_ushort(new_bf16);
+                old = atomicCAS(address_as_uint, assumed, new_val);
+            }
+        } while (assumed != old);
+    }
+
+    template <>
+    __device__ void deepx_atomicMul<int64_t>(int64_t *a, int64_t b)
+    {
+        int64_t old = *a;
+        int64_t assumed;
+        do
+        {
+            assumed = old;
+            old = atomicCAS((unsigned long long *)a, 
+                          (unsigned long long)assumed, 
+                          (unsigned long long)(assumed * b));
+        } while (assumed != old);
+    }
+
+    template <>
+    __device__ void deepx_atomicMul<int32_t>(int32_t *a, int32_t b)
+    {
+        int32_t old = *a;
+        int32_t assumed;
+        do
+        {
+            assumed = old;
+            old = atomicCAS((int32_t *)a, assumed, assumed * b);
+        } while (assumed != old);
+    }
+
+    template <>
+    __device__ void deepx_atomicMul<int16_t>(int16_t *a, int16_t b)
+    {
+        unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 2));
+        unsigned int old = *address_as_uint;
+        unsigned int assumed;
+
+        do
+        {
+            assumed = old;
+            unsigned int new_val;
+            if ((size_t)a & 2)
+            {
+                int16_t assumed_short = (int16_t)(old >> 16);
+                new_val = (old & 0x0000FFFF) | (((unsigned short)(assumed_short * b)) << 16);
+            }
+            else
+            {
+                int16_t assumed_short = (int16_t)(old & 0xFFFF);
+                new_val = (old & 0xFFFF0000) | ((unsigned short)(assumed_short * b));
+            }
+            old = atomicCAS(address_as_uint, assumed, new_val);
+        } while (assumed != old);
+    }
+
+    template <>
+    __device__ void deepx_atomicMul<int8_t>(int8_t *a, int8_t b)
+    {
+        unsigned int *address_as_uint = (unsigned int *)((char *)a - ((size_t)a & 3));
+        unsigned int old = *address_as_uint;
+        unsigned int assumed;
+        unsigned int byte_offset = ((size_t)a & 3) * 8;
+        unsigned int mask = 0xFF << byte_offset;
+
+        do
+        {
+            assumed = old;
+            int8_t byte_val = (old >> byte_offset) & 0xFF;
+            byte_val *= b;
+            unsigned int new_val = (old & ~mask) | ((byte_val & 0xFF) << byte_offset);
+            old = atomicCAS(address_as_uint, assumed, new_val);
+        } while (assumed != old);
+    }
+    
+}
+
+#endif // DEEPX_TENSORFUNC_CUDA_MATH_CUH
\ No newline at end of file
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp
deleted file mode 100644
index f0a3c578..00000000
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.hpp
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef DEEPX_TENSORFUNC_CUDA_MATH_HPP
-#define DEEPX_TENSORFUNC_CUDA_MATH_HPP
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include <cublas_v2.h>
-
-namespace deepx::tensorfunc
-{
-
-    // max
-    template <typename T>
-    __device__ void deepx_max(const T *a, const T *b, T *out);
-
-    template <>
-    __device__ void deepx_max<double>(const double *a, const double *b, double *out)
-    {
-        *out = fmax(*a, *b);
-    }
-    
-    template <>
-    __device__ void deepx_max<float>(const float *a, const float *b, float *out)
-    {
-        *out = fmaxf(*a, *b);
-    }
-
-    template <>
-    __device__ void deepx_max<half>(const half *a, const half *b, half *out)
-    {
-        *out = __hmax(*a, *b);
-    }
-
-    template <>
-    __device__ void deepx_max<nv_bfloat16>(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out)
-    {
-        *out = __hmax(*a, *b);
-    }
-    template <>
-    __device__ void deepx_max<int64_t>(const int64_t *a, const int64_t *b, int64_t *out)
-    {
-        *out = *a>*b?*a:*b;
-    }
-    template <>
-    __device__ void deepx_max<int32_t>(const int32_t *a, const int32_t *b, int32_t *out)
-    {
-        *out = *a>*b?*a:*b;
-    }
-    template <>
-    __device__ void deepx_max<int16_t>(const int16_t *a, const int16_t *b, int16_t *out)
-    {
-        *out = *a>*b?*a:*b;
-    }
-    template <>
-    __device__ void deepx_max<int8_t>(const int8_t *a, const int8_t *b, int8_t *out)
-    {
-        *out = *a>*b?*a:*b;
-    }
-    
- 
-    // min
-    template <typename T>
-    __device__ void deepx_min(const T *a, const T *b, T *out);
- 
-
-    template <>
-    __device__ void deepx_min<double>(const double *a, const double *b, double *out)
-    {
-        *out = fmin(*a, *b);
-    }
-
-    template <>
-    __device__ void deepx_min<float>(const float *a, const float *b, float *out)
-    {
-        *out = fminf(*a, *b);
-    }
-
-
-    template <>
-    __device__ void deepx_min<half>(const half *a, const half *b, half *out)
-    {
-        *out = __hmin(*a, *b);
-    }
-
-    template <>
-    __device__ void deepx_min<nv_bfloat16>(const nv_bfloat16 *a,const nv_bfloat16 *b, nv_bfloat16 *out)
-    {
-        *out = __hmin(*a, *b);
-    }
-
-    template <>
-    __device__ void deepx_min<int64_t>(const int64_t *a, const int64_t *b, int64_t *out)
-    {
-        *out = *a<*b?*a:*b;
-    }
-
-    template <>
-    __device__ void deepx_min<int32_t>(const int32_t *a, const int32_t *b, int32_t *out)
-    {
-        *out = *a<*b?*a:*b;
-    }
-
-    template <>
-    __device__ void deepx_min<int16_t>(const int16_t *a, const int16_t *b, int16_t *out)
-    {
-        *out = *a<*b?*a:*b;
-    }
-
-    template <>
-    __device__ void deepx_min<int8_t>(const int8_t *a, const int8_t *b, int8_t *out)
-    {
-        *out =  *a<*b?*a:*b;
-    }   
-
- 
-
-}
-
-#endif
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu
index c7df6cf6..43717698 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu
@@ -9,15 +9,18 @@
 #include "deepx/tensorfunc/reduce_miaobyte.cuh"
 #include "deepx/tensorfunc/tensor_cuda.cuh"
 #include "deepx/tensorfunc/vector_cuda.cuh"
-#include "deepx/tensorfunc/cuda_math.hpp"
+#include "deepx/tensorfunc/cuda_math.cuh"
 
 namespace deepx::tensorfunc
 {
 
+ 
+    // sum
+    //DIM是希望申请寄存器中存放索引数组的长度
     template <int DIM, typename T>
     __global__ void sum_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                                     const int *reduced_dims, const bool keepdims,
-                                     T *result_data, const int *result_strides, const int result_dim)
+                               const int *reduced_dims, const bool keepdims,
+                               T *result_data, const int *result_strides, const int result_dim)
     {
         const int grid_stride = gridDim.x * blockDim.x;
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -39,19 +42,20 @@ namespace deepx::tensorfunc
             }
             int outputIdx = linearAt(result_strides, result_dim, output_indices);
             int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
-            result_data[outputIdx] += tensor_data[inputIdx];
+            deepx_atomicAdd(result_data + outputIdx, tensor_data[inputIdx]);
         }
     }
- 
+
     template <typename T>
     __host__ void launch_sum(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          T *result_data, const int *result_strides, const int result_dim)
+                             const int *reduced_dims, const bool keepdims,
+                             T *result_data, const int *result_strides, const int result_dim)
     {
         auto [numBlocks, blockSize] = BestDims(tensor_len);
+        // int shared_mem_size = blockSize * sizeof(T) + sizeof(int) * tensor_dim;
         cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
         cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
-        cudaVector<int> reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice);
 
         int powDim = nextPowerOf2(tensor_dim);
         switch (powDim)
@@ -66,13 +70,13 @@ namespace deepx::tensorfunc
             sum_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
             break;
         case 8:
-            sum_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len,reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            sum_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
             break;
         case 16:
             sum_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
             break;
         case 32:
-            sum_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
+            sum_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);    
             break;
         case 64:
             sum_kernel<64, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
@@ -86,35 +90,35 @@ namespace deepx::tensorfunc
     }
 
     template void launch_sum<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          double *result_data, const int *result_strides, const int result_dim);
+                                     const int *reduced_dims, const bool keepdims,
+                                     double *result_data, const int *result_strides, const int result_dim);
     template void launch_sum<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          float *result_data, const int *result_strides, const int result_dim);
+                                    const int *reduced_dims, const bool keepdims,
+                                    float *result_data, const int *result_strides, const int result_dim);
     template void launch_sum<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+                                          const int *reduced_dims, const bool keepdims,
+                                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
     template void launch_sum<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          __half *result_data, const int *result_strides, const int result_dim);
+                                     const int *reduced_dims, const bool keepdims,
+                                     __half *result_data, const int *result_strides, const int result_dim);
     template void launch_sum<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int64_t *result_data, const int *result_strides, const int result_dim);
+                                      const int *reduced_dims, const bool keepdims,
+                                      int64_t *result_data, const int *result_strides, const int result_dim);
     template void launch_sum<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int32_t *result_data, const int *result_strides, const int result_dim);
+                                      const int *reduced_dims, const bool keepdims,
+                                      int32_t *result_data, const int *result_strides, const int result_dim);
     template void launch_sum<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int16_t *result_data, const int *result_strides, const int result_dim);
+                                      const int *reduced_dims, const bool keepdims,
+                                      int16_t *result_data, const int *result_strides, const int result_dim);
     template void launch_sum<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int8_t *result_data, const int *result_strides, const int result_dim);
+                                     const int *reduced_dims, const bool keepdims,
+                                     int8_t *result_data, const int *result_strides, const int result_dim);
 
-    //prod
+    // prod
     template <int DIM, typename T>
-    __global__ void  prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                                      const int *reduced_dims, const bool keepdims,
-                                      T *result_data, const int *result_strides, const int result_dim)
+    __global__ void prod_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
+                                const int *reduced_dims, const bool keepdims,
+                                T *result_data, const int *result_strides, const int result_dim)
     {
         const int grid_stride = gridDim.x * blockDim.x;
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -136,19 +140,19 @@ namespace deepx::tensorfunc
             }
             int outputIdx = linearAt(result_strides, result_dim, output_indices);
             int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
-            result_data[outputIdx] *= tensor_data[inputIdx];
+            deepx_atomicMul(result_data + outputIdx, tensor_data[inputIdx]);
         }
     }
 
     template <typename T>
     void launch_prod(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          T *result_data, const int *result_strides, const int result_dim)
+                     const int *reduced_dims, const bool keepdims,
+                     T *result_data, const int *result_strides, const int result_dim)
     {
         auto [numBlocks, blockSize] = BestDims(tensor_len);
         cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
         cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
-        cudaVector<int> reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice);
 
         int powDim = nextPowerOf2(tensor_dim);
         switch (powDim)
@@ -183,36 +187,37 @@ namespace deepx::tensorfunc
     }
 
     template void launch_prod<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          double *result_data, const int *result_strides, const int result_dim);
+                                      const int *reduced_dims, const bool keepdims,
+                                      double *result_data, const int *result_strides, const int result_dim);
     template void launch_prod<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          float *result_data, const int *result_strides, const int result_dim);
+                                     const int *reduced_dims, const bool keepdims,
+                                     float *result_data, const int *result_strides, const int result_dim);
     template void launch_prod<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+                                           const int *reduced_dims, const bool keepdims,
+                                           nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
     template void launch_prod<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          __half *result_data, const int *result_strides, const int result_dim);
+                                      const int *reduced_dims, const bool keepdims,
+                                      __half *result_data, const int *result_strides, const int result_dim);
     template void launch_prod<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int64_t *result_data, const int *result_strides, const int result_dim);   
+                                       const int *reduced_dims, const bool keepdims,
+                                       int64_t *result_data, const int *result_strides, const int result_dim);
     template void launch_prod<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int32_t *result_data, const int *result_strides, const int result_dim);
+                                       const int *reduced_dims, const bool keepdims,
+                                       int32_t *result_data, const int *result_strides, const int result_dim);
     template void launch_prod<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int16_t *result_data, const int *result_strides, const int result_dim);
+                                       const int *reduced_dims, const bool keepdims,
+                                       int16_t *result_data, const int *result_strides, const int result_dim);
     template void launch_prod<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int8_t *result_data, const int *result_strides, const int result_dim);
+                                      const int *reduced_dims, const bool keepdims,
+                                      int8_t *result_data, const int *result_strides, const int result_dim);
 
-    //max
+    // max
     template <int DIM, typename T>
     __global__ void max_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
                                const int *reduced_dims, const bool keepdims,
-                               T *result_data, const int *result_strides, const int result_dim){
-        const int grid_stride = gridDim.x * blockDim.x; 
+                               T *result_data, const int *result_strides, const int result_dim)
+    {
+        const int grid_stride = gridDim.x * blockDim.x;
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
         for (; thread_id < tensor_len; thread_id += grid_stride)
         {
@@ -229,10 +234,10 @@ namespace deepx::tensorfunc
                 {
                     output_indices[j++] = 0;
                 }
-            }   
+            }
             int outputIdx = linearAt(result_strides, result_dim, output_indices);
             int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
-            deepx_max(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx);   
+            deepx_max(result_data + outputIdx, tensor_data + inputIdx, result_data + outputIdx);
         }
     }
 
@@ -244,7 +249,7 @@ namespace deepx::tensorfunc
         auto [numBlocks, blockSize] = BestDims(tensor_len);
         cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
         cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
-        cudaVector<int> reduced_dims_d(reduced_dims,tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice);
 
         int powDim = nextPowerOf2(tensor_dim);
         switch (powDim)
@@ -279,36 +284,37 @@ namespace deepx::tensorfunc
     };
 
     template void launch_reducemax<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          double *result_data, const int *result_strides, const int result_dim);
+                                           const int *reduced_dims, const bool keepdims,
+                                           double *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemax<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          float *result_data, const int *result_strides, const int result_dim);
+                                          const int *reduced_dims, const bool keepdims,
+                                          float *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemax<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+                                                const int *reduced_dims, const bool keepdims,
+                                                nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemax<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          __half *result_data, const int *result_strides, const int result_dim);
+                                           const int *reduced_dims, const bool keepdims,
+                                           __half *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemax<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int64_t *result_data, const int *result_strides, const int result_dim);
+                                            const int *reduced_dims, const bool keepdims,
+                                            int64_t *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemax<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int32_t *result_data, const int *result_strides, const int result_dim);
+                                            const int *reduced_dims, const bool keepdims,
+                                            int32_t *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemax<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int16_t *result_data, const int *result_strides, const int result_dim);
+                                            const int *reduced_dims, const bool keepdims,
+                                            int16_t *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemax<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int8_t *result_data, const int *result_strides, const int result_dim);
+                                           const int *reduced_dims, const bool keepdims,
+                                           int8_t *result_data, const int *result_strides, const int result_dim);
 
-    //min
+    // min
     template <int DIM, typename T>
     __global__ void min_kernel(const T *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
                                const int *reduced_dims, const bool keepdims,
-                               T *result_data, const int *result_strides, const int result_dim){
-        const int grid_stride = gridDim.x * blockDim.x; 
+                               T *result_data, const int *result_strides, const int result_dim)
+    {
+        const int grid_stride = gridDim.x * blockDim.x;
         int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
         for (; thread_id < tensor_len; thread_id += grid_stride)
         {
@@ -325,11 +331,11 @@ namespace deepx::tensorfunc
                 {
                     output_indices[j++] = 0;
                 }
-            }   
+            }
             int outputIdx = linearAt(result_strides, result_dim, output_indices);
             int inputIdx = linearAt(tensor_strides, tensor_dim, input_indices);
-            deepx_min(result_data+outputIdx, tensor_data+inputIdx, result_data+outputIdx);   
-        }   
+            deepx_min(result_data + outputIdx, tensor_data + inputIdx, result_data + outputIdx);
+        }
     }
 
     template <typename T>
@@ -337,10 +343,10 @@ namespace deepx::tensorfunc
                           const int *reduced_dims, const bool keepdims,
                           T *result_data, const int *result_strides, const int result_dim)
     {
-        auto [numBlocks, blockSize]  = BestDims(tensor_len);
+        auto [numBlocks, blockSize] = BestDims(tensor_len);
         cudaVector<int> tensor_strides_d(tensor_strides, tensor_dim, cudaMemcpyHostToDevice);
         cudaVector<int> result_strides_d(result_strides, result_dim, cudaMemcpyHostToDevice);
-        cudaVector<int> reduced_dims_d(reduced_dims , tensor_dim, cudaMemcpyHostToDevice);
+        cudaVector<int> reduced_dims_d(reduced_dims, tensor_dim, cudaMemcpyHostToDevice);
 
         int powDim = nextPowerOf2(tensor_dim);
         switch (powDim)
@@ -353,13 +359,13 @@ namespace deepx::tensorfunc
             break;
         case 4:
             min_kernel<4, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
-            break;  
+            break;
         case 8:
             min_kernel<8, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
             break;
         case 16:
             min_kernel<16, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
-            break;  
+            break;
         case 32:
             min_kernel<32, T><<<numBlocks, blockSize>>>(tensor_data, tensor_strides_d.data, tensor_dim, tensor_len, reduced_dims_d.data, keepdims, result_data, result_strides_d.data, result_dim);
             break;
@@ -372,33 +378,32 @@ namespace deepx::tensorfunc
         default:
             throw std::runtime_error("dim too large, max support 128");
         }
-    }   
-    
+    }
+
     template void launch_reducemin<double>(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          double *result_data, const int *result_strides, const int result_dim);
+                                           const int *reduced_dims, const bool keepdims,
+                                           double *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemin<float>(const float *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          float *result_data, const int *result_strides, const int result_dim);
+                                          const int *reduced_dims, const bool keepdims,
+                                          float *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemin<nv_bfloat16>(const nv_bfloat16 *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
+                                                const int *reduced_dims, const bool keepdims,
+                                                nv_bfloat16 *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemin<__half>(const __half *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          __half *result_data, const int *result_strides, const int result_dim);
+                                           const int *reduced_dims, const bool keepdims,
+                                           __half *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemin<int64_t>(const int64_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int64_t *result_data, const int *result_strides, const int result_dim);
+                                            const int *reduced_dims, const bool keepdims,
+                                            int64_t *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemin<int32_t>(const int32_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int32_t *result_data, const int *result_strides, const int result_dim);
+                                            const int *reduced_dims, const bool keepdims,
+                                            int32_t *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemin<int16_t>(const int16_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int16_t *result_data, const int *result_strides, const int result_dim);
+                                            const int *reduced_dims, const bool keepdims,
+                                            int16_t *result_data, const int *result_strides, const int result_dim);
     template void launch_reducemin<int8_t>(const int8_t *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len,
-                          const int *reduced_dims, const bool keepdims,
-                          int8_t *result_data, const int *result_strides, const int result_dim);
-
+                                           const int *reduced_dims, const bool keepdims,
+                                           int8_t *result_data, const int *result_strides, const int result_dim);
 }
 
 #endif // DEEPX_TENSORFUNC_REDUCE_MIAOBYTE_CU
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index a25b0963..6a12c26a 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@
 import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # 解析endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot
index 93785caa..fdc348fe 100644
--- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot
+++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot
@@ -2,37 +2,37 @@
 digraph {
 	rankdir=TB
 	node [shape=record]
-	134049762132704 [label="t
+	134483478251840 [label="t
 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047466295776 [label="s
+	134481220276224 [label="s
 (1, 4, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464790368 [label="vector_1
+	134481218754432 [label="vector_1
 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464790512 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	134047464790272 [label="p
+	134481218754576 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134481218754336 [label="p
 (3, 1, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464790608 [label="vector_2
+	134481218754672 [label="vector_2
 [1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464790656 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	134049762132560 [label="t1
+	134481218754720 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134483478251408 [label="t1
 (4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464791088 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	134047464790848 [label="var_1
+	134481218755152 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134481218754912 [label="var_1
 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464790800 [label="t2
+	134481218754864 [label="t2
 (1, 1, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464790464 [label="vector_3
+	134481218754528 [label="vector_3
 [0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	134047464791328 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	134047464790512 -> 134047466295776 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134049762132704 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134047464790368 -> 134047464790512 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134047464790656 -> 134047464790272 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134049762132704 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134047464790608 -> 134047464790656 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134047464791088 -> 134049762132560 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134047464790848 -> 134047464791088 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134047464791328 -> 134047464790800 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134049762132560 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	134047464790464 -> 134047464791328 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218755392 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	134481218754576 -> 134481220276224 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134483478251840 -> 134481218754576 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218754432 -> 134481218754576 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218754720 -> 134481218754336 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134483478251840 -> 134481218754720 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218754672 -> 134481218754720 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218755152 -> 134483478251408 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218754912 -> 134481218755152 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218755392 -> 134481218754864 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134483478251408 -> 134481218755392 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	134481218754528 -> 134481218755392 [arrowsize=0.8 color=gray40 penwidth=1.2]
 }
diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg
index bf6cc407..3786069a 100644
--- a/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg
+++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.dot.svg
@@ -9,156 +9,156 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 334)">
 <title>%3</title>
 <polygon fill="white" stroke="transparent" points="-4,4 -4,-334 377.5,-334 377.5,4 -4,4"/>
-<!-- 134049762132704 -->
+<!-- 134483478251840 -->
 <g id="node1" class="node">
-<title>134049762132704</title>
+<title>134483478251840</title>
 <polygon fill="aliceblue" stroke="skyblue" points="152,-330 90,-330 90,-292 152,-292 152,-330"/>
 <text text-anchor="middle" x="121" y="-314.8" font-family="Sans-Serif" font-size="14.00">t</text>
 <text text-anchor="middle" x="121" y="-299.8" font-family="Sans-Serif" font-size="14.00">(3, 4, 5)</text>
 </g>
-<!-- 134047464790512 -->
+<!-- 134481218754576 -->
 <g id="node4" class="node">
-<title>134047464790512</title>
+<title>134481218754576</title>
 <polygon fill="lightgray" stroke="darkslategray" points="107,-256 53,-256 53,-220 107,-220 107,-256"/>
 <text text-anchor="middle" x="80" y="-234.3" font-family="Courier Bold" font-size="14.00">sum</text>
 </g>
-<!-- 134049762132704&#45;&gt;134047464790512 -->
+<!-- 134483478251840&#45;&gt;134481218754576 -->
 <g id="edge2" class="edge">
-<title>134049762132704&#45;&gt;134047464790512</title>
+<title>134483478251840&#45;&gt;134481218754576</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M110.44,-291.72C105.42,-283.03 99.36,-272.52 93.97,-263.19"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="96.24,-261.53 89.82,-256 91.39,-264.33 96.24,-261.53"/>
 </g>
-<!-- 134047464790656 -->
+<!-- 134481218754720 -->
 <g id="node7" class="node">
-<title>134047464790656</title>
+<title>134481218754720</title>
 <polygon fill="lightgray" stroke="darkslategray" points="189,-256 135,-256 135,-220 189,-220 189,-256"/>
 <text text-anchor="middle" x="162" y="-234.3" font-family="Courier Bold" font-size="14.00">prod</text>
 </g>
-<!-- 134049762132704&#45;&gt;134047464790656 -->
+<!-- 134483478251840&#45;&gt;134481218754720 -->
 <g id="edge5" class="edge">
-<title>134049762132704&#45;&gt;134047464790656</title>
+<title>134483478251840&#45;&gt;134481218754720</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M131.56,-291.72C136.58,-283.03 142.64,-272.52 148.03,-263.19"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="150.61,-264.33 152.18,-256 145.76,-261.53 150.61,-264.33"/>
 </g>
-<!-- 134047466295776 -->
+<!-- 134481220276224 -->
 <g id="node2" class="node">
-<title>134047466295776</title>
+<title>134481220276224</title>
 <polygon fill="aliceblue" stroke="skyblue" points="111,-184 49,-184 49,-146 111,-146 111,-184"/>
 <text text-anchor="middle" x="80" y="-168.8" font-family="Sans-Serif" font-size="14.00">s</text>
 <text text-anchor="middle" x="80" y="-153.8" font-family="Sans-Serif" font-size="14.00">(1, 4, 1)</text>
 </g>
-<!-- 134047464790368 -->
+<!-- 134481218754432 -->
 <g id="node3" class="node">
-<title>134047464790368</title>
+<title>134481218754432</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="72,-330 0,-330 0,-292 72,-292 72,-330"/>
 <text text-anchor="middle" x="36" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_1</text>
 <text text-anchor="middle" x="36" y="-299.8" font-family="Sans-Serif" font-size="14.00">[0, 2]</text>
 </g>
-<!-- 134047464790368&#45;&gt;134047464790512 -->
+<!-- 134481218754432&#45;&gt;134481218754576 -->
 <g id="edge3" class="edge">
-<title>134047464790368&#45;&gt;134047464790512</title>
+<title>134481218754432&#45;&gt;134481218754576</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M47.33,-291.72C52.77,-282.94 59.35,-272.31 65.18,-262.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="67.63,-264.28 69.46,-256 62.87,-261.33 67.63,-264.28"/>
 </g>
-<!-- 134047464790512&#45;&gt;134047466295776 -->
+<!-- 134481218754576&#45;&gt;134481220276224 -->
 <g id="edge1" class="edge">
-<title>134047464790512&#45;&gt;134047466295776</title>
+<title>134481218754576&#45;&gt;134481220276224</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M80,-219.81C80,-211.52 80,-201.39 80,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="82.8,-192.02 80,-184.02 77.2,-192.02 82.8,-192.02"/>
 </g>
-<!-- 134047464790272 -->
+<!-- 134481218754336 -->
 <g id="node5" class="node">
-<title>134047464790272</title>
+<title>134481218754336</title>
 <polygon fill="aliceblue" stroke="skyblue" points="193,-184 131,-184 131,-146 193,-146 193,-184"/>
 <text text-anchor="middle" x="162" y="-168.8" font-family="Sans-Serif" font-size="14.00">p</text>
 <text text-anchor="middle" x="162" y="-153.8" font-family="Sans-Serif" font-size="14.00">(3, 1, 5)</text>
 </g>
-<!-- 134047464790608 -->
+<!-- 134481218754672 -->
 <g id="node6" class="node">
-<title>134047464790608</title>
+<title>134481218754672</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="242,-330 170,-330 170,-292 242,-292 242,-330"/>
 <text text-anchor="middle" x="206" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_2</text>
 <text text-anchor="middle" x="206" y="-299.8" font-family="Sans-Serif" font-size="14.00">[1]</text>
 </g>
-<!-- 134047464790608&#45;&gt;134047464790656 -->
+<!-- 134481218754672&#45;&gt;134481218754720 -->
 <g id="edge6" class="edge">
-<title>134047464790608&#45;&gt;134047464790656</title>
+<title>134481218754672&#45;&gt;134481218754720</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M194.67,-291.72C189.23,-282.94 182.65,-272.31 176.82,-262.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="179.13,-261.33 172.54,-256 174.37,-264.28 179.13,-261.33"/>
 </g>
-<!-- 134047464790656&#45;&gt;134047464790272 -->
+<!-- 134481218754720&#45;&gt;134481218754336 -->
 <g id="edge4" class="edge">
-<title>134047464790656&#45;&gt;134047464790272</title>
+<title>134481218754720&#45;&gt;134481218754336</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M162,-219.81C162,-211.52 162,-201.39 162,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="164.8,-192.02 162,-184.02 159.2,-192.02 164.8,-192.02"/>
 </g>
-<!-- 134049762132560 -->
+<!-- 134483478251408 -->
 <g id="node8" class="node">
-<title>134049762132560</title>
+<title>134483478251408</title>
 <polygon fill="aliceblue" stroke="skyblue" points="363,-184 301,-184 301,-146 363,-146 363,-184"/>
 <text text-anchor="middle" x="332" y="-168.8" font-family="Sans-Serif" font-size="14.00">t1</text>
 <text text-anchor="middle" x="332" y="-153.8" font-family="Sans-Serif" font-size="14.00">(4, 5, 6)</text>
 </g>
-<!-- 134047464791328 -->
+<!-- 134481218755392 -->
 <g id="node13" class="node">
-<title>134047464791328</title>
+<title>134481218755392</title>
 <polygon fill="lightgray" stroke="darkslategray" points="316,-110 262,-110 262,-74 316,-74 316,-110"/>
 <text text-anchor="middle" x="289" y="-88.3" font-family="Courier Bold" font-size="14.00">sum</text>
 </g>
-<!-- 134049762132560&#45;&gt;134047464791328 -->
+<!-- 134483478251408&#45;&gt;134481218755392 -->
 <g id="edge10" class="edge">
-<title>134049762132560&#45;&gt;134047464791328</title>
+<title>134483478251408&#45;&gt;134481218755392</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M320.93,-145.72C315.61,-136.94 309.18,-126.31 303.48,-116.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="305.84,-115.4 299.3,-110 301.05,-118.3 305.84,-115.4"/>
 </g>
-<!-- 134047464791088 -->
+<!-- 134481218755152 -->
 <g id="node9" class="node">
-<title>134047464791088</title>
+<title>134481218755152</title>
 <polygon fill="lightgray" stroke="darkslategray" points="373.5,-256 290.5,-256 290.5,-220 373.5,-220 373.5,-256"/>
 <text text-anchor="middle" x="332" y="-234.3" font-family="Courier Bold" font-size="14.00">constant</text>
 </g>
-<!-- 134047464791088&#45;&gt;134049762132560 -->
+<!-- 134481218755152&#45;&gt;134483478251408 -->
 <g id="edge7" class="edge">
-<title>134047464791088&#45;&gt;134049762132560</title>
+<title>134481218755152&#45;&gt;134483478251408</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M332,-219.81C332,-211.52 332,-201.39 332,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="334.8,-192.02 332,-184.02 329.2,-192.02 334.8,-192.02"/>
 </g>
-<!-- 134047464790848 -->
+<!-- 134481218754912 -->
 <g id="node10" class="node">
-<title>134047464790848</title>
+<title>134481218754912</title>
 <polygon fill="moccasin" stroke="orange" points="359,-330 305,-330 305,-292 359,-292 359,-330"/>
 <text text-anchor="middle" x="332" y="-314.8" font-family="Sans-Serif" font-size="14.00">var_1</text>
 <text text-anchor="middle" x="332" y="-299.8" font-family="Sans-Serif" font-size="14.00">1</text>
 </g>
-<!-- 134047464790848&#45;&gt;134047464791088 -->
+<!-- 134481218754912&#45;&gt;134481218755152 -->
 <g id="edge8" class="edge">
-<title>134047464790848&#45;&gt;134047464791088</title>
+<title>134481218754912&#45;&gt;134481218755152</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M332,-291.72C332,-283.29 332,-273.15 332,-264.02"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="334.8,-264 332,-256 329.2,-264 334.8,-264"/>
 </g>
-<!-- 134047464790800 -->
+<!-- 134481218754864 -->
 <g id="node11" class="node">
-<title>134047464790800</title>
+<title>134481218754864</title>
 <polygon fill="aliceblue" stroke="skyblue" points="320,-38 258,-38 258,0 320,0 320,-38"/>
 <text text-anchor="middle" x="289" y="-22.8" font-family="Sans-Serif" font-size="14.00">t2</text>
 <text text-anchor="middle" x="289" y="-7.8" font-family="Sans-Serif" font-size="14.00">(1, 1, 6)</text>
 </g>
-<!-- 134047464790464 -->
+<!-- 134481218754528 -->
 <g id="node12" class="node">
-<title>134047464790464</title>
+<title>134481218754528</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="283,-184 211,-184 211,-146 283,-146 283,-184"/>
 <text text-anchor="middle" x="247" y="-168.8" font-family="Sans-Serif" font-size="14.00">vector_3</text>
 <text text-anchor="middle" x="247" y="-153.8" font-family="Sans-Serif" font-size="14.00">[0, 1]</text>
 </g>
-<!-- 134047464790464&#45;&gt;134047464791328 -->
+<!-- 134481218754528&#45;&gt;134481218755392 -->
 <g id="edge11" class="edge">
-<title>134047464790464&#45;&gt;134047464791328</title>
+<title>134481218754528&#45;&gt;134481218755392</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M257.82,-145.72C263.01,-136.94 269.29,-126.31 274.86,-116.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="277.28,-118.31 278.94,-110 272.46,-115.46 277.28,-118.31"/>
 </g>
-<!-- 134047464791328&#45;&gt;134047464790800 -->
+<!-- 134481218755392&#45;&gt;134481218754864 -->
 <g id="edge9" class="edge">
-<title>134047464791328&#45;&gt;134047464790800</title>
+<title>134481218755392&#45;&gt;134481218754864</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M289,-73.81C289,-65.52 289,-55.39 289,-46.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="291.8,-46.02 289,-38.02 286.2,-46.02 291.8,-46.02"/>
 </g>
diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot b/front/py/examples/2_ir/5_reduce_sumprod.dot
index 5dfc772d..ecaf7883 100644
--- a/front/py/examples/2_ir/5_reduce_sumprod.dot
+++ b/front/py/examples/2_ir/5_reduce_sumprod.dot
@@ -2,37 +2,37 @@
 digraph {
 	rankdir=TB
 	node [shape=record]
-	124690717936416 [label="t
+	137168675850560 [label="t
 (3, 4, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124689026191840 [label="s
+	137168368280016 [label="s
 (4,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742603248 [label="vector_1
+	137168309032976 [label="vector_1
 [0, 2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742602864 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	124688742603104 [label="p
+	137168309032592 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	137168309032832 [label="p
 (3, 5)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742603392 [label="vector_2
+	137168309033120 [label="vector_2
 [1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742603440 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	124690717935984 [label="t1
+	137168309033168 [label=prod color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	137168675850128 [label="t1
 (4, 5, 6)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742603872 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	124688742603632 [label="var_1
+	137168309033600 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	137168309033360 [label="var_1
 1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742603584 [label="t2
+	137168309033312 [label="t2
 (6,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742604112 [label="vector_3
+	137168309033840 [label="vector_3
 [0, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
-	124688742603920 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
-	124688742602864 -> 124689026191840 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124690717936416 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124688742603248 -> 124688742602864 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124688742603440 -> 124688742603104 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124690717936416 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124688742603392 -> 124688742603440 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124688742603872 -> 124690717935984 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124688742603632 -> 124688742603872 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124688742603920 -> 124688742603584 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124690717935984 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2]
-	124688742604112 -> 124688742603920 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309033648 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
+	137168309032592 -> 137168368280016 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168675850560 -> 137168309032592 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309032976 -> 137168309032592 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309033168 -> 137168309032832 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168675850560 -> 137168309033168 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309033120 -> 137168309033168 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309033600 -> 137168675850128 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309033360 -> 137168309033600 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309033648 -> 137168309033312 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168675850128 -> 137168309033648 [arrowsize=0.8 color=gray40 penwidth=1.2]
+	137168309033840 -> 137168309033648 [arrowsize=0.8 color=gray40 penwidth=1.2]
 }
diff --git a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg
index dc32bc57..e98414d2 100644
--- a/front/py/examples/2_ir/5_reduce_sumprod.dot.svg
+++ b/front/py/examples/2_ir/5_reduce_sumprod.dot.svg
@@ -9,156 +9,156 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 334)">
 <title>%3</title>
 <polygon fill="white" stroke="transparent" points="-4,4 -4,-334 371.5,-334 371.5,4 -4,4"/>
-<!-- 124690717936416 -->
+<!-- 137168675850560 -->
 <g id="node1" class="node">
-<title>124690717936416</title>
+<title>137168675850560</title>
 <polygon fill="aliceblue" stroke="skyblue" points="152,-330 90,-330 90,-292 152,-292 152,-330"/>
 <text text-anchor="middle" x="121" y="-314.8" font-family="Sans-Serif" font-size="14.00">t</text>
 <text text-anchor="middle" x="121" y="-299.8" font-family="Sans-Serif" font-size="14.00">(3, 4, 5)</text>
 </g>
-<!-- 124688742602864 -->
+<!-- 137168309032592 -->
 <g id="node4" class="node">
-<title>124688742602864</title>
+<title>137168309032592</title>
 <polygon fill="lightgray" stroke="darkslategray" points="109,-256 55,-256 55,-220 109,-220 109,-256"/>
 <text text-anchor="middle" x="82" y="-234.3" font-family="Courier Bold" font-size="14.00">sum</text>
 </g>
-<!-- 124690717936416&#45;&gt;124688742602864 -->
+<!-- 137168675850560&#45;&gt;137168309032592 -->
 <g id="edge2" class="edge">
-<title>124690717936416&#45;&gt;124688742602864</title>
+<title>137168675850560&#45;&gt;137168309032592</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M110.96,-291.72C106.18,-283.03 100.41,-272.52 95.28,-263.19"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="97.65,-261.67 91.34,-256 92.74,-264.36 97.65,-261.67"/>
 </g>
-<!-- 124688742603440 -->
+<!-- 137168309033168 -->
 <g id="node7" class="node">
-<title>124688742603440</title>
+<title>137168309033168</title>
 <polygon fill="lightgray" stroke="darkslategray" points="187,-256 133,-256 133,-220 187,-220 187,-256"/>
 <text text-anchor="middle" x="160" y="-234.3" font-family="Courier Bold" font-size="14.00">prod</text>
 </g>
-<!-- 124690717936416&#45;&gt;124688742603440 -->
+<!-- 137168675850560&#45;&gt;137168309033168 -->
 <g id="edge5" class="edge">
-<title>124690717936416&#45;&gt;124688742603440</title>
+<title>137168675850560&#45;&gt;137168309033168</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M131.04,-291.72C135.82,-283.03 141.59,-272.52 146.72,-263.19"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="149.26,-264.36 150.66,-256 144.35,-261.67 149.26,-264.36"/>
 </g>
-<!-- 124689026191840 -->
+<!-- 137168368280016 -->
 <g id="node2" class="node">
-<title>124689026191840</title>
+<title>137168368280016</title>
 <polygon fill="aliceblue" stroke="skyblue" points="109,-184 55,-184 55,-146 109,-146 109,-184"/>
 <text text-anchor="middle" x="82" y="-168.8" font-family="Sans-Serif" font-size="14.00">s</text>
 <text text-anchor="middle" x="82" y="-153.8" font-family="Sans-Serif" font-size="14.00">(4,)</text>
 </g>
-<!-- 124688742603248 -->
+<!-- 137168309032976 -->
 <g id="node3" class="node">
-<title>124688742603248</title>
+<title>137168309032976</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="72,-330 0,-330 0,-292 72,-292 72,-330"/>
 <text text-anchor="middle" x="36" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_1</text>
 <text text-anchor="middle" x="36" y="-299.8" font-family="Sans-Serif" font-size="14.00">[0, 2]</text>
 </g>
-<!-- 124688742603248&#45;&gt;124688742602864 -->
+<!-- 137168309032976&#45;&gt;137168309032592 -->
 <g id="edge3" class="edge">
-<title>124688742603248&#45;&gt;124688742602864</title>
+<title>137168309032976&#45;&gt;137168309032592</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M47.85,-291.72C53.53,-282.94 60.42,-272.31 66.51,-262.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="68.98,-264.24 70.98,-256 64.28,-261.19 68.98,-264.24"/>
 </g>
-<!-- 124688742602864&#45;&gt;124689026191840 -->
+<!-- 137168309032592&#45;&gt;137168368280016 -->
 <g id="edge1" class="edge">
-<title>124688742602864&#45;&gt;124689026191840</title>
+<title>137168309032592&#45;&gt;137168368280016</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M82,-219.81C82,-211.52 82,-201.39 82,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="84.8,-192.02 82,-184.02 79.2,-192.02 84.8,-192.02"/>
 </g>
-<!-- 124688742603104 -->
+<!-- 137168309032832 -->
 <g id="node5" class="node">
-<title>124688742603104</title>
+<title>137168309032832</title>
 <polygon fill="aliceblue" stroke="skyblue" points="187,-184 133,-184 133,-146 187,-146 187,-184"/>
 <text text-anchor="middle" x="160" y="-168.8" font-family="Sans-Serif" font-size="14.00">p</text>
 <text text-anchor="middle" x="160" y="-153.8" font-family="Sans-Serif" font-size="14.00">(3, 5)</text>
 </g>
-<!-- 124688742603392 -->
+<!-- 137168309033120 -->
 <g id="node6" class="node">
-<title>124688742603392</title>
+<title>137168309033120</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="242,-330 170,-330 170,-292 242,-292 242,-330"/>
 <text text-anchor="middle" x="206" y="-314.8" font-family="Sans-Serif" font-size="14.00">vector_2</text>
 <text text-anchor="middle" x="206" y="-299.8" font-family="Sans-Serif" font-size="14.00">[1]</text>
 </g>
-<!-- 124688742603392&#45;&gt;124688742603440 -->
+<!-- 137168309033120&#45;&gt;137168309033168 -->
 <g id="edge6" class="edge">
-<title>124688742603392&#45;&gt;124688742603440</title>
+<title>137168309033120&#45;&gt;137168309033168</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M194.15,-291.72C188.47,-282.94 181.58,-272.31 175.49,-262.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="177.72,-261.19 171.02,-256 173.02,-264.24 177.72,-261.19"/>
 </g>
-<!-- 124688742603440&#45;&gt;124688742603104 -->
+<!-- 137168309033168&#45;&gt;137168309032832 -->
 <g id="edge4" class="edge">
-<title>124688742603440&#45;&gt;124688742603104</title>
+<title>137168309033168&#45;&gt;137168309032832</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M160,-219.81C160,-211.52 160,-201.39 160,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="162.8,-192.02 160,-184.02 157.2,-192.02 162.8,-192.02"/>
 </g>
-<!-- 124690717935984 -->
+<!-- 137168675850128 -->
 <g id="node8" class="node">
-<title>124690717935984</title>
+<title>137168675850128</title>
 <polygon fill="aliceblue" stroke="skyblue" points="357,-184 295,-184 295,-146 357,-146 357,-184"/>
 <text text-anchor="middle" x="326" y="-168.8" font-family="Sans-Serif" font-size="14.00">t1</text>
 <text text-anchor="middle" x="326" y="-153.8" font-family="Sans-Serif" font-size="14.00">(4, 5, 6)</text>
 </g>
-<!-- 124688742603920 -->
+<!-- 137168309033648 -->
 <g id="node13" class="node">
-<title>124688742603920</title>
+<title>137168309033648</title>
 <polygon fill="lightgray" stroke="darkslategray" points="310,-110 256,-110 256,-74 310,-74 310,-110"/>
 <text text-anchor="middle" x="283" y="-88.3" font-family="Courier Bold" font-size="14.00">sum</text>
 </g>
-<!-- 124690717935984&#45;&gt;124688742603920 -->
+<!-- 137168675850128&#45;&gt;137168309033648 -->
 <g id="edge10" class="edge">
-<title>124690717935984&#45;&gt;124688742603920</title>
+<title>137168675850128&#45;&gt;137168309033648</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M314.93,-145.72C309.61,-136.94 303.18,-126.31 297.48,-116.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="299.84,-115.4 293.3,-110 295.05,-118.3 299.84,-115.4"/>
 </g>
-<!-- 124688742603872 -->
+<!-- 137168309033600 -->
 <g id="node9" class="node">
-<title>124688742603872</title>
+<title>137168309033600</title>
 <polygon fill="lightgray" stroke="darkslategray" points="367.5,-256 284.5,-256 284.5,-220 367.5,-220 367.5,-256"/>
 <text text-anchor="middle" x="326" y="-234.3" font-family="Courier Bold" font-size="14.00">constant</text>
 </g>
-<!-- 124688742603872&#45;&gt;124690717935984 -->
+<!-- 137168309033600&#45;&gt;137168675850128 -->
 <g id="edge7" class="edge">
-<title>124688742603872&#45;&gt;124690717935984</title>
+<title>137168309033600&#45;&gt;137168675850128</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M326,-219.81C326,-211.52 326,-201.39 326,-192.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="328.8,-192.02 326,-184.02 323.2,-192.02 328.8,-192.02"/>
 </g>
-<!-- 124688742603632 -->
+<!-- 137168309033360 -->
 <g id="node10" class="node">
-<title>124688742603632</title>
+<title>137168309033360</title>
 <polygon fill="moccasin" stroke="orange" points="353,-330 299,-330 299,-292 353,-292 353,-330"/>
 <text text-anchor="middle" x="326" y="-314.8" font-family="Sans-Serif" font-size="14.00">var_1</text>
 <text text-anchor="middle" x="326" y="-299.8" font-family="Sans-Serif" font-size="14.00">1</text>
 </g>
-<!-- 124688742603632&#45;&gt;124688742603872 -->
+<!-- 137168309033360&#45;&gt;137168309033600 -->
 <g id="edge8" class="edge">
-<title>124688742603632&#45;&gt;124688742603872</title>
+<title>137168309033360&#45;&gt;137168309033600</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M326,-291.72C326,-283.29 326,-273.15 326,-264.02"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="328.8,-264 326,-256 323.2,-264 328.8,-264"/>
 </g>
-<!-- 124688742603584 -->
+<!-- 137168309033312 -->
 <g id="node11" class="node">
-<title>124688742603584</title>
+<title>137168309033312</title>
 <polygon fill="aliceblue" stroke="skyblue" points="310,-38 256,-38 256,0 310,0 310,-38"/>
 <text text-anchor="middle" x="283" y="-22.8" font-family="Sans-Serif" font-size="14.00">t2</text>
 <text text-anchor="middle" x="283" y="-7.8" font-family="Sans-Serif" font-size="14.00">(6,)</text>
 </g>
-<!-- 124688742604112 -->
+<!-- 137168309033840 -->
 <g id="node12" class="node">
-<title>124688742604112</title>
+<title>137168309033840</title>
 <polygon fill="honeydew" stroke="darkseagreen" points="277,-184 205,-184 205,-146 277,-146 277,-184"/>
 <text text-anchor="middle" x="241" y="-168.8" font-family="Sans-Serif" font-size="14.00">vector_3</text>
 <text text-anchor="middle" x="241" y="-153.8" font-family="Sans-Serif" font-size="14.00">[0, 1]</text>
 </g>
-<!-- 124688742604112&#45;&gt;124688742603920 -->
+<!-- 137168309033840&#45;&gt;137168309033648 -->
 <g id="edge11" class="edge">
-<title>124688742604112&#45;&gt;124688742603920</title>
+<title>137168309033840&#45;&gt;137168309033648</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M251.82,-145.72C257.01,-136.94 263.29,-126.31 268.86,-116.91"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="271.28,-118.31 272.94,-110 266.46,-115.46 271.28,-118.31"/>
 </g>
-<!-- 124688742603920&#45;&gt;124688742603584 -->
+<!-- 137168309033648&#45;&gt;137168309033312 -->
 <g id="edge9" class="edge">
-<title>124688742603920&#45;&gt;124688742603584</title>
+<title>137168309033648&#45;&gt;137168309033312</title>
 <path fill="none" stroke="#666666" stroke-width="1.2" d="M283,-73.81C283,-65.52 283,-55.39 283,-46.16"/>
 <polygon fill="#666666" stroke="#666666" stroke-width="1.2" points="285.8,-46.02 283,-38.02 280.2,-46.02 285.8,-46.02"/>
 </g>