From 56886908ea2ecbce8e087eab81ecfd69801d01b8 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 00:15:14 +0800
Subject: [PATCH 1/6] module: simplify linear (fix)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
front/py/deepx/nn/functional/authormap.py | 9 +++++-
.../nn/functional/leaffunc_elementwise.py | 4 +--
front/py/deepx/nn/functional/leaffunc_init.py | 30 +++++++++++--------
front/py/deepx/nn/functional/leaffunc_io.py | 5 ++--
front/py/deepx/nn/functional/leaffunc_life.py | 5 +++-
.../py/deepx/nn/functional/leaffunc_matmul.py | 5 ++--
front/py/deepx/nn/modules/linear.py | 17 ++++++-----
front/py/deepx/nn/modules/module.py | 15 ++++------
front/py/deepx/tensor/tensor.py | 21 +++++--------
front/py/examples/2_ir/3_matmul.py | 2 +-
front/py/examples/3_functional/1_mean.py | 4 +--
front/py/examples/3_functional/1_relu.py | 4 +--
front/py/examples/3_functional/1_rsqrt.py | 4 +--
front/py/examples/3_functional/1_sigmoid.py | 4 +--
front/py/examples/3_functional/1_swish.py | 4 +--
front/py/examples/3_module/1_linear.py | 16 ++++------
16 files changed, 75 insertions(+), 74 deletions(-)
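Note: a minimal sketch of the default-seed behaviour this patch gives uniform_
when no seed is passed (the helper name _default_seed below is illustrative and
not part of the deepx API):

    import os
    import time

    def _default_seed() -> int:
        # 32-bit seed derived from wall-clock milliseconds plus the pid,
        # mirroring the lines added to leaffunc_init.py below
        seed = int(time.time() * 1000) & 0xffffffff
        return (seed + os.getpid()) & 0xffffffff

With this, calls such as uniform(10, 10, low=-1, high=1) need neither a seed nor
an author argument; the author is looked up in defaultauthor['uniform'].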
diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py
index 5b42d56f..644505a0 100644
--- a/front/py/deepx/nn/functional/authormap.py
+++ b/front/py/deepx/nn/functional/authormap.py
@@ -1,4 +1,10 @@
defaultauthor=dict({
+ #io
+ 'print':'miaobyte',
+ #init
+ 'uniform':'miaobyte',
+ 'constant':'miaobyte',
+ 'arange':'miaobyte',
#elementwise
'add':'miaobyte',
'addscalar':'miaobyte',
@@ -27,7 +33,8 @@
'broadcastTo':'miaobyte',
'concat':'miaobyte',
#matmul
- 'matmul':'miaobyte',
+ # 'matmul':'miaobyte',
+ 'matmul':'cublas',
#reduce
'sum':'miaobyte',
'prod':'miaobyte',
diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py
index 3cfe5157..93e64b36 100644
--- a/front/py/deepx/nn/functional/leaffunc_elementwise.py
+++ b/front/py/deepx/nn/functional/leaffunc_elementwise.py
@@ -14,9 +14,7 @@
def div(
a: Optional[Union[Tensor, float, int]] = None,
b: Optional[Union[Tensor, float, int]] = None,
- out:Union[Tensor,str]=None,
- requires_grad:bool=False,
- author='miaobyte')->Tensor:
+ out:Union[Tensor,str]=None)->Tensor:
if isinstance(b,Tensor) and isinstance(a,Tensor):
#C=A/B
outtensor=out
diff --git a/front/py/deepx/nn/functional/leaffunc_init.py b/front/py/deepx/nn/functional/leaffunc_init.py
index e0b0da90..a5ac4dde 100644
--- a/front/py/deepx/nn/functional/leaffunc_init.py
+++ b/front/py/deepx/nn/functional/leaffunc_init.py
@@ -1,18 +1,18 @@
from typing import Union
import math
+import time
+import os
from .leaffunc_life import newtensor,parse_shape
from .rtf_init import *
from deepx import Tensor
-
+from .authormap import defaultauthor
# Naming convention
# in-place functions end with a trailing underscore (_) and return nothing
# non-in-place functions have no underscore suffix and return a Tensor
-def constant_(t:Tensor,
- value: Union[float,int],
- author='miaobyte')->Tensor:
- rtf_constant(t,value,author)
+def constant_(t:Tensor,value: Union[float,int])->Tensor:
+ rtf_constant(t,value,defaultauthor['constant'])
def constant(*shape, value:Union[float,int], dtype:str='float32',name:str)->Tensor:
@@ -33,22 +33,26 @@ def ones(*shape, dtype:str='float32',name:str=None)->Tensor:
s = parse_shape(shape)
return constant(s, value=1, dtype=dtype,name=name)
-def arange_(t:Tensor,start=0,step=1,author='miaobyte')->Tensor:
+def arange_(t:Tensor,start=0,step=1)->Tensor:
from .rtf_init import rtf_arange
- rtf_arange(t,start,step,author)
-def arange(*shape,start=0,step=1,dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
+ rtf_arange(t,start,step,defaultauthor['arange'])
+def arange(*shape,start=0,step=1,dtype:str='float32',name:str=None)->Tensor:
s = parse_shape(shape)
outtensor=newtensor(s,dtype=dtype,name=name)
- arange_(outtensor,start,step,author)
+ arange_(outtensor,start,step)
return outtensor
-def uniform_(t:Tensor,low=0, high=1,seed:int=0,author='miaobyte')->Tensor:
+def uniform_(t:Tensor,low=0, high=1,seed:int=None)->Tensor:
+ if seed is None:
+ seed = int(time.time() * 1000) & 0xffffffff
+ seed = (seed + os.getpid()) & 0xffffffff
from .rtf_init import rtf_uniform
- rtf_uniform(t,low,high,seed,author)
-def uniform(*shape,low=0, high=1,seed:int=0,dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
+ rtf_uniform(t,low,high,seed,defaultauthor['uniform'])
+
+def uniform(*shape,low=0, high=1,seed:int=None,dtype:str='float32',name:str=None)->Tensor:
s = parse_shape(shape)
outtensor=newtensor(s,dtype=dtype,name=name)
- uniform_(outtensor,low,high,seed,author)
+ uniform_(outtensor,low,high,seed)
return outtensor
# def rand(*size, dtype=None, device=None):
diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py
index 98d221da..b4490803 100644
--- a/front/py/deepx/nn/functional/leaffunc_io.py
+++ b/front/py/deepx/nn/functional/leaffunc_io.py
@@ -1,7 +1,8 @@
from deepx.tensor import Tensor
+from .authormap import defaultauthor
-def printtensor(t:Tensor,format='',author='miaobyte'):
+def printtensor(t:Tensor,format=''):
from .rtf_io import rtf_printtensor
- rtf_printtensor(t,format,author)
+ rtf_printtensor(t,format,defaultauthor['print'])
return ''
diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py
index cf4d0905..abf6a530 100644
--- a/front/py/deepx/nn/functional/leaffunc_life.py
+++ b/front/py/deepx/nn/functional/leaffunc_life.py
@@ -12,7 +12,10 @@ def newtensor(*shape,dtype:str='float32',name:str=None):
from .rtf_life import rtf_newtensor
rtf_newtensor(t)
return t
-
+def rnewtensor(t:Tensor):
+ from .rtf_life import rtf_newtensor
+ rtf_newtensor(t)
+ return t
def copytensor(t:Tensor,out:Tensor):
from .rtf_life import rtf_copytensor
rtf_copytensor(t,out)
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py b/front/py/deepx/nn/functional/leaffunc_matmul.py
index 11b793a4..bb69b838 100644
--- a/front/py/deepx/nn/functional/leaffunc_matmul.py
+++ b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -1,13 +1,14 @@
from typing import Union
-from deepx import Tensor
+from deepx import Tensor,Shape
from .leaffunc_life import newtensor
from .authormap import defaultauthor
def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='')->Tensor:
outtensor=out
if isinstance(out,str):
- outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
+ outshape=Shape.matmul(a.shape,b.shape)
+ outtensor=newtensor(outshape,dtype=a.dtype,name=out)
from .rtf_matmul import rtf_matmul
rtf_matmul(a,b,outtensor,defaultauthor['matmul'])
return outtensor
diff --git a/front/py/deepx/nn/modules/linear.py b/front/py/deepx/nn/modules/linear.py
index c1ef3238..f1eb86e3 100644
--- a/front/py/deepx/nn/modules/linear.py
+++ b/front/py/deepx/nn/modules/linear.py
@@ -1,6 +1,6 @@
from .module import Module
from deepx import Tensor
-from deepx.nn.functional import uniform,kaiming_uniform_,calculate_fan_in_and_fan_out
+from deepx.nn.functional import uniform_,kaiming_uniform_,calculate_fan_in_and_fan_out
import math
class Linear(Module):
@@ -35,14 +35,17 @@ def reset_parameters(self) -> None:
if self.bias is not None:
fan_in, _ = calculate_fan_in_and_fan_out(self.weight)
bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
- uniform(self.bias, -bound, bound)
+ uniform_(self.bias, -bound, bound)
def forward(self, input: Tensor) -> Tensor:
- #`y = xA^T + b`
- if self.bias is None:
- return input @ self.weight.T
- else:
- return input @ self.weight.T + self.bias
+ #`y = xA^T + b`
+ y=input @ self.weight.T
+ oldshape=y.shape
+ if self.bias is not None:
+ y.reshape_(y.shape[1])
+ y=y+self.bias
+ y.reshape_(*oldshape)
+ return y
def extra_repr(self) -> str:
return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py
index bda175d4..9f52fdf3 100644
--- a/front/py/deepx/nn/modules/module.py
+++ b/front/py/deepx/nn/modules/module.py
@@ -1,13 +1,10 @@
import re
-from typing import (Dict, Iterator, Optional, Tuple, Union,
- Any, List, overload)
+from typing import Dict, Iterator, Optional, Tuple, Any
from collections import OrderedDict
from deepx import Tensor
class Module:
def __init__(self, name: Optional[str] = None):
- from deepx.autograd import Graph
- self._graph=Graph.get_default()
self._name = name or self._generate_default_name()
self._parent: Optional[Module] = None
self._modules: OrderedDict[str, Module] = OrderedDict()
@@ -21,11 +18,7 @@ def _generate_default_name(self) -> str:
count = self.__class__._instance_counter
self.__class__._instance_counter += 1
return f"{base_name}_{count}"
-
- @property
- def graph(self):
- return self._graph
-
+
@property
def full_name(self):
if self._parent is None:
@@ -55,7 +48,9 @@ def register_parameter(self, name: str, param: Optional[Tensor]) -> None:
self._parameters.pop(name, None)
else:
self._parameters[name] = param
- param.addtograph(self.full_name + '.' + name)
+ param.name=self.full_name + '.' + name
+ from deepx.nn.functional.leaffunc_life import rnewtensor
+ rnewtensor(param)
def parameters(self, recurse: bool = True) -> Iterator[Tensor]:
for name, param in self.named_parameters(recurse=recurse):
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index ee60948d..1e2fe9fe 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -27,9 +27,7 @@ def __init__(self,shape:Union[tuple[int],list[int],Shape],dtype:str='float32',na
self._shape = shape
else:
raise ValueError("Invalid shape")
-
- self._graph = None
- self._node = None
+
def copy_to(self,t:'Tensor'):
from deepx.nn.functional import copytensor
copytensor(self,t)
@@ -44,7 +42,10 @@ def clone(self,name:str=None):
@property
def name(self):
return self._name
-
+ @name.setter
+ def name(self,name:str):
+ self._name=name
+
# shape
@property
def shape(self,dim:int=None):
@@ -87,15 +88,7 @@ def numel(self)->int:
@property
def dtype(self):
return self._dtype
-
-
- @property
- def graph(self):
- return self._graph
-
- @property
- def node(self):
- return self._node
+
#elementwise
def __add__(self, other):
@@ -120,7 +113,7 @@ def __matmul__(self, other):
#shape操作
@property
def T(self) -> str:
- return self.transpose(1,0,out=self.node.name+".T")
+ return self.transpose()
# 打印
def autoformat(self):
diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
index 5cc0cffd..144cbdf7 100644
--- a/front/py/examples/2_ir/3_matmul.py
+++ b/front/py/examples/2_ir/3_matmul.py
@@ -16,7 +16,7 @@
t1 = ones([3,4],dtype='float32',name="t1")
t2 = ones([4,5],dtype='float32',name="t2")
t3 = t1 @ t2
-print(t3)
+t3.print()
diff --git a/front/py/examples/3_functional/1_mean.py b/front/py/examples/3_functional/1_mean.py
index 12f4c0f5..64511555 100644
--- a/front/py/examples/3_functional/1_mean.py
+++ b/front/py/examples/3_functional/1_mean.py
@@ -14,7 +14,7 @@
t3=arange(4,5,6,name="t3")
-print(t3)
+t3.print()
t3_mean=mean(t3,dim=(0,1))
-print(t3_mean)
+t3_mean.print()
diff --git a/front/py/examples/3_functional/1_relu.py b/front/py/examples/3_functional/1_relu.py
index 22b1e8cc..9cd1737e 100644
--- a/front/py/examples/3_functional/1_relu.py
+++ b/front/py/examples/3_functional/1_relu.py
@@ -21,7 +21,7 @@
# when tensor.name is a str, it is an intermediate variable, so the operation is performed in place
t2=uniform(10,10,low=-1,high=1)
-print(t2)
+t2.print()
relu_t2=relu(t2)
-print(relu_t2)
+relu_t2.print()
diff --git a/front/py/examples/3_functional/1_rsqrt.py b/front/py/examples/3_functional/1_rsqrt.py
index c0706691..aa4926a6 100644
--- a/front/py/examples/3_functional/1_rsqrt.py
+++ b/front/py/examples/3_functional/1_rsqrt.py
@@ -13,6 +13,6 @@
from deepx.nn.functional import rsqrt
t=arange(2,3,4,name='t')
-print((t))
+t.print()
rsqrt_t=rsqrt(t)
-print(rsqrt_t)
+rsqrt_t.print()
diff --git a/front/py/examples/3_functional/1_sigmoid.py b/front/py/examples/3_functional/1_sigmoid.py
index 1eace7bf..dbdfd614 100644
--- a/front/py/examples/3_functional/1_sigmoid.py
+++ b/front/py/examples/3_functional/1_sigmoid.py
@@ -20,8 +20,8 @@
x.sub_(3.0)
print("\nDEEPX tensor:")
-print(x)
+x.print()
out=sigmoid(x)
print("\nDEEPX sigmoid result:")
-print(out)
+out.print()
diff --git a/front/py/examples/3_functional/1_swish.py b/front/py/examples/3_functional/1_swish.py
index d2ce1082..f4e8c7c3 100644
--- a/front/py/examples/3_functional/1_swish.py
+++ b/front/py/examples/3_functional/1_swish.py
@@ -20,8 +20,8 @@
x.sub_(3.0)
print("\nDEEPX tensor:")
-print(x)
+x.print()
out=swish(x)
print("\nDEEPX swish result:")
-print(out)
+out.print()
diff --git a/front/py/examples/3_module/1_linear.py b/front/py/examples/3_module/1_linear.py
index 06eb7cfd..7ad43a91 100644
--- a/front/py/examples/3_module/1_linear.py
+++ b/front/py/examples/3_module/1_linear.py
@@ -3,22 +3,18 @@
import torch.nn as nn
net = nn.Linear(64, 4)
-input = torch.ones(1, 64)
-output = net(input)
+torch_input = torch.ones(1, 64)
+torch_output = net(torch_input)
print()
-print(output)
+print(torch_output)
############-------DEEPX-------################
-from deepx.nn.modules import Linear, Module
-from deepx import Tensor,ones
+from deepx.nn.modules import Linear
+from deepx import ones
net = Linear(64, 4)
input=ones(1,64,name='input')
out=net.forward(input)
-print(out)
+out.print()
-import os
-script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名
-str=out.graph.to_dot()
-str.render(script_name+".dot", format='svg')
From 78f9defca1c558e758e7de7c575f336351436723 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 04:45:15 +0800
Subject: [PATCH 2/6] llama: RMSNorm ok, verify rope
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
front/py/deepx/__init__.py | 5 +-
front/py/deepx/nn/functional/__init__.py | 2 +-
front/py/deepx/nn/functional/elementwise.py | 5 +-
front/py/deepx/nn/functional/leaffunc.py | 10 +-
.../nn/functional/leaffunc_elementwise.py | 62 +-
front/py/deepx/nn/functional/leaffunc_init.py | 7 +-
.../py/deepx/nn/functional/rtf_elementwise.py | 10 +-
front/py/deepx/nn/modules/container.py | 0
front/py/deepx/nn/modules/module.py | 3 +-
front/py/deepx/tensor/__init__.py | 4 +-
front/py/deepx/tensor/elementwise.py | 9 +-
front/py/deepx/tensor/reduce.py | 12 +-
front/py/deepx/tensor/tensor.py | 22 +-
.../4_transformer/llama/1_llamarmsnorm.dot | 128 ----
.../llama/1_llamarmsnorm.dot.svg | 606 ------------------
.../4_transformer/llama/1_llamarmsnorm.py | 26 +-
.../llama/1_llamarmsnorm_torch.py | 4 +-
17 files changed, 97 insertions(+), 818 deletions(-)
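Usage sketch for the reverse scalar ops this patch introduces (assumes the
Tensor methods rdiv/rpow are wired up as in this series; the names come from
the functions added below, the example itself is illustrative):

    from deepx import ones
    from deepx.nn.functional import div, rsqrt

    t = ones(2, 3, name="t")
    a = div(1, t)   # scalar / Tensor -> rdiv -> rtf_rdivscalar
    b = 2 ** t      # Tensor.__rpow__ -> rpow -> rtf_rpowscalar
    c = rsqrt(t)    # 1 / sqrt(t), composed from sqrt and the scalar division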
delete mode 100644 front/py/deepx/nn/modules/container.py
delete mode 100644 front/py/examples/4_transformer/llama/1_llamarmsnorm.dot
delete mode 100644 front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg
diff --git a/front/py/deepx/__init__.py b/front/py/deepx/__init__.py
index 355a250b..37f47669 100644
--- a/front/py/deepx/__init__.py
+++ b/front/py/deepx/__init__.py
@@ -1,11 +1,10 @@
-from .tensor import Tensor,Shape
+from .tensor import Tensor,Shape,Number
from deepx.nn.functional import * # import all functional functions
from deepx.nn.functional import __all__ as _func_all # get functional's export list
__all__ = [
#tensor
- 'Tensor',
- 'Shape',
+ 'Tensor','Shape','Number',
*_func_all
]
diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
index 9cacf7d4..1e215d7e 100644
--- a/front/py/deepx/nn/functional/__init__.py
+++ b/front/py/deepx/nn/functional/__init__.py
@@ -21,7 +21,7 @@
"printtensor",
"constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out",
"add","sub","mul","div","sqrt","pow","exp","log",
- "leaffunc_matmul",
+ "matmul",
"reducemax","reducemin","sum","prod",
"reshape","permute","transpose","concat","broadcastTo",
diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py
index 7adcb28f..28e5b199 100644
--- a/front/py/deepx/nn/functional/elementwise.py
+++ b/front/py/deepx/nn/functional/elementwise.py
@@ -1,4 +1,5 @@
-from deepx.tensor import Tensor
+from typing import Union
+from deepx.tensor import Tensor,Number
from deepx.nn.functional import newtensor
def rsqrt(input:Tensor)->Tensor:
@@ -8,5 +9,5 @@ def rsqrt(input:Tensor)->Tensor:
outtensor=newtensor(input.shape, dtype=input.dtype)
sqrt(input,out= outtensor)
return div(1,outtensor,outtensor)
-
+
diff --git a/front/py/deepx/nn/functional/leaffunc.py b/front/py/deepx/nn/functional/leaffunc.py
index 62fbb767..58d21105 100644
--- a/front/py/deepx/nn/functional/leaffunc.py
+++ b/front/py/deepx/nn/functional/leaffunc.py
@@ -17,9 +17,6 @@ def op_func(
b: Union[Tensor, float, int] = None,
out: Union[Tensor, str] = None) -> Tensor:
outtensor = out
- if isinstance(out, str):
- outtensor = newtensor(a.shape, dtype=a.dtype, name=out)
-
rtf_module = importlib.import_module('deepx.nn.functional.rtf_elementwise')
if isinstance(b, Tensor):
an=a
@@ -28,9 +25,16 @@ def op_func(
newshape = Shape.broadcast_shape(a.shape, b.shape)
an = a.broadcastTo(newshape)
bn = b.broadcastTo(newshape)
+ if isinstance(out,str):
+ outtensor=newtensor(newshape,dtype=a.dtype,name=out)
+ else:
+ if isinstance(out,str):
+ outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
rtf_func = getattr(rtf_module, f'rtf_{op_name}')
rtf_func(an, bn, outtensor, defaultauthor[op_name])
else:
+ if isinstance(out,str):
+ outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
rtf_func = getattr(rtf_module, f'rtf_{op_name}scalar')
rtf_func(a, b, outtensor, defaultauthor[f'{op_name}scalar'])
return outtensor
diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py
index 93e64b36..74e0918d 100644
--- a/front/py/deepx/nn/functional/leaffunc_elementwise.py
+++ b/front/py/deepx/nn/functional/leaffunc_elementwise.py
@@ -1,5 +1,5 @@
from typing import Optional, Union
-from deepx import Tensor,Shape
+from deepx import Tensor,Shape,Number
from .leaffunc import create_A_B_tf_C,create_A_tf_C
from .leaffunc_life import newtensor
@@ -9,49 +9,43 @@
add = create_A_B_tf_C('add')
sub = create_A_B_tf_C('sub')
mul = create_A_B_tf_C('mul')
+_div=create_A_B_tf_C('div')
-#div
def div(
- a: Optional[Union[Tensor, float, int]] = None,
- b: Optional[Union[Tensor, float, int]] = None,
+ a: Union[Tensor, float, int],
+ b: Union[Tensor, float, int],
out:Union[Tensor,str]=None)->Tensor:
- if isinstance(b,Tensor) and isinstance(a,Tensor):
- #C=A/B
- outtensor=out
- if isinstance(out,str):
- outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
- an=a
- bn=b
- if a.shape!=b.shape:
- newshape=Shape.broadcast_shape(a.shape,b.shape)
- an=a.broadcastTo(newshape)
- bn=b.broadcastTo(newshape)
- from .rtf_elementwise import rtf_div
- rtf_div(an,bn,outtensor,defaultauthor['div'])
- return outtensor
+ if isinstance(a,Tensor):
+ return _div(a,b,out)
+ elif isinstance(a,float) or isinstance(a,int):
+ return rdiv(a,b,out)
else:
- if isinstance(a,Tensor):
- #C=A/b
- outtensor=out
- if isinstance(out,str):
- outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
- from .rtf_elementwise import rtf_divscalar
- rtf_divscalar(a,b,outtensor,defaultauthor['divscalar'])
- return outtensor
- elif isinstance(a,float) or isinstance(a,int):
- #C=a/B
- outtensor=out
- if isinstance(out,str):
- outtensor=newtensor(b.shape,dtype=b.dtype,name=out)
- from .rtf_elementwise import rtf_rdivscalar
- rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar'])
- return outtensor
+ raise ValueError(f"Invalid type for a: {type(a)}")
+
+# rdiv: scalar / Tensor
+def rdiv(
+ a: Union[float, int],
+ b: Tensor,
+ out:Union[Tensor,str]=None)->Tensor:
+ outtensor=out
+ if isinstance(out,str):
+ outtensor=newtensor(b.shape,dtype=b.dtype,name=out)
+ from .rtf_elementwise import rtf_rdivscalar
+ rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar'])
+ return outtensor
max=create_A_B_tf_C('max')
min=create_A_B_tf_C('min')
#pow
pow=create_A_B_tf_C('pow')
+def rpow(a:Number,b:Tensor,out:Union[Tensor,str]=None)->Tensor:
+ outtensor=out
+ if isinstance(out,str):
+ outtensor=newtensor(b.shape,dtype=b.dtype,name=out)
+ from .rtf_elementwise import rtf_rpowscalar
+ rtf_rpowscalar(a,b,outtensor,defaultauthor['rpowscalar'])
+ return outtensor
#sqrt
sqrt=create_A_tf_C('sqrt')
diff --git a/front/py/deepx/nn/functional/leaffunc_init.py b/front/py/deepx/nn/functional/leaffunc_init.py
index a5ac4dde..454dc09d 100644
--- a/front/py/deepx/nn/functional/leaffunc_init.py
+++ b/front/py/deepx/nn/functional/leaffunc_init.py
@@ -4,7 +4,7 @@
import os
from .leaffunc_life import newtensor,parse_shape
from .rtf_init import *
-from deepx import Tensor
+from deepx import Tensor,Number
from .authormap import defaultauthor
# Naming convention
@@ -36,8 +36,9 @@ def ones(*shape, dtype:str='float32',name:str=None)->Tensor:
def arange_(t:Tensor,start=0,step=1)->Tensor:
from .rtf_init import rtf_arange
rtf_arange(t,start,step,defaultauthor['arange'])
-def arange(*shape,start=0,step=1,dtype:str='float32',name:str=None)->Tensor:
- s = parse_shape(shape)
+# PyTorch-style signature: arange(start, end, step)
+def arange(start:Number,end:Number,step:Number=1,dtype:str='float32',name:str=None)->Tensor:
+    s = [math.ceil((end-start)/step)]
outtensor=newtensor(s,dtype=dtype,name=name)
arange_(outtensor,start,step)
return outtensor
diff --git a/front/py/deepx/nn/functional/rtf_elementwise.py b/front/py/deepx/nn/functional/rtf_elementwise.py
index 414c09f6..96614a4e 100644
--- a/front/py/deepx/nn/functional/rtf_elementwise.py
+++ b/front/py/deepx/nn/functional/rtf_elementwise.py
@@ -1,7 +1,6 @@
-from deepx.tensor import Tensor
+from deepx.tensor import Tensor,Number
from deepx.nn.deepxir import DeepxIR,Param
from deepx.scheduler import send
-from typing import Union
from .rtf import A_B_op_C,A_scalar_op_C,A_op_C
def rtf_add(a:Tensor, b:Tensor, out:Tensor, author='miaobyte')->Tensor:
@@ -55,6 +54,13 @@ def rtf_powscalar(a:Tensor, b:float, out:Tensor, author='miaobyte')->Tensor:
A_scalar_op_C("powscalar",a,b,out,author)
return out
+def rtf_rpowscalar(a:Number,b:Tensor,out:Tensor,author='miaobyte')->Tensor:
+ args = [ Param.varnum(a),Param.tensor(b)]
+ returns = [Param.tensor(out)]
+ ir = DeepxIR("rpowscalar", args, returns, author)
+ send(ir)
+ return out
+
def rtf_exp(a:Tensor, out:Tensor, author='miaobyte')->Tensor:
A_op_C("exp",a,out,author)
return out
diff --git a/front/py/deepx/nn/modules/container.py b/front/py/deepx/nn/modules/container.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py
index 9f52fdf3..5c7be9b2 100644
--- a/front/py/deepx/nn/modules/module.py
+++ b/front/py/deepx/nn/modules/module.py
@@ -12,7 +12,8 @@ def __init__(self, name: Optional[str] = None):
def _generate_default_name(self) -> str:
class_name = self.__class__.__name__
- base_name = re.sub(r'(?Tensor:
diff --git a/front/py/deepx/tensor/reduce.py b/front/py/deepx/tensor/reduce.py
index b6d5bc3f..cdba12f8 100644
--- a/front/py/deepx/tensor/reduce.py
+++ b/front/py/deepx/tensor/reduce.py
@@ -4,28 +4,28 @@
from deepx.tensor import Tensor,tensor_method
@tensor_method
-def reducemax(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def reducemax(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import reducemax as reduce_max_func
return reduce_max_func(self,dim,keepdim,out)
@tensor_method
-def reducemin(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def reducemin(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import reducemin as reduce_min_func
return reduce_min_func(self,dim,keepdim,out)
@tensor_method
-def sum(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def sum(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import sum as sum_func
return sum_func(self,dim,keepdim,out)
@tensor_method
-def prod(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def prod(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import prod as prod_func
return prod_func(self,dim,keepdim,out)
@tensor_method
-def mean(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def mean(self,dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import mean as mean_func
- return mean_func(self,dim,keepdim,out)
+ return mean_func(self,dim,keepdim)
\ No newline at end of file
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index 1e2fe9fe..a45888b1 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -1,6 +1,9 @@
-from typing import Optional,Union
+from typing import Optional,Union,TypeAlias
from .shape import Shape
+
+Number: TypeAlias = Union[int, float, bool]
+
tensorid=1
class Tensor:
@@ -91,23 +94,28 @@ def dtype(self):
#elementwise
- def __add__(self, other):
+ def __add__(self, other:Union[Number,'Tensor']):
return self.add(other)
- def __sub__(self, other):
+ def __sub__(self, other:Union[Number,'Tensor']):
return self.sub(other)
- def __mul__(self, other):
+ def __mul__(self, other:Union[Number,'Tensor']):
return self.mul(other)
- def __truediv__(self, other):
+ def __truediv__(self, other:Union[Number,'Tensor']):
return self.div(other)
- def __rtruediv__(self, other):
+ def __rtruediv__(self, other:Union[Number,'Tensor']):
return self.rdiv(other)
+ def __pow__(self, other:Union[Number,'Tensor']):
+ return self.pow(other)
+
+ def __rpow__(self, other:Union[Number,'Tensor']):
+ return self.rpow(other)
#矩阵乘法
- def __matmul__(self, other):
+ def __matmul__(self, other:Union[Number,'Tensor']):
return self.matmul(other)
#shape操作
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot b/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot
deleted file mode 100644
index f2e9db0c..00000000
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot
+++ /dev/null
@@ -1,128 +0,0 @@
-// Computational Graph
-digraph {
- rankdir=TB
- node [shape=record]
- 130357533018672 [label="tensor_1
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533019536 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533018480 [label="vector_1
-(2, 3, 8)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533738896 [label=div_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533739760 [label="var_1
-10.0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533752528 [label=add_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533752336 [label="var_2
--2.0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750272 [label="tensor_2
-(8,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750512 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533750128 [label="var_3
-1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750368 [label="llama_r_m_s_norm_0.weight
-(8,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750416 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533750032 [label="var_4
-0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749840 [label=pow_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533749888 [label="var_5
-2" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749696 [label="tensor_4
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749504 [label="tensor_5
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749264 [label="vector_2
-[2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749168 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748928 [label="tensor_6
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748832 [label=div_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748880 [label="var_6
-8" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748688 [label=add_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748736 [label="var_7
-1e-06" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748544 [label="tensor_7
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748304 [label="tensor_8
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748064 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533747968 [label=rdiv_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748112 [label="var_8
-1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747824 [label="tensor_9
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747584 [label="tensor_10
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747344 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533747392 [label="vector_3
-[2, 3, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747248 [label="tensor_11
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533746960 [label=expand color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533747008 [label="vector_4
-(2, 3, 8)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533746864 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533746720 [label="tensor_12
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357534894224 [label="tensor_13
-(1, 1, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533740336 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533740672 [label="vector_5
-[1, 1, 8]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533740528 [label="tensor_14
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533740768 [label=expand color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533740816 [label="vector_6
-(2, 3, 8)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533741152 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533742736 [label="tensor_15
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533019536 -> 130357533018672 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533738896 -> 130357533018672 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533752528 -> 130357533018672 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533019536 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018480 -> 130357533019536 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533738896 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533739760 -> 130357533738896 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533752528 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533752336 -> 130357533752528 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750512 -> 130357533750272 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750128 -> 130357533750512 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750416 -> 130357533750368 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750032 -> 130357533750416 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533749840 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749888 -> 130357533749840 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749840 -> 130357533749696 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749168 -> 130357533749504 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749696 -> 130357533749168 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749264 -> 130357533749168 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748832 -> 130357533748928 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749504 -> 130357533748832 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748880 -> 130357533748832 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748928 -> 130357533748688 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748736 -> 130357533748688 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748688 -> 130357533748544 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748064 -> 130357533748304 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748544 -> 130357533748064 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748112 -> 130357533747968 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748304 -> 130357533747968 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747968 -> 130357533747824 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747344 -> 130357533747584 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747824 -> 130357533747344 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747392 -> 130357533747344 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533746960 -> 130357533747248 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747584 -> 130357533746960 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747008 -> 130357533746960 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533746864 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747248 -> 130357533746864 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533746864 -> 130357533746720 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740336 -> 130357534894224 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750368 -> 130357533740336 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740672 -> 130357533740336 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740768 -> 130357533740528 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357534894224 -> 130357533740768 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740816 -> 130357533740768 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740528 -> 130357533741152 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533746720 -> 130357533741152 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533741152 -> 130357533742736 [arrowsize=0.8 color=gray40 penwidth=1.2]
-}
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg b/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg
deleted file mode 100644
index 331e5566..00000000
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg
+++ /dev/null
@@ -1,606 +0,0 @@
-
-
-
-
-
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
index 938e593c..8dfacfa1 100644
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
+++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
@@ -5,29 +5,21 @@
############### DeepX implementation ###############
-from deepx import arange, constant
+from deepx import arange, constant_
from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm
# use the same data
-dx_input = arange(0, 48, 1, dtype="float32").reshape_(2, 3, hidden_size)
-dx_input.div_(10.0)
-dx_input.sub_(2.0)
+input = arange(2, 3, hidden_size, dtype="float32")
+input.div_(10.0)
+input.sub_(2.0)
eps = 1e-6
-print("\nDeepX 输入:")
-print(dx_input)
+input.print()
# DeepX computation flow
-dx_norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
+norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
# set the same weight
-constant(dx_norm.weight, 0.5)
+constant_(norm.weight, 0.5)
# forward pass
-dx_output = dx_norm(dx_input)
-
-print("\nDeepX RMSNorm 结果:")
-print(dx_output)
-
-import os
-script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名
-str=dx_output.graph.to_dot()
-str.render(script_name+".dot", format='svg')
\ No newline at end of file
+output = norm(input)
+output.print()
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
index 4099feee..85ef6ced 100644
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
+++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
@@ -1,6 +1,6 @@
############### PyTorch 实现部分 ###############
import torch
-from transformers.models.llama.modeling_llama import LlamaRMSNorm as PTLlamaRMSNorm
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
# use small data so the full result can be printed
hidden_size = 8
@@ -10,7 +10,7 @@
print("PyTorch 输入:")
print(pt_input)
# use the official LlamaRMSNorm implementation from the transformers library
-pt_norm = PTLlamaRMSNorm(hidden_size, eps=eps)
+pt_norm = LlamaRMSNorm(hidden_size, eps=eps)
# set the weight to the fixed value 0.5
with torch.no_grad():
pt_norm.weight.fill_(0.5)
From fabc8954e3fbff319c4ec425064dd3917b758a1e Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 04:46:31 +0800
Subject: [PATCH 3/6] llama: rope todo
---
excuter/op-mem-cuda/src/client/tfs.cpp | 2 +-
.../deepx/transformer/modeling_rope_utils.py | 317 ++++++++++++++++++
.../models/llama/modeling_llama.py | 81 ++++-
3 files changed, 394 insertions(+), 6 deletions(-)
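For reference, the default RoPE inverse frequencies added in
modeling_rope_utils.py follow the usual formula inv_freq[k] = 1 / base**(2k/dim);
a plain-Python sketch (the patch itself computes this with deepx.arange, and the
helper below is only illustrative):

    def default_inv_freq(dim: int, base: float = 10000.0) -> list:
        # one entry per even index 0, 2, ..., dim-2
        return [1.0 / (base ** (i / dim)) for i in range(0, dim, 2)]

    # e.g. dim=8, base=10000.0 -> [1.0, 0.1, 0.01, 0.001]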
create mode 100644 front/py/deepx/transformer/modeling_rope_utils.py
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index f8e79c7b..15d935b0 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -227,7 +227,7 @@ namespace deepx::tf
tffactory.add_tf(std::make_shared>(vector(
{
Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
- Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Float32),
+ Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
}),
vector(
{
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
new file mode 100644
index 00000000..41cb3909
--- /dev/null
+++ b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -0,0 +1,317 @@
+from typing import Tuple
+from deepx import arange
+
+def _compute_default_rope_parameters(
+ base: float = 10000.0,
+ dim: int = 0,
+ head_dim: int = 0,
+ partial_rotary_factor: float = 1.0,
+) -> Tuple:
+    """
+    Compute the inverse frequencies of the original RoPE implementation.
+
+    Args:
+        base: base used for the rotary position encoding, defaults to 10000.0
+        dim: feature dimension, must be even
+        head_dim: per-head feature dimension, must be even
+        partial_rotary_factor: partial rotary factor, defaults to 1.0
+
+    Returns:
+        a tuple of the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin
+    """
+    attention_factor = 1.0  # unused in this type of RoPE
+ if dim == 0:
+        dim = int(head_dim * partial_rotary_factor)
+    # compute the inverse frequencies
+ inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
+ return inv_freq, attention_factor
+
+# def _compute_linear_scaling_rope_parameters(
+# config: Optional[PretrainedConfig] = None,
+# device: Optional["torch.device"] = None,
+# seq_len: Optional[int] = None,
+# **rope_kwargs,
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length. Unused for this type of RoPE.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+# """
+# if config is not None and len(rope_kwargs) > 0:
+# raise ValueError(
+# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+# f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
+# )
+# if len(rope_kwargs) > 0:
+# factor = rope_kwargs["factor"]
+# elif config is not None:
+# factor = config.rope_scaling["factor"]
+
+# # Gets the default RoPE parameters
+# inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
+
+# # Then applies linear scaling to the frequencies.
+# # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
+# # applying scaling to the inverse frequencies is equivalent.
+# inv_freq /= factor
+# return inv_freq, attention_factor
+
+
+# def _compute_dynamic_ntk_parameters(
+# config: Optional[PretrainedConfig] = None,
+# device: Optional["torch.device"] = None,
+# seq_len: Optional[int] = None,
+# **rope_kwargs,
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length, used to update the dynamic RoPE at inference time.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+# """
+# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
+# if config is not None and len(rope_kwargs) > 0:
+# raise ValueError(
+# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+# f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
+# )
+# if len(rope_kwargs) > 0:
+# base = rope_kwargs["base"]
+# dim = rope_kwargs["dim"]
+# max_position_embeddings = rope_kwargs["max_position_embeddings"]
+# factor = rope_kwargs["factor"]
+# elif config is not None:
+# base = config.rope_theta
+# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+# dim = int(head_dim * partial_rotary_factor)
+# max_position_embeddings = config.max_position_embeddings
+# factor = config.rope_scaling["factor"]
+
+# attention_factor = 1.0 # Unused in this type of RoPE
+
+# # seq_len: default to max_position_embeddings, e.g. at init time
+# seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings
+
+# # Compute the inverse frequencies
+# base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
+# inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
+# return inv_freq, attention_factor
+
+
+# def _compute_yarn_parameters(
+# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with NTK scaling. Please refer to the
+# [original paper](https://arxiv.org/abs/2309.00071)
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length. Unused for this type of RoPE.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin.
+# """
+# # No need to keep BC with yarn, unreleased when this new pattern was created.
+# if len(rope_kwargs) > 0:
+# raise ValueError(
+# f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
+# )
+
+# base = config.rope_theta
+# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+# dim = int(head_dim * partial_rotary_factor)
+# max_position_embeddings = config.max_position_embeddings
+# factor = config.rope_scaling["factor"]
+
+# # Sets the attention factor as suggested in the paper
+# attention_factor = config.rope_scaling.get("attention_factor")
+# if attention_factor is None:
+# attention_factor = 0.1 * math.log(factor) + 1.0
+
+# # Optional config options
+# # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+# beta_fast = config.rope_scaling.get("beta_fast") or 32
+# beta_slow = config.rope_scaling.get("beta_slow") or 1
+
+# # Compute the inverse frequencies
+# def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
+# """Inverse dimension formula to find the dimension based on the number of rotations"""
+# return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+# def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+# """Find dimension range bounds based on rotations"""
+# low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
+# high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+# return max(low, 0), min(high, dim - 1)
+
+# def linear_ramp_factor(min, max, dim):
+# if min == max:
+# max += 0.001 # Prevent singularity
+
+# linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+# ramp_func = torch.clamp(linear_func, 0, 1)
+# return ramp_func
+
+# # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+# # to expand the possible context length. In other words, interpolation = apply scaling factor.
+# pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
+# inv_freq_extrapolation = 1.0 / pos_freqs
+# inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+# low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
+
+# # Get n-dimensional rotational scaling corrected for extrapolation
+# inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
+# inv_freq = (
+# inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+# + inv_freq_extrapolation * inv_freq_extrapolation_factor
+# )
+
+# return inv_freq, attention_factor
+
+
+# def _compute_longrope_parameters(
+# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with LongRoPE scaling. Please refer to the
+# [original implementation](https://github.com/microsoft/LongRoPE)
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin.
+# """
+# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
+# # No need to keep BC with longrope, unreleased when this new pattern was created.
+# if len(rope_kwargs) > 0:
+# raise ValueError(
+# "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
+# f"{rope_kwargs}"
+# )
+
+# base = config.rope_theta
+# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+# dim = int(head_dim * partial_rotary_factor)
+# long_factor = config.rope_scaling["long_factor"]
+# short_factor = config.rope_scaling["short_factor"]
+# factor = config.rope_scaling.get("factor")
+# attention_factor = config.rope_scaling.get("attention_factor")
+
+# # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
+# # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
+# # values to compute the default attention scaling factor, instead of using `factor`.
+# if hasattr(config, "original_max_position_embeddings"):
+# original_max_position_embeddings = config.original_max_position_embeddings
+# factor = config.max_position_embeddings / config.original_max_position_embeddings
+# else:
+# original_max_position_embeddings = config.max_position_embeddings
+
+# # Sets the attention factor as suggested in the paper
+# if attention_factor is None:
+# if factor <= 1.0:
+# attention_factor = 1.0
+# else:
+# attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))
+
+# # Compute the inverse frequencies -- scaled based on the target sequence length
+# if seq_len and seq_len > original_max_position_embeddings:
+# ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
+# else:
+# ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
+# inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
+# inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
+
+# return inv_freq, attention_factor
+
+
+# def _compute_llama3_parameters(
+#     config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
+# ) -> Tuple["torch.Tensor", float]:
+#     """
+#     Computes the inverse frequencies for llama 3.1.
+#
+#     Args:
+#         config ([`~transformers.PretrainedConfig`]):
+#             The model configuration.
+#         device (`torch.device`):
+#             The device to use for initialization of the inverse frequencies.
+#         seq_len (`int`, *optional*):
+#             The current sequence length. Unused for this type of RoPE.
+#         rope_kwargs (`Dict`, *optional*):
+#             BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+#     Returns:
+#         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+#         post-processing scaling factor applied to the computed cos/sin.
+#     """
+#     # Gets the default RoPE parameters
+#     inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
+#
+#     factor = config.rope_scaling["factor"]  # `8` in the original implementation
+#     low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
+#     high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
+#     old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation
+#
+#     low_freq_wavelen = old_context_len / low_freq_factor
+#     high_freq_wavelen = old_context_len / high_freq_factor
+#
+#     wavelen = 2 * math.pi / inv_freq
+#     # wavelen < high_freq_wavelen: do nothing
+#     # wavelen > low_freq_wavelen: divide by factor
+#     inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
+#     # otherwise: interpolate between the two, using a smooth factor
+#     smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+#     smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+#     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
+#     inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+#
+#     return inv_freq_llama, attention_factor
+
+
+# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
+# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
+# parameterizations, as long as the callable has the same signature.
+ROPE_INIT_FUNCTIONS = {
+ "default": _compute_default_rope_parameters,
+ # "linear": _compute_linear_scaling_rope_parameters,
+ # "dynamic": _compute_dynamic_ntk_parameters,
+ # "yarn": _compute_yarn_parameters,
+ # "longrope": _compute_longrope_parameters,
+ # "llama3": _compute_llama3_parameters,
+}
+
\ No newline at end of file
diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/transformer/models/llama/modeling_llama.py
index f9850f81..c60f34f5 100644
--- a/front/py/deepx/transformer/models/llama/modeling_llama.py
+++ b/front/py/deepx/transformer/models/llama/modeling_llama.py
@@ -1,7 +1,9 @@
from deepx.nn.modules import Module
from deepx import Tensor,ones,rsqrt
+# RMSNorm
# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+# formula: y = weight * x / sqrt(mean(x^2, dim=-1) + eps)
class LlamaRMSNorm(Module):
def __init__(self, hidden_size, eps=1e-6):
"""
@@ -11,11 +13,80 @@ def __init__(self, hidden_size, eps=1e-6):
self.weight = ones(hidden_size)
self.variance_epsilon = eps
-
+    # compared with the official implementation, prefer in-place ops where possible
def forward(self, hidden_states:Tensor):
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon)
- return self.weight * hidden_states
+ input_clone = hidden_states.clone()
+ input_clone.pow_(2)
+ variance = input_clone.mean([-1], keepdim=True)
+
+ variance.add_(self.variance_epsilon)
+ variance = rsqrt(variance)
+
+ hidden_states.mul_(variance)
+ hidden_states.mul_(self.weight)
+ return hidden_states
def extra_repr(self):
- return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
\ No newline at end of file
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class LlamaRotaryEmbedding(Module):
+ from transformers.models.llama.configuration_llama import LlamaConfig
+ def __init__(self, config: LlamaConfig, device=None):
+ super().__init__()
+ # BC: "rope_type" was originally "type"
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+ self.original_inv_freq = self.inv_freq
+
+ def _dynamic_frequency_update(self, position_ids, device):
+ """
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
+ 1 - growing beyond the cached sequence length (allow scaling)
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+ """
+ seq_len = torch.max(position_ids) + 1
+ if seq_len > self.max_seq_len_cached: # growth
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
+ self.max_seq_len_cached = seq_len
+
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
+ # This .to() is needed if the model has been moved to a device after being initialized (because
+ # the buffer is automatically moved, but not the original copy)
+ self.original_inv_freq = self.original_inv_freq.to(device)
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+ self.max_seq_len_cached = self.original_max_seq_len
+
+ @torch.no_grad()
+ def forward(self, x, position_ids):
+ if "dynamic" in self.rope_type:
+ self._dynamic_frequency_update(position_ids, device=x.device)
+
+ # Core RoPE block
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+ device_type = x.device.type
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
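A small torch-only sanity sketch, not part of the patch, illustrating that the in-place sequence used in the patched LlamaRMSNorm.forward (clone, pow_, mean, add_ eps, rsqrt, two mul_) computes the same result as the reference weight * x * rsqrt(mean(x^2) + eps); function and variable names here are illustrative only.

```python
import torch

def rmsnorm_reference(x, weight, eps=1e-6):
    variance = x.pow(2).mean(-1, keepdim=True)
    return weight * (x * torch.rsqrt(variance + eps))

def rmsnorm_inplace(x, weight, eps=1e-6):
    # mirrors the patched forward: square a clone to get the variance,
    # then scale the input buffer itself
    tmp = x.clone()
    tmp.pow_(2)
    variance = tmp.mean(dim=-1, keepdim=True)
    variance.add_(eps)
    variance.rsqrt_()
    x.mul_(variance)
    x.mul_(weight)
    return x

x = torch.randn(2, 4, 8)
w = torch.ones(8)
ref = rmsnorm_reference(x, w)
out = rmsnorm_inplace(x.clone(), w)  # clone so the reference input stays untouched
assert torch.allclose(ref, out, atol=1e-6)
```

Note the trade-off this makes: the in-place variant writes the normalized result back into its input tensor, so callers must not rely on hidden_states keeping its pre-norm values after the call.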
From a5d555afa9bc7a651bfd8a0ac02d8a9249bf9791 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 04:46:36 +0800
Subject: [PATCH 4/6] llama:rope todo
---
doc/excuter/op-mem-cuda/list.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index e9f62d96..7b94afd6 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -31,7 +31,7 @@
| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() |
-| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
+| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) |
| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) |
| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) |
From 05b6e9dd2c49bd6e75792caa7eaf62b442be7035 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 15:57:47 +0800
Subject: [PATCH 5/6] invert,rpowscalar:cuda&cpu
---
doc/excuter/op-mem-cuda/list.md | 12 +-
doc/excuter/op-mem-ompsimd/list.md | 4 +-
.../src/deepx/tensorfunc/elementwise.hpp | 84 ++--
excuter/op-mem-cuda/src/client/tfs.cpp | 22 +-
.../src/deepx/tensorfunc/cuda_atomic.cuh | 260 ++++++++++
.../src/deepx/tensorfunc/cuda_math.cuh | 313 +++---------
.../tensorfunc/elementwise_miaobyte_basic.cu | 474 ++++++++++--------
.../tensorfunc/elementwise_miaobyte_basic.cuh | 257 +---------
.../tensorfunc/elementwise_miaobyte_basic.hpp | 49 +-
.../tensorfunc/elementwise_miaobyte_sqrt.cu | 218 +++-----
.../tensorfunc/elementwise_miaobyte_sqrt.cuh | 81 +--
.../tensorfunc/elementwise_miaobyte_sqrt.hpp | 34 +-
.../src/deepx/tensorfunc/reduce_miaobyte.cu | 3 +-
.../src/deepx/tf/elementwise_basic.hpp | 182 +++++--
.../src/deepx/tf/elementwise_sqrt.hpp | 168 +++++--
excuter/op-mem-ompsimd/src/client/tfs.cpp | 20 +
.../deepx/tensorfunc/elementwise_miaobyte.hpp | 43 ++
.../src/deepx/tf/elementwise.hpp | 100 ++++
front/py/deepx/nn/functional/authormap.py | 3 +-
.../nn/functional/leaffunc_elementwise.py | 5 +-
.../py/deepx/nn/functional/rtf_elementwise.py | 4 +
front/py/deepx/tensor/elementwise.py | 7 +
front/py/deepx/tensor/tensor.py | 3 +
.../deepx/transformer/modeling_rope_utils.py | 58 +--
.../py/examples/2_ir/2_elementwise_compare.py | 26 +
.../py/examples/2_ir/2_elementwise_sqrtlog.py | 19 +-
26 files changed, 1318 insertions(+), 1131 deletions(-)
create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_atomic.cuh
create mode 100644 front/py/examples/2_ir/2_elementwise_compare.py
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index 7b94afd6..691cdca6 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -12,6 +12,9 @@
| matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) |
| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | comparescalar(tensor A, var scalar)->(tensor mask) |
| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | compare(tensor A, tensor B)->(tensor mask) |
+| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) |
+| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) |
+| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) |
| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() |
| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) |
| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) |
@@ -22,28 +25,27 @@
| add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) |
| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) |
| copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() |
-| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) |
-| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) |
| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() |
| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() |
| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) |
| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) |
-| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
-| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() |
| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) |
| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) |
| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) |
| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) |
+| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
+| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
+| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) |
| sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) |
| sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) |
| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) |
| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) |
| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) |
+| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) |
| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) |
| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor