From 56886908ea2ecbce8e087eab81ecfd69801d01b8 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 00:15:14 +0800
Subject: [PATCH 1/6] module: simplify linear (fix)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
front/py/deepx/nn/functional/authormap.py | 9 +++++-
.../nn/functional/leaffunc_elementwise.py | 4 +--
front/py/deepx/nn/functional/leaffunc_init.py | 30 +++++++++++--------
front/py/deepx/nn/functional/leaffunc_io.py | 5 ++--
front/py/deepx/nn/functional/leaffunc_life.py | 5 +++-
.../py/deepx/nn/functional/leaffunc_matmul.py | 5 ++--
front/py/deepx/nn/modules/linear.py | 17 ++++++-----
front/py/deepx/nn/modules/module.py | 15 ++++------
front/py/deepx/tensor/tensor.py | 21 +++++--------
front/py/examples/2_ir/3_matmul.py | 2 +-
front/py/examples/3_functional/1_mean.py | 4 +--
front/py/examples/3_functional/1_relu.py | 4 +--
front/py/examples/3_functional/1_rsqrt.py | 4 +--
front/py/examples/3_functional/1_sigmoid.py | 4 +--
front/py/examples/3_functional/1_swish.py | 4 +--
front/py/examples/3_module/1_linear.py | 16 ++++------
16 files changed, 75 insertions(+), 74 deletions(-)
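Note: a minimal sketch of the default-seed behaviour this patch gives uniform_
when no seed is passed (the helper name _default_seed below is illustrative and
not part of the deepx API):

    import os
    import time

    def _default_seed() -> int:
        # 32-bit seed derived from wall-clock milliseconds plus the pid,
        # mirroring the lines added to leaffunc_init.py below
        seed = int(time.time() * 1000) & 0xffffffff
        return (seed + os.getpid()) & 0xffffffff

With this, calls such as uniform(10, 10, low=-1, high=1) need neither a seed nor
an author argument; the author is looked up in defaultauthor['uniform'].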
diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py
index 5b42d56f..644505a0 100644
--- a/front/py/deepx/nn/functional/authormap.py
+++ b/front/py/deepx/nn/functional/authormap.py
@@ -1,4 +1,10 @@
defaultauthor=dict({
+ #io
+ 'print':'miaobyte',
+ #init
+ 'uniform':'miaobyte',
+ 'constant':'miaobyte',
+ 'arange':'miaobyte',
#elementwise
'add':'miaobyte',
'addscalar':'miaobyte',
@@ -27,7 +33,8 @@
'broadcastTo':'miaobyte',
'concat':'miaobyte',
#matmul
- 'matmul':'miaobyte',
+ # 'matmul':'miaobyte',
+ 'matmul':'cublas',
#reduce
'sum':'miaobyte',
'prod':'miaobyte',
diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py
index 3cfe5157..93e64b36 100644
--- a/front/py/deepx/nn/functional/leaffunc_elementwise.py
+++ b/front/py/deepx/nn/functional/leaffunc_elementwise.py
@@ -14,9 +14,7 @@
def div(
a: Optional[Union[Tensor, float, int]] = None,
b: Optional[Union[Tensor, float, int]] = None,
- out:Union[Tensor,str]=None,
- requires_grad:bool=False,
- author='miaobyte')->Tensor:
+ out:Union[Tensor,str]=None)->Tensor:
if isinstance(b,Tensor) and isinstance(a,Tensor):
#C=A/B
outtensor=out
diff --git a/front/py/deepx/nn/functional/leaffunc_init.py b/front/py/deepx/nn/functional/leaffunc_init.py
index e0b0da90..a5ac4dde 100644
--- a/front/py/deepx/nn/functional/leaffunc_init.py
+++ b/front/py/deepx/nn/functional/leaffunc_init.py
@@ -1,18 +1,18 @@
from typing import Union
import math
+import time
+import os
from .leaffunc_life import newtensor,parse_shape
from .rtf_init import *
from deepx import Tensor
-
+from .authormap import defaultauthor
# Naming convention
# in-place functions end with a trailing underscore (_) and return nothing
# non-in-place functions have no underscore suffix and return a Tensor
-def constant_(t:Tensor,
- value: Union[float,int],
- author='miaobyte')->Tensor:
- rtf_constant(t,value,author)
+def constant_(t:Tensor,value: Union[float,int])->Tensor:
+ rtf_constant(t,value,defaultauthor['constant'])
def constant(*shape, value:Union[float,int], dtype:str='float32',name:str)->Tensor:
@@ -33,22 +33,26 @@ def ones(*shape, dtype:str='float32',name:str=None)->Tensor:
s = parse_shape(shape)
return constant(s, value=1, dtype=dtype,name=name)
-def arange_(t:Tensor,start=0,step=1,author='miaobyte')->Tensor:
+def arange_(t:Tensor,start=0,step=1)->Tensor:
from .rtf_init import rtf_arange
- rtf_arange(t,start,step,author)
-def arange(*shape,start=0,step=1,dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
+ rtf_arange(t,start,step,defaultauthor['arange'])
+def arange(*shape,start=0,step=1,dtype:str='float32',name:str=None)->Tensor:
s = parse_shape(shape)
outtensor=newtensor(s,dtype=dtype,name=name)
- arange_(outtensor,start,step,author)
+ arange_(outtensor,start,step)
return outtensor
-def uniform_(t:Tensor,low=0, high=1,seed:int=0,author='miaobyte')->Tensor:
+def uniform_(t:Tensor,low=0, high=1,seed:int=None)->Tensor:
+ if seed is None:
+ seed = int(time.time() * 1000) & 0xffffffff
+ seed = (seed + os.getpid()) & 0xffffffff
from .rtf_init import rtf_uniform
- rtf_uniform(t,low,high,seed,author)
-def uniform(*shape,low=0, high=1,seed:int=0,dtype:str='float32',name:str=None,author='miaobyte')->Tensor:
+ rtf_uniform(t,low,high,seed,defaultauthor['uniform'])
+
+def uniform(*shape,low=0, high=1,seed:int=None,dtype:str='float32',name:str=None)->Tensor:
s = parse_shape(shape)
outtensor=newtensor(s,dtype=dtype,name=name)
- uniform_(outtensor,low,high,seed,author)
+ uniform_(outtensor,low,high,seed)
return outtensor
# def rand(*size, dtype=None, device=None):
diff --git a/front/py/deepx/nn/functional/leaffunc_io.py b/front/py/deepx/nn/functional/leaffunc_io.py
index 98d221da..b4490803 100644
--- a/front/py/deepx/nn/functional/leaffunc_io.py
+++ b/front/py/deepx/nn/functional/leaffunc_io.py
@@ -1,7 +1,8 @@
from deepx.tensor import Tensor
+from .authormap import defaultauthor
-def printtensor(t:Tensor,format='',author='miaobyte'):
+def printtensor(t:Tensor,format=''):
from .rtf_io import rtf_printtensor
- rtf_printtensor(t,format,author)
+ rtf_printtensor(t,format,defaultauthor['print'])
return ''
diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py
index cf4d0905..abf6a530 100644
--- a/front/py/deepx/nn/functional/leaffunc_life.py
+++ b/front/py/deepx/nn/functional/leaffunc_life.py
@@ -12,7 +12,10 @@ def newtensor(*shape,dtype:str='float32',name:str=None):
from .rtf_life import rtf_newtensor
rtf_newtensor(t)
return t
-
+def rnewtensor(t:Tensor):
+ from .rtf_life import rtf_newtensor
+ rtf_newtensor(t)
+ return t
def copytensor(t:Tensor,out:Tensor):
from .rtf_life import rtf_copytensor
rtf_copytensor(t,out)
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py b/front/py/deepx/nn/functional/leaffunc_matmul.py
index 11b793a4..bb69b838 100644
--- a/front/py/deepx/nn/functional/leaffunc_matmul.py
+++ b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -1,13 +1,14 @@
from typing import Union
-from deepx import Tensor
+from deepx import Tensor,Shape
from .leaffunc_life import newtensor
from .authormap import defaultauthor
def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='')->Tensor:
outtensor=out
if isinstance(out,str):
- outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
+ outshape=Shape.matmul(a.shape,b.shape)
+ outtensor=newtensor(outshape,dtype=a.dtype,name=out)
from .rtf_matmul import rtf_matmul
rtf_matmul(a,b,outtensor,defaultauthor['matmul'])
return outtensor
diff --git a/front/py/deepx/nn/modules/linear.py b/front/py/deepx/nn/modules/linear.py
index c1ef3238..f1eb86e3 100644
--- a/front/py/deepx/nn/modules/linear.py
+++ b/front/py/deepx/nn/modules/linear.py
@@ -1,6 +1,6 @@
from .module import Module
from deepx import Tensor
-from deepx.nn.functional import uniform,kaiming_uniform_,calculate_fan_in_and_fan_out
+from deepx.nn.functional import uniform_,kaiming_uniform_,calculate_fan_in_and_fan_out
import math
class Linear(Module):
@@ -35,14 +35,17 @@ def reset_parameters(self) -> None:
if self.bias is not None:
fan_in, _ = calculate_fan_in_and_fan_out(self.weight)
bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
- uniform(self.bias, -bound, bound)
+ uniform_(self.bias, -bound, bound)
def forward(self, input: Tensor) -> Tensor:
- #`y = xA^T + b`
- if self.bias is None:
- return input @ self.weight.T
- else:
- return input @ self.weight.T + self.bias
+ #`y = xA^T + b`
+ y=input @ self.weight.T
+ oldshape=y.shape
+ if self.bias is not None:
+ y.reshape_(y.shape[1])
+ y=y+self.bias
+ y.reshape_(*oldshape)
+ return y
def extra_repr(self) -> str:
return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py
index bda175d4..9f52fdf3 100644
--- a/front/py/deepx/nn/modules/module.py
+++ b/front/py/deepx/nn/modules/module.py
@@ -1,13 +1,10 @@
import re
-from typing import (Dict, Iterator, Optional, Tuple, Union,
- Any, List, overload)
+from typing import Dict, Iterator, Optional, Tuple, Any
from collections import OrderedDict
from deepx import Tensor
class Module:
def __init__(self, name: Optional[str] = None):
- from deepx.autograd import Graph
- self._graph=Graph.get_default()
self._name = name or self._generate_default_name()
self._parent: Optional[Module] = None
self._modules: OrderedDict[str, Module] = OrderedDict()
@@ -21,11 +18,7 @@ def _generate_default_name(self) -> str:
count = self.__class__._instance_counter
self.__class__._instance_counter += 1
return f"{base_name}_{count}"
-
- @property
- def graph(self):
- return self._graph
-
+
@property
def full_name(self):
if self._parent is None:
@@ -55,7 +48,9 @@ def register_parameter(self, name: str, param: Optional[Tensor]) -> None:
self._parameters.pop(name, None)
else:
self._parameters[name] = param
- param.addtograph(self.full_name + '.' + name)
+ param.name=self.full_name + '.' + name
+ from deepx.nn.functional.leaffunc_life import rnewtensor
+ rnewtensor(param)
def parameters(self, recurse: bool = True) -> Iterator[Tensor]:
for name, param in self.named_parameters(recurse=recurse):
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index ee60948d..1e2fe9fe 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -27,9 +27,7 @@ def __init__(self,shape:Union[tuple[int],list[int],Shape],dtype:str='float32',na
self._shape = shape
else:
raise ValueError("Invalid shape")
-
- self._graph = None
- self._node = None
+
def copy_to(self,t:'Tensor'):
from deepx.nn.functional import copytensor
copytensor(self,t)
@@ -44,7 +42,10 @@ def clone(self,name:str=None):
@property
def name(self):
return self._name
-
+ @name.setter
+ def name(self,name:str):
+ self._name=name
+
# shape
@property
def shape(self,dim:int=None):
@@ -87,15 +88,7 @@ def numel(self)->int:
@property
def dtype(self):
return self._dtype
-
-
- @property
- def graph(self):
- return self._graph
-
- @property
- def node(self):
- return self._node
+
#elementwise
def __add__(self, other):
@@ -120,7 +113,7 @@ def __matmul__(self, other):
#shape操作
@property
def T(self) -> str:
- return self.transpose(1,0,out=self.node.name+".T")
+ return self.transpose()
# 打印
def autoformat(self):
diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
index 5cc0cffd..144cbdf7 100644
--- a/front/py/examples/2_ir/3_matmul.py
+++ b/front/py/examples/2_ir/3_matmul.py
@@ -16,7 +16,7 @@
t1 = ones([3,4],dtype='float32',name="t1")
t2 = ones([4,5],dtype='float32',name="t2")
t3 = t1 @ t2
-print(t3)
+t3.print()
diff --git a/front/py/examples/3_functional/1_mean.py b/front/py/examples/3_functional/1_mean.py
index 12f4c0f5..64511555 100644
--- a/front/py/examples/3_functional/1_mean.py
+++ b/front/py/examples/3_functional/1_mean.py
@@ -14,7 +14,7 @@
t3=arange(4,5,6,name="t3")
-print(t3)
+t3.print()
t3_mean=mean(t3,dim=(0,1))
-print(t3_mean)
+t3_mean.print()
diff --git a/front/py/examples/3_functional/1_relu.py b/front/py/examples/3_functional/1_relu.py
index 22b1e8cc..9cd1737e 100644
--- a/front/py/examples/3_functional/1_relu.py
+++ b/front/py/examples/3_functional/1_relu.py
@@ -21,7 +21,7 @@
# when tensor.name is a str, it is an intermediate variable, so the operation is performed in place
t2=uniform(10,10,low=-1,high=1)
-print(t2)
+t2.print()
relu_t2=relu(t2)
-print(relu_t2)
+relu_t2.print()
diff --git a/front/py/examples/3_functional/1_rsqrt.py b/front/py/examples/3_functional/1_rsqrt.py
index c0706691..aa4926a6 100644
--- a/front/py/examples/3_functional/1_rsqrt.py
+++ b/front/py/examples/3_functional/1_rsqrt.py
@@ -13,6 +13,6 @@
from deepx.nn.functional import rsqrt
t=arange(2,3,4,name='t')
-print((t))
+t.print()
rsqrt_t=rsqrt(t)
-print(rsqrt_t)
+rsqrt_t.print()
diff --git a/front/py/examples/3_functional/1_sigmoid.py b/front/py/examples/3_functional/1_sigmoid.py
index 1eace7bf..dbdfd614 100644
--- a/front/py/examples/3_functional/1_sigmoid.py
+++ b/front/py/examples/3_functional/1_sigmoid.py
@@ -20,8 +20,8 @@
x.sub_(3.0)
print("\nDEEPX tensor:")
-print(x)
+x.print()
out=sigmoid(x)
print("\nDEEPX sigmoid result:")
-print(out)
+out.print()
diff --git a/front/py/examples/3_functional/1_swish.py b/front/py/examples/3_functional/1_swish.py
index d2ce1082..f4e8c7c3 100644
--- a/front/py/examples/3_functional/1_swish.py
+++ b/front/py/examples/3_functional/1_swish.py
@@ -20,8 +20,8 @@
x.sub_(3.0)
print("\nDEEPX tensor:")
-print(x)
+x.print()
out=swish(x)
print("\nDEEPX swish result:")
-print(out)
+out.print()
diff --git a/front/py/examples/3_module/1_linear.py b/front/py/examples/3_module/1_linear.py
index 06eb7cfd..7ad43a91 100644
--- a/front/py/examples/3_module/1_linear.py
+++ b/front/py/examples/3_module/1_linear.py
@@ -3,22 +3,18 @@
import torch.nn as nn
net = nn.Linear(64, 4)
-input = torch.ones(1, 64)
-output = net(input)
+torch_input = torch.ones(1, 64)
+torch_output = net(torch_input)
print()
-print(output)
+print(torch_output)
############-------DEEPX-------################
-from deepx.nn.modules import Linear, Module
-from deepx import Tensor,ones
+from deepx.nn.modules import Linear
+from deepx import ones
net = Linear(64, 4)
input=ones(1,64,name='input')
out=net.forward(input)
-print(out)
+out.print()
-import os
-script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名
-str=out.graph.to_dot()
-str.render(script_name+".dot", format='svg')
From 78f9defca1c558e758e7de7c575f336351436723 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 04:45:15 +0800
Subject: [PATCH 2/6] llama: RMSNorm ok, verify rope
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
front/py/deepx/__init__.py | 5 +-
front/py/deepx/nn/functional/__init__.py | 2 +-
front/py/deepx/nn/functional/elementwise.py | 5 +-
front/py/deepx/nn/functional/leaffunc.py | 10 +-
.../nn/functional/leaffunc_elementwise.py | 62 +-
front/py/deepx/nn/functional/leaffunc_init.py | 7 +-
.../py/deepx/nn/functional/rtf_elementwise.py | 10 +-
front/py/deepx/nn/modules/container.py | 0
front/py/deepx/nn/modules/module.py | 3 +-
front/py/deepx/tensor/__init__.py | 4 +-
front/py/deepx/tensor/elementwise.py | 9 +-
front/py/deepx/tensor/reduce.py | 12 +-
front/py/deepx/tensor/tensor.py | 22 +-
.../4_transformer/llama/1_llamarmsnorm.dot | 128 ----
.../llama/1_llamarmsnorm.dot.svg | 606 ------------------
.../4_transformer/llama/1_llamarmsnorm.py | 26 +-
.../llama/1_llamarmsnorm_torch.py | 4 +-
17 files changed, 97 insertions(+), 818 deletions(-)
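Usage sketch for the reverse scalar ops this patch introduces (assumes the
Tensor methods rdiv/rpow are wired up as in this series; the names come from
the functions added below, the example itself is illustrative):

    from deepx import ones
    from deepx.nn.functional import div, rsqrt

    t = ones(2, 3, name="t")
    a = div(1, t)   # scalar / Tensor -> rdiv -> rtf_rdivscalar
    b = 2 ** t      # Tensor.__rpow__ -> rpow -> rtf_rpowscalar
    c = rsqrt(t)    # 1 / sqrt(t), composed from sqrt and the scalar division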
delete mode 100644 front/py/deepx/nn/modules/container.py
delete mode 100644 front/py/examples/4_transformer/llama/1_llamarmsnorm.dot
delete mode 100644 front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg
diff --git a/front/py/deepx/__init__.py b/front/py/deepx/__init__.py
index 355a250b..37f47669 100644
--- a/front/py/deepx/__init__.py
+++ b/front/py/deepx/__init__.py
@@ -1,11 +1,10 @@
-from .tensor import Tensor,Shape
+from .tensor import Tensor,Shape,Number
from deepx.nn.functional import * # import all functional functions
from deepx.nn.functional import __all__ as _func_all # get functional's export list
__all__ = [
#tensor
- 'Tensor',
- 'Shape',
+ 'Tensor','Shape','Number',
*_func_all
]
diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
index 9cacf7d4..1e215d7e 100644
--- a/front/py/deepx/nn/functional/__init__.py
+++ b/front/py/deepx/nn/functional/__init__.py
@@ -21,7 +21,7 @@
"printtensor",
"constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out",
"add","sub","mul","div","sqrt","pow","exp","log",
- "leaffunc_matmul",
+ "matmul",
"reducemax","reducemin","sum","prod",
"reshape","permute","transpose","concat","broadcastTo",
diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py
index 7adcb28f..28e5b199 100644
--- a/front/py/deepx/nn/functional/elementwise.py
+++ b/front/py/deepx/nn/functional/elementwise.py
@@ -1,4 +1,5 @@
-from deepx.tensor import Tensor
+from typing import Union
+from deepx.tensor import Tensor,Number
from deepx.nn.functional import newtensor
def rsqrt(input:Tensor)->Tensor:
@@ -8,5 +9,5 @@ def rsqrt(input:Tensor)->Tensor:
outtensor=newtensor(input.shape, dtype=input.dtype)
sqrt(input,out= outtensor)
return div(1,outtensor,outtensor)
-
+
diff --git a/front/py/deepx/nn/functional/leaffunc.py b/front/py/deepx/nn/functional/leaffunc.py
index 62fbb767..58d21105 100644
--- a/front/py/deepx/nn/functional/leaffunc.py
+++ b/front/py/deepx/nn/functional/leaffunc.py
@@ -17,9 +17,6 @@ def op_func(
b: Union[Tensor, float, int] = None,
out: Union[Tensor, str] = None) -> Tensor:
outtensor = out
- if isinstance(out, str):
- outtensor = newtensor(a.shape, dtype=a.dtype, name=out)
-
rtf_module = importlib.import_module('deepx.nn.functional.rtf_elementwise')
if isinstance(b, Tensor):
an=a
@@ -28,9 +25,16 @@ def op_func(
newshape = Shape.broadcast_shape(a.shape, b.shape)
an = a.broadcastTo(newshape)
bn = b.broadcastTo(newshape)
+ if isinstance(out,str):
+ outtensor=newtensor(newshape,dtype=a.dtype,name=out)
+ else:
+ if isinstance(out,str):
+ outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
rtf_func = getattr(rtf_module, f'rtf_{op_name}')
rtf_func(an, bn, outtensor, defaultauthor[op_name])
else:
+ if isinstance(out,str):
+ outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
rtf_func = getattr(rtf_module, f'rtf_{op_name}scalar')
rtf_func(a, b, outtensor, defaultauthor[f'{op_name}scalar'])
return outtensor
diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py
index 93e64b36..74e0918d 100644
--- a/front/py/deepx/nn/functional/leaffunc_elementwise.py
+++ b/front/py/deepx/nn/functional/leaffunc_elementwise.py
@@ -1,5 +1,5 @@
from typing import Optional, Union
-from deepx import Tensor,Shape
+from deepx import Tensor,Shape,Number
from .leaffunc import create_A_B_tf_C,create_A_tf_C
from .leaffunc_life import newtensor
@@ -9,49 +9,43 @@
add = create_A_B_tf_C('add')
sub = create_A_B_tf_C('sub')
mul = create_A_B_tf_C('mul')
+_div=create_A_B_tf_C('div')
-#div
def div(
- a: Optional[Union[Tensor, float, int]] = None,
- b: Optional[Union[Tensor, float, int]] = None,
+ a: Union[Tensor, float, int],
+ b: Union[Tensor, float, int],
out:Union[Tensor,str]=None)->Tensor:
- if isinstance(b,Tensor) and isinstance(a,Tensor):
- #C=A/B
- outtensor=out
- if isinstance(out,str):
- outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
- an=a
- bn=b
- if a.shape!=b.shape:
- newshape=Shape.broadcast_shape(a.shape,b.shape)
- an=a.broadcastTo(newshape)
- bn=b.broadcastTo(newshape)
- from .rtf_elementwise import rtf_div
- rtf_div(an,bn,outtensor,defaultauthor['div'])
- return outtensor
+ if isinstance(a,Tensor):
+ return _div(a,b,out)
+ elif isinstance(a,float) or isinstance(a,int):
+ return rdiv(a,b,out)
else:
- if isinstance(a,Tensor):
- #C=A/b
- outtensor=out
- if isinstance(out,str):
- outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
- from .rtf_elementwise import rtf_divscalar
- rtf_divscalar(a,b,outtensor,defaultauthor['divscalar'])
- return outtensor
- elif isinstance(a,float) or isinstance(a,int):
- #C=a/B
- outtensor=out
- if isinstance(out,str):
- outtensor=newtensor(b.shape,dtype=b.dtype,name=out)
- from .rtf_elementwise import rtf_rdivscalar
- rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar'])
- return outtensor
+ raise ValueError(f"Invalid type for a: {type(a)}")
+
+# rdiv: scalar / Tensor
+def rdiv(
+ a: Union[float, int],
+ b: Tensor,
+ out:Union[Tensor,str]=None)->Tensor:
+ outtensor=out
+ if isinstance(out,str):
+ outtensor=newtensor(b.shape,dtype=b.dtype,name=out)
+ from .rtf_elementwise import rtf_rdivscalar
+ rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar'])
+ return outtensor
max=create_A_B_tf_C('max')
min=create_A_B_tf_C('min')
#pow
pow=create_A_B_tf_C('pow')
+def rpow(a:Number,b:Tensor,out:Union[Tensor,str]=None)->Tensor:
+ outtensor=out
+ if isinstance(out,str):
+ outtensor=newtensor(b.shape,dtype=b.dtype,name=out)
+ from .rtf_elementwise import rtf_rpowscalar
+ rtf_rpowscalar(a,b,outtensor,defaultauthor['rpowscalar'])
+ return outtensor
#sqrt
sqrt=create_A_tf_C('sqrt')
diff --git a/front/py/deepx/nn/functional/leaffunc_init.py b/front/py/deepx/nn/functional/leaffunc_init.py
index a5ac4dde..454dc09d 100644
--- a/front/py/deepx/nn/functional/leaffunc_init.py
+++ b/front/py/deepx/nn/functional/leaffunc_init.py
@@ -4,7 +4,7 @@
import os
from .leaffunc_life import newtensor,parse_shape
from .rtf_init import *
-from deepx import Tensor
+from deepx import Tensor,Number
from .authormap import defaultauthor
# Naming convention
@@ -36,8 +36,9 @@ def ones(*shape, dtype:str='float32',name:str=None)->Tensor:
def arange_(t:Tensor,start=0,step=1)->Tensor:
from .rtf_init import rtf_arange
rtf_arange(t,start,step,defaultauthor['arange'])
-def arange(*shape,start=0,step=1,dtype:str='float32',name:str=None)->Tensor:
- s = parse_shape(shape)
+# PyTorch-style signature: arange(start, end, step)
+def arange(start:Number,end:Number,step:Number=1,dtype:str='float32',name:str=None)->Tensor:
+    s = [math.ceil((end-start)/step)]
outtensor=newtensor(s,dtype=dtype,name=name)
arange_(outtensor,start,step)
return outtensor
diff --git a/front/py/deepx/nn/functional/rtf_elementwise.py b/front/py/deepx/nn/functional/rtf_elementwise.py
index 414c09f6..96614a4e 100644
--- a/front/py/deepx/nn/functional/rtf_elementwise.py
+++ b/front/py/deepx/nn/functional/rtf_elementwise.py
@@ -1,7 +1,6 @@
-from deepx.tensor import Tensor
+from deepx.tensor import Tensor,Number
from deepx.nn.deepxir import DeepxIR,Param
from deepx.scheduler import send
-from typing import Union
from .rtf import A_B_op_C,A_scalar_op_C,A_op_C
def rtf_add(a:Tensor, b:Tensor, out:Tensor, author='miaobyte')->Tensor:
@@ -55,6 +54,13 @@ def rtf_powscalar(a:Tensor, b:float, out:Tensor, author='miaobyte')->Tensor:
A_scalar_op_C("powscalar",a,b,out,author)
return out
+def rtf_rpowscalar(a:Number,b:Tensor,out:Tensor,author='miaobyte')->Tensor:
+ args = [ Param.varnum(a),Param.tensor(b)]
+ returns = [Param.tensor(out)]
+ ir = DeepxIR("rpowscalar", args, returns, author)
+ send(ir)
+ return out
+
def rtf_exp(a:Tensor, out:Tensor, author='miaobyte')->Tensor:
A_op_C("exp",a,out,author)
return out
diff --git a/front/py/deepx/nn/modules/container.py b/front/py/deepx/nn/modules/container.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/nn/modules/module.py b/front/py/deepx/nn/modules/module.py
index 9f52fdf3..5c7be9b2 100644
--- a/front/py/deepx/nn/modules/module.py
+++ b/front/py/deepx/nn/modules/module.py
@@ -12,7 +12,8 @@ def __init__(self, name: Optional[str] = None):
def _generate_default_name(self) -> str:
class_name = self.__class__.__name__
- base_name = re.sub(r'(?Tensor:
diff --git a/front/py/deepx/tensor/reduce.py b/front/py/deepx/tensor/reduce.py
index b6d5bc3f..cdba12f8 100644
--- a/front/py/deepx/tensor/reduce.py
+++ b/front/py/deepx/tensor/reduce.py
@@ -4,28 +4,28 @@
from deepx.tensor import Tensor,tensor_method
@tensor_method
-def reducemax(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def reducemax(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import reducemax as reduce_max_func
return reduce_max_func(self,dim,keepdim,out)
@tensor_method
-def reducemin(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def reducemin(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import reducemin as reduce_min_func
return reduce_min_func(self,dim,keepdim,out)
@tensor_method
-def sum(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def sum(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import sum as sum_func
return sum_func(self,dim,keepdim,out)
@tensor_method
-def prod(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def prod(self, dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import prod as prod_func
return prod_func(self,dim,keepdim,out)
@tensor_method
-def mean(self, dim:list[int],keepdim:bool=False,out:Union[Tensor,str]=''):
+def mean(self,dim:tuple,keepdim:bool=False,out:Union[Tensor,str]=''):
from deepx.nn.functional import mean as mean_func
- return mean_func(self,dim,keepdim,out)
+ return mean_func(self,dim,keepdim)
\ No newline at end of file
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index 1e2fe9fe..a45888b1 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -1,6 +1,9 @@
-from typing import Optional,Union
+from typing import Optional,Union,TypeAlias
from .shape import Shape
+
+Number: TypeAlias = Union[int, float, bool]
+
tensorid=1
class Tensor:
@@ -91,23 +94,28 @@ def dtype(self):
#elementwise
- def __add__(self, other):
+ def __add__(self, other:Union[Number,'Tensor']):
return self.add(other)
- def __sub__(self, other):
+ def __sub__(self, other:Union[Number,'Tensor']):
return self.sub(other)
- def __mul__(self, other):
+ def __mul__(self, other:Union[Number,'Tensor']):
return self.mul(other)
- def __truediv__(self, other):
+ def __truediv__(self, other:Union[Number,'Tensor']):
return self.div(other)
- def __rtruediv__(self, other):
+ def __rtruediv__(self, other:Union[Number,'Tensor']):
return self.rdiv(other)
+ def __pow__(self, other:Union[Number,'Tensor']):
+ return self.pow(other)
+
+ def __rpow__(self, other:Union[Number,'Tensor']):
+ return self.rpow(other)
#矩阵乘法
- def __matmul__(self, other):
+ def __matmul__(self, other:Union[Number,'Tensor']):
return self.matmul(other)
#shape操作
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot b/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot
deleted file mode 100644
index f2e9db0c..00000000
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot
+++ /dev/null
@@ -1,128 +0,0 @@
-// Computational Graph
-digraph {
- rankdir=TB
- node [shape=record]
- 130357533018672 [label="tensor_1
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533019536 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533018480 [label="vector_1
-(2, 3, 8)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533738896 [label=div_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533739760 [label="var_1
-10.0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533752528 [label=add_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533752336 [label="var_2
--2.0" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750272 [label="tensor_2
-(8,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750512 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533750128 [label="var_3
-1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750368 [label="llama_r_m_s_norm_0.weight
-(8,)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533750416 [label=constant color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533750032 [label="var_4
-0.5" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749840 [label=pow_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533749888 [label="var_5
-2" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749696 [label="tensor_4
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749504 [label="tensor_5
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749264 [label="vector_2
-[2]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533749168 [label=sum color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748928 [label="tensor_6
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748832 [label=div_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748880 [label="var_6
-8" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748688 [label=add_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748736 [label="var_7
-1e-06" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748544 [label="tensor_7
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748304 [label="tensor_8
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533748064 [label=sqrt color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533747968 [label=rdiv_scalar color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533748112 [label="var_8
-1" color=orange fillcolor=moccasin fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747824 [label="tensor_9
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747584 [label="tensor_10
-(2, 3, 1)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747344 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533747392 [label="vector_3
-[2, 3, 1]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533747248 [label="tensor_11
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533746960 [label=expand color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533747008 [label="vector_4
-(2, 3, 8)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533746864 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533746720 [label="tensor_12
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357534894224 [label="tensor_13
-(1, 1, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533740336 [label=reshape color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533740672 [label="vector_5
-[1, 1, 8]" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533740528 [label="tensor_14
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533740768 [label=expand color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533740816 [label="vector_6
-(2, 3, 8)" color=darkseagreen fillcolor=honeydew fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533741152 [label=mul color=darkslategray fillcolor=lightgray fontname="Courier Bold" labeljust=l shape=box style=filled]
- 130357533742736 [label="tensor_15
-(2, 3, 8)" color=skyblue fillcolor=aliceblue fontname="Sans-Serif" labeljust=l shape=box style=filled]
- 130357533019536 -> 130357533018672 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533738896 -> 130357533018672 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533752528 -> 130357533018672 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533019536 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018480 -> 130357533019536 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533738896 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533739760 -> 130357533738896 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533752528 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533752336 -> 130357533752528 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750512 -> 130357533750272 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750128 -> 130357533750512 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750416 -> 130357533750368 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750032 -> 130357533750416 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533749840 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749888 -> 130357533749840 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749840 -> 130357533749696 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749168 -> 130357533749504 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749696 -> 130357533749168 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749264 -> 130357533749168 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748832 -> 130357533748928 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533749504 -> 130357533748832 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748880 -> 130357533748832 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748928 -> 130357533748688 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748736 -> 130357533748688 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748688 -> 130357533748544 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748064 -> 130357533748304 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748544 -> 130357533748064 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748112 -> 130357533747968 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533748304 -> 130357533747968 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747968 -> 130357533747824 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747344 -> 130357533747584 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747824 -> 130357533747344 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747392 -> 130357533747344 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533746960 -> 130357533747248 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747584 -> 130357533746960 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747008 -> 130357533746960 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533018672 -> 130357533746864 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533747248 -> 130357533746864 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533746864 -> 130357533746720 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740336 -> 130357534894224 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533750368 -> 130357533740336 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740672 -> 130357533740336 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740768 -> 130357533740528 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357534894224 -> 130357533740768 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740816 -> 130357533740768 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533740528 -> 130357533741152 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533746720 -> 130357533741152 [arrowsize=0.8 color=gray40 penwidth=1.2]
- 130357533741152 -> 130357533742736 [arrowsize=0.8 color=gray40 penwidth=1.2]
-}
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg b/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg
deleted file mode 100644
index 331e5566..00000000
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.dot.svg
+++ /dev/null
@@ -1,606 +0,0 @@
-
-
-
-
-
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
index 938e593c..8dfacfa1 100644
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
+++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm.py
@@ -5,29 +5,21 @@
############### DeepX implementation ###############
-from deepx import arange, constant
+from deepx import arange, constant_
from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm
# use the same data
-dx_input = arange(0, 48, 1, dtype="float32").reshape_(2, 3, hidden_size)
-dx_input.div_(10.0)
-dx_input.sub_(2.0)
+input = arange(2, 3, hidden_size, dtype="float32")
+input.div_(10.0)
+input.sub_(2.0)
eps = 1e-6
-print("\nDeepX 输入:")
-print(dx_input)
+input.print()
# DeepX computation flow
-dx_norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
+norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps)
# set the same weight
-constant(dx_norm.weight, 0.5)
+constant_(norm.weight, 0.5)
# forward pass
-dx_output = dx_norm(dx_input)
-
-print("\nDeepX RMSNorm 结果:")
-print(dx_output)
-
-import os
-script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名
-str=dx_output.graph.to_dot()
-str.render(script_name+".dot", format='svg')
\ No newline at end of file
+output = norm(input)
+output.print()
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
index 4099feee..85ef6ced 100644
--- a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
+++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
@@ -1,6 +1,6 @@
############### PyTorch 实现部分 ###############
import torch
-from transformers.models.llama.modeling_llama import LlamaRMSNorm as PTLlamaRMSNorm
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
# use small data so the full result can be printed
hidden_size = 8
@@ -10,7 +10,7 @@
print("PyTorch 输入:")
print(pt_input)
# use the official LlamaRMSNorm implementation from the transformers library
-pt_norm = PTLlamaRMSNorm(hidden_size, eps=eps)
+pt_norm = LlamaRMSNorm(hidden_size, eps=eps)
# set the weight to the fixed value 0.5
with torch.no_grad():
pt_norm.weight.fill_(0.5)
From fabc8954e3fbff319c4ec425064dd3917b758a1e Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 04:46:31 +0800
Subject: [PATCH 3/6] llama: rope todo
---
excuter/op-mem-cuda/src/client/tfs.cpp | 2 +-
.../deepx/transformer/modeling_rope_utils.py | 317 ++++++++++++++++++
.../models/llama/modeling_llama.py | 81 ++++-
3 files changed, 394 insertions(+), 6 deletions(-)
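For reference, the default RoPE inverse frequencies added in
modeling_rope_utils.py follow the usual formula inv_freq[k] = 1 / base**(2k/dim);
a plain-Python sketch (the patch itself computes this with deepx.arange, and the
helper below is only illustrative):

    def default_inv_freq(dim: int, base: float = 10000.0) -> list:
        # one entry per even index 0, 2, ..., dim-2
        return [1.0 / (base ** (i / dim)) for i in range(0, dim, 2)]

    # e.g. dim=8, base=10000.0 -> [1.0, 0.1, 0.01, 0.001]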
create mode 100644 front/py/deepx/transformer/modeling_rope_utils.py
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index f8e79c7b..15d935b0 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -227,7 +227,7 @@ namespace deepx::tf
tffactory.add_tf(std::make_shared>(vector(
{
Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
- Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Float32),
+ Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
}),
vector(
{
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
new file mode 100644
index 00000000..41cb3909
--- /dev/null
+++ b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -0,0 +1,317 @@
+from typing import Tuple
+from deepx import arange
+
+def _compute_default_rope_parameters(
+ base: float = 10000.0,
+ dim: int = 0,
+ head_dim: int = 0,
+ partial_rotary_factor: float = 1.0,
+) -> Tuple:
+    """
+    Compute the inverse frequencies of the original RoPE implementation.
+
+    Args:
+        base: base used for the rotary position encoding, defaults to 10000.0
+        dim: feature dimension, must be even
+        head_dim: per-head feature dimension, must be even
+        partial_rotary_factor: partial rotary factor, defaults to 1.0
+
+    Returns:
+        a tuple of the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin
+    """
+    attention_factor = 1.0  # unused in this type of RoPE
+ if dim == 0:
+        dim = int(head_dim * partial_rotary_factor)
+    # compute the inverse frequencies
+ inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
+ return inv_freq, attention_factor
+
+# def _compute_linear_scaling_rope_parameters(
+# config: Optional[PretrainedConfig] = None,
+# device: Optional["torch.device"] = None,
+# seq_len: Optional[int] = None,
+# **rope_kwargs,
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length. Unused for this type of RoPE.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+# """
+# if config is not None and len(rope_kwargs) > 0:
+# raise ValueError(
+# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+# f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
+# )
+# if len(rope_kwargs) > 0:
+# factor = rope_kwargs["factor"]
+# elif config is not None:
+# factor = config.rope_scaling["factor"]
+
+# # Gets the default RoPE parameters
+# inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
+
+# # Then applies linear scaling to the frequencies.
+# # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
+# # applying scaling to the inverse frequencies is equivalent.
+# inv_freq /= factor
+# return inv_freq, attention_factor
+
+
+# def _compute_dynamic_ntk_parameters(
+# config: Optional[PretrainedConfig] = None,
+# device: Optional["torch.device"] = None,
+# seq_len: Optional[int] = None,
+# **rope_kwargs,
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length, used to update the dynamic RoPE at inference time.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+# """
+# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
+# if config is not None and len(rope_kwargs) > 0:
+# raise ValueError(
+# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+# f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
+# )
+# if len(rope_kwargs) > 0:
+# base = rope_kwargs["base"]
+# dim = rope_kwargs["dim"]
+# max_position_embeddings = rope_kwargs["max_position_embeddings"]
+# factor = rope_kwargs["factor"]
+# elif config is not None:
+# base = config.rope_theta
+# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+# dim = int(head_dim * partial_rotary_factor)
+# max_position_embeddings = config.max_position_embeddings
+# factor = config.rope_scaling["factor"]
+
+# attention_factor = 1.0 # Unused in this type of RoPE
+
+# # seq_len: default to max_position_embeddings, e.g. at init time
+# seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings
+
+# # Compute the inverse frequencies
+# base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
+# inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
+# return inv_freq, attention_factor
+
+
+# def _compute_yarn_parameters(
+# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with NTK scaling. Please refer to the
+# [original paper](https://arxiv.org/abs/2309.00071)
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length. Unused for this type of RoPE.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin.
+# """
+# # No need to keep BC with yarn, unreleased when this new pattern was created.
+# if len(rope_kwargs) > 0:
+# raise ValueError(
+# f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
+# )
+
+# base = config.rope_theta
+# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+# dim = int(head_dim * partial_rotary_factor)
+# max_position_embeddings = config.max_position_embeddings
+# factor = config.rope_scaling["factor"]
+
+# # Sets the attention factor as suggested in the paper
+# attention_factor = config.rope_scaling.get("attention_factor")
+# if attention_factor is None:
+# attention_factor = 0.1 * math.log(factor) + 1.0
+
+# # Optional config options
+# # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+# beta_fast = config.rope_scaling.get("beta_fast") or 32
+# beta_slow = config.rope_scaling.get("beta_slow") or 1
+
+# # Compute the inverse frequencies
+# def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
+# """Inverse dimension formula to find the dimension based on the number of rotations"""
+# return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+# def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+# """Find dimension range bounds based on rotations"""
+# low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
+# high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+# return max(low, 0), min(high, dim - 1)
+
+# def linear_ramp_factor(min, max, dim):
+# if min == max:
+# max += 0.001 # Prevent singularity
+
+# linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+# ramp_func = torch.clamp(linear_func, 0, 1)
+# return ramp_func
+
+# # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+# # to expand the possible context length. In other words, interpolation = apply scaling factor.
+# pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
+# inv_freq_extrapolation = 1.0 / pos_freqs
+# inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+# low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
+
+# # Get n-dimensional rotational scaling corrected for extrapolation
+# inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
+# inv_freq = (
+# inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+# + inv_freq_extrapolation * inv_freq_extrapolation_factor
+# )
+
+# return inv_freq, attention_factor
+
+
+# def _compute_longrope_parameters(
+# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
+# ) -> Tuple["torch.Tensor", float]:
+# """
+# Computes the inverse frequencies with LongRoPE scaling. Please refer to the
+# [original implementation](https://github.com/microsoft/LongRoPE)
+# Args:
+# config ([`~transformers.PretrainedConfig`]):
+# The model configuration.
+# device (`torch.device`):
+# The device to use for initialization of the inverse frequencies.
+# seq_len (`int`, *optional*):
+# The current sequence length.
+# rope_kwargs (`Dict`, *optional*):
+# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+# Returns:
+# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+# post-processing scaling factor applied to the computed cos/sin.
+# """
+# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
+# # No need to keep BC with longrope, unreleased when this new pattern was created.
+# if len(rope_kwargs) > 0:
+# raise ValueError(
+# "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
+# f"{rope_kwargs}"
+# )
+
+# base = config.rope_theta
+# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+# dim = int(head_dim * partial_rotary_factor)
+# long_factor = config.rope_scaling["long_factor"]
+# short_factor = config.rope_scaling["short_factor"]
+# factor = config.rope_scaling.get("factor")
+# attention_factor = config.rope_scaling.get("attention_factor")
+
+# # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
+# # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
+# # values to compute the default attention scaling factor, instead of using `factor`.
+# if hasattr(config, "original_max_position_embeddings"):
+# original_max_position_embeddings = config.original_max_position_embeddings
+# factor = config.max_position_embeddings / config.original_max_position_embeddings
+# else:
+# original_max_position_embeddings = config.max_position_embeddings
+
+# # Sets the attention factor as suggested in the paper
+# if attention_factor is None:
+# if factor <= 1.0:
+# attention_factor = 1.0
+# else:
+# attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))
+
+# # Compute the inverse frequencies -- scaled based on the target sequence length
+# if seq_len and seq_len > original_max_position_embeddings:
+# ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
+# else:
+# ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
+# inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
+# inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
+
+# return inv_freq, attention_factor
+
+
+# def _compute_llama3_parameters(
+#     config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
+# ) -> Tuple["torch.Tensor", float]:
+#     """
+#     Computes the inverse frequencies for llama 3.1.
+#
+#     Args:
+#         config ([`~transformers.PretrainedConfig`]):
+#             The model configuration.
+#         device (`torch.device`):
+#             The device to use for initialization of the inverse frequencies.
+#         seq_len (`int`, *optional*):
+#             The current sequence length. Unused for this type of RoPE.
+#         rope_kwargs (`Dict`, *optional*):
+#             BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+#     Returns:
+#         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+#         post-processing scaling factor applied to the computed cos/sin.
+#     """
+#     # Gets the default RoPE parameters
+#     inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
+#
+#     factor = config.rope_scaling["factor"]  # `8` in the original implementation
+#     low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
+#     high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
+#     old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation
+#
+#     low_freq_wavelen = old_context_len / low_freq_factor
+#     high_freq_wavelen = old_context_len / high_freq_factor
+#
+#     wavelen = 2 * math.pi / inv_freq
+#     # wavelen < high_freq_wavelen: do nothing
+#     # wavelen > low_freq_wavelen: divide by factor
+#     inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
+#     # otherwise: interpolate between the two, using a smooth factor
+#     smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+#     smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+#     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
+#     inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+#
+#     return inv_freq_llama, attention_factor
+
+
+# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
+# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
+# parameterizations, as long as the callable has the same signature.
+ROPE_INIT_FUNCTIONS = {
+ "default": _compute_default_rope_parameters,
+ # "linear": _compute_linear_scaling_rope_parameters,
+ # "dynamic": _compute_dynamic_ntk_parameters,
+ # "yarn": _compute_yarn_parameters,
+ # "longrope": _compute_longrope_parameters,
+ # "llama3": _compute_llama3_parameters,
+}
+
\ No newline at end of file
diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/transformer/models/llama/modeling_llama.py
index f9850f81..c60f34f5 100644
--- a/front/py/deepx/transformer/models/llama/modeling_llama.py
+++ b/front/py/deepx/transformer/models/llama/modeling_llama.py
@@ -1,7 +1,9 @@
from deepx.nn.modules import Module
from deepx import Tensor,ones,rsqrt
+# RMSNorm
# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+# formula: y = weight * x / sqrt(mean(x^2, dim=-1) + eps)
class LlamaRMSNorm(Module):
def __init__(self, hidden_size, eps=1e-6):
"""
@@ -11,11 +13,80 @@ def __init__(self, hidden_size, eps=1e-6):
self.weight = ones(hidden_size)
self.variance_epsilon = eps
-
+    # compared with the official implementation, prefer in-place ops where possible
def forward(self, hidden_states:Tensor):
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon)
- return self.weight * hidden_states
+ input_clone = hidden_states.clone()
+ input_clone.pow_(2)
+ variance = input_clone.mean([-1], keepdim=True)
+
+ variance.add_(self.variance_epsilon)
+ variance = rsqrt(variance)
+
+ hidden_states.mul_(variance)
+ hidden_states.mul_(self.weight)
+ return hidden_states
def extra_repr(self):
- return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
\ No newline at end of file
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class LlamaRotaryEmbedding(Module):
+ from transformers.models.llama.configuration_llama import LlamaConfig
+ def __init__(self, config: LlamaConfig, device=None):
+ super().__init__()
+ # BC: "rope_type" was originally "type"
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+ self.original_inv_freq = self.inv_freq
+
+ def _dynamic_frequency_update(self, position_ids, device):
+ """
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
+ 1 - growing beyond the cached sequence length (allow scaling)
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+ """
+ seq_len = torch.max(position_ids) + 1
+ if seq_len > self.max_seq_len_cached: # growth
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
+ self.max_seq_len_cached = seq_len
+
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
+ # This .to() is needed if the model has been moved to a device after being initialized (because
+ # the buffer is automatically moved, but not the original copy)
+ self.original_inv_freq = self.original_inv_freq.to(device)
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+ self.max_seq_len_cached = self.original_max_seq_len
+
+ @torch.no_grad()
+ def forward(self, x, position_ids):
+ if "dynamic" in self.rope_type:
+ self._dynamic_frequency_update(position_ids, device=x.device)
+
+ # Core RoPE block
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+ position_ids_expanded = position_ids[:, None, :].float()
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+ device_type = x.device.type
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False):
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
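A small torch-only sanity sketch, not part of the patch, illustrating that the in-place sequence used in the patched LlamaRMSNorm.forward (clone, pow_, mean, add_ eps, rsqrt, two mul_) computes the same result as the reference weight * x * rsqrt(mean(x^2) + eps); function and variable names here are illustrative only.

```python
import torch

def rmsnorm_reference(x, weight, eps=1e-6):
    variance = x.pow(2).mean(-1, keepdim=True)
    return weight * (x * torch.rsqrt(variance + eps))

def rmsnorm_inplace(x, weight, eps=1e-6):
    # mirrors the patched forward: square a clone to get the variance,
    # then scale the input buffer itself
    tmp = x.clone()
    tmp.pow_(2)
    variance = tmp.mean(dim=-1, keepdim=True)
    variance.add_(eps)
    variance.rsqrt_()
    x.mul_(variance)
    x.mul_(weight)
    return x

x = torch.randn(2, 4, 8)
w = torch.ones(8)
ref = rmsnorm_reference(x, w)
out = rmsnorm_inplace(x.clone(), w)  # clone so the reference input stays untouched
assert torch.allclose(ref, out, atol=1e-6)
```

Note the trade-off this makes: the in-place variant writes the normalized result back into its input tensor, so callers must not rely on hidden_states keeping its pre-norm values after the call.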
From a5d555afa9bc7a651bfd8a0ac02d8a9249bf9791 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 04:46:36 +0800
Subject: [PATCH 4/6] llama:rope todo
---
doc/excuter/op-mem-cuda/list.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index e9f62d96..7b94afd6 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -31,7 +31,7 @@
| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() |
-| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
+| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) |
| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) |
| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) |
From 05b6e9dd2c49bd6e75792caa7eaf62b442be7035 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Fri, 18 Apr 2025 15:57:47 +0800
Subject: [PATCH 5/6] invert,rpowscalar:cuda&cpu
---
doc/excuter/op-mem-cuda/list.md | 12 +-
doc/excuter/op-mem-ompsimd/list.md | 4 +-
.../src/deepx/tensorfunc/elementwise.hpp | 84 ++--
excuter/op-mem-cuda/src/client/tfs.cpp | 22 +-
.../src/deepx/tensorfunc/cuda_atomic.cuh | 260 ++++++++++
.../src/deepx/tensorfunc/cuda_math.cuh | 313 +++---------
.../tensorfunc/elementwise_miaobyte_basic.cu | 474 ++++++++++--------
.../tensorfunc/elementwise_miaobyte_basic.cuh | 257 +---------
.../tensorfunc/elementwise_miaobyte_basic.hpp | 49 +-
.../tensorfunc/elementwise_miaobyte_sqrt.cu | 218 +++-----
.../tensorfunc/elementwise_miaobyte_sqrt.cuh | 81 +--
.../tensorfunc/elementwise_miaobyte_sqrt.hpp | 34 +-
.../src/deepx/tensorfunc/reduce_miaobyte.cu | 3 +-
.../src/deepx/tf/elementwise_basic.hpp | 182 +++++--
.../src/deepx/tf/elementwise_sqrt.hpp | 168 +++++--
excuter/op-mem-ompsimd/src/client/tfs.cpp | 20 +
.../deepx/tensorfunc/elementwise_miaobyte.hpp | 43 ++
.../src/deepx/tf/elementwise.hpp | 100 ++++
front/py/deepx/nn/functional/authormap.py | 3 +-
.../nn/functional/leaffunc_elementwise.py | 5 +-
.../py/deepx/nn/functional/rtf_elementwise.py | 4 +
front/py/deepx/tensor/elementwise.py | 7 +
front/py/deepx/tensor/tensor.py | 3 +
.../deepx/transformer/modeling_rope_utils.py | 58 +--
.../py/examples/2_ir/2_elementwise_compare.py | 26 +
.../py/examples/2_ir/2_elementwise_sqrtlog.py | 19 +-
26 files changed, 1318 insertions(+), 1131 deletions(-)
create mode 100644 excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_atomic.cuh
create mode 100644 front/py/examples/2_ir/2_elementwise_compare.py
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index 7b94afd6..691cdca6 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -12,6 +12,9 @@
| matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) |
| comparescalar | miaobyte | comparescalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | comparescalar(tensor A, var scalar)->(tensor mask) |
| compare | miaobyte | compare(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | compare(tensor A, tensor B)->(tensor mask) |
+| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) |
+| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) |
+| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) |
| uniform | miaobyte | uniform(tensor t, var low, var high, var seed)->() | uniform(T1,low,high,seed) | uniform(tensor t, var low, var high, var seed)->() |
| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) |
| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) |
@@ -22,28 +25,27 @@
| add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) |
| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) |
| copytensor | none | copytensor(tensor src, tensor dst)->() | T2.data = T1.data | copytensor(tensor src, tensor dst)->() |
-| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) |
-| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) |
| print | miaobyte | print(tensor )->() | print(T1) | print(tensor )->() |
| print | miaobyte | print(tensor , var )->() | print(T1) | print(tensor , var )->() |
| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) |
| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) |
-| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
-| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
| constant | miaobyte | constant(tensor t, var value)->() | constant(T1) | constant(tensor t, var value)->() |
| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
| vecset | none | vecset(vector value)->(vector name) | shape = [3 4 5] | vecset(vector value)->(vector name) |
| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) |
| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) |
| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) |
+| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
+| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
+| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) |
| sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) |
| sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) |
| argset | none | argset(var value)->(var name) | var argname = argvalue | argset(var value)->(var name) |
| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) |
| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) |
+| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) |
| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) |
| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor