From 2f1cf4e507c7680d90f4d4cef175007b71e05cf4 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Tue, 1 Jul 2025 23:22:34 +0800 Subject: [PATCH 1/4] =?UTF-8?q?py:=201.transformer=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=EF=BC=8C=E6=8C=89=E7=85=A7pytorch=E9=A3=8E=E6=A0=BC=E6=95=B4?= =?UTF-8?q?=E5=90=88=E8=BF=9Bnn.module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- front/py/deepx/nn/functional/changeshape.py | 10 +++++- .../nn/functional/leaffunc_changeshape.py | 6 ---- front/py/deepx/nn/functional/rtf.py | 2 +- .../py/deepx/nn/functional/rtf_changeshape.py | 6 ++-- front/py/deepx/nn/functional/rtf_reduce.py | 8 ++--- front/py/deepx/nn/modules/{ => conv}/conv.py | 0 front/py/deepx/nn/modules/linear.py | 2 +- .../models/llama => nn/modules/mlp}/mlp.py | 2 +- .../modules/norm}/normalization.py | 0 front/py/deepx/nn/modules/normalization.py | 0 .../modules/transformer}/__init__.py | 0 .../modules/transformer}/attention.py | 35 +++++++++++++++++-- .../modules/transformer}/embedding.py | 5 +-- .../transformer}/llama/modeling_llama.py | 2 +- .../transformer/modeling_rope_utils.py | 0 front/py/deepx/tensor/changeshape.py | 11 +++--- front/py/deepx/tensor/shape.py | 2 +- front/py/deepx/tensor/tensor.py | 4 +-- front/py/deepx/transformer/__init__.py | 0 front/py/deepx/transformer/attention.py | 0 front/py/deepx/transformer/decoder.py | 0 front/py/deepx/transformer/models/__init__.py | 0 .../py/examples/2_ir/4_changeshape_concat.py | 2 +- .../py/examples/2_ir/4_changeshape_reshape.py | 2 +- .../examples/2_ir/4_changeshape_transpose.py | 4 +-- front/py/examples/2_ir/5_reduce_prod.py | 4 +-- front/py/examples/2_ir/5_reduce_sum.py | 2 +- .../py/examples/2_ir/5_reduce_sum_keepdim.py | 2 +- .../examples/3_functional/activite_sigmoid.py | 2 +- .../3_functional/changeshape_broadcast.py | 2 +- front/py/examples/3_module/1_embedding.py | 2 +- .../4_transformer/llama/llama_rope.py | 2 +- 32 files changed, 77 insertions(+), 42 deletions(-) rename front/py/deepx/nn/modules/{ => conv}/conv.py (100%) rename front/py/deepx/{transformer/models/llama => nn/modules/mlp}/mlp.py (97%) rename front/py/deepx/{transformer/models/llama => nn/modules/norm}/normalization.py (100%) delete mode 100644 front/py/deepx/nn/modules/normalization.py rename front/py/deepx/{transformer/models/llama => nn/modules/transformer}/__init__.py (100%) rename front/py/deepx/{transformer/models/llama => nn/modules/transformer}/attention.py (81%) rename front/py/deepx/{transformer/models/llama => nn/modules/transformer}/embedding.py (95%) rename front/py/deepx/{transformer/models => nn/modules/transformer}/llama/modeling_llama.py (99%) rename front/py/deepx/{ => nn/modules}/transformer/modeling_rope_utils.py (100%) delete mode 100644 front/py/deepx/transformer/__init__.py delete mode 100644 front/py/deepx/transformer/attention.py delete mode 100644 front/py/deepx/transformer/decoder.py delete mode 100644 front/py/deepx/transformer/models/__init__.py diff --git a/front/py/deepx/nn/functional/changeshape.py b/front/py/deepx/nn/functional/changeshape.py index 5616c458..9e0ee71b 100644 --- a/front/py/deepx/nn/functional/changeshape.py +++ b/front/py/deepx/nn/functional/changeshape.py @@ -1,7 +1,15 @@ from typing import Union from deepx import Tensor -from .leaffunc_changeshape import reshape,indexselect, concat,broadcastTo +from .leaffunc_changeshape import reshape,indexselect, concat,broadcastTo,permute from .leaffunc_init import newtensor,arange + + +def 
transpose(t:Tensor,dim0:int,dim1:int,out:Union[Tensor,str]='')->Tensor: + dimorder = list(range(t.ndim)) + dimorder[dim0],dimorder[dim1]=dimorder[dim1],dimorder[dim0] + return permute(t,tuple(dimorder),out) + + def squeeze(t:Tensor,dim:int)->Tensor: assert isinstance(dim,int) assert isinstance(t,Tensor) diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index 750378cd..4be47c30 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -9,7 +9,6 @@ def reshape(t:Tensor,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor: for i in shape: assert isinstance(i,int) and i>0 - outtensor=out if isinstance(out,str) or out is None: outshape=shape outtensor=newtensor(outshape,dtype=t.dtype,name=out) @@ -40,11 +39,6 @@ def permute(t:Tensor, rtf_transpose(t,dimorder,outtensor,defaultauthor['transpose']) return outtensor -def transpose(t:Tensor,out:Union[Tensor,str]='')->Tensor: - dimorder = list(range(t.ndim)) - dimorder[-1],dimorder[-2]=dimorder[-2],dimorder[-1] - return permute(t,tuple(dimorder),out) - def concat(tensors:Union[list[Tensor],tuple[Tensor,...]],dim:int,out:Union[Tensor,str]='')->Tensor: diff --git a/front/py/deepx/nn/functional/rtf.py b/front/py/deepx/nn/functional/rtf.py index acd99249..4983ad16 100644 --- a/front/py/deepx/nn/functional/rtf.py +++ b/front/py/deepx/nn/functional/rtf.py @@ -37,7 +37,7 @@ def A_op_C(op:str,a:Tensor,out:Tensor,author='miaobyte'): ir=DeepxIR(op, args, returns,author) send(ir) -def A_b1_b2_op_C(op:str,a:Tensor,b1:tuple[int],b2:bool,out:Tensor,author='miaobyte'): +def A_b1_b2_op_C(op:str,a:Tensor,b1:tuple[int,...],b2:bool,out:Tensor,author='miaobyte'): args=[Param.tensor(a),Param.vector(b1,'int32'),Param.varbool(b2)] returns=[Param.tensor(out)] ir=DeepxIR(op, args, returns,author) diff --git a/front/py/deepx/nn/functional/rtf_changeshape.py b/front/py/deepx/nn/functional/rtf_changeshape.py index 7c0b144b..6ad3ab3e 100644 --- a/front/py/deepx/nn/functional/rtf_changeshape.py +++ b/front/py/deepx/nn/functional/rtf_changeshape.py @@ -2,14 +2,14 @@ from deepx.nn.deepxir import DeepxIR,Param from deepx.scheduler import send -def rtf_reshape(t:Tensor,shape:tuple[int],out:Tensor,author='miaobyte'): +def rtf_reshape(t:Tensor,shape:tuple[int,...],out:Tensor,author='miaobyte'): args=[Param.tensor(t),Param.vector(shape,'int32')] returns=[Param.tensor(out)] ir=DeepxIR("reshape", args, returns,author) send(ir) -def rtf_transpose(t:Tensor,dimorder:tuple[int],out:Tensor,author='miaobyte'): +def rtf_transpose(t:Tensor,dimorder:tuple[int,...],out:Tensor,author='miaobyte'): args=[Param.tensor(t),Param.vector(dimorder,'int32')] returns=[Param.tensor(out)] ir=DeepxIR("transpose", args, returns,author) @@ -22,7 +22,7 @@ def rtf_concat(tensors:tuple[Tensor],dim:int,out:Tensor,author='miaobyte'): send(ir) -def rtf_broadcastTo(t:Tensor,new_shape:tuple[int],out:Tensor,author='miaobyte'): +def rtf_broadcastTo(t:Tensor,new_shape:tuple[int,...],out:Tensor,author='miaobyte'): args=[Param.tensor(t),Param.vector(new_shape,'int32')] returns=[Param.tensor(out)] ir=DeepxIR("broadcastTo", args, returns,author) diff --git a/front/py/deepx/nn/functional/rtf_reduce.py b/front/py/deepx/nn/functional/rtf_reduce.py index f089bfab..a9966f41 100644 --- a/front/py/deepx/nn/functional/rtf_reduce.py +++ b/front/py/deepx/nn/functional/rtf_reduce.py @@ -1,18 +1,18 @@ from deepx.tensor import Tensor from .rtf import A_b1_b2_op_C -def 
rtf_sum(a:Tensor,dim:tuple[int],keepdim:bool,out: Tensor, author:str='miaobyte')->Tensor: +def rtf_sum(a:Tensor,dim:tuple[int,...],keepdim:bool,out: Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("sum",a,dim,keepdim,out,author) -def rtf_prod(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: +def rtf_prod(a:Tensor,dim:tuple[int,...],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("prod",a,dim,keepdim,out,author) -def rtf_reducemax(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: +def rtf_reducemax(a:Tensor,dim:tuple[int,...],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("reducemax",a,dim,keepdim,out,author) -def rtf_reducemin(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: +def rtf_reducemin(a:Tensor,dim:tuple[int,...],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("reducemin",a,dim,keepdim,out,author) \ No newline at end of file diff --git a/front/py/deepx/nn/modules/conv.py b/front/py/deepx/nn/modules/conv/conv.py similarity index 100% rename from front/py/deepx/nn/modules/conv.py rename to front/py/deepx/nn/modules/conv/conv.py diff --git a/front/py/deepx/nn/modules/linear.py b/front/py/deepx/nn/modules/linear.py index c4f05194..6c80f93f 100644 --- a/front/py/deepx/nn/modules/linear.py +++ b/front/py/deepx/nn/modules/linear.py @@ -41,7 +41,7 @@ def reset_parameters(self) -> None: def forward(self, input: Tensor) -> Tensor: #`y = xA^T + b` - y=input @ self.weight.T + y=input @ self.weight.mT oldshape=y.shape if self.bias is not None: y.reshape_(tuple(y.shape[1:])) diff --git a/front/py/deepx/transformer/models/llama/mlp.py b/front/py/deepx/nn/modules/mlp/mlp.py similarity index 97% rename from front/py/deepx/transformer/models/llama/mlp.py rename to front/py/deepx/nn/modules/mlp/mlp.py index a35ea3be..eefb5004 100644 --- a/front/py/deepx/transformer/models/llama/mlp.py +++ b/front/py/deepx/nn/modules/mlp/mlp.py @@ -5,7 +5,7 @@ "silu":swish_fn, } -class LlamaMLP(Module): +class MLP(Module): def __init__(self, config:dict): super().__init__() # 输入层大小 diff --git a/front/py/deepx/transformer/models/llama/normalization.py b/front/py/deepx/nn/modules/norm/normalization.py similarity index 100% rename from front/py/deepx/transformer/models/llama/normalization.py rename to front/py/deepx/nn/modules/norm/normalization.py diff --git a/front/py/deepx/nn/modules/normalization.py b/front/py/deepx/nn/modules/normalization.py deleted file mode 100644 index e69de29b..00000000 diff --git a/front/py/deepx/transformer/models/llama/__init__.py b/front/py/deepx/nn/modules/transformer/__init__.py similarity index 100% rename from front/py/deepx/transformer/models/llama/__init__.py rename to front/py/deepx/nn/modules/transformer/__init__.py diff --git a/front/py/deepx/transformer/models/llama/attention.py b/front/py/deepx/nn/modules/transformer/attention.py similarity index 81% rename from front/py/deepx/transformer/models/llama/attention.py rename to front/py/deepx/nn/modules/transformer/attention.py index a028ce49..248a6099 100644 --- a/front/py/deepx/transformer/models/llama/attention.py +++ b/front/py/deepx/nn/modules/transformer/attention.py @@ -4,6 +4,35 @@ from deepx.nn.modules import Module from deepx.utils import Config +def scaled_dot_product( + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Optional[Tensor] = None, + scaling_factor: float = 1.0, + dropout_prob: float = 0.0 +) -> Tuple[Tensor, Tensor]: + # 计算注意力分数 + 
attn_scores = (query @ key.mT) * scaling_factor + + # softmax归一化 + attn_weights = softmax(attn_scores, dim=-1) + + # 应用注意力掩码 + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + causal_mask + + # 可选的dropout + if dropout_prob > 0.0: + attn_weights = dropout_func(attn_weights, p=dropout_prob) + + # 注意力加权值 + attn_output = matmul(attn_weights, value) + + # 恢复原始维度 + attn_output = attn_output.mT + return attn_output, attn_weights def rotate_half(x:Tensor): @@ -29,7 +58,7 @@ def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py # 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 -def eager_attention_forward( +def GQA( module: Module, query: Tensor, key: Tensor, @@ -49,7 +78,7 @@ def eager_attention_forward( attn_weights = softmax(attn_weights, dim=-1, dtype=query.dtype) attn_weights = dropout_func(attn_weights, p=dropout) attn_output = matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.transpose(1, 2) return attn_output, attn_weights @@ -97,7 +126,7 @@ def forward( query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - attn_output, attn_weights =eager_attention_forward( + attn_output, attn_weights =GQA( self, query_states, key_states, diff --git a/front/py/deepx/transformer/models/llama/embedding.py b/front/py/deepx/nn/modules/transformer/embedding.py similarity index 95% rename from front/py/deepx/transformer/models/llama/embedding.py rename to front/py/deepx/nn/modules/transformer/embedding.py index 0ca7ab10..3f3d3db8 100644 --- a/front/py/deepx/transformer/models/llama/embedding.py +++ b/front/py/deepx/nn/modules/transformer/embedding.py @@ -1,7 +1,8 @@ from deepx.nn.modules import Module from deepx import cat -from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS +from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from deepx.utils import Config + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py class LlamaRotaryEmbedding(Module): def __init__(self,config:Config): @@ -46,7 +47,7 @@ def forward(self, x, position_ids): # 计算频率 - freqs = (inv_freq_expanded @ position_ids_expanded).T + freqs = (inv_freq_expanded @ position_ids_expanded).mT # 拼接频率 emb = cat((freqs, freqs), dim=-1) # 计算余弦和正弦 diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py similarity index 99% rename from front/py/deepx/transformer/models/llama/modeling_llama.py rename to front/py/deepx/nn/modules/transformer/llama/modeling_llama.py index cc9cfc8b..e48c9a2b 100644 --- a/front/py/deepx/transformer/models/llama/modeling_llama.py +++ b/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py @@ -1,7 +1,7 @@ from typing import Optional,Tuple from deepx.nn.modules import Module,Linear,Embedding from deepx import Tensor -from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS +from front.py.deepx.nn.modules.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS from deepx.transformer.models.llama.attention import LlamaAttention from deepx.transformer.models.llama.mlp import LlamaMLP from deepx.transformer.models.llama.normalization import LlamaRMSNorm diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/nn/modules/transformer/modeling_rope_utils.py 
similarity index 100% rename from front/py/deepx/transformer/modeling_rope_utils.py rename to front/py/deepx/nn/modules/transformer/modeling_rope_utils.py diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py index ad495557..8fd9868b 100644 --- a/front/py/deepx/tensor/changeshape.py +++ b/front/py/deepx/tensor/changeshape.py @@ -30,16 +30,19 @@ def permute_(self,dimorder:tuple[int,...])->Tensor: return self @tensor_method -def transpose(self,out:Union[Tensor,str]=''): +def transpose(self,dim0:int,dim1:int,out:Union[Tensor,str]=''): assert isinstance(out,str) or isinstance(out,Tensor) + assert isinstance(dim0,int) and isinstance(dim1,int) from deepx.nn.functional import transpose as transpose_func - result=transpose_func(self,out) + result=transpose_func(self,dim0,dim1,out) return result @tensor_method -def transpose_(self): +def transpose_(self,dim0:int,dim1:int): + assert isinstance(dim0,int) and isinstance(dim1,int) + assert isinstance(dim0,int) and isinstance(dim1,int) from deepx.nn.functional import transpose as transpose_func - transpose_func(self,self) + transpose_func(self,dim0,dim1,self) return self # broadcast_to==broadcastTo==expand diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py index f7e92625..bca062d4 100644 --- a/front/py/deepx/tensor/shape.py +++ b/front/py/deepx/tensor/shape.py @@ -129,7 +129,7 @@ def concat(cls,shapes:tuple,dim:int)->tuple[int,...]: return tuple(outshape) @classmethod - def matmul(cls,shape:tuple[int],other:tuple[int])->tuple[int]: + def matmul(cls,shape:tuple[int,...],other:tuple[int,...])->tuple[int,...]: if len(shape)<2 or len(other)<2: raise ValueError(f"matmul: self.ndimension()<2 or other.ndimension()<2") if len(shape)!=len(other): diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index 5a1244bc..97664a34 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -202,8 +202,8 @@ def __getitem__(self, idx): #shape操作 @property - def T(self) -> str: - return self.transpose() + def mT(self) -> str: + return self.transpose(-1,-2) # 打印 @staticmethod diff --git a/front/py/deepx/transformer/__init__.py b/front/py/deepx/transformer/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/front/py/deepx/transformer/attention.py b/front/py/deepx/transformer/attention.py deleted file mode 100644 index e69de29b..00000000 diff --git a/front/py/deepx/transformer/decoder.py b/front/py/deepx/transformer/decoder.py deleted file mode 100644 index e69de29b..00000000 diff --git a/front/py/deepx/transformer/models/__init__.py b/front/py/deepx/transformer/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/front/py/examples/2_ir/4_changeshape_concat.py b/front/py/examples/2_ir/4_changeshape_concat.py index a8dc9bda..c00bbbbc 100644 --- a/front/py/examples/2_ir/4_changeshape_concat.py +++ b/front/py/examples/2_ir/4_changeshape_concat.py @@ -12,7 +12,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,zeros, ones, concat +from deepx import ones, concat t1 = ones( (3,4,5),dtype='float32',name='t1') diff --git a/front/py/examples/2_ir/4_changeshape_reshape.py b/front/py/examples/2_ir/4_changeshape_reshape.py index aec8153d..45c33fca 100644 --- a/front/py/examples/2_ir/4_changeshape_reshape.py +++ b/front/py/examples/2_ir/4_changeshape_reshape.py @@ -12,7 +12,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,zeros, ones, full, arange +from deepx import ones 
t1 = ones((3,4),dtype='float32',name='t1') t1.print() diff --git a/front/py/examples/2_ir/4_changeshape_transpose.py b/front/py/examples/2_ir/4_changeshape_transpose.py index bc1efd08..50dd5535 100644 --- a/front/py/examples/2_ir/4_changeshape_transpose.py +++ b/front/py/examples/2_ir/4_changeshape_transpose.py @@ -18,9 +18,9 @@ t1 = ones((3,4),dtype='float32',name='t1') t1.print() -t2=t1.transpose(out='t2') +t2=t1.transpose(0,1,out='t2') t2.print() t3=ones((2,3,4),dtype='float32',name='t3') -t4=t3.transpose(out='t4') +t4=t3.transpose(1,2,out='t4') t4.print() diff --git a/front/py/examples/2_ir/5_reduce_prod.py b/front/py/examples/2_ir/5_reduce_prod.py index 60676f51..5808bf7e 100644 --- a/front/py/examples/2_ir/5_reduce_prod.py +++ b/front/py/examples/2_ir/5_reduce_prod.py @@ -11,8 +11,8 @@ ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange -from deepx.nn.functional import sum,prod +from deepx import arange +from deepx.nn.functional import prod t=arange(0,60,name='t').reshape_((3,4,5)) t.print() diff --git a/front/py/examples/2_ir/5_reduce_sum.py b/front/py/examples/2_ir/5_reduce_sum.py index 6932f4b2..74e00aa7 100644 --- a/front/py/examples/2_ir/5_reduce_sum.py +++ b/front/py/examples/2_ir/5_reduce_sum.py @@ -16,7 +16,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange +from deepx import arange from deepx.nn.functional import sum,prod t=arange(0,60,name='t').reshape_((3,4,5)) diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.py b/front/py/examples/2_ir/5_reduce_sum_keepdim.py index da4cf110..cc5e696d 100644 --- a/front/py/examples/2_ir/5_reduce_sum_keepdim.py +++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.py @@ -17,7 +17,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange +from deepx import ones,arange from deepx.nn.functional import sum,prod t=arange(0,60,name='t').reshape_((3,4,5)) diff --git a/front/py/examples/3_functional/activite_sigmoid.py b/front/py/examples/3_functional/activite_sigmoid.py index 8859fe4b..8d74050f 100644 --- a/front/py/examples/3_functional/activite_sigmoid.py +++ b/front/py/examples/3_functional/activite_sigmoid.py @@ -16,7 +16,7 @@ print(out_torch) ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange,load +from deepx import load from deepx import sigmoid # 使用相同的初始化方式 diff --git a/front/py/examples/3_functional/changeshape_broadcast.py b/front/py/examples/3_functional/changeshape_broadcast.py index d1b9b927..a3b26c8d 100644 --- a/front/py/examples/3_functional/changeshape_broadcast.py +++ b/front/py/examples/3_functional/changeshape_broadcast.py @@ -11,7 +11,7 @@ #######-----------------deepx-----------------####### -from deepx import Tensor,broadcast_to,arange +from deepx import arange deepx_x = arange(0,6).reshape_((1,2,3)) # shape=(2,3) deepx_y = deepx_x.broadcast_to((3,2,3)) # 需要原维度为1 deepx_y.print() diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py index ff5c1f17..4b8f847f 100644 --- a/front/py/examples/3_module/1_embedding.py +++ b/front/py/examples/3_module/1_embedding.py @@ -18,7 +18,7 @@ def tokenize_text(text, tokenizer): tokens = torch.where(tokens < tokenizer.vocab_size, tokens, torch.tensor(unk_token_id, device=tokens.device)) return tokens -dir="/home/lipeng/model/deepxmodel/embeddingtest/" +dir="/home/lipeng/model/deepx/embeddingtest/" ############-------PyTorch-------################ import torch.nn as nn diff --git 
a/front/py/examples/4_transformer/llama/llama_rope.py b/front/py/examples/4_transformer/llama/llama_rope.py index 33f17daa..ce04ffff 100644 --- a/front/py/examples/4_transformer/llama/llama_rope.py +++ b/front/py/examples/4_transformer/llama/llama_rope.py @@ -4,7 +4,7 @@ ############-------DEEPX-------################ from deepx.nn.modules import Embedding,Module from deepx import load,arange -from deepx.transformer.models.llama import LlamaRotaryEmbedding +from deepx.nn.modules.transformer import LlamaRotaryEmbedding input=load(dir+'input') From 5a41e265a807b5c3b17b6365141b738414efe9c2 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Tue, 1 Jul 2025 23:32:47 +0800 Subject: [PATCH 2/4] =?UTF-8?q?py:=201.transformer=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=EF=BC=8C=E6=8C=89=E7=85=A7pytorch=E9=A3=8E=E6=A0=BC=E6=95=B4?= =?UTF-8?q?=E5=90=88=E8=BF=9Bnn.module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../deepx/nn/modules/transformer/__init__.py | 3 +- .../deepx/nn/modules/transformer/attention.py | 141 ------------------ .../transformer/grouped_query_attention.py | 34 +++++ .../transformer/llama/modeling_llama.py | 82 +++++++++- .../scaled_dot_product_attention.py | 32 ++++ 5 files changed, 144 insertions(+), 148 deletions(-) delete mode 100644 front/py/deepx/nn/modules/transformer/attention.py create mode 100644 front/py/deepx/nn/modules/transformer/grouped_query_attention.py create mode 100644 front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py diff --git a/front/py/deepx/nn/modules/transformer/__init__.py b/front/py/deepx/nn/modules/transformer/__init__.py index 96a73bfc..3859b094 100644 --- a/front/py/deepx/nn/modules/transformer/__init__.py +++ b/front/py/deepx/nn/modules/transformer/__init__.py @@ -1,7 +1,8 @@ from .embedding import * -from .attention import * +from .scaled_dot_product_attention import * __all__ = [ + "scaled_dot_product_attention", "LlamaRotaryEmbedding", "rotate_half" ] \ No newline at end of file diff --git a/front/py/deepx/nn/modules/transformer/attention.py b/front/py/deepx/nn/modules/transformer/attention.py deleted file mode 100644 index 248a6099..00000000 --- a/front/py/deepx/nn/modules/transformer/attention.py +++ /dev/null @@ -1,141 +0,0 @@ -from typing import Optional,Tuple -from deepx import nn -from deepx import Tensor,matmul,softmax,cat,dropout as dropout_func -from deepx.nn.modules import Module -from deepx.utils import Config - -def scaled_dot_product( - query: Tensor, - key: Tensor, - value: Tensor, - attention_mask: Optional[Tensor] = None, - scaling_factor: float = 1.0, - dropout_prob: float = 0.0 -) -> Tuple[Tensor, Tensor]: - # 计算注意力分数 - attn_scores = (query @ key.mT) * scaling_factor - - # softmax归一化 - attn_weights = softmax(attn_scores, dim=-1) - - # 应用注意力掩码 - if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key.shape[-2]] - attn_weights = attn_weights + causal_mask - - # 可选的dropout - if dropout_prob > 0.0: - attn_weights = dropout_func(attn_weights, p=dropout_prob) - - # 注意力加权值 - attn_output = matmul(attn_weights, value) - - # 恢复原始维度 - attn_output = attn_output.mT - return attn_output, attn_weights - - -def rotate_half(x:Tensor): - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return cat((-x2, x1,), dim=-1) - -def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_dim:int=1): - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed 
= (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - -def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py -# 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 - -def GQA( - module: Module, - query: Tensor, - key: Tensor, - value: Tensor, - attention_mask: Optional[Tensor], - scaling: float, - dropout: float = 0.0, -): - key_states = repeat_kv(key, module.num_key_value_groups) - value_states = repeat_kv(value, module.num_key_value_groups) - - attn_weights = matmul(query, key_states.transpose(2, 3)) * scaling - if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - attn_weights = softmax(attn_weights, dim=-1, dtype=query.dtype) - attn_weights = dropout_func(attn_weights, p=dropout) - attn_output = matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2) - - return attn_output, attn_weights - -class LlamaAttention(Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: Config, layer_idx: int): - super().__init__() - self.config = config - self.layer_idx = layer_idx - self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads - self.scaling = self.head_dim**-0.5 - self.attention_dropout = config.attention_dropout - self.is_causal = True - - self.q_proj = nn.Linear( - config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias - ) - self.k_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.v_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.o_proj = nn.Linear( - config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias - ) - - - def forward( - self, - hidden_states: Tensor, - position_embeddings: Tuple[Tensor, Tensor], - attention_mask: Optional[Tensor] - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, self.head_dim) - - query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - - cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - - attn_output, attn_weights =GQA( - self, - query_states, - key_states, - value_states, - attention_mask, - scaling=self.scaling, - dropout=0.0 if not self.training else self.attention_dropout - ) - - attn_output = attn_output.reshape(*input_shape, -1) - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights \ No newline at end of file diff --git a/front/py/deepx/nn/modules/transformer/grouped_query_attention.py b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py new file mode 100644 index 
00000000..338617f9 --- /dev/null +++ b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py @@ -0,0 +1,34 @@ +from typing import Optional +from deepx import Tensor, Module +from .scaled_dot_product_attention import scaled_dot_product + +def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 + + +def grouped_query_attention( + module: Module, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Optional[Tensor], + scaling_factor: float, + dropout_prob: float = 0.0, +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + return scaled_dot_product( + query, key, value, + attention_mask=attention_mask, + scaling_factor=scaling_factor, + dropout_prob=dropout_prob + ) diff --git a/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py b/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py index e48c9a2b..c8aace79 100644 --- a/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py +++ b/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py @@ -1,12 +1,82 @@ from typing import Optional,Tuple from deepx.nn.modules import Module,Linear,Embedding -from deepx import Tensor +from deepx import Tensor,cat from front.py.deepx.nn.modules.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS -from deepx.transformer.models.llama.attention import LlamaAttention -from deepx.transformer.models.llama.mlp import LlamaMLP -from deepx.transformer.models.llama.normalization import LlamaRMSNorm -from deepx.transformer.models.llama.embedding import LlamaRotaryEmbedding - +from deepx.nn.modules.mlp import LlamaMLP +from deepx.nn.modules.norm import LlamaRMSNorm +from deepx.nn.modules.transformer import LlamaRotaryEmbedding + +def rotate_half(x:Tensor): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return cat((-x2, x1,), dim=-1) + +def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_dim:int=1): + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + + +class LlamaAttention(Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + 
self.o_proj = Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + + + def forward( + self, + hidden_states: Tensor, + position_embeddings: Tuple[Tensor, Tensor], + attention_mask: Optional[Tensor] + ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + + attn_output, attn_weights =GQA( + self, + query_states, + key_states, + value_states, + attention_mask, + scaling=self.scaling, + dropout=0.0 if not self.training else self.attention_dropout + ) + + attn_output = attn_output.reshape(*input_shape, -1) + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights class LlamaDecoderLayer(Module): def __init__(self, config:dict, layer_idx: int): diff --git a/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py b/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py new file mode 100644 index 00000000..9ce2ec96 --- /dev/null +++ b/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py @@ -0,0 +1,32 @@ +from typing import Optional,Tuple +from deepx import Tensor,matmul,softmax,dropout + +def scaled_dot_product( + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Optional[Tensor] = None, + scaling_factor: float = 1.0, + dropout_prob: float = 0.0 +) -> Tuple[Tensor, Tensor]: + # 计算注意力分数 + attn_scores = (query @ key.mT) * scaling_factor + + # softmax归一化 + attn_weights = softmax(attn_scores, dim=-1) + + # 应用注意力掩码 + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + causal_mask + + # 可选的dropout + if dropout_prob > 0.0: + attn_weights = dropout(attn_weights, p=dropout_prob) + + # 注意力加权值 + attn_output = matmul(attn_weights, value) + + # 恢复原始维度 + attn_output = attn_output.mT + return attn_output, attn_weights From 41c7430ff560e96f54f3cf79d29bd4a2726cc960 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Tue, 1 Jul 2025 23:36:01 +0800 Subject: [PATCH 3/4] =?UTF-8?q?py:=201.transformer=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=EF=BC=8C=E6=8C=89=E7=85=A7pytorch=E9=A3=8E=E6=A0=BC=E6=95=B4?= =?UTF-8?q?=E5=90=88=E8=BF=9Bnn.module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../deepx/nn/modules/transformer/grouped_query_attention.py | 4 ++-- .../nn/modules/transformer/scaled_dot_product_attention.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/front/py/deepx/nn/modules/transformer/grouped_query_attention.py b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py index 338617f9..5f5057a1 100644 --- a/front/py/deepx/nn/modules/transformer/grouped_query_attention.py +++ b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py @@ -1,6 +1,6 @@ from typing import Optional from deepx import Tensor, Module -from .scaled_dot_product_attention import scaled_dot_product +from .scaled_dot_product_attention import scaled_dot_product_attention def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: batch, num_key_value_heads, slen, head_dim = hidden_states.shape @@ -26,7 
+26,7 @@ def grouped_query_attention( key_states = repeat_kv(key, module.num_key_value_groups) value_states = repeat_kv(value, module.num_key_value_groups) - return scaled_dot_product( + return scaled_dot_product_attention( query, key, value, attention_mask=attention_mask, scaling_factor=scaling_factor, diff --git a/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py b/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py index 9ce2ec96..00a5174f 100644 --- a/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py +++ b/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py @@ -1,7 +1,7 @@ from typing import Optional,Tuple from deepx import Tensor,matmul,softmax,dropout -def scaled_dot_product( +def scaled_dot_product_attention( query: Tensor, key: Tensor, value: Tensor, From 9841af5b09bd28c7bb26eae75173e161931517af Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Tue, 1 Jul 2025 23:49:30 +0800 Subject: [PATCH 4/4] =?UTF-8?q?py:=201.transformer=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=EF=BC=8C=E6=8C=89=E7=85=A7pytorch=E9=A3=8E=E6=A0=BC=E6=95=B4?= =?UTF-8?q?=E5=90=88=E8=BF=9Bnn.module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../deepx/nn/modules/transformer/__init__.py | 2 +- ..._dot_product_attention.py => attention.py} | 24 ++++++++++--------- .../transformer/grouped_query_attention.py | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) rename front/py/deepx/nn/modules/transformer/{scaled_dot_product_attention.py => attention.py} (71%) diff --git a/front/py/deepx/nn/modules/transformer/__init__.py b/front/py/deepx/nn/modules/transformer/__init__.py index 3859b094..bfab8802 100644 --- a/front/py/deepx/nn/modules/transformer/__init__.py +++ b/front/py/deepx/nn/modules/transformer/__init__.py @@ -1,5 +1,5 @@ from .embedding import * -from .scaled_dot_product_attention import * +from .attention import * __all__ = [ "scaled_dot_product_attention", diff --git a/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py b/front/py/deepx/nn/modules/transformer/attention.py similarity index 71% rename from front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py rename to front/py/deepx/nn/modules/transformer/attention.py index 00a5174f..b3377953 100644 --- a/front/py/deepx/nn/modules/transformer/scaled_dot_product_attention.py +++ b/front/py/deepx/nn/modules/transformer/attention.py @@ -9,24 +9,26 @@ def scaled_dot_product_attention( scaling_factor: float = 1.0, dropout_prob: float = 0.0 ) -> Tuple[Tensor, Tensor]: - # 计算注意力分数 + + # 参考论文: https://arxiv.org/abs/1706.03762 (Attention is All You Need) + #1 计算注意力分数 attn_scores = (query @ key.mT) * scaling_factor - # softmax归一化 - attn_weights = softmax(attn_scores, dim=-1) - - # 应用注意力掩码 + #2 应用注意力掩码 if attention_mask is not None: causal_mask = attention_mask[:, :, :, : key.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_scores = attn_scores + causal_mask + + + #3 softmax归一化 + attn_weights = softmax(attn_scores, dim=-1) + - # 可选的dropout + #4 可选的dropout if dropout_prob > 0.0: attn_weights = dropout(attn_weights, p=dropout_prob) - # 注意力加权值 + #5 注意力加权值 attn_output = matmul(attn_weights, value) - - # 恢复原始维度 - attn_output = attn_output.mT + return attn_output, attn_weights diff --git a/front/py/deepx/nn/modules/transformer/grouped_query_attention.py b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py index 5f5057a1..4c8e20da 100644 --- 
a/front/py/deepx/nn/modules/transformer/grouped_query_attention.py +++ b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py @@ -1,6 +1,6 @@ from typing import Optional from deepx import Tensor, Module -from .scaled_dot_product_attention import scaled_dot_product_attention +from .attention import scaled_dot_product_attention def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: batch, num_key_value_heads, slen, head_dim = hidden_states.shape
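
Reviewer note on the attention rework (not part of the patches above): after PATCH 4, scaled_dot_product_attention computes scores, adds the causal mask to the scores, then applies softmax, optional dropout, and the weighted sum, and it no longer takes the trailing .mT of the output. The sketch below is a minimal PyTorch cross-check of that same math, in the spirit of the repo's examples that compare deepx against torch; the names sdpa_reference, out, weights and the tensor shapes are illustrative assumptions, not taken from the patches.

import torch
import torch.nn.functional as F

def sdpa_reference(query, key, value, attention_mask=None,
                   scaling_factor=1.0, dropout_prob=0.0):
    # 1. attention scores: (B, H, Lq, D) @ (B, H, D, Lk) -> (B, H, Lq, Lk)
    attn_scores = (query @ key.transpose(-2, -1)) * scaling_factor  # key.mT in the patch
    # 2. additive mask is applied to the scores, before softmax (PATCH 4 ordering)
    if attention_mask is not None:
        attn_scores = attn_scores + attention_mask[:, :, :, : key.shape[-2]]
    # 3. softmax over the key positions
    attn_weights = F.softmax(attn_scores, dim=-1)
    # 4. optional dropout on the weights
    if dropout_prob > 0.0:
        attn_weights = F.dropout(attn_weights, p=dropout_prob)
    # 5. weighted sum of the values: (B, H, Lq, Lk) @ (B, H, Lk, D) -> (B, H, Lq, D)
    return attn_weights @ value, attn_weights

# illustrative shapes: batch=1, heads=2, seq_len=4, head_dim=8
q = torch.randn(1, 2, 4, 8)
k = torch.randn(1, 2, 4, 8)
v = torch.randn(1, 2, 4, 8)
out, weights = sdpa_reference(q, k, v, scaling_factor=8 ** -0.5)
print(out.shape, weights.shape)  # torch.Size([1, 2, 4, 8]) torch.Size([1, 2, 4, 4])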
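
A matching cross-check for grouped_query_attention, continuing the sketch above (it reuses sdpa_reference). It repeats the key/value heads with a repeat_kv equivalent and feeds the repeated key_states / value_states into the attention step; the posted grouped_query_attention.py computes key_states and value_states but still passes the original key and value through to the attention call, so the sketch reflects the apparent intent rather than the literal code. The names repeat_kv_reference, gqa_reference, the n_rep parameter, and the shapes are illustrative assumptions.

def repeat_kv_reference(hidden_states, n_rep):
    # (B, num_kv_heads, L, D) -> (B, num_kv_heads * n_rep, L, D), mirroring repeat_kv
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

def gqa_reference(query, key, value, attention_mask=None,
                  scaling_factor=1.0, dropout_prob=0.0, n_rep=1):
    # the repeated tensors are the ones handed to the attention step
    key_states = repeat_kv_reference(key, n_rep)
    value_states = repeat_kv_reference(value, n_rep)
    return sdpa_reference(query, key_states, value_states,
                          attention_mask=attention_mask,
                          scaling_factor=scaling_factor,
                          dropout_prob=dropout_prob)

# 4 query heads sharing 2 key/value heads, i.e. n_rep = 2 (illustrative sizes)
q = torch.randn(1, 4, 5, 8)
k = torch.randn(1, 2, 5, 8)
v = torch.randn(1, 2, 5, 8)
out, weights = gqa_reference(q, k, v, scaling_factor=8 ** -0.5, n_rep=2)
print(out.shape, weights.shape)  # torch.Size([1, 4, 5, 8]) torch.Size([1, 4, 5, 5])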