diff --git a/front/py/deepx/nn/functional/changeshape.py b/front/py/deepx/nn/functional/changeshape.py index 5616c458..9e0ee71b 100644 --- a/front/py/deepx/nn/functional/changeshape.py +++ b/front/py/deepx/nn/functional/changeshape.py @@ -1,7 +1,15 @@ from typing import Union from deepx import Tensor -from .leaffunc_changeshape import reshape,indexselect, concat,broadcastTo +from .leaffunc_changeshape import reshape,indexselect, concat,broadcastTo,permute from .leaffunc_init import newtensor,arange + + +def transpose(t:Tensor,dim0:int,dim1:int,out:Union[Tensor,str]='')->Tensor: + dimorder = list(range(t.ndim)) + dimorder[dim0],dimorder[dim1]=dimorder[dim1],dimorder[dim0] + return permute(t,tuple(dimorder),out) + + def squeeze(t:Tensor,dim:int)->Tensor: assert isinstance(dim,int) assert isinstance(t,Tensor) diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index 750378cd..4be47c30 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -9,7 +9,6 @@ def reshape(t:Tensor,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor: for i in shape: assert isinstance(i,int) and i>0 - outtensor=out if isinstance(out,str) or out is None: outshape=shape outtensor=newtensor(outshape,dtype=t.dtype,name=out) @@ -40,11 +39,6 @@ def permute(t:Tensor, rtf_transpose(t,dimorder,outtensor,defaultauthor['transpose']) return outtensor -def transpose(t:Tensor,out:Union[Tensor,str]='')->Tensor: - dimorder = list(range(t.ndim)) - dimorder[-1],dimorder[-2]=dimorder[-2],dimorder[-1] - return permute(t,tuple(dimorder),out) - def concat(tensors:Union[list[Tensor],tuple[Tensor,...]],dim:int,out:Union[Tensor,str]='')->Tensor: diff --git a/front/py/deepx/nn/functional/rtf.py b/front/py/deepx/nn/functional/rtf.py index acd99249..4983ad16 100644 --- a/front/py/deepx/nn/functional/rtf.py +++ b/front/py/deepx/nn/functional/rtf.py @@ -37,7 +37,7 @@ def A_op_C(op:str,a:Tensor,out:Tensor,author='miaobyte'): ir=DeepxIR(op, args, returns,author) send(ir) -def A_b1_b2_op_C(op:str,a:Tensor,b1:tuple[int],b2:bool,out:Tensor,author='miaobyte'): +def A_b1_b2_op_C(op:str,a:Tensor,b1:tuple[int,...],b2:bool,out:Tensor,author='miaobyte'): args=[Param.tensor(a),Param.vector(b1,'int32'),Param.varbool(b2)] returns=[Param.tensor(out)] ir=DeepxIR(op, args, returns,author) diff --git a/front/py/deepx/nn/functional/rtf_changeshape.py b/front/py/deepx/nn/functional/rtf_changeshape.py index 7c0b144b..6ad3ab3e 100644 --- a/front/py/deepx/nn/functional/rtf_changeshape.py +++ b/front/py/deepx/nn/functional/rtf_changeshape.py @@ -2,14 +2,14 @@ from deepx.nn.deepxir import DeepxIR,Param from deepx.scheduler import send -def rtf_reshape(t:Tensor,shape:tuple[int],out:Tensor,author='miaobyte'): +def rtf_reshape(t:Tensor,shape:tuple[int,...],out:Tensor,author='miaobyte'): args=[Param.tensor(t),Param.vector(shape,'int32')] returns=[Param.tensor(out)] ir=DeepxIR("reshape", args, returns,author) send(ir) -def rtf_transpose(t:Tensor,dimorder:tuple[int],out:Tensor,author='miaobyte'): +def rtf_transpose(t:Tensor,dimorder:tuple[int,...],out:Tensor,author='miaobyte'): args=[Param.tensor(t),Param.vector(dimorder,'int32')] returns=[Param.tensor(out)] ir=DeepxIR("transpose", args, returns,author) @@ -22,7 +22,7 @@ def rtf_concat(tensors:tuple[Tensor],dim:int,out:Tensor,author='miaobyte'): send(ir) -def rtf_broadcastTo(t:Tensor,new_shape:tuple[int],out:Tensor,author='miaobyte'): +def 
rtf_broadcastTo(t:Tensor,new_shape:tuple[int,...],out:Tensor,author='miaobyte'): args=[Param.tensor(t),Param.vector(new_shape,'int32')] returns=[Param.tensor(out)] ir=DeepxIR("broadcastTo", args, returns,author) diff --git a/front/py/deepx/nn/functional/rtf_reduce.py b/front/py/deepx/nn/functional/rtf_reduce.py index f089bfab..a9966f41 100644 --- a/front/py/deepx/nn/functional/rtf_reduce.py +++ b/front/py/deepx/nn/functional/rtf_reduce.py @@ -1,18 +1,18 @@ from deepx.tensor import Tensor from .rtf import A_b1_b2_op_C -def rtf_sum(a:Tensor,dim:tuple[int],keepdim:bool,out: Tensor, author:str='miaobyte')->Tensor: +def rtf_sum(a:Tensor,dim:tuple[int,...],keepdim:bool,out: Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("sum",a,dim,keepdim,out,author) -def rtf_prod(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: +def rtf_prod(a:Tensor,dim:tuple[int,...],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("prod",a,dim,keepdim,out,author) -def rtf_reducemax(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: +def rtf_reducemax(a:Tensor,dim:tuple[int,...],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("reducemax",a,dim,keepdim,out,author) -def rtf_reducemin(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: +def rtf_reducemin(a:Tensor,dim:tuple[int,...],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: A_b1_b2_op_C("reducemin",a,dim,keepdim,out,author) \ No newline at end of file diff --git a/front/py/deepx/nn/modules/conv.py b/front/py/deepx/nn/modules/conv/conv.py similarity index 100% rename from front/py/deepx/nn/modules/conv.py rename to front/py/deepx/nn/modules/conv/conv.py diff --git a/front/py/deepx/nn/modules/linear.py b/front/py/deepx/nn/modules/linear.py index c4f05194..6c80f93f 100644 --- a/front/py/deepx/nn/modules/linear.py +++ b/front/py/deepx/nn/modules/linear.py @@ -41,7 +41,7 @@ def reset_parameters(self) -> None: def forward(self, input: Tensor) -> Tensor: #`y = xA^T + b` - y=input @ self.weight.T + y=input @ self.weight.mT oldshape=y.shape if self.bias is not None: y.reshape_(tuple(y.shape[1:])) diff --git a/front/py/deepx/transformer/models/llama/mlp.py b/front/py/deepx/nn/modules/mlp/mlp.py similarity index 97% rename from front/py/deepx/transformer/models/llama/mlp.py rename to front/py/deepx/nn/modules/mlp/mlp.py index a35ea3be..eefb5004 100644 --- a/front/py/deepx/transformer/models/llama/mlp.py +++ b/front/py/deepx/nn/modules/mlp/mlp.py @@ -5,7 +5,7 @@ "silu":swish_fn, } -class LlamaMLP(Module): +class MLP(Module): def __init__(self, config:dict): super().__init__() # 输入层大小 diff --git a/front/py/deepx/transformer/models/llama/normalization.py b/front/py/deepx/nn/modules/norm/normalization.py similarity index 100% rename from front/py/deepx/transformer/models/llama/normalization.py rename to front/py/deepx/nn/modules/norm/normalization.py diff --git a/front/py/deepx/nn/modules/normalization.py b/front/py/deepx/nn/modules/normalization.py deleted file mode 100644 index e69de29b..00000000 diff --git a/front/py/deepx/transformer/models/llama/__init__.py b/front/py/deepx/nn/modules/transformer/__init__.py similarity index 75% rename from front/py/deepx/transformer/models/llama/__init__.py rename to front/py/deepx/nn/modules/transformer/__init__.py index 96a73bfc..bfab8802 100644 --- a/front/py/deepx/transformer/models/llama/__init__.py +++ b/front/py/deepx/nn/modules/transformer/__init__.py @@ -2,6 +2,7 @@ from .attention import * 
 __all__ = [
+    "scaled_dot_product_attention",
     "LlamaRotaryEmbedding",
     "rotate_half"
 ]
\ No newline at end of file
diff --git a/front/py/deepx/nn/modules/transformer/attention.py b/front/py/deepx/nn/modules/transformer/attention.py
new file mode 100644
index 00000000..b3377953
--- /dev/null
+++ b/front/py/deepx/nn/modules/transformer/attention.py
@@ -0,0 +1,34 @@
+from typing import Optional,Tuple
+from deepx import Tensor,matmul,softmax,dropout
+
+def scaled_dot_product_attention(
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    attention_mask: Optional[Tensor] = None,
+    scaling_factor: float = 1.0,
+    dropout_prob: float = 0.0
+) -> Tuple[Tensor, Tensor]:
+
+    # Reference: https://arxiv.org/abs/1706.03762 (Attention Is All You Need)
+    # 1. compute the attention scores
+    attn_scores = (query @ key.mT) * scaling_factor
+
+    # 2. apply the attention mask
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_scores = attn_scores + causal_mask
+
+
+    # 3. softmax normalization
+    attn_weights = softmax(attn_scores, dim=-1)
+
+
+    # 4. optional dropout
+    if dropout_prob > 0.0:
+        attn_weights = dropout(attn_weights, p=dropout_prob)
+
+    # 5. weight the values by the attention weights
+    attn_output = matmul(attn_weights, value)
+
+    return attn_output, attn_weights
diff --git a/front/py/deepx/transformer/models/llama/embedding.py b/front/py/deepx/nn/modules/transformer/embedding.py
similarity index 95%
rename from front/py/deepx/transformer/models/llama/embedding.py
rename to front/py/deepx/nn/modules/transformer/embedding.py
index 0ca7ab10..3f3d3db8 100644
--- a/front/py/deepx/transformer/models/llama/embedding.py
+++ b/front/py/deepx/nn/modules/transformer/embedding.py
@@ -1,7 +1,8 @@
 from deepx.nn.modules import Module
 from deepx import cat
-from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from deepx.utils import Config
+
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
 class LlamaRotaryEmbedding(Module):
     def __init__(self,config:Config):
@@ -46,7 +47,7 @@ def forward(self, x, position_ids):
         # 计算频率
-        freqs = (inv_freq_expanded @ position_ids_expanded).T
+        freqs = (inv_freq_expanded @ position_ids_expanded).mT
         # 拼接频率
         emb = cat((freqs, freqs), dim=-1)
         # 计算余弦和正弦
diff --git a/front/py/deepx/nn/modules/transformer/grouped_query_attention.py b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py
new file mode 100644
index 00000000..4c8e20da
--- /dev/null
+++ b/front/py/deepx/nn/modules/transformer/grouped_query_attention.py
@@ -0,0 +1,35 @@
+from typing import Optional
+from deepx import Tensor
+from deepx.nn.modules import Module
+from .attention import scaled_dot_product_attention
+
+def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor:
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+# Simplified: the distributed setup and the per-attention backend config are removed; the IR layer substitutes flash attention automatically and downstream components handle the rest.
+
+
+def grouped_query_attention(
+    module: Module,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    attention_mask: Optional[Tensor],
+    scaling_factor: float,
+    dropout_prob: float = 0.0,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    return scaled_dot_product_attention(
+        query, key_states, value_states,
+        attention_mask=attention_mask,
+        scaling_factor=scaling_factor,
+        dropout_prob=dropout_prob
+    )
diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py
similarity index 80%
rename from front/py/deepx/transformer/models/llama/modeling_llama.py
rename to front/py/deepx/nn/modules/transformer/llama/modeling_llama.py
index cc9cfc8b..c8aace79 100644
--- a/front/py/deepx/transformer/models/llama/modeling_llama.py
+++ b/front/py/deepx/nn/modules/transformer/llama/modeling_llama.py
@@ -1,12 +1,85 @@
 from typing import Optional,Tuple
 from deepx.nn.modules import Module,Linear,Embedding
-from deepx import Tensor
-from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS
-from deepx.transformer.models.llama.attention import LlamaAttention
-from deepx.transformer.models.llama.mlp import LlamaMLP
-from deepx.transformer.models.llama.normalization import LlamaRMSNorm
-from deepx.transformer.models.llama.embedding import LlamaRotaryEmbedding
-
+from deepx import Tensor,cat
+from deepx.utils import Config
+from deepx.nn.modules.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from deepx.nn.modules.transformer.grouped_query_attention import grouped_query_attention
+from deepx.nn.modules.mlp import MLP as LlamaMLP
+from deepx.nn.modules.norm import LlamaRMSNorm
+from deepx.nn.modules.transformer import LlamaRotaryEmbedding
+
+def rotate_half(x:Tensor):
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return cat((-x2, x1,), dim=-1)
+
+def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_dim:int=1):
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+
+class LlamaAttention(Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        position_embeddings: Tuple[Tensor, Tensor],
+        attention_mask: Optional[Tensor]
+    ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+
+        attn_output, attn_weights = grouped_query_attention(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            scaling_factor=self.scaling,
+            dropout_prob=0.0 if not self.training else self.attention_dropout
+        )
+
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(*input_shape, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
 
 class LlamaDecoderLayer(Module):
     def __init__(self, config:dict, layer_idx: int):
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/nn/modules/transformer/modeling_rope_utils.py
similarity index 100%
rename from front/py/deepx/transformer/modeling_rope_utils.py
rename to front/py/deepx/nn/modules/transformer/modeling_rope_utils.py
diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py
index ad495557..8fd9868b 100644
--- a/front/py/deepx/tensor/changeshape.py
+++ b/front/py/deepx/tensor/changeshape.py
@@ -30,16 +30,18 @@ def permute_(self,dimorder:tuple[int,...])->Tensor:
     return self
 
 @tensor_method
-def transpose(self,out:Union[Tensor,str]=''):
+def transpose(self,dim0:int,dim1:int,out:Union[Tensor,str]=''):
     assert isinstance(out,str) or isinstance(out,Tensor)
+    assert isinstance(dim0,int) and isinstance(dim1,int)
     from deepx.nn.functional import transpose as transpose_func
-    result=transpose_func(self,out)
+    result=transpose_func(self,dim0,dim1,out)
     return result
 
 @tensor_method
-def transpose_(self):
+def transpose_(self,dim0:int,dim1:int):
+    assert isinstance(dim0,int) and isinstance(dim1,int)
     from deepx.nn.functional import transpose as transpose_func
-    transpose_func(self,self)
+    transpose_func(self,dim0,dim1,self)
     return self
 
 # broadcast_to==broadcastTo==expand
diff --git a/front/py/deepx/tensor/shape.py b/front/py/deepx/tensor/shape.py
index f7e92625..bca062d4 100644
--- a/front/py/deepx/tensor/shape.py
+++ b/front/py/deepx/tensor/shape.py
@@ -129,7 +129,7 @@ def concat(cls,shapes:tuple,dim:int)->tuple[int,...]:
         return tuple(outshape)
 
     @classmethod
-    def matmul(cls,shape:tuple[int],other:tuple[int])->tuple[int]:
+    def matmul(cls,shape:tuple[int,...],other:tuple[int,...])->tuple[int,...]:
         if len(shape)<2 or len(other)<2:
             raise ValueError(f"matmul: self.ndimension()<2 or other.ndimension()<2")
         if len(shape)!=len(other):
diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py
index 5a1244bc..97664a34 100644
--- a/front/py/deepx/tensor/tensor.py
+++ b/front/py/deepx/tensor/tensor.py
@@ -202,8 +202,8 @@ def __getitem__(self, idx):
 
     #shape操作
     @property
-    def T(self) -> str:
-        return self.transpose()
+    def mT(self) -> 'Tensor':
+        return self.transpose(-1,-2)
 
     # 打印
     @staticmethod
diff --git a/front/py/deepx/transformer/__init__.py b/front/py/deepx/transformer/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/transformer/attention.py b/front/py/deepx/transformer/attention.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/transformer/decoder.py b/front/py/deepx/transformer/decoder.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/transformer/models/__init__.py b/front/py/deepx/transformer/models/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/transformer/models/llama/attention.py b/front/py/deepx/transformer/models/llama/attention.py
deleted file mode 100644
index a028ce49..00000000
--- a/front/py/deepx/transformer/models/llama/attention.py
+++ /dev/null
@@ -1,112 +0,0 @@
-from typing import Optional,Tuple
-from deepx import nn
-from deepx import Tensor,matmul,softmax,cat,dropout as 
dropout_func -from deepx.nn.modules import Module -from deepx.utils import Config - - - -def rotate_half(x:Tensor): - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return cat((-x2, x1,), dim=-1) - -def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_dim:int=1): - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - -def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py -# 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 - -def eager_attention_forward( - module: Module, - query: Tensor, - key: Tensor, - value: Tensor, - attention_mask: Optional[Tensor], - scaling: float, - dropout: float = 0.0, -): - key_states = repeat_kv(key, module.num_key_value_groups) - value_states = repeat_kv(value, module.num_key_value_groups) - - attn_weights = matmul(query, key_states.transpose(2, 3)) * scaling - if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - attn_weights = softmax(attn_weights, dim=-1, dtype=query.dtype) - attn_weights = dropout_func(attn_weights, p=dropout) - attn_output = matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - - return attn_output, attn_weights - -class LlamaAttention(Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: Config, layer_idx: int): - super().__init__() - self.config = config - self.layer_idx = layer_idx - self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads - self.scaling = self.head_dim**-0.5 - self.attention_dropout = config.attention_dropout - self.is_causal = True - - self.q_proj = nn.Linear( - config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias - ) - self.k_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.v_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.o_proj = nn.Linear( - config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias - ) - - - def forward( - self, - hidden_states: Tensor, - position_embeddings: Tuple[Tensor, Tensor], - attention_mask: Optional[Tensor] - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, self.head_dim) - - query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - - cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - - attn_output, attn_weights =eager_attention_forward( 
- self, - query_states, - key_states, - value_states, - attention_mask, - scaling=self.scaling, - dropout=0.0 if not self.training else self.attention_dropout - ) - - attn_output = attn_output.reshape(*input_shape, -1) - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights \ No newline at end of file diff --git a/front/py/examples/2_ir/4_changeshape_concat.py b/front/py/examples/2_ir/4_changeshape_concat.py index a8dc9bda..c00bbbbc 100644 --- a/front/py/examples/2_ir/4_changeshape_concat.py +++ b/front/py/examples/2_ir/4_changeshape_concat.py @@ -12,7 +12,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,zeros, ones, concat +from deepx import ones, concat t1 = ones( (3,4,5),dtype='float32',name='t1') diff --git a/front/py/examples/2_ir/4_changeshape_reshape.py b/front/py/examples/2_ir/4_changeshape_reshape.py index aec8153d..45c33fca 100644 --- a/front/py/examples/2_ir/4_changeshape_reshape.py +++ b/front/py/examples/2_ir/4_changeshape_reshape.py @@ -12,7 +12,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,zeros, ones, full, arange +from deepx import ones t1 = ones((3,4),dtype='float32',name='t1') t1.print() diff --git a/front/py/examples/2_ir/4_changeshape_transpose.py b/front/py/examples/2_ir/4_changeshape_transpose.py index bc1efd08..50dd5535 100644 --- a/front/py/examples/2_ir/4_changeshape_transpose.py +++ b/front/py/examples/2_ir/4_changeshape_transpose.py @@ -18,9 +18,9 @@ t1 = ones((3,4),dtype='float32',name='t1') t1.print() -t2=t1.transpose(out='t2') +t2=t1.transpose(0,1,out='t2') t2.print() t3=ones((2,3,4),dtype='float32',name='t3') -t4=t3.transpose(out='t4') +t4=t3.transpose(1,2,out='t4') t4.print() diff --git a/front/py/examples/2_ir/5_reduce_prod.py b/front/py/examples/2_ir/5_reduce_prod.py index 60676f51..5808bf7e 100644 --- a/front/py/examples/2_ir/5_reduce_prod.py +++ b/front/py/examples/2_ir/5_reduce_prod.py @@ -11,8 +11,8 @@ ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange -from deepx.nn.functional import sum,prod +from deepx import arange +from deepx.nn.functional import prod t=arange(0,60,name='t').reshape_((3,4,5)) t.print() diff --git a/front/py/examples/2_ir/5_reduce_sum.py b/front/py/examples/2_ir/5_reduce_sum.py index 6932f4b2..74e00aa7 100644 --- a/front/py/examples/2_ir/5_reduce_sum.py +++ b/front/py/examples/2_ir/5_reduce_sum.py @@ -16,7 +16,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange +from deepx import arange from deepx.nn.functional import sum,prod t=arange(0,60,name='t').reshape_((3,4,5)) diff --git a/front/py/examples/2_ir/5_reduce_sum_keepdim.py b/front/py/examples/2_ir/5_reduce_sum_keepdim.py index da4cf110..cc5e696d 100644 --- a/front/py/examples/2_ir/5_reduce_sum_keepdim.py +++ b/front/py/examples/2_ir/5_reduce_sum_keepdim.py @@ -17,7 +17,7 @@ ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange +from deepx import ones,arange from deepx.nn.functional import sum,prod t=arange(0,60,name='t').reshape_((3,4,5)) diff --git a/front/py/examples/3_functional/activite_sigmoid.py b/front/py/examples/3_functional/activite_sigmoid.py index 8859fe4b..8d74050f 100644 --- a/front/py/examples/3_functional/activite_sigmoid.py +++ b/front/py/examples/3_functional/activite_sigmoid.py @@ -16,7 +16,7 @@ print(out_torch) ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange,load +from deepx import load from deepx import 
sigmoid # 使用相同的初始化方式 diff --git a/front/py/examples/3_functional/changeshape_broadcast.py b/front/py/examples/3_functional/changeshape_broadcast.py index d1b9b927..a3b26c8d 100644 --- a/front/py/examples/3_functional/changeshape_broadcast.py +++ b/front/py/examples/3_functional/changeshape_broadcast.py @@ -11,7 +11,7 @@ #######-----------------deepx-----------------####### -from deepx import Tensor,broadcast_to,arange +from deepx import arange deepx_x = arange(0,6).reshape_((1,2,3)) # shape=(2,3) deepx_y = deepx_x.broadcast_to((3,2,3)) # 需要原维度为1 deepx_y.print() diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py index ff5c1f17..4b8f847f 100644 --- a/front/py/examples/3_module/1_embedding.py +++ b/front/py/examples/3_module/1_embedding.py @@ -18,7 +18,7 @@ def tokenize_text(text, tokenizer): tokens = torch.where(tokens < tokenizer.vocab_size, tokens, torch.tensor(unk_token_id, device=tokens.device)) return tokens -dir="/home/lipeng/model/deepxmodel/embeddingtest/" +dir="/home/lipeng/model/deepx/embeddingtest/" ############-------PyTorch-------################ import torch.nn as nn diff --git a/front/py/examples/4_transformer/llama/llama_rope.py b/front/py/examples/4_transformer/llama/llama_rope.py index 33f17daa..ce04ffff 100644 --- a/front/py/examples/4_transformer/llama/llama_rope.py +++ b/front/py/examples/4_transformer/llama/llama_rope.py @@ -4,7 +4,7 @@ ############-------DEEPX-------################ from deepx.nn.modules import Embedding,Module from deepx import load,arange -from deepx.transformer.models.llama import LlamaRotaryEmbedding +from deepx.nn.modules.transformer import LlamaRotaryEmbedding input=load(dir+'input')
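
Usage note (not part of the patch): below is a minimal sketch of the reworked transpose API this diff introduces, assuming the deepx Python front-end and a scheduler backend are set up as in the existing scripts under front/py/examples/; the tensor names t/t2/t3 are illustrative only.

```python
# Minimal sketch: the dim-pair transpose() and the new .mT property from this diff.
# Assumes deepx is importable and a backend is running, as in
# front/py/examples/2_ir/4_changeshape_transpose.py.
from deepx import ones

t = ones((2, 3, 4), dtype='float32', name='t')

# transpose() now takes the two dims to swap, mirroring torch.Tensor.transpose.
t2 = t.transpose(1, 2, out='t2')   # shape (2, 4, 3)
t2.print()

# .mT replaces the old .T property and swaps only the last two dims,
# so expressions like `x @ w.mT` (see linear.py above) stay rank-agnostic.
t3 = t.mT                          # equivalent to t.transpose(-1, -2)
t3.print()
```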