diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index f25b6973..f487b09c 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -65,7 +65,7 @@
 | cos | miaobyte | T3=cos(T1) | cos(tensor A)->(tensor C) |
 | notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor A, var scalar, var epsilon)->(tensor mask) |
 | minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
-| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) |
+| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) |
 | rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
 | less | miaobyte | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) |
 | powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
diff --git a/excuter/op-mem-cuda/src/client/main.cpp b/excuter/op-mem-cuda/src/client/main.cpp
index b3cd40d6..b00a284a 100644
--- a/excuter/op-mem-cuda/src/client/main.cpp
+++ b/excuter/op-mem-cuda/src/client/main.cpp
@@ -28,8 +28,6 @@ int main()
     deepx::tf::TfFactory tf_factory;
     register_all(tf_factory);
-
-    // dump the op table to a markdown file
     string docdir = "../../../doc/excuter/op-mem-cuda/";
     std::ofstream md_file(docdir + "list.md");
@@ -68,13 +66,30 @@ int main()
             opresp.error("op" + op.name + " not found");
             server.resp(opresp.to_string());
-            cerr << opresp.to_string() << endl;
             continue;
         }
         (*src).init(op.name, op.args, op.returns);
         memmutex.lock();
         opresp.start_at = chrono::system_clock::now();
-        int ret = (*src).run(mem, opresp.message);
+        int ret = 0;
+        if ((*src).metadata.benchmark.repeat > 1)
+        {
+            for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
+            {
+                ret = (*src).run(mem, opresp.message);
+                if (ret != 0)
+                {
+                    break;
+                }
+            }
+        }
+        else
+        {
+            ret = (*src).run(mem, opresp.message);
+        }
+
         memmutex.unlock();
         if (ret != 0)
         {
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index fa6cdcc6..0f64b87b 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -317,7 +317,7 @@ namespace deepx::tf
     // rpowscalar
     tffactory.add_tf(std::make_shared<Rpowscalar<miaobyte>>(vector<Param>(
         {
-            Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
+            Param("scalar", DataCategory::Var, Precision::Float32 | Precision::Int32),
             Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
         }),
         vector<Param>(
diff --git a/excuter/op-mem-ompsimd/src/client/main.cpp b/excuter/op-mem-ompsimd/src/client/main.cpp
index 96ca39bb..15004184 100644
--- a/excuter/op-mem-ompsimd/src/client/main.cpp
+++ b/excuter/op-mem-ompsimd/src/client/main.cpp
@@ -28,7 +28,7 @@ int main()
     client::udpserver server(8080);
     deepx::tf::TfFactory tf_factory;
     register_all(tf_factory);
-    
+
     // dump the op table to a markdown file
     string docdir = "../../../doc/excuter/op-mem-ompsimd/";
     std::ofstream md_file(docdir + "list.md");
@@ -72,14 +72,28 @@ int main()
             (*src).init(op.name, op.args, op.returns);
             memmutex.lock();
             opresp.start_at = chrono::system_clock::now();
-
-            int ret = (*src).run(mem,opresp.message);
+            int ret = 0;
+            if ((*src).metadata.benchmark.repeat > 1)
+            {
+                for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
+                {
+                    ret = (*src).run(mem, opresp.message);
+                    if (ret != 0)
+                    {
+                        break;
+                    }
+                }
+            }
+            else
+            {
+                ret = (*src).run(mem, opresp.message);
+            }
             memmutex.unlock();
             if (ret != 0)
             {
                 opresp.error(opresp.message);
                 server.resp(opresp.to_string());
-                cerr << opresp.to_string() << endl;
                 continue;
             }
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
--- a/excuter/op-mem-ompsimd/src/client/tfs.cpp
+++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ ... @@ namespace deepx::tf
     // rpowscalar
     tffactory.add_tf(std::make_shared<Rpowscalar<miaobyte>>(vector<Param>(
         {
-            Param("scalar", DataCategory::Var, Precision::Any),
+            Param("scalar", DataCategory::Var, Precision::Float32),
             Param("A", DataCategory::Tensor, Precision::Any),
         }),
         vector<Param>(
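Both `main.cpp` changes above install the same benchmark contract: when `metadata.benchmark.repeat` is greater than 1, the executor runs the op that many times inside a single request and bails out on the first non-zero return, all while holding `memmutex`, so one timed request measures repeated kernel cost without per-iteration lock or UDP overhead. A minimal sketch of that control flow in Python (`op.run()` is illustrative here, not the executor's actual API):

```python
def run_with_benchmark(op, repeat: int) -> int:
    """Run `op` once, or `repeat` times when benchmarking; stop on the first error."""
    ret = 0
    if repeat > 1:
        for _ in range(repeat):
            ret = op.run()      # non-zero return signals failure
            if ret != 0:
                break           # abort the benchmark early
    else:
        ret = op.run()
    return ret
```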
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
index 4e3d26ad..ac39f38c 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
@@ -15,24 +15,29 @@ namespace deepx::tensorfunc
             throw std::invalid_argument("A.shape could matmul with B.shape");
         }
         //TODO
-        // for a plain 2-D matmul the omp parallelism never kicks in here, because C.shape.dim() - 2 is exactly 0
-        C.shape.rangeParallel(C.shape.dim() - 2, [&](const std::vector<int> &indices)
-                              {
-            int aIdx=A.shape.linearat(indices);
-            int bIdx=B.shape.linearat(indices);
-            int cIdx=C.shape.linearat(indices);
-            int m=A.shape[-2];
-            int k=A.shape[-1];
-            int n=B.shape[-1];
-            for(int i=0;i<m;i++)
-            {
-                for(int j=0;j<n;j++)
-                {
-                    T sum=0;
-                    for(int l=0;l<k;l++)
-                    {
-                        sum+=A.data[aIdx+i*k+l]*B.data[bIdx+l*n+j];
-                    }
-                    C.data[cIdx+i*n+j]=sum;
-                }
-            } });
+        C.shape.rangeParallel(C.shape.dim(), [&](const std::vector<int> &indices,ThreadLocalVectors &tlv) {
+
+            // int m=A.shape[-2];
+            int k=A.shape[-1];
+            // int n=B.shape[-1];
+
+            std::copy(indices.begin(),indices.end()-2,tlv.get(0).begin());
+            tlv.get(0)[indices.size()-2]=A.shape[-2];
+            tlv.get(0)[indices.size()-1]=indices[-1];
+            int aIdx=A.shape.linearat(tlv.get(0));
+            std::copy(indices.begin(),indices.end()-2,tlv.get(1).begin());
+            tlv.get(1)[indices.size()-2]=0;
+            tlv.get(1)[indices.size()-1]=indices[-2];
+            int bIdx=B.shape.linearat(tlv.get(1));
+            int bstride=k;
+
+            T sum=0;
+            for(int l=0;l<k;l++)
+            {
+                sum+=A.data[aIdx+l]*B.data[bIdx+l*bstride];
+            }
+            C.data[C.shape.linearat(indices)]=sum;
+        });
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
@@ ... @@ namespace deepx::tf
             this->args = args;
             this->returns = returns;
         }
-        
+
         string math_formula() const override
         {
             return "T3=T1 @ T2";
         }
@@ -30,7 +30,17 @@
         {
             return make_shared<Matmul<T>>(*this);
         }
-        int compute(shared_ptr<MemBase> mem, Precision a_type,string &error){
+
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != b_type || a_type != c_type)
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
+                return 1;
+            }
             switch (a_type)
             {
             case Precision::Float64:
@@ -57,30 +67,6 @@
             }
             return 0;
         }
-        int run(shared_ptr<MemBase> mem, string &error) override
-        {
-            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
-            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
-            if (a_type != b_type || a_type != c_type)
-            {
-                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
-                return 1;
-            }
-            if (metadata.benchmark.repeat > 0)
-            {
-                for (int i = 0; i < metadata.benchmark.repeat; i++)
-                {
-                    if (compute(mem, a_type, error))
-                    {
-                        return 1;
-                    }
-                }
-            }else{
-                return compute(mem, a_type, error);
-            }
-            return 0;
-        }
     };
 }
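The rewritten kernel in `matmul_miaobyte.hpp` moves parallelism from the batch dimensions down to individual output elements, so a plain 2-D matmul (where `C.shape.dim() - 2 == 0`) still fans out across threads; the thread-local vectors only exist to reuse coordinate buffers instead of reallocating them per element. A NumPy reference of the per-element task, assuming the new `rangeParallel` visits every coordinate of `C` (`matmul_ref` is our name, not deepx's):

```python
import numpy as np

def matmul_ref(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """One dot product per output coordinate, mirroring the per-element kernel."""
    m, n = A.shape[-2], B.shape[-1]
    C = np.zeros(A.shape[:-2] + (m, n), dtype=A.dtype)
    for idx in np.ndindex(C.shape):            # every (batch..., i, j) coordinate
        batch, i, j = idx[:-2], idx[-2], idx[-1]
        row = A[batch + (i, slice(None))]      # A[batch..., i, :]
        col = B[batch + (slice(None), j)]      # B[batch..., :, j]
        C[idx] = np.dot(row, col)
    return C
```

`np.allclose(matmul_ref(A, B), A @ B)` holds for any stack of matrices with matching inner dimensions.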
diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
index 2c5d517b..d8ac8f44 100644
--- a/front/py/deepx/nn/functional/__init__.py
+++ b/front/py/deepx/nn/functional/__init__.py
@@ -25,7 +25,7 @@
     "sqrt","pow","exp","log",
     "min","max",
     "less","greater","equal","notequal",
-    "switch",
+    "switch","where",
     "todtype",
     "invert",
     "matmul",
diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py
index 9b502473..52a6dc52 100644
--- a/front/py/deepx/nn/functional/elementwise.py
+++ b/front/py/deepx/nn/functional/elementwise.py
@@ -54,3 +54,7 @@ def bool(input:Tensor)->Tensor:
     from .leaffunc_elementwise import todtype
     dest=newtensor(input.shape,dtype='bool',name=input.name)
     return todtype(input,dest)
+
+def where(condition:Tensor,x:Tensor,y:Tensor)->Tensor:
+    from .leaffunc_elementwise import switch_func
+    return switch_func((x,y),condition)
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py b/front/py/deepx/nn/functional/leaffunc_matmul.py
index acdcefd4..fadb0175 100644
--- a/front/py/deepx/nn/functional/leaffunc_matmul.py
+++ b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -4,7 +4,7 @@ from .leaffunc_life import newtensor
 from .authormap import defaultauthor
 
-def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:tuple[int,int]=None)->Tensor:
+def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor:
     outtensor=out
     if isinstance(out,str) or out is None:
         outshape=Shape.matmul(a.shape,b.shape)
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index a25b0963..6a12c26a 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@ import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # parse the endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
index 0554590c..3ee5255b 100644
--- a/front/py/deepx/transformer/modeling_rope_utils.py
+++ b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -1,19 +1,21 @@
 from typing import Tuple
 import math
-from deepx import arange,Tensor
+from deepx import arange,Tensor,where
 
 def _compute_default_rope_parameters(config:dict={
-        "base":10000.0,
+        "rope_theta":10000.0,
         "head_dim":0,
         "partial_rotary_factor":1.0,
     }) -> Tuple[Tensor, float]:
-    dim = config.head_dim* config.partial_rotary_factor
+    partial_rotary_factor = config.get("partial_rotary_factor", 1.0)
+    dim = config["head_dim"]* partial_rotary_factor
     # compute the inverse frequencies
-    inv_freq = 1.0 / (config.base ** (arange(0, dim, 2, dtype='float64')/ dim))
+    base=config["rope_theta"]
+    inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
     return inv_freq, 1.0
 
 def _compute_llama3_parameters(config:dict={
-        "base":10000.0,
+        "rope_theta":10000.0,
         "head_dim":0,
         "partial_rotary_factor":1.0,
         "factor":8,
@@ -25,18 +27,22 @@ def _compute_llama3_parameters(config:dict={
     # Gets the default RoPE parameters
     inv_freq, attention_factor = _compute_default_rope_parameters(config)
 
-    low_freq_wavelen = config.old_context_len / config.low_freq_factor
-    high_freq_wavelen = config.old_context_len / config.high_freq_factor
+    low_freq_factor = config["rope_scaling"]["low_freq_factor"]  # `1` in the original implementation
+    high_freq_factor = config["rope_scaling"]["high_freq_factor"]  # `4` in the original implementation
+    old_context_len = config["rope_scaling"]["original_max_position_embeddings"]  # `8192` in the original implementation
+    factor = config["rope_scaling"]["factor"]  # `8` in the original implementation
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
 
     wavelen = 2 * math.pi / inv_freq
     # wavelen < high_freq_wavelen: do nothing
     # wavelen > low_freq_wavelen: divide by factor
-    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / config.factor, inv_freq)
+    inv_freq_llama = where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
 
     # otherwise: interpolate between the two, using a smooth factor
-    smooth_factor = (config.old_context_len / wavelen - config.low_freq_factor) / (config.high_freq_factor - config.low_freq_factor)
-    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / config.factor + smooth_factor * inv_freq_llama
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
-    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+    inv_freq_llama = where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
 
     return inv_freq_llama, attention_factor
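`_compute_llama3_parameters` remaps each inverse frequency by wavelength band: wavelengths longer than `low_freq_wavelen` are divided by `factor`, wavelengths shorter than `high_freq_wavelen` pass through, and the band in between interpolates smoothly. The new `where(condition, x, y)` from `elementwise.py` (a thin wrapper over the existing `switch` leaf function) plays the role `np.where` plays in this NumPy restatement; the function name and its defaults are ours, with the defaults taken from the values quoted in the code comments:

```python
import numpy as np

def llama3_remap(inv_freq, factor=8, low_freq_factor=1, high_freq_factor=4, old_context_len=8192):
    """NumPy restatement of the llama3 frequency remapping."""
    wavelen = 2 * np.pi / inv_freq
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    # long wavelengths are scaled down by `factor`, the rest kept as-is
    out = np.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    # in the medium band, blend the scaled and unscaled frequencies
    smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed = (1 - smooth) * out / factor + smooth * out
    is_medium = ~(wavelen < high_freq_wavelen) & ~(wavelen > low_freq_wavelen)
    return np.where(is_medium, smoothed, out)
```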
diff --git a/front/py/deepx/transformer/models/llama/__init__.py b/front/py/deepx/transformer/models/llama/__init__.py
new file mode 100644
index 00000000..d77def35
--- /dev/null
+++ b/front/py/deepx/transformer/models/llama/__init__.py
@@ -0,0 +1,4 @@
+from .embedding import *
+__all__ = [
+    "LlamaRotaryEmbedding"
+]
\ No newline at end of file
diff --git a/front/py/deepx/transformer/models/llama/embedding.py b/front/py/deepx/transformer/models/llama/embedding.py
index 8bac5baa..62e00a57 100644
--- a/front/py/deepx/transformer/models/llama/embedding.py
+++ b/front/py/deepx/transformer/models/llama/embedding.py
@@ -11,11 +11,11 @@ def __init__(self,config:dict):
         # original maximum sequence length
         self.original_max_seq_len = config["max_position_embeddings"]
         # rope variant
-        self.rope_type=config["rope_scaling"]["type"]
+        self.rope_type=config["rope_scaling"]["rope_type"]
         # rope init function
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
         # compute the initial rope parameters
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config)
+        inv_freq, self.attention_scaling = self.rope_init_fn(config)
         # register the buffer
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         # original rotary frequencies
diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
index 413a2613..420a1709 100644
--- a/front/py/examples/2_ir/3_matmul.py
+++ b/front/py/examples/2_ir/3_matmul.py
@@ -1,9 +1,10 @@
-benchcnt=100
-
+benchcnt=2000
+shape=[4096,4096]
+print()
 from deepxutil.numpy import save_numpy
 import numpy as np
 
-np_T1 = np.random.randn(1024, 1024).astype(np.float32)
-np_T2 = np.random.randn(1024, 1024).astype(np.float32)
+np_T1 = np.random.randn(shape[0], shape[1]).astype(np.float32)
+np_T2 = np.random.randn(shape[0], shape[1]).astype(np.float32)
 
 npy_path = '/home/lipeng/model/deepxmodel/matmul/'
 save_numpy(np_T1,npy_path+'t1')
@@ -13,35 +14,32 @@
 import torch
 import time
 
-torch_t1 = torch.from_numpy(np_T1)
-torch_t2 = torch.from_numpy(np_T2)
+torch_t1 = torch.from_numpy(np_T1).to(torch.float32).to('cuda')
+torch_t2 = torch.from_numpy(np_T2).to(torch.float32).to('cuda')
 
 # warmup
 _=torch_t1 @ torch_t2
 
 torch_start = time.time()
 for i in range(benchcnt):
     torch_t3 = torch_t1 @ torch_t2
-
-print(torch_t3)
+
 torch_end = time.time()
 print(f"PyTorch time: {torch_end - torch_start} seconds")
 
 ############-------DEEPX-------################
-from deepx import uniform, matmul, zeros,load
-from deepx.nn.functional import save,load
+from deepx import matmul, zeros,load
 print()
 t1 = load(npy_path+'t1')
 t2 = load(npy_path+'t2')
-t3= zeros((1024,1024),dtype='float32',name="t3")
+t3= zeros(tuple(shape),dtype='float32',name="t3")
 
 from deepx.nn.functional import defaultauthor
 defaultauthor['matmul']='miaobyte'
 # warmup
 matmul(t1,t2,out=t3)
 
 deepx_start = time.time()
-matmul(t1,t2,out=t3,bench=(benchcnt))
-t3.print()
+matmul(t1,t2,out=t3,bench=benchcnt)
 deepx_end = time.time()
 print(f"DeepX time: {deepx_end - deepx_start} seconds")
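With `bench` narrowed to a plain `int` in `leaffunc_matmul.py`, the example issues one client call that the executor expands into `benchcnt` kernel runs via the repeat loop added to `main.cpp`, so the Python-side wall clock now brackets 2000 server-side runs plus a single round trip. At 4096×4096, each run costs 2·4096³ ≈ 137 GFLOPs, which makes the conversion to throughput a one-liner (a helper of ours, for illustration):

```python
def matmul_gflops(m: int, k: int, n: int, seconds_per_run: float) -> float:
    """Convert a measured matmul time into GFLOP/s (a matmul costs 2*m*k*n flops)."""
    return 2.0 * m * k * n / seconds_per_run / 1e9

# e.g. with the example's measurements:
# matmul_gflops(4096, 4096, 4096, (deepx_end - deepx_start) / benchcnt)
```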
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llama_rmsnorm.py
similarity index 100%
rename from front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
rename to front/py/examples/4_transformer/llama/1_llama_rmsnorm.py
diff --git a/front/py/examples/4_transformer/llama/1_llama_rope.py b/front/py/examples/4_transformer/llama/1_llama_rope.py
new file mode 100644
index 00000000..2738a41f
--- /dev/null
+++ b/front/py/examples/4_transformer/llama/1_llama_rope.py
@@ -0,0 +1,94 @@
+hidden_size = 8
+eps = 1e-6
+dir='/home/lipeng/model/deepxmodel/llama/'
+model_path="/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+print()
+
+from transformers import AutoTokenizer,AutoConfig
+def init_tokenizer(model_path):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer.pad_token = tokenizer.eos_token
+    return tokenizer
+
+tokenizer = init_tokenizer(model_path)
+config=AutoConfig.from_pretrained(model_path)
+def tokenize_text(text, tokenizer):
+    tokens = tokenizer(text, return_tensors="pt").input_ids
+    import torch
+    # handle tokens that fall outside the vocabulary
+    if torch.any(tokens >= tokenizer.vocab_size):
+        # use the UNK token id if one exists, otherwise fall back to 0
+        unk_token_id = tokenizer.unk_token_id if hasattr(tokenizer, 'unk_token_id') and tokenizer.unk_token_id is not None else 0
+        # replace every out-of-range token with UNK
+        tokens = torch.where(tokens < tokenizer.vocab_size, tokens, torch.tensor(unk_token_id, device=tokens.device))
+    return tokens
+
+############-------PyTorch-------################
+import torch
+
+# build the input
+text = "这是一个测试文本,用于演示嵌入层的使用。"
+torch_input = tokenize_text(text, tokenizer)
+from deepxutil.torch import save_torch
+save_torch(torch_input,dir+'input')
+
+# build the network
+class NetTorch(torch.nn.Module):
+    from transformers.models.llama.modeling_llama import LlamaConfig
+    def __init__(self,config:LlamaConfig):
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.config = config
+        self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding
+        self.rotary_emb = LlamaRotaryEmbedding(config=config)
+
+    def forward(self,x):
+        inputs_embeds = self.embed_tokens(x)
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
+        return self.rotary_emb(hidden_states, position_ids)
+
+
+torch_net = NetTorch(config)
+save_torch(torch_net.embed_tokens.weight,dir+'weight')
+# forward pass; HF's LlamaRotaryEmbedding returns (cos, sin)
+torch_output = torch_net(torch_input)
+torch_cos, torch_sin = torch_output
+
+print("sin shape:",torch_sin.shape)
+print("sin:", torch_sin)
+
+print("cos shape:", torch_cos.shape)
+print("cos:", torch_cos)
+
+
+############-------DEEPX-------################
+from deepx.nn.modules import Embedding,Module
+from deepx import load
+from deepx.transformer.models.llama import LlamaRotaryEmbedding
+
+input=load(dir+'input')
+
+embed_tokens_weight=load(dir+'weight')
+
+class NetDeepx(Module):
+    def __init__(self,configdict:dict):
+        super().__init__()
+        self.embed_tokens = Embedding(configdict["vocab_size"], configdict["hidden_size"],weight=embed_tokens_weight)
+        self.rotary_emb = LlamaRotaryEmbedding(config=configdict)
+
+    def forward(self,x):
+        inputs_embeds = self.embed_tokens(x)
+        hidden_states = inputs_embeds
+        position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
+        return self.rotary_emb(hidden_states, position_ids)
+
+net = NetDeepx(configdict=config.to_dict())
+out=net.forward(input)
+out.print()
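The deepx half of the example builds `LlamaRotaryEmbedding` straight from `config.to_dict()`. Read together with `embedding.py` and `modeling_rope_utils.py` above, the llama3 path needs at least the keys below; the `rope_scaling` values are the defaults quoted in the code comments, while `head_dim`, `vocab_size`, and `hidden_size` are hypothetical stand-ins for what the checkpoint's config actually provides:

```python
# minimal config dict for the llama3 rope path (values illustrative)
config = {
    "vocab_size": 128256,                   # hypothetical; used by Embedding
    "hidden_size": 4096,                    # hypothetical; used by Embedding
    "head_dim": 128,                        # hypothetical; read by _compute_default_rope_parameters
    "rope_theta": 10000.0,                  # base of the inverse-frequency power
    "partial_rotary_factor": 1.0,           # optional; .get() defaults it to 1.0
    "max_position_embeddings": 8192,        # stored as original_max_seq_len
    "rope_scaling": {
        "rope_type": "llama3",              # selects the init fn from ROPE_INIT_FUNCTIONS
        "factor": 8,
        "low_freq_factor": 1,
        "high_freq_factor": 4,
        "original_max_position_embeddings": 8192,
    },
}
```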
diff --git a/front/py/examples/4_transformer/llama/1_rope.py b/front/py/examples/4_transformer/llama/1_rope.py
deleted file mode 100644
index e69de29b..00000000