array2d · miaobyte · May 12, 2025 · May 7, 2025 · May 8, 2025 · May 12, 2025
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
@@ -65,7 +65,7 @@
 | cos | miaobyte | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
 | notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor<any> A, var<any> scalar, var<float32> epsilon)->(tensor<bool> mask) |
 | minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
-| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float64|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
+| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float32|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
 | rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
 | less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
 | powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |

diff --git a/excuter/op-mem-cuda/src/client/main.cpp b/excuter/op-mem-cuda/src/client/main.cpp
@@ -28,8 +28,6 @@ int main()
     deepx::tf::TfFactory tf_factory;
     register_all(tf_factory);
 
-
-
     // 将op table输出到markdown文件
     string docdir = "../../../doc/excuter/op-mem-cuda/";
     std::ofstream md_file(docdir + "list.md");
@@ -68,13 +66,30 @@ int main()
             {
                 opresp.error("op" + op.name + " not found");
                 server.resp(opresp.to_string());
-                cerr<<opresp.message<<endl;
+                cerr << opresp.message << endl;
                 continue;
             }
             (*src).init(op.name, op.args, op.returns);
+
             memmutex.lock();
             opresp.start_at = chrono::system_clock::now();
-            int ret = (*src).run(mem,opresp.message);
+            // Benchmark-aware execution of the op.
+            // Run it once, or metadata.benchmark.repeat times when that
+            // value is greater than 1:
+            //  - the first run is unconditional, which covers repeat <= 1;
+            //  - another run starts only while the previous one returned 0
+            //    and fewer than `repeat` runs have completed;
+            //  - `ret` keeps the status of the last run executed, so a
+            //    failure is reported by the check that follows the unlock.
+            int ret = 0;
+            int runs_done = 0;
+            do
+            {
+                ret = (*src).run(mem, opresp.message);
+                ++runs_done;
+                // repeat is read again before each additional run
+            } while (ret == 0 && runs_done < (*src).metadata.benchmark.repeat);
+
             memmutex.unlock();
             if (ret != 0)
             {

diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -317,7 +317,7 @@ namespace deepx::tf
         // rpowscalar
         tffactory.add_tf(std::make_shared<RpowScalar<miaobyte>>(vector<Param>(
                                                                     {
-                                                                        Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
+                                                                        Param("scalar", DataCategory::Var, Precision::Float32 | Precision::Int32),
                                                                         Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
                                                                     }),
                                                                 vector<Param>(

diff --git a/excuter/op-mem-ompsimd/src/client/main.cpp b/excuter/op-mem-ompsimd/src/client/main.cpp
@@ -28,7 +28,7 @@ int main()
     client::udpserver server(8080);
     deepx::tf::TfFactory tf_factory;
     register_all(tf_factory);
- 
+
     // 将op table输出到markdown文件
     string docdir = "../../../doc/excuter/op-mem-ompsimd/";
     std::ofstream md_file(docdir + "list.md");
@@ -72,14 +72,28 @@ int main()
             (*src).init(op.name, op.args, op.returns);
             memmutex.lock();
             opresp.start_at = chrono::system_clock::now();
-
-            int ret = (*src).run(mem,opresp.message);
+            // Benchmark-aware execution of the op.
+            // Run it once, or metadata.benchmark.repeat times when that
+            // value is greater than 1:
+            //  - the first run is unconditional, which covers repeat <= 1;
+            //  - another run starts only while the previous one returned 0
+            //    and fewer than `repeat` runs have completed;
+            //  - `ret` keeps the status of the last run executed, so a
+            //    failure is reported by the check that follows the unlock.
+            int ret = 0;
+            int runs_done = 0;
+            do
+            {
+                ret = (*src).run(mem, opresp.message);
+                ++runs_done;
+                // repeat is read again before each additional run
+            } while (ret == 0 && runs_done < (*src).metadata.benchmark.repeat);
             memmutex.unlock();
             if (ret != 0)
             {
                 opresp.error(opresp.message);
                 server.resp(opresp.to_string());
-                cerr<<opresp.message<<endl;
+                cerr << opresp.message << endl;
                 continue;
             }
             opresp.finish("");

diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ -338,7 +338,7 @@ namespace deepx::tf
         // rpowscalar author=miaobyte
         tffactory.add_tf(std::make_shared<RpowScalar<miaobyte>>(vector<Param>(
                                                                     {
-                                                                        Param("scalar", DataCategory::Var, Precision::Any),
+                                                                        Param("scalar", DataCategory::Var, Precision::Float32),
                                                                         Param("A", DataCategory::Tensor, Precision::Any),
                                                                     }),
                                                                 vector<Param>(

diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
@@ -15,24 +15,29 @@ namespace deepx::tensorfunc
                 throw std::invalid_argument("A.shape could matmul with B.shape");
             }
             //TODO
-            //这里如果对二维矩阵运算，则omp并行不起来，因为C.shape.dim() - 2刚好=0
-            C.shape.rangeParallel(C.shape.dim() - 2, [&](const std::vector<int> &indices)
-                                  {
-                        int aIdx=A.shape.linearat(indices);
-                        int bIdx=B.shape.linearat(indices);
-                        int cIdx=C.shape.linearat(indices);
-                        int m=A.shape[-2];
-                        int k=A.shape[-1];
-                        int n=B.shape[-1];
-                        for(int i=0;i<m;i++){
-                            for(int j=0;j<n;j++){
-                                T sum=0;
-                                for(int l=0;l<k;l++){
-                                    sum+=A.data[aIdx+i*k+l]*B.data[bIdx+l*n+j];
-                                }
-                                C.data[cIdx+i*n+j]=sum;
-                            }
-                        } });
+            // NOTE: straightforward one-call-per-output-element kernel; still needs further optimization.
+            C.shape.rangeParallel(C.shape.dim(), [&A,&B,&C](const int idx,const std::vector<int> &indices,ThreadLocalVectors &tlv) {
+                // Computes C[..., i, j] = sum over l of A[..., i, l] * B[..., l, j]; idx is used as C's linear offset for `indices`.
+                const int last=static_cast<int>(indices.size())-1; // position of column index j; row index i sits at last-1
+                const int k=A.shape[-1]; // shared inner dimension
+                const int n=B.shape[-1]; // distance between consecutive rows of B (same stride the previous kernel used)
+                // Offset of A[..., i, 0]: leading indices copied from C, row i kept, column reset to 0.
+                std::copy(indices.begin(),indices.end()-2,tlv.get(0).begin());
+                tlv.get(0)[last-1]=indices[last-1];
+                tlv.get(0)[last]=0;
+                const int aIdx=A.shape.linearat(tlv.get(0));
+                // Offset of B[..., 0, j]: leading indices copied from C, row reset to 0, column j kept.
+                std::copy(indices.begin(),indices.end()-2,tlv.get(1).begin());
+                tlv.get(1)[last-1]=0;
+                tlv.get(1)[last]=indices[last];
+                const int bIdx=B.shape.linearat(tlv.get(1));
+
+                T sum=0;
+                for(int l=0;l<k;l++){
+                    sum+=A.data[aIdx+l]*B.data[bIdx+l*n];
+                }
+                C.data[idx]=sum;
+            },{A.shape.dim(),B.shape.dim()});
         }
     };
 

diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
@@ -1,6 +1,6 @@
 #ifndef DEEPX_TF_MATMUL_HPP
 #define DEEPX_TF_MATMUL_HPP
- 
+
 #include "deepx/tf/tf.hpp"
 #include "deepx/dtype.hpp"
 #include "deepx/dtype_ompsimd.hpp"
@@ -21,7 +21,7 @@ namespace deepx::tf
             this->args = args;
             this->returns = returns;
         }
- 
+
         string math_formula() const override
         {
             return "T3=T1 @ T2";
@@ -30,7 +30,17 @@ namespace deepx::tf
         {
             return make_shared<MatMul<Author>>(*this);
         }
-        int compute(shared_ptr<MemBase> mem, Precision a_type,string &error){
+
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != b_type || a_type != c_type)
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
+                return 1;
+            }
             switch (a_type)
             {
             case Precision::Float64:
@@ -57,30 +67,6 @@ namespace deepx::tf
             }
             return 0;
         }
-        int run(shared_ptr<MemBase> mem, string &error) override
-        {
-            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
-            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
-            if (a_type != b_type || a_type != c_type)
-            {
-                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
-                return 1;
-            }
-            if (metadata.benchmark.repeat > 0)
-            {   
-                for (int i = 0; i < metadata.benchmark.repeat; i++)
-                {
-                    if (compute(mem, a_type, error))
-                    {
-                        return 1;
-                    }
-                }
-            }else{
-                return compute(mem, a_type, error);
-            }
-            return 0;
-        }
     };
 }
 

diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
@@ -25,7 +25,7 @@
     "sqrt","pow","exp","log",
     "min","max",
     "less","greater","equal","notequal",
-    "switch",
+    "switch","where",
     "todtype",
     "invert",
     "matmul",

diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py
@@ -54,3 +54,7 @@ def bool(input:Tensor)->Tensor:
     from .leaffunc_elementwise import todtype
     dest=newtensor(input.shape,dtype='bool',name=input.name)
     return todtype(input,dest)
+
+def where(condition:Tensor,x:Tensor,y:Tensor)->Tensor:  # thin wrapper: hands (x,y) and condition to switch_func and returns its result
+    from .leaffunc_elementwise import switch_func  # imported at call time, the same way bool() above imports todtype
+    return switch_func((x,y),condition)  # NOTE(review): if switch_func selects tensors[condition], a True condition picks y, unlike torch.where -- confirm the (x,y) order
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -4,7 +4,7 @@
 from .leaffunc_life import newtensor
 from .authormap import defaultauthor
 
-def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:tuple[int,int]=None)->Tensor:
+def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor:
     outtensor=out
     if isinstance(out,str) or out is None:
         outshape=Shape.matmul(a.shape,b.shape)

diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@
 import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # 解析endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)

diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -1,19 +1,21 @@
 from typing import   Tuple
 import math
-from deepx import arange,Tensor
+from deepx import arange,Tensor,where
 
 def _compute_default_rope_parameters(config:dict={
-    "base":10000.0,
+    "rope_theta":10000.0,
     "head_dim":0,
     "partial_rotary_factor":1.0,
 }) -> Tuple[Tensor, float]:
-    dim   = config.head_dim* config.partial_rotary_factor
-    # 计算逆频率
-    inv_freq = 1.0 / (config.base ** (arange(0, dim, 2, dtype='float64')/ dim))
+    partial_rotary_factor = config.get("partial_rotary_factor", 1.0)  # optional key; falls back to 1.0 when absent
+    dim   = int(config["head_dim"]* partial_rotary_factor)  # the factor is a float, so cast back to keep dim an integer for arange
+    # Compute the inverse frequencies: arange(0, dim, 2) / dim supplies the exponents applied to base
+    base=config["rope_theta"]
+    inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
     return inv_freq, 1.0
 
 def _compute_llama3_parameters(config:dict={
-    "base":10000.0,
+    "rope_theta":10000.0,
     "head_dim":0,
     "partial_rotary_factor":1.0,
     "factor":8,
@@ -25,18 +27,22 @@ def _compute_llama3_parameters(config:dict={
     # Gets the default RoPE parameters
     inv_freq, attention_factor = _compute_default_rope_parameters(config)
 
-    low_freq_wavelen = config.old_context_len / config.low_freq_factor
-    high_freq_wavelen = config.old_context_len / config.high_freq_factor
+    factor = config["rope_scaling"]["factor"]  # `8` in the original implementation
+    low_freq_factor = config["rope_scaling"]["low_freq_factor"]  # `1` in the original implementation
+    high_freq_factor = config["rope_scaling"]["high_freq_factor"]  # `4` in the original implementation
+    old_context_len = config["rope_scaling"]["original_max_position_embeddings"]  # `8192` in the original implementation
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
 
     wavelen = 2 * math.pi / inv_freq
     # wavelen < high_freq_wavelen: do nothing
     # wavelen > low_freq_wavelen: divide by factor
-    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / config.factor, inv_freq)
+    inv_freq_llama = where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
     # otherwise: interpolate between the two, using a smooth factor
-    smooth_factor = (config.old_context_len / wavelen - config.low_freq_factor) / (config.high_freq_factor - config.low_freq_factor)
-    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / config.factor + smooth_factor * inv_freq_llama
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
-    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+    inv_freq_llama = where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
 
     return inv_freq_llama, attention_factor
 

diff --git a/front/py/deepx/transformer/models/llama/__init__.py b/front/py/deepx/transformer/models/llama/__init__.py
@@ -0,0 +1,4 @@
+from .embedding import *
+__all__ = [
+    "LlamaRotaryEmbedding"
+]
diff --git a/front/py/deepx/transformer/models/llama/embedding.py b/front/py/deepx/transformer/models/llama/embedding.py
@@ -11,11 +11,11 @@ def __init__(self,config:dict):
         # 原始最大序列长度
         self.original_max_seq_len = config["max_position_embeddings"]
         # 旋转类型
-        self.rope_type=config["rope_scaling"]["type"]
+        self.rope_type=config["rope_scaling"]["rope_type"]
         # 旋转初始化函数
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
         # 旋转初始化函数
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config)
+        inv_freq, self.attention_scaling = self.rope_init_fn(config)
         # 注册缓存
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         # 原始旋转频率

diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
@@ -1,9 +1,10 @@
-benchcnt=100
-
+benchcnt=2000
+shape=[4096,4096]
+print()
 from deepxutil.numpy  import save_numpy
 import numpy as np
-np_T1 = np.random.randn(1024, 1024).astype(np.float32)
-np_T2 = np.random.randn(1024, 1024).astype(np.float32)
+np_T1 = np.random.randn(shape[0], shape[1]).astype(np.float32)
+np_T2 = np.random.randn(shape[0], shape[1]).astype(np.float32)
 
 npy_path = '/home/lipeng/model/deepxmodel/matmul/'
 save_numpy(np_T1,npy_path+'t1')
@@ -13,35 +14,32 @@
 
 import torch
 import time
-torch_t1 = torch.from_numpy(np_T1)
-torch_t2 = torch.from_numpy(np_T2)
+torch_t1 = torch.from_numpy(np_T1).to(torch.float32).to('cuda')
+torch_t2 = torch.from_numpy(np_T2).to(torch.float32).to('cuda')
 # warmup
 _=torch_t1 @ torch_t2
-
+torch.cuda.synchronize()  # let the warm-up kernel finish before the timer starts
 torch_start = time.time()
 for i in range(benchcnt):
     torch_t3 = torch_t1 @ torch_t2
-
-print(torch_t3)
+torch.cuda.synchronize()  # CUDA matmuls are launched asynchronously; wait for all of them so the measured time covers the real work
 torch_end = time.time()
 print(f"PyTorch time: {torch_end - torch_start} seconds")
 ############-------DEEPX-------################
 
-from deepx import uniform, matmul, zeros,load
-from deepx.nn.functional import save,load
+from deepx import   matmul, zeros,load
 print()
 
 t1 = load(npy_path+'t1')
 t2 = load(npy_path+'t2')
-t3= zeros((1024,1024),dtype='float32',name="t3")
+t3= zeros(tuple(shape),dtype='float32',name="t3")
 from deepx.nn.functional import defaultauthor
 defaultauthor['matmul']='miaobyte'
 # warmup
 matmul(t1,t2,out=t3)
 
 deepx_start = time.time()
-matmul(t1,t2,out=t3,bench=(benchcnt))
-t3.print()
+matmul(t1,t2,out=t3,bench=benchcnt)
 deepx_end = time.time()
 print(f"DeepX time: {deepx_end - deepx_start} seconds")
 

diff --git a/...transformer/llama/1_llamarmsnorm_torch.py → ...es/4_transformer/llama/1_llama_rmsnorm.py b/...transformer/llama/1_llamarmsnorm_torch.py → ...es/4_transformer/llama/1_llama_rmsnorm.py