diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index f25b6973..f487b09c 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -65,7 +65,7 @@
 | cos | miaobyte | T3=cos(T1) | cos(tensor A)->(tensor C) |
 | notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor A, var scalar, var epsilon)->(tensor mask) |
 | minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) |
-| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) |
+| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) |
 | rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) |
 | less | miaobyte | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) |
 | powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) |
diff --git a/excuter/op-mem-cuda/src/client/main.cpp b/excuter/op-mem-cuda/src/client/main.cpp
index b3cd40d6..b00a284a 100644
--- a/excuter/op-mem-cuda/src/client/main.cpp
+++ b/excuter/op-mem-cuda/src/client/main.cpp
@@ -28,8 +28,6 @@ int main()
     deepx::tf::TfFactory tf_factory;
     register_all(tf_factory);
-
-    // dump the op table to a markdown file
     string docdir = "../../../doc/excuter/op-mem-cuda/";
     std::ofstream md_file(docdir + "list.md");
@@ -68,13 +66,30 @@ int main()
             opresp.error("op" + op.name + " not found");
             server.resp(opresp.to_string());
-            cerr << opresp.to_string() << endl;
             continue;
         }
         (*src).init(op.name, op.args, op.returns);
         memmutex.lock();
         opresp.start_at = chrono::system_clock::now();
-        int ret = (*src).run(mem, opresp.message);
+        int ret = 0;
+        if ((*src).metadata.benchmark.repeat > 1)
+        {
+            for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
+            {
+                ret = (*src).run(mem, opresp.message);
+                if (ret != 0)
+                {
+                    break;
+                }
+            }
+        }
+        else
+        {
+            ret = (*src).run(mem, opresp.message);
+        }
+
         memmutex.unlock();
         if (ret != 0)
         {
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index fa6cdcc6..0f64b87b 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -317,7 +317,7 @@ namespace deepx::tf
     // rpowscalar
     tffactory.add_tf(std::make_shared<Rpowscalar<miaobyte>>(vector<Param>(
         {
-            Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
+            Param("scalar", DataCategory::Var, Precision::Float32 | Precision::Int32),
             Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
         }),
         vector<Param>(
diff --git a/excuter/op-mem-ompsimd/src/client/main.cpp b/excuter/op-mem-ompsimd/src/client/main.cpp
index 96ca39bb..15004184 100644
--- a/excuter/op-mem-ompsimd/src/client/main.cpp
+++ b/excuter/op-mem-ompsimd/src/client/main.cpp
@@ -28,7 +28,7 @@ int main()
     client::udpserver server(8080);
     deepx::tf::TfFactory tf_factory;
     register_all(tf_factory);
-    
+
     // dump the op table to a markdown file
     string docdir = "../../../doc/excuter/op-mem-ompsimd/";
     std::ofstream md_file(docdir + "list.md");
@@ -72,14 +72,28 @@ int main()
             (*src).init(op.name, op.args, op.returns);
             memmutex.lock();
             opresp.start_at = chrono::system_clock::now();
-
-            int ret = (*src).run(mem,opresp.message);
+            int ret = 0;
+            if ((*src).metadata.benchmark.repeat > 1)
+            {
+                for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
+                {
+                    ret = (*src).run(mem, opresp.message);
+                    if (ret != 0)
+                    {
+                        break;
+                    }
+                }
+            }
+            else
+            {
+                ret = (*src).run(mem, opresp.message);
+            }
             memmutex.unlock();
             if (ret != 0)
             {
                 opresp.error(opresp.message);
                 server.resp(opresp.to_string());
-                cerr << opresp.to_string() << endl;
                 continue;
             }
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
--- a/excuter/op-mem-ompsimd/src/client/tfs.cpp
+++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ ... @@ namespace deepx::tf
     // rpowscalar
     tffactory.add_tf(std::make_shared<Rpowscalar<miaobyte>>(vector<Param>(
         {
-            Param("scalar", DataCategory::Var, Precision::Any),
+            Param("scalar", DataCategory::Var, Precision::Float32),
             Param("A", DataCategory::Tensor, Precision::Any),
         }),
         vector<Param>(
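Both `main.cpp` changes above install the same benchmark contract: when `metadata.benchmark.repeat` is greater than 1, the executor runs the op that many times inside a single request and bails out on the first non-zero return, all while holding `memmutex`, so one timed request measures repeated kernel cost without per-iteration lock or UDP overhead. A minimal sketch of that control flow in Python (`op.run()` is illustrative here, not the executor's actual API):

```python
def run_with_benchmark(op, repeat: int) -> int:
    """Run `op` once, or `repeat` times when benchmarking; stop on the first error."""
    ret = 0
    if repeat > 1:
        for _ in range(repeat):
            ret = op.run()      # non-zero return signals failure
            if ret != 0:
                break           # abort the benchmark early
    else:
        ret = op.run()
    return ret
```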
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
index 4e3d26ad..ac39f38c 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
@@ -15,24 +15,29 @@ namespace deepx::tensorfunc
             throw std::invalid_argument("A.shape could matmul with B.shape");
         }
         //TODO
-        // for a plain 2-D matmul the omp parallelism never kicks in here, because C.shape.dim() - 2 is exactly 0
-        C.shape.rangeParallel(C.shape.dim() - 2, [&](const std::vector<int> &indices)
-                              {
-            int aIdx=A.shape.linearat(indices);
-            int bIdx=B.shape.linearat(indices);
-            int cIdx=C.shape.linearat(indices);
-            int m=A.shape[-2];
-            int k=A.shape[-1];
-            int n=B.shape[-1];
-            for(int i=0;i<m;i++)
-            {
-                for(int j=0;j<n;j++)
-                {
-                    T sum=0;
-                    for(int l=0;l<k;l++)
-                    {
-                        sum+=A.data[aIdx+i*k+l]*B.data[bIdx+l*n+j];
-                    }
-                    C.data[cIdx+i*n+j]=sum;
-                }
-            } });
+        C.shape.rangeParallel(C.shape.dim(), [&](const std::vector<int> &indices,ThreadLocalVectors &tlv) {
+
+            // int m=A.shape[-2];
+            int k=A.shape[-1];
+            // int n=B.shape[-1];
+
+            std::copy(indices.begin(),indices.end()-2,tlv.get(0).begin());
+            tlv.get(0)[indices.size()-2]=A.shape[-2];
+            tlv.get(0)[indices.size()-1]=indices[-1];
+            int aIdx=A.shape.linearat(tlv.get(0));
+            std::copy(indices.begin(),indices.end()-2,tlv.get(1).begin());
+            tlv.get(1)[indices.size()-2]=0;
+            tlv.get(1)[indices.size()-1]=indices[-2];
+            int bIdx=B.shape.linearat(tlv.get(1));
+            int bstride=k;
+
+            T sum=0;
+            for(int l=0;l<k;l++)
+            {
+                sum+=A.data[aIdx+l]*B.data[bIdx+l*bstride];
+            }
+            C.data[C.shape.linearat(indices)]=sum;
+        });
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
@@ ... @@ namespace deepx::tf
             this->args = args;
             this->returns = returns;
         }
-        
+
         string math_formula() const override
         {
             return "T3=T1 @ T2";
         }
@@ -30,7 +30,17 @@
         {
             return make_shared<Matmul<T>>(*this);
         }
-        int compute(shared_ptr<MemBase> mem, Precision a_type,string &error){
+
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != b_type || a_type != c_type)
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
+                return 1;
+            }
             switch (a_type)
             {
             case Precision::Float64:
@@ -57,30 +67,6 @@
             }
             return 0;
         }
-        int run(shared_ptr<MemBase> mem, string &error) override
-        {
-            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
-            Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
-            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
-            if (a_type != b_type || a_type != c_type)
-            {
-                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
-                return 1;
-            }
-            if (metadata.benchmark.repeat > 0)
-            {
-                for (int i = 0; i < metadata.benchmark.repeat; i++)
-                {
-                    if (compute(mem, a_type, error))
-                    {
-                        return 1;
-                    }
-                }
-            }else{
-                return compute(mem, a_type, error);
-            }
-            return 0;
-        }
     };
 }
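The rewritten kernel in `matmul_miaobyte.hpp` moves parallelism from the batch dimensions down to individual output elements, so a plain 2-D matmul (where `C.shape.dim() - 2 == 0`) still fans out across threads; the thread-local vectors only exist to reuse coordinate buffers instead of reallocating them per element. A NumPy reference of the per-element task, assuming the new `rangeParallel` visits every coordinate of `C` (`matmul_ref` is our name, not deepx's):

```python
import numpy as np

def matmul_ref(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """One dot product per output coordinate, mirroring the per-element kernel."""
    m, n = A.shape[-2], B.shape[-1]
    C = np.zeros(A.shape[:-2] + (m, n), dtype=A.dtype)
    for idx in np.ndindex(C.shape):            # every (batch..., i, j) coordinate
        batch, i, j = idx[:-2], idx[-2], idx[-1]
        row = A[batch + (i, slice(None))]      # A[batch..., i, :]
        col = B[batch + (slice(None), j)]      # B[batch..., :, j]
        C[idx] = np.dot(row, col)
    return C
```

`np.allclose(matmul_ref(A, B), A @ B)` holds for any stack of matrices with matching inner dimensions.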
diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py
index 2c5d517b..d8ac8f44 100644
--- a/front/py/deepx/nn/functional/__init__.py
+++ b/front/py/deepx/nn/functional/__init__.py
@@ -25,7 +25,7 @@
     "sqrt","pow","exp","log",
     "min","max",
     "less","greater","equal","notequal",
-    "switch",
+    "switch","where",
     "todtype",
     "invert",
     "matmul",
diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py
index 9b502473..52a6dc52 100644
--- a/front/py/deepx/nn/functional/elementwise.py
+++ b/front/py/deepx/nn/functional/elementwise.py
@@ -54,3 +54,7 @@ def bool(input:Tensor)->Tensor:
     from .leaffunc_elementwise import todtype
     dest=newtensor(input.shape,dtype='bool',name=input.name)
     return todtype(input,dest)
+
+def where(condition:Tensor,x:Tensor,y:Tensor)->Tensor:
+    from .leaffunc_elementwise import switch_func
+    return switch_func((x,y),condition)
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py b/front/py/deepx/nn/functional/leaffunc_matmul.py
index acdcefd4..fadb0175 100644
--- a/front/py/deepx/nn/functional/leaffunc_matmul.py
+++ b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -4,7 +4,7 @@ from .leaffunc_life import newtensor
 from .authormap import defaultauthor
 
-def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:tuple[int,int]=None)->Tensor:
+def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor:
     outtensor=out
     if isinstance(out,str) or out is None:
         outshape=Shape.matmul(a.shape,b.shape)
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
index a25b0963..6a12c26a 100644
--- a/front/py/deepx/scheduler/client/udpconn.py
+++ b/front/py/deepx/scheduler/client/udpconn.py
@@ -3,7 +3,7 @@ import select
 
 class UDPConn:
-    def __init__(self, endpoint: str = "localhost:8080"):
+    def __init__(self, endpoint: str = "localhost:9090"):
         # parse the endpoint
         self._host, port_str = endpoint.split(':')
         self._port = int(port_str)
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
index 0554590c..3ee5255b 100644
--- a/front/py/deepx/transformer/modeling_rope_utils.py
+++ b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -1,19 +1,21 @@
 from typing import Tuple
 import math
-from deepx import arange,Tensor
+from deepx import arange,Tensor,where
 
 def _compute_default_rope_parameters(config:dict={
-        "base":10000.0,
+        "rope_theta":10000.0,
         "head_dim":0,
         "partial_rotary_factor":1.0,
     }) -> Tuple[Tensor, float]:
-    dim = config.head_dim* config.partial_rotary_factor
+    partial_rotary_factor = config.get("partial_rotary_factor", 1.0)
+    dim = config["head_dim"]* partial_rotary_factor
     # compute the inverse frequencies
-    inv_freq = 1.0 / (config.base ** (arange(0, dim, 2, dtype='float64')/ dim))
+    base=config["rope_theta"]
+    inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
     return inv_freq, 1.0
 
 def _compute_llama3_parameters(config:dict={
-        "base":10000.0,
+        "rope_theta":10000.0,
         "head_dim":0,
         "partial_rotary_factor":1.0,
         "factor":8,
@@ -25,18 +27,22 @@ def _compute_llama3_parameters(config:dict={
     # Gets the default RoPE parameters
     inv_freq, attention_factor = _compute_default_rope_parameters(config)
 
-    low_freq_wavelen = config.old_context_len / config.low_freq_factor
-    high_freq_wavelen = config.old_context_len / config.high_freq_factor
+    low_freq_factor = config["rope_scaling"]["low_freq_factor"]  # `1` in the original implementation
+    high_freq_factor = config["rope_scaling"]["high_freq_factor"]  # `4` in the original implementation
+    old_context_len = config["rope_scaling"]["original_max_position_embeddings"]  # `8192` in the original implementation
+    factor = config["rope_scaling"]["factor"]  # `8` in the original implementation
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
 
     wavelen = 2 * math.pi / inv_freq
     # wavelen < high_freq_wavelen: do nothing
     # wavelen > low_freq_wavelen: divide by factor
-    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / config.factor, inv_freq)
+    inv_freq_llama = where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
 
     # otherwise: interpolate between the two, using a smooth factor
-    smooth_factor = (config.old_context_len / wavelen - config.low_freq_factor) / (config.high_freq_factor - config.low_freq_factor)
-    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / config.factor + smooth_factor * inv_freq_llama
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
-    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+    inv_freq_llama = where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
 
     return inv_freq_llama, attention_factor
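`_compute_llama3_parameters` remaps each inverse frequency by wavelength band: wavelengths longer than `low_freq_wavelen` are divided by `factor`, wavelengths shorter than `high_freq_wavelen` pass through, and the band in between interpolates smoothly. The new `where(condition, x, y)` from `elementwise.py` (a thin wrapper over the existing `switch` leaf function) plays the role `np.where` plays in this NumPy restatement; the function name and its defaults are ours, with the defaults taken from the values quoted in the code comments:

```python
import numpy as np

def llama3_remap(inv_freq, factor=8, low_freq_factor=1, high_freq_factor=4, old_context_len=8192):
    """NumPy restatement of the llama3 frequency remapping."""
    wavelen = 2 * np.pi / inv_freq
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    # long wavelengths are scaled down by `factor`, the rest kept as-is
    out = np.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    # in the medium band, blend the scaled and unscaled frequencies
    smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed = (1 - smooth) * out / factor + smooth * out
    is_medium = ~(wavelen < high_freq_wavelen) & ~(wavelen > low_freq_wavelen)
    return np.where(is_medium, smoothed, out)
```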
diff --git a/front/py/deepx/transformer/models/llama/__init__.py b/front/py/deepx/transformer/models/llama/__init__.py
new file mode 100644
index 00000000..d77def35
--- /dev/null
+++ b/front/py/deepx/transformer/models/llama/__init__.py
@@ -0,0 +1,4 @@
+from .embedding import *
+__all__ = [
+    "LlamaRotaryEmbedding"
+]
\ No newline at end of file
diff --git a/front/py/deepx/transformer/models/llama/embedding.py b/front/py/deepx/transformer/models/llama/embedding.py
index 8bac5baa..62e00a57 100644
--- a/front/py/deepx/transformer/models/llama/embedding.py
+++ b/front/py/deepx/transformer/models/llama/embedding.py
@@ -11,11 +11,11 @@ def __init__(self,config:dict):
         # original maximum sequence length
         self.original_max_seq_len = config["max_position_embeddings"]
         # rope variant
-        self.rope_type=config["rope_scaling"]["type"]
+        self.rope_type=config["rope_scaling"]["rope_type"]
         # rope init function
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
         # compute the initial rope parameters
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config)
+        inv_freq, self.attention_scaling = self.rope_init_fn(config)
         # register the buffer
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         # original rotary frequencies
diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py
index 413a2613..420a1709 100644
--- a/front/py/examples/2_ir/3_matmul.py
+++ b/front/py/examples/2_ir/3_matmul.py
@@ -1,9 +1,10 @@
-benchcnt=100
-
+benchcnt=2000
+shape=[4096,4096]
+print()
 from deepxutil.numpy import save_numpy
 import numpy as np
 
-np_T1 = np.random.randn(1024, 1024).astype(np.float32)
-np_T2 = np.random.randn(1024, 1024).astype(np.float32)
+np_T1 = np.random.randn(shape[0], shape[1]).astype(np.float32)
+np_T2 = np.random.randn(shape[0], shape[1]).astype(np.float32)
 
 npy_path = '/home/lipeng/model/deepxmodel/matmul/'
 save_numpy(np_T1,npy_path+'t1')
@@ -13,35 +14,32 @@
 import torch
 import time
 
-torch_t1 = torch.from_numpy(np_T1)
-torch_t2 = torch.from_numpy(np_T2)
+torch_t1 = torch.from_numpy(np_T1).to(torch.float32).to('cuda')
+torch_t2 = torch.from_numpy(np_T2).to(torch.float32).to('cuda')
 
 # warmup
 _=torch_t1 @ torch_t2
 
 torch_start = time.time()
 for i in range(benchcnt):
     torch_t3 = torch_t1 @ torch_t2
-
-print(torch_t3)
+
 torch_end = time.time()
 print(f"PyTorch time: {torch_end - torch_start} seconds")
 
 ############-------DEEPX-------################
-from deepx import uniform, matmul, zeros,load
-from deepx.nn.functional import save,load
+from deepx import matmul, zeros,load
 print()
 t1 = load(npy_path+'t1')
 t2 = load(npy_path+'t2')
-t3= zeros((1024,1024),dtype='float32',name="t3")
+t3= zeros(tuple(shape),dtype='float32',name="t3")
 
 from deepx.nn.functional import defaultauthor
 defaultauthor['matmul']='miaobyte'
 # warmup
 matmul(t1,t2,out=t3)
 
 deepx_start = time.time()
-matmul(t1,t2,out=t3,bench=(benchcnt))
-t3.print()
+matmul(t1,t2,out=t3,bench=benchcnt)
 deepx_end = time.time()
 print(f"DeepX time: {deepx_end - deepx_start} seconds")
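With `bench` narrowed to a plain `int` in `leaffunc_matmul.py`, the example issues one client call that the executor expands into `benchcnt` kernel runs via the repeat loop added to `main.cpp`, so the Python-side wall clock now brackets 2000 server-side runs plus a single round trip. At 4096×4096, each run costs 2·4096³ ≈ 137 GFLOPs, which makes the conversion to throughput a one-liner (a helper of ours, for illustration):

```python
def matmul_gflops(m: int, k: int, n: int, seconds_per_run: float) -> float:
    """Convert a measured matmul time into GFLOP/s (a matmul costs 2*m*k*n flops)."""
    return 2.0 * m * k * n / seconds_per_run / 1e9

# e.g. with the example's measurements:
# matmul_gflops(4096, 4096, 4096, (deepx_end - deepx_start) / benchcnt)
```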
diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llama_rmsnorm.py
similarity index 100%
rename from front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py
rename to front/py/examples/4_transformer/llama/1_llama_rmsnorm.py
diff --git a/front/py/examples/4_transformer/llama/1_llama_rope.py b/front/py/examples/4_transformer/llama/1_llama_rope.py
new file mode 100644
index 00000000..2738a41f
--- /dev/null
+++ b/front/py/examples/4_transformer/llama/1_llama_rope.py
@@ -0,0 +1,94 @@
+hidden_size = 8
+eps = 1e-6
+dir='/home/lipeng/model/deepxmodel/llama/'
+model_path="/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+print()
+
+from transformers import AutoTokenizer,AutoConfig
+def init_tokenizer(model_path):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer.pad_token = tokenizer.eos_token
+    return tokenizer
+
+tokenizer = init_tokenizer(model_path)
+config=AutoConfig.from_pretrained(model_path)
+def tokenize_text(text, tokenizer):
+    tokens = tokenizer(text, return_tensors="pt").input_ids
+    import torch
+    # handle tokens that fall outside the vocabulary
+    if torch.any(tokens >= tokenizer.vocab_size):
+        # use the UNK token id if one exists, otherwise fall back to 0
+        unk_token_id = tokenizer.unk_token_id if hasattr(tokenizer, 'unk_token_id') and tokenizer.unk_token_id is not None else 0
+        # replace every out-of-range token with UNK
+        tokens = torch.where(tokens < tokenizer.vocab_size, tokens, torch.tensor(unk_token_id, device=tokens.device))
+    return tokens
+
+############-------PyTorch-------################
+import torch
+
+# build the input
+text = "这是一个测试文本,用于演示嵌入层的使用。"
+torch_input = tokenize_text(text, tokenizer)
+from deepxutil.torch import save_torch
+save_torch(torch_input,dir+'input')
+
+# build the network
+class NetTorch(torch.nn.Module):
+    from transformers.models.llama.modeling_llama import LlamaConfig
+    def __init__(self,config:LlamaConfig):
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.config = config
+        self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding
+        self.rotary_emb = LlamaRotaryEmbedding(config=config)
+
+    def forward(self,x):
+        inputs_embeds = self.embed_tokens(x)
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
+        return self.rotary_emb(hidden_states, position_ids)
+
+
+torch_net = NetTorch(config)
+save_torch(torch_net.embed_tokens.weight,dir+'weight')
+# forward pass; HF's LlamaRotaryEmbedding returns (cos, sin)
+torch_output = torch_net(torch_input)
+torch_cos, torch_sin = torch_output
+
+print("sin shape:",torch_sin.shape)
+print("sin:", torch_sin)
+
+print("cos shape:", torch_cos.shape)
+print("cos:", torch_cos)
+
+
+############-------DEEPX-------################
+from deepx.nn.modules import Embedding,Module
+from deepx import load
+from deepx.transformer.models.llama import LlamaRotaryEmbedding
+
+input=load(dir+'input')
+
+embed_tokens_weight=load(dir+'weight')
+
+class NetDeepx(Module):
+    def __init__(self,configdict:dict):
+        super().__init__()
+        self.embed_tokens = Embedding(configdict["vocab_size"], configdict["hidden_size"],weight=embed_tokens_weight)
+        self.rotary_emb = LlamaRotaryEmbedding(config=configdict)
+
+    def forward(self,x):
+        inputs_embeds = self.embed_tokens(x)
+        hidden_states = inputs_embeds
+        position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
+        return self.rotary_emb(hidden_states, position_ids)
+
+net = NetDeepx(configdict=config.to_dict())
+out=net.forward(input)
+out.print()
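The deepx half of the example builds `LlamaRotaryEmbedding` straight from `config.to_dict()`. Read together with `embedding.py` and `modeling_rope_utils.py` above, the llama3 path needs at least the keys below; the `rope_scaling` values are the defaults quoted in the code comments, while `head_dim`, `vocab_size`, and `hidden_size` are hypothetical stand-ins for what the checkpoint's config actually provides:

```python
# minimal config dict for the llama3 rope path (values illustrative)
config = {
    "vocab_size": 128256,                   # hypothetical; used by Embedding
    "hidden_size": 4096,                    # hypothetical; used by Embedding
    "head_dim": 128,                        # hypothetical; read by _compute_default_rope_parameters
    "rope_theta": 10000.0,                  # base of the inverse-frequency power
    "partial_rotary_factor": 1.0,           # optional; .get() defaults it to 1.0
    "max_position_embeddings": 8192,        # stored as original_max_seq_len
    "rope_scaling": {
        "rope_type": "llama3",              # selects the init fn from ROPE_INIT_FUNCTIONS
        "factor": 8,
        "low_freq_factor": 1,
        "high_freq_factor": 4,
        "original_max_position_embeddings": 8192,
    },
}
```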
diff --git a/front/py/examples/4_transformer/llama/1_rope.py b/front/py/examples/4_transformer/llama/1_rope.py
deleted file mode 100644
index e69de29b..00000000