diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index 5ef7ea8..6cd9729 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -96,11 +96,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch transpose kernel",cudaGetLastError()); } template void launch_transpose(const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder); @@ -224,12 +220,9 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch concat kernel",cudaGetLastError()); } + template void launch_concat(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); template void launch_concat(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); template void launch_concat(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis); @@ -335,11 +328,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err 
!= cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch broadcastTo kernel",cudaGetLastError()); } template void launch_broadcastTo(const double *input, const int *inputStrides, const int inputDim, const BroadcastMap *broadcastMap, @@ -489,12 +478,9 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("cuda error"); - } + throwcudaerror("Failed to launch indexselect kernel",cudaGetLastError()); } + template void launch_indexselect(const double *input, const int *inputStrides, const int inputDim, const int64_t *index, const int *indexStrides, const int indexDim, const int gatherAxis, diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index 5f4b3cd..e44cd1a 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -68,17 +68,7 @@ namespace deepx::tensorfunc using std::shared_ptr; - inline std::pair> device_offload(unsigned char *data,int size) - { - shared_ptr host_data(new unsigned char[size]); - cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost); - cudaError_t err=cudaGetLastError(); - if(err!=cudaSuccess){ - throw std::runtime_error("Failed to copy data from device to host"); - - } - return {size, host_data}; - } + inline void throwcudaerror(const std::string& msg,cudaError_t err){ if (err != cudaSuccess) @@ -86,6 +76,13 @@ namespace deepx::tensorfunc throw std::runtime_error(msg + "\n" + std::string(cudaGetErrorString(err))); } } + inline std::pair> device_offload(unsigned char *data,int size) + { + shared_ptr host_data(new unsigned char[size]); + cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost); + throwcudaerror("Failed to copy data from device to host",cudaGetLastError()); + return {size, host_data}; + } 
} #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index 16011b3..a9b1ca2 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -27,12 +27,7 @@ namespace deepx::tensorfunc void launch_todtype(const T* a, Dtype* c,const int size){ auto [numBlocks, blockSize] = BestDims(size); todtype_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch todtype kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch todtype kernel",cudaGetLastError()); } template void launch_todtype(const double *a, float *c, const int size); template void launch_todtype(const double *a, half *c, const int size); @@ -114,13 +109,7 @@ namespace deepx::tensorfunc // 启动kernel auto [numBlocks, blockSize] = BestDims(size); add_kernel<<>>(a, b, c, size); - // 检查kernel执行是否成功 - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch add kernel",cudaGetLastError()); } template void launch_add(const double *a, const double *b, double *c, const int size); @@ -147,12 +136,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); addscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch addscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch addscalar kernel",cudaGetLastError()); } template void launch_addscalar(const double *a, const double scalar, double *c, const int size); template void launch_addscalar(const float *a, const float scalar, float *c, const 
int size); @@ -178,12 +162,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); sub_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sub kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch sub kernel",cudaGetLastError()); } template void launch_sub(const double *a, const double *b, double *c, const int size); template void launch_sub(const float *a, const float *b, float *c, const int size); @@ -209,13 +188,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); subscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch subscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch subscalar kernel",cudaGetLastError()); } + template void launch_subscalar(const double *a, const double scalar, double *c, const int size); template void launch_subscalar(const float *a, const float scalar, float *c, const int size); template void launch_subscalar(const half *a, const half scalar, half *c, const int size); @@ -239,11 +214,7 @@ namespace deepx::tensorfunc void launch_rsubscalar(const T scalar, const T* a, T* c,const int size){ auto [numBlocks, blockSize] = BestDims(size); rsubscalar_kernel<<>>(scalar, a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch rsubscalar kernel: "+std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch rsubscalar kernel",cudaGetLastError()); } template void launch_rsubscalar(const double scalar, const double* a, double* c,const int size); template void launch_rsubscalar(const float scalar, const float* a, float* c,const int size); @@ -271,13 +242,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); 
mul_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch mul kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch mul kernel",cudaGetLastError()); } + template void launch_mul(const double *a, const double *b, double *c, const int size); template void launch_mul(const float *a, const float *b, float *c, const int size); template void launch_mul(const half *a, const half *b, half *c, const int size); @@ -302,12 +269,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); mulscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch mulscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch mulscalar kernel",cudaGetLastError()); } template void launch_mulscalar(const double *a, const double scalar, double *c, const int size); template void launch_mulscalar(const float *a, const float scalar, float *c, const int size); @@ -333,13 +295,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); div_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch div kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch div kernel",cudaGetLastError()); } + template void launch_div(const double *a, const double *b, double *c, const int size); template void launch_div(const float *a, const float *b, float *c, const int size); template void launch_div(const half *a, const half *b, half *c, const int size); @@ -364,13 +322,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); divscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to 
launch divscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch divscalar kernel",cudaGetLastError()); } + template void launch_divscalar(const double *a, const double scalar, double *c, const int size); template void launch_divscalar(const float *a, const float scalar, float *c, const int size); template void launch_divscalar(const half *a, const half scalar, half *c, const int size); @@ -395,13 +349,9 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); rdivscalar_kernel<<>>(scalar, a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch rdivscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch rdivscalar kernel",cudaGetLastError()); } + template void launch_rdivscalar(const double scalar, const double *a, double *c, const int size); template void launch_rdivscalar(const float scalar, const float *a, float *c, const int size); template void launch_rdivscalar(const half scalar, const half *a, half *c, const int size); @@ -421,17 +371,21 @@ namespace deepx::tensorfunc } } + template <> + __global__ void invert_kernel(const bool *A, bool *C, const int size) + { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) + { + C[idx] = !A[idx]; + } + } + template void launch_invert(const T *a, T *c, const int size) { auto [numBlocks, blockSize] = BestDims(size); invert_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch invert kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch invert kernel",cudaGetLastError()); } template void launch_invert(const int64_t *a, int64_t *c, const int size); template void launch_invert(const int32_t *a, int32_t *c, const int size); diff --git 
a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu index a33a872..a514755 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_compare.cu @@ -21,12 +21,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); max_kernel<<>>(A, B, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch max kernel",cudaGetLastError()); } template void launch_max(const double *A, const double *B, double *C, const int size); @@ -53,12 +48,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); maxscalar_kernel<<>>(A, scalar, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch maxscalar kernel",cudaGetLastError()); } template void launch_maxscalar(const double *A, const double scalar, double *C, const int size); @@ -85,12 +75,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); min_kernel<<>>(A, B, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch min kernel",cudaGetLastError()); } template void launch_min(const double *A, const double *B, double *C, const int size); @@ -117,12 +102,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); minscalar_kernel<<>>(A, scalar, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add
kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch minscalar kernel",cudaGetLastError()); } template void launch_minscalar(const double *A, const double scalar, double *C, const int size); @@ -175,12 +155,7 @@ namespace deepx::tensorfunc { equalwithepsilon_kernel<<>>(A, B, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch equal kernel",cudaGetLastError()); } template void launch_equal(const double *A, const double *B, const float epsilon, bool *mask, const int size); @@ -233,12 +208,7 @@ namespace deepx::tensorfunc { equalscalarwithepsilon_kernel<<>>(A, scalar, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch equalscalar kernel",cudaGetLastError()); } template void launch_equalscalar(const double *A, const double scalar, const float epsilon, bool *mask, const int size); @@ -291,12 +261,7 @@ namespace deepx::tensorfunc { notequalwithepsilon_kernel<<>>(A, B, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch notequal kernel",cudaGetLastError()); } template void launch_notequal(const double *A, const double *B, const float epsilon, bool *mask, const int size); @@ -349,12 +314,7 @@ namespace deepx::tensorfunc { notequalscalarwithepsilon_kernel<<>>(A, scalar, epsilon, mask, size); } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch notequalscalar kernel",cudaGetLastError()); }
template void launch_notequalscalar(const double *A, const double scalar, const float epsilon, bool *mask, const int size); @@ -382,12 +342,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); less_kernel<<>>(A, B, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch less kernel",cudaGetLastError()); } template void launch_less(const double *A, const double *B, bool *mask, const int size); @@ -416,12 +371,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); lessscalar_kernel<<>>(A, scalar, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch lessscalar kernel",cudaGetLastError()); } template void launch_lessscalar(const double *A, const double scalar, bool *mask, const int size); @@ -449,12 +399,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); greater_kernel<<>>(A, B, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch greater kernel",cudaGetLastError()); } template void launch_greater(const double *A, const double *B, bool *mask, const int size); @@ -482,12 +427,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); greaterscalar_kernel<<>>(A, scalar, mask, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch greaterscalar kernel",cudaGetLastError()); } template void
launch_greaterscalar(const double *A, const double scalar, bool *mask, const int size); @@ -516,13 +456,9 @@ namespace deepx::tensorfunc auto [numBlocks, blockSize] = BestDims(size); cudaVector tensorsdataList(tensorsdata, numTensors, cudaMemcpyHostToDevice); switch_kernel<<>>(tensorsdataList.data, numTensors, cases, C, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch add kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch switch kernel",cudaGetLastError()); } + template void launch_switch(const double **tensorsdata, const int numTensors, const int32_t *cases, double *C, const int size); template void launch_switch(const float **tensorsdata, const int numTensors, const int32_t *cases, float *C, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu index 00bd232..18ca1a1 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cu @@ -45,19 +45,16 @@ namespace deepx::tensorfunc } template - void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size){ + void launch_sin(const T* a, T* c, const int size){ + auto [numBlocks, blockSize] = BestDims(size); sin_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch sin kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch sin kernel",cudaGetLastError()); } - template void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); - template void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); - template void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - 
template void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + template void launch_sin(const double* a, double* c, const int size); + template void launch_sin(const float* a, float* c, const int size); + template void launch_sin<__half>(const __half* a, __half* c, const int size); + template void launch_sin(const nv_bfloat16* a, nv_bfloat16* c, const int size); // cos template __global__ void cos_kernel(const T* A, T* C, const int size); @@ -91,18 +88,15 @@ namespace deepx::tensorfunc } } template - void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size){ + void launch_cos(const T* a, T* c, const int size){ + auto [numBlocks, blockSize] = BestDims(size); cos_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch cos kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch cos kernel",cudaGetLastError()); } - template void launch_cos(int numBlocks, int blockSize, const double* a, double* c, const int size); - template void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); - template void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - template void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + template void launch_cos(const double* a, double* c, const int size); + template void launch_cos(const float* a, float* c, const int size); + template void launch_cos<__half>(const __half* a, __half* c, const int size); + template void launch_cos(const nv_bfloat16* a, nv_bfloat16* c, const int size); // tan template __global__ void tan_kernel(const T* A, T* C, const int size); @@ -123,17 +117,14 @@ namespace deepx::tensorfunc template - void launch_tan(int numBlocks, int blockSize, const T* a, T* c, const int size){ + void launch_tan(const T* a, T* c, const 
int size){ + auto [numBlocks, blockSize] = BestDims(size); tan_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error("Failed to launch tan kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch tan kernel",cudaGetLastError()); } - template void launch_tan(int numBlocks, int blockSize, const double* a, double* c, const int size); - template void launch_tan(int numBlocks, int blockSize, const float* a, float* c, const int size); - + + template void launch_tan( const double* a, double* c, const int size); + template void launch_tan(const float* a, float* c, const int size); } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh index a66e996..f0e68d1 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh @@ -11,60 +11,26 @@ namespace deepx::tensorfunc { + // sin template __global__ void sin_kernel(const T* A, T* C, const int size); template - void launch_sin(int numBlocks, int blockSize, const T* a, T* c, const int size); + void launch_sin(const T* a, T* c, const int size); - template <> - void launch_sin(int numBlocks, int blockSize, const double* a, double* c, const int size); - - template <> - void launch_sin(int numBlocks, int blockSize, const float* a, float* c, const int size); - - template <> - void launch_sin(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); - - template <> - void launch_sin<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); template __global__ void cos_kernel(const T* A, T* C, const int size); template - void launch_cos(int numBlocks, int blockSize, const T* a, T* c, const int size); - - template <> - void launch_cos(int 
numBlocks, int blockSize, const double* a, double* c, const int size); - - template <> - void launch_cos(int numBlocks, int blockSize, const float* a, float* c, const int size); - - template <> - void launch_cos(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); + void launch_cos( const T* a, T* c, const int size); - template <> - void launch_cos<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + // tan template __global__ void tan_kernel(const T* A, T* C, const int size); template - void launch_tan(int numBlocks, int blockSize, const T* a, T* c, const int size); - - template <> - void launch_tan(int numBlocks, int blockSize, const double* a, double* c, const int size); - - template <> - void launch_tan(int numBlocks, int blockSize, const float* a, float* c, const int size); - - template <> - void launch_tan(int numBlocks, int blockSize, const nv_bfloat16* a, nv_bfloat16* c, const int size); - - template <> - void launch_tan<__half>(int numBlocks, int blockSize, const __half* a, __half* c, const int size); - + void launch_tan( const T* a, T* c, const int size); } #endif \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp index 4a71c66..b126217 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp @@ -19,9 +19,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("sin"); } - const int blockSize = A.shape.size > 256 ? 
256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_sin(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_sin(A.data, C.data, A.shape.size); } }; @@ -33,9 +31,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("cos"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_cos(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_cos(A.data, C.data, A.shape.size); } }; @@ -47,9 +43,7 @@ namespace deepx::tensorfunc if (A.shape.size != C.shape.size) { throw TensorShapeError("tan"); } - const int blockSize = A.shape.size > 256 ? 256 : A.shape.size; - int numBlocks = (A.shape.size + blockSize - 1) / blockSize; - launch_tan(numBlocks, blockSize, A.data, C.data, A.shape.size); + launch_tan(A.data, C.data, A.shape.size); } }; diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu index 45b24be..c655b60 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cu @@ -24,12 +24,8 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); sqrt_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch sqrt kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch sqrt kernel",cudaGetLastError()); + } template void launch_sqrt(const double *a, double *c, const int size); template void launch_sqrt(const float *a, float *c, const int size); @@ -51,12 +47,8 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); pow_kernel<<>>(a, b, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to 
launch pow kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch pow kernel",cudaGetLastError()); + } template void launch_pow(const double *a, const double *b, double *c, const int size); template void launch_pow(const float *a, const float *b, float *c, const int size); @@ -76,12 +68,8 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); powscalar_kernel<<>>(a, scalar, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch powscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch powscalar kernel",cudaGetLastError()); + } template void launch_powscalar(const double *a, const double scalar, double *c, const int size); template void launch_powscalar(const float *a, const float scalar, float *c, const int size); @@ -101,12 +89,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); rpowscalar_kernel<<>>(scalar, a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch rpowscalar kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch rpowscalar kernel",cudaGetLastError()); } template void launch_rpowscalar(const double scalar, const double *a, double *c, const int size); template void launch_rpowscalar(const float scalar, const float *a, float *c, const int size); @@ -126,12 +109,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); log_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch log kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch log kernel",cudaGetLastError()); } template void launch_log(const double *a, double *c, const int size); template void launch_log(const float *a, float *c, const int size); 
@@ -152,12 +130,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); exp_kernel<<>>(a, c, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch exp kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch exp kernel",cudaGetLastError()); } template void launch_exp(const double *a, double *c, const int size); template void launch_exp(const float *a, float *c, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu index 7f8d261..cc7d708 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu @@ -24,12 +24,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); kernel_constant<<>>(a, value, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch constant kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch constant kernel",cudaGetLastError()); } template void launch_constant(double *a, const double value, const int size); @@ -65,12 +60,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); dropout_kernel<<>>(a, p, seed, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - throw std::runtime_error("Failed to launch dropout kernel: " + - std::string(cudaGetErrorString(err))); - } + throwcudaerror("Failed to launch dropout kernel",cudaGetLastError()); } template void launch_dropout(double *a, const float p, const unsigned int seed, const int size); template void launch_dropout(float *a, const float p, const unsigned int seed, const int size); @@ -98,12 +88,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = 
BestDims(size); kernel_arange<<>>(a, static_cast(start), static_cast(step), size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch arange kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch arange kernel",cudaGetLastError()); } template void launch_arange(double *a, const double start, const double step, const int size); template void launch_arange(float *a, const float start, const float step, const int size); @@ -138,12 +123,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); kernel_uniform<<>>(a, float(low), float(high), seed, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch uniform kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch uniform kernel",cudaGetLastError()); } template void launch_uniform(double *a, const double low, const double high, const unsigned int seed, const int size); template void launch_uniform(float *a, const float low, const float high, const unsigned int seed, const int size); @@ -176,12 +156,7 @@ namespace deepx::tensorfunc { auto [numBlocks, blockSize] = BestDims(size); kernel_normal<<>>(a, float(mean), float(stddev), seed, size); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to launch normal kernel"); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) - throw std::runtime_error("Failed to synchronize device"); + throwcudaerror("Failed to launch normal kernel",cudaGetLastError()); } template void launch_normal(double *a, const double mean, const double stddev, const unsigned int seed, const int size); template void launch_normal(float *a, const float mean, const float stddev, const unsigned int 
seed, const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp index 065923e..da58c2c 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/io_miaobyte.hpp @@ -22,7 +22,6 @@ namespace deepx::tensorfunc static void print(const Tensor &t, const std::string &f = "") { int64_t total_bytes = t.shape.bytes(); - // 统一分配CPU内存 unsigned char* device_data=reinterpret_cast(t.data); auto [_,host_data]= device_offload(device_data,total_bytes); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu index c6047ac..7aa90c4 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/reduce_miaobyte.cu @@ -100,6 +100,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch sum kernel",cudaGetLastError()); } template void launch_sum(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, @@ -196,6 +197,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch prod kernel",cudaGetLastError()); } template void launch_prod(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, @@ -304,6 +306,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch max kernel",cudaGetLastError()); }; template void launch_reducemax(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, @@ -412,6 +415,7 @@ namespace deepx::tensorfunc default: throw std::runtime_error("dim too large, max 
support " + std::to_string(MAX_DIM)); } + throwcudaerror("Failed to launch min kernel",cudaGetLastError()); } template void launch_reducemin(const double *tensor_data, const int *tensor_strides, const int tensor_dim, const int tensor_len, diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index 7f47046..864334c 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -339,6 +339,28 @@ namespace deepx::tensorfunc } }; + template <> + struct invertDispatcher + { + static void invert(const Tensor &A, Tensor &C) + { + if (A.shape == C.shape) + { + A.shape.rangeElementwiseParallel([&A, &C](int idx, int idx_end) + { + for (int j=0;j struct sqrtDispatcher>> { @@ -379,6 +401,8 @@ namespace deepx::tensorfunc } } }; + + // sqrt template struct sqrtDispatcher>> { diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index ba40d67..d7dbd78 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -119,8 +119,7 @@ def __radd__(self, other:Union[Number,'Tensor']): def __sub__(self, other:Union[Number,'Tensor']): return self.sub(other) def __rsub__(self, other:Union[Number,'Tensor']): - x=self.mul(-1) - return x.add(other) + return self.rsub(other) def __mul__(self, other:Union[Number,'Tensor']): return self.mul(other) def __rmul__(self, other:Union[Number,'Tensor']): diff --git a/front/py/examples/1_tensor/1_new.py b/front/py/examples/1_tensor/1_new.py index aed5e7c..6a99a44 100644 --- a/front/py/examples/1_tensor/1_new.py +++ b/front/py/examples/1_tensor/1_new.py @@ -18,7 +18,7 @@ def printall(t): def newtensor(dtype): from deepx.nn.functional import newtensor - t=newtensor(1,2,3,dtype=dtype) + t=newtensor((1,2,3),dtype=dtype) printall(t) diff --git a/front/py/examples/4_transformer/llama/llama_rope.py 
b/front/py/examples/4_transformer/llama/llama_rope.py index 53c8995..e0f0598 100644 --- a/front/py/examples/4_transformer/llama/llama_rope.py +++ b/front/py/examples/4_transformer/llama/llama_rope.py @@ -1,4 +1,4 @@ -from .llama_rope_torch import dir,config +from llama_rope_torch import dir,config ############-------DEEPX-------################ from deepx.nn.modules import Embedding,Module @@ -14,7 +14,8 @@ def __init__(self,configdict:dict): super().__init__() self.embed_tokens = Embedding(configdict["vocab_size"], configdict["hidden_size"],weight=embed_tokens_weight) self.rotary_emb = LlamaRotaryEmbedding(config=configdict) - + print("rotary_emb.inv_freq") + self.rotary_emb.inv_freq.print() def forward(self,x): inputs_embeds = self.embed_tokens(x) hidden_states = inputs_embeds diff --git a/front/py/examples/4_transformer/llama/llama_rope_torch.py b/front/py/examples/4_transformer/llama/llama_rope_torch.py index 4e9301c..3f894e3 100644 --- a/front/py/examples/4_transformer/llama/llama_rope_torch.py +++ b/front/py/examples/4_transformer/llama/llama_rope_torch.py @@ -56,7 +56,6 @@ def __init__(self, config: LlamaConfig): print(self.rotary_emb.inv_freq) def forward(self, x): inputs_embeds = self.embed_tokens(x) - print(inputs_embeds) hidden_states = inputs_embeds # create position embeddings to be shared across the decoder layers position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)