diff --git a/include/axono/core/ops.h b/include/axono/core/ops.h
new file mode 100644
index 0000000..6c4bc92
--- /dev/null
+++ b/include/axono/core/ops.h
@@ -0,0 +1,58 @@
+// axono/core/ops.h
+//
+// Operator registry: ops self-register at static-initialization time via
+// REGISTER_OP and are bound onto the python module in one pass (bind_all).
+#pragma once
+
+#include <functional>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+#include <pybind11/pybind11.h>
+
+namespace axono {
+namespace core {
+
+// Uniform signature shared by every registered operator: raw python args
+// in, python object out.
+using OpFunction = std::function<pybind11::object(const pybind11::args&)>;
+
+class OpRegistry {
+public:
+  // Meyers singleton: one registry per process.
+  static OpRegistry& instance() {
+    static OpRegistry registry;
+    return registry;
+  }
+
+  // Last registration wins; re-registering the same impl from several TUs
+  // that include the same op header is harmless.
+  void register_op(const std::string& name, OpFunction func) {
+    ops_[name] = std::move(func);
+  }
+
+  const OpFunction& get_op(const std::string& name) const {
+    auto it = ops_.find(name);
+    if (it == ops_.end()) {
+      throw std::runtime_error("算子 " + name + " 不存在。");
+    }
+    return it->second;
+  }
+
+  // Expose every registered op as a module-level python function on `m`.
+  void bind_all(pybind11::module& m) {
+    for (const auto& [name, func] : ops_) {
+      m.def(name.c_str(),
+            [func](const pybind11::args& args) { return func(args); });
+    }
+  }
+
+private:
+  OpRegistry() = default;
+  std::unordered_map<std::string, OpFunction> ops_;
+};
+
+// REGISTER_OP(name) { ...body... } registers `name` and opens the
+// definition of its implementation.
+// BUG FIX(review): the impl function is declared `inline` — it is defined
+// in a header, and without `inline` any two TUs including the same op
+// header would violate the ODR (duplicate-symbol link error).
+#define REGISTER_OP(name)                                                \
+  inline pybind11::object op_impl_##name(const pybind11::args& args);    \
+  struct RegisterOp_##name {                                             \
+    RegisterOp_##name() {                                                \
+      axono::core::OpRegistry::instance().register_op(                   \
+          #name, [](const pybind11::args& args) {                        \
+            return op_impl_##name(args);                                 \
+          });                                                            \
+    }                                                                    \
+  };                                                                     \
+  static RegisterOp_##name register_op_##name;                           \
+  inline pybind11::object op_impl_##name(const pybind11::args& args)
+
+}  // namespace core
+}  // namespace axono
diff --git a/include/axono/pybind/compute/operators/add.h b/include/axono/pybind/compute/operators/add.h
index dba7ec0..58f8045 100644
--- a/include/axono/pybind/compute/operators/add.h
+++ b/include/axono/pybind/compute/operators/add.h
@@ -2,79 +2,82 @@
 namespace py = pybind11;
 
+#include "axono/core/ops.h"
+
 #ifdef COMPILED_WITH_CUDA
 #include "axono/compute/cuda/operators/add.h"
 #endif
 #include "axono/compute/cpu/operators/add.h"
 
-void init_add_operations(py::module &m) {
-  m.def(
-      "add",
-      [](const axono::core::Tensor &a, const axono::core::Tensor &b) {
-        axono::core::Context ctx;
-        axono::core::Tensor result =
-            axono::core::Tensor(a.dtype(), a.shape(), a.device());
-
-        axono::core::Status status;
-        if (a.is_cuda()) {
-#ifdef COMPILED_WITH_CUDA
-          status = axono::compute::cuda::operators::Add(ctx, a, b, result);
-#endif
-        } else {
-          status = axono::compute::cpu::operators::Add(ctx, a, b, result);
-        }
-        if (status != axono::core::Status::OK) {
-          throw std::runtime_error(
-              "喵!计算矩阵加法的时候出现问题啦,错误代码:" +
-              std::to_string(static_cast<int>(status)));
-        }
-
-        return result;
-      },
-      "Element-wise addition of two tensors", py::arg("a"), py::arg("b"));
-
-  m.def(
-      "add_scalar",
-      [](const axono::core::Tensor &a, py::object scalar) {
-        axono::core::Context ctx;
-        axono::core::Tensor result;
-        axono::core::Status status;
-
-        // 将 Python 标量转换为 C++ 数据
-        if (a.dtype() == axono::core::DataType::FLOAT32) {
-          float value = scalar.cast<float>();
-          if (a.is_cuda()) {
-#ifdef COMPILED_WITH_CUDA
-            status = axono::compute::cuda::operators::AddScalar(
-                ctx, a, &value, sizeof(float), result);
-#endif
-          } else {
-            status = axono::compute::cpu::operators::AddScalar(
-                ctx, a, &value, sizeof(float), result);
-          }
-        }
-        if (status != axono::core::Status::OK) {
-          throw std::runtime_error("Add scalar operation failed");
-        } else if (a.dtype() == axono::core::DataType::INT32) {
-          int32_t value = scalar.cast<int32_t>();
-          if (a.is_cuda()) {
-#ifdef COMPILED_WITH_CUDA
-            status = axono::compute::cuda::operators::AddScalar(
-                ctx, a, &value, sizeof(int32_t), result);
-#endif
-          } else {
-            status = axono::compute::cpu::operators::AddScalar(
-                ctx, a, &value, sizeof(int32_t), result);
-          }
-
-          if (status != axono::core::Status::OK) {
-            throw std::runtime_error("喵!Add 操作出现了一些问题~");
-          }
-        } else {
-          throw std::runtime_error("喵!当前类型不支持执行Add操作喵~");
-        }
-        return result;
-      },
-      "Add scalar to tensor", py::arg("a"), py::arg("scalar"));
-}
+namespace axono {
+namespace compute {
+namespace operators {
+
+// add(a, b) -> Tensor : element-wise addition of two tensors.
+REGISTER_OP(add) {
+  if (args.size() != 2) {
+    throw std::runtime_error("执行 add 需要传入 2 个 Tensor 喵~");
+  }
+  const auto& a = pybind11::cast<const core::Tensor&>(args[0]);
+  const auto& b = pybind11::cast<const core::Tensor&>(args[1]);
+  core::Context ctx;
+  core::Tensor result(a.dtype(), a.shape(), a.device());
+  core::Status status;
+  if (a.is_cuda()) {
+#ifdef COMPILED_WITH_CUDA
+    status = cuda::operators::Add(ctx, a, b, result);
+#else
+    // BUG FIX(review): previously `status` was read uninitialized when a
+    // CUDA tensor reached a CUDA-less build.
+    throw std::runtime_error("add: this build has no CUDA support");
+#endif
+  } else {
+    status = cpu::operators::Add(ctx, a, b, result);
+  }
+  if (status != core::Status::OK) {
+    throw std::runtime_error("执行 add 时出现问题,错误代码:" +
+                             std::to_string(static_cast<int>(status)));
+  }
+  return pybind11::cast(result);
+}
+
+// add_scalar(a, scalar) -> Tensor : add a python scalar to every element.
+// Supports FLOAT32 and INT32 tensors.
+REGISTER_OP(add_scalar) {
+  if (args.size() != 2) {
+    throw std::runtime_error(
+        "执行 add_scalar 需要传入 1 个 Tensor, 1 个 Scalar 喵~");
+  }
+  const auto& a = pybind11::cast<const core::Tensor&>(args[0]);
+  py::object scalar = pybind11::cast<py::object>(args[1]);
+  core::Context ctx;
+  core::Tensor result;
+  core::Status status;
+  // BUG FIX(review): the dtype dispatch was entangled with the status
+  // check (an `else if` hung off `if (status != OK)`): a *successful*
+  // FLOAT32 add fell into the "unsupported dtype" branch, and an INT32
+  // add read `status` uninitialized (UB).  Dispatch on dtype first and
+  // check `status` exactly once afterwards.
+  if (a.dtype() == core::DataType::FLOAT32) {
+    float value = scalar.cast<float>();
+    if (a.is_cuda()) {
+#ifdef COMPILED_WITH_CUDA
+      status =
+          cuda::operators::AddScalar(ctx, a, &value, sizeof(float), result);
+#else
+      throw std::runtime_error("add_scalar: this build has no CUDA support");
+#endif
+    } else {
+      status =
+          cpu::operators::AddScalar(ctx, a, &value, sizeof(float), result);
+    }
+  } else if (a.dtype() == core::DataType::INT32) {
+    int32_t value = scalar.cast<int32_t>();
+    if (a.is_cuda()) {
+#ifdef COMPILED_WITH_CUDA
+      status =
+          cuda::operators::AddScalar(ctx, a, &value, sizeof(int32_t), result);
+#else
+      throw std::runtime_error("add_scalar: this build has no CUDA support");
+#endif
+    } else {
+      status =
+          cpu::operators::AddScalar(ctx, a, &value, sizeof(int32_t), result);
+    }
+  } else {
+    throw std::runtime_error("当前类型不支持执行 add_scalar 操作喵~");
+  }
+  if (status != core::Status::OK) {
+    throw std::runtime_error("执行 add_scalar 的时候出现问题,错误代码:" +
+                             std::to_string(static_cast<int>(status)));
+  }
+  return pybind11::cast(result);
+}
+
+}  // namespace operators
+}  // namespace compute
+}  // namespace axono
diff --git a/include/axono/pybind/compute/operators/matmul.h b/include/axono/pybind/compute/operators/matmul.h
index e3524a1..9ce6942 100644
--- a/include/axono/pybind/compute/operators/matmul.h
+++ b/include/axono/pybind/compute/operators/matmul.h
@@ -7,31 +7,38 @@
namespace py = pybind11; #endif #include "axono/compute/cpu/operators/matmul.h" -void init_matmul_operations(py::module &m) { - m.def( - "matmul", - [](const axono::core::Tensor &a, const axono::core::Tensor &b) { - axono::core::Context ctx; - axono::core::Tensor result; - axono::core::Status status; - - if (a.is_cuda()) { +namespace axono { +namespace compute { +namespace operators { + +py::object op_impl_matmul(const py::args& args); + +REGISTER_OP(matmul) { + if (args.size() != 2) { + throw std::runtime_error("执行 add 需要传入 2 个 Tensor 喵~"); + } + auto& a = pybind11::cast(args[0]); + auto& b = pybind11::cast(args[1]); + core::Context ctx; + core::Tensor result; + core::Status status; + + if (a.is_cuda()) { #ifdef COMPILED_WITH_CUDA - size_t m = a.shape()[0]; - size_t n = b.shape()[1]; - auto result = axono::core::Tensor( - a.dtype(), std::vector{m, n}, a.device()); - status = axono::compute::cuda::operators::MatMul(ctx, a, b, result); - return result; + size_t m = a.shape()[0]; + size_t n = b.shape()[1]; + auto result = core::Tensor(a.dtype(), std::vector{m, n}, a.device()); + status = cuda::operators::MatMul(ctx, a, b, result); #endif - } else { - status = axono::compute::cpu::operators::MatMul(ctx, a, b, result); - } - if (status != axono::core::Status::OK) { - throw std::runtime_error("喵!Matmul 操作 出现错误!"); - } - - return result; - }, - "Matrix multiplication of two tensors", py::arg("a"), py::arg("b")); + } else { + status = compute::cpu::operators::MatMul(ctx, a, b, result); + } + if (status != core::Status::OK) + throw std::runtime_error("执行 Matmul 时出现问题,错误代码:" + std::to_string(static_cast(status))); + + return pybind11::cast(result); +} + +} +} } diff --git a/include/axono/pybind/compute/ops/relu.h b/include/axono/pybind/compute/ops/relu.h index db888fa..4ca5873 100644 --- a/include/axono/pybind/compute/ops/relu.h +++ b/include/axono/pybind/compute/ops/relu.h @@ -3,55 +3,65 @@ namespace py = pybind11; #include "axono/core/tensor.h" +#include 
"axono/core/ops.h" #ifdef COMPILED_WITH_CUDA #include "axono/compute/cuda/ops/relu.h" #endif #include "axono/compute/cpu/ops/relu.h" -void init_relu_operations(py::module &m) { - m.def( - "relu", - [](const axono::core::Tensor &input) { - axono::core::Context ctx; - axono::core::Tensor output = - axono::core::Tensor(input.dtype(), input.shape(), input.device()); +namespace axono { +namespace compute { +namespace ops { - axono::core::Status status; - if (input.is_cuda()) { +py::object op_impl_relu(const py::args& args); +py::object op_impl_relu_(const py::args& args); + +REGISTER_OP(relu) { + core::Context ctx; + core::Tensor result; + core::Status status; + if (args.size() != 1) { + throw std::runtime_error("执行 add 需要传入 1 个 Tensor 喵~"); + } + + auto& input = pybind11::cast(args[0]); + core::Tensor output(input.dtype(), input.shape(), input.device()); + + if (input.is_cuda()) { #ifdef COMPILED_WITH_CUDA - status = axono::compute::cuda::ops::Relu(ctx, input, output); + status = cuda::ops::Relu(ctx, input, output); #endif - } else { - status = axono::compute::cpu::ops::Relu(ctx, input, output); - } - - if (status != axono::core::Status::OK) { - throw std::runtime_error("喵!ReLU计算时发生错误,错误代码: " + - std::to_string(static_cast(status))); - } - return output; - }, - "ReLU activation function", py::arg("input"), - py::return_value_policy::move), - - m.def( - "relu_", - [](axono::core::Tensor &tensor) { - axono::core::Context ctx; - axono::core::Status status; - if (tensor.is_cuda()) { + } else { + status = cpu::ops::Relu(ctx, input, output); + } + if (status != core::Status::OK) + throw std::runtime_error("执行 ReLU 时出现问题,错误代码:" + std::to_string(static_cast(status))); + + return pybind11::cast(output); +} +REGISTER_OP(relu_) { + core::Context ctx; + core::Tensor result; + core::Status status; + if (args.size() != 1) { + throw std::runtime_error("执行 add 需要传入 1 个 Tensor 喵~"); + } + + auto& tensor = pybind11::cast(args[0]); + + if (tensor.is_cuda()) { #ifdef COMPILED_WITH_CUDA - 
status = axono::compute::cuda::ops::ReluInplace(ctx, tensor); + status = cuda::ops::ReluInplace(ctx, tensor); #endif - } else { - status = axono::compute::cpu::ops::ReluInplace(ctx, tensor); - } - if (status != axono::core::Status::OK) { - throw std::runtime_error("喵!InplaceReLU 出现错误!"); - } - - return tensor; - }, - "Inplace ReLU activation function", py::arg("tensor")); + } else { + status = cpu::ops::ReluInplace(ctx, tensor); + } + if (status != core::Status::OK) + throw std::runtime_error("执行 ReLU 时出现问题,错误代码:" + std::to_string(static_cast(status))); + + return pybind11::cast(tensor); +} +} +} } diff --git a/python/src/pybind11_module.cpp b/python/src/pybind11_module.cpp index 878a573..25d5aea 100644 --- a/python/src/pybind11_module.cpp +++ b/python/src/pybind11_module.cpp @@ -7,6 +7,7 @@ #include "axono/pybind/compute/ops/relu.h" #include "axono/pybind/core/tensor.h" #include "axono/pybind/core/module.h" +#include "axono/core/ops.h" namespace py = pybind11; @@ -37,7 +38,5 @@ PYBIND11_MODULE(libaxono, m) { // 初始化 Tensor init_tensor(m); init_module(m); - init_matmul_operations(m); - init_add_operations(m); - init_relu_operations(m); + axono::core::OpRegistry::instance().bind_all(m); } diff --git a/src/compute/cuda/operators/randn.cu b/src/compute/cuda/operators/randn.cu index 285578f..6aa5b95 100644 --- a/src/compute/cuda/operators/randn.cu +++ b/src/compute/cuda/operators/randn.cu @@ -9,7 +9,6 @@ namespace compute { namespace cuda { namespace operators { -// CUDA 核函数:生成正态分布随机数 template __global__ void RandnKernel(T* data, size_t num_elements, float mean, float stddev, unsigned int seed) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -18,22 +17,20 @@ __global__ void RandnKernel(T* data, size_t num_elements, float mean, float stdd curandState state; curand_init(seed, idx, 0, &state); - // 生成标准正态分布(均值0,标准差1),再转换为目标分布 float val = curand_normal(&state); data[idx] = static_cast(mean + val * stddev); } -// 分派函数:根据数据类型调用对应核函数 template core::Status 
DispatchRandn(const core::Context& ctx, core::Tensor& out, float mean, float stddev) { + (void)ctx; size_t num_elements = out.num_elements(); if (num_elements == 0) return core::Status::OK; - // 生成随机种子(使用 std::random_device) - std::random_device rd; // 现在可正确识别 + std::random_device rd; unsigned int seed = rd(); - // 启动核函数 + // 启动核弹咯 const int block_size = 256; const int grid_size = (num_elements + block_size - 1) / block_size; RandnKernel<<>>( @@ -43,8 +40,8 @@ core::Status DispatchRandn(const core::Context& ctx, core::Tensor& out, float me return core::Status::OK; } -// 对外接口实现 core::Status Randn(const core::Context& ctx, core::Tensor& out, float mean, float stddev) { + (void)ctx; if (out.is_cuda()) { #ifdef COMPILED_WITH_CUDA switch (out.dtype()) { @@ -63,7 +60,6 @@ core::Status Randn(const core::Context& ctx, core::Tensor& out, float mean, floa } } -// 显式实例化模板(避免链接错误) template core::Status DispatchRandn(const core::Context&, core::Tensor&, float, float); template core::Status DispatchRandn(const core::Context&, core::Tensor&, float, float);