diff --git a/.github/workflows/autopackage_linux_x86_64_cpu.yml b/.github/workflows/autopackage_linux_x86_64_cpu.yml index 76b725d..7031fd3 100644 --- a/.github/workflows/autopackage_linux_x86_64_cpu.yml +++ b/.github/workflows/autopackage_linux_x86_64_cpu.yml @@ -1,16 +1,9 @@ name: Publish Axono Package (Linux x86_64 For CPU) -# on: -# push: -# tags: -# - 'v*' - on: push: - branches: [ dev ] - pull_request: - branches: [ dev ] - workflow_dispatch: # 添加手动触发器 + tags: + - 'v*' jobs: build: diff --git a/.github/workflows/autopackage_linux_x86_64_cuda.yml b/.github/workflows/autopackage_linux_x86_64_cuda.yml index b38ba73..1689e9c 100644 --- a/.github/workflows/autopackage_linux_x86_64_cuda.yml +++ b/.github/workflows/autopackage_linux_x86_64_cuda.yml @@ -1,16 +1,9 @@ name: Publish Axono Package (Linux x86_64 For CUDA) -# on: -# push: -# tags: -# - 'v*' - on: - push: - branches: [ dev ] - pull_request: - branches: [ dev ] - workflow_dispatch: # 添加手动触发器 + push: + tags: + - 'v*' jobs: build: diff --git a/include/axono/core/module.h b/include/axono/core/module.h new file mode 100644 index 0000000..35706ad --- /dev/null +++ b/include/axono/core/module.h @@ -0,0 +1,16 @@ +#include "tensor.h" + +namespace axono::core { +class Module { +private: + std::unordered_map<std::string, Tensor> weights_; // 存储权重张量 +public: + void add_weight(const std::string& name, const Tensor& weight) { + weights_[name] = weight; + } + Tensor& get_weight(const std::string& name) { + return weights_.at(name); + } + auto& weights() { return weights_; } +}; +} diff --git a/include/axono/pybind/core/module.h b/include/axono/pybind/core/module.h new file mode 100644 index 0000000..a4b2d94 --- /dev/null +++ b/include/axono/pybind/core/module.h @@ -0,0 +1,23 @@ +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include "axono/core/module.h" + +namespace py = pybind11; + +void init_module(py::module &m) { + py::class_<axono::core::Module>(m, "Module") + .def(py::init<>(), "创建一个空的 Module 实例") + .def("add_weight", + &axono::core::Module::add_weight, + py::arg("name"), 
py::arg("weight"), + "向模块添加权重张量") + .def("get_weight", + &axono::core::Module::get_weight, + py::arg("name"), + py::return_value_policy::reference_internal, + "获取指定名称的权重张量") + .def("weights", + &axono::core::Module::weights, + py::return_value_policy::reference_internal, + "返回模块中所有权重的映射"); +} diff --git a/pyproject.toml b/pyproject.toml index bfbc9ee..f6e8737 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ build-backend = "setuptools.build_meta" [project] name = "axono" -version = "0.1.0" +version = "0.2.0" description = "Deep learning framework with tensor operations and neural network modules" readme = "README.md" requires-python = ">=3.8" @@ -39,11 +39,7 @@ package-dir = { "" = "python" } packages = [ "axono", "axono.core", - "axono.data", - "axono.models", "axono.nn", - "axono.train", - "axono.viz", "axono.core.operators", "axono.core.ops" ] diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ce8fe75..c43ff7f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -16,26 +16,26 @@ endif() list(FILTER CORE_SOURCES EXCLUDE REGEX "\\.ipynb_checkpoints/") -pybind11_add_module(axonolib +pybind11_add_module(libaxono src/pybind11_module.cpp ${CORE_SOURCES} ) if(WITH_CUDA) -target_compile_definitions(axonolib PRIVATE COMPILED_WITH_CUDA) +target_compile_definitions(libaxono PRIVATE COMPILED_WITH_CUDA) endif() -target_include_directories(axonolib PRIVATE +target_include_directories(libaxono PRIVATE ${CMAKE_SOURCE_DIR}/include ) -target_include_directories(axonolib PRIVATE +target_include_directories(libaxono PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ) # Set output properties -set_target_properties(axonolib PROPERTIES - OUTPUT_NAME "axonolib" +set_target_properties(libaxono PROPERTIES + OUTPUT_NAME "libaxono" LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/python/axono/library" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/python/axono/library" PREFIX "" diff --git a/python/axono/__init__.py b/python/axono/__init__.py index 
d8cbde8..9ab0438 100644 --- a/python/axono/__init__.py +++ b/python/axono/__init__.py @@ -15,7 +15,7 @@ from .core import DataType, Status, Tensor, operators -__version__ = "0.1.0" +__version__ = "0.2.0" __author__ = "ByteRainLab" __description__ = "High performance computing library for big data processing" diff --git a/python/axono/core/__init__.py b/python/axono/core/__init__.py index 16edd9a..2667185 100644 --- a/python/axono/core/__init__.py +++ b/python/axono/core/__init__.py @@ -15,7 +15,7 @@ library_path = os.path.dirname(os.path.dirname(__file__)) + "/library/" sys.path.append(library_path) -from axonolib import DataType, Status # noqa: E402 +from libaxono import DataType, Status # noqa: E402 from . import operators # noqa: E402 from .tensor import Tensor # noqa: E402 diff --git a/python/axono/core/operators/add.py b/python/axono/core/operators/add.py index 88039f1..6145158 100644 --- a/python/axono/core/operators/add.py +++ b/python/axono/core/operators/add.py @@ -13,7 +13,7 @@ Axono Add """ -from axonolib import add as _add +from libaxono import add as _add from ..tensor import Tensor diff --git a/python/axono/core/operators/matmul.py b/python/axono/core/operators/matmul.py index 3af5c25..8abaedc 100644 --- a/python/axono/core/operators/matmul.py +++ b/python/axono/core/operators/matmul.py @@ -13,7 +13,7 @@ Axono Matmul """ -from axonolib import matmul as _matmul +from libaxono import matmul as _matmul from ..tensor import Tensor diff --git a/python/axono/core/ops/relu.py b/python/axono/core/ops/relu.py index d11854e..cff9c15 100644 --- a/python/axono/core/ops/relu.py +++ b/python/axono/core/ops/relu.py @@ -13,8 +13,8 @@ core.ops.Relu() """ -from axonolib import relu as relu_op -from axonolib import relu_ as relu_op_ +from libaxono import relu as relu_op +from libaxono import relu_ as relu_op_ from ..tensor import Tensor diff --git a/python/axono/core/tensor.py b/python/axono/core/tensor.py index a7f4dc8..c43ef1c 100644 --- a/python/axono/core/tensor.py 
+++ b/python/axono/core/tensor.py @@ -17,8 +17,8 @@ import os import numpy as np -from axonolib import DataType, Status -from axonolib import Tensor as _Tensor +from libaxono import DataType, Status +from libaxono import Tensor as _Tensor default_device = os.getenv("axono_default_device", "cpu") diff --git a/python/axono/data/dataloader.py b/python/axono/data/dataloader.py deleted file mode 100644 index 62930a2..0000000 --- a/python/axono/data/dataloader.py +++ /dev/null @@ -1,151 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Any, Callable, Dict, List, Optional - -import numpy as np -from PIL import Image - -from ..core import Tensor - - -class Dataset: - def __init__(self): - self.transform = None - - def __getitem__(self, index: int) -> Dict[str, Any]: - raise NotImplementedError - - def __len__(self) -> int: - raise NotImplementedError - - def set_transform(self, transform: Callable): - self.transform = transform - - -class DataLoader: - def __init__( - self, - dataset: Dataset, - batch_size: int = 1, - shuffle: bool = False, - num_workers: int = 0, - ): - self.dataset = dataset - self.batch_size = batch_size - self.shuffle = shuffle - self.num_workers = num_workers - - self._indices = list(range(len(dataset))) - - def __iter__(self): - if self.shuffle: - np.random.shuffle(self._indices) - - for i in range(0, len(self._indices), self.batch_size): - batch_indices = self._indices[i : i + self.batch_size] - batch = self._collate_fn([self.dataset[idx] for idx in batch_indices]) - yield batch - - def __len__(self): - return (len(self.dataset) + self.batch_size - 1) // self.batch_size - - def _collate_fn(self, batch: List[Dict[str, Any]]) -> Dict[str, Tensor]: - """Convert a list of samples to a batch""" - elem = batch[0] - if isinstance(elem, dict): - return { - key: self._collate_fn([d[key] for d in batch]) - if isinstance(elem[key], (dict, list)) - else Tensor.stack([d[key] for d in batch]) - if isinstance(elem[key], Tensor) - else Tensor.from_numpy(np.stack([d[key] for d in batch])) - for key in elem - } - elif isinstance(elem, list): - return [self._collate_fn([d[i] for d in batch]) for i in range(len(elem))] - else: - raise TypeError(f"Unsupported batch element type: {type(elem)}") - - -class ImageFolder(Dataset): - def __init__(self, root: str, transform: Optional[Callable] = None): - super().__init__() - self.root = root - self.transform = transform - - # Scan directory for images and classes - self._scan_dir() - - def _scan_dir(self): - """Scan directory and 
build dataset index""" - import os - - self.classes = sorted( - [ - d - for d in os.listdir(self.root) - if os.path.isdir(os.path.join(self.root, d)) - ] - ) - - self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)} - - self.samples = [] - for target_class in self.classes: - class_path = os.path.join(self.root, target_class) - if not os.path.isdir(class_path): - continue - - for root, _, fnames in sorted(os.walk(class_path)): - for fname in sorted(fnames): - if self._is_image_file(fname): - path = os.path.join(root, fname) - item = (path, self.class_to_idx[target_class]) - self.samples.append(item) - - def _is_image_file(self, filename: str) -> bool: - """Check if a file is an image""" - img_extensions = ( - ".jpg", - ".jpeg", - ".png", - ".ppm", - ".bmp", - ".pgm", - ".tif", - ".tiff", - ) - return filename.lower().endswith(img_extensions) - - def __getitem__(self, index: int) -> Dict[str, Any]: - """ - Args: - index (int): Index - - Returns: - Dict containing: - 'inputs': Tensor image - 'targets': Class label - """ - path, target = self.samples[index] - - # Load image - with open(path, "rb") as f: - img = Image.open(f).convert("RGB") - - if self.transform is not None: - img = self.transform(img) - - return {"inputs": img, "targets": target} - - def __len__(self) -> int: - return len(self.samples) diff --git a/python/axono/data/transforms.py b/python/axono/data/transforms.py deleted file mode 100644 index 6764ec5..0000000 --- a/python/axono/data/transforms.py +++ /dev/null @@ -1,121 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import random -from typing import List, Tuple, Union - -import numpy as np -from PIL import Image - - -class Transform: - def __call__(self, img): - raise NotImplementedError - - -class Compose(Transform): - def __init__(self, transforms: List[Transform]): - self.transforms = transforms - - def __call__(self, img): - for t in self.transforms: - img = t(img) - return img - - -class Resize(Transform): - def __init__(self, size: Union[int, Tuple[int, int]]): - self.size = size if isinstance(size, tuple) else (size, size) - - def __call__(self, img: Image.Image) -> Image.Image: - return img.resize(self.size, Image.BILINEAR) - - -class RandomCrop(Transform): - def __init__(self, size: Union[int, Tuple[int, int]], padding: int = 0): - self.size = size if isinstance(size, tuple) else (size, size) - self.padding = padding - - def __call__(self, img: Image.Image) -> Image.Image: - if self.padding > 0: - img = pad(img, self.padding) - - w, h = img.size - th, tw = self.size - - if w == tw and h == th: - return img - - i = random.randint(0, h - th) - j = random.randint(0, w - tw) - return img.crop((j, i, j + tw, i + th)) - - -class RandomHorizontalFlip(Transform): - def __init__(self, p: float = 0.5): - self.p = p - - def __call__(self, img: Image.Image) -> Image.Image: - if random.random() < self.p: - return img.transpose(Image.FLIP_LEFT_RIGHT) - return img - - -class RandomRotation(Transform): - def __init__(self, degrees: Union[float, Tuple[float, float]]): - if isinstance(degrees, float): - degrees = (-degrees, degrees) - self.degrees = degrees - - def __call__(self, img: Image.Image) -> Image.Image: - angle = random.uniform(self.degrees[0], self.degrees[1]) - return img.rotate(angle, Image.BILINEAR, expand=False) - - -class Normalize(Transform): - def __init__(self, mean: List[float], std: List[float]): - self.mean = np.array(mean) - self.std = np.array(std) - - def 
__call__(self, img: np.ndarray) -> np.ndarray: - img = np.array(img).astype(np.float32) / 255.0 - img = (img - self.mean) / self.std - return img - - -class ToTensor(Transform): - def __call__(self, img: Union[Image.Image, np.ndarray]): - if isinstance(img, Image.Image): - img = np.array(img) - - # Handle PIL Image - if len(img.shape) == 2: - img = img[:, :, None] - - # Convert HWC to CHW format - img = img.transpose((2, 0, 1)) - return img - - -def pad(img: Image.Image, padding: int) -> Image.Image: - """Helper function to pad an image""" - if isinstance(padding, int): - padding = (padding, padding, padding, padding) - elif isinstance(padding, tuple) and len(padding) == 2: - padding = (padding[0], padding[1], padding[0], padding[1]) - - w, h = img.size - new_w = w + padding[0] + padding[2] - new_h = h + padding[1] + padding[3] - - result = Image.new(img.mode, (new_w, new_h), 0) - result.paste(img, (padding[0], padding[1])) - return result diff --git a/python/axono/models/container.py b/python/axono/models/container.py deleted file mode 100644 index 7136d4c..0000000 --- a/python/axono/models/container.py +++ /dev/null @@ -1,95 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from collections import OrderedDict -from typing import Dict, List, Optional - -from ..core import Tensor -from ..nn import Module - - -class Sequential(Module): - def __init__(self, layers: List[Module]): - super().__init__() - self.layers = layers - - # Register layers as submodules - for i, layer in enumerate(layers): - self.add_module(f"layer_{i}", layer) - - def forward(self, x: Tensor) -> Tensor: - for layer in self.layers: - x = layer(x) - return x - - def add_module(self, name: str, module: Optional[Module]): - if module is not None: - self._modules[name] = module - - -class ModuleList(Module): - def __init__(self, modules: List[Module] = None): - super().__init__() - self._modules = OrderedDict() - if modules is not None: - for i, module in enumerate(modules): - self.add_module(str(i), module) - - def append(self, module: Module): - self.add_module(str(len(self)), module) - return self - - def extend(self, modules: List[Module]): - for module in modules: - self.append(module) - return self - - def __len__(self): - return len(self._modules) - - def __iter__(self): - return iter(self._modules.values()) - - def __getitem__(self, idx): - return list(self._modules.values())[idx] - - -class ModuleDict(Module): - def __init__(self, modules: Dict[str, Module] = None): - super().__init__() - self._modules = OrderedDict() - if modules is not None: - for key, module in modules.items(): - self.add_module(key, module) - - def __getitem__(self, key: str) -> Module: - return self._modules[key] - - def __setitem__(self, key: str, module: Module): - self.add_module(key, module) - - def __delitem__(self, key: str): - del self._modules[key] - - def __len__(self): - return len(self._modules) - - def __iter__(self): - return iter(self._modules.values()) - - def keys(self): - return self._modules.keys() - - def items(self): - return self._modules.items() - - def values(self): - return self._modules.values() diff --git a/python/axono/models/sequential.py 
b/python/axono/models/sequential.py deleted file mode 100644 index 7d24860..0000000 --- a/python/axono/models/sequential.py +++ /dev/null @@ -1,219 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import List, Tuple - -from ..core import Tensor -from ..nn import BatchNorm2d, Conv2d, Dropout, Linear, MaxPool2d, Module, ReLU -from .container import Sequential - - -class CNN(Module): - """A simple Convolutional Neural Network model.""" - - def __init__( - self, - input_channels: int, - num_classes: int, - hidden_channels: List[int] = [32, 64], - device: str = "cpu", - ): - super().__init__() - - layers = [] - in_channels = input_channels - - # Add convolutional layers - for out_channels in hidden_channels: - layers.extend( - [ - Conv2d( - in_channels, - out_channels, - kernel_size=3, - padding=1, - device=device, - ), - BatchNorm2d(out_channels, device=device), - ReLU(), - MaxPool2d(kernel_size=2), - ] - ) - in_channels = out_channels - - self.features = Sequential(layers) - - # Calculate the size of flattened features - self.avgpool = None # Will be initialized in forward - - # Add classifier - self.classifier = Sequential( - [ - Dropout(0.5), - Linear(hidden_channels[-1] * 7 * 7, 512, device=device), - ReLU(), - Dropout(0.5), - Linear(512, num_classes, device=device), - ] - ) - - def forward(self, x: Tensor) -> Tensor: - x = self.features(x) - if self.avgpool is None: - self.avgpool = x.shape[2] // 7 - x = x.view(x.shape[0], -1) - x = self.classifier(x) 
- return x - - -class RNN(Module): - """A simple Recurrent Neural Network model.""" - - def __init__( - self, - input_size: int, - hidden_size: int, - num_layers: int = 1, - dropout: float = 0.0, - device: str = "cpu", - ): - super().__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - - # Input gate - self.wx = Linear(input_size, hidden_size, device=device) - self.wh = Linear(hidden_size, hidden_size, device=device) - - # Additional layers - self.additional_layers = [] - for _ in range(num_layers - 1): - layer = Linear(hidden_size, hidden_size, device=device) - self.additional_layers.append(layer) - - self.dropout = Dropout(dropout) - self.activation = ReLU() - - def forward(self, x: Tensor, hidden: Tensor = None) -> Tuple[Tensor, Tensor]: - if hidden is None: - hidden = Tensor.zeros( - (self.num_layers, x.shape[0], self.hidden_size), device=x.device - ) - - outputs = [] - for t in range(x.shape[1]): - xt = x[:, t, :] - h = self.wx(xt) + self.wh(hidden[0]) - h = self.activation(h) - h = self.dropout(h) - - # Process additional layers - hidden_states = [h] - for i, layer in enumerate(self.additional_layers): - h = layer(h) + hidden[i + 1] - h = self.activation(h) - h = self.dropout(h) - hidden_states.append(h) - - hidden = Tensor.stack(hidden_states) - outputs.append(h) - - return Tensor.stack(outputs, dim=1), hidden - - -class LSTM(Module): - """Long Short-Term Memory network.""" - - def __init__( - self, - input_size: int, - hidden_size: int, - num_layers: int = 1, - dropout: float = 0.0, - device: str = "cpu", - ): - super().__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - - # Gates for each layer - self.layers = [] - layer_input_size = input_size - for _ in range(num_layers): - layer = { - "forget": Linear( - layer_input_size + hidden_size, hidden_size, device=device - ), - "input": Linear( - layer_input_size + hidden_size, hidden_size, device=device - ), - "cell": Linear( - layer_input_size + hidden_size, 
hidden_size, device=device - ), - "output": Linear( - layer_input_size + hidden_size, hidden_size, device=device - ), - } - self.layers.append(layer) - layer_input_size = hidden_size - - self.dropout = Dropout(dropout) - - def forward( - self, x: Tensor, hidden: Tuple[Tensor, Tensor] = None - ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: - batch_size = x.shape[0] - seq_length = x.shape[1] - - if hidden is None: - h = Tensor.zeros( - (self.num_layers, batch_size, self.hidden_size), device=x.device - ) - c = Tensor.zeros( - (self.num_layers, batch_size, self.hidden_size), device=x.device - ) - hidden = (h, c) - - h, c = hidden - output_sequence = [] - - for t in range(seq_length): - xt = x[:, t, :] - - for layer in range(self.num_layers): - if layer > 0: - xt = self.dropout(xt) - - layer_h = h[layer] - layer_c = c[layer] - - # Concatenate input and hidden state - combined = Tensor.cat([xt, layer_h], dim=1) - - # Gate computations - forget_gate = self.layers[layer]["forget"](combined).sigmoid() - input_gate = self.layers[layer]["input"](combined).sigmoid() - cell_gate = self.layers[layer]["cell"](combined).tanh() - output_gate = self.layers[layer]["output"](combined).sigmoid() - - # Update cell and hidden state - layer_c = forget_gate * layer_c + input_gate * cell_gate - layer_h = output_gate * layer_c.tanh() - - c[layer] = layer_c - h[layer] = layer_h - xt = layer_h - - output_sequence.append(h[-1]) - - return Tensor.stack(output_sequence, dim=1), (h, c) diff --git a/python/axono/models/transformer.py b/python/axono/models/transformer.py deleted file mode 100644 index 2752f5c..0000000 --- a/python/axono/models/transformer.py +++ /dev/null @@ -1,161 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional - -from ..core import Tensor -from ..nn import Dropout, LayerNorm, Linear, Module -from .container import Sequential - - -class MultiHeadAttention(Module): - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - bias: bool = True, - device: str = "cpu", - ): - super().__init__() - - if embed_dim % num_heads != 0: - raise ValueError( - f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" - ) - - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - self.scaling = self.head_dim**-0.5 - - self.q_proj = Linear(embed_dim, embed_dim, bias=bias, device=device) - self.k_proj = Linear(embed_dim, embed_dim, bias=bias, device=device) - self.v_proj = Linear(embed_dim, embed_dim, bias=bias, device=device) - self.out_proj = Linear(embed_dim, embed_dim, bias=bias, device=device) - - self.dropout_layer = Dropout(dropout) - - def forward( - self, query: Tensor, key: Tensor, value: Tensor, mask: Optional[Tensor] = None - ) -> Tensor: - batch_size = query.shape[0] - - # Linear projections and reshape - q = self.q_proj(query).view(batch_size, -1, self.num_heads, self.head_dim) - k = self.k_proj(key).view(batch_size, -1, self.num_heads, self.head_dim) - v = self.v_proj(value).view(batch_size, -1, self.num_heads, self.head_dim) - - # Transpose for attention computation - q = q.transpose(1, 2) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - - # Attention scores - attn_weights = (q @ k.transpose(-2, -1)) * self.scaling - - if mask is not 
None: - attn_weights = attn_weights.masked_fill(mask == 0, float("-inf")) - - attn_weights = attn_weights.softmax(dim=-1) - attn_weights = self.dropout_layer(attn_weights) - - # Attention output - attn_output = attn_weights @ v - - # Reshape and project output - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(batch_size, -1, self.embed_dim) - attn_output = self.out_proj(attn_output) - - return attn_output - - -class TransformerEncoderLayer(Module): - def __init__( - self, - d_model: int, - nhead: int, - dim_feedforward: int = 2048, - dropout: float = 0.1, - device: str = "cpu", - ): - super().__init__() - - self.self_attn = MultiHeadAttention( - d_model, nhead, dropout=dropout, device=device - ) - - self.linear1 = Linear(d_model, dim_feedforward, device=device) - self.dropout = Dropout(dropout) - self.linear2 = Linear(dim_feedforward, d_model, device=device) - - self.norm1 = LayerNorm(d_model, device=device) - self.norm2 = LayerNorm(d_model, device=device) - self.dropout1 = Dropout(dropout) - self.dropout2 = Dropout(dropout) - - self.activation = self.gelu - - def forward(self, src: Tensor, mask: Optional[Tensor] = None) -> Tensor: - src2 = self.self_attn(src, src, src, mask=mask) - src = src + self.dropout1(src2) - src = self.norm1(src) - - src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = src + self.dropout2(src2) - src = self.norm2(src) - - return src - - @staticmethod - def gelu(x: Tensor) -> Tensor: - return 0.5 * x * (1 + (x * 0.7978845608 * (1 + 0.044715 * x * x)).tanh()) - - -class Transformer(Module): - def __init__( - self, - d_model: int = 512, - nhead: int = 8, - num_encoder_layers: int = 6, - dim_feedforward: int = 2048, - dropout: float = 0.1, - device: str = "cpu", - ): - super().__init__() - - encoder_layers = [] - for _ in range(num_encoder_layers): - encoder_layers.append( - TransformerEncoderLayer( - d_model=d_model, - nhead=nhead, - dim_feedforward=dim_feedforward, - 
dropout=dropout, - device=device, - ) - ) - - self.encoder = Sequential(encoder_layers) - self.d_model = d_model - - self.reset_parameters() - - def reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - # Initialize weights with scaled normal distribution - p.data.normal_(mean=0.0, std=0.02) - - def forward(self, src: Tensor, mask: Optional[Tensor] = None) -> Tensor: - return self.encoder(src) diff --git a/python/axono/nn/__init__.py b/python/axono/nn/__init__.py new file mode 100644 index 0000000..c9b7d89 --- /dev/null +++ b/python/axono/nn/__init__.py @@ -0,0 +1,4 @@ +from .module import Module +from .layers import Linear + +__all__ = ["Module", "Linear"] diff --git a/python/axono/nn/layers.py b/python/axono/nn/layers.py index 26b268a..bfb2fe8 100644 --- a/python/axono/nn/layers.py +++ b/python/axono/nn/layers.py @@ -1,74 +1,10 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Optional, Tuple, Union - +import os +from typing import Optional import numpy as np - from ..core import Tensor from .module import Module - -class Conv2d(Module): - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]] = 1, - padding: Union[int, Tuple[int, int]] = 0, - bias: bool = True, - device: str = "cpu", - ): - super().__init__() - - if isinstance(kernel_size, int): - kernel_size = (kernel_size, kernel_size) - if isinstance(stride, int): - stride = (stride, stride) - if isinstance(padding, int): - padding = (padding, padding) - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - - # 初始化权重 - scale = np.sqrt(2.0 / (in_channels * kernel_size[0] * kernel_size[1])) - weight_data = np.random.normal( - 0, scale, (out_channels, in_channels, kernel_size[0], kernel_size[1]) - ) - - self._parameters["weight"] = Tensor.from_numpy(weight_data).to(device) - - if bias: - bias_data = np.zeros(out_channels) - self._parameters["bias"] = Tensor.from_numpy(bias_data).to(device) - else: - self._parameters["bias"] = None - - def forward(self, x: Tensor) -> Tensor: - # 使用CUDA kernel或优化的CPU实现 - from ..core.ops import conv2d - - return conv2d( - x, - self._parameters["weight"], - self._parameters.get("bias"), - self.stride, - self.padding, - ) - +device = os.getenv("axono_default_device", "cpu") class Linear(Module): def __init__( @@ -76,138 +12,42 @@ def __init__( in_features: int, out_features: int, bias: bool = True, - device: str = "cpu", + device: str = device ): super().__init__() + self._init_args = { + "in_features": in_features, + "out_features": out_features, + "bias": bias, + "device": device + } + + self.in_features = in_features + self.out_features = out_features + self.device = device - # 初始化权重 scale = np.sqrt(2.0 / in_features) - weight_data = 
np.random.normal(0, scale, (out_features, in_features)) - - self._parameters["weight"] = Tensor.from_numpy(weight_data).to(device) - + weight_data = np.random.normal( + loc=0.0, + scale=scale, + size=(out_features, in_features) + ).astype(np.float32) + weight_tensor = Tensor.from_numpy(weight_data).to(device) + print(1) + self.add_weight("weight", weight_tensor) if bias: - bias_data = np.zeros(out_features) - self._parameters["bias"] = Tensor.from_numpy(bias_data).to(device) + bias_data = np.zeros(out_features, dtype=np.float32) + bias_tensor = Tensor.from_numpy(bias_data).to(device) + self.add_weight("bias", bias_tensor) else: self._parameters["bias"] = None def forward(self, x: Tensor) -> Tensor: + """前向传播:y = x @ weight.T + bias(若启用)""" output = x @ self._parameters["weight"].T + + # 加上偏置(广播机制) if self._parameters["bias"] is not None: output = output + self._parameters["bias"] + return output - - -class BatchNorm2d(Module): - def __init__( - self, - num_features: int, - eps: float = 1e-5, - momentum: float = 0.1, - device: str = "cpu", - ): - super().__init__() - - self.num_features = num_features - self.eps = eps - self.momentum = momentum - - # 可学习参数 - self._parameters["weight"] = Tensor.from_numpy(np.ones(num_features)).to(device) - self._parameters["bias"] = Tensor.from_numpy(np.zeros(num_features)).to(device) - - # 运行时统计量 - self.register_buffer( - "running_mean", Tensor.from_numpy(np.zeros(num_features)).to(device) - ) - self.register_buffer( - "running_var", Tensor.from_numpy(np.ones(num_features)).to(device) - ) - - self.reset_parameters() - - def reset_parameters(self): - self.running_mean.fill_zero() - self.running_var.fill(1) - self._parameters["weight"].fill(1) - self._parameters["bias"].fill_zero() - - def forward(self, x: Tensor) -> Tensor: - if self.is_training: - # 计算批次统计量 - mean = x.mean(dim=(0, 2, 3)) - var = x.var(dim=(0, 2, 3), unbiased=False) - - # 更新运行时统计量 - self.running_mean = ( - 1 - self.momentum - ) * self.running_mean + self.momentum * 
mean - self.running_var = ( - 1 - self.momentum - ) * self.running_var + self.momentum * var - else: - mean = self.running_mean - var = self.running_var - - # 标准化 - x_normalized = (x - mean[None, :, None, None]) / ( - np.sqrt(var[None, :, None, None] + self.eps) - ) - - # 缩放和平移 - return ( - self._parameters["weight"][None, :, None, None] * x_normalized - + self._parameters["bias"][None, :, None, None] - ) - - -class ReLU(Module): - def forward(self, x: Tensor) -> Tensor: - from ..core.ops import relu - - return relu(x) - - -class MaxPool2d(Module): - def __init__( - self, - kernel_size: Union[int, Tuple[int, int]], - stride: Optional[Union[int, Tuple[int, int]]] = None, - padding: Union[int, Tuple[int, int]] = 0, - ): - super().__init__() - - if isinstance(kernel_size, int): - kernel_size = (kernel_size, kernel_size) - if stride is None: - stride = kernel_size - if isinstance(stride, int): - stride = (stride, stride) - if isinstance(padding, int): - padding = (padding, padding) - - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - - def forward(self, x: Tensor) -> Tensor: - from ..core.ops import max_pool2d - - return max_pool2d(x, self.kernel_size, self.stride, self.padding) - - -class Dropout(Module): - def __init__(self, p: float = 0.5): - super().__init__() - if p < 0 or p > 1: - raise ValueError("Dropout probability has to be between 0 and 1") - self.p = p - - def forward(self, x: Tensor) -> Tensor: - if self.is_training: - mask = Tensor.from_numpy( - (np.random.rand(*x.shape) > self.p).astype(np.float32) - ).to(x.device) - return x * mask / (1 - self.p) - return x diff --git a/python/axono/nn/module.py b/python/axono/nn/module.py index 8f17fe6..cb960d0 100644 --- a/python/axono/nn/module.py +++ b/python/axono/nn/module.py @@ -1,47 +1,49 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from abc import ABC, abstractmethod -from typing import List - +# python/axono/nn/module.py +from typing import Dict, List +from libaxono import Module as _Module +from libaxono import Tensor as _Tensor from ..core import Tensor - -class Module(ABC): +class Module: def __init__(self): - self._parameters = {} + self._parameters: Dict[str, Tensor] = {} + self._cpp_module = _Module() self._is_training = True + self._name = self.__class__.__name__ - @abstractmethod - def forward(self, x: Tensor) -> Tensor: - pass + def add_weight(self, name: str, tensor: Tensor) -> None: + self._parameters[name] = tensor + self._cpp_module.add_weight(name, tensor._tensor) - def __call__(self, x: Tensor) -> Tensor: - return self.forward(x) + def parameters(self) -> Dict[str, Tensor]: + for k, v in self._parameters.items(): + if type(v) == _Tensor: + self._parameters[k] = Tensor.from_raw(v) + return dict(self._parameters) - def train(self, mode: bool = True): + def train(self, mode: bool = True) -> "Module": self._is_training = mode return self - - def eval(self): - return self.train(False) - - @property - def is_training(self) -> bool: - return self._is_training - - def parameters(self) -> List[Tensor]: - return list(self._parameters.values()) - - def to(self, device: str) -> "Module": - for name, param in self._parameters.items(): - self._parameters[name] = param.to(device) - return self + def __repr__(self) -> str: + cls_name = self.__class__.__name__ + init_args = [] + if hasattr(self, '_init_args'): + init_args = [f"{k}={v}" for k, v in self._init_args.items()] + + if not 
hasattr(self, '_modules') or not self._modules: + if init_args: + return f"{cls_name}({', '.join(init_args)})" + else: + return f"{cls_name}()" + + lines = [f"{cls_name}("] + indent = " " + if init_args: + lines.append(f"{indent}{', '.join(init_args)},") + + for name, module in self._modules.items(): + submodule_repr = repr(module).replace("\n", f"\n{indent}") + lines.append(f"{indent}({name}): {submodule_repr}") + + lines.append(")") + return "\n".join(lines) diff --git a/python/axono/train/optimizer.py b/python/axono/train/optimizer.py deleted file mode 100644 index 5a87525..0000000 --- a/python/axono/train/optimizer.py +++ /dev/null @@ -1,122 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Any, Dict, List, Tuple - -from ..core import Tensor - - -class Optimizer: - def __init__(self, params: List[Tensor], lr: float = 0.01): - self.params = params - self.lr = lr - self._state: Dict[str, Any] = {} - - def step(self): - """Update parameters using gradients""" - raise NotImplementedError - - def zero_grad(self): - """Zero out parameter gradients""" - for param in self.params: - if param.grad is not None: - param.grad.fill_zero() - - -class SGD(Optimizer): - def __init__( - self, - params: List[Tensor], - lr: float = 0.01, - momentum: float = 0.0, - weight_decay: float = 0.0, - ): - super().__init__(params, lr) - self.momentum = momentum - self.weight_decay = weight_decay - - if momentum > 0: - self._state["momentum_buffer"] = [Tensor.zeros_like(p) for p in params] - - def step(self): - for i, param in enumerate(self.params): - if param.grad is None: - continue - - grad = param.grad - - if self.weight_decay != 0: - grad = grad + self.weight_decay * param - - if self.momentum > 0: - buf = self._state["momentum_buffer"][i] - buf = buf * self.momentum + grad - self._state["momentum_buffer"][i] = buf - grad = buf - - param -= self.lr * grad - - -class Adam(Optimizer): - def __init__( - self, - params: List[Tensor], - lr: float = 0.001, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-8, - weight_decay: float = 0.0, - ): - super().__init__(params, lr) - self.betas = betas - self.eps = eps - self.weight_decay = weight_decay - - self._state["step"] = 0 - self._state["exp_avg"] = [Tensor.zeros_like(p) for p in params] - self._state["exp_avg_sq"] = [Tensor.zeros_like(p) for p in params] - - def step(self): - self._state["step"] += 1 - - for i, param in enumerate(self.params): - if param.grad is None: - continue - - grad = param.grad - if self.weight_decay != 0: - grad = grad + self.weight_decay * param - - beta1, beta2 = self.betas - exp_avg = self._state["exp_avg"][i] - exp_avg_sq = self._state["exp_avg_sq"][i] - - # Update biased 
first moment estimate - exp_avg = beta1 * exp_avg + (1 - beta1) * grad - - # Update biased second raw moment estimate - exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad - - # Store updated moments - self._state["exp_avg"][i] = exp_avg - self._state["exp_avg_sq"][i] = exp_avg_sq - - # Bias correction - bias_correction1 = 1 - beta1 ** self._state["step"] - bias_correction2 = 1 - beta2 ** self._state["step"] - - # Compute bias-corrected moments - exp_avg_corrected = exp_avg / bias_correction1 - exp_avg_sq_corrected = exp_avg_sq / bias_correction2 - - # Update parameters - param -= ( - self.lr * exp_avg_corrected / (exp_avg_sq_corrected.sqrt() + self.eps) - ) diff --git a/python/axono/train/trainer.py b/python/axono/train/trainer.py deleted file mode 100644 index 00786f6..0000000 --- a/python/axono/train/trainer.py +++ /dev/null @@ -1,193 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import time -from typing import Any, Callable, Dict, Optional - -from ..core import Tensor -from ..nn import Module - - -class LossFunction: - @staticmethod - def mse_loss(pred: Tensor, target: Tensor) -> Tensor: - """Mean Squared Error Loss""" - return ((pred - target) ** 2).mean() - - @staticmethod - def cross_entropy_loss(pred: Tensor, target: Tensor) -> Tensor: - """Cross Entropy Loss""" - log_softmax = pred.log_softmax(dim=1) - return -(target * log_softmax).sum(dim=1).mean() - - @staticmethod - def bce_loss(pred: Tensor, target: Tensor) -> Tensor: - """Binary Cross Entropy Loss""" - return -(target * pred.log() + (1 - target) * (1 - pred).log()).mean() - - @staticmethod - def l1_loss(pred: Tensor, target: Tensor) -> Tensor: - """L1 Loss""" - return (pred - target).abs().mean() - - -class Trainer: - def __init__( - self, - model: Module, - optimizer: Any, # Will be implemented in optimizer.py - loss_fn: str = "cross_entropy", - device: str = "cpu", - callbacks: Optional[Dict[str, Callable]] = None, - ): - self.model = model.to(device) - self.optimizer = optimizer - self.device = device - self.callbacks = callbacks or {} - - # Set loss function - if isinstance(loss_fn, str): - if loss_fn == "mse": - self.loss_fn = LossFunction.mse_loss - elif loss_fn == "cross_entropy": - self.loss_fn = LossFunction.cross_entropy_loss - elif loss_fn == "bce": - self.loss_fn = LossFunction.bce_loss - elif loss_fn == "l1": - self.loss_fn = LossFunction.l1_loss - else: - raise ValueError(f"Unknown loss function: {loss_fn}") - else: - self.loss_fn = loss_fn - - def train_step(self, batch: Dict[str, Tensor]) -> Dict[str, float]: - """Single training step""" - self.model.train() - - # Move batch to device - inputs = batch["inputs"].to(self.device) - targets = batch["targets"].to(self.device) - - # Forward pass - outputs = self.model(inputs) - loss = self.loss_fn(outputs, targets) - - # Backward pass - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - - return 
{"loss": loss.item()} - - def eval_step(self, batch: Dict[str, Tensor]) -> Dict[str, float]: - """Single evaluation step""" - self.model.eval() - - with Tensor.no_grad(): - # Move batch to device - inputs = batch["inputs"].to(self.device) - targets = batch["targets"].to(self.device) - - # Forward pass - outputs = self.model(inputs) - loss = self.loss_fn(outputs, targets) - - # Calculate accuracy - predictions = outputs.argmax(dim=1) - correct = (predictions == targets.argmax(dim=1)).sum() - accuracy = correct.item() / targets.shape[0] - - return {"loss": loss.item(), "accuracy": accuracy} - - def fit( - self, - train_loader: Any, # Will be implemented in data.py - valid_loader: Optional[Any] = None, - epochs: int = 10, - log_interval: int = 100, - ): - """Train the model""" - for epoch in range(epochs): - start_time = time.time() - train_metrics = [] - - # Training loop - for i, batch in enumerate(train_loader): - metrics = self.train_step(batch) - train_metrics.append(metrics) - - if i % log_interval == 0: - metrics_str = ", ".join(f"{k}: {v:.4f}" for k, v in metrics.items()) - print( - f"Epoch {epoch + 1}/{epochs} " - f"[{i}/{len(train_loader)}] {metrics_str}" - ) - - # Calculate average training metrics - train_avg_metrics = {} - for key in train_metrics[0].keys(): - train_avg_metrics[key] = sum(m[key] for m in train_metrics) / len( - train_metrics - ) - - # Validation loop - if valid_loader is not None: - valid_metrics = [] - for batch in valid_loader: - metrics = self.eval_step(batch) - valid_metrics.append(metrics) - - # Calculate average validation metrics - valid_avg_metrics = {} - for key in valid_metrics[0].keys(): - valid_avg_metrics[key] = sum(m[key] for m in valid_metrics) / len( - valid_metrics - ) - - # Log epoch metrics - epoch_time = time.time() - start_time - metrics_str = ", ".join( - f"train_{k}: {v:.4f}" for k, v in train_avg_metrics.items() - ) - if valid_loader is not None: - metrics_str += ", " + ", ".join( - f"valid_{k}: {v:.4f}" for k, v 
in valid_avg_metrics.items() - ) - print( - f"Epoch {epoch + 1}/{epochs} completed in {epoch_time:.2f}s. " - f"Metrics: {metrics_str}" - ) - - # Call callbacks - if "on_epoch_end" in self.callbacks: - self.callbacks["on_epoch_end"]( - epoch=epoch, - metrics={ - "train": train_avg_metrics, - "valid": valid_avg_metrics if valid_loader else None, - }, - ) - - def evaluate(self, test_loader: Any) -> Dict[str, float]: - """Evaluate the model""" - test_metrics = [] - for batch in test_loader: - metrics = self.eval_step(batch) - test_metrics.append(metrics) - - # Calculate average test metrics - test_avg_metrics = {} - for key in test_metrics[0].keys(): - test_avg_metrics[key] = sum(m[key] for m in test_metrics) / len( - test_metrics - ) - - return test_avg_metrics diff --git a/python/axono/viz/visualizer.py b/python/axono/viz/visualizer.py deleted file mode 100644 index 6c9d091..0000000 --- a/python/axono/viz/visualizer.py +++ /dev/null @@ -1,245 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Dict, List, Tuple - -import matplotlib.pyplot as plt -import numpy as np - -from ..core import Tensor -from ..nn import Module - - -class ModelVisualizer: - """Visualize model architecture and computational graph""" - - def __init__(self, model: Module): - self.model = model - self.graph = {} - self._build_graph() - - def _build_graph(self): - """Build computational graph from model""" - - def _add_module(module: Module, name: str = ""): - for child_name, child in module.named_children(): - child_full_name = f"{name}.{child_name}" if name else child_name - self.graph[child_full_name] = { - "type": type(child).__name__, - "params": { - name: tensor.shape for name, tensor in child.named_parameters() - }, - "children": [], - } - _add_module(child, child_full_name) - - _add_module(self.model) - - def plot(self, figsize: Tuple[int, int] = (12, 8)) -> None: - """Plot model architecture""" - import networkx as nx - - graph = nx.DiGraph() - pos = {} - labels = {} - - # Add nodes and edges - y_offset = 0 - for name, info in self.graph.items(): - graph.add_node(name) - pos[name] = (len(name.split(".")), y_offset) - labels[name] = f"{info['type']}\n{name}" - y_offset += 1 - - # Add edges between parent and child modules - parent = ".".join(name.split(".")[:-1]) - if parent in self.graph: - graph.add_edge(parent, name) - - plt.figure(figsize=figsize) - nx.draw( - graph, - pos, - labels=labels, - with_labels=True, - node_color="lightblue", - node_size=2000, - font_size=8, - font_weight="bold", - ) - plt.title("Model Architecture") - plt.tight_layout() - plt.show() - - def summary(self) -> None: - """Print model summary""" - total_params = 0 - trainable_params = 0 - - print("Model Summary:") - print("=" * 80) - print(f"{'Layer':<40} {'Output Shape':<20} {'Param #':<10}") - print("-" * 80) - - for name, module in self.model.named_modules(): - params = sum(p.numel() for p in module.parameters()) - trainable = sum(p.numel() for p in module.parameters() if 
p.requires_grad) - - if params > 0: - print(f"{name:<40} {str(module):<20} {params:<10,d}") - total_params += params - trainable_params += trainable - - print("=" * 80) - print(f"Total params: {total_params:,}") - print(f"Trainable params: {trainable_params:,}") - print(f"Non-trainable params: {total_params - trainable_params:,}") - - -class TrainingVisualizer: - """Visualize training progress and metrics""" - - def __init__(self): - self.history = { - "train": {"loss": [], "accuracy": []}, - "valid": {"loss": [], "accuracy": []}, - } - self.current_epoch = 0 - - def update(self, metrics: Dict[str, Dict[str, float]], epoch: int) -> None: - """Update training history with new metrics""" - self.current_epoch = epoch - - for split in ["train", "valid"]: - if split in metrics: - for metric, value in metrics[split].items(): - self.history[split][metric].append(value) - - def plot_metrics(self, figsize: Tuple[int, int] = (12, 4)) -> None: - """Plot training metrics""" - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize) - - epochs = range(1, self.current_epoch + 2) - - # Plot loss - ax1.plot(epochs, self.history["train"]["loss"], "b-", label="Training") - if self.history["valid"]["loss"]: - ax1.plot(epochs, self.history["valid"]["loss"], "r-", label="Validation") - ax1.set_title("Loss") - ax1.set_xlabel("Epoch") - ax1.set_ylabel("Loss") - ax1.legend() - ax1.grid(True) - - # Plot accuracy - ax2.plot(epochs, self.history["train"]["accuracy"], "b-", label="Training") - if self.history["valid"]["accuracy"]: - ax2.plot( - epochs, self.history["valid"]["accuracy"], "r-", label="Validation" - ) - ax2.set_title("Accuracy") - ax2.set_xlabel("Epoch") - ax2.set_ylabel("Accuracy") - ax2.legend() - ax2.grid(True) - - plt.tight_layout() - plt.show() - - -class FeatureVisualizer: - """Visualize model's feature maps and filters""" - - def __init__(self, model: Module): - self.model = model - self.hooks = [] - self.feature_maps = {} - - def _hook_fn(self, name: str): - def 
hook(module, input, output): - self.feature_maps[name] = output.to_numpy() - - return hook - - def register_hooks(self, layer_names: List[str]) -> None: - """Register forward hooks for specified layers""" - for name, module in self.model.named_modules(): - if name in layer_names: - hook = module.register_forward_hook(self._hook_fn(name)) - self.hooks.append(hook) - - def remove_hooks(self) -> None: - """Remove all registered hooks""" - for hook in self.hooks: - hook.remove() - self.hooks.clear() - - def plot_feature_maps( - self, - input_tensor: Tensor, - layer_name: str, - num_features: int = 16, - figsize: Tuple[int, int] = (12, 8), - ) -> None: - """Plot feature maps for a specific layer""" - # Forward pass to get feature maps - _ = self.model(input_tensor) - - if layer_name not in self.feature_maps: - raise ValueError(f"No feature maps found for layer {layer_name}") - - feature_maps = self.feature_maps[layer_name][0] # First batch only - num_features = min(num_features, feature_maps.shape[0]) - - # Plot feature maps - fig, axes = plt.subplots(4, num_features // 4, figsize=figsize) - axes = axes.ravel() - - for i in range(num_features): - axes[i].imshow(feature_maps[i], cmap="viridis") - axes[i].axis("off") - - plt.suptitle(f"Feature Maps - {layer_name}") - plt.tight_layout() - plt.show() - - def plot_filters( - self, layer_name: str, num_filters: int = 16, figsize: Tuple[int, int] = (12, 8) - ) -> None: - """Plot convolutional filters for a specific layer""" - for name, module in self.model.named_modules(): - if name == layer_name: - if not hasattr(module, "weight"): - raise ValueError(f"Layer {layer_name} has no weights") - - weights = module.weight.to_numpy() - num_filters = min(num_filters, weights.shape[0]) - - # Plot filters - fig, axes = plt.subplots(4, num_filters // 4, figsize=figsize) - axes = axes.ravel() - - for i in range(num_filters): - # For RGB filters, take mean across channels - if weights.shape[1] == 3: - filt = np.mean(weights[i], axis=0) - 
else: - filt = weights[i, 0] - - axes[i].imshow(filt, cmap="viridis") - axes[i].axis("off") - - plt.suptitle(f"Convolution Filters - {layer_name}") - plt.tight_layout() - plt.show() - return - - raise ValueError(f"Layer {layer_name} not found in model") diff --git a/python/src/pybind11_module.cpp b/python/src/pybind11_module.cpp index 31fa823..878a573 100644 --- a/python/src/pybind11_module.cpp +++ b/python/src/pybind11_module.cpp @@ -6,10 +6,11 @@ #include "axono/pybind/compute/operators/matmul.h" #include "axono/pybind/compute/ops/relu.h" #include "axono/pybind/core/tensor.h" +#include "axono/pybind/core/module.h" namespace py = pybind11; -PYBIND11_MODULE(axonolib, m) { +PYBIND11_MODULE(libaxono, m) { m.doc() = "Axono Library"; // 数据类型枚举 @@ -35,6 +36,7 @@ PYBIND11_MODULE(axonolib, m) { // 初始化 Tensor init_tensor(m); + init_module(m); init_matmul_operations(m); init_add_operations(m); init_relu_operations(m); diff --git a/python/tests/core/nn/test_module.py b/python/tests/core/nn/test_module.py new file mode 100644 index 0000000..f5f91d9 --- /dev/null +++ b/python/tests/core/nn/test_module.py @@ -0,0 +1,41 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import sys +import unittest + +import numpy as np + +sys.path.insert( + 0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +) + +from axono.core import Tensor +from axono.nn import Module + +device = os.getenv("axono_default_device", "cpu") + + +class TestModule(unittest.TestCase): + """nn.Module 测试""" + def test_weight(self): + """测试权重功能""" + _Module = Module() + data = Tensor(shape=[1], device=device) + data.fill(1) + _Module.add_weight("weight", data) + # 测试填充 + _Module.parameters()["weight"].fill(2) + self.assertEqual(_Module.parameters()["weight"].shape, [1]) + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index ad3f31c..a8d0994 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -75,55 +75,73 @@ Tensor::Tensor(const Tensor &other) device_(other.device_), num_elements_(other.num_elements_) { if (other.data_) { - // 根据设备类型初始化存储 - if (device_ == other.device_) { - // 相同设备,分配内存并拷贝 - InitializeStorage(); - if (device_.substr(0, 4) == "cuda") { - // CUDA设备间的拷贝 + // 总是重新分配存储 + InitializeStorage(); + + // 执行设备间拷贝 + if (is_cuda()) { #ifdef COMPILED_WITH_CUDA + if (other.is_cuda()) { cuda::detail::cuda_memcpy_d2d(data_.get(), other.data_.get(), num_bytes()); -#endif } else { - // CPU设备间的拷贝 - std::memcpy(data_.get(), other.data_.get(), num_bytes()); + cuda::detail::cuda_memcpy_h2d(data_.get(), other.data_.get(), + num_bytes()); } +#endif } else { - // 不同设备,需要转换 - InitializeStorage(); - if (other.device_.substr(0, 4) == "cuda" && - device_.substr(0, 3) == "cpu") { + if (other.is_cuda()) { #ifdef COMPILED_WITH_CUDA - // CUDA -> CPU cuda::detail::cuda_memcpy_d2h(data_.get(), other.data_.get(), num_bytes()); -#endif - } else if (other.device_.substr(0, 3) == "cpu" && - device_.substr(0, 4) == "cuda") { - // CPU -> CUDA -#ifdef COMPILED_WITH_CUDA - cuda::detail::cuda_memcpy_h2d(data_.get(), other.data_.get(), - num_bytes()); #endif } else { - // 其他情况 - throw 
std::runtime_error("Unsupported device copy"); + std::memcpy(data_.get(), other.data_.get(), num_bytes()); } } } } Tensor &Tensor::operator=(const Tensor &other) { if (this != &other) { + // 清理旧数据 + data_.reset(); + + // 更新所有成员变量 dtype_ = other.dtype_; shape_ = other.shape_; + device_ = other.device_; // 重要:更新设备信息! num_elements_ = other.num_elements_; - + if (other.data_) { + // 重新初始化存储 InitializeStorage(); - std::memcpy(data_.get(), other.data_.get(), num_bytes()); - } else { - data_.reset(); + + // 执行设备间正确的拷贝 + if (device_ == other.device_) { + if (is_cuda()) { +#ifdef COMPILED_WITH_CUDA + cuda::detail::cuda_memcpy_d2d(data_.get(), other.data_.get(), + num_bytes()); +#endif + } else { + std::memcpy(data_.get(), other.data_.get(), num_bytes()); + } + } else { + // 跨设备拷贝 + if (other.is_cuda() && !is_cuda()) { +#ifdef COMPILED_WITH_CUDA + cuda::detail::cuda_memcpy_d2h(data_.get(), other.data_.get(), + num_bytes()); +#endif + } else if (!other.is_cuda() && is_cuda()) { +#ifdef COMPILED_WITH_CUDA + cuda::detail::cuda_memcpy_h2d(data_.get(), other.data_.get(), + num_bytes()); +#endif + } else { + throw std::runtime_error("Unsupported device copy"); + } + } } } return *this; @@ -211,7 +229,7 @@ void Tensor::InitializeStorage() { size_t bytes = num_bytes(); if (bytes == 0) return; - if (device_.substr(0, 4) == "cuda") { + if (is_cuda()) { #ifdef COMPILED_WITH_CUDA data_ = cuda::detail::CudaAllocateStorage(bytes, device_); #endif