diff --git a/include/axono/core/cpu/tensor/transpose.h b/include/axono/core/cpu/tensor/transpose.h
new file mode 100644
index 0000000..4ef0af1
--- /dev/null
+++ b/include/axono/core/cpu/tensor/transpose.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "axono/core/tensor.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cpu {
+namespace tensor {
+
+AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1);
+
+} // namespace tensor
+} // namespace cpu
+} // namespace core
+} // namespace axono
diff --git a/include/axono/core/cuda/tensor/transpose.h b/include/axono/core/cuda/tensor/transpose.h
new file mode 100644
index 0000000..1763e79
--- /dev/null
+++ b/include/axono/core/cuda/tensor/transpose.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "axono/core/tensor.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cuda {
+namespace tensor {
+
+AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1);
+
+} // namespace tensor
+} // namespace cuda
+} // namespace core
+} // namespace axono
diff --git a/include/axono/core/tensor.h b/include/axono/core/tensor.h
index 57e80ea..a214469 100644
--- a/include/axono/core/tensor.h
+++ b/include/axono/core/tensor.h
@@ -76,6 +76,9 @@ class Tensor {
   Status Reshape(const Shape &new_shape);
   Status Resize(const Shape &new_shape);
 
+  // Transpose
+  Tensor Transpose(int dim0 = -2, int dim1 = -1);
+
   // 填充操作
   Status FillZero();
   Status Fill(void *value, size_t value_size);
diff --git a/include/axono/pybind/core/tensor.h b/include/axono/pybind/core/tensor.h
index a71dafa..9671cda 100644
--- a/include/axono/pybind/core/tensor.h
+++ b/include/axono/pybind/core/tensor.h
@@ -33,6 +33,7 @@ void init_tensor(py::module &m) {
            py::arg("dtype"), py::arg("shape"), py::arg("device"))
       .def_static("create", &axono::core::Tensor::Create)
       .def_static("create_like", &axono::core::Tensor::CreateLike)
+      .def("transpose", &axono::core::Tensor::Transpose, py::arg("dim0") = -2, py::arg("dim1") = -1)
       .def("reshape", &axono::core::Tensor::Reshape)
       .def("resize", &axono::core::Tensor::Resize)
       .def(
diff --git a/python/axono/core/tensor.py b/python/axono/core/tensor.py
index c43ef1c..469fc8a 100644
--- a/python/axono/core/tensor.py
+++ b/python/axono/core/tensor.py
@@ -125,6 +125,11 @@ def __add__(self, other) -> "Tensor":
         from .operators import add
 
         return add(self, other)
+
+    def transpose(self, dim0: int = -2, dim1: int = -1) -> "Tensor":
+        """Return a new tensor with dimensions ``dim0`` and ``dim1`` swapped."""
+        new = self._tensor.transpose(dim0, dim1)
+        return Tensor.from_raw(new)
 
     def to_numpy(self) -> np.ndarray:
         """Convert tensor to numpy array - FIXED VERSION"""
diff --git a/python/tests/core/test_tensors.py b/python/tests/core/test_tensors.py
index fd3a343..38242ee 100644
--- a/python/tests/core/test_tensors.py
+++ b/python/tests/core/test_tensors.py
@@ -45,6 +45,13 @@ def test_tensor_data_types(self):
         for dtype in dtypes:
             tensor = Tensor(dtype=dtype, shape=[2, 2])
             self.assertEqual(tensor.dtype, dtype)
+
+    def test_tensor_transpose(self):
+        """测试 Tensor 转置"""
+        tensor = Tensor(dtype=DataType.FLOAT32, shape=[2, 3])
+        tensor.fill(1.0)
+        transposed_tensor = tensor.transpose()
+        self.assertEqual(transposed_tensor.shape, [3, 2])
 
     def test_tensor_fill(self):
         """测试 Tensor 填充"""
diff --git a/src/core/cpu/tensor/transpose.cpp b/src/core/cpu/tensor/transpose.cpp
new file mode 100644
index 0000000..ad8ec76
--- /dev/null
+++ b/src/core/cpu/tensor/transpose.cpp
@@ -0,0 +1,86 @@
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+#include "axono/core/tensor.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cpu {
+namespace tensor {
+
+namespace {
+// Copies src into dst with coordinates along dim0/dim1 exchanged.
+// Expects dims already normalized to [0, rank); throws on any inconsistency.
+template <typename T>
+void TransposeImpl(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  const auto& src_shape = src.shape();
+  const auto& dst_shape = dst.shape();
+  const size_t total_elems = src.num_elements();
+
+  if (total_elems == 0) return;
+  if (dst.num_elements() != total_elems) throw std::logic_error("src/dst elem count mismatch");
+  if (dim0 < 0 || dim0 >= (int)src_shape.size() || dim1 < 0 || dim1 >= (int)src_shape.size())
+    throw std::out_of_range("dim index out of bound");
+
+  // Row-major (C-contiguous) strides for both layouts.
+  std::vector<size_t> src_stride(src_shape.size(), 1);
+  std::vector<size_t> dst_stride(dst_shape.size(), 1);
+  for (int i = (int)src_shape.size() - 2; i >= 0; --i)
+    src_stride[i] = src_stride[i + 1] * src_shape[i + 1];
+  for (int i = (int)dst_shape.size() - 2; i >= 0; --i)
+    dst_stride[i] = dst_stride[i + 1] * dst_shape[i + 1];
+
+  const T* src_data = src.data<T>();
+  T* dst_data = dst.data<T>();
+  if (!src_data || !dst_data) throw std::runtime_error("null data pointer");
+
+  for (size_t linear_idx = 0; linear_idx < total_elems; ++linear_idx) {
+    // Decompose the linear source index into per-dim coordinates.
+    std::vector<size_t> coords(src_shape.size(), 0);
+    size_t rem = linear_idx;
+    for (int i = 0; i < (int)src_shape.size(); ++i) {
+      coords[i] = rem / src_stride[i];
+      rem %= src_stride[i];
+      if (coords[i] >= src_shape[i])
+        throw std::out_of_range("src coord out of bound");
+    }
+
+    std::swap(coords[dim0], coords[dim1]);
+
+    // Re-linearize against the destination shape.
+    size_t dst_idx = 0;
+    for (int i = 0; i < (int)dst_shape.size(); ++i) {
+      if (coords[i] >= dst_shape[i])
+        throw std::out_of_range("dst coord out of bound");
+      dst_idx += coords[i] * dst_stride[i];
+    }
+
+    if (dst_idx >= dst.num_elements())
+      throw std::out_of_range("dst linear index out of bound");
+
+    dst_data[dst_idx] = src_data[linear_idx];
+  }
+}
+
+// Selects the TransposeImpl instantiation matching src's dtype.
+Status DispatchDtype(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  switch (src.dtype()) {
+    case DataType::INT8: TransposeImpl<int8_t>(src, dst, dim0, dim1); break;
+    case DataType::INT16: TransposeImpl<int16_t>(src, dst, dim0, dim1); break;
+    case DataType::INT32: TransposeImpl<int32_t>(src, dst, dim0, dim1); break;
+    case DataType::INT64: TransposeImpl<int64_t>(src, dst, dim0, dim1); break;
+    case DataType::FLOAT32: TransposeImpl<float>(src, dst, dim0, dim1); break;
+    case DataType::FLOAT64: TransposeImpl<double>(src, dst, dim0, dim1); break;
+    case DataType::BOOLEAN: TransposeImpl<bool>(src, dst, dim0, dim1); break;
+    default: return Status::UNSUPPORTED_TYPE;
+  }
+  return Status::OK;
+}
+} // anonymous namespace
+
+// CPU entry point. Validates device/dtype, then dispatches on dtype.
+Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  if (src.device() != "cpu" || dst.device() != "cpu") return Status::DEVICE_MISMATCH;
+  if (src.dtype() != dst.dtype()) return Status::UNSUPPORTED_TYPE;
+  return DispatchDtype(src, dst, dim0, dim1);  // dims pre-normalized by Tensor::Transpose
+}
+
+} // namespace tensor
+} // namespace cpu
+} // namespace core
+} // namespace axono
diff --git a/src/core/cuda/tensor/transpose.cu b/src/core/cuda/tensor/transpose.cu
new file mode 100644
index 0000000..d8f76f3
--- /dev/null
+++ b/src/core/cuda/tensor/transpose.cu
@@ -0,0 +1,106 @@
+#include <cstdint>
+
+#include "axono/core/tensor.h"
+#include "axono/core/macros.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cuda {
+namespace tensor {
+
+namespace {
+// One thread per element; swaps the trailing two dims of a C-contiguous
+// layout (src_stride0/src_stride1 are the sizes of those dims).
+template <typename T>
+__global__ void Transpose2DKernel(const T* src, T* dst,
+                                  size_t dim0_size, size_t dim1_size,
+                                  size_t other_size, size_t src_stride0, size_t src_stride1) {
+  const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= other_size * dim0_size * dim1_size) return;
+
+  const size_t total_dim1_dim0 = dim1_size * dim0_size;
+  const size_t other_idx = idx / total_dim1_dim0;
+  const size_t rem = idx % total_dim1_dim0;
+  const size_t dim1_idx = rem / dim0_size;
+  const size_t dim0_idx = rem % dim0_size;
+
+  const size_t src_idx = other_idx * src_stride0 * src_stride1 +
+                         dim0_idx * src_stride1 +
+                         dim1_idx;
+  const size_t dst_idx = other_idx * src_stride0 * src_stride1 +
+                         dim1_idx * src_stride0 +
+                         dim0_idx;
+  dst[dst_idx] = src[src_idx];
+}
+
+template <typename T>
+Status LaunchTransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  dim0 = (dim0 < 0) ? static_cast<int>(src.shape().size()) + dim0 : dim0;
+  dim1 = (dim1 < 0) ? static_cast<int>(src.shape().size()) + dim1 : dim1;
+
+  // The index math in Transpose2DKernel only handles transposing the trailing
+  // two dims (the default dim0 = -2, dim1 = -1); reject anything else rather
+  // than silently producing wrong results. TODO(review): add a general path.
+  const int rank = static_cast<int>(src.shape().size());
+  if (dim0 > dim1) { const int tmp = dim0; dim0 = dim1; dim1 = tmp; }
+  if (dim0 != rank - 2 || dim1 != rank - 1) return Status::UNSUPPORTED_TYPE;
+
+  const size_t dim0_size = src.shape()[dim0];
+  const size_t dim1_size = src.shape()[dim1];
+  size_t other_size = 1;
+  for (int i = 0; i < static_cast<int>(src.shape().size()); ++i) {
+    if (i != dim0 && i != dim1) {
+      other_size *= src.shape()[i];
+    }
+  }
+
+  const size_t total_elements = other_size * dim0_size * dim1_size;
+  const int block_size = 256;
+  const int grid_size = (total_elements + block_size - 1) / block_size;
+
+  Transpose2DKernel<T><<<grid_size, block_size>>>(
+      src.data<T>(), dst.data<T>(),
+      dim0_size, dim1_size, other_size,
+      dim0_size, dim1_size
+  );
+
+  // 检查 CUDA 错误
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    return Status::DEVICE_ERROR;
+  }
+  return Status::OK;
+}
+} // anonymous namespace
+
+// CUDA entry point. Validates device/dtype, then dispatches on dtype.
+Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  if (!src.is_cuda() || !dst.is_cuda()) {
+    return Status::DEVICE_MISMATCH;
+  }
+  if (src.dtype() != dst.dtype()) {
+    return Status::UNSUPPORTED_TYPE;
+  }
+
+  const DataType dtype = src.dtype();
+  switch (dtype) {
+    case DataType::INT8:
+      return LaunchTransposeKernel<int8_t>(src, dst, dim0, dim1);
+    case DataType::INT16:
+      return LaunchTransposeKernel<int16_t>(src, dst, dim0, dim1);
+    case DataType::INT32:
+      return LaunchTransposeKernel<int32_t>(src, dst, dim0, dim1);
+    case DataType::INT64:
+      return LaunchTransposeKernel<int64_t>(src, dst, dim0, dim1);
+    case DataType::FLOAT32:
+      return LaunchTransposeKernel<float>(src, dst, dim0, dim1);
+    case DataType::FLOAT64:
+      return LaunchTransposeKernel<double>(src, dst, dim0, dim1);
+    case DataType::BOOLEAN:
+      return LaunchTransposeKernel<bool>(src, dst, dim0, dim1);
+    default:
+      return Status::UNSUPPORTED_TYPE;
+  }
+}
+
+} // namespace tensor
+} // namespace cuda
+} // namespace core
+} // namespace axono
diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp
index 016d23e..836f6fc 100644
--- a/src/core/tensor.cpp
+++ b/src/core/tensor.cpp
@@ -1,19 +1,21 @@
-#include "axono/core/tensor.h"
-
 #include <cstring>
 #include <memory>
 #include <sstream>
 #include <stdexcept>  // std::runtime_error
+#include "axono/core/tensor.h" + #ifdef COMPILED_WITH_CUDA #include "axono/ops/cuda/randn.h" #include "axono/core/cuda/detail.h" #include "axono/core/cuda/tensor/kernel.h" +#include "axono/core/cuda/tensor/transpose.h" #endif #include "axono/ops/cpu/randn.h" #include "axono/core/cpu/tensor/kernel.h" #include "axono/core/types.h" +#include "axono/core/cpu/tensor/transpose.h" namespace { // 自定义删除器,用于 shared_ptr @@ -390,5 +392,44 @@ std::string Tensor::ToString() const { return oss.str(); } +Tensor Tensor::Transpose(int dim0, int dim1) { + const int n_dim = static_cast(this->ndim()); + dim0 = (dim0 < 0) ? (n_dim + dim0) : dim0; + dim1 = (dim1 < 0) ? (n_dim + dim1) : dim1; + + if (dim0 < 0 || dim0 >= n_dim || dim1 < 0 || dim1 >= n_dim) { + throw std::invalid_argument( + "Transpose: invalid dims, ndim=" + std::to_string(n_dim) + + ", dim0=" + std::to_string(dim0) + ", dim1=" + std::to_string(dim1) + ); + } + if (dim0 == dim1) { + return *this; + } + + Shape dst_shape = this->shape_; + std::swap(dst_shape[dim0], dst_shape[dim1]); + + Tensor dst(this->dtype_, dst_shape, this->device_); + dst.InitializeStorage(); + + Status status; + if (this->is_cuda()) { +#ifdef COMPILED_WITH_CUDA + status = cuda::tensor::TransposeKernel(*this, dst, dim0, dim1); +#else + throw std::runtime_error("Transpose: CUDA not compiled, but tensor is on cuda"); +#endif + } else { + status = cpu::tensor::TransposeKernel(*this, dst, dim0, dim1); + } + + if (status != Status::OK) { + throw std::runtime_error("Transpose failed, status=" + std::to_string(static_cast(status))); + } + + return dst; +} + } // namespace core } // namespace axono