From f648ca02ca56b897b700866adb4d56c0de19da85 Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:05:44 +0800 Subject: [PATCH 01/12] Create transpose.cpp --- include/axono/core/cpu/tensor/transpose.cpp | 86 +++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 include/axono/core/cpu/tensor/transpose.cpp diff --git a/include/axono/core/cpu/tensor/transpose.cpp b/include/axono/core/cpu/tensor/transpose.cpp new file mode 100644 index 0000000..da1a0cc --- /dev/null +++ b/include/axono/core/cpu/tensor/transpose.cpp @@ -0,0 +1,86 @@ +#include "axono/core/tensor.h" +#include "axono/core/types.h" + +#include +#include +#include + +namespace axono { +namespace core { +namespace cpu { +namespace tensor { + +namespace { +template +void TransposeImpl(const Tensor& src, Tensor& dst, int dim0, int dim1) { + const auto& src_shape = src.shape(); + const auto& dst_shape = dst.shape(); + const size_t total_elems = src.num_elements(); + + if (total_elems == 0) return; + if (dst.num_elements() != total_elems) throw std::logic_error("src/dst elem count mismatch"); + if (dim0 < 0 || dim0 >= (int)src_shape.size() || dim1 < 0 || dim1 >= (int)src_shape.size()) + throw std::out_of_range("dim index out of bound"); + + std::vector src_stride(src_shape.size(), 1); + std::vector dst_stride(dst_shape.size(), 1); + for (int i = (int)src_shape.size() - 2; i >= 0; --i) + src_stride[i] = src_stride[i+1] * src_shape[i+1]; + for (int i = (int)dst_shape.size() - 2; i >= 0; --i) + dst_stride[i] = dst_stride[i+1] * dst_shape[i+1]; + + const T* src_data = src.data(); + T* dst_data = dst.data(); + if (!src_data || !dst_data) throw std::runtime_error("null data pointer"); + + for (size_t linear_idx = 0; linear_idx < total_elems; ++linear_idx) { + std::vector coords(src_shape.size(), 0); + size_t rem = linear_idx; + for (int i = 0; i < (int)src_shape.size(); ++i) { + coords[i] = rem / src_stride[i]; + rem %= src_stride[i]; + if (coords[i] >= src_shape[i]) + throw std::out_of_range("src coord out of bound"); + } + + std::swap(coords[dim0], coords[dim1]); + + size_t dst_idx = 0; + for (int i = 0; i < (int)dst_shape.size(); ++i) { + if (coords[i] >= dst_shape[i]) + throw std::out_of_range("dst coord out of bound"); + dst_idx += coords[i] * dst_stride[i]; + } + + if (dst_idx >= dst.num_elements()) + throw std::out_of_range("dst linear index out of bound"); + + dst_data[dst_idx] = src_data[linear_idx]; + } +} + +Status DispatchDtype(const Tensor& src, Tensor& dst, int dim0, int dim1) { + switch (src.dtype()) { + case DataType::INT8: TransposeImpl(src, dst, dim0, dim1); break; + case DataType::INT16: TransposeImpl(src, dst, dim0, dim1); break; + case DataType::INT32: TransposeImpl(src, dst, dim0, dim1); break; + case DataType::INT64: TransposeImpl(src, dst, dim0, dim1); break; + case DataType::FLOAT32: TransposeImpl(src, dst, dim0, dim1); break; + case DataType::FLOAT64: TransposeImpl(src, dst, dim0, dim1); break; + case DataType::BOOLEAN: TransposeImpl(src, dst, dim0, dim1); break; + default: return Status::UNSUPPORTED_TYPE; + } + return Status::OK; +} +} // anonymous namespace + +Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) { + if (src.device() != "cpu" || dst.device() != "cpu") return Status::DEVICE_MISMATCH; + if (src.dtype() != dst.dtype()) return Status::UNSUPPORTED_TYPE; + if (src.num_elements() == 0) return Status::OK; +} + +} // namespace tensor +} // namespace cpu +} // namespace core +} // namespace axono From ceb2edb9f07bd44dd81a13b043446be30960fd19 Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:06:31 +0800 Subject: [PATCH 02/12] Implement CUDA kernel for 2D tensor transposition --- include/axono/core/cuda/tensor/transpose.cu | 100 ++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 include/axono/core/cuda/tensor/transpose.cu diff --git a/include/axono/core/cuda/tensor/transpose.cu b/include/axono/core/cuda/tensor/transpose.cu new file mode 100644 index 0000000..d8f76f3 --- /dev/null +++ b/include/axono/core/cuda/tensor/transpose.cu @@ -0,0 +1,100 @@ +#include + +#include "axono/core/tensor.h" +#include "axono/core/macros.h" +#include "axono/core/types.h" + +namespace axono { +namespace core { +namespace cuda { +namespace tensor { + +namespace { +template +__global__ void Transpose2DKernel(const T* src, T* dst, + size_t dim0_size, size_t dim1_size, + size_t other_size, size_t src_stride0, size_t src_stride1) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= other_size * dim0_size * dim1_size) return; + + const size_t total_dim1_dim0 = dim1_size * dim0_size; + const size_t other_idx = idx / total_dim1_dim0; + const size_t rem = idx % total_dim1_dim0; + const size_t dim1_idx = rem / dim0_size; + const size_t dim0_idx = rem % dim0_size; + + const size_t src_idx = other_idx * src_stride0 * src_stride1 + + dim0_idx * src_stride1 + + dim1_idx; + const size_t dst_idx = other_idx * src_stride0 * src_stride1 + + dim1_idx * src_stride0 + + dim0_idx; + dst[dst_idx] = src[src_idx]; +} + +template +Status LaunchTransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) { + dim0 = (dim0 < 0) ? static_cast(src.shape().size()) + dim0 : dim0; + dim1 = (dim1 < 0) ? static_cast(src.shape().size()) + dim1 : dim1; + + const size_t dim0_size = src.shape()[dim0]; + const size_t dim1_size = src.shape()[dim1]; + size_t other_size = 1; + for (int i = 0; i < static_cast(src.shape().size()); ++i) { + if (i != dim0 && i != dim1) { + other_size *= src.shape()[i]; + } + } + + const size_t total_elements = other_size * dim0_size * dim1_size; + const int block_size = 256; + const int grid_size = (total_elements + block_size - 1) / block_size; + + Transpose2DKernel<<>>( + src.data(), dst.data(), + dim0_size, dim1_size, other_size, + dim0_size, dim1_size + ); + + // 检查 CUDA 错误 + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + return Status::DEVICE_ERROR; + } + return Status::OK; +} +} // anonymous namespace + +Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) { + if (!src.is_cuda() || !dst.is_cuda()) { + return Status::DEVICE_MISMATCH; + } + if (src.dtype() != dst.dtype()) { + return Status::UNSUPPORTED_TYPE; + } + + const DataType dtype = src.dtype(); + switch (dtype) { + case DataType::INT8: + return LaunchTransposeKernel(src, dst, dim0, dim1); + case DataType::INT16: + return LaunchTransposeKernel(src, dst, dim0, dim1); + case DataType::INT32: + return LaunchTransposeKernel(src, dst, dim0, dim1); + case DataType::INT64: + return LaunchTransposeKernel(src, dst, dim0, dim1); + case DataType::FLOAT32: + return LaunchTransposeKernel(src, dst, dim0, dim1); + case DataType::FLOAT64: + return LaunchTransposeKernel(src, dst, dim0, dim1); + case DataType::BOOLEAN: + return LaunchTransposeKernel(src, dst, dim0, dim1); + default: + return Status::UNSUPPORTED_TYPE; + } +} + +} // namespace tensor +} // namespace cuda +} // namespace core +} // namespace axono From 061e44f4459bb6663b7c5f45f4c2ba5a33c10c5c Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:07:25 +0800 Subject: [PATCH 03/12] Rename transpose.cu to transpose.cu --- {include => src}/axono/core/cuda/tensor/transpose.cu | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {include => src}/axono/core/cuda/tensor/transpose.cu (100%) diff --git a/include/axono/core/cuda/tensor/transpose.cu b/src/axono/core/cuda/tensor/transpose.cu similarity index 100% rename from include/axono/core/cuda/tensor/transpose.cu rename to src/axono/core/cuda/tensor/transpose.cu From e3237fcfa30e8af636861084d5089d91f803378a Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:07:45 +0800 Subject: [PATCH 04/12] Rename src/axono/core/cuda/tensor/transpose.cu to src/core/cuda/tensor/transpose.cu --- src/{axono => }/core/cuda/tensor/transpose.cu | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{axono => }/core/cuda/tensor/transpose.cu (100%) diff --git a/src/axono/core/cuda/tensor/transpose.cu b/src/core/cuda/tensor/transpose.cu similarity index 100% rename from src/axono/core/cuda/tensor/transpose.cu rename to src/core/cuda/tensor/transpose.cu From f5b7acf4cc654876afbbbb3ea0c6305c3e73c330 Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:08:24 +0800 Subject: [PATCH 05/12] Update and rename include/axono/core/cpu/tensor/transpose.cpp to src/core/cpu/tensor/transpose.cpp --- {include/axono => src}/core/cpu/tensor/transpose.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename {include/axono => src}/core/cpu/tensor/transpose.cpp (100%) diff --git a/include/axono/core/cpu/tensor/transpose.cpp b/src/core/cpu/tensor/transpose.cpp similarity index 100% rename from include/axono/core/cpu/tensor/transpose.cpp rename to src/core/cpu/tensor/transpose.cpp index da1a0cc..ad8ec76 100644 --- a/include/axono/core/cpu/tensor/transpose.cpp +++ b/src/core/cpu/tensor/transpose.cpp @@ -1,10 +1,10 @@ -#include "axono/core/tensor.h" -#include "axono/core/types.h" - #include #include #include +#include "axono/core/tensor.h" +#include "axono/core/types.h" + namespace axono { namespace core { namespace cpu { From b129f0747d4704a857b6b811943a242c699e4fca Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:08:54 +0800 Subject: [PATCH 06/12] Update tensor.cpp --- src/core/tensor.cpp | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp index 016d23e..836f6fc 100644 --- a/src/core/tensor.cpp +++ b/src/core/tensor.cpp @@ -1,19 +1,21 @@ -#include "axono/core/tensor.h" - #include #include #include #include // std::runtime_error +#include "axono/core/tensor.h" + #ifdef COMPILED_WITH_CUDA #include "axono/ops/cuda/randn.h" #include "axono/core/cuda/detail.h" #include "axono/core/cuda/tensor/kernel.h" +#include "axono/core/cuda/tensor/transpose.h" #endif #include "axono/ops/cpu/randn.h" #include "axono/core/cpu/tensor/kernel.h" #include "axono/core/types.h" +#include "axono/core/cpu/tensor/transpose.h" namespace { // 自定义删除器,用于 shared_ptr @@ -390,5 +392,44 @@ std::string Tensor::ToString() const { return oss.str(); } +Tensor Tensor::Transpose(int dim0, int dim1) { + const int n_dim = static_cast(this->ndim()); + dim0 = (dim0 < 0) ? (n_dim + dim0) : dim0; + dim1 = (dim1 < 0) ? (n_dim + dim1) : dim1; + + if (dim0 < 0 || dim0 >= n_dim || dim1 < 0 || dim1 >= n_dim) { + throw std::invalid_argument( + "Transpose: invalid dims, ndim=" + std::to_string(n_dim) + + ", dim0=" + std::to_string(dim0) + ", dim1=" + std::to_string(dim1) + ); + } + if (dim0 == dim1) { + return *this; + } + + Shape dst_shape = this->shape_; + std::swap(dst_shape[dim0], dst_shape[dim1]); + + Tensor dst(this->dtype_, dst_shape, this->device_); + dst.InitializeStorage(); + + Status status; + if (this->is_cuda()) { +#ifdef COMPILED_WITH_CUDA + status = cuda::tensor::TransposeKernel(*this, dst, dim0, dim1); +#else + throw std::runtime_error("Transpose: CUDA not compiled, but tensor is on cuda"); +#endif + } else { + status = cpu::tensor::TransposeKernel(*this, dst, dim0, dim1); + } + + if (status != Status::OK) { + throw std::runtime_error("Transpose failed, status=" + std::to_string(static_cast(status))); + } + + return dst; +} + } // namespace core } // namespace axono From df83b70670d9deccaf707f592e2877ac4ae8e0f3 Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:09:32 +0800 Subject: [PATCH 07/12] Update tensor.h --- include/axono/pybind/core/tensor.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/axono/pybind/core/tensor.h b/include/axono/pybind/core/tensor.h index a71dafa..9671cda 100644 --- a/include/axono/pybind/core/tensor.h +++ b/include/axono/pybind/core/tensor.h @@ -33,6 +33,7 @@ void init_tensor(py::module &m) { py::arg("dtype"), py::arg("shape"), py::arg("device")) .def_static("create", &axono::core::Tensor::Create) .def_static("create_like", &axono::core::Tensor::CreateLike) + .def("transpose", &axono::core::Tensor::Transpose) .def("reshape", &axono::core::Tensor::Reshape) .def("resize", &axono::core::Tensor::Resize) .def( From 7ec0b0478370dbe17d1ec976f8c87e494496ebbc Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:10:08 +0800 Subject: [PATCH 08/12] Create transpose.h --- include/axono/core/cpu/tensor/transpose.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/axono/core/cpu/tensor/transpose.h diff --git a/include/axono/core/cpu/tensor/transpose.h b/include/axono/core/cpu/tensor/transpose.h new file mode 100644 index 0000000..4ef0af1 --- /dev/null +++ b/include/axono/core/cpu/tensor/transpose.h @@ -0,0 +1,16 @@ +#pragma once + +#include "axono/core/tensor.h" +#include "axono/core/types.h" + +namespace axono { +namespace core { +namespace cpu { +namespace tensor { + +AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1); + +} // namespace tensor +} // namespace cpu +} // namespace core +} // namespace axono From b4806f54a99fb656ac3e1dc5c63b1457592ad602 Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:10:29 +0800 Subject: [PATCH 09/12] Add transpose.h header for tensor transpose functionality --- include/axono/core/cuda/tensor/transpose.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/axono/core/cuda/tensor/transpose.h diff --git a/include/axono/core/cuda/tensor/transpose.h b/include/axono/core/cuda/tensor/transpose.h new file mode 100644 index 0000000..1763e79 --- /dev/null +++ b/include/axono/core/cuda/tensor/transpose.h @@ -0,0 +1,16 @@ +#pragma once + +#include "axono/core/tensor.h" +#include "axono/core/types.h" + +namespace axono { +namespace core { +namespace cuda { +namespace tensor { + +AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1); + +} // namespace tensor +} // namespace cuda +} // namespace core +} // namespace axono From 4d775c1060ad234b38f3d37a5daee8df458c4860 Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:11:00 +0800 Subject: [PATCH 10/12] Update tensor.h --- include/axono/core/tensor.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/axono/core/tensor.h b/include/axono/core/tensor.h index 57e80ea..a214469 100644 --- a/include/axono/core/tensor.h +++ b/include/axono/core/tensor.h @@ -76,6 +76,9 @@ class Tensor { Status Reshape(const Shape &new_shape); Status Resize(const Shape &new_shape); + // Transpose + Tensor Transpose(int dim0 = -2, int dim1 = -1); + // 填充操作 Status FillZero(); Status Fill(void *value, size_t value_size); From 4efa3ff2278c0421bf64edb5e8193d271a7c7a53 Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:11:35 +0800 Subject: [PATCH 11/12] Update tensor.py --- python/axono/core/tensor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/axono/core/tensor.py b/python/axono/core/tensor.py index c43ef1c..469fc8a 100644 --- a/python/axono/core/tensor.py +++ b/python/axono/core/tensor.py @@ -125,6 +125,10 @@ def __add__(self, other) -> "Tensor": from .operators import add return add(self, other) + + def transpose(self, dim0: int = -2, dim1: int = -1) -> "Tensor": + new = self._tensor.transpose(dim0, dim1) + return Tensor.from_raw(new) def to_numpy(self) -> np.ndarray: """Convert tensor to numpy array - FIXED VERSION""" From 5b083afd905e623e07f2214e8ae2257a71932dbb Mon Sep 17 00:00:00 2001 From: ice Date: Sat, 31 Jan 2026 14:12:09 +0800 Subject: [PATCH 12/12] Update test_tensors.py --- python/tests/core/test_tensors.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/tests/core/test_tensors.py b/python/tests/core/test_tensors.py index fd3a343..38242ee 100644 --- a/python/tests/core/test_tensors.py +++ b/python/tests/core/test_tensors.py @@ -45,6 +45,13 @@ def test_tensor_data_types(self): for dtype in dtypes: tensor = Tensor(dtype=dtype, shape=[2, 2]) self.assertEqual(tensor.dtype, dtype) + + def test_tensor_transpose(self): + """测试 Tensor 转置""" + tensor = Tensor(dtype=DataType.FLOAT32, shape=[2, 3]) + tensor.fill(1.0) + transposed_tensor = tensor.transpose() + self.assertEqual(transposed_tensor.shape, [3, 2]) def test_tensor_fill(self): """测试 Tensor 填充"""