diff --git a/include/axono/core/cpu/tensor/transpose.h b/include/axono/core/cpu/tensor/transpose.h
new file mode 100644
index 0000000..4ef0af1
--- /dev/null
+++ b/include/axono/core/cpu/tensor/transpose.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "axono/core/tensor.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cpu {
+namespace tensor {
+
+AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1);
+
+} // namespace tensor
+} // namespace cpu
+} // namespace core
+} // namespace axono
diff --git a/include/axono/core/cuda/tensor/transpose.h b/include/axono/core/cuda/tensor/transpose.h
new file mode 100644
index 0000000..1763e79
--- /dev/null
+++ b/include/axono/core/cuda/tensor/transpose.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "axono/core/tensor.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cuda {
+namespace tensor {
+
+AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1);
+
+} // namespace tensor
+} // namespace cuda
+} // namespace core
+} // namespace axono
diff --git a/include/axono/core/tensor.h b/include/axono/core/tensor.h
index 57e80ea..a214469 100644
--- a/include/axono/core/tensor.h
+++ b/include/axono/core/tensor.h
@@ -76,6 +76,9 @@ class Tensor {
   Status Reshape(const Shape &new_shape);
   Status Resize(const Shape &new_shape);
 
+  // Transpose
+  Tensor Transpose(int dim0 = -2, int dim1 = -1);
+
   // 填充操作
   Status FillZero();
   Status Fill(void *value, size_t value_size);
diff --git a/include/axono/pybind/core/tensor.h b/include/axono/pybind/core/tensor.h
index a71dafa..9671cda 100644
--- a/include/axono/pybind/core/tensor.h
+++ b/include/axono/pybind/core/tensor.h
@@ -33,6 +33,7 @@ void init_tensor(py::module &m) {
            py::arg("dtype"), py::arg("shape"), py::arg("device"))
       .def_static("create", &axono::core::Tensor::Create)
       .def_static("create_like", &axono::core::Tensor::CreateLike)
+      .def("transpose", &axono::core::Tensor::Transpose, py::arg("dim0") = -2, py::arg("dim1") = -1)
       .def("reshape", &axono::core::Tensor::Reshape)
       .def("resize", &axono::core::Tensor::Resize)
       .def(
diff --git a/python/axono/core/tensor.py b/python/axono/core/tensor.py
index c43ef1c..469fc8a 100644
--- a/python/axono/core/tensor.py
+++ b/python/axono/core/tensor.py
@@ -125,6 +125,11 @@ def __add__(self, other) -> "Tensor":
         from .operators import add
 
         return add(self, other)
+
+    def transpose(self, dim0: int = -2, dim1: int = -1) -> "Tensor":
+        """Return a new tensor with dimensions ``dim0`` and ``dim1`` swapped."""
+        new = self._tensor.transpose(dim0, dim1)
+        return Tensor.from_raw(new)
 
     def to_numpy(self) -> np.ndarray:
         """Convert tensor to numpy array - FIXED VERSION"""
diff --git a/python/tests/core/test_tensors.py b/python/tests/core/test_tensors.py
index fd3a343..38242ee 100644
--- a/python/tests/core/test_tensors.py
+++ b/python/tests/core/test_tensors.py
@@ -45,6 +45,13 @@ def test_tensor_data_types(self):
         for dtype in dtypes:
             tensor = Tensor(dtype=dtype, shape=[2, 2])
             self.assertEqual(tensor.dtype, dtype)
+
+    def test_tensor_transpose(self):
+        """测试 Tensor 转置"""
+        tensor = Tensor(dtype=DataType.FLOAT32, shape=[2, 3])
+        tensor.fill(1.0)
+        transposed_tensor = tensor.transpose()
+        self.assertEqual(transposed_tensor.shape, [3, 2])
 
     def test_tensor_fill(self):
         """测试 Tensor 填充"""
diff --git a/src/core/cpu/tensor/transpose.cpp b/src/core/cpu/tensor/transpose.cpp
new file mode 100644
index 0000000..ad8ec76
--- /dev/null
+++ b/src/core/cpu/tensor/transpose.cpp
@@ -0,0 +1,86 @@
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+#include "axono/core/tensor.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cpu {
+namespace tensor {
+
+namespace {
+// Copies src into dst with coordinates along dim0/dim1 exchanged.
+// Expects dims already normalized to [0, rank); throws on any inconsistency.
+template <typename T>
+void TransposeImpl(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  const auto& src_shape = src.shape();
+  const auto& dst_shape = dst.shape();
+  const size_t total_elems = src.num_elements();
+
+  if (total_elems == 0) return;
+  if (dst.num_elements() != total_elems) throw std::logic_error("src/dst elem count mismatch");
+  if (dim0 < 0 || dim0 >= (int)src_shape.size() || dim1 < 0 || dim1 >= (int)src_shape.size())
+    throw std::out_of_range("dim index out of bound");
+
+  // Row-major (C-contiguous) strides for both layouts.
+  std::vector<size_t> src_stride(src_shape.size(), 1);
+  std::vector<size_t> dst_stride(dst_shape.size(), 1);
+  for (int i = (int)src_shape.size() - 2; i >= 0; --i)
+    src_stride[i] = src_stride[i + 1] * src_shape[i + 1];
+  for (int i = (int)dst_shape.size() - 2; i >= 0; --i)
+    dst_stride[i] = dst_stride[i + 1] * dst_shape[i + 1];
+
+  const T* src_data = src.data<T>();
+  T* dst_data = dst.data<T>();
+  if (!src_data || !dst_data) throw std::runtime_error("null data pointer");
+
+  for (size_t linear_idx = 0; linear_idx < total_elems; ++linear_idx) {
+    // Decompose the linear source index into per-dim coordinates.
+    std::vector<size_t> coords(src_shape.size(), 0);
+    size_t rem = linear_idx;
+    for (int i = 0; i < (int)src_shape.size(); ++i) {
+      coords[i] = rem / src_stride[i];
+      rem %= src_stride[i];
+      if (coords[i] >= src_shape[i])
+        throw std::out_of_range("src coord out of bound");
+    }
+
+    std::swap(coords[dim0], coords[dim1]);
+
+    // Re-linearize against the destination shape.
+    size_t dst_idx = 0;
+    for (int i = 0; i < (int)dst_shape.size(); ++i) {
+      if (coords[i] >= dst_shape[i])
+        throw std::out_of_range("dst coord out of bound");
+      dst_idx += coords[i] * dst_stride[i];
+    }
+
+    if (dst_idx >= dst.num_elements())
+      throw std::out_of_range("dst linear index out of bound");
+
+    dst_data[dst_idx] = src_data[linear_idx];
+  }
+}
+
+// Selects the TransposeImpl instantiation matching src's dtype.
+Status DispatchDtype(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  switch (src.dtype()) {
+    case DataType::INT8: TransposeImpl<int8_t>(src, dst, dim0, dim1); break;
+    case DataType::INT16: TransposeImpl<int16_t>(src, dst, dim0, dim1); break;
+    case DataType::INT32: TransposeImpl<int32_t>(src, dst, dim0, dim1); break;
+    case DataType::INT64: TransposeImpl<int64_t>(src, dst, dim0, dim1); break;
+    case DataType::FLOAT32: TransposeImpl<float>(src, dst, dim0, dim1); break;
+    case DataType::FLOAT64: TransposeImpl<double>(src, dst, dim0, dim1); break;
+    case DataType::BOOLEAN: TransposeImpl<bool>(src, dst, dim0, dim1); break;
+    default: return Status::UNSUPPORTED_TYPE;
+  }
+  return Status::OK;
+}
+} // anonymous namespace
+
+// CPU entry point. Validates device/dtype, then dispatches on dtype.
+Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  if (src.device() != "cpu" || dst.device() != "cpu") return Status::DEVICE_MISMATCH;
+  if (src.dtype() != dst.dtype()) return Status::UNSUPPORTED_TYPE;
+  return DispatchDtype(src, dst, dim0, dim1);  // dims pre-normalized by Tensor::Transpose
+}
+
+} // namespace tensor
+} // namespace cpu
+} // namespace core
+} // namespace axono
diff --git a/src/core/cuda/tensor/transpose.cu b/src/core/cuda/tensor/transpose.cu
new file mode 100644
index 0000000..d8f76f3
--- /dev/null
+++ b/src/core/cuda/tensor/transpose.cu
@@ -0,0 +1,106 @@
+#include <cstdint>
+
+#include "axono/core/tensor.h"
+#include "axono/core/macros.h"
+#include "axono/core/types.h"
+
+namespace axono {
+namespace core {
+namespace cuda {
+namespace tensor {
+
+namespace {
+// One thread per element; swaps the trailing two dims of a C-contiguous
+// layout (src_stride0/src_stride1 are the sizes of those dims).
+template <typename T>
+__global__ void Transpose2DKernel(const T* src, T* dst,
+                                  size_t dim0_size, size_t dim1_size,
+                                  size_t other_size, size_t src_stride0, size_t src_stride1) {
+  const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= other_size * dim0_size * dim1_size) return;
+
+  const size_t total_dim1_dim0 = dim1_size * dim0_size;
+  const size_t other_idx = idx / total_dim1_dim0;
+  const size_t rem = idx % total_dim1_dim0;
+  const size_t dim1_idx = rem / dim0_size;
+  const size_t dim0_idx = rem % dim0_size;
+
+  const size_t src_idx = other_idx * src_stride0 * src_stride1 +
+                         dim0_idx * src_stride1 +
+                         dim1_idx;
+  const size_t dst_idx = other_idx * src_stride0 * src_stride1 +
+                         dim1_idx * src_stride0 +
+                         dim0_idx;
+  dst[dst_idx] = src[src_idx];
+}
+
+template <typename T>
+Status LaunchTransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  dim0 = (dim0 < 0) ? static_cast<int>(src.shape().size()) + dim0 : dim0;
+  dim1 = (dim1 < 0) ? static_cast<int>(src.shape().size()) + dim1 : dim1;
+
+  // The index math in Transpose2DKernel only handles transposing the trailing
+  // two dims (the default dim0 = -2, dim1 = -1); reject anything else rather
+  // than silently producing wrong results. TODO(review): add a general path.
+  const int rank = static_cast<int>(src.shape().size());
+  if (dim0 > dim1) { const int tmp = dim0; dim0 = dim1; dim1 = tmp; }
+  if (dim0 != rank - 2 || dim1 != rank - 1) return Status::UNSUPPORTED_TYPE;
+
+  const size_t dim0_size = src.shape()[dim0];
+  const size_t dim1_size = src.shape()[dim1];
+  size_t other_size = 1;
+  for (int i = 0; i < static_cast<int>(src.shape().size()); ++i) {
+    if (i != dim0 && i != dim1) {
+      other_size *= src.shape()[i];
+    }
+  }
+
+  const size_t total_elements = other_size * dim0_size * dim1_size;
+  const int block_size = 256;
+  const int grid_size = (total_elements + block_size - 1) / block_size;
+
+  Transpose2DKernel<T><<<grid_size, block_size>>>(
+      src.data<T>(), dst.data<T>(),
+      dim0_size, dim1_size, other_size,
+      dim0_size, dim1_size
+  );
+
+  // 检查 CUDA 错误
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    return Status::DEVICE_ERROR;
+  }
+  return Status::OK;
+}
+} // anonymous namespace
+
+// CUDA entry point. Validates device/dtype, then dispatches on dtype.
+Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
+  if (!src.is_cuda() || !dst.is_cuda()) {
+    return Status::DEVICE_MISMATCH;
+  }
+  if (src.dtype() != dst.dtype()) {
+    return Status::UNSUPPORTED_TYPE;
+  }
+
+  const DataType dtype = src.dtype();
+  switch (dtype) {
+    case DataType::INT8:
+      return LaunchTransposeKernel<int8_t>(src, dst, dim0, dim1);
+    case DataType::INT16:
+      return LaunchTransposeKernel<int16_t>(src, dst, dim0, dim1);
+    case DataType::INT32:
+      return LaunchTransposeKernel<int32_t>(src, dst, dim0, dim1);
+    case DataType::INT64:
+      return LaunchTransposeKernel<int64_t>(src, dst, dim0, dim1);
+    case DataType::FLOAT32:
+      return LaunchTransposeKernel<float>(src, dst, dim0, dim1);
+    case DataType::FLOAT64:
+      return LaunchTransposeKernel<double>(src, dst, dim0, dim1);
+    case DataType::BOOLEAN:
+      return LaunchTransposeKernel<bool>(src, dst, dim0, dim1);
+    default:
+      return Status::UNSUPPORTED_TYPE;
+  }
+}
+
+} // namespace tensor
+} // namespace cuda
+} // namespace core
+} // namespace axono
diff --git a/src/core/tensor.cpp b/src/core/tensor.cpp
index 016d23e..836f6fc 100644
--- a/src/core/tensor.cpp
+++ b/src/core/tensor.cpp
@@ -1,19 +1,21 @@
-#include "axono/core/tensor.h"
-
 #include <cstring>
 #include <memory>
 #include <sstream>
 #include <stdexcept>  // std::runtime_error
+#include "axono/core/tensor.h" + #ifdef COMPILED_WITH_CUDA #include "axono/ops/cuda/randn.h" #include "axono/core/cuda/detail.h" #include "axono/core/cuda/tensor/kernel.h" +#include "axono/core/cuda/tensor/transpose.h" #endif #include "axono/ops/cpu/randn.h" #include "axono/core/cpu/tensor/kernel.h" #include "axono/core/types.h" +#include "axono/core/cpu/tensor/transpose.h" namespace { // 自定义删除器,用于 shared_ptr @@ -390,5 +392,44 @@ std::string Tensor::ToString() const { return oss.str(); } +Tensor Tensor::Transpose(int dim0, int dim1) { + const int n_dim = static_cast(this->ndim()); + dim0 = (dim0 < 0) ? (n_dim + dim0) : dim0; + dim1 = (dim1 < 0) ? (n_dim + dim1) : dim1; + + if (dim0 < 0 || dim0 >= n_dim || dim1 < 0 || dim1 >= n_dim) { + throw std::invalid_argument( + "Transpose: invalid dims, ndim=" + std::to_string(n_dim) + + ", dim0=" + std::to_string(dim0) + ", dim1=" + std::to_string(dim1) + ); + } + if (dim0 == dim1) { + return *this; + } + + Shape dst_shape = this->shape_; + std::swap(dst_shape[dim0], dst_shape[dim1]); + + Tensor dst(this->dtype_, dst_shape, this->device_); + dst.InitializeStorage(); + + Status status; + if (this->is_cuda()) { +#ifdef COMPILED_WITH_CUDA + status = cuda::tensor::TransposeKernel(*this, dst, dim0, dim1); +#else + throw std::runtime_error("Transpose: CUDA not compiled, but tensor is on cuda"); +#endif + } else { + status = cpu::tensor::TransposeKernel(*this, dst, dim0, dim1); + } + + if (status != Status::OK) { + throw std::runtime_error("Transpose failed, status=" + std::to_string(static_cast(status))); + } + + return dst; +} + } // namespace core } // namespace axono