Skip to content
16 changes: 16 additions & 0 deletions include/axono/core/cpu/tensor/transpose.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once

#include "axono/core/tensor.h"
#include "axono/core/types.h"

namespace axono {
namespace core {
namespace cpu {
namespace tensor {

// Copies `src` into `dst` with dimensions `dim0` and `dim1` swapped (CPU path).
// `dst` must be pre-allocated with the transposed shape and the same dtype as
// `src`. NOTE(review): the CPU implementation rejects negative dim indices, so
// callers (see Tensor::Transpose) are expected to normalize them first —
// confirm when adding new call sites.
AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1);

} // namespace tensor
} // namespace cpu
} // namespace core
} // namespace axono
16 changes: 16 additions & 0 deletions include/axono/core/cuda/tensor/transpose.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once

#include "axono/core/tensor.h"
#include "axono/core/types.h"

namespace axono {
namespace core {
namespace cuda {
namespace tensor {

// Copies `src` into `dst` with dimensions `dim0` and `dim1` swapped (CUDA path).
// `dst` must be pre-allocated with the transposed shape and the same dtype as
// `src`. Negative dim indices are normalized inside the implementation
// (Python-style, counted from the last dimension).
AXONO_EXPORT Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1);

} // namespace tensor
} // namespace cuda
} // namespace core
} // namespace axono
3 changes: 3 additions & 0 deletions include/axono/core/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ class Tensor {
Status Reshape(const Shape &new_shape);
Status Resize(const Shape &new_shape);

// Transpose
Tensor Transpose(int dim0 = -2, int dim1 = -1);

// 填充操作
Status FillZero();
Status Fill(void *value, size_t value_size);
Expand Down
1 change: 1 addition & 0 deletions include/axono/pybind/core/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ void init_tensor(py::module &m) {
py::arg("dtype"), py::arg("shape"), py::arg("device"))
.def_static("create", &axono::core::Tensor::Create)
.def_static("create_like", &axono::core::Tensor::CreateLike)
.def("transpose", &axono::core::Tensor::Transpose)
.def("reshape", &axono::core::Tensor::Reshape)
.def("resize", &axono::core::Tensor::Resize)
.def(
Expand Down
4 changes: 4 additions & 0 deletions python/axono/core/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ def __add__(self, other) -> "Tensor":
from .operators import add

return add(self, other)

def transpose(self, dim0: int = -2, dim1: int = -1) -> "Tensor":
    """Return a new Tensor with dimensions ``dim0`` and ``dim1`` swapped.

    Defaults swap the last two dimensions, mirroring the C++
    ``Tensor::Transpose`` defaults.
    """
    raw = self._tensor.transpose(dim0, dim1)
    return Tensor.from_raw(raw)

def to_numpy(self) -> np.ndarray:
"""Convert tensor to numpy array - FIXED VERSION"""
Expand Down
7 changes: 7 additions & 0 deletions python/tests/core/test_tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ def test_tensor_data_types(self):
for dtype in dtypes:
tensor = Tensor(dtype=dtype, shape=[2, 2])
self.assertEqual(tensor.dtype, dtype)

def test_tensor_transpose(self):
    """Transposing a (2, 3) tensor yields shape (3, 2)."""
    src = Tensor(dtype=DataType.FLOAT32, shape=[2, 3])
    src.fill(1.0)
    result = src.transpose()
    self.assertEqual(result.shape, [3, 2])

def test_tensor_fill(self):
"""测试 Tensor 填充"""
Expand Down
86 changes: 86 additions & 0 deletions src/core/cpu/tensor/transpose.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#include <stdexcept>
#include <algorithm>
#include <cstring>

#include "axono/core/tensor.h"
#include "axono/core/types.h"

namespace axono {
namespace core {
namespace cpu {
namespace tensor {

namespace {
// Element-by-element copy of `src` into `dst` with axes `dim0`/`dim1`
// swapped. Both tensors are addressed as row-major contiguous buffers and
// `dst` is expected to already carry the transposed shape. Throws on any
// shape/dim/index inconsistency rather than writing out of bounds.
template <typename T>
void TransposeImpl(const Tensor& src, Tensor& dst, int dim0, int dim1) {
  const auto& in_shape = src.shape();
  const auto& out_shape = dst.shape();
  const size_t count = src.num_elements();
  const int rank = static_cast<int>(in_shape.size());

  if (count == 0) return;  // empty tensor: nothing to move
  if (dst.num_elements() != count) throw std::logic_error("src/dst elem count mismatch");
  if (dim0 < 0 || dim0 >= rank || dim1 < 0 || dim1 >= rank)
    throw std::out_of_range("dim index out of bound");

  // Row-major strides: stride of the last axis is 1, each earlier axis is
  // the product of the sizes to its right.
  std::vector<size_t> in_stride(in_shape.size(), 1);
  std::vector<size_t> out_stride(out_shape.size(), 1);
  for (int d = rank - 2; d >= 0; --d)
    in_stride[d] = in_stride[d + 1] * in_shape[d + 1];
  for (int d = static_cast<int>(out_shape.size()) - 2; d >= 0; --d)
    out_stride[d] = out_stride[d + 1] * out_shape[d + 1];

  const T* in_data = src.data<T>();
  T* out_data = dst.data<T>();
  if (!in_data || !out_data) throw std::runtime_error("null data pointer");

  for (size_t flat = 0; flat < count; ++flat) {
    // Decompose the source linear index into per-axis coordinates.
    std::vector<size_t> coord(in_shape.size(), 0);
    size_t remainder = flat;
    for (int d = 0; d < rank; ++d) {
      coord[d] = remainder / in_stride[d];
      remainder %= in_stride[d];
      if (coord[d] >= in_shape[d])
        throw std::out_of_range("src coord out of bound");
    }

    // The transposed element lives at the same coordinates with the two
    // requested axes exchanged.
    std::swap(coord[dim0], coord[dim1]);

    size_t out_flat = 0;
    for (int d = 0; d < static_cast<int>(out_shape.size()); ++d) {
      if (coord[d] >= out_shape[d])
        throw std::out_of_range("dst coord out of bound");
      out_flat += coord[d] * out_stride[d];
    }

    if (out_flat >= dst.num_elements())
      throw std::out_of_range("dst linear index out of bound");

    out_data[out_flat] = in_data[flat];
  }
}

// Instantiates TransposeImpl for the dtype of `src`. Returns
// UNSUPPORTED_TYPE for dtypes without a CPU transpose implementation.
Status DispatchDtype(const Tensor& src, Tensor& dst, int dim0, int dim1) {
  switch (src.dtype()) {
    case DataType::INT8:
      TransposeImpl<int8_t>(src, dst, dim0, dim1);
      return Status::OK;
    case DataType::INT16:
      TransposeImpl<int16_t>(src, dst, dim0, dim1);
      return Status::OK;
    case DataType::INT32:
      TransposeImpl<int32_t>(src, dst, dim0, dim1);
      return Status::OK;
    case DataType::INT64:
      TransposeImpl<int64_t>(src, dst, dim0, dim1);
      return Status::OK;
    case DataType::FLOAT32:
      TransposeImpl<float>(src, dst, dim0, dim1);
      return Status::OK;
    case DataType::FLOAT64:
      TransposeImpl<double>(src, dst, dim0, dim1);
      return Status::OK;
    case DataType::BOOLEAN:
      TransposeImpl<bool>(src, dst, dim0, dim1);
      return Status::OK;
    default:
      return Status::UNSUPPORTED_TYPE;
  }
}
} // anonymous namespace

// CPU transpose entry point: validates device/dtype compatibility, then
// dispatches to the dtype-specific implementation.
//
// Returns:
//   DEVICE_MISMATCH   if either tensor is not on the CPU.
//   UNSUPPORTED_TYPE  if src/dst dtypes differ or the dtype has no kernel.
//   OK                on success (including the empty-tensor no-op).
Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
  if (src.device() != "cpu" || dst.device() != "cpu") return Status::DEVICE_MISMATCH;
  if (src.dtype() != dst.dtype()) return Status::UNSUPPORTED_TYPE;
  if (src.num_elements() == 0) return Status::OK;
  // BUG FIX: the original body validated the inputs but never invoked the
  // kernel and fell off the end of a non-void function (undefined
  // behavior, and no transpose ever happened). Dispatch and propagate.
  return DispatchDtype(src, dst, dim0, dim1);
}

} // namespace tensor
} // namespace cpu
} // namespace core
} // namespace axono
100 changes: 100 additions & 0 deletions src/core/cuda/tensor/transpose.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#include <cuda_runtime.h>

#include "axono/core/tensor.h"
#include "axono/core/macros.h"
#include "axono/core/types.h"

namespace axono {
namespace core {
namespace cuda {
namespace tensor {

namespace {
// One-thread-per-element transpose kernel. The flat thread index is
// decomposed as (other, dim1, dim0) and each thread copies a single value
// from its source position to the swapped destination position.
template <typename T>
__global__ void Transpose2DKernel(const T* src, T* dst,
                                  size_t dim0_size, size_t dim1_size,
                                  size_t other_size, size_t src_stride0, size_t src_stride1) {
  const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the tail block: threads past the element count do nothing.
  if (idx >= other_size * dim0_size * dim1_size) return;

  const size_t total_dim1_dim0 = dim1_size * dim0_size;
  const size_t other_idx = idx / total_dim1_dim0;
  const size_t rem = idx % total_dim1_dim0;
  const size_t dim1_idx = rem / dim0_size;
  const size_t dim0_idx = rem % dim0_size;

  // NOTE(review): this address math treats dim0/dim1 as the two innermost
  // axes of a contiguous tensor, with dim1 the fastest-varying one (the
  // launcher passes src_stride0 = dim0_size, src_stride1 = dim1_size, not
  // real strides). For any other dim pair the computed offsets would be
  // wrong — confirm callers only rely on the default (-2, -1) case.
  const size_t src_idx = other_idx * src_stride0 * src_stride1
                       + dim0_idx * src_stride1
                       + dim1_idx;
  const size_t dst_idx = other_idx * src_stride0 * src_stride1
                       + dim1_idx * src_stride0
                       + dim0_idx;
  dst[dst_idx] = src[src_idx];
}

// Computes the launch geometry for Transpose2DKernel and launches it for
// element type T. Negative dim indices are normalized Python-style.
//
// Returns OK on a successful (or empty no-op) launch, DEVICE_ERROR if CUDA
// reports a launch failure.
template <typename T>
Status LaunchTransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
  // Normalize negative dims against the rank (e.g. -1 -> last axis).
  dim0 = (dim0 < 0) ? static_cast<int>(src.shape().size()) + dim0 : dim0;
  dim1 = (dim1 < 0) ? static_cast<int>(src.shape().size()) + dim1 : dim1;

  const size_t dim0_size = src.shape()[dim0];
  const size_t dim1_size = src.shape()[dim1];
  size_t other_size = 1;
  for (int i = 0; i < static_cast<int>(src.shape().size()); ++i) {
    if (i != dim0 && i != dim1) {
      other_size *= src.shape()[i];
    }
  }

  const size_t total_elements = other_size * dim0_size * dim1_size;
  // BUG FIX: an empty tensor made grid_size == 0, and a <<<0, n>>> launch
  // fails with cudaErrorInvalidConfiguration, turning a no-op into
  // DEVICE_ERROR. Mirror the CPU path and succeed immediately.
  if (total_elements == 0) {
    return Status::OK;
  }

  const int block_size = 256;
  const int grid_size = (total_elements + block_size - 1) / block_size;

  // NOTE(review): dim0_size/dim1_size are passed where the kernel expects
  // "strides", so this launch is only correct when dim0/dim1 are the two
  // innermost axes of a contiguous tensor — confirm only the default
  // (-2, -1) pair reaches this path.
  Transpose2DKernel<T><<<grid_size, block_size>>>(
      src.data<T>(), dst.data<T>(),
      dim0_size, dim1_size, other_size,
      dim0_size, dim1_size
  );

  // Surface launch-time errors (bad configuration, invalid device, ...).
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    return Status::DEVICE_ERROR;
  }
  return Status::OK;
}
} // anonymous namespace

// CUDA transpose entry point: checks device/dtype compatibility, then
// forwards to the dtype-specialized launcher.
Status TransposeKernel(const Tensor& src, Tensor& dst, int dim0, int dim1) {
  if (!src.is_cuda() || !dst.is_cuda()) return Status::DEVICE_MISMATCH;
  if (src.dtype() != dst.dtype()) return Status::UNSUPPORTED_TYPE;

  switch (src.dtype()) {
    case DataType::INT8:    return LaunchTransposeKernel<int8_t>(src, dst, dim0, dim1);
    case DataType::INT16:   return LaunchTransposeKernel<int16_t>(src, dst, dim0, dim1);
    case DataType::INT32:   return LaunchTransposeKernel<int32_t>(src, dst, dim0, dim1);
    case DataType::INT64:   return LaunchTransposeKernel<int64_t>(src, dst, dim0, dim1);
    case DataType::FLOAT32: return LaunchTransposeKernel<float>(src, dst, dim0, dim1);
    case DataType::FLOAT64: return LaunchTransposeKernel<double>(src, dst, dim0, dim1);
    case DataType::BOOLEAN: return LaunchTransposeKernel<bool>(src, dst, dim0, dim1);
    default:                return Status::UNSUPPORTED_TYPE;
  }
}

} // namespace tensor
} // namespace cuda
} // namespace core
} // namespace axono
45 changes: 43 additions & 2 deletions src/core/tensor.cpp
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
#include "axono/core/tensor.h"

#include <cstdlib>
#include <cstring>
#include <sstream>
#include <stdexcept> // std::runtime_error

#include "axono/core/tensor.h"

#ifdef COMPILED_WITH_CUDA
#include "axono/ops/cuda/randn.h"
#include "axono/core/cuda/detail.h"
#include "axono/core/cuda/tensor/kernel.h"
#include "axono/core/cuda/tensor/transpose.h"
#endif

#include "axono/ops/cpu/randn.h"
#include "axono/core/cpu/tensor/kernel.h"
#include "axono/core/types.h"
#include "axono/core/cpu/tensor/transpose.h"

namespace {
// 自定义删除器,用于 shared_ptr
Expand Down Expand Up @@ -390,5 +392,44 @@ std::string Tensor::ToString() const {
return oss.str();
}

// Returns a new tensor whose dimensions `dim0` and `dim1` are swapped.
// Negative indices count from the end (defaults swap the last two dims).
// Throws std::invalid_argument for out-of-range dims and std::runtime_error
// if the backend kernel reports a failure.
Tensor Tensor::Transpose(int dim0, int dim1) {
  const int rank = static_cast<int>(this->ndim());
  if (dim0 < 0) dim0 += rank;
  if (dim1 < 0) dim1 += rank;

  const bool dims_valid = (dim0 >= 0 && dim0 < rank && dim1 >= 0 && dim1 < rank);
  if (!dims_valid) {
    throw std::invalid_argument(
        "Transpose: invalid dims, ndim=" + std::to_string(rank) +
        ", dim0=" + std::to_string(dim0) + ", dim1=" + std::to_string(dim1)
    );
  }

  // Swapping an axis with itself is the identity.
  if (dim0 == dim1) {
    return *this;
  }

  // Destination carries the same dtype/device with the two axes exchanged.
  Shape out_shape = this->shape_;
  std::swap(out_shape[dim0], out_shape[dim1]);

  Tensor result(this->dtype_, out_shape, this->device_);
  result.InitializeStorage();

  Status status;
  if (this->is_cuda()) {
#ifdef COMPILED_WITH_CUDA
    status = cuda::tensor::TransposeKernel(*this, result, dim0, dim1);
#else
    throw std::runtime_error("Transpose: CUDA not compiled, but tensor is on cuda");
#endif
  } else {
    status = cpu::tensor::TransposeKernel(*this, result, dim0, dim1);
  }

  if (status != Status::OK) {
    throw std::runtime_error("Transpose failed, status=" + std::to_string(static_cast<int>(status)));
  }

  return result;
}

} // namespace core
} // namespace axono