From c2537dcf768ce3cd4f26acec1dc0fb8a360ffe08 Mon Sep 17 00:00:00 2001
From: yuerqiqi <2500526025@qq.com>
Date: Tue, 20 Jan 2026 18:05:56 +0800
Subject: [PATCH] feat(ascend): implement element-wise ops

---
 mllm/backends/ascend/AscendBackend.cpp        |   2 +
 mllm/backends/ascend/CMakeLists.txt           |   4 +
 mllm/backends/ascend/ops/AscendElewiseOps.cpp | 176 ++++++++++++++++++
 mllm/backends/ascend/ops/AscendElewiseOps.hpp |  30 +++
 tests/ascend/AscendKernelTest.hpp             |  70 +++++++
 tests/ascend/KernelTest.cpp                   |  35 ++++
 6 files changed, 317 insertions(+)

diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp
index 5ec76413a..8f644e1a8 100644
--- a/mllm/backends/ascend/AscendBackend.cpp
+++ b/mllm/backends/ascend/AscendBackend.cpp
@@ -13,6 +13,8 @@ namespace mllm::ascend {
 
 AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {
   regOpFactory<AscendAddOpFactory>();
+  regOpFactory<AscendSubOpFactory>();
+  regOpFactory<AscendMulOpFactory>();
   regOpFactory();
   auto& devices = AscendDeviceMetaInfo::instance().devices;
   for (const auto& device : devices) {
diff --git a/mllm/backends/ascend/CMakeLists.txt b/mllm/backends/ascend/CMakeLists.txt
index bb0feac46..9d6dfd6e0 100644
--- a/mllm/backends/ascend/CMakeLists.txt
+++ b/mllm/backends/ascend/CMakeLists.txt
@@ -34,6 +34,10 @@ endif()
 
 if(DEFINED ENV{ATB_HOME_PATH})
   target_include_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/include)
   target_link_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/lib)
+elseif(EXISTS "${PROJECT_SOURCE_DIR}/../libs/atb")
+  message(STATUS "Found ATB in ${PROJECT_SOURCE_DIR}/../libs/atb")
+  target_include_directories(MllmAscendBackend PUBLIC "${PROJECT_SOURCE_DIR}/../libs/atb/include")
+  target_link_directories(MllmAscendBackend PUBLIC "${PROJECT_SOURCE_DIR}/../libs/atb/lib")
 else()
   message(WARNING "ATB_HOME_PATH not defined, ATB library will not be linked")
 endif()
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
index 762ef1dfe..68f1425f7 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.cpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
@@ -106,4 +106,180 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
   atb::DestroyOperation(op);
 }
 
+AscendSubOp::AscendSubOp(const aops::SubOpOptions& options) : aops::SubOp(options) {}
+
+void AscendSubOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+void AscendSubOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendSubOp currently requires x/y/z have same dtype");
+  }
+  if (x.numel() != y.numel() || x.numel() != z.numel()) {
+    NYI("AscendSubOp demo only supports no-broadcast case (numel equal)");
+  }
+
+  atb::infer::ElewiseParam subParam;
+  subParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_SUB;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(subParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_SUB) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  fillAtbTensorDesc(z, atb_z.desc);
+
+  atb_x.deviceData = reinterpret_cast<void*>(x.ptr());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<void*>(y.ptr());
+  atb_y.dataSize = y.bytes();
+  atb_z.deviceData = reinterpret_cast<void*>(z.ptr());
+  atb_z.dataSize = z.bytes();
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  inTensors.push_back(atb_y);
+  outTensors.push_back(atb_z);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<size_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendSubOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+AscendMulOp::AscendMulOp(const aops::MulOpOptions& options) : aops::MulOp(options) {}
+
+void AscendMulOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+void AscendMulOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendMulOp currently requires x/y/z have same dtype");
+  }
+  if (x.numel() != y.numel() || x.numel() != z.numel()) {
+    NYI("AscendMulOp demo only supports no-broadcast case (numel equal)");
+  }
+
+  atb::infer::ElewiseParam mulParam;
+  mulParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_MUL;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(mulParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_MUL) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  fillAtbTensorDesc(z, atb_z.desc);
+
+  atb_x.deviceData = reinterpret_cast<void*>(x.ptr());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<void*>(y.ptr());
+  atb_y.dataSize = y.bytes();
+  atb_z.deviceData = reinterpret_cast<void*>(z.ptr());
+  atb_z.dataSize = z.bytes();
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  inTensors.push_back(atb_y);
+  outTensors.push_back(atb_z);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<size_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendMulOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
 }  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
index 26117cbc2..9122e20cb 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.hpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
@@ -24,4 +24,34 @@ class AscendAddOpFactory final : public TypedOpFactory<OpTypes::kAdd, aops::AddOpOptions> {
   }
 };
 
+class AscendSubOp final : public aops::SubOp {
+ public:
+  explicit AscendSubOp(const aops::SubOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendSubOpFactory final : public TypedOpFactory<OpTypes::kSub, aops::SubOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SubOpOptions& options) override {
+    return std::make_shared<AscendSubOp>(options);
+  }
+};
+
+class AscendMulOp final : public aops::MulOp {
+ public:
+  explicit AscendMulOp(const aops::MulOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendMulOpFactory final : public TypedOpFactory<OpTypes::kMul, aops::MulOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::MulOpOptions& options) override {
+    return std::make_shared<AscendMulOp>(options);
+  }
+};
+
 }  // namespace mllm::ascend
\ No newline at end of file
diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp
index 138ee5ae8..a01028906 100644
--- a/tests/ascend/AscendKernelTest.hpp
+++ b/tests/ascend/AscendKernelTest.hpp
@@ -48,5 +48,75 @@ class AscendKernelTest : public KernelTest {
     }
     return true;
   }
+
+  // Test Sub operation with different shapes
+  bool SubFloat16Test(const std::vector<std::vector<int32_t>>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+      Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<fp16_t>();
+        auto* y_ptr = y_cpu.ptr<fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<fp16_t>();
+        auto num_elements = x_cpu.numel();
+        for (size_t i = 0; i < num_elements; ++i) {
+          r_ptr[i] = x_ptr[i] - y_ptr[i];
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Sub (z = x - y)
+      auto x_ascend = x_cpu.to(kAscend);
+      auto y_ascend = y_cpu.to(kAscend);
+      auto z_ascend = x_ascend - y_ascend;
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto z_cpu = z_ascend.to(kCPU);
+      auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Test Mul operation with different shapes
+  bool MulFloat16Test(const std::vector<std::vector<int32_t>>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+      Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<fp16_t>();
+        auto* y_ptr = y_cpu.ptr<fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<fp16_t>();
+        auto num_elements = x_cpu.numel();
+        for (size_t i = 0; i < num_elements; ++i) {
+          r_ptr[i] = x_ptr[i] * y_ptr[i];
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Mul (z = x * y)
+      auto x_ascend = x_cpu.to(kAscend);
+      auto y_ascend = y_cpu.to(kAscend);
+      auto z_ascend = x_ascend * y_ascend;
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto z_cpu = z_ascend.to(kCPU);
+      auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
 };
diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp
index b0489f545..1e392df05 100644
--- a/tests/ascend/KernelTest.cpp
+++ b/tests/ascend/KernelTest.cpp
@@ -25,6 +25,41 @@ TEST_F(AscendKernelTest, AddFloat16) {
             true);
 }
 
+//===----------------------------------------------------------------------===//
+// Element wise SUB.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, SubFloat16) {
+  EXPECT_EQ(SubFloat16Test({
+                {2, 3},
+                {1, 1},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {32, 32},
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Element wise MUL.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, MulFloat16) {
+  EXPECT_EQ(MulFloat16Test({
+                {2, 3},
+                {1, 1},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {32, 32},
+            }),
+            true);
+}
+
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);