From c2537dcf768ce3cd4f26acec1dc0fb8a360ffe08 Mon Sep 17 00:00:00 2001
From: yuerqiqi <2500526025@qq.com>
Date: Tue, 20 Jan 2026 18:05:56 +0800
Subject: [PATCH] feat(ascend): implement element-wise ops

---
 mllm/backends/ascend/AscendBackend.cpp        |   2 +
 mllm/backends/ascend/CMakeLists.txt           |   4 +
 mllm/backends/ascend/ops/AscendElewiseOps.cpp | 176 ++++++++++++++++++
 mllm/backends/ascend/ops/AscendElewiseOps.hpp |  30 +++
 tests/ascend/AscendKernelTest.hpp             |  70 +++++++
 tests/ascend/KernelTest.cpp                   |  35 ++++
 6 files changed, 317 insertions(+)

diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp
index 5ec76413a..8f644e1a8 100644
--- a/mllm/backends/ascend/AscendBackend.cpp
+++ b/mllm/backends/ascend/AscendBackend.cpp
@@ -13,6 +13,8 @@ namespace mllm::ascend {
 
 AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {
   regOpFactory<AscendAddOpFactory>();
+  regOpFactory<AscendSubOpFactory>();
+  regOpFactory<AscendMulOpFactory>();
   regOpFactory();
   auto& devices = AscendDeviceMetaInfo::instance().devices;
   for (const auto& device : devices) {
diff --git a/mllm/backends/ascend/CMakeLists.txt b/mllm/backends/ascend/CMakeLists.txt
index bb0feac46..9d6dfd6e0 100644
--- a/mllm/backends/ascend/CMakeLists.txt
+++ b/mllm/backends/ascend/CMakeLists.txt
@@ -34,6 +34,10 @@ endif()
 
 if(DEFINED ENV{ATB_HOME_PATH})
   target_include_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/include)
   target_link_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/lib)
+elseif(EXISTS "${PROJECT_SOURCE_DIR}/../libs/atb")
+  message(STATUS "Found ATB in ${PROJECT_SOURCE_DIR}/../libs/atb")
+  target_include_directories(MllmAscendBackend PUBLIC "${PROJECT_SOURCE_DIR}/../libs/atb/include")
+  target_link_directories(MllmAscendBackend PUBLIC "${PROJECT_SOURCE_DIR}/../libs/atb/lib")
 else()
   message(WARNING "ATB_HOME_PATH not defined, ATB library will not be linked")
 endif()
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
index 762ef1dfe..68f1425f7 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.cpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
@@ -106,4 +106,180 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
   atb::DestroyOperation(op);
 }
 
+AscendSubOp::AscendSubOp(const aops::SubOpOptions& options) : aops::SubOp(options) {}
+
+void AscendSubOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+void AscendSubOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendSubOp currently requires x/y/z have same dtype");
+  }
+  if (x.numel() != y.numel() || x.numel() != z.numel()) {
+    NYI("AscendSubOp demo only supports no-broadcast case (numel equal)");
+  }
+
+  atb::infer::ElewiseParam subParam;
+  subParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_SUB;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(subParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_SUB) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  fillAtbTensorDesc(z, atb_z.desc);
+
+  atb_x.deviceData = reinterpret_cast<void*>(x.ptr());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<void*>(y.ptr());
+  atb_y.dataSize = y.bytes();
+  atb_z.deviceData = reinterpret_cast<void*>(z.ptr());
+  atb_z.dataSize = z.bytes();
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  inTensors.push_back(atb_y);
+  outTensors.push_back(atb_z);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<size_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendSubOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+AscendMulOp::AscendMulOp(const aops::MulOpOptions& options) : aops::MulOp(options) {}
+
+void AscendMulOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+void AscendMulOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendMulOp currently requires x/y/z have same dtype");
+  }
+  if (x.numel() != y.numel() || x.numel() != z.numel()) {
+    NYI("AscendMulOp demo only supports no-broadcast case (numel equal)");
+  }
+
+  atb::infer::ElewiseParam mulParam;
+  mulParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_MUL;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(mulParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_MUL) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  fillAtbTensorDesc(z, atb_z.desc);
+
+  atb_x.deviceData = reinterpret_cast<void*>(x.ptr());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<void*>(y.ptr());
+  atb_y.dataSize = y.bytes();
+  atb_z.deviceData = reinterpret_cast<void*>(z.ptr());
+  atb_z.dataSize = z.bytes();
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  inTensors.push_back(atb_y);
+  outTensors.push_back(atb_z);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<size_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendMulOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
 }  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
index 26117cbc2..9122e20cb 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.hpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
@@ -24,4 +24,34 @@ class AscendAddOpFactory final : public TypedOpFactory<OpTypes::kAdd, aops::AddOpOptions> {
   }
 };
 
+class AscendSubOp final : public aops::SubOp {
+ public:
+  explicit AscendSubOp(const aops::SubOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendSubOpFactory final : public TypedOpFactory<OpTypes::kSub, aops::SubOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SubOpOptions& options) override {
+    return std::make_shared<AscendSubOp>(options);
+  }
+};
+
+class AscendMulOp final : public aops::MulOp {
+ public:
+  explicit AscendMulOp(const aops::MulOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendMulOpFactory final : public TypedOpFactory<OpTypes::kMul, aops::MulOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::MulOpOptions& options) override {
+    return std::make_shared<AscendMulOp>(options);
+  }
+};
+
 }  // namespace mllm::ascend
\ No newline at end of file
diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp
index 138ee5ae8..a01028906 100644
--- a/tests/ascend/AscendKernelTest.hpp
+++ b/tests/ascend/AscendKernelTest.hpp
@@ -48,5 +48,75 @@ class AscendKernelTest : public KernelTest {
     }
     return true;
   }
+
+  // Test Sub operation with different shapes
+  bool SubFloat16Test(const std::vector<std::vector<int32_t>>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+      Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<fp16_t>();
+        auto* y_ptr = y_cpu.ptr<fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<fp16_t>();
+        auto num_elements = x_cpu.numel();
+        for (size_t i = 0; i < num_elements; ++i) {
+          r_ptr[i] = x_ptr[i] - y_ptr[i];
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Sub (z = x - y)
+      auto x_ascend = x_cpu.to(kAscend);
+      auto y_ascend = y_cpu.to(kAscend);
+      auto z_ascend = x_ascend - y_ascend;
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto z_cpu = z_ascend.to(kCPU);
+      auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Test Mul operation with different shapes
+  bool MulFloat16Test(const std::vector<std::vector<int32_t>>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+      Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<fp16_t>();
+        auto* y_ptr = y_cpu.ptr<fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<fp16_t>();
+        auto num_elements = x_cpu.numel();
+        for (size_t i = 0; i < num_elements; ++i) {
+          r_ptr[i] = x_ptr[i] * y_ptr[i];
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Mul (z = x * y)
+      auto x_ascend = x_cpu.to(kAscend);
+      auto y_ascend = y_cpu.to(kAscend);
+      auto z_ascend = x_ascend * y_ascend;
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto z_cpu = z_ascend.to(kCPU);
+      auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
 };
diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp
index b0489f545..1e392df05 100644
--- a/tests/ascend/KernelTest.cpp
+++ b/tests/ascend/KernelTest.cpp
@@ -25,6 +25,41 @@ TEST_F(AscendKernelTest, AddFloat16) {
             true);
 }
 
+//===----------------------------------------------------------------------===//
+// Element wise SUB.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, SubFloat16) {
+  EXPECT_EQ(SubFloat16Test({
+                {2, 3},
+                {1, 1},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {32, 32},
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Element wise MUL.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, MulFloat16) {
+  EXPECT_EQ(MulFloat16Test({
+                {2, 3},
+                {1, 1},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {32, 32},
+            }),
+            true);
+}
+
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);