3636#include < cmath>
3737#include < cstdio>
3838#include < cstring>
39+ #include < memory>
3940#include < mutex>
4041#include < optional>
4142#include < queue>
43+ #include < unordered_map>
4244#include < unordered_set>
45+ #include < vector>
4346
4447#define GGML_COMMON_DECL_C
4548
@@ -770,6 +773,28 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(i
770773}
771774
772775// cann buffer
776+
/**
 * @brief Tracks multi-threaded write progress for a single tensor.
 *
 * When multiple threads call set_tensor on different chunks of the same tensor,
 * this tracker accumulates progress and defers post-processing (quantized format
 * transform or ND-to-NZ conversion) until all data has been written.
 */
struct TensorSetTracker {
    std::mutex           mtx;                ///< Protects bytes_written and host_buffer access.
    size_t               bytes_written = 0;  ///< Accumulated bytes written so far.
    size_t               total_bytes;        ///< Total bytes of the tensor (ggml_nbytes).
    std::vector<uint8_t> host_buffer;        ///< Staging buffer for quantized tensors only.

    /**
     * @param total        Full tensor size in bytes.
     * @param need_staging When true, allocate a zero-filled host staging buffer
     *                     of @p total bytes (quantized tensors stage raw data
     *                     host-side before the deferred transform).
     */
    TensorSetTracker(size_t total, bool need_staging) :
        total_bytes(total),
        // Sized construction zero-fills, matching the original resize() behavior.
        host_buffer(need_staging ? total : size_t{0}) {}
};
797+
773798/* *
774799 * @brief Context for managing a CANN buffer associated with a specific device.
775800 *
@@ -780,6 +805,9 @@ struct ggml_backend_cann_buffer_context {
780805 int32_t device; // /< The device ID associated with this buffer context.
781806 void * dev_ptr = nullptr ; // /< Pointer to the device memory allocated for the buffer.
782807
808+ std::mutex tracker_mutex;
809+ std::unordered_map<ggml_tensor *, std::shared_ptr<TensorSetTracker>> trackers;
810+
783811 /* *
784812 * @brief Constructor to initialize the CANN buffer context.
785813 *
@@ -792,6 +820,28 @@ struct ggml_backend_cann_buffer_context {
792820 * @brief Destructor to free the device memory allocated for the buffer.
793821 */
794822 ~ggml_backend_cann_buffer_context () { ACL_CHECK (aclrtFree (dev_ptr)); }
823+
824+ /* *
825+ * @brief Get or create a tracker for the given tensor.
826+ */
827+ std::shared_ptr<TensorSetTracker> get_or_create_tracker (ggml_tensor * tensor, bool need_staging) {
828+ std::lock_guard<std::mutex> lock (tracker_mutex);
829+ auto it = trackers.find (tensor);
830+ if (it == trackers.end ()) {
831+ auto tracker = std::make_shared<TensorSetTracker>(ggml_nbytes (tensor), need_staging);
832+ trackers[tensor] = tracker;
833+ return tracker;
834+ }
835+ return it->second ;
836+ }
837+
838+ /* *
839+ * @brief Remove the tracker for the given tensor.
840+ */
841+ void remove_tracker (ggml_tensor * tensor) {
842+ std::lock_guard<std::mutex> lock (tracker_mutex);
843+ trackers.erase (tensor);
844+ }
795845};
796846
797847// cann buffer type
@@ -1124,6 +1174,7 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer
11241174 * designed to be used with a global array, one per device.
11251175 */
11261176struct ggml_cann_nz_workspace {
1177+ std::mutex mtx; // Protects ptr/allocated from concurrent access
11271178 void * ptr; // Pointer to allocated device buffer
11281179 size_t allocated; // Size of currently allocated buffer in bytes
11291180
@@ -1190,13 +1241,15 @@ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
11901241 * @note The workspace buffer used in this function is managed globally and reused
11911242 * across calls. This reduces overhead from repeated memory allocation and deallocation.
11921243 */
1193- static void weight_format_to_nz (ggml_tensor * tensor, size_t offset, int device) {
1194- acl_tensor_ptr weightTransposed = ggml_cann_create_tensor (tensor, tensor->ne , tensor->nb , 2 , ACL_FORMAT_ND, offset );
1244+ static void weight_format_to_nz (ggml_tensor * tensor, int device) {
1245+ acl_tensor_ptr weightTransposed = ggml_cann_create_tensor (tensor, tensor->ne , tensor->nb , 2 , ACL_FORMAT_ND, 0 );
11951246 uint64_t workspaceSize = 0 ;
11961247 aclOpExecutor * executor;
11971248
11981249 // TransMatmulWeight
11991250 ACL_CHECK (aclnnTransMatmulWeightGetWorkspaceSize (weightTransposed.get (), &workspaceSize, &executor));
1251+
1252+ std::lock_guard<std::mutex> lock (g_nz_workspaces[device].mtx );
12001253 // Avoid frequent malloc/free of the workspace.
12011254 g_nz_workspaces[device].realloc (workspaceSize);
12021255
@@ -1210,7 +1263,13 @@ static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device)
12101263 * @brief Set tensor data in a CANN buffer.
12111264 *
12121265 * This function sets tensor data in a CANN buffer, handling transformations
1213- * if needed based on the tensor's type.
1266+ * if needed based on the tensor's type. It supports multi-threaded calls
1267+ * where different threads write different chunks of the same tensor.
1268+ *
1269+ * For quantized tensors (Q4_0/Q8_0), data is staged in a host buffer and
1270+ * the format transform is deferred until all chunks are written.
1271+ * For NZ weight tensors, chunks are uploaded directly but the ND-to-NZ
1272+ * conversion is deferred until all chunks are written.
12141273 *
12151274 * @param buffer The CANN buffer where the tensor data will be set.
12161275 * @param tensor Pointer to the tensor whose data will be set.
@@ -1226,25 +1285,57 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
12261285 ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context ;
12271286
12281287 ggml_cann_set_device (ctx->device );
1229- // TODO: refer to cann(#6017), it use thread's default stream.
1230- // For acl, synchronous functions use this default stream.
1231- // Why aclrtSynchronizeDevice?
12321288
12331289 // Only check env once.
12341290 static bool weight_to_nz = parse_bool (get_env_as_lowercase (" GGML_CANN_WEIGHT_NZ" ).value_or (" on" ));
1235- if (!need_transform (tensor->type )) {
1291+
1292+ const bool needs_transform = need_transform (tensor->type );
1293+ const bool needs_nz = !needs_transform && weight_to_nz && is_matmul_weight ((const ggml_tensor *) tensor);
1294+
1295+ if (!needs_transform && !needs_nz) {
1296+ // Plain tensor: direct memcpy is safe per-chunk, no tracker needed.
12361297 ACL_CHECK (aclrtMemcpy ((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
1237- if (weight_to_nz && is_matmul_weight ((const ggml_tensor *) tensor)) {
1298+ return ;
1299+ }
1300+
1301+ // Needs post-processing: use tracker to defer until all chunks are written.
1302+ auto tracker = ctx->get_or_create_tracker (tensor, needs_transform);
1303+
1304+ bool all_done = false ;
1305+ {
1306+ std::lock_guard<std::mutex> lock (tracker->mtx );
1307+
1308+ if (needs_transform) {
1309+ // Stage raw data in host buffer; transform requires the full tensor.
1310+ memcpy (tracker->host_buffer .data () + offset, data, size);
1311+ } else {
1312+ // NZ case: upload chunk to device immediately (different offsets, safe).
1313+ ACL_CHECK (aclrtMemcpy ((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
1314+ }
1315+
1316+ tracker->bytes_written += size;
1317+ all_done = (tracker->bytes_written >= tracker->total_bytes );
1318+ }
1319+
1320+ if (all_done) {
1321+ if (needs_transform) {
1322+ // All data staged, now transform entire tensor and upload at once.
1323+ size_t total = tracker->total_bytes ;
1324+ void * transform_buf = malloc (total);
1325+ ggml_backend_cann_transform (tensor, tracker->host_buffer .data (), transform_buf);
1326+ ACL_CHECK (aclrtMemcpy (tensor->data , total, transform_buf, total, ACL_MEMCPY_HOST_TO_DEVICE));
1327+ free (transform_buf);
1328+ }
1329+
1330+ if (needs_nz) {
1331+ // All data on device, now convert entire tensor to NZ format.
12381332 GGML_ASSERT (tensor->ne [2 ] == 1 );
12391333 GGML_ASSERT (tensor->ne [3 ] == 1 );
1240- weight_format_to_nz (tensor, offset, ctx->device );
1334+ weight_format_to_nz (tensor, ctx->device );
12411335 }
1242- } else {
1243- void * transform_buffer = malloc (size);
1244- ggml_backend_cann_transform (tensor, data, transform_buffer);
12451336
1246- ACL_CHECK ( aclrtMemcpy (( char *) tensor-> data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
1247- free (transform_buffer );
1337+ // Cleanup: release the tracker and its host_buffer memory.
1338+ ctx-> remove_tracker (tensor );
12481339 }
12491340}
12501341
0 commit comments