From 2bd60413e0f8484f1952870a4321db6dee2a8efb Mon Sep 17 00:00:00 2001
From: fzw <fzw1202@gmail.com>
Date: Tue, 2 Dec 2025 12:35:07 +0800
Subject: [PATCH] add support for q4_1 and q8_1

---
 ggml/src/ggml-cann/acl_tensor.cpp |   2 +
 ggml/src/ggml-cann/aclnn_ops.cpp  |  49 ++++++++++---
 ggml/src/ggml-cann/ggml-cann.cpp  | 113 +++++++++++++++++++++++++++++-
 3 files changed, 154 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
index 8ffac31dd66..ad748e74496 100755
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -40,8 +40,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
         case GGML_TYPE_I32:
             return ACL_INT32;
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
             return ACL_INT4;
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
             return ACL_INT8;
         case GGML_TYPE_I64:
             return ACL_INT64;
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index bc33b99d96e..87d2ed94ccb 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2008,12 +2008,12 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     // HC is regarded as batch.
     // weight need transpose.
     float weight_elem_size;
-    if (type == GGML_TYPE_Q4_0) {
+    if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    } else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0 || type == GGML_TYPE_Q8_1) {
         weight_elem_size = float(sizeof(uint8_t));
     } else {
-        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+        GGML_ABORT("Only support Q4_0, Q4_1, Q8_0 and Q8_1 MUL_MAT");
     }
     float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
     size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
@@ -2024,8 +2024,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
                          scale_elem_size};
     size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+    size_t scale_size = scale_stride * src0->ne[2] * src0->ne[3];
     char* scale_offset = (char*)src0->data + weight_size;
 
+    // min. Also need transpose.
+    size_t min_elem_size;
+    size_t min_nb[2];
+    size_t min_stride;
+    char* min_offset;   
+
     // input
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
@@ -2034,7 +2041,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
     void* input_buffer = src1->data;
 
-    // case in
+    // cast in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
         input_buffer =
@@ -2079,12 +2086,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                 input_elem_size, input_ne, input_nb, 2);
 
             // first split
-            int64_t weight_ne_offset = 0;
+            int64_t weight_ne_offset = 0; 
             int64_t weight_ne[2] = {
                 max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
                 src0->ne[0]};
             int64_t scale_ne_offset = 0;
             int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t min_ne_offset = 0;
+            int64_t min_ne[2] = {weight_ne[0], weight_ne[1] / QK4_1};
             int64_t output_ne_offset = 0;
             int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
 
@@ -2096,6 +2105,18 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                 scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                 scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
                 scale_ne_offset);
+            aclTensor* acl_min_tensor = nullptr;
+            if (src0->type == GGML_TYPE_Q4_1) {
+                min_elem_size = sizeof(uint16_t);
+                min_nb[0] = src0->ne[0] / QK4_1 * min_elem_size;
+                min_nb[1] = min_elem_size;
+                min_stride = src0->ne[1] * src0->ne[0] / QK4_1 * min_elem_size;
+                min_offset = (char*)src0->data + weight_size + scale_size;   
+
+                acl_min_tensor = ggml_cann_create_tensor(
+                    min_offset + batch0 * min_stride, ACL_FLOAT16,
+                    min_elem_size, min_ne, min_nb, 2, ACL_FORMAT_ND, min_ne_offset);
+            }
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
                 (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                 output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
@@ -2105,10 +2126,10 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                 antiquantGroupSize = QK8_0;
             }
             GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
-                           acl_weight_tensor, acl_scale_tensor, nullptr,
+                           acl_weight_tensor, acl_scale_tensor, acl_min_tensor,
                            nullptr, nullptr, nullptr, antiquantGroupSize,
                            acl_output_tensor);
-            ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
+            ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_min_tensor, acl_output_tensor);
 
             // other splits
             for (int64_t split = 1; split < split_size; split++) {
@@ -2131,15 +2152,23 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                     scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                     scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
                     scale_ne_offset);
+                acl_min_tensor = nullptr;
+                if (src0->type == GGML_TYPE_Q4_1) {
+                    min_ne_offset += min_elem_size * min_ne[0] * min_ne[1];
+                    min_ne[0] = weight_ne[0];
+                    acl_min_tensor = ggml_cann_create_tensor(
+                        min_offset + batch0 * min_stride, ACL_FLOAT16,
+                        min_elem_size, min_ne, min_nb, 2, ACL_FORMAT_ND, min_ne_offset);
+                }
                 acl_output_tensor = ggml_cann_create_tensor(
                     (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                     output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
                     output_ne_offset);
                 GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
-                                   acl_weight_tensor, acl_scale_tensor, nullptr,
+                                   acl_weight_tensor, acl_scale_tensor, acl_min_tensor,
                                    nullptr, nullptr, nullptr, antiquantGroupSize,
                                    acl_output_tensor);
-                ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
+                ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_min_tensor, acl_output_tensor);
             }
 
             ggml_cann_release_resources(ctx, acl_input_tensor);
@@ -2174,6 +2203,8 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             break;
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_1:
             ggml_cann_mul_mat_quant(ctx, dst, type);
             break;
         default:
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cb8af42ebf9..8999b9e10a2 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -945,6 +945,69 @@ static void ggml_backend_cann_transform_back_q4_0(
     }
 }
 
+/**
+ * @brief Repack Q4_1 blocks into the split layout consumed by CANN matmul.
+ *
+ * dst receives all packed 4-bit quants, then one fp16 scale per block, then
+ * one fp16 offset per block. Quants are shifted from unsigned [0, 15] to
+ * signed [-8, 7] and the offset is 8 + m / d, assuming the consumer decodes
+ * (q + offset) * scale. Blocks with d == 0 are stored as (0 + 1) * m.
+ *
+ * @param tensor Tensor descriptor; only its element count is read here.
+ * @param src Source data, read as consecutive block_q4_1 structs.
+ * @param dst Destination buffer that receives the repacked data.
+ */
+static void ggml_backend_cann_transform_q4_1(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK4_1;
+    size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+    size_t scale_bytes = groups * sizeof(uint16_t);
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+    uint16_t* min_offset = (uint16_t*)((char*)dst + quant_bytes + scale_bytes);
+
+    for (int64_t i = 0; i < groups; i++) {
+        const block_q4_1* group = (const block_q4_1*)src + i;
+        float d = ggml_fp16_to_fp32(group->d);
+        float m = ggml_fp16_to_fp32(group->m);
+        if (d == 0.0f) {
+            // Constant block: a zero scale would wipe out m, so encode the
+            // value as (0 + 1) * m. 0x88 becomes 0x00 in the final XOR pass.
+            *scale_offset++ = group->m;
+            *min_offset++ = ggml_fp32_to_fp16(1.0f);
+            for (int j = 0; j < QK4_1 / 2; j++) {
+                *quant_offset++ = 0x88;
+            }
+            continue;
+        }
+
+        *scale_offset++ = group->d;
+        // NOTE(review): 8 + m / d can exceed the fp16 range for tiny d.
+        *min_offset++ = ggml_fp32_to_fp16(8.0f + m / d);
+
+        // 0-15: low nibbles of qs, two per output byte
+        for (int j = 0; j < QK4_1 / 2; j += 2) {
+            *quant_offset++ =
+                (uint8_t)((group->qs[j] & 0x0F) | (group->qs[j + 1] << 4));
+        }
+
+        // 16-31: high nibbles of qs, two per output byte
+        for (int j = 0; j < QK4_1 / 2; j += 2) {
+            *quant_offset++ =
+                (uint8_t)((group->qs[j] >> 4) | (group->qs[j + 1] & 0xF0));
+        }
+    }
+
+    // Flip the top bit of every nibble: unsigned q -> signed (q - 8).
+    for (quant_offset = (uint8_t*)dst;
+         quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
+        (*quant_offset) ^= 0x88;
+    }
+}
+
 /**
  * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN
  * processing.
@@ -1012,6 +1075,44 @@ static void ggml_backend_cann_transform_back_q8_0(
     }
 }
 
+/**
+ * @brief Repack Q8_1 blocks into a planar layout for CANN processing.
+ *
+ * dst receives three consecutive regions: every block's quantized values
+ * (QK8_1 bytes per block), then every block's d field, then every block's
+ * s field. The d and s fields are assigned directly into 16-bit slots; the
+ * quantized values are copied byte for byte.
+ *
+ * @param tensor Tensor descriptor; only its element count is read here.
+ * @param src Source data, read as consecutive block_q8_1 structs.
+ * @param dst Destination buffer that receives the repacked data.
+ */
+static void ggml_backend_cann_transform_q8_1(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+    const int64_t total_elems = ggml_nelements(tensor);
+    const int64_t n_blocks = total_elems / QK8_1;
+    const size_t qs_region = total_elems * sizeof(uint8_t);
+    const size_t d_region = n_blocks * sizeof(uint16_t);
+    const size_t qs_per_block = QK8_1 * sizeof(uint8_t);
+
+    // Region start addresses inside dst: quants | d values | s values.
+    uint8_t* qs_out = (uint8_t*)dst;
+    uint16_t* d_out = (uint16_t*)((char*)dst + qs_region);
+    uint16_t* s_out = (uint16_t*)((char*)dst + qs_region + d_region);
+
+    const block_q8_1* blk = (const block_q8_1*)src;
+    for (int idx = 0; idx < n_blocks; idx++, blk++) {
+        // Quantized values are copied verbatim.
+        memcpy(qs_out, blk->qs, qs_per_block);
+        qs_out += qs_per_block;
+
+        // d and s go to their own regions, one slot per block.
+        *d_out++ = blk->d;
+        *s_out++ = blk->s;
+    }
+}
+
 /**
  * @brief Transform tensor data based on its type for CANN processing.
  *
@@ -1033,6 +1134,12 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
         case GGML_TYPE_Q8_0:
             ggml_backend_cann_transform_q8_0(tensor, src, dst);
             break;
+        case GGML_TYPE_Q4_1:
+            ggml_backend_cann_transform_q4_1(tensor, src, dst);
+            break;
+        case GGML_TYPE_Q8_1:
+            ggml_backend_cann_transform_q8_1(tensor, src, dst);
+            break;
         default:
             break;
     }
@@ -1077,6 +1184,8 @@ static bool need_transform(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_1:
             return true;
         default:
             return false;
@@ -2293,7 +2402,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
  *              otherwise false.
  */
 static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
-                                                    const ggml_tensor* op) {
+                                                    const ggml_tensor* op) {                                              
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -2335,6 +2444,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return true;
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q8_1:
+                case GGML_TYPE_Q4_1:
 #ifdef ASCEND_310P
                     // Q4 && Q8 per group is not suppor on 310p device
                     return false;