From 2bd60413e0f8484f1952870a4321db6dee2a8efb Mon Sep 17 00:00:00 2001
From: fzw
Date: Tue, 2 Dec 2025 12:35:07 +0800
Subject: [PATCH] add support for q4_1 and q8_1

---
 ggml/src/ggml-cann/acl_tensor.cpp |   2 +
 ggml/src/ggml-cann/aclnn_ops.cpp  |  49 ++++++++++---
 ggml/src/ggml-cann/ggml-cann.cpp  | 113 +++++++++++++++++++++++++++++-
 3 files changed, 154 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
index 8ffac31dd66..ad748e74496 100755
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -40,8 +40,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
         case GGML_TYPE_I32:
             return ACL_INT32;
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
             return ACL_INT4;
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
             return ACL_INT8;
         case GGML_TYPE_I64:
             return ACL_INT64;
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index bc33b99d96e..87d2ed94ccb 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2008,12 +2008,12 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     // HC is regarded as batch.
     // weight need transpose.
     float weight_elem_size;
-    if (type == GGML_TYPE_Q4_0) {
+    if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    } else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0 || type == GGML_TYPE_Q8_1) {
         weight_elem_size = float(sizeof(uint8_t));
     } else {
-        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+        GGML_ABORT("Only support Q4_0, Q4_1, Q8_0 and Q8_1 MUL_MAT");
     }
     float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
     size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
@@ -2024,8 +2024,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
                          scale_elem_size};
     size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+    size_t scale_size = scale_stride * src0->ne[2] * src0->ne[3];
     char* scale_offset = (char*)src0->data + weight_size;
 
+    // min (only set up for Q4_1; located after the scales). Also need transpose.
+    size_t min_elem_size;
+    size_t min_nb[2];
+    size_t min_stride;
+    char* min_offset;
+
     // input
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
@@ -2034,7 +2041,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
     void* input_buffer = src1->data;
 
-    // case in
+    // cast in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
         input_buffer =
@@ -2079,12 +2086,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                 input_elem_size, input_ne, input_nb, 2);
 
             // first split
-            int64_t weight_ne_offset = 0;
+            int64_t weight_ne_offset = 0;
             int64_t weight_ne[2] = {
                 max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
                 src0->ne[0]};
             int64_t scale_ne_offset = 0;
             int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t min_ne_offset = 0;
+            int64_t min_ne[2] = {weight_ne[0], weight_ne[1] / QK4_1};
             int64_t output_ne_offset = 0;
             int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
 
@@ -2096,6 +2105,18 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                 scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                 scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
                 scale_ne_offset);
+            aclTensor* acl_min_tensor = nullptr;
+            if (src0->type == GGML_TYPE_Q4_1) {
+                min_elem_size = sizeof(uint16_t);
+                min_nb[0] = src0->ne[0] / QK4_1 * min_elem_size;
+                min_nb[1] = min_elem_size;
+                min_stride = src0->ne[1] * src0->ne[0] / QK4_1 * min_elem_size;
+                min_offset = (char*)src0->data + weight_size + scale_size;
+
+                acl_min_tensor = ggml_cann_create_tensor(
+                    min_offset + batch0 * min_stride, ACL_FLOAT16,
+                    min_elem_size, min_ne, min_nb, 2, ACL_FORMAT_ND, min_ne_offset);
+            }
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
                 (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                 output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
@@ -2105,10 +2126,10 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                 antiquantGroupSize = QK8_0;
             }
             GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
-                           acl_weight_tensor, acl_scale_tensor, nullptr,
+                           acl_weight_tensor, acl_scale_tensor, acl_min_tensor,
                            nullptr, nullptr, nullptr, antiquantGroupSize,
                            acl_output_tensor);
-            ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
+            ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_min_tensor, acl_output_tensor);
 
             // other splits
             for (int64_t split = 1; split < split_size; split++) {
@@ -2131,15 +2152,23 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                     scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                     scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
                     scale_ne_offset);
+                acl_min_tensor = nullptr;
+                if (src0->type == GGML_TYPE_Q4_1) {
+                    min_ne_offset += min_elem_size * min_ne[0] * min_ne[1];
+                    min_ne[0] = weight_ne[0];
+                    acl_min_tensor = ggml_cann_create_tensor(
+                        min_offset + batch0 * min_stride, ACL_FLOAT16,
+                        min_elem_size, min_ne, min_nb, 2, ACL_FORMAT_ND, min_ne_offset);
+                }
                 acl_output_tensor = ggml_cann_create_tensor(
                     (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
                     output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
                     output_ne_offset);
                 GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
-                                   acl_weight_tensor, acl_scale_tensor, nullptr,
+                                   acl_weight_tensor, acl_scale_tensor, acl_min_tensor,
                                    nullptr, nullptr, nullptr, antiquantGroupSize,
                                    acl_output_tensor);
-                ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
+                ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_min_tensor, acl_output_tensor);
             }
 
             ggml_cann_release_resources(ctx, acl_input_tensor);
@@ -2174,6 +2203,8 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             break;
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_1:
             ggml_cann_mul_mat_quant(ctx, dst, type);
             break;
         default:
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cb8af42ebf9..8999b9e10a2 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -945,6 +945,69 @@ static void ggml_backend_cann_transform_back_q4_0(
     }
 }
 
+/**
+ * @brief Transform quantized Q4.1 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * Repacks every block_q4_1 of `src` into three consecutive regions of `dst`:
+ * the packed 4-bit quants, then one fp16 scale (`d`) per block, then one fp16
+ * offset per block, computed as 8 + m / d (0 when d == 0).
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q4.1 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q4_1(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK4_1;
+    size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+    size_t scale_bytes = groups * sizeof(uint16_t);
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+    uint16_t* min_offset = (uint16_t*)((char*)dst + quant_bytes + scale_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        const block_q4_1* group =
+            (const block_q4_1*)((const char*)src + i * sizeof(block_q4_1));
+        *scale_offset = group->d;
+        scale_offset++;
+
+        float d = ggml_fp16_to_fp32(group->d);
+        float m = ggml_fp16_to_fp32(group->m);
+
+        float min = 0.0f;
+        if (d != 0.0f)
+            min = 8.0f + (m / d);  // NOTE(review): presumably chosen so (q - 8 + min) * d == q * d + m; d == 0 keeps 0 - confirm
+        *min_offset = ggml_fp32_to_fp16(min);
+        min_offset++;
+
+        // 0-15
+        for (int j = 0; j < QK4_1 / 2; j += 2) {
+            (*quant_offset) = (group->qs[j] & 0x0F);
+            (*quant_offset) |= ((group->qs[j + 1] << 4));
+            quant_offset++;
+        }
+
+        // 16-31
+        for (int j = 0; j < QK4_1 / 2; j += 2) {
+            (*quant_offset) = (group->qs[j] >> 4);
+            (*quant_offset) |= (group->qs[j + 1] & 0xF0);
+            quant_offset++;
+        }
+    }
+
+    // put (uint4b_t -8) into int4b_t
+    for (quant_offset = (uint8_t*)dst;
+         quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
+        (*quant_offset) ^= 0x88;
+    }
+}
+
 /**
  * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN
  * processing.
@@ -1012,6 +1075,44 @@ static void ggml_backend_cann_transform_back_q8_0(
     }
 }
 
+/**
+ * @brief Transform quantized Q8.1 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * Repacks every block_q8_1 of `src` into three consecutive regions of `dst`:
+ * the 8-bit quants, then the per-block `d` values (stored as 16-bit words),
+ * then the per-block `s` values (also stored as 16-bit words).
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q8.1 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q8_1(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK8_1;
+    size_t quant_bytes = n_elems * sizeof(uint8_t);
+    size_t scale_bytes = groups * sizeof(uint16_t);
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+    uint16_t* sum_offset = (uint16_t*)((char*)dst + quant_bytes + scale_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        const block_q8_1* group =
+            (const block_q8_1*)((const char*)src + i * sizeof(block_q8_1));
+        *scale_offset = group->d;
+        scale_offset++;
+        *sum_offset = group->s;
+        sum_offset++;
+        size_t group_quant_size = QK8_1 * sizeof(uint8_t);
+        memcpy(quant_offset, group->qs, group_quant_size);
+        quant_offset += group_quant_size;
+    }
+}
+
 /**
  * @brief Transform tensor data based on its type for CANN processing.
  *
@@ -1033,6 +1134,12 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
         case GGML_TYPE_Q8_0:
             ggml_backend_cann_transform_q8_0(tensor, src, dst);
             break;
+        case GGML_TYPE_Q4_1:
+            ggml_backend_cann_transform_q4_1(tensor, src, dst);
+            break;
+        case GGML_TYPE_Q8_1:
+            ggml_backend_cann_transform_q8_1(tensor, src, dst);
+            break;
         default:
             break;
     }
@@ -1077,6 +1184,8 @@ static bool need_transform(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_1:
             return true;
         default:
             return false;
@@ -2293,7 +2402,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
  *         otherwise false.
  */
 static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
-                                                    const ggml_tensor* op) {
+                                          const ggml_tensor* op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -2335,6 +2444,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return true;
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q8_1:
+                case GGML_TYPE_Q4_1:
 #ifdef ASCEND_310P
                     // Q4 && Q8 per group is not suppor on 310p device
                     return false;