2 changes: 2 additions & 0 deletions ggml/src/ggml-cann/acl_tensor.cpp
@@ -40,8 +40,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
         case GGML_TYPE_I32:
             return ACL_INT32;
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
             return ACL_INT4;
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
             return ACL_INT8;
         case GGML_TYPE_I64:
             return ACL_INT64;
49 changes: 40 additions & 9 deletions ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2008,12 +2008,12 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     // HC is regarded as batch.
     // weight need transpose.
     float weight_elem_size;
-    if (type == GGML_TYPE_Q4_0) {
+    if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    } else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0 || type == GGML_TYPE_Q8_1) {
         weight_elem_size = float(sizeof(uint8_t));
     } else {
-        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+        GGML_ABORT("Only support Q4_0, Q4_1, Q8_0 and Q8_1 MUL_MAT");
     }
     float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
     size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
@@ -2024,8 +2024,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
                          scale_elem_size};
     size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+    size_t scale_size = scale_stride * src0->ne[2] * src0->ne[3];
     char* scale_offset = (char*)src0->data + weight_size;
 
+    // min. Also need transpose.
+    size_t min_elem_size;
+    size_t min_nb[2];
+    size_t min_stride;
+    char* min_offset;
+
     // input
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
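Note: for Q4_1 the transformed weight buffer gains a third region. The packed 4-bit quants come first, then the per-group fp16 scales, then one fp16 "min" per group, which is why `min_offset` is computed further down as `data + weight_size + scale_size`. The regrouped buffer occupies exactly the same number of bytes as the source blocks (20 bytes per 32 elements). A minimal sketch of that layout arithmetic, with illustrative names rather than the ggml-cann API:

```cpp
#include <cstddef>
#include <cstdint>

constexpr int64_t QK4_1 = 32;  // elements per quantization group (as in ggml)

// Byte sizes of the three regions of a transformed Q4_1 weight of
// ne0 * ne1 elements: packed quants, then fp16 scales, then fp16 mins.
struct q4_1_layout {
    size_t weight_size;  // 4-bit quants, two elements per byte
    size_t scale_size;   // one fp16 scale d per group
    size_t min_size;     // one fp16 offset (8 + m/d) per group
};

q4_1_layout q4_1_regions(int64_t ne0, int64_t ne1) {
    const int64_t n_elems  = ne0 * ne1;
    const int64_t n_groups = n_elems / QK4_1;
    return {
        size_t(n_elems) / 2,                  // quants: 0.5 bytes per element
        size_t(n_groups) * sizeof(uint16_t),  // scales
        size_t(n_groups) * sizeof(uint16_t),  // mins
    };
}
// The min region starts at data + weight_size + scale_size, matching
// how min_offset is computed in the code below.
```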
@@ -2034,7 +2041,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
     void* input_buffer = src1->data;
 
-    // case in
+    // cast in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
         input_buffer =
@@ -2079,12 +2086,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                 input_elem_size, input_ne, input_nb, 2);
 
     // first split
-    int64_t weight_ne_offset = 0;
+    int64_t weight_ne_offset = 0;
     int64_t weight_ne[2] = {
         max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
         src0->ne[0]};
     int64_t scale_ne_offset = 0;
     int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+    int64_t min_ne_offset = 0;
+    int64_t min_ne[2] = {weight_ne[0], weight_ne[1] / QK4_1};
     int64_t output_ne_offset = 0;
     int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
 
@@ -2096,6 +2105,18 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         scale_offset + batch0 * scale_stride, ACL_FLOAT16,
         scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
         scale_ne_offset);
+    aclTensor* acl_min_tensor = nullptr;
+    if (src0->type == GGML_TYPE_Q4_1) {
+        min_elem_size = sizeof(uint16_t);
+        min_nb[0] = src0->ne[0] / QK4_1 * min_elem_size;
+        min_nb[1] = min_elem_size;
+        min_stride = src0->ne[1] * src0->ne[0] / QK4_1 * min_elem_size;
+        min_offset = (char*)src0->data + weight_size + scale_size;
+
+        acl_min_tensor = ggml_cann_create_tensor(
+            min_offset + batch0 * min_stride, ACL_FLOAT16,
+            min_elem_size, min_ne, min_nb, 2, ACL_FORMAT_ND, min_ne_offset);
+    }
     aclTensor* acl_output_tensor = ggml_cann_create_tensor(
         (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
         output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
@@ -2105,10 +2126,10 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         antiquantGroupSize = QK8_0;
     }
     GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
-                            acl_weight_tensor, acl_scale_tensor, nullptr,
+                            acl_weight_tensor, acl_scale_tensor, acl_min_tensor,
                             nullptr, nullptr, nullptr, antiquantGroupSize,
                             acl_output_tensor);
-    ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
+    ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_min_tensor, acl_output_tensor);
 
     // other splits
     for (int64_t split = 1; split < split_size; split++) {
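The only change to the matmul call itself is the fourth argument: for Q4_1 the per-group min tensor is passed as the antiquant offset where the Q4_0/Q8_0 path passes `nullptr`. Assuming `WeightQuantBatchMatmulV2` dequantizes each group as `scale * (w + offset)`, as the CANN documentation describes it, the combination of `w = q - 8` (from the host-side transform below) and `offset = 8 + m/d` reproduces ggml's Q4_1 dequantization `d * q + m` exactly. A self-contained check of that identity:

```cpp
#include <cassert>
#include <cmath>

int main() {
    const float d = 0.25f, m = -1.5f;      // example group scale and min
    for (int q = 0; q < 16; ++q) {         // every possible 4-bit quant
        float ggml_val = d * q + m;        // ggml Q4_1 dequantization
        float w        = float(q - 8);     // signed int4 stored on device
        float offset   = 8.0f + m / d;     // the per-group "min" tensor
        float cann_val = d * (w + offset); // assumed antiquant formula
        assert(std::fabs(ggml_val - cann_val) < 1e-5f);
    }
    return 0;
}
```

Since the offset is stored as fp16 by the transform, a small rounding difference against ggml's fp32 reference dequantization is possible.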
@@ -2131,15 +2152,23 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
             scale_offset + batch0 * scale_stride, ACL_FLOAT16,
             scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
             scale_ne_offset);
+        acl_min_tensor = nullptr;
+        if (src0->type == GGML_TYPE_Q4_1) {
+            min_ne_offset += min_elem_size * min_ne[0] * min_ne[1];
+            min_ne[0] = weight_ne[0];
+            acl_min_tensor = ggml_cann_create_tensor(
+                min_offset + batch0 * min_stride, ACL_FLOAT16,
+                min_elem_size, min_ne, min_nb, 2, ACL_FORMAT_ND, min_ne_offset);
+        }
         acl_output_tensor = ggml_cann_create_tensor(
             (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
             output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
             output_ne_offset);
         GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
-                                acl_weight_tensor, acl_scale_tensor, nullptr,
+                                acl_weight_tensor, acl_scale_tensor, acl_min_tensor,
                                 nullptr, nullptr, nullptr, antiquantGroupSize,
                                 acl_output_tensor);
-        ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
+        ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_min_tensor, acl_output_tensor);
     }
 
     ggml_cann_release_resources(ctx, acl_input_tensor);
@@ -2174,6 +2203,8 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             break;
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_1:
             ggml_cann_mul_mat_quant(ctx, dst, type);
             break;
         default:
113 changes: 112 additions & 1 deletion ggml/src/ggml-cann/ggml-cann.cpp
@@ -945,6 +945,69 @@ static void ggml_backend_cann_transform_back_q4_0(
     }
 }
 
+/**
+ * @brief Transform quantized Q4.1 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * This function transforms quantized Q4.1 tensor data into a format suitable
+ * for CANN processing. It extracts quantization values and scales from the
+ * source data and prepares them in a format expected by CANN operations.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q4.1 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q4_1(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK4_1;
+    size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+    size_t scale_bytes = groups * sizeof(uint16_t);
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+    uint16_t* min_offset = (uint16_t*)((char*)dst + quant_bytes + scale_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        const block_q4_1* group =
+            (const block_q4_1*)((const char*)src + i * sizeof(block_q4_1));
+        *scale_offset = group->d;
+        scale_offset++;
+
+        float d = ggml_fp16_to_fp32(group->d);
+        float m = ggml_fp16_to_fp32(group->m);
+
+        float min = 0.0f;
+        if (d != 0.0f)
+            min = 8.0f + (m / d);
+        *min_offset = ggml_fp32_to_fp16(min);
+        min_offset++;
+
+        // 0-15
+        for (int j = 0; j < QK4_1 / 2; j += 2) {
+            (*quant_offset) = (group->qs[j] & 0x0F);
+            (*quant_offset) |= ((group->qs[j + 1] << 4));
+            quant_offset++;
+        }
+
+        // 16-31
+        for (int j = 0; j < QK4_1 / 2; j += 2) {
+            (*quant_offset) = (group->qs[j] >> 4);
+            (*quant_offset) |= (group->qs[j + 1] & 0xF0);
+            quant_offset++;
+        }
+    }
+
+    // put (uint4b_t -8) into int4b_t
+    for (quant_offset = (uint8_t*)dst;
+         quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
+        (*quant_offset) ^= 0x88;
+    }
+}
+
 /**
  * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN
  * processing.
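Two details of the Q4.1 transform above work together. The two inner loops regroup each block: in ggml's Q4_1 layout, `qs[j]` holds element `j` in its low nibble and element `j + 16` in its high nibble, and the loops emit the first 16 and last 16 elements as consecutive packed bytes. The final pass then XORs every byte with `0x88`, flipping the high bit of each nibble, which reinterprets an unsigned nibble `q` in [0, 15] as the two's-complement int4 value `q - 8`; the compensating +8 is folded into the per-group min (`min = 8 + m/d`). A small check of the XOR trick:

```cpp
#include <cassert>
#include <cstdint>

// Sign-extend the low 4 bits of x into an int.
static int int4_value(uint8_t x) {
    return (x & 0x8) ? int(x & 0xF) - 16 : int(x & 0xF);
}

int main() {
    // Flipping the high bit maps unsigned q in [0, 15] to int4 value q - 8.
    for (int q = 0; q < 16; ++q) {
        assert(int4_value(uint8_t(q) ^ 0x8) == q - 8);
    }
    // One XOR with 0x88 handles both nibbles of a packed byte at once.
    uint8_t packed  = (13 << 4) | 2;   // nibbles q = 13 (high), q = 2 (low)
    uint8_t flipped = packed ^ 0x88;
    assert(int4_value(flipped & 0xF) == 2 - 8);
    assert(int4_value(flipped >> 4)  == 13 - 8);
    return 0;
}
```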
@@ -1012,6 +1075,44 @@ static void ggml_backend_cann_transform_back_q8_0(
     }
 }
 
+/**
+ * @brief Transform quantized Q8.1 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * This function transforms quantized Q8.1 tensor data into a format suitable
+ * for CANN processing. It extracts quantization values and scales from the
+ * source data and prepares them in a format expected by CANN operations.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q8.1 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q8_1(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK8_1;
+    size_t quant_bytes = n_elems * sizeof(uint8_t);
+    size_t scale_bytes = groups * sizeof(uint16_t);
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+    uint16_t* sum_offset = (uint16_t*)((char*)dst + quant_bytes + scale_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        const block_q8_1* group =
+            (const block_q8_1*)((const char*)src + i * sizeof(block_q8_1));
+        *scale_offset = group->d;
+        scale_offset++;
+        *sum_offset = group->s;
+        sum_offset++;
+        size_t group_quant_size = QK8_1 * sizeof(uint8_t);
+        memcpy(quant_offset, group->qs, group_quant_size);
+        quant_offset += group_quant_size;
+    }
+}
+
 /**
  * @brief Transform tensor data based on its type for CANN processing.
  *
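Q8.1 blocks carry a scale `d` plus a precomputed per-group term `s` alongside the 8-bit quants, and the transform regroups them the same way as Q4.1: all quants first, then all scales, then the per-group `s` values. Note that the matmul path above only wires an antiquant offset tensor for Q4_1, so the third region is carried along but left unused for Q8_1. A sketch of the destination layout the transform produces, with illustrative helper names:

```cpp
#include <cstdint>

constexpr int64_t QK8_1 = 32;  // elements per quantization group (as in ggml)

// Region pointers inside a transformed Q8_1 buffer of n_elems elements:
// n_elems int8 quants, then n_elems/QK8_1 fp16 scales, then as many sums.
struct q8_1_regions {
    uint8_t*  quants;
    uint16_t* scales;
    uint16_t* sums;
};

q8_1_regions q8_1_split(void* dst, int64_t n_elems) {
    const int64_t groups = n_elems / QK8_1;
    char* base = (char*)dst;
    return {
        (uint8_t*)base,
        (uint16_t*)(base + n_elems),                             // after quants
        (uint16_t*)(base + n_elems + groups * sizeof(uint16_t))  // after scales
    };
}
```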
@@ -1033,6 +1134,12 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
         case GGML_TYPE_Q8_0:
             ggml_backend_cann_transform_q8_0(tensor, src, dst);
             break;
+        case GGML_TYPE_Q4_1:
+            ggml_backend_cann_transform_q4_1(tensor, src, dst);
+            break;
+        case GGML_TYPE_Q8_1:
+            ggml_backend_cann_transform_q8_1(tensor, src, dst);
+            break;
         default:
             break;
     }
@@ -1077,6 +1184,8 @@ static bool need_transform(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_1:
             return true;
         default:
             return false;
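`need_transform()` gates the host-side repacking on every tensor upload and download. A hedged sketch of where it plugs in on upload; `copy_host_to_device` is a hypothetical stand-in for the `aclrtMemcpy` call the real `set_tensor` callback makes, and the ggml declarations from this diff are assumed to be in scope:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical stand-in for the device copy (the backend uses aclrtMemcpy).
void copy_host_to_device(void* dst, const void* src, size_t nbytes);

void upload_tensor_sketch(ggml_tensor* tensor, const void* host_data,
                          size_t nbytes, void* device_ptr) {
    if (need_transform(tensor->type)) {
        // Repack quants, scales, and (for Q4_1/Q8_1) mins or sums into
        // contiguous regions; the transformed buffer keeps the same size.
        std::vector<char> staging(nbytes);
        ggml_backend_cann_transform(tensor, host_data, staging.data());
        copy_host_to_device(device_ptr, staging.data(), nbytes);
    } else {
        copy_host_to_device(device_ptr, host_data, nbytes);  // plain copy
    }
}
```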
@@ -2293,7 +2402,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
  * otherwise false.
  */
 static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
-                                          const ggml_tensor* op) {
+                                          const ggml_tensor* op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -2335,6 +2444,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return true;
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q8_1:
+                case GGML_TYPE_Q4_1:
 #ifdef ASCEND_310P
                     // Q4 && Q8 per group is not suppor on 310p device
                     return false;