CANN: add BF16 support for core operators (#20152)
* CANN: add BF16 support for core operators Add BF16 (bfloat16) type support to the CANN backend for the following operators: MUL_MAT, MUL_MAT_ID, GET_ROWS, SET_ROWS, CPY, CONT, and OUT_PROD. This enables BF16 models to run on Ascend NPUs. * CANN: skip NZ weight format for BF16 and add 310P compile guards NZ weight format conversion does not support BF16 tensors, so skip it in set_tensor, get_alloc_size, and mul_mat. Remove BF16 from MUL_MAT_ID and OUT_PROD as there are no BF16 use cases. Add #ifndef ASCEND_310P guards for all BF16 operator support since 310P does not support BF16.
This commit is contained in:
@@ -1788,9 +1788,11 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||||||
ggml_tensor * src0 = dst->src[0]; // src
|
ggml_tensor * src0 = dst->src[0]; // src
|
||||||
ggml_tensor * src1 = dst->src[1]; // index
|
ggml_tensor * src1 = dst->src[1]; // index
|
||||||
|
|
||||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16
|
||||||
|
|| dst->type == GGML_TYPE_BF16);
|
||||||
|
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
if (src0->type == dst->type) {
|
if (src0->type == dst->type) {
|
||||||
@@ -1881,6 +1883,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
{
|
{
|
||||||
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
||||||
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
|
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
|
||||||
@@ -1891,7 +1894,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||||
}
|
}
|
||||||
acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
|
acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
|
||||||
src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||||
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
|
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||||
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
|
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
|
||||||
dst->type);
|
dst->type);
|
||||||
@@ -1965,7 +1968,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor *
|
|||||||
|
|
||||||
// Only check env once.
|
// Only check env once.
|
||||||
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||||
if (weight_to_nz && is_matmul_weight(weight)) {
|
if (weight_to_nz && weight->type != GGML_TYPE_BF16 && is_matmul_weight(weight)) {
|
||||||
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
|
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
|
||||||
} else {
|
} else {
|
||||||
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
||||||
@@ -2146,6 +2149,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||||||
switch (type) {
|
switch (type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
#ifndef ASCEND_310P
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
|
#endif
|
||||||
ggml_cann_mat_mul_fp(ctx, dst);
|
ggml_cann_mat_mul_fp(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
|
|||||||
@@ -1234,7 +1234,8 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||||||
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
||||||
if (!need_transform(tensor->type)) {
|
if (!need_transform(tensor->type)) {
|
||||||
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
||||||
if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
if (weight_to_nz && tensor->type != GGML_TYPE_BF16
|
||||||
|
&& is_matmul_weight((const ggml_tensor *) tensor)) {
|
||||||
GGML_ASSERT(tensor->ne[2] == 1);
|
GGML_ASSERT(tensor->ne[2] == 1);
|
||||||
GGML_ASSERT(tensor->ne[3] == 1);
|
GGML_ASSERT(tensor->ne[3] == 1);
|
||||||
weight_format_to_nz(tensor, offset, ctx->device);
|
weight_format_to_nz(tensor, offset, ctx->device);
|
||||||
@@ -1443,7 +1444,8 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
|||||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||||
}
|
}
|
||||||
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
} else if (weight_to_nz && tensor->type != GGML_TYPE_BF16
|
||||||
|
&& is_matmul_weight((const ggml_tensor *) tensor)) {
|
||||||
// NZ format weight are not support quantized yet.
|
// NZ format weight are not support quantized yet.
|
||||||
// If ND tensor transform to NZ, size may changed.
|
// If ND tensor transform to NZ, size may changed.
|
||||||
int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
|
int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
|
||||||
@@ -2283,6 +2285,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
{
|
{
|
||||||
switch (op->src[0]->type) {
|
switch (op->src[0]->type) {
|
||||||
|
#ifndef ASCEND_310P
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
|
#endif
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
return true;
|
return true;
|
||||||
@@ -2320,6 +2325,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||||||
switch (op->src[0]->type) {
|
switch (op->src[0]->type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
#ifndef ASCEND_310P
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
|
#endif
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
@@ -2332,6 +2340,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||||||
switch (op->type) {
|
switch (op->type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
#ifndef ASCEND_310P
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
|
#endif
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
@@ -2341,20 +2352,30 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
|||||||
case GGML_OP_CPY:
|
case GGML_OP_CPY:
|
||||||
{
|
{
|
||||||
ggml_tensor * src = op->src[0];
|
ggml_tensor * src = op->src[0];
|
||||||
|
#ifdef ASCEND_310P
|
||||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||||
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
|
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
|
||||||
// only support F32 and F16.
|
// only support F32 and F16 on 310P.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_BF16) ||
|
||||||
|
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16 && src->type != GGML_TYPE_BF16)) {
|
||||||
|
// only support F32, F16 and BF16.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case GGML_OP_CONT:
|
case GGML_OP_CONT:
|
||||||
{
|
{
|
||||||
// TODO: support GGML_TYPE_BF16
|
|
||||||
switch (op->src[0]->type) {
|
switch (op->src[0]->type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
#ifndef ASCEND_310P
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
|
#endif
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
Reference in New Issue
Block a user