opencl: add flattened q6_K mv (#19054)

* opencl: flatten `q6_K` and add `kernel_mul_mv_q6_K_f32_flat`

* opencl: clean up

* opencl: refactor q6_K mv - put loop body in `block_q_6_K_dot_y_flat`

* opencl: tweak the workgroup size a bit

* opencl: output 4 values per subgroup for `kernel_mul_mv_q6_K_f32_flat`

* opencl: proper alignment for q6_K

* opencl: boundary handling for flattened q6_K mv

* opencl: rename q6_K mv kernel file

* opencl: put flattened q6_K mv in its own file

* opencl: use lower k in file name

* opencl: use K in variable names
Author: lhez
Date: 2026-01-26 19:36:24 -08:00
Committed by: GitHub
Parent: b0311c16d2
Commit: 94eeb5967c
5 changed files with 518 additions and 9 deletions
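The "flattening" referred to above converts q6_K data from ggml's array-of-structs layout into four separate buffers (struct-of-arrays) so the kernel can issue contiguous loads per field. For reference, the packed super block as defined in ggml-common.h, with QK_K = 256:

    typedef struct {
        uint8_t ql[QK_K/2];      // quants, lower 4 bits
        uint8_t qh[QK_K/4];      // quants, upper 2 bits
        int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
        ggml_half d;             // super-block scale
    } block_q6_K;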
@@ -533,8 +533,10 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_restore_block_q4_0_noshuffle;
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_mul_mv_q6_K_f32_flat;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
cl_kernel kernel_solve_tri_f32;
@@ -892,6 +894,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
GGML_LOG_CONT(".");
}
@@ -1114,14 +1118,14 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
-    // mul_mv_q6_k
+    // mul_mv_q6_k_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
-        #include "mul_mv_q6_k.cl.h"
+        #include "mul_mv_q6_k_f32.cl.h"
};
#else
-        const std::string kernel_src = read_file("mul_mv_q6_k.cl");
+        const std::string kernel_src = read_file("mul_mv_q6_k_f32.cl");
#endif
backend_ctx->program_mul_mv_q6_K =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
@@ -1130,6 +1134,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mv_q6_k_f32_flat
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_q6_k_f32_flat.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_q6_k_f32_flat.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q6_K_f32_flat", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
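Under GGML_OPENCL_EMBED_KERNELS, the #include'd .cl.h header is generated at build time and expands to a string literal holding the kernel source, so no file read is needed at runtime. A hypothetical shape of the generated header (the actual file is produced by the build system):

    // mul_mv_q6_k_f32_flat.cl.h (generated)
    R"(
    ...contents of mul_mv_q6_k_f32_flat.cl...
    )"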
// mul_mv_q8_0_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2919,6 +2940,50 @@ struct ggml_tensor_extra_cl_q8_0 {
}
};
struct ggml_tensor_extra_cl_q6_K {
// Lower 4 bits of quantized weights.
cl_mem ql = nullptr;
// Upper 2 bits of quantized weights.
cl_mem qh = nullptr;
// 8-bit scales, one per 16-element group within a super block.
cl_mem s = nullptr;
// fp16 scale, one per super block.
cl_mem d = nullptr;
size_t size_ql = 0;
size_t size_qh = 0;
size_t size_s = 0;
size_t size_d = 0;
~ggml_tensor_extra_cl_q6_K() {
reset();
}
void reset() {
if (ql != nullptr) {
CL_CHECK(clReleaseMemObject(ql));
ql = nullptr;
}
if (qh != nullptr) {
CL_CHECK(clReleaseMemObject(qh));
qh = nullptr;
}
if (s != nullptr) {
CL_CHECK(clReleaseMemObject(s));
s = nullptr;
}
if (d != nullptr) {
CL_CHECK(clReleaseMemObject(d));
d = nullptr;
}
size_ql = 0;
size_qh = 0;
size_s = 0;
size_d = 0;
}
};
//------------------------------------------------------------------------------
// Backend API
//------------------------------------------------------------------------------
@@ -3465,6 +3530,12 @@ struct ggml_backend_opencl_buffer_context {
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
delete e;
}
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
delete e;
}
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
delete e;
}
}
ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
@@ -3527,6 +3598,21 @@ struct ggml_backend_opencl_buffer_context {
return extra;
}
ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
ggml_tensor_extra_cl_q6_K * extra;
if (temp_tensor_extras_q6_K.empty()) {
extra = new ggml_tensor_extra_cl_q6_K();
} else {
extra = temp_tensor_extras_q6_K.back();
temp_tensor_extras_q6_K.pop_back();
}
temp_tensor_extras_q6_K_in_use.push_back(extra);
extra->reset();
return extra;
}
void reset() {
for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
temp_tensor_extras.push_back(e);
@@ -3547,6 +3633,11 @@ struct ggml_backend_opencl_buffer_context {
temp_tensor_extras_q8_0.push_back(e);
}
temp_tensor_extras_q8_0_in_use.clear();
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
temp_tensor_extras_q6_K.push_back(e);
}
temp_tensor_extras_q6_K_in_use.clear();
}
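For context, the pool mechanics mirror the existing q4_0/mxfp4/q8_0 extras:

    // ggml_opencl_alloc_temp_tensor_extra_q6_K(): moves an extra from the
    //   free list to the in-use list and reset()s it, releasing any stale
    //   subbuffers before reuse.
    // buffer_context::reset(): moves all in-use extras back to the free
    //   list, so the objects are recycled across graph evaluations rather
    //   than reallocated.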
// Pools for extras. Available extras are in `temp_tensor_extras`. Extras
@@ -3562,6 +3653,8 @@ struct ggml_backend_opencl_buffer_context {
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
// before any tensor is initialized (at the beginning of alloc_tensor_range).
@@ -4068,6 +4161,92 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
return;
}
if (tensor->type == GGML_TYPE_Q6_K) {
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
// Allocate the new extra and create aliases from the original.
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
ggml_tensor_extra_cl_q6_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q6_K();
size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4;
size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16;
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) &&
"Incorrect tensor size");
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
CL_CHECK(clEnqueueWriteBuffer(
queue, data_device, CL_TRUE, 0,
ggml_nbytes(tensor), data, 0, NULL, NULL));
cl_buffer_region region;
// Subbuffer for ql
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
region.size = size_ql;
extra->ql = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
auto previous_origin = region.origin;
// Subbuffer for qh
region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment);
region.size = size_qh;
extra->qh = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
previous_origin = region.origin;
// Subbuffer for scales
region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
region.size = size_s;
extra->s = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
previous_origin = region.origin;
// Create subbuffer for d.
region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
region.size = size_d;
extra->d = clCreateSubBuffer(
extra_orig->data_device, CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);
previous_origin = region.origin;
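Each region origin is rounded up to the device's subbuffer alignment (clCreateSubBuffer fails with CL_MISALIGNED_SUB_BUFFER_OFFSET otherwise). A minimal sketch of what align_to presumably does, assuming backend_ctx->alignment is derived from CL_DEVICE_MEM_BASE_ADDR_ALIGN:

    static size_t align_to(size_t value, size_t alignment) {
        // round value up to the next multiple of alignment (alignment != 0)
        return ((value + alignment - 1) / alignment) * alignment;
    }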
// Flatten the weights
cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {64, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clReleaseMemObject(data_device));
extra->size_ql = size_ql;
extra->size_qh = size_qh;
extra->size_s = size_s;
extra->size_d = size_d;
tensor->extra = extra;
return;
}
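A hedged sketch of what the convert kernel does — one work item per super block, scattering the packed fields into the four flat buffers (the actual kernel lives in the cvt program; the argument order matches the clSetKernelArg calls above):

    #pragma OPENCL EXTENSION cl_khr_fp16 : enable
    #define QK_K 256

    typedef struct {
        uchar ql[QK_K/2];
        uchar qh[QK_K/4];
        char  scales[QK_K/16];
        half  d;
    } block_q6_K;

    kernel void kernel_convert_block_q6_K(
            global block_q6_K * src,
            global uchar * ql, global uchar * qh,
            global char * scales, global half * d
    ) {
        const size_t i = get_global_id(0);  // super-block index
        for (int k = 0; k < QK_K/2;  ++k) ql[i*(QK_K/2)  + k] = src[i].ql[k];
        for (int k = 0; k < QK_K/4;  ++k) qh[i*(QK_K/4)  + k] = src[i].qh[k];
        for (int k = 0; k < QK_K/16; ++k) scales[i*(QK_K/16) + k] = src[i].scales[k];
        d[i] = src[i].d;                    // fp16 super-block scale
    }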
#endif // GGML_OPENCL_SOA_Q
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
@@ -4277,6 +4456,34 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
return;
}
if (tensor->type == GGML_TYPE_Q6_K) {
ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, &evt));
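kernel_restore_block_q6_K is the inverse gather: each work item reassembles one packed block_q6_K in data_device from the four flat buffers, so the buffer holds ordinary q6_K data again for the host read-back.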
@@ -7765,6 +7972,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
#endif
const int ne00 = src0 ? src0->ne[0] : 0;
@@ -8648,14 +8856,49 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
#ifdef GGML_OPENCL_SOA_Q
kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat;
if (backend_ctx->gpu_family == INTEL) {
nth0 = 16;
nth1 = 2;
ndst = 4;
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 2;
ndst = 4;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3));
#else
kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
if (backend_ctx->gpu_family == INTEL) {
-                nth0 = 2;
-                nth1 = 16;
+                nth0 = 16;
+                nth1 = 2;
+                ndst = 1;
} else if (backend_ctx->gpu_family == ADRENO) {
-                nth0 = 2;
-                nth1 = 64;
+                nth0 = 64;
+                nth1 = 2;
+                ndst = 1;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
@@ -8675,6 +8918,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
#endif // GGML_OPENCL_SOA_Q
break;
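For reference, a hedged skeleton of the flat kernel's signature as implied by the argument list above (internals omitted; not the actual implementation):

    kernel void kernel_mul_mv_q6_K_f32_flat(
            global uchar * ql,      // lower 4 bits of quants, flattened
            global uchar * qh,      // upper 2 bits of quants, flattened
            global char  * scales,  // 8-bit group scales, flattened
            global half  * d,       // fp16 super-block scales
            global char  * src1, ulong offset1,
            global char  * dst,  ulong offsetd,
            int ne00, int ne01, int ne02,
            int ne10, int ne12,
            int ne0,  int ne1,
            int r2,   int r3
    ) {
        // each subgroup of nth0 lanes accumulates the dot products for
        // ndst = 4 output rows; nth1 subgroups per workgroup (see dispatch)
    }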
case GGML_TYPE_MXFP4: {
#ifdef GGML_OPENCL_SOA_Q
@@ -8777,7 +9021,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
} else if (src0t == GGML_TYPE_Q5_K) {
GGML_ASSERT(false && "not implemented");
} else if (src0t == GGML_TYPE_Q6_K) {
-            size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+            size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
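To unpack the new first dimension: each subgroup produces ndst output rows and a workgroup holds nth1 subgroups, so one workgroup covers ndst*nth1 rows of dst; ne01 is ceil-divided by that and scaled by the nth0 work items per subgroup. Worked numbers for the Adreno path (nth0 = 64, nth1 = 2, ndst = 4):

    // rows per workgroup: ndst * nth1 = 8
    // e.g. ne01 = 4096:   num_groups_x = (4096 + 8 - 1) / 8 = 512
    // global_work_size[0] = 512 * 64 = 32768, local_work_size = {64, 2, 1}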