48bbb14a64
Root cause analysis corrected: Q4_0's low bandwidth utilization is NOT due to SYCL submission-model overhead. Per-op profiling shows the bottleneck is dp4a compute throughput — nibble extraction requires 2 dp4a instructions per byte vs. Q8_0's 1 dp4a per byte, making Q4_0 compute-bound at ~30 t/s.

Fix: increase vdr_mmvq from 2 to 4 for the Q4_0 reorder path, processing 16 blocks per subgroup instead of 8. This better amortizes the dp4a overhead.

Benchmarks (Qwen3.5-9B, Arc A770, llama-bench -p 512 -n 128 -r 3):
- Q4_0:   30.16 → 35.96 t/s (+19%)
- Q8_0:   29.96 → 30.82 t/s (within noise)
- Q4_K_M: 24.65 → 25.32 t/s (within noise)

Also includes:
- Timing instrumentation patch (for debugging; not applied to source)
- Updated decisions log (Decisions 8-9)
- Updated workplan with revised benchmark data
- Root cause analysis document
70 lines
3.1 KiB
Diff
70 lines
3.1 KiB
Diff
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
|
index 1234567..abcdef0 100644
|
|
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
|
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
|
@@ -4409,6 +4409,20 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph)
|
|
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
|
|
ggml_sycl_set_main_device(sycl_ctx->device);
|
|
|
|
+ // Timing instrumentation
|
|
+ static int call_count = 0;
|
|
+ static bool timing_enabled = (getenv("GGML_SYCL_OP_TIMING") != nullptr);
|
|
+ auto t_start = std::chrono::high_resolution_clock::now();
|
|
+
|
|
+ struct op_stats {
|
|
+ const char* name;
|
|
+ int count;
|
|
+ double total_ms;
|
|
+ };
|
|
+ op_stats stats[GGML_OP_COUNT] = {};
|
|
+
|
|
+ auto t_loop_start = std::chrono::high_resolution_clock::now();
|
|
+ int compute_ops = 0;
|
|
+
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = cgraph->nodes[i];
|
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
@@ -4420,10 +4434,28 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
|
|
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
|
}
|
|
}
|
|
+
|
|
+ auto t_op_start = std::chrono::high_resolution_clock::now();
|
|
+
|
|
bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
|
|
if (!ok) {
|
|
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
|
}
|
|
GGML_ASSERT(ok);
|
|
+
|
|
+ if (timing_enabled) {
|
|
+ // Flush the queue to get accurate per-op timing
|
|
+ sycl_ctx->stream()->wait();
|
|
+ auto t_op_end = std::chrono::high_resolution_clock::now();
|
|
+ double op_ms = std::chrono::duration<double, std::milli>(t_op_end - t_op_start).count();
|
|
+ int op_idx = (int)node->op;
|
|
+ if (op_idx >= 0 && op_idx < GGML_OP_COUNT) {
|
|
+ stats[op_idx].name = ggml_op_name(node->op);
|
|
+ stats[op_idx].count++;
|
|
+ stats[op_idx].total_ms += op_ms;
|
|
+ }
|
|
+ compute_ops++;
|
|
+ }
|
|
}
|
|
+
|
|
+ if (timing_enabled) {
|
|
+ auto t_loop_end = std::chrono::high_resolution_clock::now();
|
|
+ double loop_ms = std::chrono::duration<double, std::milli>(t_loop_end - t_loop_start).count();
|
|
+ call_count++;
|
|
+ if (call_count <= 5 || call_count % 100 == 0) {
|
|
+ fprintf(stderr, "[OP-TIMING] call=%d total_ops=%d loop=%.2fms\n", call_count, compute_ops, loop_ms);
|
|
+ for (int i = 0; i < GGML_OP_COUNT; i++) {
|
|
+ if (stats[i].count > 0) {
|
|
+ fprintf(stderr, " %-20s count=%3d total=%.2fms avg=%.3fms\n",
|
|
+ stats[i].name, stats[i].count, stats[i].total_ms, stats[i].total_ms/stats[i].count);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
}
|