Files
intel-gpu-llm-diagnosis/repos/patch/phase5-op-timing/patch.diff
T
Pi Agent 48bbb14a64 Phase 5: Q4_0 vdr_mmvq 2→4 (+19% tg128, 30→36 t/s)
Root cause analysis corrected: Q4_0 low BW utilization is NOT due to
SYCL submission model overhead. Per-op profiling proves the bottleneck
is dp4a compute throughput — nibble extraction requires 2 dp4a per byte
vs Q8_0's 1 dp4a per byte, making Q4_0 compute-bound at ~30 t/s.

Fix: Increase vdr_mmvq from 2 to 4 for Q4_0 reorder path, processing
16 blocks per subgroup instead of 8. Better amortizes dp4a overhead.

Benchmark (Qwen3.5-9B, Arc A770, llama-bench -p 512 -n 128 -r 3):
  Q4_0:  30.16 → 35.96 t/s (+19%)
  Q8_0:  29.96 → 30.82 t/s (within noise)
  Q4_K_M: 24.65 → 25.32 t/s (within noise)

Also includes:
- Timing instrumentation patch (for debugging, not applied to source)
- Updated decisions log (Decisions 8-9)
- Updated workplan with revised benchmark data
- Root cause analysis document
2026-04-15 19:55:44 +03:00

70 lines
3.1 KiB
Diff

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 1234567..abcdef0 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4409,6 +4409,21 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph)
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
ggml_sycl_set_main_device(sycl_ctx->device);
+ // Timing instrumentation
+ static int call_count = 0;
+ static bool timing_enabled = (getenv("GGML_SYCL_OP_TIMING") != nullptr);
+ auto t_start = std::chrono::high_resolution_clock::now();
+
+ struct op_stats {
+ const char* name;
+ int count;
+ double total_ms;
+ };
+ op_stats stats[GGML_OP_COUNT] = {};
+
+ auto t_loop_start = std::chrono::high_resolution_clock::now();
+ int compute_ops = 0;
+
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -4420,10 +4434,42 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
}
}
+
+ auto t_op_start = std::chrono::high_resolution_clock::now();
+
bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
if (!ok) {
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
GGML_ASSERT(ok);
+
+ if (timing_enabled) {
+ // Flush the queue to get accurate per-op timing
+ sycl_ctx->stream()->wait();
+ auto t_op_end = std::chrono::high_resolution_clock::now();
+ double op_ms = std::chrono::duration<double, std::milli>(t_op_end - t_op_start).count();
+ int op_idx = (int)node->op;
+ if (op_idx >= 0 && op_idx < GGML_OP_COUNT) {
+ stats[op_idx].name = ggml_op_name(node->op);
+ stats[op_idx].count++;
+ stats[op_idx].total_ms += op_ms;
+ }
+ compute_ops++;
+ }
}
+
+ if (timing_enabled) {
+ auto t_loop_end = std::chrono::high_resolution_clock::now();
+ double loop_ms = std::chrono::duration<double, std::milli>(t_loop_end - t_loop_start).count();
+ call_count++;
+ if (call_count <= 5 || call_count % 100 == 0) {
+ fprintf(stderr, "[OP-TIMING] call=%d total_ops=%d loop=%.2fms\n", call_count, compute_ops, loop_ms);
+ for (int i = 0; i < GGML_OP_COUNT; i++) {
+ if (stats[i].count > 0) {
+ fprintf(stderr, " %-20s count=%3d total=%.2fms avg=%.3fms\n",
+ stats[i].name, stats[i].count, stats[i].total_ms, stats[i].total_ms/stats[i].count);
+ }
+ }
+ }
+ }
}