48bbb14a64
Root cause analysis corrected: Q4_0's low bandwidth utilization is NOT due to SYCL submission-model overhead. Per-op profiling shows the bottleneck is dp4a compute throughput — nibble extraction requires 2 dp4a instructions per byte vs. Q8_0's 1 dp4a per byte, making Q4_0 compute-bound at ~30 t/s.

Fix: increase vdr_mmvq from 2 to 4 for the Q4_0 reorder path, processing 16 blocks per subgroup instead of 8. This better amortizes the dp4a overhead.

Benchmarks (Qwen3.5-9B, Arc A770, llama-bench -p 512 -n 128 -r 3):
- Q4_0:   30.16 → 35.96 t/s (+19%)
- Q8_0:   29.96 → 30.82 t/s (within noise)
- Q4_K_M: 24.65 → 25.32 t/s (within noise)

Also includes:
- Timing instrumentation patch (for debugging; not applied to source)
- Updated decisions log (Decisions 8-9)
- Updated workplan with revised benchmark data
- Root cause analysis document
70 lines
3.1 KiB
Diff
70 lines
3.1 KiB
Diff
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
|
index 1234567..abcdef0 100644
|
|
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
|
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
|
@@ -4409,6 +4409,20 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph)
|
|
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
|
|
ggml_sycl_set_main_device(sycl_ctx->device);
|
|
|
|
+ // Timing instrumentation
|
|
+ static int call_count = 0;
|
|
+ static bool timing_enabled = (getenv("GGML_SYCL_OP_TIMING") != nullptr);
|
|
+ auto t_start = std::chrono::high_resolution_clock::now();
|
|
+
|
|
+ struct op_stats {
|
|
+ const char* name;
|
|
+ int count;
|
|
+ double total_ms;
|
|
+ };
|
|
+ op_stats stats[GGML_OP_COUNT] = {};
|
|
+
|
|
+ auto t_loop_start = std::chrono::high_resolution_clock::now();
|
|
+ int compute_ops = 0;
|
|
+
|
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
ggml_tensor * node = cgraph->nodes[i];
|
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
@@ -4420,10 +4434,28 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
|
|
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
|
}
|
|
}
|
|
+
|
|
+ auto t_op_start = std::chrono::high_resolution_clock::now();
|
|
+
|
|
bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
|
|
if (!ok) {
|
|
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
|
}
|
|
GGML_ASSERT(ok);
|
|
+
|
|
+ if (timing_enabled) {
|
|
+ // Flush the queue to get accurate per-op timing
|
|
+ sycl_ctx->stream()->wait();
|
|
+ auto t_op_end = std::chrono::high_resolution_clock::now();
|
|
+ double op_ms = std::chrono::duration<double, std::milli>(t_op_end - t_op_start).count();
|
|
+ int op_idx = (int)node->op;
|
|
+ if (op_idx >= 0 && op_idx < GGML_OP_COUNT) {
|
|
+ stats[op_idx].name = ggml_op_name(node->op);
|
|
+ stats[op_idx].count++;
|
|
+ stats[op_idx].total_ms += op_ms;
|
|
+ }
|
|
+ compute_ops++;
|
|
+ }
|
|
}
|
|
+
|
|
+ if (timing_enabled) {
|
|
+ auto t_loop_end = std::chrono::high_resolution_clock::now();
|
|
+ double loop_ms = std::chrono::duration<double, std::milli>(t_loop_end - t_loop_start).count();
|
|
+ call_count++;
|
|
+ if (call_count <= 5 || call_count % 100 == 0) {
|
|
+ fprintf(stderr, "[OP-TIMING] call=%d total_ops=%d loop=%.2fms\n", call_count, compute_ops, loop_ms);
|
|
+ for (int i = 0; i < GGML_OP_COUNT; i++) {
|
|
+ if (stats[i].count > 0) {
|
|
+ fprintf(stderr, " %-20s count=%3d total=%.2fms avg=%.3fms\n",
|
|
+ stats[i].name, stats[i].count, stats[i].total_ms, stats[i].total_ms/stats[i].count);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
}
|