cuda : fix nkvo, offload and cuda graph node properties matching (#19165)

* cuda : fix nkvo

* cont : more robust cuda graph node property matching

* cont : restore pre-leafs implementation

* cont : comments + static_assert
This commit is contained in:
Georgi Gerganov
2026-01-29 18:45:30 +02:00
committed by GitHub
parent 7b7ae857f6
commit 4fdbc1e4db
4 changed files with 59 additions and 32 deletions
+10 -2
View File
@@ -1122,15 +1122,17 @@ struct ggml_tensor_extra_gpu {
#endif
// Snapshot of the properties of a single ggml graph node, recorded so that a
// previously captured CUDA graph can be reused only when the new graph's nodes
// match the recorded ones.
struct ggml_cuda_graph_node_properties {
    void * node_address;                                  // address of the ggml_tensor node itself
    void * node_data;                                     // node->data pointer (distinct from the tensor struct address)
    ggml_op node_op;                                      // operation performed by the node
    int32_t flags;                                        // node flags (e.g. input/output markers — TODO confirm exact semantics)
    int64_t ne[GGML_MAX_DIMS];                            // number of elements per dimension
    size_t nb[GGML_MAX_DIMS];                             // stride in bytes per dimension
    void * src_address[GGML_MAX_SRC];                     // addresses of the source tensor structs
    void * src_data[GGML_MAX_SRC];                        // data pointers of the source tensors
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; // raw per-op parameters
};
// Must stay trivial: instances are presumably compared/copied as raw bytes
// (e.g. memcmp/memcpy) when checking whether a CUDA graph can be reused.
static_assert(std::is_trivial<ggml_cuda_graph_node_properties>::value, "ggml_cuda_graph_node_properties must be trivial");
struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
~ggml_cuda_graph() {
@@ -1150,6 +1152,12 @@ struct ggml_cuda_graph {
int number_consecutive_updates = 0;
std::vector<ggml_cuda_graph_node_properties> props;
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
    // their properties also have to match in order to be able to safely reuse a CUDA graph
// ref: https://github.com/ggml-org/llama.cpp/pull/18583
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
std::vector<ggml_cuda_graph_node_properties> extra;
void record_update(bool use_graph, bool update_required) {
if (use_graph && update_required) {
number_consecutive_updates++;