graph : utilize ggml_build_forward_select() to avoid reallocations (#18898)

* graph : avoid branches between embedding and token inputs
* models : make deepstack graphs (e.g. Qwen3 VL) have constant topology
* ci : enable -DGGML_SCHED_NO_REALLOC=ON for server CI
* cont : pad token embeddings to n_embd_inp
2026-01-23 18:22:34 +02:00
parent cb6caca191
commit 557515be1e
7 changed files with 69 additions and 53 deletions
@@ -2903,7 +2903,7 @@ void llama_context::opt_epoch_iter(
                };
                ctx_compute_opt = ggml_init(params);
            }
-            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
+            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
            ggml_opt_alloc(opt_ctx, train);

            res->set_inputs(&ubatch);