graph : utilize ggml_build_forward_select() to avoid reallocations (#18898)

* graph : avoid branches between embedding and token inputs

* models : make deepstack graphs (e.g. Qwen3 VL) have constant topology

* ci : enable -DGGML_SCHED_NO_REALLOC=ON for server CI

* cont : pad token embeddings to n_embd_inp
This commit is contained in:
Georgi Gerganov
2026-01-23 18:22:34 +02:00
committed by GitHub
parent cb6caca191
commit 557515be1e
7 changed files with 69 additions and 53 deletions
+6 -3
View File
@@ -106,7 +106,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
class llm_graph_input_embd : public llm_graph_input_i {
public:
llm_graph_input_embd() = default;
llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
virtual ~llm_graph_input_embd() = default;
void set_input(const llama_ubatch * ubatch) override;
@@ -115,6 +115,8 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
const int64_t n_embd = 0;
};
class llm_graph_input_pos : public llm_graph_input_i {
@@ -566,7 +568,7 @@ public:
virtual ~llm_graph_result() = default;
ggml_tensor * get_tokens() const { return t_tokens; }
ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
ggml_tensor * get_logits() const { return t_logits; }
ggml_tensor * get_embd() const { return t_embd; }
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
@@ -593,7 +595,8 @@ public:
void set_params(const llm_graph_params & params);
// important graph nodes
ggml_tensor * t_tokens = nullptr;
ggml_tensor * t_inp_tokens = nullptr;
ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;