[Model] Qwen3.5 dense and MoE support (no vision) (#19435)

* Unified delta net handling

* Remove old methods.

* Refactor and optimize

* Adapt autoregressive version from @ymcki

* Change to decay mask approach

* Fix bad permute

* Qwen 3.5 support

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Further fixes

* Use inheritance, remove unneeded conts

* Not like this!

* Remove ggml.h explicit import

* Remove transformers, fix the views

* ACTUALLY fix views, make super calls explicit in conversion.

* Fix conversion again

* Remove extra ggml.h imports

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Piotr Wilkin (ilintar)
2026-02-09 00:24:08 +01:00
committed by GitHub
parent e06088da0f
commit 39bf692af1
14 changed files with 1532 additions and 399 deletions
+101 -1
View File
@@ -17,6 +17,53 @@ struct llm_graph_context_mamba : public llm_graph_context {
};
// Shared base for models using delta-net style linear attention
// (Qwen3-Next, Qwen3.5). Extends the mamba graph context with helpers that
// build the unified delta-net update. Each helper returns a pair of tensors
// (presumably {layer output, updated recurrent state} — confirm against the
// .cpp implementation).
struct llm_graph_context_delta : public llm_graph_context_mamba {
llm_graph_context_delta(const llm_graph_params & params);
// Virtual destructor: this type is used as a polymorphic base
// (e.g. llm_build_qwen3_5 below).
virtual ~llm_graph_context_delta() = default;
// Chunked formulation: processes the sequence in blocks of `chunk_size`,
// using the precomputed `causal_mask`/`diag_mask` and an `identity` matrix
// tensor. `eps_norm` is the epsilon used for normalization; `il` is the
// layer index.
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_unified_chunking(
ggml_context * ctx0,
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il,
int64_t chunk_size,
float eps_norm);
// Token-by-token (autoregressive) formulation: same inputs as the chunked
// variant minus the masks/identity, which are not needed when stepping one
// token at a time.
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_unified_autoregressive(
ggml_context * ctx0,
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
int il,
float eps_norm);
// Entry point: presumably dispatches between the chunked and autoregressive
// variants above depending on context — TODO confirm in the .cpp.
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_unified(
ggml_context * ctx0,
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il,
int64_t chunk_size,
float eps_norm);
};
// Base class for RWKV-related models
struct llm_build_rwkv6_base : public llm_graph_context {
const llama_model & model;
@@ -476,7 +523,7 @@ struct llm_build_qwen3vl : public llm_graph_context {
// Graph builder for the Qwen3-VL MoE variant; all work happens in the
// constructor, matching the other llm_build_* builders in this file.
struct llm_build_qwen3vlmoe : public llm_graph_context {
llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_qwen3next : public llm_graph_context_mamba {
struct llm_build_qwen3next : public llm_graph_context_delta {
llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
private:
ggml_tensor * build_layer_attn(
@@ -534,6 +581,59 @@ private:
const llama_model & model;
};
// Graph builder for Qwen3.5 dense models. Combines standard KV-cache
// attention layers with linear (delta-net) attention layers; the FFN
// construction is virtual so the MoE variant can override just that part.
struct llm_build_qwen3_5 : public llm_graph_context_delta {
llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params);
protected:
// Tag type for subclass constructors that need to call build_graph() themselves
// (to ensure virtual dispatch works correctly)
struct defer_graph_build_t {};
// Deferred-build constructor: does NOT build the graph; the derived class's
// constructor must call build_graph() once its vtable is in place so that
// the build_layer_ffn() override is dispatched correctly.
llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t);
void build_graph();
// Builds the feed-forward block for layer `il`; overridden by
// llm_build_qwen3_5_moe.
virtual ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
int il);
const llama_model & model;
private:
// Standard (full) attention layer using the KV-cache input and positions.
ggml_tensor * build_layer_attn(
llm_graph_input_attn_kv * inp_attn,
ggml_tensor * cur,
ggml_tensor * inp_pos,
int il);
// Linear/delta-net attention layer; takes the precomputed masks and
// identity tensors consumed by the delta-net helpers in the base class.
ggml_tensor * build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
// Gated normalization of `input` with `weights`, modulated by `gate`
// (exact norm flavor lives in the .cpp — confirm there).
ggml_tensor * build_norm_gated(
ggml_tensor * input,
ggml_tensor * weights,
ggml_tensor * gate,
int layer);
// Builds the fused Q/K/V/Z projections for layer `il`; returns two tensors
// (presumably the QKV part and the Z gate — verify against the caller).
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
ggml_tensor * input,
int il);
};
// MoE variant of Qwen3.5: reuses the whole dense graph and overrides only the
// per-layer FFN construction. Its constructor must use the base class's
// defer_graph_build_t tag so build_graph() dispatches to this override.
struct llm_build_qwen3_5_moe : public llm_build_qwen3_5 {
llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params);
protected:
// MoE feed-forward block for layer `il`, replacing the dense FFN.
ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
int il) override;
};
// Graph builder for the original Qwen architecture; constructor builds the
// full graph, matching the other llm_build_* builders in this file.
struct llm_build_qwen : public llm_graph_context {
llm_build_qwen(const llama_model & model, const llm_graph_params & params);
};