[Model] Qwen3.5 dense and MoE support (no vision) (#19435)
* Unified delta net handling
* Remove old methods.
* Refactor and optimize
* Adapt autoregressive version from @ymcki
* Change to decay mask approach
* Fix bad permute
* Qwen 3.5 support
* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Further fixes
* Use inheritance, remove unneeded conts
* Not like this!
* Remove ggml.h explicit import
* Remove transformers, fix the views
* ACTUALLY fix views, make super calls explicit in conversion.
* Fix conversion again
* Remove extra ggml.h imports

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
committed by
GitHub
parent
e06088da0f
commit
39bf692af1
+101
-1
@@ -17,6 +17,53 @@ struct llm_graph_context_mamba : public llm_graph_context {
|
||||
|
||||
};
|
||||
|
||||
// Shared graph-building context for delta-net ("DeltaNet") linear-attention
// layers, used by the Qwen3-Next and Qwen3.5 builders below. Inherits the
// mamba context, presumably for its recurrent-state helpers — confirm against
// llm_graph_context_mamba.
struct llm_graph_context_delta : public llm_graph_context_mamba {
    llm_graph_context_delta(const llm_graph_params & params);

    // Virtual dtor: this type is used as a polymorphic base (see
    // llm_build_qwen3_5 / llm_build_qwen3next).
    virtual ~llm_graph_context_delta() = default;

    // Chunk-wise delta-net evaluation: processes the sequence in chunks of
    // `chunk_size`, using the caller-supplied causal/diagonal masks and
    // identity tensor.
    // NOTE(review): the returned pair is presumably {layer output, updated
    // recurrent state} — confirm against the implementation.
    //  - q/k/v/g/beta: per-layer projections (gate and beta scaling included)
    //  - state:        incoming recurrent state for layer `il`
    //  - eps_norm:     epsilon for the normalization steps
    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_unified_chunking(
        ggml_context * ctx0,
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
        ggml_tensor * g,
        ggml_tensor * beta,
        ggml_tensor * state,
        ggml_tensor * causal_mask,
        ggml_tensor * identity,
        ggml_tensor * diag_mask,
        int il,
        int64_t chunk_size,
        float eps_norm);

    // Autoregressive (step-by-step) delta-net evaluation; takes no masks or
    // chunk size, consistent with single-token decode against the running
    // state. Same {output, state} pair convention as the chunked variant.
    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_unified_autoregressive(
        ggml_context * ctx0,
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
        ggml_tensor * g,
        ggml_tensor * beta,
        ggml_tensor * state,
        int il,
        float eps_norm);

    // Unified entry point; signature is a superset of both variants above,
    // so it presumably dispatches between the chunked and autoregressive
    // paths — confirm against the implementation.
    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_unified(
        ggml_context * ctx0,
        ggml_tensor * q,
        ggml_tensor * k,
        ggml_tensor * v,
        ggml_tensor * g,
        ggml_tensor * beta,
        ggml_tensor * state,
        ggml_tensor * causal_mask,
        ggml_tensor * identity,
        ggml_tensor * diag_mask,
        int il,
        int64_t chunk_size,
        float eps_norm);
};
|
||||
|
||||
// Base class for RWKV-related models
|
||||
struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
const llama_model & model;
|
||||
@@ -476,7 +523,7 @@ struct llm_build_qwen3vl : public llm_graph_context {
|
||||
// Graph builder for the Qwen3-VL-MoE architecture; all construction happens
// in the constructor, matching the other llm_build_* types in this header.
struct llm_build_qwen3vlmoe : public llm_graph_context {
    llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
};
|
||||
struct llm_build_qwen3next : public llm_graph_context_mamba {
|
||||
struct llm_build_qwen3next : public llm_graph_context_delta {
|
||||
llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
|
||||
private:
|
||||
ggml_tensor * build_layer_attn(
|
||||
@@ -534,6 +581,59 @@ private:
|
||||
const llama_model & model;
|
||||
};
|
||||
|
||||
// Graph builder for the Qwen3.5 dense architecture. Also serves as the base
// for the MoE variant (llm_build_qwen3_5_moe), which overrides only the FFN
// layer construction.
struct llm_build_qwen3_5 : public llm_graph_context_delta {
    llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params);

protected:
    // Tag type for subclass constructors that need to call build_graph() themselves
    // (to ensure virtual dispatch works correctly)
    struct defer_graph_build_t {};

    // Protected ctor used by subclasses: sets up state but defers
    // build_graph() to the derived constructor body, so that virtual calls
    // (build_layer_ffn) resolve to the derived overrides.
    llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t);

    // Builds the full compute graph; called from the public ctor, or from a
    // subclass ctor when the deferred overload was used.
    void build_graph();

    // Per-layer feed-forward block; virtual so the MoE subclass can swap in
    // expert routing while reusing the rest of the layer construction.
    virtual ggml_tensor * build_layer_ffn(
        ggml_tensor * cur,
        int il);

    const llama_model & model;

private:
    // Standard (full) attention layer for layer `il`.
    ggml_tensor * build_layer_attn(
        llm_graph_input_attn_kv * inp_attn,
        ggml_tensor * cur,
        ggml_tensor * inp_pos,
        int il);

    // Linear-attention (delta-net) layer; the mask/identity tensors match the
    // parameters of build_delta_net_unified in the base class.
    ggml_tensor * build_layer_attn_linear(
        llm_graph_input_rs * inp,
        ggml_tensor * cur,
        ggml_tensor * causal_mask,
        ggml_tensor * identity,
        ggml_tensor * diag_mask,
        int il);

    // Gated normalization: normalizes `input` with `weights`, modulated by
    // `gate` — exact formula lives in the implementation.
    ggml_tensor * build_norm_gated(
        ggml_tensor * input,
        ggml_tensor * weights,
        ggml_tensor * gate,
        int layer);

    // Builds the fused QKV(+Z) projections for layer `il`; pair contents are
    // presumably {qkv, z} — confirm against the implementation.
    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
        ggml_tensor * input,
        int il);
};
|
||||
|
||||
// Qwen3.5 MoE variant: reuses the dense builder wholesale and overrides only
// the per-layer FFN to route through experts. The ctor must use the base's
// defer_graph_build_t overload and invoke build_graph() itself so this
// override is dispatched during construction.
struct llm_build_qwen3_5_moe : public llm_build_qwen3_5 {
    llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params);

protected:
    // MoE feed-forward block replacing the dense FFN of the base class.
    ggml_tensor * build_layer_ffn(
        ggml_tensor * cur,
        int il) override;
};
|
||||
|
||||
// Graph builder for the original Qwen (v1) architecture; construction happens
// entirely in the constructor, like the other llm_build_* types here.
struct llm_build_qwen : public llm_graph_context {
    llm_build_qwen(const llama_model & model, const llm_graph_params & params);
};
|
||||
|
||||
Reference in New Issue
Block a user