Files
llama.cpp/tools/mtmd/models/models.h
T
Stephen Cox 547765a93e mtmd: add Gemma 4 audio conformer encoder support (#21421)
* mtmd: add Gemma 4 audio conformer encoder support

Add audio processing for Gemma 4 E2B/E4B via a USM-style Conformer.

Architecture:
- 12-layer Conformer: FFN → Self-Attention → Causal Conv1D → FFN → Norm
- Subsampling Conv Projection: 2x Conv2D(stride=2) with LayerNorm
- Full self-attention with sinusoidal RPE and sliding window mask (24)
- Logit softcapping at 50.0, ClippableLinear clamping
- Output: 1024 → 1536 → RMSNorm → multimodal embedder

Mel preprocessing (dedicated mtmd_audio_preprocessor_gemma4a):
- HTK mel scale, 128 bins, magnitude STFT, mel_floor=1e-3
- Standard periodic Hann window (320 samples), zero-padded to FFT size
- Semicausal left-padding (frame_length/2 samples)
- Frame count matched to PyTorch (unfold formula)
- No pre-emphasis, no Whisper-style normalization
- Mel cosine similarity vs PyTorch: 0.9998

Key fixes:
- Tensor loading dedup: prevent get_tensor() from creating duplicate
  entries in ctx_data. Fixed with std::set guard.
- ClippableLinear clamp_info loading moved after per-layer tensors.
- Sliding window mask (24 positions) matching PyTorch context_size.
- Skip Whisper normalization for Gemma4 mel output.

Tested on E2B and E4B with CPU and Vulkan backends.
Transcribes: "Glad to see things are going well and business is starting
to pick up" (matching ground truth).

Ref: #21325
2026-04-12 14:15:26 +02:00

161 lines
5.3 KiB
C

#pragma once
#include "../clip-graph.h"
/*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/
struct clip_graph_siglip : clip_graph {
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_gemma4v : clip_graph {
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
};
struct clip_graph_pixtral : clip_graph {
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_qwen2vl : clip_graph {
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_step3vl : clip_graph {
clip_graph_step3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_youtuvl : clip_graph {
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_internvl : clip_graph {
clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_nemotron_v2_vl : clip_graph {
clip_graph_nemotron_v2_vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llama4 : clip_graph {
clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_kimivl : clip_graph {
clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_paddleocr : clip_graph {
clip_graph_paddleocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_dotsocr : clip_graph {
clip_graph_dotsocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_cogvlm : clip_graph {
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llava : clip_graph {
clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_whisper_enc : clip_graph {
clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_deepseekocr : clip_graph {
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_conformer : clip_graph {
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_gemma4a : clip_graph {
clip_graph_gemma4a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
};
struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_hunyuanocr : clip_graph {
clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * rms_norm_2d(
ggml_tensor * inp,
ggml_tensor * weight,
float eps = 1e-6f);
ggml_tensor* pad_same_2d(
ggml_tensor* inp,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int dilation_h = 1,
int dilation_w = 1);
ggml_tensor * build_edge_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_inverted_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_mobilenet_attn(
ggml_tensor * inp,
const mobilenetv5_block & block);
};
struct clip_graph_kimik25 : clip_graph {
clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
};