model : add tokenizer from LFM2.5-Audio-1.5B (#19687)

* model : Add tokenizer from LFM2.5-Audio-1.5B

[LFM2.5-Audio-1.5B](https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B) introduced lightweight audio tokenizer.

Tokenizer based on LFM2 architecture and acts as "embedding" model with
different input `n_embd` and output `n_embd_out`.

To be used in https://github.com/ggml-org/llama.cpp/pull/18641.

To convert use

```shell
python3 convert_hf_to_gguf.py /path/to/LFM2.5-Audio-1.5B/audio_detokenizer
```

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Formatting

* Rework check for attention layers

* Add LFM2 SWA model support

* Address PR feedback

* Set vocab to none

* Move helper function definitions to cpp file

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Tarek Dakhran
2026-02-19 09:54:48 +01:00
committed by GitHub
parent eacb4b67a2
commit 8004f3a8d1
7 changed files with 185 additions and 137 deletions
+14 -3
View File
@@ -2348,6 +2348,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
case 10752: type = LLM_TYPE_2_6B; break;
default: type = LLM_TYPE_UNKNOWN;
}
if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
}
}
} break;
case LLM_ARCH_LFM2MOE:
{
@@ -6896,7 +6902,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// for LFM2-ColBert-350M
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"), {hparams.n_embd_out() }, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_SMALLTHINKER:
{
@@ -8672,7 +8679,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
case LLM_ARCH_LFM2:
case LLM_ARCH_LFM2MOE:
{
llm = std::make_unique<llm_build_lfm2>(*this, params);
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
llm = std::make_unique<llm_build_lfm2<true>>(*this, params);
} else {
llm = std::make_unique<llm_build_lfm2<false>>(*this, params);
}
} break;
case LLM_ARCH_SMALLTHINKER:
{
@@ -8744,7 +8755,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// there will be two additional dense projection layers
// dense linear projections are applied after pooling
// TODO: move reranking logic here and generalize
llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
llm->res->set_outputs();