ggml: add Q1_0 1-bit quantization support (CPU) (#21273)

* ggml: add Q1_0 and Q1_0_g128 1-bit quantization support (CPU)

* add generic fallback for x86

* remove Q1_0 (group size 32)

* rename Q1_0_g128 => Q1_0

* fix Q1_0 LlamaFileType Enum

* Fix trailing spaces; add generic fallback for other backends

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix \r\n spacing + arch-fallback

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Pasha Khosravi
2026-04-06 11:55:21 -07:00
committed by GitHub
parent 506200cf8b
commit 2e1f0a889e
21 changed files with 285 additions and 5 deletions
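
For context, Q1_0 here is a 1-bit quantization format with a group size of 128 (the interim Q1_0_g128 name was folded back into Q1_0 once the group-size-32 variant was dropped). The actual block layout is not part of this excerpt; the sketch below is only an assumption of what a 1-bit, 128-weight block could look like: one fp16 scale plus packed sign bits, so each weight dequantizes to +scale or -scale. The names QK1_0, block_q1_0, and dequantize_block_q1_0 mirror ggml's naming conventions but are not taken from this commit.

// Illustrative sketch only: a plausible 1-bit block layout, NOT the layout from this commit.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK1_0 128                       // weights per block (group size 128)

typedef uint16_t ggml_half;             // fp16 bit pattern, as ggml stores block scales

typedef struct {
    ggml_half d;                        // per-block scale (fp16)
    uint8_t   qs[QK1_0 / 8];            // 128 sign bits, packed 8 per byte
} block_q1_0;                           // 2 + 16 = 18 bytes => 1.125 bits per weight

// Dequantize one block: each weight becomes +d or -d depending on its sign bit.
// `d` is the block scale already converted from fp16 to fp32 by the caller.
static void dequantize_block_q1_0(const block_q1_0 * x, float d, float * y) {
    for (int i = 0; i < QK1_0; ++i) {
        const int bit = (x->qs[i / 8] >> (i % 8)) & 1;
        y[i] = bit ? d : -d;
    }
}

int main(void) {
    block_q1_0 blk;
    memset(blk.qs, 0xAA, sizeof(blk.qs));   // alternating -,+,-,+,... sign pattern
    float out[QK1_0];
    dequantize_block_q1_0(&blk, 0.5f, out); // arbitrary scale for the demo
    printf("w[0]=%.2f w[1]=%.2f block size=%zu bytes\n", out[0], out[1], sizeof(blk));
    return 0;
}

Unpacking single bits like this is the hot loop on the CPU side, which is presumably why the per-backend SIMD paths and the generic fallbacks mentioned in the commit messages above are needed.
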
@@ -36,6 +36,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     case LLAMA_FTYPE_ALL_F32: return "all F32";
     case LLAMA_FTYPE_MOSTLY_F16: return "F16";
     case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+    case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0";
     case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
     case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
     case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
@@ -758,6 +759,7 @@ llama_model_loader::llama_model_loader(
     case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
     case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
     case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
+    case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break;
     default:
         {
             LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));