model: Add Kimi-K2.5 support (#19170)
* Move dequant_model to after the text_config merge Add new kimi-k2.5 keys to mtmd convert Update V_MMPROJ tensor mapping for new mm_projector.proj keys Update V_M_IMP_NORM for new mm_projector.pre_norm key * Fix a couple of oversights * Add image support for Kimi-K2.5 * Revert changes to KimiVLForConditionalGeneration * Fix an assert crash * Fix permute swapping w / h on accident * Kimi-K2.5: Use merged QKV for vision * Kimi-K2.5: pre-convert vision QK to use build_rope_2d * Kimi-K2.5: support non-interleaved rope for vision * Kimi-K2.5: fix min / max pixel * Kimi-K2.5: remove v/o permutes, unnecessary * Kimi-K2.5: update permute name to match * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Kimi-K2.5: replace build_rope_2d ggml_cont with ggml_view_3d pointers --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
+86
-4
@@ -673,8 +673,8 @@ ggml_tensor * clip_graph::build_rope_2d(
|
||||
{
|
||||
first = ggml_view_3d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos,
|
||||
ggml_row_size(cur->type, n_dim),
|
||||
ggml_row_size(cur->type, n_dim*n_head),
|
||||
cur->nb[1],
|
||||
cur->nb[2],
|
||||
0);
|
||||
first = ggml_rope_ext(
|
||||
ctx0,
|
||||
@@ -692,8 +692,8 @@ ggml_tensor * clip_graph::build_rope_2d(
|
||||
{
|
||||
second = ggml_view_3d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos,
|
||||
ggml_row_size(cur->type, n_dim),
|
||||
ggml_row_size(cur->type, n_dim*n_head),
|
||||
cur->nb[1],
|
||||
cur->nb[2],
|
||||
n_dim/2 * ggml_element_size(cur));
|
||||
second = ggml_rope_ext(
|
||||
ctx0,
|
||||
@@ -826,6 +826,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_kimivl>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_kimik25>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
|
||||
@@ -1139,6 +1143,22 @@ struct clip_model_loader {
|
||||
hparams.set_limit_image_tokens(8, 1024);
|
||||
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
|
||||
int min_pixels = 0, max_pixels = 0;
|
||||
get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
|
||||
get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
|
||||
if (min_pixels > 0 && max_pixels > 0) {
|
||||
hparams.image_min_pixels = min_pixels;
|
||||
hparams.image_max_pixels = max_pixels;
|
||||
hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
|
||||
} else {
|
||||
hparams.set_limit_image_tokens(2, 4096);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
{
|
||||
// default value (used by all model sizes in gemma 3 family)
|
||||
@@ -1668,6 +1688,7 @@ struct clip_model_loader {
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
|
||||
@@ -3165,6 +3186,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
res_imgs->entries.push_back(std::move(res));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
||||
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
||||
original_size,
|
||||
params.patch_size * params.n_merge,
|
||||
params.image_min_pixels,
|
||||
params.image_max_pixels);
|
||||
const std::array<uint8_t, 3> pad_color = {0, 0, 0};
|
||||
|
||||
clip_image_u8 resized_img;
|
||||
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
|
||||
clip_image_f32_ptr res(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(res));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
@@ -3373,6 +3411,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
// dynamic size
|
||||
int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
|
||||
@@ -3714,6 +3753,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
// set the 2D positions
|
||||
@@ -3850,6 +3890,47 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
}
|
||||
|
||||
// Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
|
||||
if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
|
||||
const int64_t n_embd = embeddings->ne[0];
|
||||
const int64_t n_tokens = embeddings->ne[1];
|
||||
std::vector<float> emb_data(n_embd * n_tokens);
|
||||
ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));
|
||||
|
||||
LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
|
||||
LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens);
|
||||
|
||||
// Print first few values of first token
|
||||
LOG_INF("Token 0 (first 16 values): ");
|
||||
for (int i = 0; i < std::min((int64_t)16, n_embd); i++) {
|
||||
LOG_INF("%.6f ", emb_data[i]);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
// Print last few values of first token
|
||||
if (n_embd > 16) {
|
||||
LOG_INF("Token 0 (last 16 values): ");
|
||||
for (int64_t i = n_embd - 16; i < n_embd; i++) {
|
||||
LOG_INF("%.6f ", emb_data[i]);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
// Compute and print statistics
|
||||
float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0];
|
||||
for (size_t i = 0; i < emb_data.size(); i++) {
|
||||
sum += emb_data[i];
|
||||
sum_sq += emb_data[i] * emb_data[i];
|
||||
min_val = std::min(min_val, emb_data[i]);
|
||||
max_val = std::max(max_val, emb_data[i]);
|
||||
}
|
||||
float mean = sum / emb_data.size();
|
||||
float variance = (sum_sq / emb_data.size()) - (mean * mean);
|
||||
LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n",
|
||||
mean, sqrtf(variance), min_val, max_val, sum);
|
||||
LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3896,6 +3977,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
return ctx->model.mm_4h_to_h_w->ne[1];
|
||||
|
||||
Reference in New Issue
Block a user