mtmd: add llama-mtmd-debug binary (#20508)

* mtmd: add llama-mtmd-debug binary

* adapt

* fixes

* fix compile error

* fix windows compile error

* rm legacy clip_debug_encode()

* add MTMD_API to fix build
This commit is contained in:
Xuan-Son Nguyen
2026-03-14 15:52:29 +01:00
committed by GitHub
parent a93c0ef0fa
commit 94d0262277
7 changed files with 392 additions and 15 deletions
+8 -13
View File
@@ -159,6 +159,8 @@ struct clip_ctx {
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
bool is_allocated = false;
bool debug_output_embeddings = false;
clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
@@ -205,6 +207,8 @@ struct clip_ctx {
if (ctx_params.cb_eval != nullptr) {
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
}
debug_output_embeddings = std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr;
}
~clip_ctx() {
@@ -2193,8 +2197,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
}
if (loader.has_audio && !skip_audio) {
@@ -3981,7 +3983,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
// Debug: dump final embeddings if enabled (MTMD_DEBUG_EMBEDDINGS env var read at context creation, or clip_set_debug_output_embeddings())
if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
if (ctx->debug_output_embeddings) {
const int64_t n_embd = embeddings->ne[0];
const int64_t n_tokens = embeddings->ne[1];
std::vector<float> emb_data(n_embd * n_tokens);
@@ -4160,14 +4162,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
//
// API for debugging
//
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img;
img.nx = w;
img.ny = h;
img.buf.resize(h * w * 3);
for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value);
}
clip_image_encode(ctx, 1, &img, nullptr);
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
// Turn the debug dump of final embeddings on or off for this context.
// The flag is first initialized in the clip_ctx constructor from the
// MTMD_DEBUG_EMBEDDINGS environment variable (enabled when the variable is set);
// this setter overrides that value. The flag is checked in clip_image_batch_encode().
// NOTE(review): ctx is dereferenced without a null check - callers are presumably
// expected to pass a valid context; confirm against call sites.
void clip_set_debug_output_embeddings(clip_ctx * ctx, bool enable) {
ctx->debug_output_embeddings = enable;
}