mtmd: add llama-mtmd-debug binary (#20508)
* mtmd: add llama-mtmd-debug binary
* adapt
* fixes
* fix compile error
* fix windows compile error
* rm legacy clip_debug_encode()
* add MTMD_API to fix build
This commit is contained in:
+8
-13
@@ -159,6 +159,8 @@ struct clip_ctx {
|
||||
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||
bool is_allocated = false;
|
||||
|
||||
bool debug_output_embeddings = false;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
flash_attn_type = ctx_params.flash_attn_type;
|
||||
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
@@ -205,6 +207,8 @@ struct clip_ctx {
|
||||
if (ctx_params.cb_eval != nullptr) {
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
|
||||
}
|
||||
|
||||
debug_output_embeddings = std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr;
|
||||
}
|
||||
|
||||
~clip_ctx() {
|
||||
@@ -2193,8 +2197,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
|
||||
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
||||
}
|
||||
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
@@ -3981,7 +3983,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
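// Debug: dump final embeddings when ctx->debug_output_embeddings is enabled
// (initialized from the MTMD_DEBUG_EMBEDDINGS env var in the clip_ctx constructor,
// or toggled via clip_set_debug_output_embeddings())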
|
||||
if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
|
||||
if (ctx->debug_output_embeddings) {
|
||||
const int64_t n_embd = embeddings->ne[0];
|
||||
const int64_t n_tokens = embeddings->ne[1];
|
||||
std::vector<float> emb_data(n_embd * n_tokens);
|
||||
@@ -4160,14 +4162,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
||||
//
|
||||
// API for debugging
|
||||
//
|
||||
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
|
||||
clip_image_f32 img;
|
||||
img.nx = w;
|
||||
img.ny = h;
|
||||
img.buf.resize(h * w * 3);
|
||||
for (int i = 0; i < h * w * 3; i++) {
|
||||
img.buf[i] = static_cast<float>(fill_value);
|
||||
}
|
||||
clip_image_encode(ctx, 1, &img, nullptr);
|
||||
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
|
||||
|
||||
// Enable or disable the debug dump of final embeddings for this context.
// Stores `enable` into ctx->debug_output_embeddings, overriding whatever value
// the clip_ctx constructor derived from the MTMD_DEBUG_EMBEDDINGS env var.
// `ctx` must be non-null: it is dereferenced without a check.
void clip_set_debug_output_embeddings(clip_ctx * ctx, bool enable) {
|
||||
ctx->debug_output_embeddings = enable;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user