mtmd: support dots.ocr (#17575)
* convert gguf * clip impl * fix conversion * wip * corrections * update docs * add gguf to test script
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_dotsocr::build() {
|
||||
const int n_pos = n_patches;
|
||||
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
|
||||
|
||||
// note: similar to PaddleOCR
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
return ggml_rope_multi(
|
||||
ctx0, cur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
|
||||
32768, 10000, 1, 0, 1, 32, 1);
|
||||
};
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
NORM_TYPE_RMS,
|
||||
hparams.ffn_op,
|
||||
nullptr,
|
||||
add_pos);
|
||||
|
||||
cb(cur, "vit_out", -1);
|
||||
|
||||
// dots.ocr patch merger + projector
|
||||
{
|
||||
GGML_ASSERT(hparams.n_merge > 0);
|
||||
cur = build_norm(cur, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, 1e-6, -1);
|
||||
cur = build_patch_merge_permute(cur, hparams.n_merge);
|
||||
cb(cur, "after_patch_merger", -1);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_0_w, model.mm_0_b,
|
||||
nullptr, nullptr, // no gate
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU_ERF, -1); // nn.GELU() defaults to exact erf-based GELU
|
||||
cb(cur, "after_projector", -1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
Reference in New Issue
Block a user