model : support step3-vl-10b (#21287)
* feat: support step3-vl-10b * use fused QKV && mapping tensor in tensor_mapping.py * guard hardcoded params and drop crop metadata * get understand_projector_stride from global config * img_u8_resize_bilinear_to_f32 move in step3vl class * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * fix the \r\n mess * add width and heads to MmprojModel.set_gguf_parameters --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the vision graph for step3-vl:
//   patch input -> ViT (with 2D RoPE applied via add_pos) -> two strided
//   ggml_conv_2d downsamplers with optional bias -> flatten -> linear projector.
// The result is expanded into `gf`, which is returned.
ggml_cgraph * clip_graph_step3vl::build() {
    // This graph expects no class token, and requires patch + position embeddings.
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    // Creates a named I32 graph input with one entry per patch.
    const auto make_pos_input = [&](const char * name) -> ggml_tensor * {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    ggml_tensor * pos_h = make_pos_input("pos_h");
    ggml_tensor * pos_w = make_pos_input("pos_w");

    ggml_tensor * inp = build_inp();
    ggml_tensor * learned_pos_embd = resize_position_embeddings();

    // Positional hook handed to build_vit: 2D RoPE driven by pos_w / pos_h.
    // The layer argument is unused.
    auto add_pos = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, pos_w, pos_h, hparams.rope_theta, false);
    };

    // Adds `bias` to a [w, h, c] tensor. The tensor is flattened to
    // [w*h, c] and transposed so that the channel axis becomes ne[0] for the
    // add, then transposed back and restored to [w, h, c].
    // A null bias leaves the tensor untouched.
    const auto add_spatial_bias = [&](ggml_tensor * x, ggml_tensor * bias) -> ggml_tensor * {
        if (bias != nullptr) {
            const int64_t w = x->ne[0];
            const int64_t h = x->ne[1];
            const int64_t c = x->ne[2];

            ggml_tensor * flat = ggml_reshape_2d(ctx0, x, w * h, c);
            ggml_tensor * chan_first = ggml_cont(ctx0, ggml_transpose(ctx0, flat));
            ggml_tensor * biased = ggml_add(ctx0, chan_first, bias);
            ggml_tensor * spatial_first = ggml_cont(ctx0, ggml_transpose(ctx0, biased));

            x = ggml_reshape_3d(ctx0, spatial_first, w, h, c);
        }
        return x;
    };

    // One downsampling stage: ggml_conv_2d with stride 2, padding 1,
    // dilation 1, followed by the optional bias.
    const auto downsample = [&](ggml_tensor * x, ggml_tensor * kernel, ggml_tensor * bias) -> ggml_tensor * {
        ggml_tensor * conv = ggml_conv_2d(ctx0, kernel, x, 2, 2, 1, 1, 1, 1);
        return add_spatial_bias(conv, bias);
    };

    ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, learned_pos_embd, add_pos);
    cb(cur, "vit_out", -1);

    // [n_embd, n_patches] -> [w, h, n_embd] for spatial downsampling convolutions.
    cur = ggml_cont_3d(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3), n_patches_x, n_patches_y, n_embd);

    // First downsampler: Conv2d(1536 -> 3072, k=3, s=2, p=1)
    cur = downsample(cur, model.mm_0_w, model.mm_0_b);
    cb(cur, "downsample_0", -1);

    // Second downsampler: Conv2d(3072 -> 6144, k=3, s=2, p=1)
    cur = downsample(cur, model.mm_1_w, model.mm_1_b);
    cb(cur, "downsample_1", -1);

    // [w, h, c] -> [c, w*h]
    {
        const int64_t n_pos = cur->ne[0] * cur->ne[1];
        ggml_tensor * merged = ggml_reshape_3d(ctx0, cur, n_pos, cur->ne[2], cur->ne[3]);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, merged, 1, 0, 2, 3));
    }
    cb(cur, "downsample_flatten", -1);

    // Final projector: Linear(6144 -> projection_dim)
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
    cb(cur, "projector_out", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
Reference in New Issue
Block a user