model : add glm-asr support (#17901)
* [model] add glm-asr support * fix format for ci * fix convert format for ci * update glm_asr convert script & use build_ffn for glm_asr clip & use build_stack for padding and review * check root architecture for convert hf script * fix conficlt with upstream * fix convert script for glm asr & format clip-impl * format * restore hparams text * improved conversion --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
+59
-1
@@ -720,6 +720,32 @@ ggml_tensor * clip_graph::build_rope_2d(
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Generic function to stack frames for audio processing
|
||||
// Abstracts out the StackAudioFrames logic used by ultravox
|
||||
ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
|
||||
if (stack_factor <= 1) {
|
||||
return cur;
|
||||
}
|
||||
|
||||
int64_t total_elements = ggml_nelements(cur);
|
||||
int64_t stride = n_embed * stack_factor;
|
||||
|
||||
// Calculate padded length
|
||||
int64_t padded_len = GGML_PAD(total_elements, stride);
|
||||
int64_t pad = padded_len - total_elements;
|
||||
|
||||
if (pad > 0) {
|
||||
// Pad the tensor to make it divisible by stride
|
||||
cur = ggml_view_1d(ctx0, cur, total_elements, 0);
|
||||
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
|
||||
}
|
||||
|
||||
// Reshape to [stride, padded_len / stride]
|
||||
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
|
||||
ggml_row_size(cur->type, stride), 0);
|
||||
return cur;
|
||||
}
|
||||
|
||||
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||
// support dynamic resolution
|
||||
ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
|
||||
@@ -796,6 +822,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
|
||||
} break;
|
||||
@@ -1136,10 +1163,12 @@ struct clip_model_loader {
|
||||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
|
||||
model.proj_type == PROJECTOR_TYPE_GLMA;
|
||||
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
|
||||
if (hparams.n_mel_bins != 128) {
|
||||
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
|
||||
@@ -1510,6 +1539,21 @@ struct clip_model_loader {
|
||||
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
||||
model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
||||
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
||||
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
||||
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
|
||||
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
|
||||
@@ -2895,6 +2939,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
n_patches /= 2;
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
n_patches = img->nx;
|
||||
// whisper downscales input token by half after conv1d
|
||||
n_patches /= 2;
|
||||
// reshape by merge_factor
|
||||
n_patches /= ctx->model.hparams.proj_stack_factor;
|
||||
// for BOI and EOI token embeddings
|
||||
n_patches += 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
{
|
||||
n_patches += 2; // for BOI and EOI token embeddings
|
||||
@@ -3230,6 +3284,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
@@ -3340,6 +3395,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
return ctx->model.mm_fc_w->ne[1];
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
@@ -3386,6 +3443,7 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user