mtmd: refactor audio preprocessing (#17978)

* mtmd: refactor audio preprocessing

* refactor

Co-authored-by: Tarek <tdakhran@users.noreply.github.com>

* wip

* wip (2)

* improve constructor

* fix use_natural_log

* fix padding for short input

* clean up

* remove need_chunking

---------

Co-authored-by: Tarek <tdakhran@users.noreply.github.com>
This commit is contained in:
Xuan-Son Nguyen
2025-12-15 14:16:52 +01:00
committed by GitHub
parent 4a4f7e6550
commit 96a181a933
5 changed files with 379 additions and 596 deletions
+14 -29
View File
@@ -1,6 +1,7 @@
#pragma once
#include "ggml.h"
#include "clip-model.h"
#include <cstdint>
#include <vector>
@@ -8,18 +9,7 @@
#define MTMD_INTERNAL_HEADER
#define WHISPER_ASSERT GGML_ASSERT
#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT 400
#define WHISPER_HOP_LENGTH 160
#define WHISPER_CHUNK_SIZE 30
#define COMMON_SAMPLE_RATE 16000
namespace whisper_preprocessor {
struct whisper_mel {
struct mtmd_audio_mel {
int n_len;
int n_len_org;
int n_mel;
@@ -27,23 +17,18 @@ struct whisper_mel {
std::vector<float> data;
};
struct whisper_filters {
int32_t n_mel;
int32_t n_fft;
struct mtmd_audio_preprocessor {
const clip_hparams & hparams;
std::vector<float> data;
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
virtual ~mtmd_audio_preprocessor() = default;
virtual void initialize() = 0; // NOT thread-safe
virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
};
bool preprocess_audio(
const float * samples,
size_t n_samples,
const whisper_filters & filters,
std::vector<whisper_mel> & output);
} // namespace whisper_preprocessor
namespace whisper_precalc_filters {
whisper_preprocessor::whisper_filters get_128_bins();
} // namespace whisper_precalc_filters
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
};