547765a93e
* mtmd: add Gemma 4 audio conformer encoder support Add audio processing for Gemma 4 E2B/E4B via a USM-style Conformer. Architecture: - 12-layer Conformer: FFN → Self-Attention → Causal Conv1D → FFN → Norm - Subsampling Conv Projection: 2x Conv2D(stride=2) with LayerNorm - Full self-attention with sinusoidal RPE and sliding window mask (24) - Logit softcapping at 50.0, ClippableLinear clamping - Output: 1024 → 1536 → RMSNorm → multimodal embedder Mel preprocessing (dedicated mtmd_audio_preprocessor_gemma4a): - HTK mel scale, 128 bins, magnitude STFT, mel_floor=1e-3 - Standard periodic Hann window (320 samples), zero-padded to FFT size - Semicausal left-padding (frame_length/2 samples) - Frame count matched to PyTorch (unfold formula) - No pre-emphasis, no Whisper-style normalization - Mel cosine similarity vs PyTorch: 0.9998 Key fixes: - Tensor loading dedup: prevent get_tensor() from creating duplicate entries in ctx_data. Fixed with std::set guard. - ClippableLinear clamp_info loading moved after per-layer tensors. - Sliding window mask (24 positions) matching PyTorch context_size. - Skip Whisper normalization for Gemma4 mel output. Tested on E2B and E4B with CPU and Vulkan backends. Transcribes: "Glad to see things are going well and business is starting to pick up" (matching ground truth). Ref: #21325
116 lines
3.5 KiB
CMake
116 lines
3.5 KiB
CMake
# mtmd
|
|
|
|
# Threads::Threads is linked by the mtmd library and by the tools in this file.
find_package(Threads REQUIRED)

# The mtmd library target. No STATIC/SHARED keyword is given, so the library
# type follows BUILD_SHARED_LIBS. Headers are listed alongside the sources.
add_library(mtmd
    # mtmd core and helper
    mtmd.cpp
    mtmd-audio.cpp
    mtmd-image.cpp
    mtmd.h
    mtmd-helper.cpp
    mtmd-helper.h

    # clip sources and headers
    clip.cpp
    clip.h
    clip-impl.h
    clip-model.h
    clip-graph.h

    # sources under models/
    models/models.h
    models/cogvlm.cpp
    models/conformer.cpp
    models/dotsocr.cpp
    models/gemma4a.cpp
    models/gemma4v.cpp
    models/glm4v.cpp
    models/hunyuanocr.cpp
    models/internvl.cpp
    models/kimivl.cpp
    models/kimik25.cpp
    models/nemotron-v2-vl.cpp
    models/llama4.cpp
    models/llava.cpp
    models/minicpmv.cpp
    models/paddleocr.cpp
    models/pixtral.cpp
    models/qwen2vl.cpp
    models/qwen3vl.cpp
    models/step3vl.cpp
    models/siglip.cpp
    models/whisper-enc.cpp
    models/deepseekocr.cpp
    models/mobilenetv5.cpp
    models/youtuvl.cpp
)
|
|
|
|
# Library version properties for mtmd.
# MACHO_CURRENT_VERSION is pinned to 0 to keep the macOS linker from seeing an
# oversized version number.
set_target_properties(mtmd PROPERTIES
    VERSION               ${LLAMA_INSTALL_VERSION}
    SOVERSION             0
    MACHO_CURRENT_VERSION 0
)
|
|
|
|
# Usage requirements of mtmd. PUBLIC entries are also applied to targets that
# link mtmd; PRIVATE entries are used only while building mtmd itself.
target_link_libraries(mtmd
    PUBLIC  ggml llama
    PRIVATE Threads::Threads
)

# Relative paths are resolved against this directory.
target_include_directories(mtmd
    PUBLIC  .
    PRIVATE ../.. ../../vendor
)

target_compile_features(mtmd PRIVATE cxx_std_17)
|
|
|
|
# Extra settings applied only when BUILD_SHARED_LIBS is enabled.
if (BUILD_SHARED_LIBS)
    set_property(TARGET mtmd PROPERTY POSITION_INDEPENDENT_CODE ON)

    # LLAMA_BUILD is defined only while compiling mtmd itself; LLAMA_SHARED is
    # additionally passed on to every target that links mtmd.
    target_compile_definitions(mtmd
        PRIVATE LLAMA_BUILD
        PUBLIC  LLAMA_SHARED
    )
endif()
|
|
|
|
# Headers attached to the mtmd target as its PUBLIC_HEADER set.
set(MTMD_PUBLIC_HEADERS
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
)

# The list is quoted so that it is passed as a single property value.
set_target_properties(mtmd PROPERTIES
    PUBLIC_HEADER  "${MTMD_PUBLIC_HEADERS}"
    PRIVATE_HEADER debug/mtmd-debug.h
)

# Install rule for the mtmd library and its public headers.
install(TARGETS mtmd LIBRARY PUBLIC_HEADER)
|
|
|
|
# -Wno-cast-qual is added for stb_image.h and miniaudio.h; the flag is skipped
# when building with MSVC.
if (NOT MSVC)
    target_compile_options(mtmd PRIVATE -Wno-cast-qual)
endif()
|
|
|
|
# When the project provides a BUILD_INFO target, build it before mtmd.
# add_dependencies only orders the builds; it does not link anything.
if (TARGET BUILD_INFO)
    add_dependencies(mtmd BUILD_INFO)

    # mtmd-helper.cpp is compiled into the mtmd library in this file, so no
    # mtmd-helper target is created here. add_dependencies fails on a target
    # that does not exist, so the dependency is only added when some other
    # part of the project defines one.
    if (TARGET mtmd-helper)
        add_dependencies(mtmd-helper BUILD_INFO)
    endif()
endif()
|
|
|
|
# Guard: mtmd must not link against the "common" library. Configuration is
# aborted with a fatal error if "common" appears in mtmd's LINK_LIBRARIES.
if (TARGET mtmd)
    get_target_property(libs mtmd LINK_LIBRARIES)

    # When the property is unset, libs holds a -NOTFOUND value, which is false.
    if (libs AND "common" IN_LIST libs)
        message(FATAL_ERROR
            "mtmd is designed to be a public library.\n"
            "It must not link against common")
    endif()
endif()
|
|
|
|
# Legacy CLI names. Each one is built from the same deprecation-warning.cpp.
foreach(deprecated_cli
        llama-llava-cli
        llama-gemma3-cli
        llama-minicpmv-cli
        llama-qwen2vl-cli)
    add_executable(${deprecated_cli} deprecation-warning.cpp)
endforeach()
|
|
|
|
# llama-mtmd-cli executable, built from mtmd-cli.cpp.
set(TARGET llama-mtmd-cli)
add_executable(${TARGET} mtmd-cli.cpp)
set_property(TARGET ${TARGET} PROPERTY OUTPUT_NAME llama-mtmd-cli)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_link_libraries(${TARGET} PRIVATE common mtmd Threads::Threads)

# The executable is installed only when LLAMA_TOOLS_INSTALL is enabled.
if (LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()
|
|
|
|
# mtmd-debug tool: llama-mtmd-debug executable, built from debug/mtmd-debug.cpp.
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
set_property(TARGET llama-mtmd-debug PROPERTY OUTPUT_NAME llama-mtmd-debug)
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
target_link_libraries(llama-mtmd-debug PRIVATE common mtmd Threads::Threads)
|