llama.cpp/tools/mtmd/tests/test-deepseek-ocr.py

#!/usr/bin/env python3
"""
Test script to compare llama.cpp mtmd-cli output with HuggingFace reference implementation
for DeepSeek-OCR model using embedding similarity.
"""

import argparse
import subprocess
import sys
from pathlib import Path

from sentence_transformers import SentenceTransformer
from sentence_transformers import util


def run_mtmd_deepseek_ocr(
        model_path: str,
        mmproj_path: str,
        image_path: str,
        bin_path: str,
        prompt: str = "Free OCR."
) -> str:
    """
    Run inference using llama.cpp mtmd-cli.
    """
    cmd = [
        bin_path,
        "-m", model_path,
        "--mmproj", mmproj_path,
        "--image", image_path,
        # "-p", "<|grounding|>Convert the document to markdown.",
        "-p", prompt,
        "--chat-template", "deepseek-ocr",
        "--temp", "0",
        "-n", "1024",
        # "--verbose"
    ]

    print(f"Running llama.cpp command: {' '.join(cmd)}")

    result = subprocess.run(
        cmd,
        capture_output=True,
        text=False,
        timeout=300
    )

    if result.returncode != 0:
        stderr = result.stderr.decode('utf-8', errors='replace')
        print(f"llama.cpp stderr: {stderr}")
        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")

    output = result.stdout.decode('utf-8', errors='replace').strip()
    print(f"llama.cpp output length: {len(output)} chars")
    return output


def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
    """
    Compute cosine similarity between two texts using embedding model.
    """
    print(f"Loading embedding model: {model_name}")

    # Use sentence-transformers for easier embedding extraction
    embed_model = SentenceTransformer(model_name)

    print("Computing embeddings...")
    embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)

    similarity = util.similarity.cos_sim([embeddings[0]], [embeddings[1]])[0][0]
    return float(similarity)


def read_expected_output(file_path: str) -> str:
    """
    Read expected OCR output from file.
    """
    cur_path = Path(__file__).parent
    expected_path = str(cur_path / file_path)
    with open(expected_path, "r", encoding="utf-8") as f:
        return f.read().strip()


def main():
    ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
                    help="Path to llama.cpp GGUF model")
    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
                    help="Path to mmproj GGUF file")
    ap.add_argument("--image", default="test-1.jpeg",
                    help="Path to test image")
    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                    help="Path to llama-mtmd-cli binary")
    ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
                    help="Embedding model for similarity computation")
    ap.add_argument("--threshold", type=float, default=0.7,
                    help="Minimum similarity threshold for pass")
    args = ap.parse_args()

    # Validate paths
    # script directory + image
    mtmd_dir = Path(__file__).parent.parent
    args.image = str(mtmd_dir / args.image)
    # project directory + llama model
    args.llama_model = str(mtmd_dir.parent.parent / args.llama_model)
    # project directory + mmproj
    args.mmproj = str(mtmd_dir.parent.parent / args.mmproj)
    args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin)
    if not Path(args.image).exists():
        print(f"Error: Image not found: {args.image}")
        sys.exit(1)
    if not Path(args.llama_model).exists():
        print(f"Error: Model not found: {args.llama_model}")
        sys.exit(1)
    if not Path(args.mmproj).exists():
        print(f"Error: mmproj not found: {args.mmproj}")
        sys.exit(1)

    print("=" * 60)
    print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison")
    print("=" * 60)

    # Default paths based on your command

    # Run llama.cpp inference
    print("\n[2/3] Running llama.cpp implementation...")
    llama_free_ocr = run_mtmd_deepseek_ocr(
        args.llama_model,
        args.mmproj,
        args.image,
        args.llama_bin
    )

    llama_md_ocr = run_mtmd_deepseek_ocr(
        args.llama_model,
        args.mmproj,
        args.image,
        args.llama_bin,
        prompt="<|grounding|>Convert the document to markdown."
    )

    expected_free_ocr = read_expected_output("test-1-extracted.txt")
    expected_md_ocr = read_expected_output("test-1-extracted.md")

    # Compute similarity
    print("\n[3/3] Computing embedding similarity...")
    free_ocr_similarity = compute_embedding_similarity(
        expected_free_ocr,
        llama_free_ocr,
        args.embedding_model
    )

    md_ocr_similarity = compute_embedding_similarity(
        expected_md_ocr,
        llama_md_ocr,
        args.embedding_model
    )

    # Results
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"\nReference Model output:\n{'-' * 40}")
    print(expected_free_ocr)
    print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
    print(llama_free_ocr)
    print(f"\n{'=' * 60}")
    print(f"Cosine Similarity: {free_ocr_similarity:.4f}")
    print(f"Threshold: {args.threshold}")
    print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}")
    print("=" * 60)

    # Markdown OCR results
    print(f"\nReference Model Markdown output:\n{'-' * 40}")
    print(expected_md_ocr)
    print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}")
    print(llama_md_ocr)
    print(f"\n{'=' * 60}")
    print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}")
    print(f"Threshold: {args.threshold}")
    print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}")
    print("=" * 60)


if __name__ == "__main__":
    main()