llama.cpp/tools/server/webui/src/lib/utils/latex-protection.ts

import {
	CODE_BLOCK_REGEXP,
	LATEX_MATH_AND_CODE_PATTERN,
	LATEX_LINEBREAK_REGEXP,
	MHCHEM_PATTERN_MAP
} from '$lib/constants/latex-protection';

/**
 * Replaces inline LaTeX expressions enclosed in `$...$` with placeholders, avoiding dollar signs
 * that appear to be part of monetary values or identifiers.
 *
 * This function processes the input line by line and skips `$` sequences that are likely
 * part of money amounts (e.g., `$5`, `$100.99`) or code-like tokens (e.g., `var$`, `$var`).
 * Valid LaTeX inline math is replaced with a placeholder like `<<LATEX_0>>`, and the
 * actual LaTeX content is stored in the provided `latexExpressions` array.
 *
 * @param content - The input text potentially containing LaTeX expressions.
 * @param latexExpressions - An array used to collect extracted LaTeX expressions.
 * @returns The processed string with LaTeX replaced by placeholders.
 */
export function maskInlineLaTeX(content: string, latexExpressions: string[]): string {
	if (!content.includes('$')) {
		return content;
	}
	return content
		.split('\n')
		.map((line) => {
			if (line.indexOf('$') == -1) {
				return line;
			}

			let processedLine = '';
			let currentPosition = 0;

			while (currentPosition < line.length) {
				const openDollarIndex = line.indexOf('$', currentPosition);

				if (openDollarIndex == -1) {
					processedLine += line.slice(currentPosition);
					break;
				}

				// Is there a next $-sign?
				const closeDollarIndex = line.indexOf('$', openDollarIndex + 1);

				if (closeDollarIndex == -1) {
					processedLine += line.slice(currentPosition);
					break;
				}

				const charBeforeOpen = openDollarIndex > 0 ? line[openDollarIndex - 1] : '';
				const charAfterOpen = line[openDollarIndex + 1];
				const charBeforeClose =
					openDollarIndex + 1 < closeDollarIndex ? line[closeDollarIndex - 1] : '';
				const charAfterClose = closeDollarIndex + 1 < line.length ? line[closeDollarIndex + 1] : '';

				let shouldSkipAsNonLatex = false;

				if (closeDollarIndex == currentPosition + 1) {
					// No content
					shouldSkipAsNonLatex = true;
				}

				if (/[A-Za-z0-9_$-]/.test(charBeforeOpen)) {
					// Character, digit, $, _ or - before first '$', no TeX.
					shouldSkipAsNonLatex = true;
				}

				if (
					/[0-9]/.test(charAfterOpen) &&
					(/[A-Za-z0-9_$-]/.test(charAfterClose) || ' ' == charBeforeClose)
				) {
					// First $ seems to belong to an amount.
					shouldSkipAsNonLatex = true;
				}

				if (shouldSkipAsNonLatex) {
					processedLine += line.slice(currentPosition, openDollarIndex + 1);
					currentPosition = openDollarIndex + 1;

					continue;
				}

				// Treat as LaTeX
				processedLine += line.slice(currentPosition, openDollarIndex);
				const latexContent = line.slice(openDollarIndex, closeDollarIndex + 1);
				latexExpressions.push(latexContent);
				processedLine += `<<LATEX_${latexExpressions.length - 1}>>`;
				currentPosition = closeDollarIndex + 1;
			}

			return processedLine;
		})
		.join('\n');
}

function escapeBrackets(text: string): string {
	return text.replace(
		LATEX_MATH_AND_CODE_PATTERN,
		(
			match: string,
			codeBlock: string | undefined,
			squareBracket: string | undefined,
			roundBracket: string | undefined
		): string => {
			if (codeBlock != null) {
				return codeBlock;
			} else if (squareBracket != null) {
				return `$$${squareBracket}$$`;
			} else if (roundBracket != null) {
				return `$${roundBracket}$`;
			}

			return match;
		}
	);
}

// Escape $\\ce{...} → $\\ce{...} but with proper handling
function escapeMhchem(text: string): string {
	return MHCHEM_PATTERN_MAP.reduce((result, [pattern, replacement]) => {
		return result.replace(pattern, replacement);
	}, text);
}

const doEscapeMhchem = false;

/**
 * Preprocesses markdown content to safely handle LaTeX math expressions while protecting
 * against false positives (e.g., dollar amounts like $5.99) and ensuring proper rendering.
 *
 * This function:
 * - Protects code blocks (```) and inline code (`...`)
 * - Safeguards block and inline LaTeX: \(...\), \[...\], $$...$$, and selective $...$
 * - Escapes standalone dollar signs before numbers (e.g., $5 → \$5) to prevent misinterpretation
 * - Restores protected LaTeX and code blocks after processing
 * - Converts \(...\) → $...$ and \[...\] → $$...$$ for compatibility with math renderers
 * - Applies additional escaping for brackets and mhchem syntax if needed
 *
 * @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
 * @returns The preprocessed string with properly escaped and normalized LaTeX.
 *
 * @example
 * preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
 * // → "Price: $10. The equation is $x^2$."
 */
export function preprocessLaTeX(content: string): string {
	// See also:
	// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts

	// Step 0: Temporarily remove blockquote markers (>) to process LaTeX correctly
	// Store the structure so we can restore it later
	const blockquoteMarkers: Map<number, string> = new Map();
	const lines = content.split('\n');
	const processedLines = lines.map((line, index) => {
		const match = line.match(/^(>\s*)/);
		if (match) {
			blockquoteMarkers.set(index, match[1]);
			return line.slice(match[1].length);
		}
		return line;
	});
	content = processedLines.join('\n');

	// Step 1: Protect code blocks
	const codeBlocks: string[] = [];

	content = content.replace(CODE_BLOCK_REGEXP, (match) => {
		codeBlocks.push(match);

		return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
	});

	// Step 2: Protect existing LaTeX expressions
	const latexExpressions: string[] = [];

	// Match \S...\[...\] and protect them and insert a line-break.
	content = content.replace(/([\S].*?)\\\[([\s\S]*?)\\\](.*)/g, (match, group1, group2, group3) => {
		// Check if there are characters following the formula (display-formula in a table-cell?)
		if (group1.endsWith('\\')) {
			return match; // Backslash before \[, do nothing.
		}
		const hasSuffix = /\S/.test(group3);
		let optBreak;

		if (hasSuffix) {
			latexExpressions.push(`\\(${group2.trim()}\\)`); // Convert into inline.
			optBreak = '';
		} else {
			latexExpressions.push(`\\[${group2}\\]`);
			optBreak = '\n';
		}

		return `${group1}${optBreak}<<LATEX_${latexExpressions.length - 1}>>${optBreak}${group3}`;
	});

	// Match \(...\), \[...\], $$...$$ and protect them
	content = content.replace(
		/(\$\$[\s\S]*?\$\$|(?<!\\)\\\[[\s\S]*?\\\]|(?<!\\)\\\(.*?\\\))/g,
		(match) => {
			latexExpressions.push(match);

			return `<<LATEX_${latexExpressions.length - 1}>>`;
		}
	);

	// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
	content = maskInlineLaTeX(content, latexExpressions);

	// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
	// (Now that inline math is protected, this will only escape dollars not already protected)
	content = content.replace(/\$(?=\d)/g, '\\$');

	// Step 4: Restore protected LaTeX expressions (they are valid)
	content = content.replace(/<<LATEX_(\d+)>>/g, (_, index) => {
		let expr = latexExpressions[parseInt(index)];
		const match = expr.match(LATEX_LINEBREAK_REGEXP);
		if (match) {
			// Katex: The $$-delimiters should be in their own line
			// if there are \\-line-breaks.
			const formula = match[1];
			const prefix = formula.startsWith('\n') ? '' : '\n';
			const suffix = formula.endsWith('\n') ? '' : '\n';
			expr = '$$' + prefix + formula + suffix + '$$';
		}
		return expr;
	});

	// Step 5: Restore code blocks
	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
		return codeBlocks[parseInt(index)];
	});

	// Step 6: Apply additional escaping functions (brackets and mhchem)
	content = escapeBrackets(content);

	if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
		content = escapeMhchem(content);
	}

	// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
	content = content
		// Using the look‑behind pattern `(?<!\\)` we skip matches
		// that are preceded by a backslash, e.g.
		// `Definitions\\(also called macros)` (title of chapter 20 in The TeXbook).
		.replace(/(?<!\\)\\\((.+?)\\\)/g, '$$$1$') // inline
		.replace(
			// Using the look‑behind pattern `(?<!\\)` we skip matches
			// that are preceded by a backslash, e.g. `\\[4pt]`.
			/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
			(_, prefix: string, content: string) => {
				return `${prefix}$$${content}$$`;
			}
		);

	// Step 7: Restore blockquote markers
	if (blockquoteMarkers.size > 0) {
		const finalLines = content.split('\n');
		const restoredLines = finalLines.map((line, index) => {
			const marker = blockquoteMarkers.get(index);
			return marker ? marker + line : line;
		});
		content = restoredLines.join('\n');
	}

	return content;
}