Files
llama.cpp/tools/server/webui/src/lib/utils/latex-protection.ts
T
Sascha Rogmann bcfa87622a feat(webui): improve LaTeX rendering with currency detection (#16508)
* webui : Revised LaTeX formula recognition

* webui : Further examples containg amounts

* webui : vitest for maskInlineLaTeX

* webui: Moved preprocessLaTeX to lib/utils

* webui: LaTeX in table-cells

* chore: update webui build output (use theirs)

* webui: backslash in LaTeX-preprocessing

* chore: update webui build output

* webui: look-behind backslash-check

* chore: update webui build output

* Apply suggestions from code review

Code maintenance (variable names, code formatting, string handling)

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>

* webui: Moved constants to lib/constants.

* webui: package woff2 inside base64 data

* webui: LaTeX-line-break in display formula

* chore: update webui build output

* webui: Bugfix (font embedding)

* webui: Bugfix (font embedding)

* webui: vite embeds assets

* webui: don't suppress 404 (fonts)

* refactor: KaTeX integration with SCSS

Moves KaTeX styling to SCSS for better customization and font embedding.

This change includes:
- Adding `sass` as a dev dependency.
- Introducing a custom SCSS file to override KaTeX variables and disable TTF/WOFF fonts, relying solely on WOFF2 for embedding.
- Adjusting the Vite configuration to resolve `katex-fonts` alias and inject SCSS variables.

* fix: LaTeX processing within blockquotes

* webui: update webui build output

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2025-11-03 00:41:08 +01:00

268 lines
8.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import {
CODE_BLOCK_REGEXP,
LATEX_MATH_AND_CODE_PATTERN,
LATEX_LINEBREAK_REGEXP,
MHCHEM_PATTERN_MAP
} from '$lib/constants/latex-protection';
/**
* Replaces inline LaTeX expressions enclosed in `$...$` with placeholders, avoiding dollar signs
* that appear to be part of monetary values or identifiers.
*
* This function processes the input line by line and skips `$` sequences that are likely
* part of money amounts (e.g., `$5`, `$100.99`) or code-like tokens (e.g., `var$`, `$var`).
* Valid LaTeX inline math is replaced with a placeholder like `<<LATEX_0>>`, and the
* actual LaTeX content is stored in the provided `latexExpressions` array.
*
* @param content - The input text potentially containing LaTeX expressions.
* @param latexExpressions - An array used to collect extracted LaTeX expressions.
* @returns The processed string with LaTeX replaced by placeholders.
*/
export function maskInlineLaTeX(content: string, latexExpressions: string[]): string {
if (!content.includes('$')) {
return content;
}
return content
.split('\n')
.map((line) => {
if (line.indexOf('$') == -1) {
return line;
}
let processedLine = '';
let currentPosition = 0;
while (currentPosition < line.length) {
const openDollarIndex = line.indexOf('$', currentPosition);
if (openDollarIndex == -1) {
processedLine += line.slice(currentPosition);
break;
}
// Is there a next $-sign?
const closeDollarIndex = line.indexOf('$', openDollarIndex + 1);
if (closeDollarIndex == -1) {
processedLine += line.slice(currentPosition);
break;
}
const charBeforeOpen = openDollarIndex > 0 ? line[openDollarIndex - 1] : '';
const charAfterOpen = line[openDollarIndex + 1];
const charBeforeClose =
openDollarIndex + 1 < closeDollarIndex ? line[closeDollarIndex - 1] : '';
const charAfterClose = closeDollarIndex + 1 < line.length ? line[closeDollarIndex + 1] : '';
let shouldSkipAsNonLatex = false;
if (closeDollarIndex == currentPosition + 1) {
// No content
shouldSkipAsNonLatex = true;
}
if (/[A-Za-z0-9_$-]/.test(charBeforeOpen)) {
// Character, digit, $, _ or - before first '$', no TeX.
shouldSkipAsNonLatex = true;
}
if (
/[0-9]/.test(charAfterOpen) &&
(/[A-Za-z0-9_$-]/.test(charAfterClose) || ' ' == charBeforeClose)
) {
// First $ seems to belong to an amount.
shouldSkipAsNonLatex = true;
}
if (shouldSkipAsNonLatex) {
processedLine += line.slice(currentPosition, openDollarIndex + 1);
currentPosition = openDollarIndex + 1;
continue;
}
// Treat as LaTeX
processedLine += line.slice(currentPosition, openDollarIndex);
const latexContent = line.slice(openDollarIndex, closeDollarIndex + 1);
latexExpressions.push(latexContent);
processedLine += `<<LATEX_${latexExpressions.length - 1}>>`;
currentPosition = closeDollarIndex + 1;
}
return processedLine;
})
.join('\n');
}
function escapeBrackets(text: string): string {
return text.replace(
LATEX_MATH_AND_CODE_PATTERN,
(
match: string,
codeBlock: string | undefined,
squareBracket: string | undefined,
roundBracket: string | undefined
): string => {
if (codeBlock != null) {
return codeBlock;
} else if (squareBracket != null) {
return `$$${squareBracket}$$`;
} else if (roundBracket != null) {
return `$${roundBracket}$`;
}
return match;
}
);
}
// Escape $\\ce{...} → $\\ce{...} but with proper handling
function escapeMhchem(text: string): string {
return MHCHEM_PATTERN_MAP.reduce((result, [pattern, replacement]) => {
return result.replace(pattern, replacement);
}, text);
}
const doEscapeMhchem = false;
/**
* Preprocesses markdown content to safely handle LaTeX math expressions while protecting
* against false positives (e.g., dollar amounts like $5.99) and ensuring proper rendering.
*
* This function:
* - Protects code blocks (```) and inline code (`...`)
* - Safeguards block and inline LaTeX: \(...\), \[...\], $$...$$, and selective $...$
* - Escapes standalone dollar signs before numbers (e.g., $5 → \$5) to prevent misinterpretation
* - Restores protected LaTeX and code blocks after processing
* - Converts \(...\) → $...$ and \[...\] → $$...$$ for compatibility with math renderers
* - Applies additional escaping for brackets and mhchem syntax if needed
*
* @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
* @returns The preprocessed string with properly escaped and normalized LaTeX.
*
* @example
* preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
* // → "Price: $10. The equation is $x^2$."
*/
export function preprocessLaTeX(content: string): string {
// See also:
// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
// Step 0: Temporarily remove blockquote markers (>) to process LaTeX correctly
// Store the structure so we can restore it later
const blockquoteMarkers: Map<number, string> = new Map();
const lines = content.split('\n');
const processedLines = lines.map((line, index) => {
const match = line.match(/^(>\s*)/);
if (match) {
blockquoteMarkers.set(index, match[1]);
return line.slice(match[1].length);
}
return line;
});
content = processedLines.join('\n');
// Step 1: Protect code blocks
const codeBlocks: string[] = [];
content = content.replace(CODE_BLOCK_REGEXP, (match) => {
codeBlocks.push(match);
return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
});
// Step 2: Protect existing LaTeX expressions
const latexExpressions: string[] = [];
// Match \S...\[...\] and protect them and insert a line-break.
content = content.replace(/([\S].*?)\\\[([\s\S]*?)\\\](.*)/g, (match, group1, group2, group3) => {
// Check if there are characters following the formula (display-formula in a table-cell?)
if (group1.endsWith('\\')) {
return match; // Backslash before \[, do nothing.
}
const hasSuffix = /\S/.test(group3);
let optBreak;
if (hasSuffix) {
latexExpressions.push(`\\(${group2.trim()}\\)`); // Convert into inline.
optBreak = '';
} else {
latexExpressions.push(`\\[${group2}\\]`);
optBreak = '\n';
}
return `${group1}${optBreak}<<LATEX_${latexExpressions.length - 1}>>${optBreak}${group3}`;
});
// Match \(...\), \[...\], $$...$$ and protect them
content = content.replace(
/(\$\$[\s\S]*?\$\$|(?<!\\)\\\[[\s\S]*?\\\]|(?<!\\)\\\(.*?\\\))/g,
(match) => {
latexExpressions.push(match);
return `<<LATEX_${latexExpressions.length - 1}>>`;
}
);
// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
content = maskInlineLaTeX(content, latexExpressions);
// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
// (Now that inline math is protected, this will only escape dollars not already protected)
content = content.replace(/\$(?=\d)/g, '\\$');
// Step 4: Restore protected LaTeX expressions (they are valid)
content = content.replace(/<<LATEX_(\d+)>>/g, (_, index) => {
let expr = latexExpressions[parseInt(index)];
const match = expr.match(LATEX_LINEBREAK_REGEXP);
if (match) {
// Katex: The $$-delimiters should be in their own line
// if there are \\-line-breaks.
const formula = match[1];
const prefix = formula.startsWith('\n') ? '' : '\n';
const suffix = formula.endsWith('\n') ? '' : '\n';
expr = '$$' + prefix + formula + suffix + '$$';
}
return expr;
});
// Step 5: Restore code blocks
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
return codeBlocks[parseInt(index)];
});
// Step 6: Apply additional escaping functions (brackets and mhchem)
content = escapeBrackets(content);
if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
content = escapeMhchem(content);
}
// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
content = content
// Using the lookbehind pattern `(?<!\\)` we skip matches
// that are preceded by a backslash, e.g.
// `Definitions\\(also called macros)` (title of chapter 20 in The TeXbook).
.replace(/(?<!\\)\\\((.+?)\\\)/g, '$$$1$') // inline
.replace(
// Using the lookbehind pattern `(?<!\\)` we skip matches
// that are preceded by a backslash, e.g. `\\[4pt]`.
/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
(_, prefix: string, content: string) => {
return `${prefix}$$${content}$$`;
}
);
// Step 7: Restore blockquote markers
if (blockquoteMarkers.size > 0) {
const finalLines = content.split('\n');
const restoredLines = finalLines.map((line, index) => {
const marker = blockquoteMarkers.get(index);
return marker ? marker + line : line;
});
content = restoredLines.join('\n');
}
return content;
}