bcfa87622a
* webui : Revised LaTeX formula recognition * webui : Further examples containg amounts * webui : vitest for maskInlineLaTeX * webui: Moved preprocessLaTeX to lib/utils * webui: LaTeX in table-cells * chore: update webui build output (use theirs) * webui: backslash in LaTeX-preprocessing * chore: update webui build output * webui: look-behind backslash-check * chore: update webui build output * Apply suggestions from code review Code maintenance (variable names, code formatting, string handling) Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com> * webui: Moved constants to lib/constants. * webui: package woff2 inside base64 data * webui: LaTeX-line-break in display formula * chore: update webui build output * webui: Bugfix (font embedding) * webui: Bugfix (font embedding) * webui: vite embeds assets * webui: don't suppress 404 (fonts) * refactor: KaTeX integration with SCSS Moves KaTeX styling to SCSS for better customization and font embedding. This change includes: - Adding `sass` as a dev dependency. - Introducing a custom SCSS file to override KaTeX variables and disable TTF/WOFF fonts, relying solely on WOFF2 for embedding. - Adjusting the Vite configuration to resolve `katex-fonts` alias and inject SCSS variables. * fix: LaTeX processing within blockquotes * webui: update webui build output --------- Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
268 lines
8.6 KiB
TypeScript
268 lines
8.6 KiB
TypeScript
import {
|
||
CODE_BLOCK_REGEXP,
|
||
LATEX_MATH_AND_CODE_PATTERN,
|
||
LATEX_LINEBREAK_REGEXP,
|
||
MHCHEM_PATTERN_MAP
|
||
} from '$lib/constants/latex-protection';
|
||
|
||
/**
 * Replaces inline LaTeX expressions enclosed in `$...$` with placeholders, while leaving
 * dollar signs alone that look like monetary values or parts of identifiers.
 *
 * The input is processed line by line, so an inline formula never spans a line break.
 * Within a line, each `$` is paired with the next `$` on the same line. The pair is
 * NOT treated as LaTeX when:
 * - there is nothing between the two dollar signs (e.g. `$$`),
 * - the opening `$` directly follows a letter, digit, `_`, `$` or `-` (e.g. `var$`), or
 * - the opening `$` is followed by a digit and the closing `$` is either followed by a
 *   letter, digit, `_`, `$` or `-`, or preceded by a space (e.g. `$5 and $10`).
 *
 * Accepted expressions (including their `$` delimiters) are appended to
 * `latexExpressions` and replaced in the text by `<<LATEX_n>>`, where `n` is the index
 * of the expression in that array.
 *
 * @param content - The input text potentially containing LaTeX expressions.
 * @param latexExpressions - Array that collects the extracted expressions (mutated).
 * @returns The processed string with inline LaTeX replaced by placeholders.
 */
export function maskInlineLaTeX(content: string, latexExpressions: string[]): string {
	if (!content.includes('$')) {
		return content;
	}

	// Hoisted so the patterns are not re-created for every `$` that is inspected.
	const wordLikeChar = /[A-Za-z0-9_$-]/;
	const digitChar = /[0-9]/;

	return content
		.split('\n')
		.map((line) => {
			if (!line.includes('$')) {
				return line;
			}

			let processedLine = '';
			let currentPosition = 0;

			while (currentPosition < line.length) {
				const openDollarIndex = line.indexOf('$', currentPosition);

				if (openDollarIndex === -1) {
					processedLine += line.slice(currentPosition);
					break;
				}

				// Is there a next $-sign? Without one the rest of the line is plain text.
				const closeDollarIndex = line.indexOf('$', openDollarIndex + 1);

				if (closeDollarIndex === -1) {
					processedLine += line.slice(currentPosition);
					break;
				}

				// `charAt` yields '' when the index is outside the line.
				const charBeforeOpen = line.charAt(openDollarIndex - 1);
				const charAfterOpen = line.charAt(openDollarIndex + 1);
				const charBeforeClose =
					openDollarIndex + 1 < closeDollarIndex ? line.charAt(closeDollarIndex - 1) : '';
				const charAfterClose = line.charAt(closeDollarIndex + 1);

				// No content between the delimiters. This must be measured from the opening
				// `$`, not from the scan position, otherwise an empty pair that is preceded
				// by other text on the line would be masked as a formula.
				const hasNoContent = closeDollarIndex === openDollarIndex + 1;

				// Character, digit, $, _ or - before first '$', no TeX.
				const followsWordLikeChar = wordLikeChar.test(charBeforeOpen);

				// First $ seems to belong to an amount.
				const looksLikeAmount =
					digitChar.test(charAfterOpen) &&
					(wordLikeChar.test(charAfterClose) || charBeforeClose === ' ');

				if (hasNoContent || followsWordLikeChar || looksLikeAmount) {
					// Emit the text up to and including the opening `$` verbatim and resume
					// right after it, so the former closing `$` can open the next candidate.
					processedLine += line.slice(currentPosition, openDollarIndex + 1);
					currentPosition = openDollarIndex + 1;

					continue;
				}

				// Treat as LaTeX
				processedLine += line.slice(currentPosition, openDollarIndex);
				latexExpressions.push(line.slice(openDollarIndex, closeDollarIndex + 1));
				processedLine += `<<LATEX_${latexExpressions.length - 1}>>`;
				currentPosition = closeDollarIndex + 1;
			}

			return processedLine;
		})
		.join('\n');
}
|
||
|
||
/**
 * Rewrites every match of `LATEX_MATH_AND_CODE_PATTERN` in `text`.
 *
 * The replacement depends on which capture group of the pattern took part in the match
 * (checked in this order):
 * - first group (`codeBlock`): returned as is,
 * - second group (`squareBracket`): wrapped as `$$...$$`,
 * - third group (`roundBracket`): wrapped as `$...$`,
 * - none of them: the match is returned unchanged.
 *
 * NOTE(review): the group meanings are taken from the parameter names; the pattern itself
 * is defined in `$lib/constants/latex-protection` — confirm there.
 *
 * @param text - The text to rewrite.
 * @returns The text with bracket-delimited math converted to dollar-delimited math.
 */
function escapeBrackets(text: string): string {
	const rewriteMatch = (
		match: string,
		codeBlock: string | undefined,
		squareBracket: string | undefined,
		roundBracket: string | undefined
	): string => {
		if (codeBlock != null) {
			return codeBlock;
		}

		if (squareBracket != null) {
			return `$$${squareBracket}$$`;
		}

		if (roundBracket != null) {
			return `$${roundBracket}$`;
		}

		return match;
	};

	return text.replace(LATEX_MATH_AND_CODE_PATTERN, rewriteMatch);
}
|
||
|
||
/**
 * Applies every `[pattern, replacement]` pair of `MHCHEM_PATTERN_MAP` to `text`, in the
 * order in which the pairs are listed; each pair operates on the result of the previous one.
 *
 * NOTE(review): presumably used to adjust mhchem commands such as `\ce{...}` / `\pu{...}`;
 * the concrete patterns live in `$lib/constants/latex-protection` — confirm there.
 *
 * @param text - The text to process.
 * @returns The text after all replacements have been applied.
 */
function escapeMhchem(text: string): string {
	let escaped = text;

	for (const [pattern, replacement] of MHCHEM_PATTERN_MAP) {
		escaped = escaped.replace(pattern, replacement);
	}

	return escaped;
}
|
||
|
||
// Switch for the mhchem escaping pass in `preprocessLaTeX`: `escapeMhchem` is only
// called when this is true. It is currently turned off.
const doEscapeMhchem = false;
|
||
|
||
/**
 * Preprocesses markdown content to safely handle LaTeX math expressions while protecting
 * against false positives (e.g., dollar amounts like $5.99) and ensuring proper rendering.
 *
 * This function:
 * - Strips leading blockquote markers (`>`) and re-applies them at the end
 * - Protects code matched by `CODE_BLOCK_REGEXP` by swapping it for `<<CODE_BLOCK_n>>`
 * - Protects block and inline LaTeX: \(...\), \[...\], $$...$$, and selective $...$ by
 *   swapping it for `<<LATEX_n>>`
 * - Escapes remaining dollar signs directly before a digit (e.g., $5 → \$5)
 * - Restores protected LaTeX and code blocks after processing
 * - Converts \(...\) → $...$ and \[...\] → $$...$$ for compatibility with math renderers
 * - Applies additional escaping for brackets and (if enabled) mhchem syntax
 *
 * Text that merely looks like a placeholder (e.g. a literal `<<LATEX_7>>` in the input
 * with no matching stored expression) is left untouched.
 *
 * @param content - The raw text (e.g., markdown) that may contain LaTeX or code blocks.
 * @returns The preprocessed string with properly escaped and normalized LaTeX.
 *
 * @example
 * preprocessLaTeX("Price: $10. The equation is \\(x^2\\).")
 * // → "Price: \\$10. The equation is $x^2$."
 */
export function preprocessLaTeX(content: string): string {
	// See also:
	// https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts

	// Step 0: Temporarily remove blockquote markers (>) to process LaTeX correctly
	// Store the structure so we can restore it later
	const blockquoteMarkers: Map<number, string> = new Map();
	const lines = content.split('\n');
	const processedLines = lines.map((line, index) => {
		const match = line.match(/^(>\s*)/);

		if (match) {
			blockquoteMarkers.set(index, match[1]);

			return line.slice(match[1].length);
		}

		return line;
	});
	content = processedLines.join('\n');

	// Step 1: Protect code blocks
	const codeBlocks: string[] = [];

	content = content.replace(CODE_BLOCK_REGEXP, (match: string) => {
		codeBlocks.push(match);

		return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
	});

	// Step 2: Protect existing LaTeX expressions
	const latexExpressions: string[] = [];

	// Match \[...\] that is preceded by other text on the same line and protect it.
	content = content.replace(
		/([\S].*?)\\\[([\s\S]*?)\\\](.*)/g,
		(match: string, group1: string, group2: string, group3: string) => {
			if (group1.endsWith('\\')) {
				return match; // Backslash before \[, do nothing.
			}

			// Check if there are characters following the formula (display-formula in a table-cell?)
			const hasSuffix = /\S/.test(group3);

			// With trailing text the formula is converted into inline math and stays on
			// its line; otherwise it remains a display formula set off by line breaks.
			const optBreak = hasSuffix ? '' : '\n';

			latexExpressions.push(hasSuffix ? `\\(${group2.trim()}\\)` : `\\[${group2}\\]`);

			return `${group1}${optBreak}<<LATEX_${latexExpressions.length - 1}>>${optBreak}${group3}`;
		}
	);

	// Match \(...\), \[...\], $$...$$ and protect them
	content = content.replace(
		/(\$\$[\s\S]*?\$\$|(?<!\\)\\\[[\s\S]*?\\\]|(?<!\\)\\\(.*?\\\))/g,
		(match: string) => {
			latexExpressions.push(match);

			return `<<LATEX_${latexExpressions.length - 1}>>`;
		}
	);

	// Protect inline $...$ but NOT if it looks like money (e.g., $10, $3.99)
	content = maskInlineLaTeX(content, latexExpressions);

	// Step 3: Escape standalone $ before digits (currency like $5 → \$5)
	// (Now that inline math is protected, this will only escape dollars not already protected)
	content = content.replace(/\$(?=\d)/g, '\\$');

	// Step 4: Restore protected LaTeX expressions (they are valid)
	content = content.replace(/<<LATEX_(\d+)>>/g, (placeholder: string, index: string) => {
		const expressionIndex = Number.parseInt(index, 10);

		if (expressionIndex >= latexExpressions.length) {
			// Not one of our placeholders (literal text in the input); keep it verbatim.
			return placeholder;
		}

		let expr = latexExpressions[expressionIndex];
		const match = expr.match(LATEX_LINEBREAK_REGEXP);

		if (match) {
			// Katex: The $$-delimiters should be in their own line
			// if there are \\-line-breaks.
			const formula = match[1];
			const prefix = formula.startsWith('\n') ? '' : '\n';
			const suffix = formula.endsWith('\n') ? '' : '\n';
			expr = '$$' + prefix + formula + suffix + '$$';
		}

		return expr;
	});

	// Step 5: Restore code blocks
	content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (placeholder: string, index: string) => {
		const blockIndex = Number.parseInt(index, 10);

		// Unknown index: literal text in the input that only looks like a placeholder.
		return blockIndex < codeBlocks.length ? codeBlocks[blockIndex] : placeholder;
	});

	// Step 6: Apply additional escaping functions (brackets and mhchem)
	content = escapeBrackets(content);

	if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
		content = escapeMhchem(content);
	}

	// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
	content = content
		// Using the look‑behind pattern `(?<!\\)` we skip matches
		// that are preceded by a backslash, e.g.
		// `Definitions\\(also called macros)` (title of chapter 20 in The TeXbook).
		// In the replacement string `$$` is a literal `$` and `$1` is the formula.
		.replace(/(?<!\\)\\\((.+?)\\\)/g, '$$$1$') // inline
		.replace(
			// Using the look‑behind pattern `(?<!\\)` we skip matches
			// that are preceded by a backslash, e.g. `\\[4pt]`.
			/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
			// The pattern has exactly one capture group (the formula). The arguments
			// after it are the match offset and the full string, so they must not be
			// used as text.
			(_: string, formula: string) => {
				return `$$${formula}$$`;
			}
		);

	// Step 7: Restore blockquote markers
	// NOTE(review): markers are re-applied by the line index recorded in Step 0. Steps 2
	// and 4 can insert additional line breaks around display formulas, which shifts the
	// following lines — confirm that quoted text after such a formula keeps its marker.
	if (blockquoteMarkers.size > 0) {
		const finalLines = content.split('\n');
		const restoredLines = finalLines.map((line, index) => {
			const marker = blockquoteMarkers.get(index);

			return marker ? marker + line : line;
		});
		content = restoredLines.join('\n');
	}

	return content;
}
|