refactor: centralize CoT parsing in backend for streaming mode (#16394)

* refactor: unify reasoning handling via backend reasoning_content, drop frontend tag parsing - Updated the chat message component to surface backend-supplied reasoning via message.thinking while showing the raw assistant content without inline tag scrubbing - Simplified chat streaming to append content chunks directly, stream reasoning into the message model, and persist any partial reasoning when generation stops - Refactored the chat service SSE handler to rely on server-provided reasoning_content, removing legacy <think> parsing logic - Refreshed Storybook data and streaming flows to populate the thinking field explicitly for static and streaming assistant messages * refactor: implement streaming-aware universal reasoning parser Remove the streaming mode limitation from --reasoning-format by refactoring try_parse_reasoning() to handle incremental parsing of <think> tags across all formats. - Rework try_parse_reasoning() to track whitespace, partial tags, and multiple reasoning segments, allowing proper separation of reasoning_content and content in streaming mode - Parse reasoning tags before tool call handling in content-only and Llama 3.x formats to ensure inline <think> blocks are captured correctly - Change default reasoning_format from 'auto' to 'deepseek' for consistent behavior - Add 'deepseek-legacy' option to preserve old inline behavior when needed - Update CLI help and documentation to reflect streaming support - Add parser tests for inline <think>...</think> segments The parser now continues processing content after </think> closes instead of stopping, enabling proper message.reasoning_content and message.content separation in both streaming and non-streaming modes. Fixes the issue where streaming responses would dump everything (including post-thinking content) into reasoning_content while leaving content empty. 
* refactor: address review feedback from allozaur - Passed the assistant message content directly to ChatMessageAssistant to drop the redundant derived state in the chat message component - Simplified chat streaming updates by removing unused partial-thinking handling and persisting partial responses straight from currentResponse - Refreshed the ChatMessage stories to cover standard and reasoning scenarios without the old THINK-tag parsing examples Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com> * refactor: restore forced reasoning prefix to pass test-chat ([chat] All tests passed) - store the exact sequence seen on input when 'thinking_forced_open' enforces a reasoning block - inject this prefix before the first accumulated segment in 'reasoning_content', then clear it to avoid duplication - repeat the capture on every new 'start_think' detection to properly handle partial/streaming flows * refactor: address review feedback from ngxson * debug: say goodbye to curl -N, hello one-click raw stream - adds a new checkbox in the WebUI to display raw LLM output without backend parsing or frontend Markdown rendering * Update tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com> * webui: add Storybook example for raw LLM output and scope reasoning format toggle per story - Added a Storybook example that showcases the chat message component in raw LLM output mode with the provided trace sample - Updated every ChatMessage story to toggle the disableReasoningFormat setting so the raw-output rendering remains scoped to its own example * npm run format * chat-parser: address review feedback from ngxson Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> --------- Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com> Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
2025-10-08 22:18:41 +02:00
parent 9d0882840e
commit 12bbc3fa50
14 changed files with 276 additions and 431 deletions
@@ -1,143 +0,0 @@
/**
 * Opening/closing marker pairs recognized as reasoning blocks.
 * Order matters: the `<think>` format is always checked before `[THINK]`.
 */
const THINKING_TAG_PAIRS = [
	{ open: '<think>', close: '</think>' },
	{ open: '[THINK]', close: '[/THINK]' }
] as const;

/**
 * Splits a message into its reasoning ("thinking") part and its visible part.
 *
 * Two marker formats are supported: `<think>...</think>` and `[THINK]...[/THINK]`.
 * Both finished blocks and blocks that are still being streamed (opening marker
 * seen, closing marker not yet received) are handled.
 *
 * @param content - The raw message text to parse
 * @returns `thinking` is the trimmed reasoning text, or `null` when no reasoning
 * block is present; `cleanContent` is the message with the reasoning block removed.
 * `cleanContent` is always a string: for an unterminated block it is the trimmed
 * text that precedes the opening marker.
 */
export function parseThinkingContent(content: string): {
	thinking: string | null;
	cleanContent: string;
} {
	// Pass 1: an opening marker whose closing marker has not arrived yet (streaming).
	// Everything after the opening marker is reasoning; everything before it is
	// regular content that must not be dropped.
	for (const { open, close } of THINKING_TAG_PAIRS) {
		const openIndex = content.indexOf(open);

		if (openIndex !== -1 && !content.includes(close)) {
			return {
				thinking: content.slice(openIndex + open.length).trim(),
				cleanContent: content.slice(0, openIndex).trim()
			};
		}
	}

	// Pass 2: the first complete block. Only that one block is cut out; the text on
	// both sides of it is concatenated and trimmed.
	for (const { open, close } of THINKING_TAG_PAIRS) {
		const openIndex = content.indexOf(open);
		if (openIndex === -1) continue;

		// The closing marker only counts when it follows the opening marker.
		const closeIndex = content.indexOf(close, openIndex + open.length);
		if (closeIndex === -1) continue;

		const before = content.slice(0, openIndex);
		const after = content.slice(closeIndex + close.length);

		return {
			thinking: content.slice(openIndex + open.length, closeIndex).trim(),
			cleanContent: `${before}${after}`.trim()
		};
	}

	return { thinking: null, cleanContent: content };
}
-
/**
 * Reports whether a reasoning block has been opened anywhere in the text
 * (used while streaming).
 *
 * Recognized opening markers: `<think>`, `[THINK]` and `<|channel|>analysis`.
 *
 * @param content - The message content to inspect
 * @returns `true` when at least one opening marker occurs in `content`
 */
export function hasThinkingStart(content: string): boolean {
	const openingMarkers = ['<think>', '[THINK]', '<|channel|>analysis'];

	return openingMarkers.some((marker) => content.includes(marker));
}
-
/**
 * Reports whether a reasoning block has been closed anywhere in the text
 * (used while streaming).
 *
 * Only the `</think>` and `[/THINK]` closing markers are recognized.
 *
 * @param content - The message content to inspect
 * @returns `true` when at least one closing marker occurs in `content`
 */
export function hasThinkingEnd(content: string): boolean {
	for (const closingMarker of ['</think>', '[/THINK]']) {
		if (content.indexOf(closingMarker) !== -1) {
			return true;
		}
	}

	return false;
}
-
/**
 * Pulls the reasoning text out of a message that may still be streaming.
 *
 * Whichever opening marker (`<think>` or `[THINK]`) occurs first decides the
 * format. If that format's closing marker is absent, everything after the opening
 * marker is returned as reasoning and everything before it as remaining content;
 * neither is trimmed. If the closing marker is present, the work is delegated to
 * `parseThinkingContent`.
 *
 * @param content - The message content to extract partial thinking from
 * @returns `thinking` is the reasoning text, or `null` when no opening marker is
 * present; `remainingContent` is the non-reasoning part of the message
 */
export function extractPartialThinking(content: string): {
	thinking: string | null;
	remainingContent: string;
} {
	const angleAt = content.indexOf('<think>');
	const bracketAt = content.indexOf('[THINK]');

	// No opening marker of either kind: nothing to extract.
	if (angleAt === -1 && bracketAt === -1) {
		return { thinking: null, remainingContent: content };
	}

	// The marker that appears first in the text selects the active format.
	const angleFirst = angleAt !== -1 && (bracketAt === -1 || angleAt < bracketAt);
	const active = angleFirst
		? { open: '<think>', close: '</think>', at: angleAt }
		: { open: '[THINK]', close: '[/THINK]', at: bracketAt };

	// Block still open: split the text around the opening marker.
	if (!content.includes(active.close)) {
		return {
			thinking: content.slice(active.at + active.open.length),
			remainingContent: content.slice(0, active.at)
		};
	}

	// Closing marker present somewhere: let the full parser handle it.
	const { thinking, cleanContent } = parseThinkingContent(content);

	return { thinking, remainingContent: cleanContent };
}