Send reasoning content back to the model across turns via the reasoning_content API field (#21036)

* webui: send reasoning_content back to model in context
  Preserve assistant reasoning across turns by extracting it from internal tags and sending it as a separate reasoning_content field in the API payload. The server and Jinja templates handle native formatting (e.g. <think> tags for Qwen, GLM, DeepSeek...). Adds an "Exclude reasoning from context" toggle in Settings > Developer (off by default, so reasoning is preserved). Includes unit tests.
* webui: add syncable parameter for excludeReasoningFromContext
* chore: update webui build output
2026-03-27 08:17:35 +01:00
parent 9bcb4eff4d
commit d0fa2c9fbb
11 changed files with 281 additions and 9 deletions
@@ -57,6 +57,46 @@ export class ChatService {
 	 *
 	 */

+	/**
+	 * Collects the first capture group of every AGENTIC_REGEX.REASONING_EXTRACT match in `content`, which may be a string or an array of parts (only TEXT parts with non-empty text are scanned).
+	 * Returns the captures joined with no separator, or undefined for empty/null content, non-array objects, or when nothing non-empty was captured.
+	 */
+	private static extractReasoningFromContent(
+		content: ApiChatMessageData['content'] | null | undefined
+	): string | undefined {
+		if (!content) return undefined;
+
+		const extractFromString = (text: string): string => {
+			const parts: string[] = [];
+			// Rebuild the pattern from its source alone (flags are not copied), so this local regex is non-global and exec() always searches from index 0
+			const re = new RegExp(AGENTIC_REGEX.REASONING_EXTRACT.source);
+			let match = re.exec(text);
+			while (match) {
+				parts.push(match[1]);
+				// Discard the text up to the end of this match, then search the remainder
+				text = text.slice(match.index + match[0].length);
+				match = re.exec(text);
+			}
+			return parts.join('');
+		};
+
+		if (typeof content === 'string') {
+			const result = extractFromString(content);
+			return result || undefined;
+		}
+
+		if (!Array.isArray(content)) return undefined;
+
+		const parts: string[] = [];
+		for (const part of content) {
+			if (part.type === ContentPartType.TEXT && part.text) {
+				const result = extractFromString(part.text);
+				if (result) parts.push(result);
+			}
+		}
+		return parts.length > 0 ? parts.join('') : undefined;
+	}
+
 	/**
 	 * Sends a chat completion request to the llama.cpp server.
 	 * Supports both streaming and non-streaming responses with comprehensive parameter configuration.
@@ -111,7 +151,8 @@ export class ChatService {
 			custom,
 			timings_per_token,
 			// Config options
-			disableReasoningParsing
+			disableReasoningParsing,
+			excludeReasoningFromContext
 		} = options;

 		const normalizedMessages: ApiChatMessageData[] = messages
@@ -159,14 +200,24 @@ export class ChatService {
 		}

 		const requestBody: ApiChatCompletionRequest = {
-			messages: normalizedMessages.map((msg: ApiChatMessageData) => ({
-				role: msg.role,
-				// Strip reasoning tags/content from the prompt to avoid polluting KV cache.
-				// TODO: investigate backend expectations for reasoning tags and add a toggle if needed.
-				content: ChatService.stripReasoningContent(msg.content),
-				tool_calls: msg.tool_calls,
-				tool_call_id: msg.tool_call_id
-			})),
+			messages: normalizedMessages.map((msg: ApiChatMessageData) => {
+				// Always strip internal reasoning/agentic tags from content
+				const cleanedContent = ChatService.stripReasoningContent(msg.content);
+				const mapped: ApiChatCompletionRequest['messages'][0] = {
+					role: msg.role,
+					content: cleanedContent,
+					tool_calls: msg.tool_calls,
+					tool_call_id: msg.tool_call_id
+				};
+				// When preserving reasoning, extract it from raw content and send as separate field
+				if (!excludeReasoningFromContext) {
+					const reasoning = ChatService.extractReasoningFromContent(msg.content);
+					if (reasoning) {
+						mapped.reasoning_content = reasoning;
+					}
+				}
+				return mapped;
+			}),
 			stream,
 			return_progress: stream ? true : undefined,
 			tools: tools && tools.length > 0 ? tools : undefined