webui: Add option to pre-encode conversation for faster next turns (#21034)

This commit is contained in:
Aleksander Grygier
2026-04-09 09:10:18 +02:00
committed by GitHub
parent b54cb2e3d0
commit 75511a8d7e
7 changed files with 267 additions and 81 deletions
@@ -56,6 +56,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean |
dry_penalty_last_n: undefined,
max_tokens: undefined,
custom: '', // custom json-stringified object
preEncodeConversation: false,
// experimental features
pyInterpreterEnabled: false,
enableContinueGeneration: false
@@ -106,9 +107,9 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
showThoughtInProgress: 'Expand thought process by default when generating messages.',
disableReasoningParsing:
'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
excludeReasoningFromContext:
'Strip reasoning content from previous messages before sending to the model. When unchecked, reasoning is sent back via the reasoning_content field so the model can see its own chain-of-thought across turns.',
'Strip thinking from previous messages before sending. When off, thinking is sent back via the reasoning_content field so the model sees its own chain-of-thought across turns.',
showRawOutputSwitch:
'Show toggle button to display messages as plain text instead of Markdown-formatted content',
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
@@ -143,6 +144,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
'Automatically expand tool call details while executing and keep them expanded after completion.',
pyInterpreterEnabled:
'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.',
preEncodeConversation:
'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
enableContinueGeneration:
'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'
};
@@ -52,6 +52,8 @@ export const SETTINGS_KEYS = {
ALWAYS_SHOW_AGENTIC_TURNS: 'alwaysShowAgenticTurns',
AGENTIC_MAX_TOOL_PREVIEW_LINES: 'agenticMaxToolPreviewLines',
SHOW_TOOL_CALL_IN_PROGRESS: 'showToolCallInProgress',
// Performance
PRE_ENCODE_CONVERSATION: 'preEncodeConversation',
// Developer
DISABLE_REASONING_PARSING: 'disableReasoningParsing',
EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',