common/parser: handle reasoning budget (#20297)
* v1 * Finished! * Handlie cli * Reasoning sampler * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Less explosive terminology :) * Add utf-8 case and tests * common : migrate reasoning budget sampler to common * cont : clean up * cont : expose state and allow passing as initial state * cont : remove unused imports * cont : update state machine doc string --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Co-authored-by: Alde Rojas <hello@alde.dev>
This commit is contained in:
committed by
GitHub
parent
5f91b1d5d5
commit
acb7c79069
@@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
enum common_reasoning_budget_state {
|
||||
REASONING_BUDGET_IDLE, // waiting for start sequence
|
||||
REASONING_BUDGET_COUNTING, // counting down tokens
|
||||
REASONING_BUDGET_FORCING, // forcing budget message + end sequence
|
||||
REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion
|
||||
REASONING_BUDGET_DONE, // passthrough forever
|
||||
};
|
||||
|
||||
// Creates a reasoning budget sampler that limits token generation inside a
|
||||
// reasoning block (e.g. between <think> and </think>).
|
||||
//
|
||||
// State machine: IDLE -> COUNTING -> WAITING_UTF8 -> FORCING -> DONE
|
||||
// IDLE: passthrough, watching for start_tokens sequence
|
||||
// COUNTING: counting down remaining tokens, watching for natural end_tokens
|
||||
// WAITING_UTF8: budget exhausted, allowing tokens to complete a UTF-8 sequence
|
||||
// FORCING: forces forced_tokens token-by-token (all other logits -> -inf)
|
||||
// DONE: passthrough forever
|
||||
//
|
||||
// Parameters:
|
||||
// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
|
||||
// start_tokens - token sequence that activates counting
|
||||
// end_tokens - token sequence for natural deactivation
|
||||
// forced_tokens - token sequence forced when budget expires
|
||||
// budget - max tokens allowed in the reasoning block
|
||||
// initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
|
||||
// note: COUNTING with budget <= 0 is promoted to FORCING
|
||||
//
|
||||
struct llama_sampler * common_reasoning_budget_init(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::vector<llama_token> & start_tokens,
|
||||
const std::vector<llama_token> & end_tokens,
|
||||
const std::vector<llama_token> & forced_tokens,
|
||||
int32_t budget,
|
||||
common_reasoning_budget_state initial_state);
|
||||
Reference in New Issue
Block a user