common : re-arm reasoning budget after DONE on new <think> (#22323)
Previously, the DONE state absorbed all tokens, including a new start tag, so any think blocks after the first ran unbudgeted. Observed on unsloth/Qwen3.6-27B-GGUF, which interleaves multiple <think> blocks per response. Fixed by advancing start_matcher in the DONE branch and, on a match, re-arming to COUNTING with a fresh budget (or switching straight to FORCING when the budget is 0 or less). Adds a regression test (test-reasoning-budget: test 6).
This commit is contained in:
@@ -122,6 +122,20 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
|
||||
}
|
||||
break;
|
||||
case REASONING_BUDGET_DONE:
|
||||
// Re-arm on a new start tag: some models emit multiple <think> blocks
|
||||
// per response, and each should get a fresh budget window.
|
||||
if (ctx->start_matcher.advance(token)) {
|
||||
ctx->state = REASONING_BUDGET_COUNTING;
|
||||
ctx->remaining = ctx->budget;
|
||||
ctx->end_matcher.reset();
|
||||
LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
|
||||
|
||||
if (ctx->remaining <= 0) {
|
||||
ctx->state = REASONING_BUDGET_FORCING;
|
||||
ctx->force_pos = 0;
|
||||
LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,7 +227,30 @@ int main(void) {
|
||||
3); // forcing continues through i=3
|
||||
}
|
||||
|
||||
printf("OK (5 tests passed)\n");
|
||||
// Test 6: Multi-block thinking. First block ends naturally at i=2, second
|
||||
// start tag at i=3 re-arms the budget, which then exhausts at i=5.
|
||||
// Regression: before this fix, DONE absorbed all subsequent tokens and a
|
||||
// second <think> block ran unbudgeted.
|
||||
// Flow: i=0 accept(100)->COUNTING rem=2; i=1 accept(50)->rem=1;
|
||||
// i=2 accept(101)->end_matcher matches, DONE;
|
||||
// i=3 accept(100)->re-arm, COUNTING rem=2;
|
||||
// i=4 accept(60)->rem=1; i=5 accept(61)->rem=0->FORCING;
|
||||
// i=6 apply()->forces token[0]=102, accept(62)->force_pos=1, stay FORCING;
|
||||
// i=7 apply()->forces token[1]=101, accept(63)->force_pos=2->DONE.
|
||||
{
|
||||
const std::vector<llama_token> start = {100};
|
||||
const std::vector<llama_token> end = {101};
|
||||
const std::vector<llama_token> forced = {102, 101};
|
||||
const std::vector<llama_token> sequence = {100, 50, 101, 100, 60, 61, 62, 63};
|
||||
|
||||
test_reasoning_budget("multi-block re-arms budget after DONE", sequence, start, end, forced,
|
||||
2, // budget of 2 tokens (per block)
|
||||
REASONING_BUDGET_IDLE,
|
||||
6, // forcing starts at i=6 (after second block exhausts at i=5)
|
||||
7); // forcing continues through i=7
|
||||
}
|
||||
|
||||
printf("OK (6 tests passed)\n");
|
||||
|
||||
printf("Testing UTF-8 boundary detection... ");
|
||||
test_utf8_boundary_detection();
|
||||
|
||||
Reference in New Issue
Block a user