diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp
index cc408a686..74fce5367 100644
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -122,6 +122,20 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
             }
             break;
         case REASONING_BUDGET_DONE:
+            // Re-arm on a new start tag: some models emit multiple reasoning blocks
+            // per response, and each block should get its own fresh budget window.
+            if (ctx->start_matcher.advance(token)) {
+                ctx->state = REASONING_BUDGET_COUNTING;
+                ctx->remaining = ctx->budget; // full budget again for the new block
+                ctx->end_matcher.reset(); // presumably clears partial end-tag progress; matcher internals not visible here
+                LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
+
+                if (ctx->remaining <= 0) { // no budget at all: go straight from the start tag to FORCING
+                    ctx->state = REASONING_BUDGET_FORCING;
+                    ctx->force_pos = 0; // begin the forced sequence at its first token
+                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
+                }
+            }
             break;
     }
 }
diff --git a/tests/test-reasoning-budget.cpp b/tests/test-reasoning-budget.cpp
index 3028fb4d8..f7a601789 100644
--- a/tests/test-reasoning-budget.cpp
+++ b/tests/test-reasoning-budget.cpp
@@ -227,7 +227,30 @@ int main(void) {
             3); // forcing continues through i=3
     }
 
-    printf("OK (5 tests passed)\n");
+    // Test 6: Multi-block thinking. The first block ends naturally at i=2; a second
+    // start tag at i=3 re-arms the budget, which is then exhausted at i=5.
+    // Regression: before this fix, DONE absorbed all subsequent tokens, so a
+    // second reasoning block ran without any budget enforcement.
+    // Flow: i=0 accept(100)->COUNTING rem=2; i=1 accept(50)->rem=1;
+    //       i=2 accept(101)->end_matcher matches, DONE;
+    //       i=3 accept(100)->re-arm, COUNTING rem=2;
+    //       i=4 accept(60)->rem=1; i=5 accept(61)->rem=0->FORCING;
+    //       i=6 apply()->forces token[0]=102, accept(62)->force_pos=1, stay FORCING;
+    //       i=7 apply()->forces token[1]=101, accept(63)->force_pos=2->DONE.
+    {
+        const std::vector start = {100};
+        const std::vector end = {101};
+        const std::vector forced = {102, 101};
+        const std::vector sequence = {100, 50, 101, 100, 60, 61, 62, 63};
+
+        test_reasoning_budget("multi-block re-arms budget after DONE", sequence, start, end, forced,
+            2, // budget of 2 tokens (applied to each block separately)
+            REASONING_BUDGET_IDLE, // initial state: the start tag at i=0 activates counting
+            6, // forcing starts at i=6 (after the second block exhausts its budget at i=5)
+            7); // forcing continues through i=7
+    }
+
+    printf("OK (6 tests passed)\n");
 
     printf("Testing UTF-8 boundary detection... ");
     test_utf8_boundary_detection();