From c807c6e3b0c74b77ad4c7a8213a1d5690d34e462 Mon Sep 17 00:00:00 2001 From: kvc0 <3454741+kvc0@users.noreply.github.com> Date: Thu, 23 Apr 2026 08:45:02 -0700 Subject: [PATCH] server: (anthropic API) fix prefix caching (#21793) When testing claude code against llama.cpp, I noticed that only n_past 18577 was used even when context was 60k or more. The log in llama-server says: ``` slot update_slots: id 3 | task 10342 | old: ... ; cch= | defa0;You are slot update_slots: id 3 | task 10342 | new: ... ; cch= | 1c8b4; ``` I observed that the cch value changed every time. Reading about that, the x-anthropic-billing-header system message seems to be specially handled inside of the anthropic api. I could remove it, but there is a meaningful string sometimes included at the end. So instead, I just replace the changing cch checksum with fffff. I'm treating this as an anthropic message body API detail - I think this is the right way to do this, but by all means please correct me! It's always 5 hexadecimal characters, but I've written the replacement defensively in case they change the protocol. --- tools/server/server-chat.cpp | 41 +++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tools/server/server-chat.cpp b/tools/server/server-chat.cpp index ef586d1e1..34ec7982b 100644 --- a/tools/server/server-chat.cpp +++ b/tools/server/server-chat.cpp @@ -281,6 +281,42 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { return chatcmpl_body; } +// Edits the cch section of an "x-anthropic-billing-header" system prompt. +// Does nothing to any other prompt. +// +// This is a claude message with a "cch=ef01a" attribute that breaks prefix caching. +// The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as +// system prompt data, particularly to llama.cpp, but its presence means the prefix +// cache will not get past it: It changes on each request. 
+// +// Reference: https://github.com/ggml-org/llama.cpp/pull/21793 +// Example header: +// ``` +// x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude. +// ^^^^^ +// ``` +static void normalize_anthropic_billing_header(std::string & system_text) { + if (system_text.rfind("x-anthropic-billing-header:", 0) != 0) { + return; + } + + const size_t header_prefix_length = strlen("x-anthropic-billing-header:"); + const size_t cch_length = 5; + const size_t index_cch = system_text.find("cch=", header_prefix_length); + if (index_cch == std::string::npos) { + return; + } + + const size_t index_replace = index_cch + 4; + if (index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') { + for (size_t i = 0; i < cch_length; ++i) { + system_text[index_replace + i] = 'f'; + } + } else { + LOG_ERR("anthropic string not as expected: %s\n", system_text.c_str()); + } +} + json server_chat_convert_anthropic_to_oai(const json & body) { json oai_body; @@ -292,10 +328,13 @@ json server_chat_convert_anthropic_to_oai(const json & body) { if (system_param.is_string()) { system_content = system_param.get<std::string>(); + normalize_anthropic_billing_header(system_content); } else if (system_param.is_array()) { for (const auto & block : system_param) { if (json_value(block, "type", std::string()) == "text") { - system_content += json_value(block, "text", std::string()); + auto system_text = json_value(block, "text", std::string()); + normalize_anthropic_billing_header(system_text); + system_content += system_text; } } }