server: Add cached_tokens info to oaicompat responses (#19361)

* tests : fix fetch_server_test_models.py

* server: to_json_oaicompat cached_tokens

Adds OpenAI- and Anthropic-compatible information about the
number of cached prompt tokens used in a response.
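For context, the two wire formats differ: the OpenAI Chat Completions API reports reused prompt tokens under usage.prompt_tokens_details.cached_tokens, while the Anthropic Messages API reports them as usage.cache_read_input_tokens. Below is a minimal sketch of both shapes using nlohmann::json; the counts and variable names are illustrative, not taken from this commit.

#include <nlohmann/json.hpp>

// Illustrative counts; in the server these come from the task result.
static nlohmann::json make_usage_examples() {
    int n_prompt_tokens       = 128; // total prompt tokens
    int n_prompt_tokens_cache = 96;  // tokens served from the prompt cache
    int n_decoded             = 32;  // generated tokens

    // OpenAI-compatible usage object
    nlohmann::json oai_usage = {
        {"prompt_tokens",     n_prompt_tokens},
        {"completion_tokens", n_decoded},
        {"total_tokens",      n_prompt_tokens + n_decoded},
        {"prompt_tokens_details", {
            {"cached_tokens", n_prompt_tokens_cache},
        }},
    };

    // Anthropic-compatible usage object
    nlohmann::json anthropic_usage = {
        {"input_tokens",            n_prompt_tokens},
        {"output_tokens",           n_decoded},
        {"cache_read_input_tokens", n_prompt_tokens_cache},
    };

    return nlohmann::json{{"openai", oai_usage}, {"anthropic", anthropic_usage}};
}

Deriving both shapes from a single n_prompt_tokens_cache counter keeps the two endpoints from drifting apart.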
Ryan Goulden authored on 2026-03-19 11:09:33 -07:00; committed by GitHub
parent 76f2dc70c3
commit 26c9ce1288
6 changed files with 61 additions and 31 deletions
@@ -344,6 +344,7 @@ struct server_task_result_cmpl_final : server_task_result {
     bool truncated;
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_prompt_tokens_cache;
     int32_t n_tokens_cached;
     bool has_new_line;
     std::string stopping_word;
@@ -387,6 +388,8 @@ struct server_task_result_cmpl_final : server_task_result {
     json to_json_non_oaicompat();
+    json usage_json_oaicompat();
     json to_json_oaicompat();
     json to_json_oaicompat_chat();
@@ -408,6 +411,7 @@ struct server_task_result_cmpl_partial : server_task_result {
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_prompt_tokens_cache;
     bool post_sampling_probs;
     bool is_progress = false;
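A plausible shape for the new usage_json_oaicompat() helper declared above, assuming it folds n_prompt_tokens_cache into the OpenAI-style usage block. This is a sketch under stated assumptions, not the commit's actual implementation: it assumes `json` is the nlohmann alias used throughout server.cpp, that the member names are the fields shown in the diff, and the zero-check gate is a guess at the behavior.

// Sketch only: the real method is a member of server_task_result_cmpl_final.
json server_task_result_cmpl_final::usage_json_oaicompat() {
    json usage = {
        {"completion_tokens", n_decoded},
        {"prompt_tokens",     n_prompt_tokens},
        {"total_tokens",      n_decoded + n_prompt_tokens},
    };
    // Only advertise the cache detail when prompt tokens were actually reused
    // (assumed behavior, not confirmed by the diff shown here).
    if (n_prompt_tokens_cache > 0) {
        usage["prompt_tokens_details"] = {
            {"cached_tokens", n_prompt_tokens_cache},
        };
    }
    return usage;
}

Factoring the usage object into one helper lets to_json_oaicompat() and to_json_oaicompat_chat() share it instead of each building the block by hand.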