server: Add cached_tokens info to oaicompat responses (#19361)

* tests : fix fetch_server_test_models.py

* server: to_json_oaicompat cached_tokens

Adds OpenAI and Anthropic compatible information about the
number of cached prompt tokens used in a response.
This commit is contained in:
Ryan Goulden
2026-03-19 11:09:33 -07:00
committed by GitHub
parent 76f2dc70c3
commit 26c9ce1288
6 changed files with 61 additions and 31 deletions
+2 -2
View File
@@ -95,9 +95,9 @@ if __name__ == '__main__':
     '-p', 'Hey',
     '--no-warmup',
     '--log-disable',
-    '-no-cnv']
+    '-st']
 if m.hf_file != 'tinyllamas/stories260K.gguf' and 'Mistral-Nemo' not in m.hf_repo:
-    cmd.append('-fa')
+    cmd += ('-fa', 'on')
 try:
     subprocess.check_call(cmd)
 except subprocess.CalledProcessError: