server: Add cached_tokens info to oaicompat responses (#19361)

* tests : fix fetch_server_test_models.py

* server: to_json_oaicompat cached_tokens

Adds OpenAI and Anthropic compatible information about the
number of cached prompt tokens used in a response.
This commit is contained in:
Ryan Goulden
2026-03-19 11:09:33 -07:00
committed by GitHub
parent 76f2dc70c3
commit 26c9ce1288
6 changed files with 61 additions and 31 deletions
+2 -2
View File
@@ -95,9 +95,9 @@ if __name__ == '__main__':
     '-p', 'Hey',
     '--no-warmup',
     '--log-disable',
-    '-no-cnv']
+    '-st']
 if m.hf_file != 'tinyllamas/stories260K.gguf' and 'Mistral-Nemo' not in m.hf_repo:
-    cmd.append('-fa')
+    cmd += ('-fa', 'on')
 try:
     subprocess.check_call(cmd)
 except subprocess.CalledProcessError: