server: Add cached_tokens info to oaicompat responses (#19361)
* tests : fix fetch_server_test_models.py * server: to_json_oaicompat cached_tokens Adds OpenAI and Anthropic compatible information about the number of cached prompt tokens used in a response.
This commit is contained in:
@@ -95,9 +95,9 @@ if __name__ == '__main__':
|
||||
'-p', 'Hey',
|
||||
'--no-warmup',
|
||||
'--log-disable',
|
||||
'-no-cnv']
|
||||
'-st']
|
||||
if m.hf_file != 'tinyllamas/stories260K.gguf' and 'Mistral-Nemo' not in m.hf_repo:
|
||||
cmd.append('-fa')
|
||||
cmd += ('-fa', 'on')
|
||||
try:
|
||||
subprocess.check_call(cmd)
|
||||
except subprocess.CalledProcessError:
|
||||
|
||||
Reference in New Issue
Block a user