sampling : delegate input allocation to the scheduler (#19266)

* sampling : delegate input allocation to the scheduler

* graph : compute backend samplers only if needed
This commit is contained in:
Georgi Gerganov
2026-02-03 22:16:16 +02:00
committed by GitHub
parent 32b17abdb0
commit faa1bc26ee
3 changed files with 33 additions and 73 deletions
+1 -5
View File
@@ -1027,11 +1027,7 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
llama_sampler_chain_n(sampler) > 0;
if (sampler && can_offload) {
-ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
-auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
-if (host_buft) {
-buft = host_buft;
-}
+auto * buft = ggml_backend_dev_buffer_type(model.dev_output());
sampler->iface->backend_init(sampler, buft);