CUDA: manage NCCL communicators in context (#21891)

* CUDA: manage NCCL communicators in context

* add check that all backends are CUDA

* remove unused vector, limit init to > 1 GPUs

* fix warnings

* fix CUDA device, cache allreduce
Author: Johannes Gäßler
Date: 2026-04-15 15:58:40 +02:00
Committed by: GitHub
Parent: adb541a6ad
Commit: 014dca49d6
4 changed files with 148 additions and 76 deletions
+5 -2
@@ -202,8 +202,11 @@ extern "C" {
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// AllReduce operation for tensor parallelism (meta backend)
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// Context management and operations for faster communication between backends, used for tensor parallelism (meta backend)
typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends);
typedef void (*ggml_backend_comm_free_t)(void * comm_ctx);
typedef bool (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);
// Split buffer type for tensor parallelism (old)
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
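
For context, a minimal sketch of how a caller might resolve and use the communicator API added above. Only the typedefs themselves come from this diff; the lookup strings passed to ggml_backend_reg_get_proc_address() are assumed to mirror the typedef names and are not confirmed by this commit.

// Sketch only (not part of this commit): resolving the communicator API
// through ggml_backend_reg_get_proc_address(). The proc-address strings
// below are assumptions.
#include "ggml-backend.h"
#include <stdbool.h>
#include <stddef.h>

static void * tp_comm_init(ggml_backend_reg_t reg, ggml_backend_t * backends, size_t n_backends) {
    if (n_backends < 2) {
        return NULL; // matches the commit: only initialize for > 1 GPUs
    }
    ggml_backend_comm_init_t comm_init = (ggml_backend_comm_init_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_comm_init"); // assumed name
    // NULL if the backend does not expose fast communicators (e.g. no NCCL)
    return comm_init != NULL ? comm_init(backends, n_backends) : NULL;
}

static bool tp_comm_allreduce(ggml_backend_reg_t reg, void * comm_ctx, struct ggml_tensor ** tensors) {
    ggml_backend_comm_allreduce_tensor_t comm_allreduce = (ggml_backend_comm_allreduce_tensor_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_comm_allreduce_tensor"); // assumed name
    // on failure, callers can fall back to the older per-call
    // ggml_backend_allreduce_tensor_t path declared in the same header
    return comm_allreduce != NULL && comm_allreduce(comm_ctx, tensors);
}

A context created this way would be released with the matching ggml_backend_comm_free_t pointer once inference ends. Keeping the communicators alive inside the context means they are set up once rather than per operation, which is presumably what "cache allreduce" in the commit message refers to.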