TP: fix 0-sized tensor slices, AllReduce fallback (#21808)

* TP: fix 0-sized tensor slices, AllReduce fallback

* fix layer structure <-> GPU count aliasing

* add missing std::fill

* fix CUDA device set, max ggml ctx size
Author:    Johannes Gäßler
Date:      2026-04-20 18:09:39 +02:00
Committed: GitHub
Parent:    7f251fdbce
Commit:    fb19f94c71

3 changed files with 169 additions and 80 deletions
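Context for the first fix: a 0-sized slice arises when a tensor dimension does not divide evenly across the participating GPUs. A minimal standalone sketch of how block-wise splitting can leave a device empty (illustrative arithmetic, not the actual ggml split logic):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne    = 5, n_devs = 4;              // 5 rows on 4 devices
        const int64_t block = (ne + n_devs - 1) / n_devs; // 2 rows per block
        for (int64_t j = 0; j < n_devs; j++) {
            const int64_t begin = std::min(j*block, ne);
            const int64_t end   = std::min(begin + block, ne);
            printf("device %d: %d rows\n", (int) j, (int) (end - begin));
        }
        // device 3 receives 0 rows; its never-computed partial result must
        // not be allowed to poison the subsequent AllReduce
        return 0;
    }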
+142 -76
@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
- GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
+ GGML_ASSERT(tensor->ne[split_dim] != 0);
const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
@@ -1170,6 +1170,28 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
simple_tensors.push_back(t_ij);
}
+ // If one of the sources has a zero-sized slice, disable the computation:
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
+ continue;
+ }
+ const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
+ if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
+ continue;
+ }
+ for (size_t j = 0; j < n_simple_bufs; j++) {
+ int64_t ne_sum = 0;
+ for (size_t s = 0; s < split_state_src.n_segments; s++) {
+ ne_sum += split_state_src.ne[s*n_simple_bufs + j];
+ }
+ if (ne_sum == 0) {
+ simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
+ }
+ }
+ }
buf_ctx->simple_tensors[tensor] = simple_tensors;
return GGML_STATUS_SUCCESS;
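Read in isolation, the added check does the following: for each device j, sum the slice sizes of every segment assigned to it; if the sum is 0, the node is excluded from computation on that device. A self-contained sketch of the same reduction (the struct and names are illustrative stand-ins, not the ggml API):

    #include <cstdint>
    #include <vector>

    // Stand-in for ggml_backend_meta_split_state: slice sizes are stored
    // segment-major, i.e. ne[s*n_devs + j] is segment s's size on device j.
    struct split_state {
        std::vector<int64_t> ne;
        size_t               n_segments;
    };

    // True if device j holds at least one non-empty slice of the tensor.
    static bool device_has_data(const split_state & st, size_t n_devs, size_t j) {
        int64_t ne_sum = 0;
        for (size_t s = 0; s < st.n_segments; s++) {
            ne_sum += st.ne[s*n_devs + j];
        }
        return ne_sum != 0;
    }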
@@ -1442,17 +1464,20 @@ struct ggml_backend_meta_context {
struct backend_config {
ggml_backend_t backend;
- std::vector<cgraph_config> cgraphs;
- std::vector<ggml_tensor *> nodes;
- ggml_backend_buffer_ptr buf;
+ std::vector<cgraph_config> cgraphs;
+ std::vector<ggml_tensor *> nodes;
+ std::vector<ggml_backend_buffer_ptr> bufs;
- backend_config(ggml_backend_t backend) : backend(backend) {}
+ backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
+ bufs.resize(n_reduce_steps);
+ }
};
std::string name;
std::vector<backend_config> backend_configs;
ggml_context_ptr ctx;
std::vector<ggml_cgraph *> cgraphs_aux;
std::vector<ggml_tensor *> nodes_aux;
+ size_t n_reduce_steps;
int max_nnodes = 0;
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
@@ -1464,6 +1489,7 @@ struct ggml_backend_meta_context {
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
+ n_reduce_steps = std::ceil(std::log2(n_devs));
name = "Meta(";
std::vector<ggml_backend_t> simple_backends;
backend_configs.reserve(n_devs);
@@ -1475,7 +1501,7 @@ struct ggml_backend_meta_context {
}
name += ggml_backend_dev_name(simple_dev);
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
- backend_configs.emplace_back(simple_backends.back());
+ backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
}
name += ")";
@@ -1505,10 +1531,6 @@ struct ggml_backend_meta_context {
ggml_backend_free(bc.backend);
}
}
- size_t n_reduce_steps() const {
- return std::ceil(std::log2(backend_configs.size()));
- }
};
static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
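The removed method is replaced by the n_reduce_steps member above, computed once in the constructor as ceil(log2(n_devs)): the number of reduction steps the fallback needs (fold-in plus butterfly). A quick standalone check of that arithmetic:

    #include <cmath>
    #include <cstdio>

    int main() {
        for (int n_devs : {2, 3, 4, 6, 8}) {
            printf("%d devices -> %d reduce steps\n",
                   n_devs, (int) std::ceil(std::log2(n_devs)));
        }
        return 0; // prints 1, 2, 2, 3 and 3 steps respectively
    }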
@@ -1754,16 +1776,17 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
- bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
+ for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
+ bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
+ }
}
backend_ctx->max_tmp_size = max_tmp_size;
}
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
- const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
- const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
- const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
+ const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+ zeroing) nodes per step and device
+ const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD (+ zeroing) graphs per step and device
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
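For example, with 4 devices (n_reduce_steps = 2) this now reserves 6 aux nodes and 4 aux one-node graphs per device and subgraph, where the old sizing reserved 4 and 2; the extra node and graph per step account for the zeroing pass.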
@@ -1812,11 +1835,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
size_t iga = 0; // i graph aux
size_t ina = 0; // i node aux
- // FIXME usage_counts
- auto get_cgraph_aux = [&]() -> ggml_cgraph * {
- ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
- return ret;
- };
auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
memset(ret, 0, sizeof(ggml_tensor));
@@ -1828,75 +1846,110 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
return ret;
};
+ auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
+ auto & bcj = backend_ctx->backend_configs[j];
+ ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
+ if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
+ buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
+ }
+ tensor->buffer = buf_ptr.get();
+ tensor->data = ggml_backend_buffer_get_base(buf_ptr.get());
+ };
+ // FIXME usage_counts
+ auto get_cgraph_aux = [&]() -> ggml_cgraph * {
+ ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
+ return ret;
+ };
// Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
auto allreduce_fallback = [&](size_t i) -> ggml_status {
std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);
- for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
+ // Zero out nodes that were disabled due to having a zero-sized slice:
+ for (size_t j = 0; j < n_backends; j++) {
+ auto & bcj = backend_ctx->backend_configs[j];
+ ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
+ if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
+ continue;
+ }
+ ggml_tensor * node_zero = get_node_aux(node);
+ node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
+ node_zero->src[0] = node;
+ ggml_set_op_params_f32(node_zero, 0, 0.0f);
+ node_zero->data = node->data;
+ node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
+ step_cgraphs[j] = get_cgraph_aux();
+ step_cgraphs[j]->nodes[0] = node_zero;
+ step_cgraphs[j]->n_nodes = 1;
+ const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
+ if (status != GGML_STATUS_SUCCESS) {
+ return status;
+ }
+ }
+ std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
+ auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
+ assert(step_cgraphs[j_dst] == nullptr);
+ auto & bcj_src = backend_ctx->backend_configs[j_src];
+ auto & bcj_dst = backend_ctx->backend_configs[j_dst];
+ ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
+ ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
+ GGML_ASSERT(ggml_is_contiguous(node_src));
+ GGML_ASSERT(ggml_is_contiguous(node_dst));
+ ggml_tensor * node_tmp = get_node_aux(node_dst);
+ set_tmp_data(node_tmp, j_dst, i_buf);
+ ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
+ ggml_tensor * node_red = get_node_aux(node_dst);
+ node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
+ node_red->view_offs = node_dst->view_offs;
+ node_red->op = GGML_OP_ADD;
+ node_red->src[0] = node_dst;
+ node_red->src[1] = node_tmp;
+ node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
+ ggml_backend_view_init(node_red);
+ ggml_cgraph * cgraph_aux = get_cgraph_aux();
+ cgraph_aux->nodes[0] = node_red;
+ cgraph_aux->n_nodes = 1;
+ step_cgraphs[j_dst] = cgraph_aux;
+ };
+ size_t offset_j = n_backends/2;
+ while ((offset_j & (offset_j - 1)) != 0) {
+ offset_j--;
+ }
+ const size_t offset_j_max = offset_j;
+ size_t i_buf = 0;
+ // If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
+ for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
+ const size_t j_dst = j_src - 2*offset_j_max;
+ push_data(j_src, j_dst, i_buf);
+ const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
+ if (status != GGML_STATUS_SUCCESS) {
+ return status;
+ }
+ i_buf = 1;
+ }
// Butterfly reduction:
+ for (; offset_j >= 1; offset_j /= 2) {
+ std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
- for (size_t j = 0; j < n_backends; j++) {
+ for (size_t j = 0; j < 2*offset_j_max; j++) {
const size_t j_other = j ^ offset_j;
- if (j_other > j) {
+ if (j_other >= n_backends) {
continue;
}
- auto & bcj1 = backend_ctx->backend_configs[j];
- auto & bcj2 = backend_ctx->backend_configs[j_other];
- ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
- ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
- GGML_ASSERT(ggml_is_contiguous(node1));
- GGML_ASSERT(ggml_is_contiguous(node2));
- // Tmp tensors to receive P2P copies
- ggml_tensor * node_tmp_1 = get_node_aux(node1);
- node_tmp_1->buffer = bcj1.buf.get();
- node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
- ggml_tensor * node_tmp_2 = get_node_aux(node2);
- node_tmp_2->buffer = bcj2.buf.get();
- node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
- // 2 P2P copies: exchange full buffers
- ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
- ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
- // Local ADD: node1 += tmp1 (in-place via view)
- ggml_tensor * node_red_1 = get_node_aux(node1);
- node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
- node_red_1->view_offs = node1->view_offs;
- node_red_1->op = GGML_OP_ADD;
- node_red_1->src[0] = node1;
- node_red_1->src[1] = node_tmp_1;
- node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
- ggml_backend_view_init(node_red_1);
- // Local ADD: node2 += tmp2 (in-place via view)
- ggml_tensor * node_red_2 = get_node_aux(node2);
- node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
- node_red_2->view_offs = node2->view_offs;
- node_red_2->op = GGML_OP_ADD;
- node_red_2->src[0] = node2;
- node_red_2->src[1] = node_tmp_2;
- node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
- ggml_backend_view_init(node_red_2);
- // Build 1-node cgraphs for the ADD ops
- ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
- cgraph_aux_1->nodes[0] = node_red_1;
- cgraph_aux_1->n_nodes = 1;
- step_cgraphs[j] = cgraph_aux_1;
- ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
- cgraph_aux_2->nodes[0] = node_red_2;
- cgraph_aux_2->n_nodes = 1;
- step_cgraphs[j_other] = cgraph_aux_2;
+ push_data(j, j_other, i_buf);
}
// Execute local ADDs for this step
- for (size_t j = 0; j < n_backends; j++) {
+ for (size_t j = 0; j < 2*offset_j_max; j++) {
if (step_cgraphs[j] == nullptr) {
continue;
}
@@ -1906,7 +1959,20 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
return status;
}
}
+ i_buf++;
}
+ assert(i_buf == backend_ctx->n_reduce_steps);
+ // If n_backends is not a power of 2, copy back the reduced tensors to the excess:
+ for (size_t j = 2*offset_j_max; j < n_backends; j++) {
+ auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
+ auto & bcj_dst = backend_ctx->backend_configs[j];
+ ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
+ ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
+ ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
+ }
return GGML_STATUS_SUCCESS;
};
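The data movement of the reworked fallback can be modeled standalone, with scalars in place of tensor slices and synchronous adds in place of the async copy + ADD graphs (a sketch of the algorithm, not the actual implementation):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> x = {1, 2, 3, 4, 5, 6}; // one partial sum per backend
        const size_t n = x.size();

        size_t offset_max = n/2;
        while ((offset_max & (offset_max - 1)) != 0) {
            offset_max--;                            // largest power of 2 <= n/2
        }
        // fold the excess ranks into the low ranks
        for (size_t j_src = 2*offset_max; j_src < n; j_src++) {
            x[j_src - 2*offset_max] += x[j_src];
        }
        // butterfly reduction over the power-of-2 core
        for (size_t offset = offset_max; offset >= 1; offset /= 2) {
            std::vector<float> tmp(x);               // models the per-step tmp buffers
            for (size_t j = 0; j < 2*offset_max; j++) {
                x[j] += tmp[j ^ offset];             // exchange + local ADD
            }
        }
        // copy the reduced result back out to the excess ranks
        for (size_t j = 2*offset_max; j < n; j++) {
            x[j] = x[j - 2*offset_max];
        }
        for (float v : x) {
            assert(v == 21.0f);                      // every rank holds the full sum
        }
        printf("all ranks hold %.0f\n", x[0]);
        return 0;
    }

This also suggests why backend_config now holds n_reduce_steps buffers instead of one: each step indexes its own landing buffer (i_buf), presumably so a later step's async copy cannot clobber data still being read by an earlier step's ADD.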
+12 -1
@@ -1203,6 +1203,13 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
+ for (size_t i = 0; i < n_backends; ++i) {
+ if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
+ ggml_cuda_set_device(cuda_ctx->device);
+ CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
+ }
+ }
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
@@ -1224,7 +1231,11 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
tmp[i].alloc(ne);
ggml_cuda_set_device(cuda_ctx->device);
- to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
+ if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
+ to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
+ } else {
+ CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
+ }
CUDA_CHECK(cudaGetLastError());
}
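The invariant behind the added cudaMemsetAsync calls can be modeled without CUDA/NCCL: a backend whose slice was disabled must contribute exact zeros so that a sum-AllReduce over all backends still produces the correct total. A host-side model under that assumption (not the actual CUDA code path):

    #include <algorithm>
    #include <cassert>
    #include <cstdio>
    #include <vector>

    struct rank_buf {
        std::vector<float> data;
        bool               compute; // models GGML_TENSOR_FLAG_COMPUTE
    };

    static void allreduce_sum(std::vector<rank_buf> & ranks) {
        for (rank_buf & r : ranks) {
            if (!r.compute) { // stand-in for the cudaMemsetAsync zero-fill
                std::fill(r.data.begin(), r.data.end(), 0.0f);
            }
        }
        for (size_t i = 0; i < ranks[0].data.size(); i++) {
            float sum = 0.0f;
            for (const rank_buf & r : ranks) {
                sum += r.data[i];
            }
            for (rank_buf & r : ranks) {
                r.data[i] = sum;
            }
        }
    }

    int main() {
        // rank 2 never computed; its buffer holds stale values that must be zeroed
        std::vector<rank_buf> ranks = {{{1, 2}, true}, {{3, 4}, true}, {{99, 99}, false}};
        allreduce_sum(ranks);
        assert(ranks[0].data[0] == 4.0f && ranks[2].data[1] == 6.0f);
        printf("reduced: %.0f %.0f\n", ranks[0].data[0], ranks[0].data[1]);
        return 0;
    }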