TP: fix delayed AllReduce + zero-sized slices (#22489)

This commit is contained in:
Johannes Gäßler
2026-04-29 08:55:07 +02:00
committed by GitHub
parent fc2b0053ff
commit 739393beeb
+18 -1
View File
@@ -1826,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
continue; continue;
} }
i = get_i_delayed(i); const int i_delayed = get_i_delayed(i);
// If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
// A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
// its compute flag disabled and thus gets its data zeroed out.
// If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
if (i_delayed > i) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
for (int ii = i + 1; ii <= i_delayed; ii++) {
bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
}
}
}
}
i = i_delayed;
for (size_t j = 0; j < n_backends; j++) { for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j]; auto & bcj = backend_ctx->backend_configs[j];