TP: fix delayed AllReduce + zero-sized slices (#22489)
This commit is contained in:
@@ -1826,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
i = get_i_delayed(i);
|
const int i_delayed = get_i_delayed(i);
|
||||||
|
|
||||||
|
// If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
|
||||||
|
// A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
|
||||||
|
// its compute flag disabled and thus gets its data zeroed out.
|
||||||
|
// If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
|
||||||
|
if (i_delayed > i) {
|
||||||
|
for (size_t j = 0; j < n_backends; j++) {
|
||||||
|
auto & bcj = backend_ctx->backend_configs[j];
|
||||||
|
if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||||
|
for (int ii = i + 1; ii <= i_delayed; ii++) {
|
||||||
|
bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
i = i_delayed;
|
||||||
|
|
||||||
for (size_t j = 0; j < n_backends; j++) {
|
for (size_t j = 0; j < n_backends; j++) {
|
||||||
auto & bcj = backend_ctx->backend_configs[j];
|
auto & bcj = backend_ctx->backend_configs[j];
|
||||||
|
|||||||
Reference in New Issue
Block a user