[metal] extend bin op fusion to MUL/SUB/DIV chains (#28)
CI (apple) / macOS-latest-ios (pull_request) Waiting to run
CI (apple) / macos-latest-ios-xcode (pull_request) Waiting to run
CI (apple) / macOS-latest-tvos (pull_request) Waiting to run
CI (apple) / macOS-latest-visionos (pull_request) Waiting to run
CI (apple) / macOS-latest-swift (generic/platform=iOS) (pull_request) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=macOS) (pull_request) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=tvOS) (pull_request) Blocked by required conditions
CI (self-hosted) / ggml-ci-nvidia-cuda (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm2 (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-mac-metal (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-mac-webgpu (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-mac-vulkan (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-linux-intel-vulkan (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-win-intel-vulkan (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-intel-openvino-gpu-low-perf (pull_request) Waiting to run
CI / build-cmake-pkg (pull_request) Waiting to run
CI / macOS-latest-arm64 (pull_request) Waiting to run
CI / macOS-latest-x64 (pull_request) Waiting to run
CI / macOS-latest-arm64-webgpu (pull_request) Waiting to run
CI / ubuntu-cpu (arm64, ubuntu-24.04-arm) (pull_request) Waiting to run
CI / ubuntu-cpu (ppc64le, ubuntu-24.04-ppc64le) (pull_request) Waiting to run
CI / ubuntu-cpu (s390x, ubuntu-24.04-s390x) (pull_request) Waiting to run
CI / ubuntu-cpu (x64, ubuntu-22.04) (pull_request) Waiting to run
CI / android-arm64 (pull_request) Waiting to run
CI / ubuntu-latest-rpc (pull_request) Waiting to run
CI / ubuntu-24-vulkan (arm64, ubuntu-24.04-arm) (pull_request) Waiting to run
CI / ubuntu-24-vulkan (x64, ubuntu-24.04) (pull_request) Waiting to run
CI / ubuntu-24-webgpu (pull_request) Waiting to run
CI / ubuntu-24-webgpu-wasm (pull_request) Waiting to run
CI / ubuntu-22-hip (pull_request) Waiting to run
CI / ubuntu-22-musa (pull_request) Waiting to run
CI / windows-latest (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (pull_request) Waiting to run
CI / windows-latest (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (pull_request) Waiting to run
CI / windows-latest (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (pull_request) Waiting to run
CI / windows-latest (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DG… (pull_request) Waiting to run
CI / windows-latest (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (pull_request) Waiting to run
CI / ubuntu-latest-cuda (pull_request) Waiting to run
CI / windows-2022-cuda (12.4) (pull_request) Waiting to run
CI / windows-latest-hip (pull_request) Waiting to run
CI / ubuntu-cpu-riscv64-native (pull_request) Waiting to run
CI / ggml-ci-x64-cpu-low-perf (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-low-perf (pull_request) Waiting to run
CI / ggml-ci-x64-cpu-high-perf (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf-sve (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai-graviton4 (pull_request) Waiting to run
EditorConfig Checker / editorconfig (pull_request) Waiting to run
Server / server (default) (pull_request) Waiting to run
Server / server (backend-sampling) (pull_request) Waiting to run
Server / server-windows (pull_request) Waiting to run
Pull Request Labeler / labeler (pull_request_target) Waiting to run

This commit is contained in:
Kaloyan Nikolov
2026-04-30 20:14:12 +02:00
parent 222626cfdc
commit eeb79b026b
2 changed files with 25 additions and 19 deletions
+17 -7
View File
@@ -394,19 +394,29 @@ void ggml_graph_optimize(ggml_cgraph * gf) {
// fuse only ops that start with these operations // fuse only ops that start with these operations
// can be expanded when needed // can be expanded when needed
if (node.op() == GGML_OP_ADD || if (node.op() == GGML_OP_ADD ||
node.op() == GGML_OP_SUB ||
node.op() == GGML_OP_MUL ||
node.op() == GGML_OP_DIV ||
node.op() == GGML_OP_NORM || node.op() == GGML_OP_NORM ||
node.op() == GGML_OP_RMS_NORM) { node.op() == GGML_OP_RMS_NORM) {
ops[0] = node.op(); ops[0] = node.op();
int f = i + 1; int f = i + 1;
while (f < n && f < i + MAX_FUSE) { while (f < n && f < i + MAX_FUSE) {
// conservatively allow fusing only these ops // bin ops (ADD/SUB/MUL/DIV) must be same type to fuse
// can be expanded when needed // NORM/RMS_NORM can chain with MUL/ADD
if (gf->nodes[f]->op != GGML_OP_ADD && if (node.op() == GGML_OP_ADD ||
gf->nodes[f]->op != GGML_OP_MUL && node.op() == GGML_OP_SUB ||
gf->nodes[f]->op != GGML_OP_NORM && node.op() == GGML_OP_MUL ||
gf->nodes[f]->op != GGML_OP_RMS_NORM) { node.op() == GGML_OP_DIV) {
break; if (gf->nodes[f]->op != node.op()) break;
} else {
if (gf->nodes[f]->op != GGML_OP_ADD &&
gf->nodes[f]->op != GGML_OP_MUL &&
gf->nodes[f]->op != GGML_OP_NORM &&
gf->nodes[f]->op != GGML_OP_RMS_NORM) {
break;
}
} }
ops[f - i] = gf->nodes[f]->op; ops[f - i] = gf->nodes[f]->op;
f++; f++;
+8 -12
View File
@@ -3118,19 +3118,15 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
int n_fuse = 1; int n_fuse = 1;
// c[0] = add(a, b[0]) // c[0] = op(a, b[0])
// c[1] = add(c[0], b[1]) // c[1] = op(c[0], b[1])
// c[2] = add(c[1], b[2]) // c[2] = op(c[1], b[2])
// ... // ...
if (use_fusion) { if (use_fusion) {
fops[0] = GGML_OP_ADD; ggml_op cur_op = op->op;
fops[1] = GGML_OP_ADD; for (int i = 0; i < 8; ++i) {
fops[2] = GGML_OP_ADD; fops[i] = cur_op;
fops[3] = GGML_OP_ADD; }
fops[4] = GGML_OP_ADD;
fops[5] = GGML_OP_ADD;
fops[6] = GGML_OP_ADD;
fops[7] = GGML_OP_ADD;
// note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing ops // note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing ops
// across splits. idx_end indicates the last node in the current split // across splits. idx_end indicates the last node in the current split
@@ -3165,7 +3161,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
++n_fuse; ++n_fuse;
if (debug_fusion > 1 && n_fuse > 1) { if (debug_fusion > 1 && n_fuse > 1) {
GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse); GGML_LOG_DEBUG("%s: fuse: %s x %d\n", __func__, ggml_op_name(cur_op), n_fuse);
} }
} }