ggml: backend-agnostic tensor parallelism (experimental) (#19378)

* ggml: backend-agnostic tensor parallelism

* support for GPT-OSS, Qwen 3 MoE

* partial Vulkan fix

* add support for 4/8 GPUs

* unconditional peer access

* re-use buffers + ggml contexts

* fix output pattern

* NCCL support

* ggml: HIP: add RCCL support

* Remove shfl and AllReduce from backend interface

* move allocation workaround out of ggml-alloc.c

* 2d tensor set/get support

* Fix the seg fault without NCCL

* Apply suggestion from JohannesGaessler

* support for tensor dims % n_devs != 0

* fix view_offs scaling

* arbitrary num. of GPUs/tensor split

* fix compilation

* better granularity estimate

* Support device-specific host buffer types if all underlying backends expose the same type. This allows using pinned memory instead of pageable memory for CUDA.

Fix compilation errors.
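
A minimal sketch of the idea, written against the public ggml-backend device API; common_host_buffer_type is a hypothetical helper for illustration, not the actual meta backend code:

    // Expose a host buffer type only if every underlying device reports the
    // same one (e.g. CUDA pinned memory); otherwise return nullptr and fall
    // back to pageable memory.
    #include "ggml-backend.h"
    #include <vector>

    static ggml_backend_buffer_type_t common_host_buffer_type(
            const std::vector<ggml_backend_dev_t> & devs) {
        ggml_backend_buffer_type_t common = nullptr;
        for (ggml_backend_dev_t dev : devs) {
            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
            if (buft == nullptr) {
                return nullptr; // this backend has no host buffer type
            }
            if (common == nullptr) {
                common = buft;
            } else if (common != buft) {
                return nullptr; // backends disagree
            }
        }
        return common;
    }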

* partial Qwen 3 Next support

* Fix Qwen3 30B (#8)

* Fix crash with Qwen-30B-A3B Q4_0

Qwen-30B-A3B Q4_0 has an intermediate dimension of 768. Using a granularity of 256 forces an uneven split between GPUs, which the current implementation does not support (see the arithmetic sketch below).

* Decide block size based on tensor quantization type
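
For illustration, the arithmetic behind the two fixes above as a standalone sketch; only ggml_blck_size() is the real ggml API, the rest is hypothetical:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        const int64_t ne     = 768; // intermediate dim of Qwen-30B-A3B
        const int     n_devs = 2;

        // fixed granularity of 256 -> only 3 slices, cannot be split evenly
        printf("256: %lld slices, even split: %d\n",
               (long long) (ne/256), (ne/256) % n_devs == 0);

        // granularity from the quantization type instead: Q4_0 has a block
        // size of 32, giving 24 slices -> 12 per device
        const int64_t gran = ggml_blck_size(GGML_TYPE_Q4_0);
        printf("%lld: %lld slices, even split: %d\n",
               (long long) gran, (long long) (ne/gran), (ne/gran) % n_devs == 0);
        return 0;
    }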

* Fix crashes due to KV cache serialization (#9)

KV cache serialization requires non-zero offsets on the tensor. Add support in the meta backend to set/get a tensor with a non-zero offset.
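
A sketch of the access pattern this enables, using the public ggml_backend_tensor_set/get API (the helper, tensor, offset and size are illustrative):

    #include "ggml-backend.h"
    #include <cstdint>
    #include <vector>

    // KV cache (de)serialization reads/writes a sub-range of a tensor,
    // i.e. a set/get at a non-zero byte offset.
    static void roundtrip_slice(ggml_tensor * kv, size_t offset, size_t size) {
        std::vector<uint8_t> buf(size);
        ggml_backend_tensor_get(kv, buf.data(), offset, size); // serialize
        // ... write buf to disk, read it back ...
        ggml_backend_tensor_set(kv, buf.data(), offset, size); // restore
    }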

* metal : fix build (#7)

* static memory allocations, fix usage count

* fix tensor granularity

* more even memory distribution

* use BF16 for allreduce

* rebase fixup

* better error message for unsupported architectures

* Fix device mismatch during scatter of allReduce. (#11)

There is a mismatch between the dst buffer device and the backend device, causing sync copies to be used.
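
A sketch of the invariant behind the fix (illustrative only, not the actual scatter code): the async path is only taken when the dst buffer lives on the device of the backend issuing the copy.

    #include "ggml-backend.h"

    static void scatter_copy(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                             ggml_tensor * src, ggml_tensor * dst) {
        ggml_backend_dev_t dev_backend = ggml_backend_get_device(backend_dst);
        ggml_backend_dev_t dev_buffer  =
            ggml_backend_buft_get_device(ggml_backend_buffer_get_type(dst->buffer));
        if (dev_backend == dev_buffer) {
            // devices match -> the copy can actually run asynchronously
            ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);
        } else {
            // mismatch -> this silently degrades to a synchronous copy
            ggml_backend_tensor_copy(src, dst);
        }
    }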

* Enable the previous allreduce implementation. It is better in both performance and stability (#12)

* delay AllReduce for MoE for less I/O

* build : clean-up compile warnings

* backend : move most of the meta backend API to ggml-backend-impl.h

* cont : hide unused public API in the implementation

* llama : use llama_device + remove ggml_backend_dev_is_meta()

* ggml-backend : remove unused alloc include

* minor : remove regex include

* ggml : introduce ggml-ext.h for staging new APIs

* rebase fixup

* fix tests

* llama : more robust logic for determining Meta devices (#16)

* llama : more robust logic for determining Meta devices

* cont : fix devs size check

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cont : fix log type

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* disable roundtrip for meta backend

* fix arch selection

* Qwen 3.5 support

* fix Gemma 4 MoE

* fix OpenVINO, SYCL

* fix test-llama-archs for CPU-only builds

* Fix Qwen 3.5 MoE

* disable meta backend tests for WebGPU

* tests : filter CPU-based devices from the Meta backend tests (#17)

* meta : formatting, naming, indentation (#18)

* formatting : llama-model.cpp

* formatting : ggml-ext.h

* formatting : ggml-backend-meta.cpp

* meta : add TODO

* add documentation

* better error messages

* fix GPT-OSS

---------

Co-authored-by: Carl Philipp Klemm <carl@uvos.xyz>
Co-authored-by: Gaurav Garg <gaugarg@nvidia.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
commit d6f3030047 (parent 009a113326)
Author: Johannes Gäßler
Date: 2026-04-09 16:42:19 +02:00
Committed via GitHub

48 changed files with 3198 additions and 342 deletions; the file shown below: +138 -97
@@ -6,6 +6,8 @@
 #include "ggml-cpp.h"
 #include "llama.h"
 #include "llama-cpp.h"
+
+// TODO: replace with #include "llama-ext.h" in the future
 #include "../src/llama-arch.h"
 #include "../src/llama-model-saver.h"
@@ -205,9 +207,9 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
     ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
     ms.add_kv(LLM_KV_XIELU_BETA, 1.0f);
     ms.add_kv(LLM_KV_XIELU_EPS, 1.0e-7f);
-    ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
+    ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 256 : 2*n_embd);
     ms.add_kv(LLM_KV_SSM_CONV_KERNEL, uint32_t(4));
-    ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(32));
+    ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(128));
     ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
     ms.add_kv(LLM_KV_SSM_GROUP_COUNT, arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
     ms.add_kv(LLM_KV_KDA_HEAD_DIM, uint32_t(128));
@@ -235,18 +237,23 @@ static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/)
 }

 static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
-    struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
+    struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs,
+    const llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER, bool encode = false) {
     GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr));

     llama_model_params model_params = llama_model_default_params();
     model_params.progress_callback = silent_model_load_progress;
     std::vector<ggml_backend_dev_t> devs_copy = devs;
     devs_copy.push_back(nullptr);
     model_params.devices = devs_copy.data();
+    model_params.split_mode = split_mode;

     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = 0;
     ctx_params.n_threads = 4;
     ctx_params.n_threads_batch = 4;
+    if (!encode) {
+        ctx_params.n_ubatch = 64;
+    }

     size_t tmp = seed;
     llama_model_ptr model(gguf_ctx != nullptr ?
@@ -357,6 +364,46 @@ static bool moe_implemented(const llm_arch arch) {
     }
 }

+static bool arch_supported(const llm_arch arch) {
+    if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
+        return false; // These models don't have usable implementations.
+    }
+    if (arch == LLM_ARCH_CHAMELEON) {
+        return false; // Only half-implemented and to be removed in the future.
+    }
+    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        return false; // FIXME CUDA backend crashes.
+    }
+    if (arch == LLM_ARCH_GEMMA4) {
+        return false; // FIXME @ngxson
+    }
+    if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
+        return false; // FIXME Embedding (?) models produce inconsistent results.
+    }
+    if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
+        return false; // FIXME RWKV models hang indefinitely.
+    }
+    if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
+        arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
+        return false; // TODO vocab
+    }
+    if (arch == LLM_ARCH_PLM) {
+        return false; // TODO tensor shapes
+    }
+    if (arch == LLM_ARCH_DEEPSEEK2OCR) {
+        return false;
+    }
+    // FIXME some models are segfaulting with WebGPU:
+#ifdef GGML_USE_WEBGPU
+    if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
+        return false;
+    }
+#endif // GGML_USE_WEBGPU
+    return true;
+}
+
 static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
     struct user_data_t {
         struct {
@@ -376,27 +423,11 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
     }, &ud);

     for (const llm_arch & arch : llm_arch_all()) {
-        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+        if (arch == LLM_ARCH_UNKNOWN) {
             continue;
         }
-        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
-            continue; // These models don't have usable implementations.
-        }
-        if (arch == LLM_ARCH_CHAMELEON) {
-            continue; // Only half-implemented and to be removed in the future.
-        }
-        if (arch == LLM_ARCH_GEMMA4) {
-            continue; // FIXME @ngxson
-        }
-        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
-            continue; // FIXME
-        }
-        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
-            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
-            continue; // TODO vocab
-        }
-        if (arch == LLM_ARCH_PLM) {
-            continue; // TODO tensor shapes
+        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+            continue;
         }
         for (bool moe : {false, true}) {
             if (moe && !moe_implemented(arch)) {
@@ -440,51 +471,47 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
     const std::vector<llama_token> tokens = get_tokens(128, 128, seed);

+    struct device_config {
+        std::vector<ggml_backend_dev_t> devs;
+        std::string label;
+        llama_split_mode split_mode;
+
+        device_config(std::vector<ggml_backend_dev_t> devs, std::string name, llama_split_mode split_mode)
+            : devs(std::move(devs)), label(std::move(name)), split_mode(split_mode) {}
+    };
+    std::vector<device_config> dev_configs;
+    {
+        std::vector<ggml_backend_dev_t> devices_meta;
+        {
+            const size_t device_count = ggml_backend_dev_count();
+            for (size_t i = 0; i < device_count; i++) {
+                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+                dev_configs.emplace_back(std::vector<ggml_backend_dev_t>{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER);
+
+                // cpu-based devices cannot be used in tensor split mode
+                if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) {
+                    devices_meta.push_back(dev);
+                }
+            }
+        }
+        dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR);
+    }
+
     bool all_ok = true;
     common_log_flush(common_log_main());
-    printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
-    printf("|---------------|------------------------------|------|---------------|---------|\n");
+    printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
+    printf("|----------------|------------------------------|------|---------------|---------|\n");
     for (const llm_arch & arch : llm_arch_all()) {
+        if (arch == LLM_ARCH_UNKNOWN) {
+            continue;
+        }
         if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
             continue;
         }
-        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
-            continue; // These models don't have usable implementations.
-        }
-        if (arch == LLM_ARCH_CHAMELEON) {
-            continue; // Only half-implemented and to be removed in the future.
-        }
-        if (arch == LLM_ARCH_GEMMA4) {
-            continue; // FIXME @ngxson
-        }
-        if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
-            continue; // FIXME CUDA backend crashes.
-        }
-        if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
-            continue; // FIXME Embedding (?) models produce inconsistent results.
-        }
-        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
-            continue; // FIXME RWKV models hang indefinitely.
-        }
-        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
-            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
-            continue; // TODO vocab
-        }
-        if (arch == LLM_ARCH_PLM) {
-            continue; // TODO tensor shapes
-        }
-        if (arch == LLM_ARCH_DEEPSEEK2OCR) {
-            continue; // TODO tensor shapes
-        }
-        // FIXME some models are segfaulting with WebGPU:
-#ifdef GGML_USE_WEBGPU
-        if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
-            continue;
-        }
-#endif // GGML_USE_WEBGPU
-        const bool encode = arch == LLM_ARCH_T5;
+        const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
         for (bool moe : {false, true}) {
             if (moe && !moe_implemented(arch)) {
                 continue;
@@ -492,50 +519,64 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
             if (!moe && moe_mandatory(arch)) {
                 continue;
             }
+            const std::string config_name = moe ? "MoE" : "Dense";
             gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
-            auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
-            const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
-            for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
-                    continue;
-                }
-                auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev});
-                std::string config_name = moe ? "MoE" : "Dense";
-                const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
-                const double nmse_val = nmse(logits_cpu, logits_dev);
-                char nmse_str[10];
-                snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
-                std::string status_nmse = "\033[1;32mOK\033[0m";
-                if (nmse_val > 1e-4) {
-                    all_ok = false;
-                    status_nmse = "\033[1;31mFAIL\033[0m";
-                }
-                FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
-                if (file != nullptr && llama_model_saver_supports_arch(arch)) {
-                    llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
-                    ms.add_kv_from_model();
-                    ms.add_tensors_from_model();
-                    ms.save(file);
-                    rewind(file);
-                    auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev});
-                    const std::vector<float> logits_roundtrip = get_logits(
-                        model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
-                    status_roundtrip = "\033[1;32mOK\033[0m";
-                    GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
-                    for (size_t i = 0; i < logits_roundtrip.size(); i++) {
-                        if (logits_roundtrip[i] != logits_dev[i]) {
-                            all_ok = false;
-                            status_roundtrip = "\033[1;31mFAIL\033[0m";
-                            break;
-                        }
-                    }
-                }
-                printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
+            std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
+            std::vector<float> logits_cpu;
+            for (device_config & dc : dev_configs) {
+                std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
+                std::vector<float> logits_dev;
+                std::string status_nmse      = "\033[1;33mSKIP\033[0m";
+                std::string status_roundtrip = "\033[1;33mSKIP\033[0m";
+                char nmse_str[12] = {0};
+                bool skip = !arch_supported(arch) || (dc.split_mode == LLAMA_SPLIT_MODE_TENSOR && dc.devs.empty());
+#if defined(GGML_USE_WEBGPU)
+                skip = true; // FIXME
+#endif // GGML_USE_WEBGPU
+                if (!skip) {
+                    if (logits_cpu.empty()) {
+                        model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}, LLAMA_SPLIT_MODE_LAYER, encode);
+                        logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
+                    }
+                    if (dc.split_mode != LLAMA_SPLIT_MODE_TENSOR || llm_arch_supports_sm_tensor(arch)) {
+                        model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, dc.devs, dc.split_mode, encode);
+                        logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
+                        const double nmse_val = nmse(logits_cpu, logits_dev);
+                        snprintf(nmse_str, sizeof(nmse_str), "(%.2e)", nmse_val);
+                        status_nmse = "\033[1;32mOK\033[0m";
+                        if (nmse_val > 1e-4) {
+                            all_ok = false;
+                            status_nmse = "\033[1;31mFAIL\033[0m";
+                        }
+                    }
+                }
+                FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
+                // FIXME: when adding a tensor to a gguf_context a copy is made, this changes the pointer which the meta backend
+                //        in turn uses to map the tensors to their simple equivalents - this is fundamentally incompatible
+                if (file != nullptr && llama_model_saver_supports_arch(arch) && dc.split_mode != LLAMA_SPLIT_MODE_TENSOR) {
+                    GGML_ASSERT(model_and_ctx_dev.first && model_and_ctx_dev.second);
+                    llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
+                    ms.add_kv_from_model();
+                    ms.add_tensors_from_model();
+                    ms.save(file);
+                    rewind(file);
+                    auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, dc.devs, dc.split_mode, encode);
+                    const std::vector<float> logits_roundtrip = get_logits(
+                        model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
+                    status_roundtrip = "\033[1;32mOK\033[0m";
+                    GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
+                    for (size_t i = 0; i < logits_roundtrip.size(); i++) {
+                        if (logits_roundtrip[i] != logits_dev[i]) {
+                            all_ok = false;
+                            status_roundtrip = "\033[1;31mFAIL\033[0m";
+                            break;
+                        }
+                    }
+                }
+                printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(),
                     config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
             }
         }