ggml: backend-agnostic tensor parallelism (experimental) (#19378)

* ggml: backend-agnostic tensor parallelism

* support for GPT-OSS, Qwen 3 MoE

* partial Vulkan fix

* add support for 4/8 GPUs

* unconditional peer access

* re-use buffers + ggml contexts

* fix output pattern

* NCCL support

* ggml: HIP: add RCCL support

* Remove shfl and AllReduce from backend interface

* move allocation workaround out of ggml-alloc.c

* 2d tensor set/get support

* Fix the seg fault without NCCL

* Apply suggestion from JohannesGaessler

* support for tensor dims % n_devs != 0

* fix view_offs scaling

* arbitrary num. of GPUs/tensor split

* fix compilation

* better granularity estimate

* Support device-specific host buffer types if all underlying backends expose the same type. This allows using pinned memory instead of pageable memory for CUDA.

Fix compilation errors.
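
A minimal sketch of the idea, written against the public ggml-backend device API; common_host_buffer_type is a hypothetical helper for illustration, not the actual meta backend code:

    // Expose a host buffer type only if every underlying device reports the
    // same one (e.g. CUDA pinned memory); otherwise return nullptr and fall
    // back to pageable memory.
    #include "ggml-backend.h"
    #include <vector>

    static ggml_backend_buffer_type_t common_host_buffer_type(
            const std::vector<ggml_backend_dev_t> & devs) {
        ggml_backend_buffer_type_t common = nullptr;
        for (ggml_backend_dev_t dev : devs) {
            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
            if (buft == nullptr) {
                return nullptr; // this backend has no host buffer type
            }
            if (common == nullptr) {
                common = buft;
            } else if (common != buft) {
                return nullptr; // backends disagree
            }
        }
        return common;
    }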

* partial Qwen 3 Next support

* Fix Qwen3 30B (#8)

* Fix crash with Qwen-30B-A3B Q4_0

Qwen-30B-A3B Q4_0 has an intermediate dimension of 768. Using a granularity of 256 forces an uneven split between GPUs, which the current implementation does not support (see the arithmetic sketch below).

* Decide block size based on tensor quantization type
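
For illustration, the arithmetic behind the two fixes above as a standalone sketch; only ggml_blck_size() is the real ggml API, the rest is hypothetical:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        const int64_t ne     = 768; // intermediate dim of Qwen-30B-A3B
        const int     n_devs = 2;

        // fixed granularity of 256 -> only 3 slices, cannot be split evenly
        printf("256: %lld slices, even split: %d\n",
               (long long) (ne/256), (ne/256) % n_devs == 0);

        // granularity from the quantization type instead: Q4_0 has a block
        // size of 32, giving 24 slices -> 12 per device
        const int64_t gran = ggml_blck_size(GGML_TYPE_Q4_0);
        printf("%lld: %lld slices, even split: %d\n",
               (long long) gran, (long long) (ne/gran), (ne/gran) % n_devs == 0);
        return 0;
    }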

* Fix crashes due to KV cache serialization (#9)

KV cache serialization requires non-zero offsets on the tensor. Add support in the meta backend to set/get a tensor with a non-zero offset.
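
A sketch of the access pattern this enables, using the public ggml_backend_tensor_set/get API (the helper, tensor, offset and size are illustrative):

    #include "ggml-backend.h"
    #include <cstdint>
    #include <vector>

    // KV cache (de)serialization reads/writes a sub-range of a tensor,
    // i.e. a set/get at a non-zero byte offset.
    static void roundtrip_slice(ggml_tensor * kv, size_t offset, size_t size) {
        std::vector<uint8_t> buf(size);
        ggml_backend_tensor_get(kv, buf.data(), offset, size); // serialize
        // ... write buf to disk, read it back ...
        ggml_backend_tensor_set(kv, buf.data(), offset, size); // restore
    }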

* metal : fix build (#7)

* static memory allocations, fix usage count

* fix tensor granularity

* more even memory distribution

* use BF16 for allreduce

* rebase fixup

* better error message for unsupported architectures

* Fix device mismatch during scatter of allReduce. (#11)

There is a mismatch between the dst buffer device and the backend device, causing sync copies to be used.
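
A sketch of the invariant behind the fix (illustrative only, not the actual scatter code): the async path is only taken when the dst buffer lives on the device of the backend issuing the copy.

    #include "ggml-backend.h"

    static void scatter_copy(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                             ggml_tensor * src, ggml_tensor * dst) {
        ggml_backend_dev_t dev_backend = ggml_backend_get_device(backend_dst);
        ggml_backend_dev_t dev_buffer  =
            ggml_backend_buft_get_device(ggml_backend_buffer_get_type(dst->buffer));
        if (dev_backend == dev_buffer) {
            // devices match -> the copy can actually run asynchronously
            ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);
        } else {
            // mismatch -> this silently degrades to a synchronous copy
            ggml_backend_tensor_copy(src, dst);
        }
    }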

* Enable the previous allreduce implementation. It is better in both performance and stability (#12)

* delay AllReduce for MoE for less I/O

* build : clean-up compile warnings

* backend : move most of the meta backend API to ggml-backend-impl.h

* cont : hide unused public API in the implementation

* llama : use llama_device + remove ggml_backend_dev_is_meta()

* ggml-backend : remove unused alloc include

* minor : remove regex include

* ggml : introduce ggml-ext.h for staging new APIs

* rebase fixup

* fix tests

* llama : more robust logic for determining Meta devices (#16)

* llama : more robust logic for determining Meta devices

* cont : fix devs size check

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cont : fix log type

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* disable roundtrip for meta backend

* fix arch selection

* Qwen 3.5 support

* fix Gemma 4 MoE

* fix OpenVINO, SYCL

* fix test-llama-archs for CPU-only builds

* Fix Qwen 3.5 MoE

* disable meta backend tests for WebGPU

* tests : filter CPU-based devices from the Meta backend tests (#17)

* meta : formatting, naming, indentation (#18)

* formatting : llama-model.cpp

* formatting : ggml-ext.h

* formatting : ggml-backend-meta.cpp

* meta : add TODO

* add documentation

* better error messages

* fix GPT-OSS

---------

Co-authored-by: Carl Philipp Klemm <carl@uvos.xyz>
Co-authored-by: Gaurav Garg <gaugarg@nvidia.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
commit d6f3030047 (parent 009a113326)
Author: Johannes Gäßler
Date: 2026-04-09 16:42:19 +02:00
Committed via GitHub

48 changed files with 3198 additions and 342 deletions; the file shown below: +138 -97
@@ -6,6 +6,8 @@
 #include "ggml-cpp.h"
 #include "llama.h"
 #include "llama-cpp.h"
+
+// TODO: replace with #include "llama-ext.h" in the future
 #include "../src/llama-arch.h"
 #include "../src/llama-model-saver.h"
@@ -205,9 +207,9 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
     ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
     ms.add_kv(LLM_KV_XIELU_BETA, 1.0f);
     ms.add_kv(LLM_KV_XIELU_EPS, 1.0e-7f);
-    ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
+    ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 256 : 2*n_embd);
     ms.add_kv(LLM_KV_SSM_CONV_KERNEL, uint32_t(4));
-    ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(32));
+    ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(128));
     ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
     ms.add_kv(LLM_KV_SSM_GROUP_COUNT, arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
     ms.add_kv(LLM_KV_KDA_HEAD_DIM, uint32_t(128));
@@ -235,18 +237,23 @@ static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/)
 }

 static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
-    struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
+    struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs,
+    const llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER, bool encode = false) {
     GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr));

     llama_model_params model_params = llama_model_default_params();
     model_params.progress_callback = silent_model_load_progress;
     std::vector<ggml_backend_dev_t> devs_copy = devs;
     devs_copy.push_back(nullptr);
     model_params.devices = devs_copy.data();
+    model_params.split_mode = split_mode;

     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = 0;
     ctx_params.n_threads = 4;
     ctx_params.n_threads_batch = 4;
+    if (!encode) {
+        ctx_params.n_ubatch = 64;
+    }

     size_t tmp = seed;
     llama_model_ptr model(gguf_ctx != nullptr ?
@@ -357,6 +364,46 @@ static bool moe_implemented(const llm_arch arch) {
     }
 }

+static bool arch_supported(const llm_arch arch) {
+    if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
+        return false; // These models don't have usable implementations.
+    }
+    if (arch == LLM_ARCH_CHAMELEON) {
+        return false; // Only half-implemented and to be removed in the future.
+    }
+    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        return false; // FIXME CUDA backend crashes.
+    }
+    if (arch == LLM_ARCH_GEMMA4) {
+        return false; // FIXME @ngxson
+    }
+    if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
+        return false; // FIXME Embedding (?) models produce inconsistent results.
+    }
+    if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
+        return false; // FIXME RWKV models hang indefinitely.
+    }
+    if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
+        arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
+        return false; // TODO vocab
+    }
+    if (arch == LLM_ARCH_PLM) {
+        return false; // TODO tensor shapes
+    }
+    if (arch == LLM_ARCH_DEEPSEEK2OCR) {
+        return false;
+    }
+    // FIXME some models are segfaulting with WebGPU:
+#ifdef GGML_USE_WEBGPU
+    if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
+        return false;
+    }
+#endif // GGML_USE_WEBGPU
+    return true;
+}
+
 static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
     struct user_data_t {
         struct {
@@ -376,27 +423,11 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
     }, &ud);

     for (const llm_arch & arch : llm_arch_all()) {
-        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+        if (arch == LLM_ARCH_UNKNOWN) {
             continue;
         }
-        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
-            continue; // These models don't have usable implementations.
-        }
-        if (arch == LLM_ARCH_CHAMELEON) {
-            continue; // Only half-implemented and to be removed in the future.
-        }
-        if (arch == LLM_ARCH_GEMMA4) {
-            continue; // FIXME @ngxson
-        }
-        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
-            continue; // FIXME
-        }
-        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
-            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
-            continue; // TODO vocab
-        }
-        if (arch == LLM_ARCH_PLM) {
-            continue; // TODO tensor shapes
+        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+            continue;
         }
         for (bool moe : {false, true}) {
             if (moe && !moe_implemented(arch)) {
@@ -440,51 +471,47 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
     const std::vector<llama_token> tokens = get_tokens(128, 128, seed);

+    struct device_config {
+        std::vector<ggml_backend_dev_t> devs;
+        std::string label;
+        llama_split_mode split_mode;
+
+        device_config(std::vector<ggml_backend_dev_t> devs, std::string name, llama_split_mode split_mode)
+            : devs(std::move(devs)), label(std::move(name)), split_mode(split_mode) {}
+    };
+    std::vector<device_config> dev_configs;
+    {
+        std::vector<ggml_backend_dev_t> devices_meta;
+        {
+            const size_t device_count = ggml_backend_dev_count();
+            for (size_t i = 0; i < device_count; i++) {
+                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+                dev_configs.emplace_back(std::vector<ggml_backend_dev_t>{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER);
+
+                // cpu-based devices cannot be used in tensor split mode
+                if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) {
+                    devices_meta.push_back(dev);
+                }
+            }
+        }
+        dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR);
+    }
+
     bool all_ok = true;
     common_log_flush(common_log_main());
-    printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
-    printf("|---------------|------------------------------|------|---------------|---------|\n");
+    printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
+    printf("|----------------|------------------------------|------|---------------|---------|\n");
     for (const llm_arch & arch : llm_arch_all()) {
+        if (arch == LLM_ARCH_UNKNOWN) {
+            continue;
+        }
         if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
             continue;
         }
-        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
-            continue; // These models don't have usable implementations.
-        }
-        if (arch == LLM_ARCH_CHAMELEON) {
-            continue; // Only half-implemented and to be removed in the future.
-        }
-        if (arch == LLM_ARCH_GEMMA4) {
-            continue; // FIXME @ngxson
-        }
-        if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
-            continue; // FIXME CUDA backend crashes.
-        }
-        if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
-            continue; // FIXME Embedding (?) models produce inconsistent results.
-        }
-        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
-            continue; // FIXME RWKV models hang indefinitely.
-        }
-        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
-            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
-            continue; // TODO vocab
-        }
-        if (arch == LLM_ARCH_PLM) {
-            continue; // TODO tensor shapes
-        }
-        if (arch == LLM_ARCH_DEEPSEEK2OCR) {
-            continue; // TODO tensor shapes
-        }
-        // FIXME some models are segfaulting with WebGPU:
-#ifdef GGML_USE_WEBGPU
-        if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
-            continue;
-        }
-#endif // GGML_USE_WEBGPU
-        const bool encode = arch == LLM_ARCH_T5;
+        const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
         for (bool moe : {false, true}) {
             if (moe && !moe_implemented(arch)) {
                 continue;
@@ -492,50 +519,64 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
             if (!moe && moe_mandatory(arch)) {
                 continue;
             }
+            const std::string config_name = moe ? "MoE" : "Dense";
             gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
-            auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
-            const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
-            for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
-                    continue;
-                }
-                auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev});
-                std::string config_name = moe ? "MoE" : "Dense";
-                const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
-                const double nmse_val = nmse(logits_cpu, logits_dev);
-                char nmse_str[10];
-                snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
-                std::string status_nmse = "\033[1;32mOK\033[0m";
-                if (nmse_val > 1e-4) {
-                    all_ok = false;
-                    status_nmse = "\033[1;31mFAIL\033[0m";
-                }
-                FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
-                if (file != nullptr && llama_model_saver_supports_arch(arch)) {
-                    llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
-                    ms.add_kv_from_model();
-                    ms.add_tensors_from_model();
-                    ms.save(file);
-                    rewind(file);
-                    auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev});
-                    const std::vector<float> logits_roundtrip = get_logits(
-                        model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
-                    status_roundtrip = "\033[1;32mOK\033[0m";
-                    GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
-                    for (size_t i = 0; i < logits_roundtrip.size(); i++) {
-                        if (logits_roundtrip[i] != logits_dev[i]) {
-                            all_ok = false;
-                            status_roundtrip = "\033[1;31mFAIL\033[0m";
-                            break;
-                        }
-                    }
-                }
-                printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
+            std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
+            std::vector<float> logits_cpu;
+            for (device_config & dc : dev_configs) {
+                std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
+                std::vector<float> logits_dev;
+                std::string status_nmse      = "\033[1;33mSKIP\033[0m";
+                std::string status_roundtrip = "\033[1;33mSKIP\033[0m";
+                char nmse_str[12] = {0};
+                bool skip = !arch_supported(arch) || (dc.split_mode == LLAMA_SPLIT_MODE_TENSOR && dc.devs.empty());
+#if defined(GGML_USE_WEBGPU)
+                skip = true; // FIXME
+#endif // GGML_USE_WEBGPU
+                if (!skip) {
+                    if (logits_cpu.empty()) {
+                        model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}, LLAMA_SPLIT_MODE_LAYER, encode);
+                        logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
+                    }
+                    if (dc.split_mode != LLAMA_SPLIT_MODE_TENSOR || llm_arch_supports_sm_tensor(arch)) {
+                        model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, dc.devs, dc.split_mode, encode);
+                        logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
+                        const double nmse_val = nmse(logits_cpu, logits_dev);
+                        snprintf(nmse_str, sizeof(nmse_str), "(%.2e)", nmse_val);
+                        status_nmse = "\033[1;32mOK\033[0m";
+                        if (nmse_val > 1e-4) {
+                            all_ok = false;
+                            status_nmse = "\033[1;31mFAIL\033[0m";
+                        }
+                    }
+                }
+                FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
+                // FIXME: when adding a tensor to a gguf_context a copy is made, this changes the pointer which the meta backend
+                //        in turn uses to map the tensors to their simple equivalents - this is fundamentally incompatible
+                if (file != nullptr && llama_model_saver_supports_arch(arch) && dc.split_mode != LLAMA_SPLIT_MODE_TENSOR) {
+                    GGML_ASSERT(model_and_ctx_dev.first && model_and_ctx_dev.second);
+                    llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
+                    ms.add_kv_from_model();
+                    ms.add_tensors_from_model();
+                    ms.save(file);
+                    rewind(file);
+                    auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, dc.devs, dc.split_mode, encode);
+                    const std::vector<float> logits_roundtrip = get_logits(
+                        model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
+                    status_roundtrip = "\033[1;32mOK\033[0m";
+                    GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
+                    for (size_t i = 0; i < logits_roundtrip.size(); i++) {
+                        if (logits_roundtrip[i] != logits_dev[i]) {
+                            all_ok = false;
+                            status_roundtrip = "\033[1;31mFAIL\033[0m";
+                            break;
+                        }
+                    }
+                }
+                printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(),
                     config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
             }
         }