ggml: backend-agnostic tensor parallelism (experimental) (#19378)
* ggml: backend-agnostic tensor parallelism
* support for GPT-OSS, Qwen 3 MoE
* partial Vulkan fix
* add support for 4/8 GPUs
* unconditional peer access
* re-use buffers + ggml contexts
* fix output pattern
* NCCL support
* GGML: HIP: add RCCL support
* Remove shfl and AllReduce from backend interface
* move allocation workaround out of ggml-alloc.c
* 2d tensor set/get support
* Fix the segfault without NCCL
* Apply suggestion from JohannesGaessler
* support for tensor dims % n_devs != 0
* fix view_offs scaling
* arbitrary num. of GPUs/tensor split
* fix compilation
* better granularity estimate
* Support device-specific host buffer types if all underlying backends expose the same type. This allows using pinned memory instead of pageable memory for CUDA. Fix compilation errors.
* partial Qwen 3 Next support
* Fix qwen3 30b (#8):
  * Fix crash with Qwen-30B-A3B Q4_0: Qwen-30B-A3B Q4_0 has an intermediate dimension of 768. Using a granularity of 256 forces an uneven split between GPUs, which is not supported by the current implementation.
  * Decide block size based on tensor quantization type
* Fix crashes due to KV cache serialization (#9): KV cache serialization requires non-zero offsets on the tensor. Add support in the meta backend to set/get a tensor with a non-zero offset.
* metal : fix build (#7)
* static memory allocations, fix usage count
* fix tensor granularity
* more even memory distribution
* use BF16 for allreduce
* rebase fixup
* better error message for unsupported architectures
* Fix device mismatch during scatter of allReduce (#11): there is a mismatch between the dst buffer device and the backend device, causing the use of sync copies.
* Enable the previous allreduce implementation. It is better in both perf and stability (#12)
* delay AllReduce for MoE for less I/O
* build : clean-up compile warnings
* backend : move most of the meta backend API to ggml-backend-impl.h
* cont : hide unused public API in the implementation
* llama : use llama_device + remove ggml_backend_dev_is_meta()
* ggml-backend : remove unused alloc include
* minor : remove regex include
* ggml : introduce ggml-ext.h for staging new APIs
* rebase fixup
* fix tests
* llama : more robust logic for determining Meta devices (#16)
  * cont : fix devs size check
  * cont : fix log type
* disable roundtrip for meta backend
* fix arch selection
* Qwen 3.5 support
* fix Gemma 4 MoE
* fix OpenVINO, SYCL
* fix test-llama-archs for CPU-only builds
* Fix Qwen 3.5 MoE
* disable meta backend tests for WebGPU
* tests : filter CPU-based devices from the Meta backend tests (#17)
* meta : formatting, naming, indentation (#18)
  * formatting : llama-model.cpp
  * formatting : ggml-ext.h
  * formatting : ggml-backend-meta.cpp
* meta : add TODO
* add documentation
* better error messages
* fix GPT-OSS

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Carl Philipp Klemm <carl@uvos.xyz>
Co-authored-by: Gaurav Garg <gaugarg@nvidia.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
138 additions and 97 deletions
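Several of the entries above concern how a weight tensor is sharded when its dimension is not divisible by the device count ("support for tensor dims % n_devs != 0", "better granularity estimate", "Decide block size based on tensor quantization type"): each shard has to stay aligned to the tensor's quantization block size, so a coarse granularity can force an uneven split, which is what crashed Qwen-30B-A3B Q4_0 with its intermediate dimension of 768. A minimal sketch of that sizing logic, with a hypothetical split_rows helper (names and rounding policy are illustrative, not the actual ggml code):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Split n_rows across n_devs devices so that every shard is a multiple of
    // the quantization block size ("granularity"); earlier devices absorb the
    // remainder blocks, so the split can be uneven when dims % n_devs != 0.
    static std::vector<int64_t> split_rows(int64_t n_rows, int n_devs, int64_t granularity) {
        std::vector<int64_t> shares(n_devs, 0);
        const int64_t n_blocks = n_rows / granularity; // n_rows must be block-aligned
        for (int i = 0; i < n_devs; i++) {
            const int64_t blocks_i = n_blocks / n_devs + (i < n_blocks % n_devs ? 1 : 0);
            shares[i] = blocks_i * granularity;
        }
        return shares;
    }

    int main() {
        // The dimension from the Qwen-30B-A3B Q4_0 fix: 768 rows over 2 devices.
        // A granularity of 256 yields 512 + 256 (uneven, the unsupported case);
        // a block size taken from the quantization type, e.g. 32, yields 384 + 384.
        for (int64_t gran : {256, 32}) {
            const std::vector<int64_t> s = split_rows(768, 2, gran);
            printf("granularity %3lld -> %lld + %lld\n", (long long) gran, (long long) s[0], (long long) s[1]);
        }
        return 0;
    }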
@@ -6,6 +6,8 @@
 #include "ggml-cpp.h"
 #include "llama.h"
 #include "llama-cpp.h"
+
+// TODO: replace with #include "llama-ext.h" in the future
 #include "../src/llama-arch.h"
 #include "../src/llama-model-saver.h"
 
@@ -205,9 +207,9 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
     ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
     ms.add_kv(LLM_KV_XIELU_BETA, 1.0f);
     ms.add_kv(LLM_KV_XIELU_EPS, 1.0e-7f);
-    ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
+    ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 256 : 2*n_embd);
     ms.add_kv(LLM_KV_SSM_CONV_KERNEL, uint32_t(4));
-    ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(32));
+    ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(128));
     ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
     ms.add_kv(LLM_KV_SSM_GROUP_COUNT, arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
     ms.add_kv(LLM_KV_KDA_HEAD_DIM, uint32_t(128));
@@ -235,18 +237,23 @@ static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/)
 }
 
 static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
-        struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
+        struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs,
+        const llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER, bool encode = false) {
     GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr));
     llama_model_params model_params = llama_model_default_params();
     model_params.progress_callback = silent_model_load_progress;
     std::vector<ggml_backend_dev_t> devs_copy = devs;
     devs_copy.push_back(nullptr);
     model_params.devices = devs_copy.data();
+    model_params.split_mode = split_mode;
 
     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = 0;
     ctx_params.n_threads = 4;
     ctx_params.n_threads_batch = 4;
+    if (!encode) {
+        ctx_params.n_ubatch = 64;
+    }
 
     size_t tmp = seed;
     llama_model_ptr model(gguf_ctx != nullptr ?
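The split mode added to get_model_and_ctx above flows straight into llama_model_params. A minimal standalone sketch of the same pattern (the helper name and model path are illustrative; LLAMA_SPLIT_MODE_TENSOR and the null-terminated devices array follow the diff above):

    #include <vector>
    #include "llama.h"

    // Load a model with the new tensor split mode over an explicit device list.
    static llama_model * load_tensor_parallel(const char * path, std::vector<ggml_backend_dev_t> devs) {
        llama_model_params mparams = llama_model_default_params();
        devs.push_back(nullptr); // the devices array is null-terminated
        mparams.devices    = devs.data();
        mparams.split_mode = LLAMA_SPLIT_MODE_TENSOR;
        return llama_model_load_from_file(path, mparams);
    }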
@@ -357,6 +364,46 @@ static bool moe_implemented(const llm_arch arch) {
     }
 }
 
+static bool arch_supported(const llm_arch arch) {
+    if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
+        return false; // These models don't have usable implementations.
+    }
+    if (arch == LLM_ARCH_CHAMELEON) {
+        return false; // Only half-implemented and to be removed in the future.
+    }
+    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        return false; // FIXME CUDA backend crashes.
+    }
+    if (arch == LLM_ARCH_GEMMA4) {
+        return false; // FIXME @ngxson
+    }
+    if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
+        return false; // FIXME Embedding (?) models produce inconsistent results.
+    }
+    if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
+        return false; // FIXME RWKV models hang indefinitely.
+    }
+    if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
+        arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
+        return false; // TODO vocab
+    }
+    if (arch == LLM_ARCH_PLM) {
+        return false; // TODO tensor shapes
+    }
+    if (arch == LLM_ARCH_DEEPSEEK2OCR) {
+        return false;
+    }
+
+    // FIXME some models are segfaulting with WebGPU:
+#ifdef GGML_USE_WEBGPU
+    if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
+        return false;
+    }
+#endif // GGML_USE_WEBGPU
+
+    return true;
+}
+
 static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
     struct user_data_t {
         struct {
@@ -376,27 +423,11 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
     }, &ud);
 
     for (const llm_arch & arch : llm_arch_all()) {
         if (arch == LLM_ARCH_UNKNOWN) {
             continue;
         }
-        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
-            continue; // These models don't have usable implementations.
-        }
-        if (arch == LLM_ARCH_CHAMELEON) {
-            continue; // Only half-implemented and to be removed in the future.
-        }
-        if (arch == LLM_ARCH_GEMMA4) {
-            continue; // FIXME @ngxson
-        }
-        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
-            continue; // FIXME
-        }
-        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
-            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
-            continue; // TODO vocab
-        }
-        if (arch == LLM_ARCH_PLM) {
-            continue; // TODO tensor shapes
-        }
         if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
             continue;
         }
         for (bool moe : {false, true}) {
             if (moe && !moe_implemented(arch)) {
@@ -440,51 +471,47 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
 
     const std::vector<llama_token> tokens = get_tokens(128, 128, seed);
 
+    struct device_config {
+        std::vector<ggml_backend_dev_t> devs;
+        std::string label;
+        llama_split_mode split_mode;
+
+        device_config(std::vector<ggml_backend_dev_t> devs, std::string name, llama_split_mode split_mode)
+            : devs(std::move(devs)), label(std::move(name)), split_mode(split_mode) {}
+    };
+
+    std::vector<device_config> dev_configs;
+    {
+        std::vector<ggml_backend_dev_t> devices_meta;
+        {
+            const size_t device_count = ggml_backend_dev_count();
+            for (size_t i = 0; i < device_count; i++) {
+                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+                dev_configs.emplace_back(std::vector<ggml_backend_dev_t>{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER);
+
+                // cpu-based devices cannot be used in tensor split mode
+                if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) {
+                    devices_meta.push_back(dev);
+                }
+            }
+        }
+
+        dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR);
+    }
+
     bool all_ok = true;
     common_log_flush(common_log_main());
-    printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
-    printf("|---------------|------------------------------|------|---------------|---------|\n");
+    printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
+    printf("|----------------|------------------------------|------|---------------|---------|\n");
     for (const llm_arch & arch : llm_arch_all()) {
         if (arch == LLM_ARCH_UNKNOWN) {
             continue;
         }
         if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
             continue;
         }
-        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
-            continue; // These models don't have usable implementations.
-        }
-        if (arch == LLM_ARCH_CHAMELEON) {
-            continue; // Only half-implemented and to be removed in the future.
-        }
-        if (arch == LLM_ARCH_GEMMA4) {
-            continue; // FIXME @ngxson
-        }
-        if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
-            continue; // FIXME CUDA backend crashes.
-        }
-        if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
-            continue; // FIXME Embedding (?) models produce inconsistent results.
-        }
-        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
-            continue; // FIXME RWKV models hang indefinitely.
-        }
-        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
-            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
-            continue; // TODO vocab
-        }
-        if (arch == LLM_ARCH_PLM) {
-            continue; // TODO tensor shapes
-        }
-        if (arch == LLM_ARCH_DEEPSEEK2OCR) {
-            continue; // TODO tensor shapes
-        }
-
-        // FIXME some models are segfaulting with WebGPU:
-#ifdef GGML_USE_WEBGPU
-        if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
-            continue;
-        }
-#endif // GGML_USE_WEBGPU
-
-        const bool encode = arch == LLM_ARCH_T5;
+        const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
         for (bool moe : {false, true}) {
             if (moe && !moe_implemented(arch)) {
                 continue;
@@ -492,50 +519,64 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
             if (!moe && moe_mandatory(arch)) {
                 continue;
             }
+            const std::string config_name = moe ? "MoE" : "Dense";
             gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
-            auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
-            const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
-            for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
-                    continue;
-                }
-                auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev});
-                std::string config_name = moe ? "MoE" : "Dense";
-                const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
-                const double nmse_val = nmse(logits_cpu, logits_dev);
-                char nmse_str[10];
-                snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
-                std::string status_nmse = "\033[1;32mOK\033[0m";
-                if (nmse_val > 1e-4) {
-                    all_ok = false;
-                    status_nmse = "\033[1;31mFAIL\033[0m";
-                }
-
-                FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
-                if (file != nullptr && llama_model_saver_supports_arch(arch)) {
-                    llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
-                    ms.add_kv_from_model();
-                    ms.add_tensors_from_model();
-                    ms.save(file);
-                    rewind(file);
-
-                    auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev});
-                    const std::vector<float> logits_roundtrip = get_logits(
-                        model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
-                    status_roundtrip = "\033[1;32mOK\033[0m";
-                    GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
-                    for (size_t i = 0; i < logits_roundtrip.size(); i++) {
-                        if (logits_roundtrip[i] != logits_dev[i]) {
-                            all_ok = false;
-                            status_roundtrip = "\033[1;31mFAIL\033[0m";
-                            break;
-                        }
-                    }
-                }
-
-                printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
+
+            std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
+            std::vector<float> logits_cpu;
+            for (device_config & dc : dev_configs) {
+                std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
+                std::vector<float> logits_dev;
+                std::string status_nmse = "\033[1;33mSKIP\033[0m";
+                std::string status_roundtrip = "\033[1;33mSKIP\033[0m";
+                char nmse_str[12] = {0};
+                bool skip = !arch_supported(arch) || (dc.split_mode == LLAMA_SPLIT_MODE_TENSOR && dc.devs.empty());
+#if defined(GGML_USE_WEBGPU)
+                skip = true; // FIXME
+#endif // GGML_USE_WEBGPU
+                if (!skip) {
+                    if (logits_cpu.empty()) {
+                        model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}, LLAMA_SPLIT_MODE_LAYER, encode);
+                        logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
+                    }
+                    if (dc.split_mode != LLAMA_SPLIT_MODE_TENSOR || llm_arch_supports_sm_tensor(arch)) {
+                        model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, dc.devs, dc.split_mode, encode);
+                        logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
+                        const double nmse_val = nmse(logits_cpu, logits_dev);
+                        snprintf(nmse_str, sizeof(nmse_str), "(%.2e)", nmse_val);
+                        status_nmse = "\033[1;32mOK\033[0m";
+                        if (nmse_val > 1e-4) {
+                            all_ok = false;
+                            status_nmse = "\033[1;31mFAIL\033[0m";
+                        }
+                    }
+
+                    FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
+                    // FIXME: when adding a tensor to a gguf_context a copy is made, this changes the pointer which the meta backend
+                    //     in turn uses to map the tensors to their simple equivalents - this is fundamentally incompatible
+                    if (file != nullptr && llama_model_saver_supports_arch(arch) && dc.split_mode != LLAMA_SPLIT_MODE_TENSOR) {
+                        GGML_ASSERT(model_and_ctx_dev.first && model_and_ctx_dev.second);
+                        llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
+                        ms.add_kv_from_model();
+                        ms.add_tensors_from_model();
+                        ms.save(file);
+                        rewind(file);
+
+                        auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, dc.devs, dc.split_mode, encode);
+                        const std::vector<float> logits_roundtrip = get_logits(
+                            model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
+                        status_roundtrip = "\033[1;32mOK\033[0m";
+                        GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
+                        for (size_t i = 0; i < logits_roundtrip.size(); i++) {
+                            if (logits_roundtrip[i] != logits_dev[i]) {
+                                all_ok = false;
+                                status_roundtrip = "\033[1;31mFAIL\033[0m";
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(),
                     config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
             }
         }
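A note on the pass/fail criterion in the table: a configuration fails when the normalized mean squared error between its logits and the CPU reference exceeds 1e-4. The nmse helper itself comes from the shared test utilities; sketched here under the usual definition (squared error normalized by the squared magnitude of the reference), assuming equal-length vectors:

    #include <cstddef>
    #include <vector>

    // NMSE of device logits against the CPU reference; smaller is better,
    // with 1e-4 used as the FAIL threshold in the table above.
    static double nmse(const std::vector<float> & ref, const std::vector<float> & dev) {
        double sum_err = 0.0;
        double sum_ref = 0.0;
        for (size_t i = 0; i < ref.size(); i++) {
            const double d = (double) ref[i] - (double) dev[i];
            sum_err += d * d;
            sum_ref += (double) ref[i] * (double) ref[i];
        }
        return sum_err / sum_ref;
    }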