diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 3c06aeaff..4a8f6d428 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -348,6 +348,53 @@ extern "C" { // Set a callback to be called for each resulting node during graph compute GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + // + // Meta backend + // + +#define GGML_BACKEND_META_MAX_DEVICES 16 + + enum ggml_backend_meta_split_axis { + // tensor split by tensor dimensions: + GGML_BACKEND_SPLIT_AXIS_0 = 0, + GGML_BACKEND_SPLIT_AXIS_1 = 1, + GGML_BACKEND_SPLIT_AXIS_2 = 2, + GGML_BACKEND_SPLIT_AXIS_3 = 3, + + GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends + GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum + + // for internal bookkeeping only: + GGML_BACKEND_SPLIT_AXIS_NONE = 98, + GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99, + }; + GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis); + + struct ggml_backend_meta_split_state { + enum ggml_backend_meta_split_axis axis; + + // for tensors with axis >= 0 && axis < GGML_MAX_DIMS: + // - each device has a slice of the tensor along the split axis + // - most tensors have n_segments == 1 and a contiguous slice of the tensor data + // - some tensors have an inhomogeneous data layout along the split axis, + // those tensors are divided into segments which are each individually split across devices + // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis, + // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1], + // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments + // that each need to be split individually across devices so that each device gets a slice of Q, K, and V + int64_t 
ne[16*GGML_BACKEND_META_MAX_DEVICES]; + uint32_t n_segments; + }; + + // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible: + typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata); + + // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this: + // TODO: this looks a bit strange - a backend API creates a device. I think we should try to + // express this as a backend registry functionality instead + GGML_API ggml_backend_dev_t ggml_backend_meta_device( + ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud); + // // Utils // diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index e9b70398f..a4b01ccf8 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -2,6 +2,7 @@ #include "ggml-backend-impl.h" #include "ggml.h" #include "ggml-impl.h" + #include #include #include diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index a2ab8872c..0a8eea4e9 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -5,9 +5,6 @@ #include "ggml-alloc.h" #include "ggml-cpp.h" -// TODO: tmp -#include "ggml-ext.h" - #include #include #include diff --git a/ggml/src/ggml-ext.h b/ggml/src/ggml-ext.h deleted file mode 100644 index 56b0e6d31..000000000 --- a/ggml/src/ggml-ext.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -// This is a "staging" header for new ggml API -// It is not publicly available and it should not be used by 3rd party projects -// -// When the API matures enough, it will be moved to the official public API - -// -// Meta backend -// - -#define GGML_BACKEND_META_MAX_DEVICES 16 - -enum ggml_backend_meta_split_axis { - // tensor split by tensor dimensions: - 
GGML_BACKEND_SPLIT_AXIS_0 = 0, - GGML_BACKEND_SPLIT_AXIS_1 = 1, - GGML_BACKEND_SPLIT_AXIS_2 = 2, - GGML_BACKEND_SPLIT_AXIS_3 = 3, - - GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends - GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum - - // for internal bookkeeping only: - GGML_BACKEND_SPLIT_AXIS_NONE = 98, - GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99, -}; -GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis); - -struct ggml_backend_meta_split_state { - enum ggml_backend_meta_split_axis axis; - - // for tensors with axis >= 0 && axis < GGML_MAX_DIMS: - // - each device has a slice of the tensor along the split axis - // - most tensors have n_segments == 1 and a contiguous slice of the tensor data - // - some tensors have an inhomogenenous data layout along the split axis, - // those tensors are divided into segments which are each individually split across devices - // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis, - // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1], - // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments - // that each need to be split individually across devices so that each device gets a slice of Q, K, and V - int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES]; - uint32_t n_segments; -}; - -// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible: -typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata); - -// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this: -// TODO: this looks a bit strange - a backend API creates a device. 
I think we should try -// express this as a backend registry functionality instead -GGML_API ggml_backend_dev_t ggml_backend_meta_device( - ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d2ffc1f45..b265394ef 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18,9 +18,6 @@ #include "ggml.h" #include "ggml-cpp.h" -// TODO: tmp until the ggml meta backend matures and becomes public -#include "../src/ggml-ext.h" - #include #include #include diff --git a/src/llama.cpp b/src/llama.cpp index ce5752467..484372d8d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15,9 +15,6 @@ #include "ggml-backend.h" #include "gguf.h" -// TODO: tmp until the ggml meta backend matures and becomes public -#include "../src/ggml-ext.h" - #include #include #include