diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 3c06aeaff..4a8f6d428 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -348,6 +348,53 @@ extern "C" {
     // Set a callback to be called for each resulting node during graph compute
     GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
+    //
+    // Meta backend
+    //
+
+#define GGML_BACKEND_META_MAX_DEVICES 16
+
+    enum ggml_backend_meta_split_axis {
+        // tensor is split along the tensor dimension with this index:
+        GGML_BACKEND_SPLIT_AXIS_0 = 0,
+        GGML_BACKEND_SPLIT_AXIS_1 = 1,
+        GGML_BACKEND_SPLIT_AXIS_2 = 2,
+        GGML_BACKEND_SPLIT_AXIS_3 = 3,
+
+        GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
+        GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
+
+        // for internal bookkeeping only:
+        GGML_BACKEND_SPLIT_AXIS_NONE    = 98,
+        GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
+    };
+    GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
+
+    struct ggml_backend_meta_split_state {
+        enum ggml_backend_meta_split_axis axis;
+
+        // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
+        //   - each device has a slice of the tensor along the split axis
+        //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
+        //   - some tensors have an inhomogeneous data layout along the split axis,
+        //     those tensors are divided into segments which are each individually split across devices
+        //   - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
+        //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1]
+        //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
+        //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
+        int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES]; // NOTE(review): the factor 16 presumably caps n_segments - confirm
+        uint32_t n_segments;                           // number of segments along the split axis
+    };
+
+    // callback that assigns split states to statically allocated tensors; split states of compute tensors will be assigned to be compatible:
+    typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
+
+    // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
+    // TODO: this looks a bit strange - a backend API creates a device. I think we should try
+    //       to express this as a backend registry functionality instead
+    GGML_API ggml_backend_dev_t ggml_backend_meta_device(
+        ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+
     //
     // Utils
     //
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index e9b70398f..a4b01ccf8 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -2,6 +2,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml.h"
 #include "ggml-impl.h"
+
 #include <assert.h>
 #include <limits.h>
 #include <stdarg.h>
diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index a2ab8872c..0a8eea4e9 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -5,9 +5,6 @@
 #include "ggml-alloc.h"
 #include "ggml-cpp.h"
 
-// TODO: tmp
-#include "ggml-ext.h"
-
 #include <algorithm>
 #include <cassert>
 #include <cmath>
diff --git a/ggml/src/ggml-ext.h b/ggml/src/ggml-ext.h
deleted file mode 100644
index 56b0e6d31..000000000
--- a/ggml/src/ggml-ext.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-// This is a "staging" header for new ggml API
-// It is not publicly available and it should not be used by 3rd party projects
-//
-// When the API matures enough, it will be moved to the official public API
-
-//
-// Meta backend
-//
-
-#define GGML_BACKEND_META_MAX_DEVICES 16
-
-enum ggml_backend_meta_split_axis {
-    // tensor split by tensor dimensions:
-    GGML_BACKEND_SPLIT_AXIS_0   =  0,
-    GGML_BACKEND_SPLIT_AXIS_1   =  1,
-    GGML_BACKEND_SPLIT_AXIS_2   =  2,
-    GGML_BACKEND_SPLIT_AXIS_3   =  3,
-
-    GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
-    GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
-
-    // for internal bookkeeping only:
-    GGML_BACKEND_SPLIT_AXIS_NONE     = 98,
-    GGML_BACKEND_SPLIT_AXIS_UNKNOWN  = 99,
-};
-GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
-
-struct ggml_backend_meta_split_state {
-    enum ggml_backend_meta_split_axis axis;
-
-    // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
-    //   - each device has a slice of the tensor along the split axis
-    //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
-    //   - some tensors have an inhomogenenous data layout along the split axis,
-    //     those tensors are divided into segments which are each individually split across devices
-    //   - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
-    //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
-    //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
-    //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
-    int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
-    uint32_t n_segments;
-};
-
-// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
-typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
-
-// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
-// TODO: this looks a bit strange - a backend API creates a device. I think we should try
-//       express this as a backend registry functionality instead
-GGML_API ggml_backend_dev_t ggml_backend_meta_device(
-    ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d2ffc1f45..b265394ef 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -18,9 +18,6 @@
 #include "ggml.h"
 #include "ggml-cpp.h"
 
-// TODO: tmp until the ggml meta backend matures and becomes public
-#include "../src/ggml-ext.h"
-
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
diff --git a/src/llama.cpp b/src/llama.cpp
index ce5752467..484372d8d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15,9 +15,6 @@
 #include "ggml-backend.h"
 #include "gguf.h"
 
-// TODO: tmp until the ggml meta backend matures and becomes public
-#include "../src/ggml-ext.h"
-
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>