llama: end-to-end tests (#19802)
* tests: add end-to-end tests per model architecture * fixup for rebase * fix use-after-free in llama-model-loader.cpp * fix CI * fix WebGPU * fix CI * disable CI for macOS-latest-cmake-arm64 * use expert_weights_scale only if != 0.0f * comments
This commit is contained in:
@@ -4,17 +4,22 @@
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <unordered_map>
|
||||
|
||||
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
||||
|
||||
// lists of buffer types used for each layer
|
||||
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
||||
|
||||
enum llama_fver {
|
||||
GGUF_FILE_VERSION_V1 = 1,
|
||||
GGUF_FILE_VERSION_V2 = 2,
|
||||
@@ -58,9 +63,10 @@ struct llama_model_loader {
|
||||
}
|
||||
};
|
||||
|
||||
static const int TENSOR_NOT_REQUIRED = 1 << 0;
|
||||
static const int TENSOR_DUPLICATED = 1 << 1;
|
||||
static const int TENSOR_SKIP = 1 << 2;
|
||||
static const int TENSOR_NOT_REQUIRED = 1 << 0;
|
||||
static const int TENSOR_DUPLICATED = 1 << 1;
|
||||
static const int TENSOR_SKIP = 1 << 2;
|
||||
static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
|
||||
|
||||
int n_kv = 0;
|
||||
int n_tensors = 0;
|
||||
@@ -84,7 +90,10 @@ struct llama_model_loader {
|
||||
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
|
||||
const llama_model_tensor_buft_override * tensor_buft_overrides;
|
||||
|
||||
gguf_context_ptr meta;
|
||||
gguf_context_ptr metadata_ptr;
|
||||
struct gguf_context * metadata; // either metadata_ptr.get() or externally set
|
||||
llama_model_set_tensor_data_t set_tensor_data;
|
||||
void * set_tensor_data_ud;
|
||||
std::vector<ggml_context_ptr> contexts;
|
||||
|
||||
std::string arch_name;
|
||||
@@ -94,7 +103,26 @@ struct llama_model_loader {
|
||||
size_t size_data = 0;
|
||||
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
||||
|
||||
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
||||
struct ggml_backend_buft_comparator {
|
||||
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
|
||||
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
|
||||
}
|
||||
};
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
|
||||
|
||||
// track tensors that had to be moved for debugging:
|
||||
size_t n_tensors_moved = 0;
|
||||
std::string first_tensor_moved_name;
|
||||
std::string first_tensor_moved_type_name;
|
||||
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
||||
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
||||
|
||||
llama_model_loader(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
@@ -149,7 +177,9 @@ struct llama_model_loader {
|
||||
|
||||
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
|
||||
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
|
||||
struct ggml_tensor * create_tensor(
|
||||
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
|
||||
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
|
||||
|
||||
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user