test-backend-ops: allow loading tests from file and parsing model operators into file (#19896)
* tests: allow loading test-backend-ops tests from json
* add error threshold based on op
* add error when file cannot be read
* add graph operator json extraction tool
* add nb parameter for non-contiguous input tensors
* fix view check
* only use view if non-contiguous/permuted, use C++ random instead of rand()
* replace internal API calls with public llama_graph_reserve call
* reduce test description length
* fix nb[0] not getting set for view
* add name to tests
* fix inplace error
* use text file instead of json
* move llama_graph_reserve function to new llama-ext header, move export-graph-ops to tests/
* fix missing declaration
* use pragma once
* fix indent
* fix Windows build
This commit is contained in:
+328
-12
@@ -31,10 +31,12 @@
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <future>
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <thread>
|
||||
@@ -6648,6 +6650,236 @@ struct test_diag : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// Deserializable generic test case
|
||||
// Deserialized description of a single source (input) tensor of a test case
// loaded from a test file.
struct input_tensor {
    ggml_type type;            // element type of the tensor
    std::array<int64_t, 4> ne; // number of elements per dimension
    std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
};
|
||||
|
||||
static bool is_non_contiguous(const input_tensor & src) {
|
||||
if (src.nb[0] == 0) {
|
||||
return false;
|
||||
}
|
||||
const size_t default_nb0 = ggml_type_size(src.type);
|
||||
const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
|
||||
const size_t default_nb2 = default_nb1 * src.ne[1];
|
||||
const size_t default_nb3 = default_nb2 * src.ne[2];
|
||||
return src.nb[0] != default_nb0 ||
|
||||
src.nb[1] != default_nb1 ||
|
||||
src.nb[2] != default_nb2 ||
|
||||
src.nb[3] != default_nb3;
|
||||
}
|
||||
|
||||
static std::string var_to_str(const std::vector<input_tensor>& sources) {
|
||||
std::ostringstream oss;
|
||||
bool first = true;
|
||||
for (const auto& src : sources) {
|
||||
if (!first) oss << ",";
|
||||
oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
|
||||
if (is_non_contiguous(src)) {
|
||||
oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
|
||||
std::ostringstream oss;
|
||||
oss << "[";
|
||||
bool first = true;
|
||||
for (size_t i = 0; i < params.size(); ++i) {
|
||||
if (params[i] != 0) {
|
||||
if (!first) oss << ",";
|
||||
oss << i << ":" << params[i];
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
oss << "]";
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
|
||||
// Generic, deserializable test case: reproduces an arbitrary ggml operator
// from its op id, output shape, raw op_params and source tensor descriptions
// (as exported by llama-export-graph-ops), instead of a hand-written graph.
struct test_generic_op : public test_case {
    const ggml_op op;                  // operator to build
    const ggml_type type;              // output tensor type
    const std::array<int64_t, 4> ne;   // output tensor shape
    const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params; // raw op params, copied verbatim into the output tensor

    const std::vector<input_tensor> sources; // descriptions of the source tensors
    const std::string name;                  // optional label ("" = unnamed)

    // Test description; the optional name is prepended when present.
    std::string vars() override {
        if (name.empty()) {
            return VARS_TO_STR4(type, ne, op_params, sources);
        }

        return VARS_TO_STR5(name, type, ne, op_params, sources);
    }

    test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
            std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
            std::vector<input_tensor> sources, std::string name = "")
        : op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {}

    // Builds the operator node manually: creates/views the source tensors,
    // then fills in op, src[] and op_params on the output tensor directly.
    ggml_tensor * build_graph(ggml_context * ctx) override {
        const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);

        std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
        for (size_t i = 0; i < source_count; ++i) {
            const input_tensor& src = sources[i];

            if (is_non_contiguous(src)) {
                // Non-contiguous source: allocate a 1D backing tensor large
                // enough for the strided extent, then view it with the
                // requested shape and strides.
                size_t total_size;
                const size_t blck_size = ggml_blck_size(src.type);
                if (blck_size == 1) {
                    // byte extent = element size + distance to the last element
                    total_size = ggml_type_size(src.type);
                    for (int d = 0; d < 4; d++) {
                        total_size += (src.ne[d] - 1) * src.nb[d];
                    }
                } else {
                    // quantized types: dimension 0 is measured in blocks
                    total_size = src.ne[0] * src.nb[0] / blck_size;
                    for (int d = 1; d < 4; d++) {
                        total_size += (src.ne[d] - 1) * src.nb[d];
                    }
                }

                // Convert bytes to elements, padded to block size for quantized types
                const size_t type_size = ggml_type_size(src.type);
                size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
                backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;
                ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
                source_tensors[i] = ggml_view_4d(ctx, backing,
                    src.ne[0], src.ne[1], src.ne[2], src.ne[3],
                    src.nb[1], src.nb[2], src.nb[3], 0);
                // nb[0] does not get set by view_4d, so set it manually
                source_tensors[i]->nb[0] = src.nb[0];
            } else {
                source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
            }
        }

        // Ops with an inplace flag create a view of src[0] as their output.
        bool inplace = false;
        if (op == GGML_OP_SET || op == GGML_OP_ACC) {
            inplace = op_params[4] != 0;
        } else if (op == GGML_OP_ADD_REL_POS) {
            inplace = op_params[0] != 0;
        }

        ggml_tensor * out;
        if (inplace && source_count > 0) {
            out = ggml_view_tensor(ctx, source_tensors[0]);
        } else {
            out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
        }
        out->op = op;
        for (size_t i = 0; i < source_count; ++i) {
            out->src[i] = source_tensors[i];
        }

        // copy the raw parameter block verbatim
        memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
        ggml_set_name(out, "out");

        return out;
    }

    // Per-op error threshold: ops that accumulate floating point error get a
    // looser bound than the default.
    double max_nmse_err() override {
        switch (op) {
            case GGML_OP_MUL_MAT:
            case GGML_OP_MUL_MAT_ID:
            case GGML_OP_OUT_PROD:
            case GGML_OP_CONV_TRANSPOSE_2D:
            case GGML_OP_IM2COL:
            case GGML_OP_CONV_2D:
            case GGML_OP_CONV_3D:
            case GGML_OP_SET_ROWS:
            case GGML_OP_CPY:
                return 5e-4;
            case GGML_OP_SOFT_MAX:
                return 1e-6;
            case GGML_OP_RWKV_WKV7:
                return 5e-3;
            case GGML_OP_FLASH_ATTN_EXT:
            {
                // Scale error with kv length to account for accumulating floating point error
                const int64_t kv = sources[1].ne[1];
                return 5e-4 * std::max(1.0, kv / 20000.0);
            }
            default:
                return 1e-7;
        }
    }

    // Fills the source tensors with data. Integer tensors that serve as
    // indices (row ids, positions, expert ids) get valid in-range values per
    // op; everything else gets uniform random data.
    void initialize_tensors(ggml_context * ctx) override {
        ggml_tensor * out = ggml_get_tensor(ctx, "out");

        std::random_device rd;
        std::default_random_engine rng(rd());

        for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
            ggml_tensor * t = out->src[i];
            if (!t) {
                break;
            }

            // FLASH_ATTN_EXT: src[3] is the KQ mask
            if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
                init_tensor_kq_mask(t);
                continue;
            }

            if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
                if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
                    // row indices must be < the number of rows in src[0]
                    const int64_t num_rows = sources[0].ne[1];
                    const int64_t nels = ggml_nelements(t);
                    std::vector<int32_t> data(nels);
                    std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
                    for (int64_t i = 0; i < nels; i++) {
                        data[i] = dist(rng);
                    }
                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
                } else if (op == GGML_OP_SET_ROWS) {
                    init_set_rows_row_ids(t, ne[1]);
                } else if (op == GGML_OP_ROPE) {
                    // positions tensor; MROPE mode uses 4 position values per token
                    const int mode = op_params[2];
                    const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
                    std::vector<int32_t> data(nels);
                    std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
                    for (int64_t i = 0; i < nels; i++) {
                        data[i] = dist(rng);
                    }
                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
                } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
                    // expert ids: shuffled, each value < n_expert
                    const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
                        std::vector<int32_t> data(t->ne[0]);
                        for (int32_t i = 0; i < t->ne[0]; i++) {
                            data[i] = i % n_expert;
                        }
                        std::shuffle(data.begin(), data.end(), rng);
                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                    }
                } else if (op == GGML_OP_SSM_SCAN) {
                    // per-row permutation of [0, ne0)
                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
                        std::vector<int32_t> data(t->ne[0]);
                        for (int32_t i = 0; i < t->ne[0]; i++) {
                            data[i] = i;
                        }
                        std::shuffle(data.begin(), data.end(), rng);
                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                    }
                } else {
                    init_tensor_uniform(t);
                }
            } else {
                init_tensor_uniform(t);
            }
        }
    }
};
|
||||
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
@@ -8751,8 +8983,72 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
|
||||
std::ifstream f(path);
|
||||
|
||||
if (!f.is_open()) {
|
||||
throw std::runtime_error("Unable to read test file");
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
std::string line;
|
||||
|
||||
while (std::getline(f, line)) {
|
||||
std::istringstream iss(line);
|
||||
|
||||
ggml_op op;
|
||||
ggml_type type;
|
||||
std::array<int64_t, 4> ne;
|
||||
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
|
||||
std::string name;
|
||||
uint64_t tmp;
|
||||
|
||||
iss >> tmp;
|
||||
op = (ggml_op)tmp;
|
||||
iss >> tmp;
|
||||
type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> ne[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
for (size_t i = 0; i < tmp && i < op_params.size(); i++) {
|
||||
iss >> op_params[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
|
||||
size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp);
|
||||
std::vector<input_tensor> sources(num_src);
|
||||
for (size_t i = 0; i < num_src; i++) {
|
||||
input_tensor& src = sources[i];
|
||||
iss >> tmp;
|
||||
src.type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.ne[i];
|
||||
}
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.nb[i];
|
||||
}
|
||||
}
|
||||
|
||||
iss >> name;
|
||||
|
||||
if (name.length() == 1 && name[0] == '-') {
|
||||
name = "";
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
|
||||
}
|
||||
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
|
||||
printer * output_printer) {
|
||||
printer * output_printer, const char * test_file_path) {
|
||||
auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
|
||||
if (params_filter == nullptr) {
|
||||
return;
|
||||
@@ -8770,9 +9066,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
if (test_file_path == nullptr) {
|
||||
switch (mode) {
|
||||
case MODE_TEST:
|
||||
case MODE_GRAD:
|
||||
case MODE_SUPPORT:
|
||||
test_cases = make_test_cases_eval();
|
||||
break;
|
||||
case MODE_PERF:
|
||||
test_cases = make_test_cases_perf();
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
test_cases = make_test_cases_from_file(test_file_path);
|
||||
}
|
||||
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
|
||||
if (mode == MODE_TEST) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
|
||||
if (backend_cpu == NULL) {
|
||||
test_operation_info info("", "", "CPU");
|
||||
@@ -8812,8 +9125,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
if (mode == MODE_GRAD) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
size_t n_ok = 0;
|
||||
for (auto & test : test_cases) {
|
||||
if (test->eval_grad(backend, op_names_filter, output_printer)) {
|
||||
@@ -8826,8 +9137,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
if (mode == MODE_PERF) {
|
||||
auto test_cases = make_test_cases_perf();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
for (auto & test : test_cases) {
|
||||
test->eval_perf(backend, op_names_filter, output_printer);
|
||||
}
|
||||
@@ -8835,9 +9144,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
if (mode == MODE_SUPPORT) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
|
||||
// Filter out fusion cases
|
||||
test_cases.erase(
|
||||
std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
|
||||
@@ -8956,7 +9262,8 @@ static void show_test_coverage() {
|
||||
}
|
||||
|
||||
static void usage(char ** argv) {
|
||||
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
|
||||
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
|
||||
printf(" [--show-coverage] [--test-file <path>]\n");
|
||||
printf(" valid modes:\n");
|
||||
printf(" - test (default, compare with CPU backend for correctness)\n");
|
||||
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
|
||||
@@ -8967,6 +9274,7 @@ static void usage(char ** argv) {
|
||||
printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
|
||||
printf(" --list-ops lists all available GGML operations\n");
|
||||
printf(" --show-coverage shows test coverage\n");
|
||||
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@@ -8975,6 +9283,7 @@ int main(int argc, char ** argv) {
|
||||
const char * op_names_filter = nullptr;
|
||||
const char * backend_filter = nullptr;
|
||||
const char * params_filter = nullptr;
|
||||
const char * test_file_path = nullptr;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "test") == 0) {
|
||||
@@ -9022,6 +9331,13 @@ int main(int argc, char ** argv) {
|
||||
} else if (strcmp(argv[i], "--show-coverage") == 0) {
|
||||
show_test_coverage();
|
||||
return 0;
|
||||
} else if (strcmp(argv[i], "--test-file") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
test_file_path = argv[++i];
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
@@ -9074,7 +9390,7 @@ int main(int argc, char ** argv) {
|
||||
false, "", ggml_backend_dev_description(dev),
|
||||
total / 1024 / 1024, free / 1024 / 1024, true));
|
||||
|
||||
bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
|
||||
bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
|
||||
|
||||
if (ok) {
|
||||
n_ok++;
|
||||
|
||||
Reference in New Issue
Block a user