llama: fix magic number of 999 for GPU layers (#18266)
* llama: fix magic number of 999 for GPU layers * use strings for -ngl, -ngld * enacapsulate n_gpu_layers, split_mode
This commit is contained in:
+21
-6
@@ -2137,11 +2137,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
||||
GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
|
||||
[](common_params & params, int value) {
|
||||
params.n_gpu_layers = value;
|
||||
string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "auto") {
|
||||
params.n_gpu_layers = -1;
|
||||
} else if (value == "all") {
|
||||
params.n_gpu_layers = -2;
|
||||
} else {
|
||||
params.n_gpu_layers = std::stoi(value);
|
||||
}
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||
@@ -3175,11 +3182,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.devices = parse_device_list(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
|
||||
add_opt(common_arg(
|
||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
||||
"number of layers to store in VRAM for the draft model",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.n_gpu_layers = value;
|
||||
string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
|
||||
params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "auto") {
|
||||
params.speculative.n_gpu_layers = -1;
|
||||
} else if (value == "all") {
|
||||
params.speculative.n_gpu_layers = -2;
|
||||
} else {
|
||||
params.speculative.n_gpu_layers = std::stoi(value);
|
||||
}
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||
|
||||
+1
-4
@@ -1341,10 +1341,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
mparams.devices = params.devices.data();
|
||||
}
|
||||
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
|
||||
+1
-1
@@ -329,7 +329,7 @@ struct common_params {
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
|
||||
Reference in New Issue
Block a user