#include "arg.h"

#include "chat.h"
#include "common.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "download.h"

// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#    define NOMINMAX
#endif
#include <windows.h>
#endif

#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>

#include <algorithm>
#include <climits>
#include <cstdarg>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
#include <thread> // for hardware_concurrency
#include <vector>

#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
#    if !defined(PATH_MAX)
#    define PATH_MAX MAX_PATH
#    endif
#elif defined(_AIX)
#include <sys/limits.h>
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

using json = nlohmann::ordered_json;

static std::initializer_list<enum llama_example> mmproj_examples = {
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_SERVER,
};

static std::string read_file(const std::string & fname) {
    std::ifstream file(fname);
    if (!file) {
        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
    }
    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    file.close();
    return content;
}

common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
    this->examples = examples;
    return *this;
}

common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
    this->excludes = excludes;
    return *this;
}

common_arg & common_arg::set_env(const char * env) {
    help = help + "\n(env: " + env + ")";
    this->env = env;
    return *this;
}

common_arg & common_arg::set_sparam() {
    is_sparam = true;
    return *this;
}

bool common_arg::in_example(enum llama_example ex) {
    return examples.find(ex) != examples.end();
}

bool common_arg::is_exclude(enum llama_example ex) {
    return excludes.find(ex) != excludes.end();
}

bool common_arg::get_value_from_env(std::string & output) const {
    if (env == nullptr) return false;
    char * value = std::getenv(env);
    if (value) {
        output = value;
        return true;
    }
    return false;
}

bool common_arg::has_value_from_env() const {
    return env != nullptr && std::getenv(env);
}

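// Split a string into lines no longer than max_char_per_line, wrapping long lines on word boundaries.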
static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
    std::vector<std::string> result;
    std::istringstream iss(input);
    std::string line;
    auto add_line = [&](const std::string & l) {
        if (l.length() <= max_char_per_line) {
            result.push_back(l);
        } else {
            std::istringstream line_stream(l);
            std::string word, current_line;
            while (line_stream >> word) {
                if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
                    if (!current_line.empty()) result.push_back(current_line);
                    current_line = word;
                } else {
                    current_line += (!current_line.empty() ? " " : "") + word;
                }
            }
            if (!current_line.empty()) result.push_back(current_line);
        }
    };
    while (std::getline(iss, line)) {
        add_line(line);
    }
    return result;
}

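// Render this option for console output: argument aliases, value hints, and word-wrapped help text.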
std::string common_arg::to_string() {
    // params for printing to console
    const static int n_leading_spaces = 40;
    const static int n_char_per_line_help = 70; // TODO: detect this based on current console
    std::string leading_spaces(n_leading_spaces, ' ');

    std::ostringstream ss;
    for (const auto arg : args) {
        if (arg == args.front()) {
            if (args.size() == 1) {
                ss << arg;
            } else {
                // first arg is usually an abbreviation; pad it so the long forms line up
                auto tmp = std::string(arg) + ", ";
                auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' ');
                ss << tmp << spaces;
            }
        } else {
            ss << arg << (arg != args.back() ? ", " : "");
        }
    }
    if (value_hint) ss << " " << value_hint;
    if (value_hint_2) ss << " " << value_hint_2;
    if (ss.tellp() > n_leading_spaces - 3) {
        // current line is too long, add new line
        ss << "\n" << leading_spaces;
    } else {
        // padding between arg and help, same line
        ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
    }
    const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
    for (const auto & line : help_lines) {
        ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
    }
    return ss.str();
}

//
// utils
//

// Helper function to parse tensor buffer override strings
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        auto * dev = ggml_backend_dev_get(i);
        auto * buft = ggml_backend_dev_buffer_type(dev);
        if (buft) {
            buft_list[ggml_backend_buft_name(buft)] = buft;
        }
    }

    for (const auto & override : string_split<std::string>(value, ',')) {
        std::string::size_type pos = override.find('=');
        if (pos == std::string::npos) {
            throw std::invalid_argument("invalid value");
        }
        std::string tensor_name = override.substr(0, pos);
        std::string buffer_type = override.substr(pos + 1);

        if (buft_list.find(buffer_type) == buft_list.end()) {
            printf("Available buffer types:\n");
            for (const auto & it : buft_list) {
                printf("  %s\n", ggml_backend_buft_name(it.second));
            }
            throw std::invalid_argument("unknown buffer type");
        }
        // keep strings alive and avoid leaking memory by storing them in a static list
        static std::list<std::string> buft_overrides;
        buft_overrides.push_back(tensor_name);
        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
    }
}

struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
};

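// Resolve model.path / model.url from the Docker repo, HF repo or direct URL fields and download the
// model if needed; reports an auto-detected mmproj file when the HF repo provides one.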
static handle_model_result common_params_handle_model(
        struct common_params_model & model,
        const std::string & bearer_token,
        const std::string & model_path_default,
        bool offline) {
    handle_model_result result;
    // handle pre-fill default model path and url based on hf_repo and hf_file
    {
        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
            model.path = common_docker_resolve_model(model.docker_repo);
        } else if (!model.hf_repo.empty()) {
            // short-hand to avoid specifying --hf-file -> default it to --model
            if (model.hf_file.empty()) {
                if (model.path.empty()) {
                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                        exit(1); // built without CURL, error message already printed
                    }
                    model.hf_repo = auto_detected.repo;
                    model.hf_file = auto_detected.ggufFile;
                    if (!auto_detected.mmprojFile.empty()) {
                        result.found_mmproj   = true;
                        result.mmproj.hf_repo = model.hf_repo;
                        result.mmproj.hf_file = auto_detected.mmprojFile;
                    }
                } else {
                    model.hf_file = model.path;
                }
            }

            std::string model_endpoint = get_model_endpoint();
            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
            // make sure model path is present (for caching purposes)
            if (model.path.empty()) {
                // this is to avoid different repo having same file name, or same file name in different subdirs
                std::string filename = model.hf_repo + "_" + model.hf_file;
                // to make sure we don't have any slashes in the filename
                string_replace_all(filename, "/", "_");
                model.path = fs_get_cache_file(filename);
            }

        } else if (!model.url.empty()) {
            if (model.path.empty()) {
                auto f = string_split<std::string>(model.url, '#').front();
                f = string_split<std::string>(f, '?').front();
                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
            }

        } else if (model.path.empty()) {
            model.path = model_path_default;
        }
    }

    // then, download it if needed
    if (!model.url.empty()) {
        bool ok = common_download_model(model, bearer_token, offline);
        if (!ok) {
            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
            exit(1);
        }
    }

    return result;
}

const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_BF16,
    GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
};

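// Map a KV cache type name (e.g. "q8_0") to the corresponding ggml_type; throws on unknown names.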
static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (const auto & type : kv_cache_types) {
        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
    }
    return msg.str();
}

//
// CLI argument parsing functions
//

static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

    std::unordered_map<std::string, common_arg *> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
            arg_to_options[arg] = &opt;
        }
    }

    // handle environment variables
    for (auto & opt : ctx_arg.options) {
        std::string value;
        if (opt.get_value_from_env(value)) {
            try {
                if (opt.handler_void && (value == "1" || value == "true")) {
                    opt.handler_void(params);
                }
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(value));
                }
                if (opt.handler_string) {
                    opt.handler_string(params, value);
                    continue;
                }
            } catch (std::exception & e) {
                throw std::invalid_argument(string_format(
                    "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
            }
        }
    }

    // handle command line arguments
    auto check_arg = [&](int i) {
        if (i+1 >= argc) {
            throw std::invalid_argument("expected value for argument");
        }
    };

    for (int i = 1; i < argc; i++) {
        const std::string arg_prefix = "--";

        std::string arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
        if (arg_to_options.find(arg) == arg_to_options.end()) {
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
        auto opt = *arg_to_options[arg];
        if (opt.has_value_from_env()) {
            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
        }
        try {
            if (opt.handler_void) {
                opt.handler_void(params);
                continue;
            }

            // arg with single value
            check_arg(i);
            std::string val = argv[++i];
            if (opt.handler_int) {
                opt.handler_int(params, std::stoi(val));
                continue;
            }
            if (opt.handler_string) {
                opt.handler_string(params, val);
                continue;
            }

            // arg with 2 values
            check_arg(i);
            std::string val2 = argv[++i];
            if (opt.handler_str_str) {
                opt.handler_str_str(params, val, val2);
                continue;
            }
        } catch (std::exception & e) {
            throw std::invalid_argument(string_format(
                "error while handling argument \"%s\": %s\n\n"
                "usage:\n%s\n\nto show complete usage, run with -h",
                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
        }
    }

    postprocess_cpu_params(params.cpuparams, nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

    postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

    if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

    // handle model and download
    {
        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
        if (params.no_mmproj) {
            params.mmproj = {};
        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
            // optionally, handle mmproj model when -hf is specified
            params.mmproj = res.mmproj;
        }
        // only download mmproj if the current example is using it
        for (auto & ex : mmproj_examples) {
            if (ctx_arg.ex == ex) {
                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                break;
            }
        }
        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
    }

    if (params.escape) {
        string_process_escapes(params.prompt);
        string_process_escapes(params.input_prefix);
        string_process_escapes(params.input_suffix);
        for (auto & antiprompt : params.antiprompt) {
            string_process_escapes(antiprompt);
        }
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
        for (auto & pair : params.speculative.replacements) {
            string_process_escapes(pair.first);
            string_process_escapes(pair.second);
        }
    }

    if (!params.kv_overrides.empty()) {
        params.kv_overrides.emplace_back();
        params.kv_overrides.back().key[0] = 0;
    }

    if (!params.tensor_buft_overrides.empty()) {
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.speculative.tensor_buft_overrides.empty()) {
        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
        throw std::runtime_error(string_format(
            "error: the supplied chat template is not supported: %s%s\n",
            params.chat_template.c_str(),
            params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
        ));
    }

    return true;
}

static void common_params_print_usage(common_params_context & ctx_arg) {
    auto print_options = [](std::vector<common_arg *> & options) {
        for (common_arg * opt : options) {
            printf("%s", opt->to_string().c_str());
        }
    };

    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;
    for (auto & opt : ctx_arg.options) {
        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }
    printf("----- common params -----\n\n");
    print_options(common_options);
    printf("\n\n----- sampling params -----\n\n");
    print_options(sparam_options);
    // TODO: maybe convert enum llama_example to string
    printf("\n\n----- example-specific params -----\n\n");
    print_options(specific_options);
}

static void common_params_print_completion(common_params_context & ctx_arg) {
    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;

    for (auto & opt : ctx_arg.options) {
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }

    printf("_llama_completions() {\n");
    printf("    local cur prev opts\n");
    printf("    COMPREPLY=()\n");
    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");

    printf("    opts=\"");
    auto print_options = [](const std::vector<common_arg *> & options) {
        for (const common_arg * opt : options) {
            for (const char * arg : opt->args) {
                printf("%s ", arg);
            }
        }
    };

    print_options(common_options);
    print_options(sparam_options);
    print_options(specific_options);
    printf("\"\n\n");

    printf("    case \"$prev\" in\n");
    printf("        --model|-m)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --grammar-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --chat-template-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        *)\n");
    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("    esac\n");
    printf("}\n\n");

    std::set<std::string> executables = {
        "llama-batched",
        "llama-batched-bench",
        "llama-bench",
        "llama-cli",
        "llama-convert-llama2c-to-ggml",
        "llama-cvector-generator",
        "llama-embedding",
        "llama-eval-callback",
        "llama-export-lora",
        "llama-gen-docs",
        "llama-gguf",
        "llama-gguf-hash",
        "llama-gguf-split",
        "llama-gritlm",
        "llama-imatrix",
        "llama-infill",
        "llama-mtmd-cli",
        "llama-llava-clip-quantize-cli",
        "llama-lookahead",
        "llama-lookup",
        "llama-lookup-create",
        "llama-lookup-merge",
        "llama-lookup-stats",
        "llama-parallel",
        "llama-passkey",
        "llama-perplexity",
        "llama-q8dot",
        "llama-quantize",
        "llama-qwen2vl-cli",
        "llama-retrieval",
        "llama-run",
        "llama-save-load-state",
        "llama-server",
        "llama-simple",
        "llama-simple-chat",
        "llama-speculative",
        "llama-speculative-simple",
        "llama-tokenize",
        "llama-tts",
        "llama-vdot"
    };

    for (const auto & exe : executables) {
        printf("complete -F _llama_completions %s\n", exe.c_str());
    }
}

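// Parse a comma-separated list of backend device names into a null-terminated device list;
// the special value "none" selects no devices.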
static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    auto dev_names = string_split<std::string>(value, ',');
    if (dev_names.empty()) {
        throw std::invalid_argument("no devices specified");
    }
    if (dev_names.size() == 1 && dev_names[0] == "none") {
        devices.push_back(nullptr);
    } else {
        for (const auto & device : dev_names) {
            auto * dev = ggml_backend_dev_by_name(device.c_str());
            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
            }
            devices.push_back(dev);
        }
        devices.push_back(nullptr);
    }
    return devices;
}

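// Register an RPC backend device for each endpoint in the comma-separated server list.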
static void add_rpc_devices(const std::string & servers) {
    auto rpc_servers = string_split<std::string>(servers, ',');
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
    }
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
    }
    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
    if (!ggml_backend_rpc_add_server_fn) {
        throw std::invalid_argument("failed to find RPC add server function");
    }
    for (const auto & server : rpc_servers) {
        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
        ggml_backend_register(reg);
    }
}

bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

    try {
        if (!common_params_parse_ex(argc, argv, ctx_arg)) {
            ctx_arg.params = params_org;
            return false;
        }
        if (ctx_arg.params.usage) {
            common_params_print_usage(ctx_arg);
            if (ctx_arg.print_usage) {
                ctx_arg.print_usage(argc, argv);
            }
            exit(0);
        }
        if (ctx_arg.params.completion) {
            common_params_print_completion(ctx_arg);
            exit(0);
        }
        params.lr.init();
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
        return false;
    } catch (std::exception & ex) {
        fprintf(stderr, "%s\n", ex.what());
        exit(1); // for other exceptions, we exit with status code 1
    }

    return true;
}

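// Build a comma-separated list of the built-in chat template names.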
static std::string list_builtin_chat_templates() {
    std::vector<const char *> supported_tmpl;
    int32_t res = llama_chat_builtin_templates(nullptr, 0);
    supported_tmpl.resize(res);
    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
    std::ostringstream msg;
    for (auto & tmpl : supported_tmpl) {
        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
    }
    return msg.str();
}

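// Helpers for parsing tri-state CLI values such as --flash-attn on|off|auto.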
static bool is_truthy(const std::string & value) {
    return value == "on" || value == "enabled" || value == "1";
}

static bool is_falsey(const std::string & value) {
    return value == "off" || value == "disabled" || value == "0";
}

static bool is_autoy(const std::string & value) {
    return value == "auto" || value == "-1";
}

common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    // load dynamic backends
    ggml_backend_load_all();

    common_params_context ctx_arg(params);
    ctx_arg.print_usage = print_usage;
    ctx_arg.ex          = ex;

    std::string sampler_type_chars;
    std::string sampler_type_names;
    for (const auto & sampler : params.sampling.samplers) {
        sampler_type_chars += common_sampler_type_to_chr(sampler);
        sampler_type_names += common_sampler_type_to_str(sampler) + ";";
    }
    sampler_type_names.pop_back();


    /**
     * filter options by example
     * rules:
     * - all examples inherit options from LLAMA_EXAMPLE_COMMON
     * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*} are set, we will prioritize the LLAMA_EXAMPLE_* matching the current example
     */
    auto add_opt = [&](common_arg arg) {
        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };


    add_opt(common_arg(
        {"-h", "--help", "--usage"},
        "print usage and exit",
        [](common_params & params) {
            params.usage = true;
        }
    ));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
        [](common_params &) {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        }
    ));
    add_opt(common_arg(
        {"--completion-bash"},
        "print source-able bash completion script for llama.cpp",
        [](common_params & params) {
            params.completion = true;
        }
    ));
    add_opt(common_arg(
        {"--verbose-prompt"},
        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
        [](common_params & params) {
            params.verbose_prompt = true;
        }
    ));
    add_opt(common_arg(
        {"--no-display-prompt"},
        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
        [](common_params & params) {
            params.display_prompt = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-co", "--color"},
        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
        [](common_params & params) {
            params.use_color = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-t", "--threads"}, "N",
        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
        [](common_params & params, int value) {
            params.cpuparams.n_threads = value;
            if (params.cpuparams.n_threads <= 0) {
                params.cpuparams.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_env("LLAMA_ARG_THREADS"));
    add_opt(common_arg(
        {"-tb", "--threads-batch"}, "N",
        "number of threads to use during batch and prompt processing (default: same as --threads)",
        [](common_params & params, int value) {
            params.cpuparams_batch.n_threads = value;
            if (params.cpuparams_batch.n_threads <= 0) {
                params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
            }
        }
    ));
    add_opt(common_arg(
        {"-C", "--cpu-mask"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
        [](common_params & params, const std::string & mask) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Cr", "--cpu-range"}, "lo-hi",
        "range of CPUs for affinity. Complements --cpu-mask",
        [](common_params & params, const std::string & range) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict"}, "<0|1>",
        string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
        [](common_params & params, const std::string & value) {
            params.cpuparams.strict_cpu = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"--prio"}, "N",
        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
        [](common_params & params, int prio) {
            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll"}, "<0...100>",
        string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
        [](common_params & params, const std::string & value) {
            params.cpuparams.poll = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"-Cb", "--cpu-mask-batch"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
        [](common_params & params, const std::string & mask) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Crb", "--cpu-range-batch"}, "lo-hi",
        "ranges of CPUs for affinity. Complements --cpu-mask-batch",
        [](common_params & params, const std::string & range) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict-batch"}, "<0|1>",
        "use strict CPU placement (default: same as --cpu-strict)",
        [](common_params & params, int value) {
            params.cpuparams_batch.strict_cpu = value;
        }
    ));
    add_opt(common_arg(
        {"--prio-batch"}, "N",
        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
        [](common_params & params, int prio) {
            if (prio < 0 || prio > 3) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll-batch"}, "<0|1>",
        "use polling to wait for work (default: same as --poll)",
        [](common_params & params, int value) {
            params.cpuparams_batch.poll = value;
        }
    ));
    add_opt(common_arg(
        {"-lcs", "--lookup-cache-static"}, "FNAME",
        "path to static lookup cache to use for lookup decoding (not updated by generation)",
        [](common_params & params, const std::string & value) {
            params.lookup_cache_static = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
        "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
        [](common_params & params, const std::string & value) {
            params.lookup_cache_dynamic = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-c", "--ctx-size"}, "N",
        string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
        [](common_params & params, int value) {
            params.n_ctx = value;
        }
    ).set_env("LLAMA_ARG_CTX_SIZE"));
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        string_format(
            ex == LLAMA_EXAMPLE_MAIN
                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                : "number of tokens to predict (default: %d, -1 = infinity)",
            params.n_predict),
        [](common_params & params, int value) {
            params.n_predict = value;
        }
    ).set_env("LLAMA_ARG_N_PREDICT"));
    add_opt(common_arg(
        {"-b", "--batch-size"}, "N",
        string_format("logical maximum batch size (default: %d)", params.n_batch),
        [](common_params & params, int value) {
            params.n_batch = value;
        }
    ).set_env("LLAMA_ARG_BATCH"));
    add_opt(common_arg(
        {"-ub", "--ubatch-size"}, "N",
        string_format("physical maximum batch size (default: %d)", params.n_ubatch),
        [](common_params & params, int value) {
            params.n_ubatch = value;
        }
    ).set_env("LLAMA_ARG_UBATCH"));
    add_opt(common_arg(
        {"--keep"}, "N",
        string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
        [](common_params & params, int value) {
            params.n_keep = value;
        }
    ));
    add_opt(common_arg(
        {"--swa-full"},
        string_format("use full-size SWA cache (default: %s)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
        [](common_params & params) {
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
        string_format("max number of context checkpoints to create per slot (default: %d)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cache-ram", "-cram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
        [](common_params & params, int value) {
            params.cache_ram_mib = value;
        }
    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
        [](common_params & params) {
            params.kv_unified = true;
        }
    ).set_env("LLAMA_ARG_KV_SPLIT"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
        [](common_params & params) {
            params.ctx_shift = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--context-shift"},
        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
        [](common_params & params) {
            params.ctx_shift = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
        [](common_params & params, int value) {
            params.n_chunks = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
                      llama_flash_attn_type_name(params.flash_attn_type)),
        [](common_params & params, const std::string & value) {
            if (is_truthy(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
            } else if (is_falsey(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
            } else if (is_autoy(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
            } else {
                throw std::runtime_error(
                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
            }
        }).set_env("LLAMA_ARG_FLASH_ATTN"));
1000 add_opt(common_arg(
1001 {"-p", "--prompt"}, "PROMPT",
1002 "prompt to start generation with; for system message, use -sys",
1003 [](common_params & params, const std::string & value) {
1004 params.prompt = value;
1005 }
1006 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1007 add_opt(common_arg(
1008 {"-sys", "--system-prompt"}, "PROMPT",
1009 "system prompt to use with model (if applicable, depending on chat template)",
1010 [](common_params & params, const std::string & value) {
1011 params.system_prompt = value;
1012 }
1013 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
1014 add_opt(common_arg(
1015 {"--no-perf"},
1016 string_format(fmt: "disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
1017 [](common_params & params) {
1018 params.no_perf = true;
1019 params.sampling.no_perf = true;
1020 }
1021 ).set_env("LLAMA_ARG_NO_PERF"));
1022 add_opt(common_arg(
1023 {"-f", "--file"}, "FNAME",
1024 "a file containing the prompt (default: none)",
1025 [](common_params & params, const std::string & value) {
1026 params.prompt = read_file(fname: value);
1027 // store the external file name in params
1028 params.prompt_file = value;
1029 if (!params.prompt.empty() && params.prompt.back() == '\n') {
1030 params.prompt.pop_back();
1031 }
1032 }
1033 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1034 add_opt(common_arg(
1035 {"-sysf", "--system-prompt-file"}, "FNAME",
1036 "a file containing the system prompt (default: none)",
1037 [](common_params & params, const std::string & value) {
1038 params.system_prompt = read_file(fname: value);
1039 if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
1040 params.system_prompt.pop_back();
1041 }
1042 }
1043 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
1044 add_opt(common_arg(
1045 {"--in-file"}, "FNAME",
1046 "an input file (repeat to specify multiple files)",
1047 [](common_params & params, const std::string & value) {
1048 std::ifstream file(value);
1049 if (!file) {
1050 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
1051 }
1052 params.in_files.push_back(x: value);
1053 }
1054 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1055 add_opt(common_arg(
1056 {"-bf", "--binary-file"}, "FNAME",
1057 "binary file containing the prompt (default: none)",
1058 [](common_params & params, const std::string & value) {
1059 std::ifstream file(value, std::ios::binary);
1060 if (!file) {
1061 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
1062 }
1063 // store the external file name in params
1064 params.prompt_file = value;
1065 std::ostringstream ss;
1066 ss << file.rdbuf();
1067 params.prompt = ss.str();
1068 fprintf(stderr, format: "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
1069 }
1070 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1071 add_opt(common_arg(
1072 {"-e", "--escape"},
1073 string_format(fmt: "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
1074 [](common_params & params) {
1075 params.escape = true;
1076 }
1077 ));
1078 add_opt(common_arg(
1079 {"--no-escape"},
1080 "do not process escape sequences",
1081 [](common_params & params) {
1082 params.escape = false;
1083 }
1084 ));
1085 add_opt(common_arg(
1086 {"-ptc", "--print-token-count"}, "N",
1087 string_format(fmt: "print token count every N tokens (default: %d)", params.n_print),
1088 [](common_params & params, int value) {
1089 params.n_print = value;
1090 }
1091 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1092 add_opt(common_arg(
1093 {"--prompt-cache"}, "FNAME",
1094 "file to cache prompt state for faster startup (default: none)",
1095 [](common_params & params, const std::string & value) {
1096 params.path_prompt_cache = value;
1097 }
1098 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1099 add_opt(common_arg(
1100 {"--prompt-cache-all"},
1101 "if specified, saves user input and generations to cache as well\n",
1102 [](common_params & params) {
1103 params.prompt_cache_all = true;
1104 }
1105 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1106 add_opt(common_arg(
1107 {"--prompt-cache-ro"},
1108 "if specified, uses the prompt cache but does not update it",
1109 [](common_params & params) {
1110 params.prompt_cache_ro = true;
1111 }
1112 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1113 add_opt(common_arg(
1114 {"-r", "--reverse-prompt"}, "PROMPT",
1115 "halt generation at PROMPT, return control in interactive mode\n",
1116 [](common_params & params, const std::string & value) {
1117 params.antiprompt.emplace_back(args: value);
1118 }
1119 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
1120 add_opt(common_arg(
1121 {"-sp", "--special"},
1122 string_format(fmt: "special tokens output enabled (default: %s)", params.special ? "true" : "false"),
1123 [](common_params & params) {
1124 params.special = true;
1125 }
1126 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
1127 add_opt(common_arg(
1128 {"-cnv", "--conversation"},
1129 "run in conversation mode:\n"
1130 "- does not print special tokens and suffix/prefix\n"
1131 "- interactive mode is also enabled\n"
1132 "(default: auto enabled if chat template is available)",
1133 [](common_params & params) {
1134 params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
1135 }
1136 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1137 add_opt(common_arg(
1138 {"-no-cnv", "--no-conversation"},
1139 "force disable conversation mode (default: false)",
1140 [](common_params & params) {
1141 params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
1142 }
1143 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1144 add_opt(common_arg(
1145 {"-st", "--single-turn"},
1146 "run conversation for a single turn only, then exit when done\n"
1147 "will not be interactive if first turn is predefined with --prompt\n"
1148 "(default: false)",
1149 [](common_params & params) {
1150 params.single_turn = true;
1151 }
1152 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1153 add_opt(common_arg(
1154 {"-i", "--interactive"},
1155 string_format(fmt: "run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
1156 [](common_params & params) {
1157 params.interactive = true;
1158 }
1159 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1160 add_opt(common_arg(
1161 {"-if", "--interactive-first"},
1162 string_format(fmt: "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
1163 [](common_params & params) {
1164 params.interactive_first = true;
1165 }
1166 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1167 add_opt(common_arg(
1168 {"-mli", "--multiline-input"},
1169 "allows you to write or paste multiple lines without ending each in '\\'",
1170 [](common_params & params) {
1171 params.multiline_input = true;
1172 }
1173 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1174 add_opt(common_arg(
1175 {"--in-prefix-bos"},
1176 "prefix BOS to user inputs, preceding the `--in-prefix` string",
1177 [](common_params & params) {
1178 params.input_prefix_bos = true;
1179 params.enable_chat_template = false;
1180 }
1181 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1182 add_opt(common_arg(
1183 {"--in-prefix"}, "STRING",
1184 "string to prefix user inputs with (default: empty)",
1185 [](common_params & params, const std::string & value) {
1186 params.input_prefix = value;
1187 params.enable_chat_template = false;
1188 }
1189 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1190 add_opt(common_arg(
1191 {"--in-suffix"}, "STRING",
1192 "string to suffix after user inputs with (default: empty)",
1193 [](common_params & params, const std::string & value) {
1194 params.input_suffix = value;
1195 params.enable_chat_template = false;
1196 }
1197 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1198 add_opt(common_arg(
1199 {"--no-warmup"},
1200 "skip warming up the model with an empty run",
1201 [](common_params & params) {
1202 params.warmup = false;
1203 }
1204 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
1205 add_opt(common_arg(
1206 {"--spm-infill"},
1207 string_format(
1208 fmt: "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
1209 params.spm_infill ? "enabled" : "disabled"
1210 ),
1211 [](common_params & params) {
1212 params.spm_infill = true;
1213 }
1214 ).set_examples({LLAMA_EXAMPLE_SERVER}));
1215 add_opt(common_arg(
1216 {"--samplers"}, "SAMPLERS",
1217 string_format(fmt: "samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
1218 [](common_params & params, const std::string & value) {
1219 const auto sampler_names = string_split<std::string>(input: value, separator: ';');
1220 params.sampling.samplers = common_sampler_types_from_names(names: sampler_names, allow_alt_names: true);
1221 }
1222 ).set_sparam());
1223 add_opt(common_arg(
1224 {"-s", "--seed"}, "SEED",
1225 string_format(fmt: "RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
1226 [](common_params & params, const std::string & value) {
1227 params.sampling.seed = std::stoul(str: value);
1228 }
1229 ).set_sparam());
1230 add_opt(common_arg(
1231 {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
1232 string_format(fmt: "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
1233 [](common_params & params, const std::string & value) {
1234 params.sampling.samplers = common_sampler_types_from_chars(chars: value);
1235 }
1236 ).set_sparam());
1237 add_opt(common_arg(
1238 {"--ignore-eos"},
1239 "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
1240 [](common_params & params) {
1241 params.sampling.ignore_eos = true;
1242 }
1243 ).set_sparam());
1244 add_opt(common_arg(
1245 {"--temp"}, "N",
1246 string_format(fmt: "temperature (default: %.1f)", (double)params.sampling.temp),
1247 [](common_params & params, const std::string & value) {
1248 params.sampling.temp = std::stof(str: value);
1249 params.sampling.temp = std::max(a: params.sampling.temp, b: 0.0f);
1250 }
1251 ).set_sparam());
1252 add_opt(common_arg(
1253 {"--top-k"}, "N",
1254 string_format(fmt: "top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
1255 [](common_params & params, int value) {
1256 params.sampling.top_k = value;
1257 }
1258 ).set_sparam());
1259 add_opt(common_arg(
1260 {"--top-p"}, "N",
1261 string_format(fmt: "top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
1262 [](common_params & params, const std::string & value) {
1263 params.sampling.top_p = std::stof(str: value);
1264 }
1265 ).set_sparam());
1266 add_opt(common_arg(
1267 {"--min-p"}, "N",
1268 string_format(fmt: "min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
1269 [](common_params & params, const std::string & value) {
1270 params.sampling.min_p = std::stof(str: value);
1271 }
1272 ).set_sparam());
1273 add_opt(common_arg(
1274 {"--top-nsigma"}, "N",
1275 string_format(fmt: "top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
1276 [](common_params & params, const std::string & value) {
1277 params.sampling.top_n_sigma = std::stof(str: value);
1278 }
1279 ).set_sparam());
1280 add_opt(common_arg(
1281 {"--xtc-probability"}, "N",
1282 string_format(fmt: "xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
1283 [](common_params & params, const std::string & value) {
1284 params.sampling.xtc_probability = std::stof(str: value);
1285 }
1286 ).set_sparam());
1287 add_opt(common_arg(
1288 {"--xtc-threshold"}, "N",
1289 string_format(fmt: "xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
1290 [](common_params & params, const std::string & value) {
1291 params.sampling.xtc_threshold = std::stof(str: value);
1292 }
1293 ).set_sparam());
1294 add_opt(common_arg(
1295 {"--typical"}, "N",
1296 string_format(fmt: "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
1297 [](common_params & params, const std::string & value) {
1298 params.sampling.typ_p = std::stof(str: value);
1299 }
1300 ).set_sparam());
1301 add_opt(common_arg(
1302 {"--repeat-last-n"}, "N",
1303 string_format(fmt: "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
1304 [](common_params & params, int value) {
1305 if (value < -1) {
1306 throw std::runtime_error(string_format(fmt: "error: invalid repeat-last-n = %d\n", value));
1307 }
1308 params.sampling.penalty_last_n = value;
1309 params.sampling.n_prev = std::max(a: params.sampling.n_prev, b: params.sampling.penalty_last_n);
1310 }
1311 ).set_sparam());
1312 add_opt(common_arg(
1313 {"--repeat-penalty"}, "N",
1314 string_format(fmt: "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
1315 [](common_params & params, const std::string & value) {
1316 params.sampling.penalty_repeat = std::stof(str: value);
1317 }
1318 ).set_sparam());
1319 add_opt(common_arg(
1320 {"--presence-penalty"}, "N",
1321 string_format(fmt: "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
1322 [](common_params & params, const std::string & value) {
1323 params.sampling.penalty_present = std::stof(str: value);
1324 }
1325 ).set_sparam());
1326 add_opt(common_arg(
1327 {"--frequency-penalty"}, "N",
1328 string_format(fmt: "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
1329 [](common_params & params, const std::string & value) {
1330 params.sampling.penalty_freq = std::stof(str: value);
1331 }
1332 ).set_sparam());
1333 add_opt(common_arg(
1334 {"--dry-multiplier"}, "N",
1335 string_format(fmt: "set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
1336 [](common_params & params, const std::string & value) {
1337 params.sampling.dry_multiplier = std::stof(str: value);
1338 }
1339 ).set_sparam());
1340 add_opt(common_arg(
1341 {"--dry-base"}, "N",
1342 string_format(fmt: "set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
1343 [](common_params & params, const std::string & value) {
1344 float potential_base = std::stof(str: value);
1345 if (potential_base >= 1.0f)
1346 {
1347 params.sampling.dry_base = potential_base;
1348 }
1349 }
1350 ).set_sparam());
1351 add_opt(common_arg(
1352 {"--dry-allowed-length"}, "N",
1353 string_format(fmt: "set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
1354 [](common_params & params, int value) {
1355 params.sampling.dry_allowed_length = value;
1356 }
1357 ).set_sparam());
1358 add_opt(common_arg(
1359 {"--dry-penalty-last-n"}, "N",
1360 string_format(fmt: "set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
1361 [](common_params & params, int value) {
1362 if (value < -1) {
1363 throw std::runtime_error(string_format(fmt: "error: invalid dry-penalty-last-n = %d\n", value));
1364 }
1365 params.sampling.dry_penalty_last_n = value;
1366 }
1367 ).set_sparam());
1368 add_opt(common_arg(
1369 {"--dry-sequence-breaker"}, "STRING",
1370 string_format(fmt: "add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
1371 params.sampling.dry_sequence_breakers.empty() ? "none" :
1372 std::accumulate(first: std::next(x: params.sampling.dry_sequence_breakers.begin()),
1373 last: params.sampling.dry_sequence_breakers.end(),
1374 init: std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
1375 binary_op: [](const std::string& a, const std::string& b) {
1376 std::string formatted_b = (b == "\n") ? "\\n" : b;
1377 return a + ", '" + formatted_b + "'";
1378 }).c_str()),
1379 [](common_params & params, const std::string & value) {
1380 static bool defaults_cleared = false;
1381
1382 if (!defaults_cleared) {
1383 params.sampling.dry_sequence_breakers.clear();
1384 defaults_cleared = true;
1385 }
1386
1387 if (value == "none") {
1388 params.sampling.dry_sequence_breakers.clear();
1389 } else {
1390 params.sampling.dry_sequence_breakers.emplace_back(args: value);
1391 }
1392 }
1393 ).set_sparam());
1394 add_opt(common_arg(
1395 {"--dynatemp-range"}, "N",
1396 string_format(fmt: "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
1397 [](common_params & params, const std::string & value) {
1398 params.sampling.dynatemp_range = std::stof(str: value);
1399 }
1400 ).set_sparam());
1401 add_opt(common_arg(
1402 {"--dynatemp-exp"}, "N",
1403 string_format(fmt: "dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
1404 [](common_params & params, const std::string & value) {
1405 params.sampling.dynatemp_exponent = std::stof(str: value);
1406 }
1407 ).set_sparam());
1408 add_opt(common_arg(
1409 {"--mirostat"}, "N",
1410 string_format(fmt: "use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
1411 "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
1412 [](common_params & params, int value) {
1413 params.sampling.mirostat = value;
1414 }
1415 ).set_sparam());
1416 add_opt(common_arg(
1417 {"--mirostat-lr"}, "N",
1418 string_format(fmt: "Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
1419 [](common_params & params, const std::string & value) {
1420 params.sampling.mirostat_eta = std::stof(str: value);
1421 }
1422 ).set_sparam());
1423 add_opt(common_arg(
1424 {"--mirostat-ent"}, "N",
1425 string_format(fmt: "Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
1426 [](common_params & params, const std::string & value) {
1427 params.sampling.mirostat_tau = std::stof(str: value);
1428 }
1429 ).set_sparam());
1430 add_opt(common_arg(
1431 {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
1432 "modifies the likelihood of token appearing in the completion,\n"
1433 "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
1434 "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
1435 [](common_params & params, const std::string & value) {
1436 std::stringstream ss(value);
1437 llama_token key;
1438 char sign;
1439 std::string value_str;
1440 try {
1441 if (ss >> key && ss >> sign && std::getline(is&: ss, str&: value_str) && (sign == '+' || sign == '-')) {
1442 const float bias = std::stof(str: value_str) * ((sign == '-') ? -1.0f : 1.0f);
1443 params.sampling.logit_bias.push_back(x: {.token: key, .bias: bias});
1444 } else {
1445 throw std::invalid_argument("invalid input format");
1446 }
1447 } catch (const std::exception&) {
1448 throw std::invalid_argument("invalid input format");
1449 }
1450 }
1451 ).set_sparam());
1452 add_opt(common_arg(
1453 {"--grammar"}, "GRAMMAR",
1454 string_format(fmt: "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
1455 [](common_params & params, const std::string & value) {
1456 params.sampling.grammar = value;
1457 }
1458 ).set_sparam());
1459 add_opt(common_arg(
1460 {"--grammar-file"}, "FNAME",
1461 "file to read grammar from",
1462 [](common_params & params, const std::string & value) {
1463 params.sampling.grammar = read_file(fname: value);
1464 }
1465 ).set_sparam());
1466 add_opt(common_arg(
1467 {"-j", "--json-schema"}, "SCHEMA",
1468 "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
1469 [](common_params & params, const std::string & value) {
1470 params.sampling.grammar = json_schema_to_grammar(schema: json::parse(i: value));
1471 }
1472 ).set_sparam());
1473 add_opt(common_arg(
1474 {"-jf", "--json-schema-file"}, "FILE",
1475 "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
1476 [](common_params & params, const std::string & value) {
1477 std::ifstream file(value);
1478 if (!file) {
1479 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
1480 }
1481 std::string schema;
1482 std::copy(
1483 first: std::istreambuf_iterator<char>(file),
1484 last: std::istreambuf_iterator<char>(),
1485 result: std::back_inserter(x&: schema)
1486 );
1487 params.sampling.grammar = json_schema_to_grammar(schema: json::parse(i&: schema));
1488 }
1489 ).set_sparam());
1490 add_opt(common_arg(
1491 {"--pooling"}, "{none,mean,cls,last,rank}",
1492 "pooling type for embeddings, use model default if unspecified",
1493 [](common_params & params, const std::string & value) {
1494 /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
1495 else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
1496 else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
1497 else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
1498 else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
1499 else { throw std::invalid_argument("invalid value"); }
1500 }
1501 ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
1502 add_opt(common_arg(
1503 {"--attention"}, "{causal,non-causal}",
1504 "attention type for embeddings, use model default if unspecified",
1505 [](common_params & params, const std::string & value) {
1506 /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
1507 else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
1508 else { throw std::invalid_argument("invalid value"); }
1509 }
1510 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
1511 add_opt(common_arg(
1512 {"--rope-scaling"}, "{none,linear,yarn}",
1513 "RoPE frequency scaling method, defaults to linear unless specified by the model",
1514 [](common_params & params, const std::string & value) {
1515 /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
1516 else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
1517 else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
1518 else { throw std::invalid_argument("invalid value"); }
1519 }
1520 ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
1521 add_opt(common_arg(
1522 {"--rope-scale"}, "N",
1523 "RoPE context scaling factor, expands context by a factor of N",
1524 [](common_params & params, const std::string & value) {
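// the value is stored as its reciprocal: e.g. --rope-scale 4 sets rope_freq_scale = 0.25 (equivalent to --rope-freq-scale 0.25)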
1525 params.rope_freq_scale = 1.0f / std::stof(str: value);
1526 }
1527 ).set_env("LLAMA_ARG_ROPE_SCALE"));
1528 add_opt(common_arg(
1529 {"--rope-freq-base"}, "N",
1530 "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
1531 [](common_params & params, const std::string & value) {
1532 params.rope_freq_base = std::stof(str: value);
1533 }
1534 ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
1535 add_opt(common_arg(
1536 {"--rope-freq-scale"}, "N",
1537 "RoPE frequency scaling factor, expands context by a factor of 1/N",
1538 [](common_params & params, const std::string & value) {
1539 params.rope_freq_scale = std::stof(str: value);
1540 }
1541 ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
1542 add_opt(common_arg(
1543 {"--yarn-orig-ctx"}, "N",
1544 string_format(fmt: "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
1545 [](common_params & params, int value) {
1546 params.yarn_orig_ctx = value;
1547 }
1548 ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
1549 add_opt(common_arg(
1550 {"--yarn-ext-factor"}, "N",
1551 string_format(fmt: "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
1552 [](common_params & params, const std::string & value) {
1553 params.yarn_ext_factor = std::stof(str: value);
1554 }
1555 ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
1556 add_opt(common_arg(
1557 {"--yarn-attn-factor"}, "N",
1558 string_format(fmt: "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
1559 [](common_params & params, const std::string & value) {
1560 params.yarn_attn_factor = std::stof(str: value);
1561 }
1562 ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
1563 add_opt(common_arg(
1564 {"--yarn-beta-slow"}, "N",
1565 string_format(fmt: "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
1566 [](common_params & params, const std::string & value) {
1567 params.yarn_beta_slow = std::stof(str: value);
1568 }
1569 ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
1570 add_opt(common_arg(
1571 {"--yarn-beta-fast"}, "N",
1572 string_format(fmt: "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
1573 [](common_params & params, const std::string & value) {
1574 params.yarn_beta_fast = std::stof(str: value);
1575 }
1576 ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
1577 add_opt(common_arg(
1578 {"-gan", "--grp-attn-n"}, "N",
1579 string_format(fmt: "group-attention factor (default: %d)", params.grp_attn_n),
1580 [](common_params & params, int value) {
1581 params.grp_attn_n = value;
1582 }
1583 ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
1584 add_opt(common_arg(
1585 {"-gaw", "--grp-attn-w"}, "N",
1586 string_format(fmt: "group-attention width (default: %d)", params.grp_attn_w),
1587 [](common_params & params, int value) {
1588 params.grp_attn_w = value;
1589 }
1590 ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
1591 add_opt(common_arg(
1592 {"-nkvo", "--no-kv-offload"},
1593 "disable KV offload",
1594 [](common_params & params) {
1595 params.no_kv_offload = true;
1596 }
1597 ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
1598 add_opt(common_arg(
1599 {"-nr", "--no-repack"},
1600 "disable weight repacking",
1601 [](common_params & params) {
1602 params.no_extra_bufts = true;
1603 }
1604 ).set_env("LLAMA_ARG_NO_REPACK"));
1605 add_opt(common_arg(
1606 {"--no-host"},
1607 "bypass host buffer allowing extra buffers to be used",
1608 [](common_params & params) {
1609 params.no_host = true;
1610 }
1611 ).set_env("LLAMA_ARG_NO_HOST"));
1612 add_opt(common_arg(
1613 {"-ctk", "--cache-type-k"}, "TYPE",
1614 string_format(
1615 fmt: "KV cache data type for K\n"
1616 "allowed values: %s\n"
1617 "(default: %s)",
1618 get_all_kv_cache_types().c_str(),
1619 ggml_type_name(type: params.cache_type_k)
1620 ),
1621 [](common_params & params, const std::string & value) {
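// the string is resolved via kv_cache_type_from_str(); e.g. "f16" or "q8_0" are typical choices (see the allowed-values list in the help for the definitive set)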
1622 params.cache_type_k = kv_cache_type_from_str(s: value);
1623 }
1624 ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
1625 add_opt(common_arg(
1626 {"-ctv", "--cache-type-v"}, "TYPE",
1627 string_format(
1628 fmt: "KV cache data type for V\n"
1629 "allowed values: %s\n"
1630 "(default: %s)",
1631 get_all_kv_cache_types().c_str(),
1632 ggml_type_name(type: params.cache_type_v)
1633 ),
1634 [](common_params & params, const std::string & value) {
1635 params.cache_type_v = kv_cache_type_from_str(s: value);
1636 }
1637 ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
1638 add_opt(common_arg(
1639 {"--hellaswag"},
1640 "compute HellaSwag score over random tasks from datafile supplied with -f",
1641 [](common_params & params) {
1642 params.hellaswag = true;
1643 }
1644 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1645 add_opt(common_arg(
1646 {"--hellaswag-tasks"}, "N",
1647 string_format(fmt: "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
1648 [](common_params & params, int value) {
1649 params.hellaswag_tasks = value;
1650 }
1651 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1652 add_opt(common_arg(
1653 {"--winogrande"},
1654 "compute Winogrande score over random tasks from datafile supplied with -f",
1655 [](common_params & params) {
1656 params.winogrande = true;
1657 }
1658 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1659 add_opt(common_arg(
1660 {"--winogrande-tasks"}, "N",
1661 string_format(fmt: "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
1662 [](common_params & params, int value) {
1663 params.winogrande_tasks = value;
1664 }
1665 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1666 add_opt(common_arg(
1667 {"--multiple-choice"},
1668 "compute multiple choice score over random tasks from datafile supplied with -f",
1669 [](common_params & params) {
1670 params.multiple_choice = true;
1671 }
1672 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1673 add_opt(common_arg(
1674 {"--multiple-choice-tasks"}, "N",
1675 string_format(fmt: "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
1676 [](common_params & params, int value) {
1677 params.multiple_choice_tasks = value;
1678 }
1679 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1680 add_opt(common_arg(
1681 {"--kl-divergence"},
1682 "computes KL-divergence to logits provided via --kl-divergence-base",
1683 [](common_params & params) {
1684 params.kl_divergence = true;
1685 }
1686 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1687 add_opt(common_arg(
1688 {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
1689 "set logits file",
1690 [](common_params & params, const std::string & value) {
1691 params.logits_file = value;
1692 }
1693 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1694 add_opt(common_arg(
1695 {"--ppl-stride"}, "N",
1696 string_format(fmt: "stride for perplexity calculation (default: %d)", params.ppl_stride),
1697 [](common_params & params, int value) {
1698 params.ppl_stride = value;
1699 }
1700 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1701 add_opt(common_arg(
1702 {"--ppl-output-type"}, "<0|1>",
1703 string_format(fmt: "output type for perplexity calculation (default: %d)", params.ppl_output_type),
1704 [](common_params & params, int value) {
1705 params.ppl_output_type = value;
1706 }
1707 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1708 add_opt(common_arg(
1709 {"-dt", "--defrag-thold"}, "N",
1710 string_format(fmt: "KV cache defragmentation threshold (DEPRECATED)"),
1711 [](common_params & params, const std::string & value) {
1712 GGML_UNUSED(params);
1713 GGML_UNUSED(value);
1714 LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
1715 }
1716 ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
1717 add_opt(common_arg(
1718 {"-np", "--parallel"}, "N",
1719 string_format(fmt: "number of parallel sequences to decode (default: %d)", params.n_parallel),
1720 [](common_params & params, int value) {
1721 params.n_parallel = value;
1722 }
1723 ).set_env("LLAMA_ARG_N_PARALLEL"));
1724 add_opt(common_arg(
1725 {"-ns", "--sequences"}, "N",
1726 string_format(fmt: "number of sequences to decode (default: %d)", params.n_sequences),
1727 [](common_params & params, int value) {
1728 params.n_sequences = value;
1729 }
1730 ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
1731 add_opt(common_arg(
1732 {"-cb", "--cont-batching"},
1733 string_format(fmt: "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
1734 [](common_params & params) {
1735 params.cont_batching = true;
1736 }
1737 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
1738 add_opt(common_arg(
1739 {"-nocb", "--no-cont-batching"},
1740 "disable continuous batching",
1741 [](common_params & params) {
1742 params.cont_batching = false;
1743 }
1744 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
1745 add_opt(common_arg(
1746 {"--mmproj"}, "FILE",
1747 "path to a multimodal projector file. see tools/mtmd/README.md\n"
1748 "note: if -hf is used, this argument can be omitted",
1749 [](common_params & params, const std::string & value) {
1750 params.mmproj.path = value;
1751 }
1752 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
1753 add_opt(common_arg(
1754 {"--mmproj-url"}, "URL",
1755 "URL to a multimodal projector file. see tools/mtmd/README.md",
1756 [](common_params & params, const std::string & value) {
1757 params.mmproj.url = value;
1758 }
1759 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
1760 add_opt(common_arg(
1761 {"--no-mmproj"},
1762 "explicitly disable multimodal projector, useful when using -hf",
1763 [](common_params & params) {
1764 params.no_mmproj = true;
1765 }
1766 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
1767 add_opt(common_arg(
1768 {"--no-mmproj-offload"},
1769 "do not offload multimodal projector to GPU",
1770 [](common_params & params) {
1771 params.mmproj_use_gpu = false;
1772 }
1773 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
1774 add_opt(common_arg(
1775 {"--image", "--audio"}, "FILE",
1776 "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
1777 [](common_params & params, const std::string & value) {
1778 params.image.emplace_back(args: value);
1779 }
1780 ).set_examples({LLAMA_EXAMPLE_MTMD}));
1781 add_opt(common_arg(
1782 {"--image-min-tokens"}, "N",
1783 "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1784 [](common_params & params, int value) {
1785 params.image_min_tokens = value;
1786 }
1787 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
1788 add_opt(common_arg(
1789 {"--image-max-tokens"}, "N",
1790 "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1791 [](common_params & params, int value) {
1792 params.image_max_tokens = value;
1793 }
1794 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
1795 if (llama_supports_rpc()) {
1796 add_opt(common_arg(
1797 {"--rpc"}, "SERVERS",
1798 "comma separated list of RPC servers",
1799 [](common_params & params, const std::string & value) {
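// expected format: comma-separated host:port endpoints exposed by rpc-server instances, e.g. "192.168.1.2:50052,192.168.1.3:50052" (illustrative addresses)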
1800 add_rpc_devices(servers: value);
1801 GGML_UNUSED(params);
1802 }
1803 ).set_env("LLAMA_ARG_RPC"));
1804 }
1805 add_opt(common_arg(
1806 {"--mlock"},
1807 "force system to keep model in RAM rather than swapping or compressing",
1808 [](common_params & params) {
1809 params.use_mlock = true;
1810 }
1811 ).set_env("LLAMA_ARG_MLOCK"));
1812 add_opt(common_arg(
1813 {"--no-mmap"},
1814 "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
1815 [](common_params & params) {
1816 params.use_mmap = false;
1817 }
1818 ).set_env("LLAMA_ARG_NO_MMAP"));
1819 add_opt(common_arg(
1820 {"--numa"}, "TYPE",
1821 "attempt optimizations that help on some NUMA systems\n"
1822 "- distribute: spread execution evenly over all nodes\n"
1823 "- isolate: only spawn threads on CPUs on the node that execution started on\n"
1824 "- numactl: use the CPU map provided by numactl\n"
1825 "if run without this previously, it is recommended to drop the system page cache before using this\n"
1826 "see https://github.com/ggml-org/llama.cpp/issues/1437",
1827 [](common_params & params, const std::string & value) {
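// note: an empty value (e.g. the env var set but left blank) selects the distribute strategy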
1828 /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
1829 else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
1830 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
1831 else { throw std::invalid_argument("invalid value"); }
1832 }
1833 ).set_env("LLAMA_ARG_NUMA"));
1834 add_opt(common_arg(
1835 {"-dev", "--device"}, "<dev1,dev2,..>",
1836 "comma-separated list of devices to use for offloading (none = don't offload)\n"
1837 "use --list-devices to see a list of available devices",
1838 [](common_params & params, const std::string & value) {
1839 params.devices = parse_device_list(value);
1840 }
1841 ).set_env("LLAMA_ARG_DEVICE"));
1842 add_opt(common_arg(
1843 {"--list-devices"},
1844 "print list of available devices and exit",
1845 [](common_params &) {
1846 std::vector<ggml_backend_dev_t> devices;
1847 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
1848 auto * dev = ggml_backend_dev_get(index: i);
1849 if (ggml_backend_dev_type(device: dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
1850 devices.push_back(x: dev);
1851 }
1852 }
1853 printf(format: "Available devices:\n");
1854 for (auto * dev : devices) {
1855 size_t free, total;
1856 ggml_backend_dev_memory(device: dev, free: &free, total: &total);
1857 printf(format: " %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(device: dev), ggml_backend_dev_description(device: dev), total / 1024 / 1024, free / 1024 / 1024);
1858 }
1859 exit(status: 0);
1860 }
1861 ));
1862 add_opt(common_arg(
1863 {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
1864 "override tensor buffer type", [](common_params & params, const std::string & value) {
1865 parse_tensor_buffer_overrides(value, overrides&: params.tensor_buft_overrides);
1866 }
1867 ));
1868 add_opt(common_arg(
1869 {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
1870 "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
1871 parse_tensor_buffer_overrides(value, overrides&: params.speculative.tensor_buft_overrides);
1872 }
1873 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
1874 add_opt(common_arg(
1875 {"--cpu-moe", "-cmoe"},
1876 "keep all Mixture of Experts (MoE) weights in the CPU",
1877 [](common_params & params) {
1878 params.tensor_buft_overrides.push_back(x: llm_ffn_exps_cpu_override());
1879 }
1880 ).set_env("LLAMA_ARG_CPU_MOE"));
1881 add_opt(common_arg(
1882 {"--n-cpu-moe", "-ncmoe"}, "N",
1883 "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
1884 [](common_params & params, int value) {
1885 if (value < 0) {
1886 throw std::invalid_argument("invalid value");
1887 }
1888 for (int i = 0; i < value; ++i) {
1889 // keep the pattern strings alive (and their c_str() pointers stable) by storing them in a static std::list
1890 static std::list<std::string> buft_overrides;
1891 buft_overrides.push_back(x: llm_ffn_exps_block_regex(idx: i));
1892 params.tensor_buft_overrides.push_back(x: {.pattern: buft_overrides.back().c_str(), .buft: ggml_backend_cpu_buffer_type()});
1893 }
1894 }
1895 ).set_env("LLAMA_ARG_N_CPU_MOE"));
1896 add_opt(common_arg(
1897 {"--cpu-moe-draft", "-cmoed"},
1898 "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
1899 [](common_params & params) {
1900 params.speculative.tensor_buft_overrides.push_back(x: llm_ffn_exps_cpu_override());
1901 }
1902 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
1903 add_opt(common_arg(
1904 {"--n-cpu-moe-draft", "-ncmoed"}, "N",
1905 "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
1906 [](common_params & params, int value) {
1907 if (value < 0) {
1908 throw std::invalid_argument("invalid value");
1909 }
1910 for (int i = 0; i < value; ++i) {
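// as above, a static std::list keeps the pattern strings (and the c_str() pointers stored below) valid for the lifetime of the program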
1911 static std::list<std::string> buft_overrides_draft;
1912 buft_overrides_draft.push_back(x: llm_ffn_exps_block_regex(idx: i));
1913 params.speculative.tensor_buft_overrides.push_back(x: {.pattern: buft_overrides_draft.back().c_str(), .buft: ggml_backend_cpu_buffer_type()});
1914 }
1915 }
1916 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
1917 add_opt(common_arg(
1918 {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
1919 string_format(fmt: "max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
1920 [](common_params & params, int value) {
1921 params.n_gpu_layers = value;
1922 if (!llama_supports_gpu_offload()) {
1923 fprintf(stderr, format: "warning: no usable GPU found, --gpu-layers option will be ignored\n");
1924 fprintf(stderr, format: "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
1925 fprintf(stderr, format: "warning: consult docs/build.md for compilation instructions\n");
1926 }
1927 }
1928 ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
1929 add_opt(common_arg(
1930 {"-sm", "--split-mode"}, "{none,layer,row}",
1931 "how to split the model across multiple GPUs, one of:\n"
1932 "- none: use one GPU only\n"
1933 "- layer (default): split layers and KV across GPUs\n"
1934 "- row: split rows across GPUs",
1935 [](common_params & params, const std::string & value) {
1936 std::string arg_next = value;
1937 if (arg_next == "none") {
1938 params.split_mode = LLAMA_SPLIT_MODE_NONE;
1939 } else if (arg_next == "layer") {
1940 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
1941 } else if (arg_next == "row") {
1942 params.split_mode = LLAMA_SPLIT_MODE_ROW;
1943 } else {
1944 throw std::invalid_argument("invalid value");
1945 }
1946 if (!llama_supports_gpu_offload()) {
1947 fprintf(stderr, format: "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
1948 }
1949 }
1950 ).set_env("LLAMA_ARG_SPLIT_MODE"));
1951 add_opt(common_arg(
1952 {"-ts", "--tensor-split"}, "N0,N1,N2,...",
1953 "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
1954 [](common_params & params, const std::string & value) {
1955 std::string arg_next = value;
1956
1957 // split string by , and /
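// e.g. "3,1" assigns proportions 3 and 1 to the first two devices, i.e. roughly a 75%/25% split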
1958 const std::regex regex{ R"([,/]+)" };
1959 std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
1960 std::vector<std::string> split_arg{ it, {} };
1961 if (split_arg.size() >= llama_max_devices()) {
1962 throw std::invalid_argument(
1963 string_format(fmt: "got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
1964 );
1965 }
1966 for (size_t i = 0; i < llama_max_devices(); ++i) {
1967 if (i < split_arg.size()) {
1968 params.tensor_split[i] = std::stof(str: split_arg[i]);
1969 } else {
1970 params.tensor_split[i] = 0.0f;
1971 }
1972 }
1973 if (!llama_supports_gpu_offload()) {
1974 fprintf(stderr, format: "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
1975 }
1976 }
1977 ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
1978 add_opt(common_arg(
1979 {"-mg", "--main-gpu"}, "INDEX",
1980 string_format(fmt: "the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
1981 [](common_params & params, int value) {
1982 params.main_gpu = value;
1983 if (!llama_supports_gpu_offload()) {
1984 fprintf(stderr, format: "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
1985 }
1986 }
1987 ).set_env("LLAMA_ARG_MAIN_GPU"));
1988 add_opt(common_arg(
1989 {"--check-tensors"},
1990 string_format(fmt: "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
1991 [](common_params & params) {
1992 params.check_tensors = true;
1993 }
1994 ));
1995 add_opt(common_arg(
1996 {"--override-kv"}, "KEY=TYPE:VALUE",
1997 "advanced option to override model metadata by key. may be specified multiple times.\n"
1998 "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
1999 [](common_params & params, const std::string & value) {
2000 if (!string_parse_kv_override(data: value.c_str(), overrides&: params.kv_overrides)) {
2001 throw std::runtime_error(string_format(fmt: "error: Invalid type for KV override: %s\n", value.c_str()));
2002 }
2003 }
2004 ));
2005 add_opt(common_arg(
2006 {"--no-op-offload"},
2007 string_format(fmt: "disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
2008 [](common_params & params) {
2009 params.no_op_offload = true;
2010 }
2011 ));
2012 add_opt(common_arg(
2013 {"--lora"}, "FNAME",
2014 "path to LoRA adapter (can be repeated to use multiple adapters)",
2015 [](common_params & params, const std::string & value) {
2016 params.lora_adapters.push_back(x: { .path: std::string(value), .scale: 1.0, .task_name: "", .prompt_prefix: "", .ptr: nullptr });
2017 }
2018 // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
2019 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
2020 add_opt(common_arg(
2021 {"--lora-scaled"}, "FNAME", "SCALE",
2022 "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
2023 [](common_params & params, const std::string & fname, const std::string & scale) {
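// e.g. --lora-scaled lora-adapter.gguf 0.5 applies the adapter at half strength (illustrative file name)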
2024 params.lora_adapters.push_back(x: { .path: fname, .scale: std::stof(str: scale), .task_name: "", .prompt_prefix: "", .ptr: nullptr });
2025 }
2026 // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
2027 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
2028 add_opt(common_arg(
2029 {"--control-vector"}, "FNAME",
2030 "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
2031 [](common_params & params, const std::string & value) {
2032 params.control_vectors.push_back(x: { .strength: 1.0f, .fname: value, });
2033 }
2034 ));
2035 add_opt(common_arg(
2036 {"--control-vector-scaled"}, "FNAME", "SCALE",
2037 "add a control vector with user defined scaling SCALE\n"
2038 "note: this argument can be repeated to add multiple scaled control vectors",
2039 [](common_params & params, const std::string & fname, const std::string & scale) {
2040 params.control_vectors.push_back(x: { .strength: std::stof(str: scale), .fname: fname });
2041 }
2042 ));
2043 add_opt(common_arg(
2044 {"--control-vector-layer-range"}, "START", "END",
2045 "layer range to apply the control vector(s) to, start and end inclusive",
2046 [](common_params & params, const std::string & start, const std::string & end) {
2047 params.control_vector_layer_start = std::stoi(str: start);
2048 params.control_vector_layer_end = std::stoi(str: end);
2049 }
2050 ));
2051 add_opt(common_arg(
2052 {"-a", "--alias"}, "STRING",
2053 "set alias for model name (to be used by REST API)",
2054 [](common_params & params, const std::string & value) {
2055 params.model_alias = value;
2056 }
2057 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
2058 add_opt(common_arg(
2059 {"-m", "--model"}, "FNAME",
2060 ex == LLAMA_EXAMPLE_EXPORT_LORA
2061 ? std::string("path from which to load the base model")
2062 : string_format(
2063 fmt: "model path (default: `models/$filename` with filename from `--hf-file` "
2064 "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
2065 ),
2066 [](common_params & params, const std::string & value) {
2067 params.model.path = value;
2068 }
2069 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
2070 add_opt(common_arg(
2071 {"-mu", "--model-url"}, "MODEL_URL",
2072 "model download url (default: unused)",
2073 [](common_params & params, const std::string & value) {
2074 params.model.url = value;
2075 }
2076 ).set_env("LLAMA_ARG_MODEL_URL"));
2077 add_opt(common_arg(
2078 { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
2079 "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
2080 "example: gemma3\n"
2081 "(default: unused)",
2082 [](common_params & params, const std::string & value) {
2083 params.model.docker_repo = value;
2084 }
2085 ).set_env("LLAMA_ARG_DOCKER_REPO"));
2086 add_opt(common_arg(
2087 {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
2088 "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
2089 "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
2090 "example: unsloth/phi-4-GGUF:q4_k_m\n"
2091 "(default: unused)",
2092 [](common_params & params, const std::string & value) {
2093 params.model.hf_repo = value;
2094 }
2095 ).set_env("LLAMA_ARG_HF_REPO"));
2096 add_opt(common_arg(
2097 {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
2098 "Same as --hf-repo, but for the draft model (default: unused)",
2099 [](common_params & params, const std::string & value) {
2100 params.speculative.model.hf_repo = value;
2101 }
2102 ).set_env("LLAMA_ARG_HFD_REPO"));
2103 add_opt(common_arg(
2104 {"-hff", "--hf-file"}, "FILE",
2105 "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
2106 [](common_params & params, const std::string & value) {
2107 params.model.hf_file = value;
2108 }
2109 ).set_env("LLAMA_ARG_HF_FILE"));
2110 add_opt(common_arg(
2111 {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
2112 "Hugging Face model repository for the vocoder model (default: unused)",
2113 [](common_params & params, const std::string & value) {
2114 params.vocoder.model.hf_repo = value;
2115 }
2116 ).set_env("LLAMA_ARG_HF_REPO_V"));
2117 add_opt(common_arg(
2118 {"-hffv", "--hf-file-v"}, "FILE",
2119 "Hugging Face model file for the vocoder model (default: unused)",
2120 [](common_params & params, const std::string & value) {
2121 params.vocoder.model.hf_file = value;
2122 }
2123 ).set_env("LLAMA_ARG_HF_FILE_V"));
2124 add_opt(common_arg(
2125 {"-hft", "--hf-token"}, "TOKEN",
2126 "Hugging Face access token (default: value from HF_TOKEN environment variable)",
2127 [](common_params & params, const std::string & value) {
2128 params.hf_token = value;
2129 }
2130 ).set_env("HF_TOKEN"));
2131 add_opt(common_arg(
2132 {"--context-file"}, "FNAME",
2133 "file to load context from (repeat to specify multiple files)",
2134 [](common_params & params, const std::string & value) {
2135 std::ifstream file(value, std::ios::binary);
2136 if (!file) {
2137 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
2138 }
2139 params.context_files.push_back(x: value);
2140 }
2141 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2142 add_opt(common_arg(
2143 {"--chunk-size"}, "N",
2144 string_format(fmt: "minimum length of embedded text chunks (default: %d)", params.chunk_size),
2145 [](common_params & params, int value) {
2146 params.chunk_size = value;
2147 }
2148 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2149 add_opt(common_arg(
2150 {"--chunk-separator"}, "STRING",
2151 string_format(fmt: "separator between chunks (default: '%s')", params.chunk_separator.c_str()),
2152 [](common_params & params, const std::string & value) {
2153 params.chunk_separator = value;
2154 }
2155 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2156 add_opt(common_arg(
2157 {"--junk"}, "N",
2158 string_format(fmt: "number of times to repeat the junk text (default: %d)", params.n_junk),
2159 [](common_params & params, int value) {
2160 params.n_junk = value;
2161 }
2162 ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
2163 add_opt(common_arg(
2164 {"--pos"}, "N",
2165 string_format(fmt: "position of the passkey in the junk text (default: %d)", params.i_pos),
2166 [](common_params & params, int value) {
2167 params.i_pos = value;
2168 }
2169 ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
2170 add_opt(common_arg(
2171 {"-o", "--output", "--output-file"}, "FNAME",
2172 string_format(fmt: "output file (default: '%s')", params.out_file.c_str()),
2173 [](common_params & params, const std::string & value) {
2174 params.out_file = value;
2175 }
2176 ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
2177 add_opt(common_arg(
2178 {"-ofreq", "--output-frequency"}, "N",
2179 string_format(fmt: "output the imatrix every N iterations (default: %d)", params.n_out_freq),
2180 [](common_params & params, int value) {
2181 params.n_out_freq = value;
2182 }
2183 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2184 add_opt(common_arg(
2185 {"--output-format"}, "{gguf,dat}",
2186 string_format(fmt: "output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
2187 [](common_params & params, const std::string & value) {
2188 /**/ if (value == "gguf") { params.imat_dat = -1; }
2189 else if (value == "dat") { params.imat_dat = 1; }
2190 else { throw std::invalid_argument("invalid output format"); }
2191 }
2192 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2193 add_opt(common_arg(
2194 {"--save-frequency"}, "N",
2195 string_format(fmt: "save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
2196 [](common_params & params, int value) {
2197 params.n_save_freq = value;
2198 }
2199 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2200 add_opt(common_arg(
2201 {"--process-output"},
2202 string_format(fmt: "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
2203 [](common_params & params) {
2204 params.process_output = true;
2205 }
2206 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2207 add_opt(common_arg(
2208 {"--no-ppl"},
2209 string_format(fmt: "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
2210 [](common_params & params) {
2211 params.compute_ppl = false;
2212 }
2213 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2214 add_opt(common_arg(
2215 {"--chunk", "--from-chunk"}, "N",
2216 string_format(fmt: "start processing the input from chunk N (default: %d)", params.i_chunk),
2217 [](common_params & params, int value) {
2218 params.i_chunk = value;
2219 }
2220 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2221 add_opt(common_arg(
2222 {"--show-statistics"},
2223 string_format(fmt: "show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
2224 [](common_params & params) {
2225 params.show_statistics = true;
2226 }
2227 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2228 add_opt(common_arg(
2229 {"--parse-special"},
2230 string_format(fmt: "parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
2231 [](common_params & params) {
2232 params.parse_special = true;
2233 }
2234 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2235 add_opt(common_arg(
2236 {"-pps"},
2237 string_format(fmt: "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
2238 [](common_params & params) {
2239 params.is_pp_shared = true;
2240 }
2241 ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2242 add_opt(common_arg(
2243 {"-npp"}, "n0,n1,...",
2244 "number of prompt tokens",
2245 [](common_params & params, const std::string & value) {
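// e.g. -npp 128,256,512 benchmarks prompt lengths of 128, 256 and 512 tokens (illustrative values)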
2246 auto p = string_split<int>(str: value, delim: ',');
2247 params.n_pp.insert(position: params.n_pp.end(), first: p.begin(), last: p.end());
2248 }
2249 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2250 add_opt(common_arg(
2251 {"-ntg"}, "n0,n1,...",
2252 "number of text generation tokens",
2253 [](common_params & params, const std::string & value) {
2254 auto p = string_split<int>(str: value, delim: ',');
2255 params.n_tg.insert(position: params.n_tg.end(), first: p.begin(), last: p.end());
2256 }
2257 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2258 add_opt(common_arg(
2259 {"-npl"}, "n0,n1,...",
2260 "number of parallel prompts",
2261 [](common_params & params, const std::string & value) {
2262 auto p = string_split<int>(str: value, delim: ',');
2263 params.n_pl.insert(position: params.n_pl.end(), first: p.begin(), last: p.end());
2264 }
2265 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2266 add_opt(common_arg(
2267 {"--embd-normalize"}, "N",
2268 string_format(fmt: "normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
2269 [](common_params & params, int value) {
2270 params.embd_normalize = value;
2271 }
2272 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2273 add_opt(common_arg(
2274 {"--embd-output-format"}, "FORMAT",
2275 "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
2276 [](common_params & params, const std::string & value) {
2277 params.embd_out = value;
2278 }
2279 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2280 add_opt(common_arg(
2281 {"--embd-separator"}, "STRING",
2282 "separator of embeddings (default \\n) for example \"<#sep#>\"",
2283 [](common_params & params, const std::string & value) {
2284 params.embd_sep = value;
2285 }
2286 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2287 add_opt(common_arg(
2288 {"--cls-separator"}, "STRING",
2289 "separator of classification sequences (default \\t) for example \"<#seq#>\"",
2290 [](common_params & params, const std::string & value) {
2291 params.cls_sep = value;
2292 }
2293 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2294 add_opt(common_arg(
2295 {"--host"}, "HOST",
2296 string_format(fmt: "ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
2297 [](common_params & params, const std::string & value) {
2298 params.hostname = value;
2299 }
2300 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
2301 add_opt(common_arg(
2302 {"--port"}, "PORT",
2303 string_format(fmt: "port to listen (default: %d)", params.port),
2304 [](common_params & params, int value) {
2305 params.port = value;
2306 }
2307 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
2308 add_opt(common_arg(
2309 {"--path"}, "PATH",
2310 string_format(fmt: "path to serve static files from (default: %s)", params.public_path.c_str()),
2311 [](common_params & params, const std::string & value) {
2312 params.public_path = value;
2313 }
2314 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
2315 add_opt(common_arg(
2316 {"--api-prefix"}, "PREFIX",
2317 string_format(fmt: "prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
2318 [](common_params & params, const std::string & value) {
2319 params.api_prefix = value;
2320 }
2321 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
2322 add_opt(common_arg(
2323 {"--no-webui"},
2324 string_format(fmt: "Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
2325 [](common_params & params) {
2326 params.webui = false;
2327 }
2328 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
2329 add_opt(common_arg(
2330 {"--embedding", "--embeddings"},
2331 string_format(fmt: "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
2332 [](common_params & params) {
2333 params.embedding = true;
2334 }
2335 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
2336 add_opt(common_arg(
2337 {"--reranking", "--rerank"},
2338 string_format(fmt: "enable reranking endpoint on server (default: %s)", "disabled"),
2339 [](common_params & params) {
2340 params.embedding = true;
2341 params.pooling_type = LLAMA_POOLING_TYPE_RANK;
2342 }
2343 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2344 add_opt(common_arg(
2345 {"--api-key"}, "KEY",
2346 "API key to use for authentication (default: none)",
2347 [](common_params & params, const std::string & value) {
2348 params.api_keys.push_back(x: value);
2349 }
2350 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
2351 add_opt(common_arg(
2352 {"--api-key-file"}, "FNAME",
2353 "path to file containing API keys (default: none)",
2354 [](common_params & params, const std::string & value) {
2355 std::ifstream key_file(value);
2356 if (!key_file) {
2357 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
2358 }
2359 std::string key;
2360 while (std::getline(is&: key_file, str&: key)) {
2361 if (!key.empty()) {
2362 params.api_keys.push_back(x: key);
2363 }
2364 }
2365 key_file.close();
2366 }
2367 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2368 add_opt(common_arg(
2369 {"--ssl-key-file"}, "FNAME",
2370 "path to file a PEM-encoded SSL private key",
2371 [](common_params & params, const std::string & value) {
2372 params.ssl_file_key = value;
2373 }
2374 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
2375 add_opt(common_arg(
2376 {"--ssl-cert-file"}, "FNAME",
2377 "path to file a PEM-encoded SSL certificate",
2378 [](common_params & params, const std::string & value) {
2379 params.ssl_file_cert = value;
2380 }
2381 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
2382 add_opt(common_arg(
2383 {"--chat-template-kwargs"}, "STRING",
2384 string_format(fmt: "sets additional params for the json template parser"),
2385 [](common_params & params, const std::string & value) {
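// e.g. --chat-template-kwargs '{"enable_thinking": false}' stores the value as the string "false"; valid key names depend on the template (illustrative)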
2386 auto parsed = json::parse(i: value);
2387 for (const auto & item : parsed.items()) {
2388 params.default_template_kwargs[item.key()] = item.value().dump();
2389 }
2390 }
2391 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
2392 add_opt(common_arg(
2393 {"-to", "--timeout"}, "N",
2394 string_format(fmt: "server read/write timeout in seconds (default: %d)", params.timeout_read),
2395 [](common_params & params, int value) {
2396 params.timeout_read = value;
2397 params.timeout_write = value;
2398 }
2399 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
2400 add_opt(common_arg(
2401 {"--threads-http"}, "N",
2402 string_format(fmt: "number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
2403 [](common_params & params, int value) {
2404 params.n_threads_http = value;
2405 }
2406 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
2407 add_opt(common_arg(
2408 {"--cache-reuse"}, "N",
2409 string_format(
2410 fmt: "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
2411 "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
2412 ),
2413 [](common_params & params, int value) {
2414 params.n_cache_reuse = value;
2415 }
2416 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
2417 add_opt(common_arg(
2418 {"--metrics"},
2419 string_format(fmt: "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
2420 [](common_params & params) {
2421 params.endpoint_metrics = true;
2422 }
2423 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
2424 add_opt(common_arg(
2425 {"--props"},
2426 string_format(fmt: "enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
2427 [](common_params & params) {
2428 params.endpoint_props = true;
2429 }
2430 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
2431 add_opt(common_arg(
2432 {"--slots"},
2433 string_format(fmt: "enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
2434 [](common_params & params) {
2435 params.endpoint_slots = true;
2436 }
2437 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2438 add_opt(common_arg(
2439 {"--no-slots"},
2440 "disables slots monitoring endpoint",
2441 [](common_params & params) {
2442 params.endpoint_slots = false;
2443 }
2444 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
2445 add_opt(common_arg(
2446 {"--slot-save-path"}, "PATH",
2447 "path to save slot kv cache (default: disabled)",
2448 [](common_params & params, const std::string & value) {
2449 params.slot_save_path = value;
2450 // if doesn't end with DIRECTORY_SEPARATOR, add it
2451 if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
2452 params.slot_save_path += DIRECTORY_SEPARATOR;
2453 }
2454 }
2455 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2456 add_opt(common_arg(
2457 {"--jinja"},
2458 "use jinja template for chat (default: disabled)",
2459 [](common_params & params) {
2460 params.use_jinja = true;
2461 }
2462 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
2463 add_opt(common_arg(
2464 {"--reasoning-format"}, "FORMAT",
2465 "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
2466 "- none: leaves thoughts unparsed in `message.content`\n"
2467 "- deepseek: puts thoughts in `message.reasoning_content`\n"
2468 "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
2469 "(default: auto)",
2470 [](common_params & params, const std::string & value) {
2471 params.reasoning_format = common_reasoning_format_from_name(format: value);
2472 }
2473 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
2474 add_opt(common_arg(
2475 {"--reasoning-budget"}, "N",
2476 "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
2477 [](common_params & params, int value) {
2478 if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
2479 params.reasoning_budget = value;
2480 }
2481 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
2482 add_opt(common_arg(
2483 {"--chat-template"}, "JINJA_TEMPLATE",
2484 string_format(
2485 fmt: "set custom jinja chat template (default: template taken from model's metadata)\n"
2486 "if suffix/prefix are specified, template will be disabled\n"
2487 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
2488 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
2489 ),
2490 [](common_params & params, const std::string & value) {
2491 params.chat_template = value;
2492 }
2493 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
2494 add_opt(common_arg(
2495 {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
2496 string_format(
2497 fmt: "set custom jinja chat template file (default: template taken from model's metadata)\n"
2498 "if suffix/prefix are specified, template will be disabled\n"
2499 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
2500 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
2501 ),
2502 [](common_params & params, const std::string & value) {
2503 params.chat_template = read_file(fname: value);
2504 }
2505 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
2506 add_opt(common_arg(
2507 {"--no-prefill-assistant"},
2508 string_format(
2509 fmt: "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
2510 "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
2511 ),
2512 [](common_params & params) {
2513 params.prefill_assistant = false;
2514 }
2515 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
2516 add_opt(common_arg(
2517 {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
2518 string_format(fmt: "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
2519 [](common_params & params, const std::string & value) {
2520 params.slot_prompt_similarity = std::stof(str: value);
2521 }
2522 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2523 add_opt(common_arg(
2524 {"--lora-init-without-apply"},
2525 string_format(fmt: "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
2526 [](common_params & params) {
2527 params.lora_init_without_apply = true;
2528 }
2529 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2530 add_opt(common_arg(
2531 {"--simple-io"},
2532 "use basic IO for better compatibility in subprocesses and limited consoles",
2533 [](common_params & params) {
2534 params.simple_io = true;
2535 }
2536 ).set_examples({LLAMA_EXAMPLE_MAIN}));
2537 add_opt(common_arg(
2538 {"--positive-file"}, "FNAME",
2539 string_format(fmt: "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
2540 [](common_params & params, const std::string & value) {
2541 params.cvector_positive_file = value;
2542 }
2543 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2544 add_opt(common_arg(
2545 {"--negative-file"}, "FNAME",
2546 string_format(fmt: "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
2547 [](common_params & params, const std::string & value) {
2548 params.cvector_negative_file = value;
2549 }
2550 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2551 add_opt(common_arg(
2552 {"--pca-batch"}, "N",
2553 string_format(fmt: "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
2554 [](common_params & params, int value) {
2555 params.n_pca_batch = value;
2556 }
2557 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2558 add_opt(common_arg(
2559 {"--pca-iter"}, "N",
2560 string_format(fmt: "number of iterations used for PCA (default: %d)", params.n_pca_iterations),
2561 [](common_params & params, int value) {
2562 params.n_pca_iterations = value;
2563 }
2564 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2565 add_opt(common_arg(
2566 {"--method"}, "{pca, mean}",
2567 "dimensionality reduction method to be used (default: pca)",
2568 [](common_params & params, const std::string & value) {
2569 /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
2570 else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
2571 else { throw std::invalid_argument("invalid value"); }
2572 }
2573 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2574 add_opt(common_arg(
2575 {"--output-format"}, "{md,jsonl}",
2576 "output format for batched-bench results (default: md)",
2577 [](common_params & params, const std::string & value) {
2578 /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
2579 else if (value == "md") { params.batched_bench_output_jsonl = false; }
2580 else { throw std::invalid_argument("invalid value"); }
2581 }
2582 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2583 add_opt(common_arg(
2584 {"--log-disable"},
2585 "Log disable",
2586 [](common_params &) {
2587 common_log_pause(log: common_log_main());
2588 }
2589 ));
2590 add_opt(common_arg(
2591 {"--log-file"}, "FNAME",
2592 "Log to file",
2593 [](common_params &, const std::string & value) {
2594 common_log_set_file(log: common_log_main(), file: value.c_str());
2595 }
2596 ));
2597 add_opt(common_arg(
2598 {"--log-colors"}, "[on|off|auto]",
2599 "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
2600 "'auto' enables colors when output is to a terminal",
2601 [](common_params &, const std::string & value) {
2602 if (is_truthy(value)) {
2603 common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_ENABLED);
2604 } else if (is_falsey(value)) {
2605 common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_DISABLED);
2606 } else if (is_autoy(value)) {
2607 common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_AUTO);
2608 } else {
2609 throw std::invalid_argument(
2610 string_format(fmt: "error: unkown value for --log-colors: '%s'\n", value.c_str()));
2611 }
2612 }
2613 ).set_env("LLAMA_LOG_COLORS"));
2614 add_opt(common_arg(
2615 {"-v", "--verbose", "--log-verbose"},
2616 "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
2617 [](common_params & params) {
2618 params.verbosity = INT_MAX;
2619 common_log_set_verbosity_thold(INT_MAX);
2620 }
2621 ));
2622 add_opt(common_arg(
2623 {"--offline"},
2624 "Offline mode: forces use of cache, prevents network access",
2625 [](common_params & params) {
2626 params.offline = true;
2627 }
2628 ).set_env("LLAMA_OFFLINE"));
2629 add_opt(common_arg(
2630 {"-lv", "--verbosity", "--log-verbosity"}, "N",
2631 "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
2632 [](common_params & params, int value) {
2633 params.verbosity = value;
2634 common_log_set_verbosity_thold(verbosity: value);
2635 }
2636 ).set_env("LLAMA_LOG_VERBOSITY"));
2637 add_opt(common_arg(
2638 {"--log-prefix"},
2639 "Enable prefix in log messages",
2640 [](common_params &) {
2641 common_log_set_prefix(log: common_log_main(), prefix: true);
2642 }
2643 ).set_env("LLAMA_LOG_PREFIX"));
2644 add_opt(common_arg(
2645 {"--log-timestamps"},
2646 "Enable timestamps in log messages",
2647 [](common_params &) {
2648 common_log_set_timestamps(log: common_log_main(), timestamps: true);
2649 }
2650 ).set_env("LLAMA_LOG_TIMESTAMPS"));
2651
2652 // speculative parameters
2653 add_opt(common_arg(
2654 {"-td", "--threads-draft"}, "N",
2655 "number of threads to use during generation (default: same as --threads)",
2656 [](common_params & params, int value) {
2657 params.speculative.cpuparams.n_threads = value;
2658 if (params.speculative.cpuparams.n_threads <= 0) {
2659 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
2660 }
2661 }
2662 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2663 add_opt(common_arg(
2664 {"-tbd", "--threads-batch-draft"}, "N",
2665 "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
2666 [](common_params & params, int value) {
2667 params.speculative.cpuparams_batch.n_threads = value;
2668 if (params.speculative.cpuparams_batch.n_threads <= 0) {
2669 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
2670 }
2671 }
2672 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2673 add_opt(common_arg(
2674 {"-Cd", "--cpu-mask-draft"}, "M",
2675 "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
2676 [](common_params & params, const std::string & mask) {
2677 params.speculative.cpuparams.mask_valid = true;
2678 if (!parse_cpu_mask(mask, boolmask&: params.speculative.cpuparams.cpumask)) {
2679 throw std::invalid_argument("invalid cpumask");
2680 }
2681 }
2682 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2683 add_opt(common_arg(
2684 {"-Crd", "--cpu-range-draft"}, "lo-hi",
2685 "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
2686 [](common_params & params, const std::string & range) {
2687 params.speculative.cpuparams.mask_valid = true;
2688 if (!parse_cpu_range(range, boolmask&: params.speculative.cpuparams.cpumask)) {
2689 throw std::invalid_argument("invalid range");
2690 }
2691 }
2692 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2693 add_opt(common_arg(
2694 {"--cpu-strict-draft"}, "<0|1>",
2695 "Use strict CPU placement for draft model (default: same as --cpu-strict)",
2696 [](common_params & params, int value) {
2697 params.speculative.cpuparams.strict_cpu = value;
2698 }
2699 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2700 add_opt(common_arg(
2701 {"--prio-draft"}, "N",
2702 string_format(fmt: "set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
2703 [](common_params & params, int prio) {
2704 if (prio < 0 || prio > 3) {
2705 throw std::invalid_argument("invalid value");
2706 }
2707 params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
2708 }
2709 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2710 add_opt(common_arg(
2711 {"--poll-draft"}, "<0|1>",
2712 "Use polling to wait for draft model work (default: same as --poll])",
2713 [](common_params & params, int value) {
2714 params.speculative.cpuparams.poll = value;
2715 }
2716 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2717 add_opt(common_arg(
2718 {"-Cbd", "--cpu-mask-batch-draft"}, "M",
2719 "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
2720 [](common_params & params, const std::string & mask) {
2721 params.speculative.cpuparams_batch.mask_valid = true;
2722 if (!parse_cpu_mask(mask, boolmask&: params.speculative.cpuparams_batch.cpumask)) {
2723 throw std::invalid_argument("invalid cpumask");
2724 }
2725 }
2726 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2727 add_opt(common_arg(
2728 {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
2729 "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
2730 [](common_params & params, const std::string & range) {
2731 params.speculative.cpuparams_batch.mask_valid = true;
2732 if (!parse_cpu_range(range, boolmask&: params.speculative.cpuparams_batch.cpumask)) {
2733 throw std::invalid_argument("invalid cpumask");
2734 }
2735 }
2736 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2737 add_opt(common_arg(
2738 {"--cpu-strict-batch-draft"}, "<0|1>",
2739 "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
2740 [](common_params & params, int value) {
2741 params.speculative.cpuparams_batch.strict_cpu = value;
2742 }
2743 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2744 add_opt(common_arg(
2745 {"--prio-batch-draft"}, "N",
2746 string_format(fmt: "set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
2747 [](common_params & params, int prio) {
2748 if (prio < 0 || prio > 3) {
2749 throw std::invalid_argument("invalid value");
2750 }
2751 params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
2752 }
2753 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2754 add_opt(common_arg(
2755 {"--poll-batch-draft"}, "<0|1>",
2756 "Use polling to wait for draft model work (default: --poll-draft)",
2757 [](common_params & params, int value) {
2758 params.speculative.cpuparams_batch.poll = value;
2759 }
2760 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
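    // The draft-model CPU options above mirror the main-model equivalents (--cpu-mask, --cpu-range,
    // --cpu-strict, --prio, --poll). A hedged usage sketch - file names and mask/range values are
    // illustrative only, not recommendations:
    //
    //   llama-speculative -m target.gguf -md draft.gguf \
    //       -Cd 0x0F -Crbd 4-7 --prio-draft 1 --poll-draft 0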
    add_opt(common_arg(
        {"--draft-max", "--draft", "--draft-n"}, "N",
        string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
        [](common_params & params, int value) {
            params.speculative.n_max = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
    add_opt(common_arg(
        {"--draft-min", "--draft-n-min"}, "N",
        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
        [](common_params & params, int value) {
            params.speculative.n_min = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
    add_opt(common_arg(
        {"--draft-p-split"}, "P",
        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
        [](common_params & params, const std::string & value) {
            params.speculative.p_split = std::stof(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
    add_opt(common_arg(
        {"--draft-p-min"}, "P",
        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
        [](common_params & params, const std::string & value) {
            params.speculative.p_min = std::stof(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
    add_opt(common_arg(
        {"-cd", "--ctx-size-draft"}, "N",
        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
        [](common_params & params, int value) {
            params.speculative.n_ctx = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
    add_opt(common_arg(
        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
        "use --list-devices to see a list of available devices",
        [](common_params & params, const std::string & value) {
            params.speculative.devices = parse_device_list(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
        "number of layers to store in VRAM for the draft model",
        [](common_params & params, int value) {
            params.speculative.n_gpu_layers = value;
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
    add_opt(common_arg(
        {"-md", "--model-draft"}, "FNAME",
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
    add_opt(common_arg(
        {"--spec-replace"}, "TARGET", "DRAFT",
        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
        [](common_params & params, const std::string & tgt, const std::string & dft) {
            params.speculative.replacements.push_back({ tgt, dft });
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
        string_format(
            "KV cache data type for K for the draft model\n"
            "allowed values: %s\n"
            "(default: %s)",
            get_all_kv_cache_types().c_str(),
            ggml_type_name(params.speculative.cache_type_k)
        ),
        [](common_params & params, const std::string & value) {
            params.speculative.cache_type_k = kv_cache_type_from_str(value);
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
    add_opt(common_arg(
        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
        string_format(
            "KV cache data type for V for the draft model\n"
            "allowed values: %s\n"
            "(default: %s)",
            get_all_kv_cache_types().c_str(),
            ggml_type_name(params.speculative.cache_type_v)
        ),
        [](common_params & params, const std::string & value) {
            params.speculative.cache_type_v = kv_cache_type_from_str(value);
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
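    // Hedged usage sketch for the speculative decoding options above (model files and values are
    // placeholders, not tuned recommendations):
    //
    //   llama-server -m target.gguf -md draft.gguf \
    //       --draft-max 16 --draft-min 4 --draft-p-min 0.8 -ngld 99 -cd 4096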

    add_opt(common_arg(
        {"-mv", "--model-vocoder"}, "FNAME",
        "vocoder model for audio generation (default: unused)",
        [](common_params & params, const std::string & value) {
            params.vocoder.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--tts-use-guide-tokens"},
        "Use guide tokens to improve TTS word recall",
        [](common_params & params) {
            params.vocoder.use_guide_tokens = true;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--tts-speaker-file"}, "FNAME",
        "speaker file path for audio generation",
        [](common_params & params, const std::string & value) {
            params.vocoder.speaker_file = value;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));
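    // Hedged usage sketch for the TTS options above (file names are placeholders):
    //
    //   llama-tts -m outetts.gguf -mv wavtokenizer.gguf --tts-use-guide-tokens -p "hello world"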

    add_opt(common_arg(
        {"--diffusion-steps"}, "N",
        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
        [](common_params & params, int value) { params.diffusion.steps = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-visual"},
        string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
        [](common_params & params) { params.diffusion.visual_mode = true; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-eps"}, "F",
        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-algorithm"}, "N",
        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-alg-temp"}, "F",
        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-block-length"}, "N",
        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
        [](common_params & params, int value) { params.diffusion.block_length = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-cfg-scale"}, "F",
        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-add-gumbel-noise"}, "F",
        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
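    // Hedged usage sketch for the diffusion options above (tool name and values are illustrative
    // and may differ by build):
    //
    //   llama-diffusion-cli -m llada-8b.gguf -p "write a haiku" \
    //       --diffusion-steps 128 --diffusion-block-length 32 --diffusion-visual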
    add_opt(common_arg(
        {"-lr", "--learning-rate"}, "ALPHA",
        string_format("adamw or sgd optimizer alpha (default: %.2g); note: for sgd (no momentum) an alpha ~10x larger is recommended", (double) params.lr.lr0),
        [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-lr-min", "--learning-rate-min"}, "ALPHA",
        string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)", (double) params.lr.lr_min),
        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-decay-epochs", "--learning-rate-decay-epochs"}, "N",
        string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
        [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-wd", "--weight-decay"}, "WD",
        string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-val-split", "--val-split"}, "FRACTION",
        string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-epochs", "--epochs"}, "N",
        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
        [](common_params & params, int epochs) { params.lr.epochs = epochs; }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
        [](common_params & params, const std::string & name) {
            params.optimizer = common_opt_get_optimizer(name.c_str());
            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
            }
        }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
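    // Hedged usage sketch for the finetune options above (file names and values are illustrative,
    // not tuned recommendations):
    //
    //   llama-finetune -m base-model.gguf -f train.txt \
    //       -opt sgd -lr 1e-4 -wd 1e-9 -epochs 2 -val-split 0.05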

    // presets
    add_opt(common_arg(
        {"--tts-oute-default"},
        string_format("use default OuteTTS models (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));

    add_opt(common_arg(
        {"--embd-gemma-default"},
        string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
            params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
            params.port = 8011;
            params.n_ubatch = 2048;
            params.n_batch = 2048;
            params.n_parallel = 32;
            params.n_ctx = 2048*params.n_parallel;
            params.verbose_prompt = true;
            params.embedding = true;
        }
    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
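    // Hedged example for the preset above: start an embedding server on port 8011 and query the
    // OpenAI-compatible endpoint (request body is illustrative):
    //
    //   llama-server --embd-gemma-default
    //   curl http://localhost:8011/v1/embeddings -H "Content-Type: application/json" \
    //       -d '{"input": "hello world"}'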

    add_opt(common_arg(
        {"--fim-qwen-1.5b-default"},
        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-3b-default"},
        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-7b-default"},
        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-7b-spec"},
        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-14b-spec"},
        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-30b-default"},
        string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
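    // Hedged example for the FIM presets above: each configures llama-server for fill-in-the-middle
    // completion on port 8012, e.g.
    //
    //   llama-server --fim-qwen-7b-spec
    //
    // and is typically paired with an editor integration (e.g. llama.vim or llama.vscode) pointed at that port.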

    add_opt(common_arg(
        {"--gpt-oss-20b-default"},
        string_format("use gpt-oss-20b (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
            params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
            params.port = 8013;
            params.n_ubatch = 2048;
            params.n_batch = 32768;
            params.n_parallel = 2;
            params.n_ctx = 131072*params.n_parallel;
            params.sampling.temp = 1.0f;
            params.sampling.top_p = 1.0f;
            params.sampling.top_k = 0;
            params.sampling.min_p = 0.01f;
            params.use_jinja = true;
            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--gpt-oss-120b-default"},
        string_format("use gpt-oss-120b (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
            params.port = 8013;
            params.n_ubatch = 2048;
            params.n_batch = 32768;
            params.n_parallel = 2;
            params.n_ctx = 131072*params.n_parallel;
            params.sampling.temp = 1.0f;
            params.sampling.top_p = 1.0f;
            params.sampling.top_k = 0;
            params.sampling.min_p = 0.01f;
            params.use_jinja = true;
            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
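    // Hedged example for the gpt-oss presets above: they serve on port 8013 with the jinja chat
    // template enabled (request body is illustrative):
    //
    //   llama-server --gpt-oss-20b-default
    //   curl http://localhost:8013/v1/chat/completions -H "Content-Type: application/json" \
    //       -d '{"messages": [{"role": "user", "content": "hello"}]}'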

    add_opt(common_arg(
        {"--vision-gemma-4b-default"},
        string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
            params.port = 8014;
            params.n_ctx = 0;
            params.use_jinja = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--vision-gemma-12b-default"},
        string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
            params.port = 8014;
            params.n_ctx = 0;
            params.use_jinja = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
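    // Hedged example for the vision presets above (they serve on port 8014):
    //
    //   llama-server --vision-gemma-4b-default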

    return ctx_arg;
}