#include "arg.h"

#include "chat.h"
#include "common.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "download.h"

// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#    define NOMINMAX
#endif
#include <windows.h>
#endif

#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>

#include <algorithm>
#include <climits>
#include <cstdarg>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
#include <thread> // for hardware_concurrency
#include <vector>

#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
#    if !defined(PATH_MAX)
#    define PATH_MAX MAX_PATH
#    endif
#elif defined(_AIX)
#include <sys/limits.h>
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

using json = nlohmann::ordered_json;

static std::initializer_list<enum llama_example> mmproj_examples = {
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_SERVER,
};

static std::string read_file(const std::string & fname) {
    std::ifstream file(fname);
    if (!file) {
        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
    }
    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    file.close();
    return content;
}

common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
    this->examples = examples;
    return *this;
}

common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
    this->excludes = excludes;
    return *this;
}

common_arg & common_arg::set_env(const char * env) {
    help = help + "\n(env: " + env + ")";
    this->env = env;
    return *this;
}

common_arg & common_arg::set_sparam() {
    is_sparam = true;
    return *this;
}

bool common_arg::in_example(enum llama_example ex) {
    return examples.find(ex) != examples.end();
}

bool common_arg::is_exclude(enum llama_example ex) {
    return excludes.find(ex) != excludes.end();
}

bool common_arg::get_value_from_env(std::string & output) const {
    if (env == nullptr) return false;
    char * value = std::getenv(env);
    if (value) {
        output = value;
        return true;
    }
    return false;
}

bool common_arg::has_value_from_env() const {
    return env != nullptr && std::getenv(env);
}

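// Split a string into lines no longer than max_char_per_line, wrapping long lines on word boundaries.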
static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
    std::vector<std::string> result;
    std::istringstream iss(input);
    std::string line;
    auto add_line = [&](const std::string & l) {
        if (l.length() <= max_char_per_line) {
            result.push_back(l);
        } else {
            std::istringstream line_stream(l);
            std::string word, current_line;
            while (line_stream >> word) {
                if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
                    if (!current_line.empty()) result.push_back(current_line);
                    current_line = word;
                } else {
                    current_line += (!current_line.empty() ? " " : "") + word;
                }
            }
            if (!current_line.empty()) result.push_back(current_line);
        }
    };
    while (std::getline(iss, line)) {
        add_line(line);
    }
    return result;
}

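// Render this option for console output: argument aliases, value hints, and word-wrapped help text.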
std::string common_arg::to_string() {
    // params for printing to console
    const static int n_leading_spaces = 40;
    const static int n_char_per_line_help = 70; // TODO: detect this based on current console
    std::string leading_spaces(n_leading_spaces, ' ');

    std::ostringstream ss;
    for (const auto arg : args) {
        if (arg == args.front()) {
            if (args.size() == 1) {
                ss << arg;
            } else {
                // first arg is usually an abbreviation; pad it so the long forms line up
                auto tmp = std::string(arg) + ", ";
                auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' ');
                ss << tmp << spaces;
            }
        } else {
            ss << arg << (arg != args.back() ? ", " : "");
        }
    }
    if (value_hint) ss << " " << value_hint;
    if (value_hint_2) ss << " " << value_hint_2;
    if (ss.tellp() > n_leading_spaces - 3) {
        // current line is too long, add new line
        ss << "\n" << leading_spaces;
    } else {
        // padding between arg and help, same line
        ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
    }
    const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
    for (const auto & line : help_lines) {
        ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
    }
    return ss.str();
}

//
// utils
//

// Helper function to parse tensor buffer override strings
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        auto * dev = ggml_backend_dev_get(i);
        auto * buft = ggml_backend_dev_buffer_type(dev);
        if (buft) {
            buft_list[ggml_backend_buft_name(buft)] = buft;
        }
    }

    for (const auto & override : string_split<std::string>(value, ',')) {
        std::string::size_type pos = override.find('=');
        if (pos == std::string::npos) {
            throw std::invalid_argument("invalid value");
        }
        std::string tensor_name = override.substr(0, pos);
        std::string buffer_type = override.substr(pos + 1);

        if (buft_list.find(buffer_type) == buft_list.end()) {
            printf("Available buffer types:\n");
            for (const auto & it : buft_list) {
                printf("  %s\n", ggml_backend_buft_name(it.second));
            }
            throw std::invalid_argument("unknown buffer type");
        }
        // keep strings alive and avoid leaking memory by storing them in a static list
        static std::list<std::string> buft_overrides;
        buft_overrides.push_back(tensor_name);
        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
    }
}

struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
};

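// Resolve model.path / model.url from the Docker repo, HF repo or direct URL fields and download the
// model if needed; reports an auto-detected mmproj file when the HF repo provides one.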
static handle_model_result common_params_handle_model(
        struct common_params_model & model,
        const std::string & bearer_token,
        const std::string & model_path_default,
        bool offline) {
    handle_model_result result;
    // handle pre-fill default model path and url based on hf_repo and hf_file
    {
        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
            model.path = common_docker_resolve_model(model.docker_repo);
        } else if (!model.hf_repo.empty()) {
            // short-hand to avoid specifying --hf-file -> default it to --model
            if (model.hf_file.empty()) {
                if (model.path.empty()) {
                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                        exit(1); // built without CURL, error message already printed
                    }
                    model.hf_repo = auto_detected.repo;
                    model.hf_file = auto_detected.ggufFile;
                    if (!auto_detected.mmprojFile.empty()) {
                        result.found_mmproj   = true;
                        result.mmproj.hf_repo = model.hf_repo;
                        result.mmproj.hf_file = auto_detected.mmprojFile;
                    }
                } else {
                    model.hf_file = model.path;
                }
            }

            std::string model_endpoint = get_model_endpoint();
            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
            // make sure model path is present (for caching purposes)
            if (model.path.empty()) {
                // this is to avoid different repo having same file name, or same file name in different subdirs
                std::string filename = model.hf_repo + "_" + model.hf_file;
                // to make sure we don't have any slashes in the filename
                string_replace_all(filename, "/", "_");
                model.path = fs_get_cache_file(filename);
            }

        } else if (!model.url.empty()) {
            if (model.path.empty()) {
                auto f = string_split<std::string>(model.url, '#').front();
                f = string_split<std::string>(f, '?').front();
                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
            }

        } else if (model.path.empty()) {
            model.path = model_path_default;
        }
    }

    // then, download it if needed
    if (!model.url.empty()) {
        bool ok = common_download_model(model, bearer_token, offline);
        if (!ok) {
            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
            exit(1);
        }
    }

    return result;
}

const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_BF16,
    GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
};

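// Map a KV cache type name (e.g. "q8_0") to the corresponding ggml_type; throws on unknown names.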
static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (const auto & type : kv_cache_types) {
        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
    }
    return msg.str();
}

//
// CLI argument parsing functions
//

static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

    std::unordered_map<std::string, common_arg *> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
            arg_to_options[arg] = &opt;
        }
    }

    // handle environment variables
    for (auto & opt : ctx_arg.options) {
        std::string value;
        if (opt.get_value_from_env(value)) {
            try {
                if (opt.handler_void && (value == "1" || value == "true")) {
                    opt.handler_void(params);
                }
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(value));
                }
                if (opt.handler_string) {
                    opt.handler_string(params, value);
                    continue;
                }
            } catch (std::exception & e) {
                throw std::invalid_argument(string_format(
                    "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
            }
        }
    }

    // handle command line arguments
    auto check_arg = [&](int i) {
        if (i+1 >= argc) {
            throw std::invalid_argument("expected value for argument");
        }
    };

    for (int i = 1; i < argc; i++) {
        const std::string arg_prefix = "--";

        std::string arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
        if (arg_to_options.find(arg) == arg_to_options.end()) {
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
        auto opt = *arg_to_options[arg];
        if (opt.has_value_from_env()) {
            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
        }
        try {
            if (opt.handler_void) {
                opt.handler_void(params);
                continue;
            }

            // arg with single value
            check_arg(i);
            std::string val = argv[++i];
            if (opt.handler_int) {
                opt.handler_int(params, std::stoi(val));
                continue;
            }
            if (opt.handler_string) {
                opt.handler_string(params, val);
                continue;
            }

            // arg with 2 values
            check_arg(i);
            std::string val2 = argv[++i];
            if (opt.handler_str_str) {
                opt.handler_str_str(params, val, val2);
                continue;
            }
        } catch (std::exception & e) {
            throw std::invalid_argument(string_format(
                "error while handling argument \"%s\": %s\n\n"
                "usage:\n%s\n\nto show complete usage, run with -h",
                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
        }
    }

    postprocess_cpu_params(params.cpuparams, nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

    postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

    if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

    // handle model and download
    {
        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
        if (params.no_mmproj) {
            params.mmproj = {};
        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
            // optionally, handle mmproj model when -hf is specified
            params.mmproj = res.mmproj;
        }
        // only download mmproj if the current example is using it
        for (auto & ex : mmproj_examples) {
            if (ctx_arg.ex == ex) {
                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                break;
            }
        }
        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
    }

    if (params.escape) {
        string_process_escapes(params.prompt);
        string_process_escapes(params.input_prefix);
        string_process_escapes(params.input_suffix);
        for (auto & antiprompt : params.antiprompt) {
            string_process_escapes(antiprompt);
        }
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
        for (auto & pair : params.speculative.replacements) {
            string_process_escapes(pair.first);
            string_process_escapes(pair.second);
        }
    }

    if (!params.kv_overrides.empty()) {
        params.kv_overrides.emplace_back();
        params.kv_overrides.back().key[0] = 0;
    }

    if (!params.tensor_buft_overrides.empty()) {
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.speculative.tensor_buft_overrides.empty()) {
        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
        throw std::runtime_error(string_format(
            "error: the supplied chat template is not supported: %s%s\n",
            params.chat_template.c_str(),
            params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
        ));
    }

    return true;
}

static void common_params_print_usage(common_params_context & ctx_arg) {
    auto print_options = [](std::vector<common_arg *> & options) {
        for (common_arg * opt : options) {
            printf("%s", opt->to_string().c_str());
        }
    };

    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;
    for (auto & opt : ctx_arg.options) {
        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }
    printf("----- common params -----\n\n");
    print_options(common_options);
    printf("\n\n----- sampling params -----\n\n");
    print_options(sparam_options);
    // TODO: maybe convert enum llama_example to string
    printf("\n\n----- example-specific params -----\n\n");
    print_options(specific_options);
}

static void common_params_print_completion(common_params_context & ctx_arg) {
    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;

    for (auto & opt : ctx_arg.options) {
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }

    printf("_llama_completions() {\n");
    printf("    local cur prev opts\n");
    printf("    COMPREPLY=()\n");
    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");

    printf("    opts=\"");
    auto print_options = [](const std::vector<common_arg *> & options) {
        for (const common_arg * opt : options) {
            for (const char * arg : opt->args) {
                printf("%s ", arg);
            }
        }
    };

    print_options(common_options);
    print_options(sparam_options);
    print_options(specific_options);
    printf("\"\n\n");

    printf("    case \"$prev\" in\n");
    printf("        --model|-m)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --grammar-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --chat-template-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        *)\n");
    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("    esac\n");
    printf("}\n\n");

    std::set<std::string> executables = {
        "llama-batched",
        "llama-batched-bench",
        "llama-bench",
        "llama-cli",
        "llama-convert-llama2c-to-ggml",
        "llama-cvector-generator",
        "llama-embedding",
        "llama-eval-callback",
        "llama-export-lora",
        "llama-gen-docs",
        "llama-gguf",
        "llama-gguf-hash",
        "llama-gguf-split",
        "llama-gritlm",
        "llama-imatrix",
        "llama-infill",
        "llama-mtmd-cli",
        "llama-llava-clip-quantize-cli",
        "llama-lookahead",
        "llama-lookup",
        "llama-lookup-create",
        "llama-lookup-merge",
        "llama-lookup-stats",
        "llama-parallel",
        "llama-passkey",
        "llama-perplexity",
        "llama-q8dot",
        "llama-quantize",
        "llama-qwen2vl-cli",
        "llama-retrieval",
        "llama-run",
        "llama-save-load-state",
        "llama-server",
        "llama-simple",
        "llama-simple-chat",
        "llama-speculative",
        "llama-speculative-simple",
        "llama-tokenize",
        "llama-tts",
        "llama-vdot"
    };

    for (const auto & exe : executables) {
        printf("complete -F _llama_completions %s\n", exe.c_str());
    }
}

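// Parse a comma-separated list of backend device names into a null-terminated device list;
// the special value "none" selects no devices.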
static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    auto dev_names = string_split<std::string>(value, ',');
    if (dev_names.empty()) {
        throw std::invalid_argument("no devices specified");
    }
    if (dev_names.size() == 1 && dev_names[0] == "none") {
        devices.push_back(nullptr);
    } else {
        for (const auto & device : dev_names) {
            auto * dev = ggml_backend_dev_by_name(device.c_str());
            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
            }
            devices.push_back(dev);
        }
        devices.push_back(nullptr);
    }
    return devices;
}

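// Register an RPC backend device for each endpoint in the comma-separated server list.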
static void add_rpc_devices(const std::string & servers) {
    auto rpc_servers = string_split<std::string>(servers, ',');
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
    }
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
    }
    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
    if (!ggml_backend_rpc_add_server_fn) {
        throw std::invalid_argument("failed to find RPC add server function");
    }
    for (const auto & server : rpc_servers) {
        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
        ggml_backend_register(reg);
    }
}

bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

    try {
        if (!common_params_parse_ex(argc, argv, ctx_arg)) {
            ctx_arg.params = params_org;
            return false;
        }
        if (ctx_arg.params.usage) {
            common_params_print_usage(ctx_arg);
            if (ctx_arg.print_usage) {
                ctx_arg.print_usage(argc, argv);
            }
            exit(0);
        }
        if (ctx_arg.params.completion) {
            common_params_print_completion(ctx_arg);
            exit(0);
        }
        params.lr.init();
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
        return false;
    } catch (std::exception & ex) {
        fprintf(stderr, "%s\n", ex.what());
        exit(1); // for other exceptions, we exit with status code 1
    }

    return true;
}

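// Build a comma-separated list of the built-in chat template names.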
static std::string list_builtin_chat_templates() {
    std::vector<const char *> supported_tmpl;
    int32_t res = llama_chat_builtin_templates(nullptr, 0);
    supported_tmpl.resize(res);
    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
    std::ostringstream msg;
    for (auto & tmpl : supported_tmpl) {
        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
    }
    return msg.str();
}

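// Helpers for parsing tri-state CLI values such as --flash-attn on|off|auto.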
static bool is_truthy(const std::string & value) {
    return value == "on" || value == "enabled" || value == "1";
}

static bool is_falsey(const std::string & value) {
    return value == "off" || value == "disabled" || value == "0";
}

static bool is_autoy(const std::string & value) {
    return value == "auto" || value == "-1";
}

common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    // load dynamic backends
    ggml_backend_load_all();

    common_params_context ctx_arg(params);
    ctx_arg.print_usage = print_usage;
    ctx_arg.ex          = ex;

    std::string sampler_type_chars;
    std::string sampler_type_names;
    for (const auto & sampler : params.sampling.samplers) {
        sampler_type_chars += common_sampler_type_to_chr(sampler);
        sampler_type_names += common_sampler_type_to_str(sampler) + ";";
    }
    sampler_type_names.pop_back();


    /**
     * filter options by example
     * rules:
     * - all examples inherit options from LLAMA_EXAMPLE_COMMON
     * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*} are set, we will prioritize the LLAMA_EXAMPLE_* matching the current example
     */
    auto add_opt = [&](common_arg arg) {
        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };


    add_opt(common_arg(
        {"-h", "--help", "--usage"},
        "print usage and exit",
        [](common_params & params) {
            params.usage = true;
        }
    ));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
        [](common_params &) {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        }
    ));
    add_opt(common_arg(
        {"--completion-bash"},
        "print source-able bash completion script for llama.cpp",
        [](common_params & params) {
            params.completion = true;
        }
    ));
    add_opt(common_arg(
        {"--verbose-prompt"},
        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
        [](common_params & params) {
            params.verbose_prompt = true;
        }
    ));
    add_opt(common_arg(
        {"--no-display-prompt"},
        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
        [](common_params & params) {
            params.display_prompt = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-co", "--color"},
        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
        [](common_params & params) {
            params.use_color = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-t", "--threads"}, "N",
        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
        [](common_params & params, int value) {
            params.cpuparams.n_threads = value;
            if (params.cpuparams.n_threads <= 0) {
                params.cpuparams.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_env("LLAMA_ARG_THREADS"));
    add_opt(common_arg(
        {"-tb", "--threads-batch"}, "N",
        "number of threads to use during batch and prompt processing (default: same as --threads)",
        [](common_params & params, int value) {
            params.cpuparams_batch.n_threads = value;
            if (params.cpuparams_batch.n_threads <= 0) {
                params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
            }
        }
    ));
    add_opt(common_arg(
        {"-C", "--cpu-mask"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
        [](common_params & params, const std::string & mask) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Cr", "--cpu-range"}, "lo-hi",
        "range of CPUs for affinity. Complements --cpu-mask",
        [](common_params & params, const std::string & range) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict"}, "<0|1>",
        string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
        [](common_params & params, const std::string & value) {
            params.cpuparams.strict_cpu = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"--prio"}, "N",
        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
        [](common_params & params, int prio) {
            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll"}, "<0...100>",
        string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
        [](common_params & params, const std::string & value) {
            params.cpuparams.poll = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"-Cb", "--cpu-mask-batch"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
        [](common_params & params, const std::string & mask) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Crb", "--cpu-range-batch"}, "lo-hi",
        "ranges of CPUs for affinity. Complements --cpu-mask-batch",
        [](common_params & params, const std::string & range) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict-batch"}, "<0|1>",
        "use strict CPU placement (default: same as --cpu-strict)",
        [](common_params & params, int value) {
            params.cpuparams_batch.strict_cpu = value;
        }
    ));
    add_opt(common_arg(
        {"--prio-batch"}, "N",
        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
        [](common_params & params, int prio) {
            if (prio < 0 || prio > 3) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll-batch"}, "<0|1>",
        "use polling to wait for work (default: same as --poll)",
        [](common_params & params, int value) {
            params.cpuparams_batch.poll = value;
        }
    ));
    add_opt(common_arg(
        {"-lcs", "--lookup-cache-static"}, "FNAME",
        "path to static lookup cache to use for lookup decoding (not updated by generation)",
        [](common_params & params, const std::string & value) {
            params.lookup_cache_static = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
        "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
        [](common_params & params, const std::string & value) {
            params.lookup_cache_dynamic = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-c", "--ctx-size"}, "N",
        string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
        [](common_params & params, int value) {
            params.n_ctx = value;
        }
    ).set_env("LLAMA_ARG_CTX_SIZE"));
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        string_format(
            ex == LLAMA_EXAMPLE_MAIN
                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                : "number of tokens to predict (default: %d, -1 = infinity)",
            params.n_predict),
        [](common_params & params, int value) {
            params.n_predict = value;
        }
    ).set_env("LLAMA_ARG_N_PREDICT"));
    add_opt(common_arg(
        {"-b", "--batch-size"}, "N",
        string_format("logical maximum batch size (default: %d)", params.n_batch),
        [](common_params & params, int value) {
            params.n_batch = value;
        }
    ).set_env("LLAMA_ARG_BATCH"));
    add_opt(common_arg(
        {"-ub", "--ubatch-size"}, "N",
        string_format("physical maximum batch size (default: %d)", params.n_ubatch),
        [](common_params & params, int value) {
            params.n_ubatch = value;
        }
    ).set_env("LLAMA_ARG_UBATCH"));
    add_opt(common_arg(
        {"--keep"}, "N",
        string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
        [](common_params & params, int value) {
            params.n_keep = value;
        }
    ));
    add_opt(common_arg(
        {"--swa-full"},
        string_format("use full-size SWA cache (default: %s)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
        [](common_params & params) {
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
        string_format("max number of context checkpoints to create per slot (default: %d)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cache-ram", "-cram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
        [](common_params & params, int value) {
            params.cache_ram_mib = value;
        }
    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
        [](common_params & params) {
            params.kv_unified = true;
        }
    ).set_env("LLAMA_ARG_KV_SPLIT"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
        [](common_params & params) {
            params.ctx_shift = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--context-shift"},
        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
        [](common_params & params) {
            params.ctx_shift = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
        [](common_params & params, int value) {
            params.n_chunks = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
                      llama_flash_attn_type_name(params.flash_attn_type)),
        [](common_params & params, const std::string & value) {
            if (is_truthy(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
            } else if (is_falsey(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
            } else if (is_autoy(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
            } else {
                throw std::runtime_error(
                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
            }
        }).set_env("LLAMA_ARG_FLASH_ATTN"));
1000 add_opt(common_arg(
1001 {"-p", "--prompt"}, "PROMPT",
1002 "prompt to start generation with; for system message, use -sys",
1003 [](common_params & params, const std::string & value) {
1004 params.prompt = value;
1005 }
1006 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1007 add_opt(common_arg(
1008 {"-sys", "--system-prompt"}, "PROMPT",
1009 "system prompt to use with model (if applicable, depending on chat template)",
1010 [](common_params & params, const std::string & value) {
1011 params.system_prompt = value;
1012 }
1013 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
1014 add_opt(common_arg(
1015 {"--no-perf"},
1016 string_format(fmt: "disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
1017 [](common_params & params) {
1018 params.no_perf = true;
1019 params.sampling.no_perf = true;
1020 }
1021 ).set_env("LLAMA_ARG_NO_PERF"));
1022 add_opt(common_arg(
1023 {"-f", "--file"}, "FNAME",
1024 "a file containing the prompt (default: none)",
1025 [](common_params & params, const std::string & value) {
1026 params.prompt = read_file(fname: value);
1027 // store the external file name in params
1028 params.prompt_file = value;
1029 if (!params.prompt.empty() && params.prompt.back() == '\n') {
1030 params.prompt.pop_back();
1031 }
1032 }
1033 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1034 add_opt(common_arg(
1035 {"-sysf", "--system-prompt-file"}, "FNAME",
1036 "a file containing the system prompt (default: none)",
1037 [](common_params & params, const std::string & value) {
1038 params.system_prompt = read_file(fname: value);
1039 if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
1040 params.system_prompt.pop_back();
1041 }
1042 }
1043 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
1044 add_opt(common_arg(
1045 {"--in-file"}, "FNAME",
1046 "an input file (repeat to specify multiple files)",
1047 [](common_params & params, const std::string & value) {
1048 std::ifstream file(value);
1049 if (!file) {
1050 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
1051 }
1052 params.in_files.push_back(x: value);
1053 }
1054 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1055 add_opt(common_arg(
1056 {"-bf", "--binary-file"}, "FNAME",
1057 "binary file containing the prompt (default: none)",
1058 [](common_params & params, const std::string & value) {
1059 std::ifstream file(value, std::ios::binary);
1060 if (!file) {
1061 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
1062 }
1063 // store the external file name in params
1064 params.prompt_file = value;
1065 std::ostringstream ss;
1066 ss << file.rdbuf();
1067 params.prompt = ss.str();
1068 fprintf(stderr, format: "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
1069 }
1070 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1071 add_opt(common_arg(
1072 {"-e", "--escape"},
1073 string_format(fmt: "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
1074 [](common_params & params) {
1075 params.escape = true;
1076 }
1077 ));
1078 add_opt(common_arg(
1079 {"--no-escape"},
1080 "do not process escape sequences",
1081 [](common_params & params) {
1082 params.escape = false;
1083 }
1084 ));
1085 add_opt(common_arg(
1086 {"-ptc", "--print-token-count"}, "N",
1087 string_format(fmt: "print token count every N tokens (default: %d)", params.n_print),
1088 [](common_params & params, int value) {
1089 params.n_print = value;
1090 }
1091 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1092 add_opt(common_arg(
1093 {"--prompt-cache"}, "FNAME",
1094 "file to cache prompt state for faster startup (default: none)",
1095 [](common_params & params, const std::string & value) {
1096 params.path_prompt_cache = value;
1097 }
1098 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1099 add_opt(common_arg(
1100 {"--prompt-cache-all"},
1101 "if specified, saves user input and generations to cache as well\n",
1102 [](common_params & params) {
1103 params.prompt_cache_all = true;
1104 }
1105 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1106 add_opt(common_arg(
1107 {"--prompt-cache-ro"},
1108 "if specified, uses the prompt cache but does not update it",
1109 [](common_params & params) {
1110 params.prompt_cache_ro = true;
1111 }
1112 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1113 add_opt(common_arg(
1114 {"-r", "--reverse-prompt"}, "PROMPT",
1115 "halt generation at PROMPT, return control in interactive mode\n",
1116 [](common_params & params, const std::string & value) {
1117 params.antiprompt.emplace_back(args: value);
1118 }
1119 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
1120 add_opt(common_arg(
1121 {"-sp", "--special"},
1122 string_format(fmt: "special tokens output enabled (default: %s)", params.special ? "true" : "false"),
1123 [](common_params & params) {
1124 params.special = true;
1125 }
1126 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
1127 add_opt(common_arg(
1128 {"-cnv", "--conversation"},
1129 "run in conversation mode:\n"
1130 "- does not print special tokens and suffix/prefix\n"
1131 "- interactive mode is also enabled\n"
1132 "(default: auto enabled if chat template is available)",
1133 [](common_params & params) {
1134 params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
1135 }
1136 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1137 add_opt(common_arg(
1138 {"-no-cnv", "--no-conversation"},
1139 "force disable conversation mode (default: false)",
1140 [](common_params & params) {
1141 params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
1142 }
1143 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1144 add_opt(common_arg(
1145 {"-st", "--single-turn"},
1146 "run conversation for a single turn only, then exit when done\n"
1147 "will not be interactive if first turn is predefined with --prompt\n"
1148 "(default: false)",
1149 [](common_params & params) {
1150 params.single_turn = true;
1151 }
1152 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1153 add_opt(common_arg(
1154 {"-i", "--interactive"},
1155 string_format(fmt: "run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
1156 [](common_params & params) {
1157 params.interactive = true;
1158 }
1159 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1160 add_opt(common_arg(
1161 {"-if", "--interactive-first"},
1162 string_format(fmt: "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
1163 [](common_params & params) {
1164 params.interactive_first = true;
1165 }
1166 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1167 add_opt(common_arg(
1168 {"-mli", "--multiline-input"},
1169 "allows you to write or paste multiple lines without ending each in '\\'",
1170 [](common_params & params) {
1171 params.multiline_input = true;
1172 }
1173 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1174 add_opt(common_arg(
1175 {"--in-prefix-bos"},
1176 "prefix BOS to user inputs, preceding the `--in-prefix` string",
1177 [](common_params & params) {
1178 params.input_prefix_bos = true;
1179 params.enable_chat_template = false;
1180 }
1181 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1182 add_opt(common_arg(
1183 {"--in-prefix"}, "STRING",
1184 "string to prefix user inputs with (default: empty)",
1185 [](common_params & params, const std::string & value) {
1186 params.input_prefix = value;
1187 params.enable_chat_template = false;
1188 }
1189 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1190 add_opt(common_arg(
1191 {"--in-suffix"}, "STRING",
1192 "string to suffix after user inputs with (default: empty)",
1193 [](common_params & params, const std::string & value) {
1194 params.input_suffix = value;
1195 params.enable_chat_template = false;
1196 }
1197 ).set_examples({LLAMA_EXAMPLE_MAIN}));
1198 add_opt(common_arg(
1199 {"--no-warmup"},
1200 "skip warming up the model with an empty run",
1201 [](common_params & params) {
1202 params.warmup = false;
1203 }
1204 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
1205 add_opt(common_arg(
1206 {"--spm-infill"},
1207 string_format(
1208 fmt: "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
1209 params.spm_infill ? "enabled" : "disabled"
1210 ),
1211 [](common_params & params) {
1212 params.spm_infill = true;
1213 }
1214 ).set_examples({LLAMA_EXAMPLE_SERVER}));
1215 add_opt(common_arg(
1216 {"--samplers"}, "SAMPLERS",
1217 string_format(fmt: "samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
1218 [](common_params & params, const std::string & value) {
1219 const auto sampler_names = string_split<std::string>(input: value, separator: ';');
1220 params.sampling.samplers = common_sampler_types_from_names(names: sampler_names, allow_alt_names: true);
1221 }
1222 ).set_sparam());
1223 add_opt(common_arg(
1224 {"-s", "--seed"}, "SEED",
1225 string_format(fmt: "RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
1226 [](common_params & params, const std::string & value) {
1227 params.sampling.seed = std::stoul(str: value);
1228 }
1229 ).set_sparam());
1230 add_opt(common_arg(
1231 {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
1232 string_format(fmt: "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
1233 [](common_params & params, const std::string & value) {
1234 params.sampling.samplers = common_sampler_types_from_chars(chars: value);
1235 }
1236 ).set_sparam());
1237 add_opt(common_arg(
1238 {"--ignore-eos"},
1239 "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
1240 [](common_params & params) {
1241 params.sampling.ignore_eos = true;
1242 }
1243 ).set_sparam());
1244 add_opt(common_arg(
1245 {"--temp"}, "N",
1246 string_format(fmt: "temperature (default: %.1f)", (double)params.sampling.temp),
1247 [](common_params & params, const std::string & value) {
1248 params.sampling.temp = std::stof(str: value);
1249 params.sampling.temp = std::max(a: params.sampling.temp, b: 0.0f);
1250 }
1251 ).set_sparam());
1252 add_opt(common_arg(
1253 {"--top-k"}, "N",
1254 string_format(fmt: "top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
1255 [](common_params & params, int value) {
1256 params.sampling.top_k = value;
1257 }
1258 ).set_sparam());
1259 add_opt(common_arg(
1260 {"--top-p"}, "N",
1261 string_format(fmt: "top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
1262 [](common_params & params, const std::string & value) {
1263 params.sampling.top_p = std::stof(str: value);
1264 }
1265 ).set_sparam());
1266 add_opt(common_arg(
1267 {"--min-p"}, "N",
1268 string_format(fmt: "min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
1269 [](common_params & params, const std::string & value) {
1270 params.sampling.min_p = std::stof(str: value);
1271 }
1272 ).set_sparam());
1273 add_opt(common_arg(
1274 {"--top-nsigma"}, "N",
1275 string_format(fmt: "top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
1276 [](common_params & params, const std::string & value) {
1277 params.sampling.top_n_sigma = std::stof(str: value);
1278 }
1279 ).set_sparam());
1280 add_opt(common_arg(
1281 {"--xtc-probability"}, "N",
1282 string_format(fmt: "xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
1283 [](common_params & params, const std::string & value) {
1284 params.sampling.xtc_probability = std::stof(str: value);
1285 }
1286 ).set_sparam());
1287 add_opt(common_arg(
1288 {"--xtc-threshold"}, "N",
1289 string_format(fmt: "xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
1290 [](common_params & params, const std::string & value) {
1291 params.sampling.xtc_threshold = std::stof(str: value);
1292 }
1293 ).set_sparam());
1294 add_opt(common_arg(
1295 {"--typical"}, "N",
1296 string_format(fmt: "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
1297 [](common_params & params, const std::string & value) {
1298 params.sampling.typ_p = std::stof(str: value);
1299 }
1300 ).set_sparam());
1301 add_opt(common_arg(
1302 {"--repeat-last-n"}, "N",
1303 string_format(fmt: "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
1304 [](common_params & params, int value) {
1305 if (value < -1) {
1306 throw std::runtime_error(string_format(fmt: "error: invalid repeat-last-n = %d\n", value));
1307 }
1308 params.sampling.penalty_last_n = value;
1309 params.sampling.n_prev = std::max(a: params.sampling.n_prev, b: params.sampling.penalty_last_n);
1310 }
1311 ).set_sparam());
1312 add_opt(common_arg(
1313 {"--repeat-penalty"}, "N",
1314 string_format(fmt: "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
1315 [](common_params & params, const std::string & value) {
1316 params.sampling.penalty_repeat = std::stof(str: value);
1317 }
1318 ).set_sparam());
1319 add_opt(common_arg(
1320 {"--presence-penalty"}, "N",
1321 string_format(fmt: "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
1322 [](common_params & params, const std::string & value) {
1323 params.sampling.penalty_present = std::stof(str: value);
1324 }
1325 ).set_sparam());
1326 add_opt(common_arg(
1327 {"--frequency-penalty"}, "N",
1328 string_format(fmt: "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
1329 [](common_params & params, const std::string & value) {
1330 params.sampling.penalty_freq = std::stof(str: value);
1331 }
1332 ).set_sparam());
1333 add_opt(common_arg(
1334 {"--dry-multiplier"}, "N",
1335 string_format(fmt: "set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
1336 [](common_params & params, const std::string & value) {
1337 params.sampling.dry_multiplier = std::stof(str: value);
1338 }
1339 ).set_sparam());
1340 add_opt(common_arg(
1341 {"--dry-base"}, "N",
1342 string_format(fmt: "set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
1343 [](common_params & params, const std::string & value) {
1344 float potential_base = std::stof(str: value);
1345 if (potential_base >= 1.0f)
1346 {
1347 params.sampling.dry_base = potential_base;
1348 }
1349 }
1350 ).set_sparam());
1351 add_opt(common_arg(
1352 {"--dry-allowed-length"}, "N",
1353 string_format(fmt: "set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
1354 [](common_params & params, int value) {
1355 params.sampling.dry_allowed_length = value;
1356 }
1357 ).set_sparam());
1358 add_opt(common_arg(
1359 {"--dry-penalty-last-n"}, "N",
1360 string_format(fmt: "set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
1361 [](common_params & params, int value) {
1362 if (value < -1) {
1363 throw std::runtime_error(string_format(fmt: "error: invalid dry-penalty-last-n = %d\n", value));
1364 }
1365 params.sampling.dry_penalty_last_n = value;
1366 }
1367 ).set_sparam());
1368 add_opt(common_arg(
1369 {"--dry-sequence-breaker"}, "STRING",
1370 string_format(fmt: "add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
1371 params.sampling.dry_sequence_breakers.empty() ? "none" :
1372 std::accumulate(first: std::next(x: params.sampling.dry_sequence_breakers.begin()),
1373 last: params.sampling.dry_sequence_breakers.end(),
1374 init: std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
1375 binary_op: [](const std::string& a, const std::string& b) {
1376 std::string formatted_b = (b == "\n") ? "\\n" : b;
1377 return a + ", '" + formatted_b + "'";
1378 }).c_str()),
1379 [](common_params & params, const std::string & value) {
1380 static bool defaults_cleared = false;
1381
1382 if (!defaults_cleared) {
1383 params.sampling.dry_sequence_breakers.clear();
1384 defaults_cleared = true;
1385 }
1386
1387 if (value == "none") {
1388 params.sampling.dry_sequence_breakers.clear();
1389 } else {
1390 params.sampling.dry_sequence_breakers.emplace_back(args: value);
1391 }
1392 }
1393 ).set_sparam());
1394 add_opt(common_arg(
1395 {"--dynatemp-range"}, "N",
1396 string_format(fmt: "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
1397 [](common_params & params, const std::string & value) {
1398 params.sampling.dynatemp_range = std::stof(str: value);
1399 }
1400 ).set_sparam());
1401 add_opt(common_arg(
1402 {"--dynatemp-exp"}, "N",
1403 string_format(fmt: "dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
1404 [](common_params & params, const std::string & value) {
1405 params.sampling.dynatemp_exponent = std::stof(str: value);
1406 }
1407 ).set_sparam());
1408 add_opt(common_arg(
1409 {"--mirostat"}, "N",
1410 string_format(fmt: "use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
1411 "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
1412 [](common_params & params, int value) {
1413 params.sampling.mirostat = value;
1414 }
1415 ).set_sparam());
1416 add_opt(common_arg(
1417 {"--mirostat-lr"}, "N",
1418 string_format(fmt: "Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
1419 [](common_params & params, const std::string & value) {
1420 params.sampling.mirostat_eta = std::stof(str: value);
1421 }
1422 ).set_sparam());
1423 add_opt(common_arg(
1424 {"--mirostat-ent"}, "N",
1425 string_format(fmt: "Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
1426 [](common_params & params, const std::string & value) {
1427 params.sampling.mirostat_tau = std::stof(str: value);
1428 }
1429 ).set_sparam());
1430 add_opt(common_arg(
1431 {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
1432 "modifies the likelihood of token appearing in the completion,\n"
1433 "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
1434 "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
1435 [](common_params & params, const std::string & value) {
1436 std::stringstream ss(value);
1437 llama_token key;
1438 char sign;
1439 std::string value_str;
1440 try {
1441 if (ss >> key && ss >> sign && std::getline(is&: ss, str&: value_str) && (sign == '+' || sign == '-')) {
1442 const float bias = std::stof(str: value_str) * ((sign == '-') ? -1.0f : 1.0f);
1443 params.sampling.logit_bias.push_back(x: {.token: key, .bias: bias});
1444 } else {
1445 throw std::invalid_argument("invalid input format");
1446 }
1447 } catch (const std::exception&) {
1448 throw std::invalid_argument("invalid input format");
1449 }
1450 }
1451 ).set_sparam());
1452 add_opt(common_arg(
1453 {"--grammar"}, "GRAMMAR",
1454 string_format(fmt: "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
1455 [](common_params & params, const std::string & value) {
1456 params.sampling.grammar = value;
1457 }
1458 ).set_sparam());
1459 add_opt(common_arg(
1460 {"--grammar-file"}, "FNAME",
1461 "file to read grammar from",
1462 [](common_params & params, const std::string & value) {
1463 params.sampling.grammar = read_file(fname: value);
1464 }
1465 ).set_sparam());
1466 add_opt(common_arg(
1467 {"-j", "--json-schema"}, "SCHEMA",
1468 "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
1469 [](common_params & params, const std::string & value) {
1470 params.sampling.grammar = json_schema_to_grammar(schema: json::parse(i: value));
1471 }
1472 ).set_sparam());
1473 add_opt(common_arg(
1474 {"-jf", "--json-schema-file"}, "FILE",
1475 "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
1476 [](common_params & params, const std::string & value) {
1477 std::ifstream file(value);
1478 if (!file) {
1479 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
1480 }
1481 std::string schema;
1482 std::copy(
1483 first: std::istreambuf_iterator<char>(file),
1484 last: std::istreambuf_iterator<char>(),
1485 result: std::back_inserter(x&: schema)
1486 );
1487 params.sampling.grammar = json_schema_to_grammar(schema: json::parse(i&: schema));
1488 }
1489 ).set_sparam());
1490 add_opt(common_arg(
1491 {"--pooling"}, "{none,mean,cls,last,rank}",
1492 "pooling type for embeddings, use model default if unspecified",
1493 [](common_params & params, const std::string & value) {
1494 /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
1495 else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
1496 else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
1497 else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
1498 else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
1499 else { throw std::invalid_argument("invalid value"); }
1500 }
1501 ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
1502 add_opt(common_arg(
1503 {"--attention"}, "{causal,non-causal}",
1504 "attention type for embeddings, use model default if unspecified",
1505 [](common_params & params, const std::string & value) {
1506 /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
1507 else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
1508 else { throw std::invalid_argument("invalid value"); }
1509 }
1510 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
1511 add_opt(common_arg(
1512 {"--rope-scaling"}, "{none,linear,yarn}",
1513 "RoPE frequency scaling method, defaults to linear unless specified by the model",
1514 [](common_params & params, const std::string & value) {
1515 /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
1516 else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
1517 else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
1518 else { throw std::invalid_argument("invalid value"); }
1519 }
1520 ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
1521 add_opt(common_arg(
1522 {"--rope-scale"}, "N",
1523 "RoPE context scaling factor, expands context by a factor of N",
1524 [](common_params & params, const std::string & value) {
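// the value is stored as its reciprocal: e.g. --rope-scale 4 sets rope_freq_scale = 0.25 (equivalent to --rope-freq-scale 0.25)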
1525 params.rope_freq_scale = 1.0f / std::stof(str: value);
1526 }
1527 ).set_env("LLAMA_ARG_ROPE_SCALE"));
1528 add_opt(common_arg(
1529 {"--rope-freq-base"}, "N",
1530 "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
1531 [](common_params & params, const std::string & value) {
1532 params.rope_freq_base = std::stof(str: value);
1533 }
1534 ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
1535 add_opt(common_arg(
1536 {"--rope-freq-scale"}, "N",
1537 "RoPE frequency scaling factor, expands context by a factor of 1/N",
1538 [](common_params & params, const std::string & value) {
1539 params.rope_freq_scale = std::stof(str: value);
1540 }
1541 ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
1542 add_opt(common_arg(
1543 {"--yarn-orig-ctx"}, "N",
1544 string_format(fmt: "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
1545 [](common_params & params, int value) {
1546 params.yarn_orig_ctx = value;
1547 }
1548 ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
1549 add_opt(common_arg(
1550 {"--yarn-ext-factor"}, "N",
1551 string_format(fmt: "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
1552 [](common_params & params, const std::string & value) {
1553 params.yarn_ext_factor = std::stof(str: value);
1554 }
1555 ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
1556 add_opt(common_arg(
1557 {"--yarn-attn-factor"}, "N",
1558 string_format(fmt: "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
1559 [](common_params & params, const std::string & value) {
1560 params.yarn_attn_factor = std::stof(str: value);
1561 }
1562 ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
1563 add_opt(common_arg(
1564 {"--yarn-beta-slow"}, "N",
1565 string_format(fmt: "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
1566 [](common_params & params, const std::string & value) {
1567 params.yarn_beta_slow = std::stof(str: value);
1568 }
1569 ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
1570 add_opt(common_arg(
1571 {"--yarn-beta-fast"}, "N",
1572 string_format(fmt: "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
1573 [](common_params & params, const std::string & value) {
1574 params.yarn_beta_fast = std::stof(str: value);
1575 }
1576 ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
1577 add_opt(common_arg(
1578 {"-gan", "--grp-attn-n"}, "N",
1579 string_format(fmt: "group-attention factor (default: %d)", params.grp_attn_n),
1580 [](common_params & params, int value) {
1581 params.grp_attn_n = value;
1582 }
1583 ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
1584 add_opt(common_arg(
1585 {"-gaw", "--grp-attn-w"}, "N",
1586 string_format(fmt: "group-attention width (default: %d)", params.grp_attn_w),
1587 [](common_params & params, int value) {
1588 params.grp_attn_w = value;
1589 }
1590 ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
1591 add_opt(common_arg(
1592 {"-nkvo", "--no-kv-offload"},
1593 "disable KV offload",
1594 [](common_params & params) {
1595 params.no_kv_offload = true;
1596 }
1597 ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
1598 add_opt(common_arg(
1599 {"-nr", "--no-repack"},
1600 "disable weight repacking",
1601 [](common_params & params) {
1602 params.no_extra_bufts = true;
1603 }
1604 ).set_env("LLAMA_ARG_NO_REPACK"));
1605 add_opt(common_arg(
1606 {"--no-host"},
1607 "bypass host buffer allowing extra buffers to be used",
1608 [](common_params & params) {
1609 params.no_host = true;
1610 }
1611 ).set_env("LLAMA_ARG_NO_HOST"));
1612 add_opt(common_arg(
1613 {"-ctk", "--cache-type-k"}, "TYPE",
1614 string_format(
1615 fmt: "KV cache data type for K\n"
1616 "allowed values: %s\n"
1617 "(default: %s)",
1618 get_all_kv_cache_types().c_str(),
1619 ggml_type_name(type: params.cache_type_k)
1620 ),
1621 [](common_params & params, const std::string & value) {
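// the string is resolved via kv_cache_type_from_str(); e.g. "f16" or "q8_0" are typical choices (see the allowed-values list in the help for the definitive set)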
1622 params.cache_type_k = kv_cache_type_from_str(s: value);
1623 }
1624 ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
1625 add_opt(common_arg(
1626 {"-ctv", "--cache-type-v"}, "TYPE",
1627 string_format(
1628 fmt: "KV cache data type for V\n"
1629 "allowed values: %s\n"
1630 "(default: %s)",
1631 get_all_kv_cache_types().c_str(),
1632 ggml_type_name(type: params.cache_type_v)
1633 ),
1634 [](common_params & params, const std::string & value) {
1635 params.cache_type_v = kv_cache_type_from_str(s: value);
1636 }
1637 ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
1638 add_opt(common_arg(
1639 {"--hellaswag"},
1640 "compute HellaSwag score over random tasks from datafile supplied with -f",
1641 [](common_params & params) {
1642 params.hellaswag = true;
1643 }
1644 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1645 add_opt(common_arg(
1646 {"--hellaswag-tasks"}, "N",
1647 string_format(fmt: "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
1648 [](common_params & params, int value) {
1649 params.hellaswag_tasks = value;
1650 }
1651 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1652 add_opt(common_arg(
1653 {"--winogrande"},
1654 "compute Winogrande score over random tasks from datafile supplied with -f",
1655 [](common_params & params) {
1656 params.winogrande = true;
1657 }
1658 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1659 add_opt(common_arg(
1660 {"--winogrande-tasks"}, "N",
1661 string_format(fmt: "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
1662 [](common_params & params, int value) {
1663 params.winogrande_tasks = value;
1664 }
1665 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1666 add_opt(common_arg(
1667 {"--multiple-choice"},
1668 "compute multiple choice score over random tasks from datafile supplied with -f",
1669 [](common_params & params) {
1670 params.multiple_choice = true;
1671 }
1672 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1673 add_opt(common_arg(
1674 {"--multiple-choice-tasks"}, "N",
1675 string_format(fmt: "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
1676 [](common_params & params, int value) {
1677 params.multiple_choice_tasks = value;
1678 }
1679 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1680 add_opt(common_arg(
1681 {"--kl-divergence"},
1682 "computes KL-divergence to logits provided via --kl-divergence-base",
1683 [](common_params & params) {
1684 params.kl_divergence = true;
1685 }
1686 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1687 add_opt(common_arg(
1688 {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
1689 "set logits file",
1690 [](common_params & params, const std::string & value) {
1691 params.logits_file = value;
1692 }
1693 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1694 add_opt(common_arg(
1695 {"--ppl-stride"}, "N",
1696 string_format(fmt: "stride for perplexity calculation (default: %d)", params.ppl_stride),
1697 [](common_params & params, int value) {
1698 params.ppl_stride = value;
1699 }
1700 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1701 add_opt(common_arg(
1702 {"--ppl-output-type"}, "<0|1>",
1703 string_format(fmt: "output type for perplexity calculation (default: %d)", params.ppl_output_type),
1704 [](common_params & params, int value) {
1705 params.ppl_output_type = value;
1706 }
1707 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
1708 add_opt(common_arg(
1709 {"-dt", "--defrag-thold"}, "N",
1710 string_format(fmt: "KV cache defragmentation threshold (DEPRECATED)"),
1711 [](common_params & params, const std::string & value) {
1712 GGML_UNUSED(params);
1713 GGML_UNUSED(value);
1714 LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
1715 }
1716 ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
1717 add_opt(common_arg(
1718 {"-np", "--parallel"}, "N",
1719 string_format(fmt: "number of parallel sequences to decode (default: %d)", params.n_parallel),
1720 [](common_params & params, int value) {
1721 params.n_parallel = value;
1722 }
1723 ).set_env("LLAMA_ARG_N_PARALLEL"));
1724 add_opt(common_arg(
1725 {"-ns", "--sequences"}, "N",
1726 string_format(fmt: "number of sequences to decode (default: %d)", params.n_sequences),
1727 [](common_params & params, int value) {
1728 params.n_sequences = value;
1729 }
1730 ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
1731 add_opt(common_arg(
1732 {"-cb", "--cont-batching"},
1733 string_format(fmt: "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
1734 [](common_params & params) {
1735 params.cont_batching = true;
1736 }
1737 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
1738 add_opt(common_arg(
1739 {"-nocb", "--no-cont-batching"},
1740 "disable continuous batching",
1741 [](common_params & params) {
1742 params.cont_batching = false;
1743 }
1744 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
1745 add_opt(common_arg(
1746 {"--mmproj"}, "FILE",
1747 "path to a multimodal projector file. see tools/mtmd/README.md\n"
1748 "note: if -hf is used, this argument can be omitted",
1749 [](common_params & params, const std::string & value) {
1750 params.mmproj.path = value;
1751 }
1752 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
1753 add_opt(common_arg(
1754 {"--mmproj-url"}, "URL",
1755 "URL to a multimodal projector file. see tools/mtmd/README.md",
1756 [](common_params & params, const std::string & value) {
1757 params.mmproj.url = value;
1758 }
1759 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
1760 add_opt(common_arg(
1761 {"--no-mmproj"},
1762 "explicitly disable multimodal projector, useful when using -hf",
1763 [](common_params & params) {
1764 params.no_mmproj = true;
1765 }
1766 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
1767 add_opt(common_arg(
1768 {"--no-mmproj-offload"},
1769 "do not offload multimodal projector to GPU",
1770 [](common_params & params) {
1771 params.mmproj_use_gpu = false;
1772 }
1773 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
1774 add_opt(common_arg(
1775 {"--image", "--audio"}, "FILE",
1776 "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
1777 [](common_params & params, const std::string & value) {
1778 params.image.emplace_back(args: value);
1779 }
1780 ).set_examples({LLAMA_EXAMPLE_MTMD}));
1781 add_opt(common_arg(
1782 {"--image-min-tokens"}, "N",
1783 "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1784 [](common_params & params, int value) {
1785 params.image_min_tokens = value;
1786 }
1787 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
1788 add_opt(common_arg(
1789 {"--image-max-tokens"}, "N",
1790 "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1791 [](common_params & params, int value) {
1792 params.image_max_tokens = value;
1793 }
1794 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
1795 if (llama_supports_rpc()) {
1796 add_opt(common_arg(
1797 {"--rpc"}, "SERVERS",
1798 "comma separated list of RPC servers",
1799 [](common_params & params, const std::string & value) {
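// expected format: comma-separated host:port endpoints exposed by rpc-server instances, e.g. "192.168.1.2:50052,192.168.1.3:50052" (illustrative addresses)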
1800 add_rpc_devices(servers: value);
1801 GGML_UNUSED(params);
1802 }
1803 ).set_env("LLAMA_ARG_RPC"));
1804 }
1805 add_opt(common_arg(
1806 {"--mlock"},
1807 "force system to keep model in RAM rather than swapping or compressing",
1808 [](common_params & params) {
1809 params.use_mlock = true;
1810 }
1811 ).set_env("LLAMA_ARG_MLOCK"));
1812 add_opt(common_arg(
1813 {"--no-mmap"},
1814 "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
1815 [](common_params & params) {
1816 params.use_mmap = false;
1817 }
1818 ).set_env("LLAMA_ARG_NO_MMAP"));
1819 add_opt(common_arg(
1820 {"--numa"}, "TYPE",
1821 "attempt optimizations that help on some NUMA systems\n"
1822 "- distribute: spread execution evenly over all nodes\n"
1823 "- isolate: only spawn threads on CPUs on the node that execution started on\n"
1824 "- numactl: use the CPU map provided by numactl\n"
1825 "if run without this previously, it is recommended to drop the system page cache before using this\n"
1826 "see https://github.com/ggml-org/llama.cpp/issues/1437",
1827 [](common_params & params, const std::string & value) {
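// note: an empty value (e.g. the env var set but left blank) selects the distribute strategy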
1828 /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
1829 else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
1830 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
1831 else { throw std::invalid_argument("invalid value"); }
1832 }
1833 ).set_env("LLAMA_ARG_NUMA"));
1834 add_opt(common_arg(
1835 {"-dev", "--device"}, "<dev1,dev2,..>",
1836 "comma-separated list of devices to use for offloading (none = don't offload)\n"
1837 "use --list-devices to see a list of available devices",
1838 [](common_params & params, const std::string & value) {
1839 params.devices = parse_device_list(value);
1840 }
1841 ).set_env("LLAMA_ARG_DEVICE"));
1842 add_opt(common_arg(
1843 {"--list-devices"},
1844 "print list of available devices and exit",
1845 [](common_params &) {
1846 std::vector<ggml_backend_dev_t> devices;
1847 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
1848 auto * dev = ggml_backend_dev_get(index: i);
1849 if (ggml_backend_dev_type(device: dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
1850 devices.push_back(x: dev);
1851 }
1852 }
1853 printf(format: "Available devices:\n");
1854 for (auto * dev : devices) {
1855 size_t free, total;
1856 ggml_backend_dev_memory(device: dev, free: &free, total: &total);
1857 printf(format: " %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(device: dev), ggml_backend_dev_description(device: dev), total / 1024 / 1024, free / 1024 / 1024);
1858 }
1859 exit(status: 0);
1860 }
1861 ));
1862 add_opt(common_arg(
1863 {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
1864 "override tensor buffer type", [](common_params & params, const std::string & value) {
1865 parse_tensor_buffer_overrides(value, overrides&: params.tensor_buft_overrides);
1866 }
1867 ));
1868 add_opt(common_arg(
1869 {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
1870 "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
1871 parse_tensor_buffer_overrides(value, overrides&: params.speculative.tensor_buft_overrides);
1872 }
1873 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
1874 add_opt(common_arg(
1875 {"--cpu-moe", "-cmoe"},
1876 "keep all Mixture of Experts (MoE) weights in the CPU",
1877 [](common_params & params) {
1878 params.tensor_buft_overrides.push_back(x: llm_ffn_exps_cpu_override());
1879 }
1880 ).set_env("LLAMA_ARG_CPU_MOE"));
1881 add_opt(common_arg(
1882 {"--n-cpu-moe", "-ncmoe"}, "N",
1883 "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
1884 [](common_params & params, int value) {
1885 if (value < 0) {
1886 throw std::invalid_argument("invalid value");
1887 }
1888 for (int i = 0; i < value; ++i) {
1889 // keep the pattern strings alive (and their c_str() pointers stable) by storing them in a static std::list
1890 static std::list<std::string> buft_overrides;
1891 buft_overrides.push_back(x: llm_ffn_exps_block_regex(idx: i));
1892 params.tensor_buft_overrides.push_back(x: {.pattern: buft_overrides.back().c_str(), .buft: ggml_backend_cpu_buffer_type()});
1893 }
1894 }
1895 ).set_env("LLAMA_ARG_N_CPU_MOE"));
1896 add_opt(common_arg(
1897 {"--cpu-moe-draft", "-cmoed"},
1898 "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
1899 [](common_params & params) {
1900 params.speculative.tensor_buft_overrides.push_back(x: llm_ffn_exps_cpu_override());
1901 }
1902 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
1903 add_opt(common_arg(
1904 {"--n-cpu-moe-draft", "-ncmoed"}, "N",
1905 "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
1906 [](common_params & params, int value) {
1907 if (value < 0) {
1908 throw std::invalid_argument("invalid value");
1909 }
1910 for (int i = 0; i < value; ++i) {
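// as above, a static std::list keeps the pattern strings (and the c_str() pointers stored below) valid for the lifetime of the program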
1911 static std::list<std::string> buft_overrides_draft;
1912 buft_overrides_draft.push_back(x: llm_ffn_exps_block_regex(idx: i));
1913 params.speculative.tensor_buft_overrides.push_back(x: {.pattern: buft_overrides_draft.back().c_str(), .buft: ggml_backend_cpu_buffer_type()});
1914 }
1915 }
1916 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
1917 add_opt(common_arg(
1918 {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
1919 string_format(fmt: "max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
1920 [](common_params & params, int value) {
1921 params.n_gpu_layers = value;
1922 if (!llama_supports_gpu_offload()) {
1923 fprintf(stderr, format: "warning: no usable GPU found, --gpu-layers option will be ignored\n");
1924 fprintf(stderr, format: "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
1925 fprintf(stderr, format: "warning: consult docs/build.md for compilation instructions\n");
1926 }
1927 }
1928 ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
1929 add_opt(common_arg(
1930 {"-sm", "--split-mode"}, "{none,layer,row}",
1931 "how to split the model across multiple GPUs, one of:\n"
1932 "- none: use one GPU only\n"
1933 "- layer (default): split layers and KV across GPUs\n"
1934 "- row: split rows across GPUs",
1935 [](common_params & params, const std::string & value) {
1936 std::string arg_next = value;
1937 if (arg_next == "none") {
1938 params.split_mode = LLAMA_SPLIT_MODE_NONE;
1939 } else if (arg_next == "layer") {
1940 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
1941 } else if (arg_next == "row") {
1942 params.split_mode = LLAMA_SPLIT_MODE_ROW;
1943 } else {
1944 throw std::invalid_argument("invalid value");
1945 }
1946 if (!llama_supports_gpu_offload()) {
1947 fprintf(stderr, format: "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
1948 }
1949 }
1950 ).set_env("LLAMA_ARG_SPLIT_MODE"));
1951 add_opt(common_arg(
1952 {"-ts", "--tensor-split"}, "N0,N1,N2,...",
1953 "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
1954 [](common_params & params, const std::string & value) {
1955 std::string arg_next = value;
1956
1957 // split string by , and /
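// e.g. "3,1" assigns proportions 3 and 1 to the first two devices, i.e. roughly a 75%/25% split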
1958 const std::regex regex{ R"([,/]+)" };
1959 std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
1960 std::vector<std::string> split_arg{ it, {} };
1961 if (split_arg.size() >= llama_max_devices()) {
1962 throw std::invalid_argument(
1963 string_format(fmt: "got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
1964 );
1965 }
1966 for (size_t i = 0; i < llama_max_devices(); ++i) {
1967 if (i < split_arg.size()) {
1968 params.tensor_split[i] = std::stof(str: split_arg[i]);
1969 } else {
1970 params.tensor_split[i] = 0.0f;
1971 }
1972 }
1973 if (!llama_supports_gpu_offload()) {
1974 fprintf(stderr, format: "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
1975 }
1976 }
1977 ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
1978 add_opt(common_arg(
1979 {"-mg", "--main-gpu"}, "INDEX",
1980 string_format(fmt: "the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
1981 [](common_params & params, int value) {
1982 params.main_gpu = value;
1983 if (!llama_supports_gpu_offload()) {
1984 fprintf(stderr, format: "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
1985 }
1986 }
1987 ).set_env("LLAMA_ARG_MAIN_GPU"));
1988 add_opt(common_arg(
1989 {"--check-tensors"},
1990 string_format(fmt: "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
1991 [](common_params & params) {
1992 params.check_tensors = true;
1993 }
1994 ));
1995 add_opt(common_arg(
1996 {"--override-kv"}, "KEY=TYPE:VALUE",
1997 "advanced option to override model metadata by key. may be specified multiple times.\n"
1998 "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
1999 [](common_params & params, const std::string & value) {
2000 if (!string_parse_kv_override(data: value.c_str(), overrides&: params.kv_overrides)) {
2001 throw std::runtime_error(string_format(fmt: "error: Invalid type for KV override: %s\n", value.c_str()));
2002 }
2003 }
2004 ));
2005 add_opt(common_arg(
2006 {"--no-op-offload"},
2007 string_format(fmt: "disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
2008 [](common_params & params) {
2009 params.no_op_offload = true;
2010 }
2011 ));
2012 add_opt(common_arg(
2013 {"--lora"}, "FNAME",
2014 "path to LoRA adapter (can be repeated to use multiple adapters)",
2015 [](common_params & params, const std::string & value) {
2016 params.lora_adapters.push_back(x: { .path: std::string(value), .scale: 1.0, .task_name: "", .prompt_prefix: "", .ptr: nullptr });
2017 }
2018 // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
2019 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
2020 add_opt(common_arg(
2021 {"--lora-scaled"}, "FNAME", "SCALE",
2022 "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
2023 [](common_params & params, const std::string & fname, const std::string & scale) {
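// e.g. --lora-scaled lora-adapter.gguf 0.5 applies the adapter at half strength (illustrative file name)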
2024 params.lora_adapters.push_back(x: { .path: fname, .scale: std::stof(str: scale), .task_name: "", .prompt_prefix: "", .ptr: nullptr });
2025 }
2026 // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
2027 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
2028 add_opt(common_arg(
2029 {"--control-vector"}, "FNAME",
2030 "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
2031 [](common_params & params, const std::string & value) {
2032 params.control_vectors.push_back(x: { .strength: 1.0f, .fname: value, });
2033 }
2034 ));
2035 add_opt(common_arg(
2036 {"--control-vector-scaled"}, "FNAME", "SCALE",
2037 "add a control vector with user defined scaling SCALE\n"
2038 "note: this argument can be repeated to add multiple scaled control vectors",
2039 [](common_params & params, const std::string & fname, const std::string & scale) {
2040 params.control_vectors.push_back(x: { .strength: std::stof(str: scale), .fname: fname });
2041 }
2042 ));
2043 add_opt(common_arg(
2044 {"--control-vector-layer-range"}, "START", "END",
2045 "layer range to apply the control vector(s) to, start and end inclusive",
2046 [](common_params & params, const std::string & start, const std::string & end) {
2047 params.control_vector_layer_start = std::stoi(str: start);
2048 params.control_vector_layer_end = std::stoi(str: end);
2049 }
2050 ));
2051 add_opt(common_arg(
2052 {"-a", "--alias"}, "STRING",
2053 "set alias for model name (to be used by REST API)",
2054 [](common_params & params, const std::string & value) {
2055 params.model_alias = value;
2056 }
2057 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
2058 add_opt(common_arg(
2059 {"-m", "--model"}, "FNAME",
2060 ex == LLAMA_EXAMPLE_EXPORT_LORA
2061 ? std::string("path from which to load the base model")
2062 : string_format(
2063 fmt: "model path (default: `models/$filename` with filename from `--hf-file` "
2064 "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
2065 ),
2066 [](common_params & params, const std::string & value) {
2067 params.model.path = value;
2068 }
2069 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
2070 add_opt(common_arg(
2071 {"-mu", "--model-url"}, "MODEL_URL",
2072 "model download url (default: unused)",
2073 [](common_params & params, const std::string & value) {
2074 params.model.url = value;
2075 }
2076 ).set_env("LLAMA_ARG_MODEL_URL"));
2077 add_opt(common_arg(
2078 { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
2079 "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
2080 "example: gemma3\n"
2081 "(default: unused)",
2082 [](common_params & params, const std::string & value) {
2083 params.model.docker_repo = value;
2084 }
2085 ).set_env("LLAMA_ARG_DOCKER_REPO"));
2086 add_opt(common_arg(
2087 {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
2088 "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
2089 "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
2090 "example: unsloth/phi-4-GGUF:q4_k_m\n"
2091 "(default: unused)",
2092 [](common_params & params, const std::string & value) {
2093 params.model.hf_repo = value;
2094 }
2095 ).set_env("LLAMA_ARG_HF_REPO"));
2096 add_opt(common_arg(
2097 {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
2098 "Same as --hf-repo, but for the draft model (default: unused)",
2099 [](common_params & params, const std::string & value) {
2100 params.speculative.model.hf_repo = value;
2101 }
2102 ).set_env("LLAMA_ARG_HFD_REPO"));
2103 add_opt(common_arg(
2104 {"-hff", "--hf-file"}, "FILE",
2105 "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
2106 [](common_params & params, const std::string & value) {
2107 params.model.hf_file = value;
2108 }
2109 ).set_env("LLAMA_ARG_HF_FILE"));
2110 add_opt(common_arg(
2111 {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
2112 "Hugging Face model repository for the vocoder model (default: unused)",
2113 [](common_params & params, const std::string & value) {
2114 params.vocoder.model.hf_repo = value;
2115 }
2116 ).set_env("LLAMA_ARG_HF_REPO_V"));
2117 add_opt(common_arg(
2118 {"-hffv", "--hf-file-v"}, "FILE",
2119 "Hugging Face model file for the vocoder model (default: unused)",
2120 [](common_params & params, const std::string & value) {
2121 params.vocoder.model.hf_file = value;
2122 }
2123 ).set_env("LLAMA_ARG_HF_FILE_V"));
2124 add_opt(common_arg(
2125 {"-hft", "--hf-token"}, "TOKEN",
2126 "Hugging Face access token (default: value from HF_TOKEN environment variable)",
2127 [](common_params & params, const std::string & value) {
2128 params.hf_token = value;
2129 }
2130 ).set_env("HF_TOKEN"));
2131 add_opt(common_arg(
2132 {"--context-file"}, "FNAME",
2133 "file to load context from (repeat to specify multiple files)",
2134 [](common_params & params, const std::string & value) {
2135 std::ifstream file(value, std::ios::binary);
2136 if (!file) {
2137 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
2138 }
2139 params.context_files.push_back(x: value);
2140 }
2141 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2142 add_opt(common_arg(
2143 {"--chunk-size"}, "N",
2144 string_format(fmt: "minimum length of embedded text chunks (default: %d)", params.chunk_size),
2145 [](common_params & params, int value) {
2146 params.chunk_size = value;
2147 }
2148 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2149 add_opt(common_arg(
2150 {"--chunk-separator"}, "STRING",
2151 string_format(fmt: "separator between chunks (default: '%s')", params.chunk_separator.c_str()),
2152 [](common_params & params, const std::string & value) {
2153 params.chunk_separator = value;
2154 }
2155 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2156 add_opt(common_arg(
2157 {"--junk"}, "N",
2158 string_format(fmt: "number of times to repeat the junk text (default: %d)", params.n_junk),
2159 [](common_params & params, int value) {
2160 params.n_junk = value;
2161 }
2162 ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
2163 add_opt(common_arg(
2164 {"--pos"}, "N",
2165 string_format(fmt: "position of the passkey in the junk text (default: %d)", params.i_pos),
2166 [](common_params & params, int value) {
2167 params.i_pos = value;
2168 }
2169 ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
2170 add_opt(common_arg(
2171 {"-o", "--output", "--output-file"}, "FNAME",
2172 string_format(fmt: "output file (default: '%s')", params.out_file.c_str()),
2173 [](common_params & params, const std::string & value) {
2174 params.out_file = value;
2175 }
2176 ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
2177 add_opt(common_arg(
2178 {"-ofreq", "--output-frequency"}, "N",
2179 string_format(fmt: "output the imatrix every N iterations (default: %d)", params.n_out_freq),
2180 [](common_params & params, int value) {
2181 params.n_out_freq = value;
2182 }
2183 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2184 add_opt(common_arg(
2185 {"--output-format"}, "{gguf,dat}",
2186 string_format(fmt: "output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
2187 [](common_params & params, const std::string & value) {
2188 /**/ if (value == "gguf") { params.imat_dat = -1; }
2189 else if (value == "dat") { params.imat_dat = 1; }
2190 else { throw std::invalid_argument("invalid output format"); }
2191 }
2192 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2193 add_opt(common_arg(
2194 {"--save-frequency"}, "N",
2195 string_format(fmt: "save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
2196 [](common_params & params, int value) {
2197 params.n_save_freq = value;
2198 }
2199 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2200 add_opt(common_arg(
2201 {"--process-output"},
2202 string_format(fmt: "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
2203 [](common_params & params) {
2204 params.process_output = true;
2205 }
2206 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2207 add_opt(common_arg(
2208 {"--no-ppl"},
2209 string_format(fmt: "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
2210 [](common_params & params) {
2211 params.compute_ppl = false;
2212 }
2213 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2214 add_opt(common_arg(
2215 {"--chunk", "--from-chunk"}, "N",
2216 string_format(fmt: "start processing the input from chunk N (default: %d)", params.i_chunk),
2217 [](common_params & params, int value) {
2218 params.i_chunk = value;
2219 }
2220 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2221 add_opt(common_arg(
2222 {"--show-statistics"},
2223 string_format(fmt: "show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
2224 [](common_params & params) {
2225 params.show_statistics = true;
2226 }
2227 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2228 add_opt(common_arg(
2229 {"--parse-special"},
2230 string_format(fmt: "parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
2231 [](common_params & params) {
2232 params.parse_special = true;
2233 }
2234 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2235 add_opt(common_arg(
2236 {"-pps"},
2237 string_format(fmt: "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
2238 [](common_params & params) {
2239 params.is_pp_shared = true;
2240 }
2241 ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2242 add_opt(common_arg(
2243 {"-npp"}, "n0,n1,...",
2244 "number of prompt tokens",
2245 [](common_params & params, const std::string & value) {
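// e.g. -npp 128,256,512 benchmarks prompt lengths of 128, 256 and 512 tokens (illustrative values)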
2246 auto p = string_split<int>(str: value, delim: ',');
2247 params.n_pp.insert(position: params.n_pp.end(), first: p.begin(), last: p.end());
2248 }
2249 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2250 add_opt(common_arg(
2251 {"-ntg"}, "n0,n1,...",
2252 "number of text generation tokens",
2253 [](common_params & params, const std::string & value) {
2254 auto p = string_split<int>(str: value, delim: ',');
2255 params.n_tg.insert(position: params.n_tg.end(), first: p.begin(), last: p.end());
2256 }
2257 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2258 add_opt(common_arg(
2259 {"-npl"}, "n0,n1,...",
2260 "number of parallel prompts",
2261 [](common_params & params, const std::string & value) {
2262 auto p = string_split<int>(str: value, delim: ',');
2263 params.n_pl.insert(position: params.n_pl.end(), first: p.begin(), last: p.end());
2264 }
2265 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2266 add_opt(common_arg(
2267 {"--embd-normalize"}, "N",
2268 string_format(fmt: "normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
2269 [](common_params & params, int value) {
2270 params.embd_normalize = value;
2271 }
2272 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2273 add_opt(common_arg(
2274 {"--embd-output-format"}, "FORMAT",
2275 "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
2276 [](common_params & params, const std::string & value) {
2277 params.embd_out = value;
2278 }
2279 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2280 add_opt(common_arg(
2281 {"--embd-separator"}, "STRING",
2282 "separator of embeddings (default \\n) for example \"<#sep#>\"",
2283 [](common_params & params, const std::string & value) {
2284 params.embd_sep = value;
2285 }
2286 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2287 add_opt(common_arg(
2288 {"--cls-separator"}, "STRING",
2289 "separator of classification sequences (default \\t) for example \"<#seq#>\"",
2290 [](common_params & params, const std::string & value) {
2291 params.cls_sep = value;
2292 }
2293 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2294 add_opt(common_arg(
2295 {"--host"}, "HOST",
2296 string_format(fmt: "ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
2297 [](common_params & params, const std::string & value) {
2298 params.hostname = value;
2299 }
2300 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
2301 add_opt(common_arg(
2302 {"--port"}, "PORT",
2303 string_format(fmt: "port to listen (default: %d)", params.port),
2304 [](common_params & params, int value) {
2305 params.port = value;
2306 }
2307 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
2308 add_opt(common_arg(
2309 {"--path"}, "PATH",
2310 string_format(fmt: "path to serve static files from (default: %s)", params.public_path.c_str()),
2311 [](common_params & params, const std::string & value) {
2312 params.public_path = value;
2313 }
2314 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
2315 add_opt(common_arg(
2316 {"--api-prefix"}, "PREFIX",
2317 string_format(fmt: "prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
2318 [](common_params & params, const std::string & value) {
2319 params.api_prefix = value;
2320 }
2321 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
2322 add_opt(common_arg(
2323 {"--no-webui"},
2324 string_format(fmt: "Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
2325 [](common_params & params) {
2326 params.webui = false;
2327 }
2328 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
2329 add_opt(common_arg(
2330 {"--embedding", "--embeddings"},
2331 string_format(fmt: "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
2332 [](common_params & params) {
2333 params.embedding = true;
2334 }
2335 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
2336 add_opt(common_arg(
2337 {"--reranking", "--rerank"},
2338 string_format(fmt: "enable reranking endpoint on server (default: %s)", "disabled"),
2339 [](common_params & params) {
2340 params.embedding = true;
2341 params.pooling_type = LLAMA_POOLING_TYPE_RANK;
2342 }
2343 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2344 add_opt(common_arg(
2345 {"--api-key"}, "KEY",
2346 "API key to use for authentication (default: none)",
2347 [](common_params & params, const std::string & value) {
2348 params.api_keys.push_back(x: value);
2349 }
2350 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
2351 add_opt(common_arg(
2352 {"--api-key-file"}, "FNAME",
2353 "path to file containing API keys (default: none)",
2354 [](common_params & params, const std::string & value) {
2355 std::ifstream key_file(value);
2356 if (!key_file) {
2357 throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n", value.c_str()));
2358 }
2359 std::string key;
2360 while (std::getline(is&: key_file, str&: key)) {
2361 if (!key.empty()) {
2362 params.api_keys.push_back(x: key);
2363 }
2364 }
2365 key_file.close();
2366 }
2367 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2368 add_opt(common_arg(
2369 {"--ssl-key-file"}, "FNAME",
2370 "path to file a PEM-encoded SSL private key",
2371 [](common_params & params, const std::string & value) {
2372 params.ssl_file_key = value;
2373 }
2374 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
2375 add_opt(common_arg(
2376 {"--ssl-cert-file"}, "FNAME",
2377 "path to file a PEM-encoded SSL certificate",
2378 [](common_params & params, const std::string & value) {
2379 params.ssl_file_cert = value;
2380 }
2381 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
2382 add_opt(common_arg(
2383 {"--chat-template-kwargs"}, "STRING",
2384 string_format(fmt: "sets additional params for the json template parser"),
2385 [](common_params & params, const std::string & value) {
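// e.g. --chat-template-kwargs '{"enable_thinking": false}' stores the value as the string "false"; valid key names depend on the template (illustrative)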
2386 auto parsed = json::parse(i: value);
2387 for (const auto & item : parsed.items()) {
2388 params.default_template_kwargs[item.key()] = item.value().dump();
2389 }
2390 }
2391 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
2392 add_opt(common_arg(
2393 {"-to", "--timeout"}, "N",
2394 string_format(fmt: "server read/write timeout in seconds (default: %d)", params.timeout_read),
2395 [](common_params & params, int value) {
2396 params.timeout_read = value;
2397 params.timeout_write = value;
2398 }
2399 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
2400 add_opt(common_arg(
2401 {"--threads-http"}, "N",
2402 string_format(fmt: "number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
2403 [](common_params & params, int value) {
2404 params.n_threads_http = value;
2405 }
2406 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
2407 add_opt(common_arg(
2408 {"--cache-reuse"}, "N",
2409 string_format(
2410 fmt: "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
2411 "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
2412 ),
2413 [](common_params & params, int value) {
2414 params.n_cache_reuse = value;
2415 }
2416 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
2417 add_opt(common_arg(
2418 {"--metrics"},
2419 string_format(fmt: "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
2420 [](common_params & params) {
2421 params.endpoint_metrics = true;
2422 }
2423 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
2424 add_opt(common_arg(
2425 {"--props"},
2426 string_format(fmt: "enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
2427 [](common_params & params) {
2428 params.endpoint_props = true;
2429 }
2430 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
2431 add_opt(common_arg(
2432 {"--slots"},
2433 string_format(fmt: "enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
2434 [](common_params & params) {
2435 params.endpoint_slots = true;
2436 }
2437 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2438 add_opt(common_arg(
2439 {"--no-slots"},
2440 "disables slots monitoring endpoint",
2441 [](common_params & params) {
2442 params.endpoint_slots = false;
2443 }
2444 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
2445 add_opt(common_arg(
2446 {"--slot-save-path"}, "PATH",
2447 "path to save slot kv cache (default: disabled)",
2448 [](common_params & params, const std::string & value) {
2449 params.slot_save_path = value;
2450 // if doesn't end with DIRECTORY_SEPARATOR, add it
2451 if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
2452 params.slot_save_path += DIRECTORY_SEPARATOR;
2453 }
2454 }
2455 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2456 add_opt(common_arg(
2457 {"--jinja"},
2458 "use jinja template for chat (default: disabled)",
2459 [](common_params & params) {
2460 params.use_jinja = true;
2461 }
2462 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
2463 add_opt(common_arg(
2464 {"--reasoning-format"}, "FORMAT",
2465 "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
2466 "- none: leaves thoughts unparsed in `message.content`\n"
2467 "- deepseek: puts thoughts in `message.reasoning_content`\n"
2468 "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
2469 "(default: auto)",
2470 [](common_params & params, const std::string & value) {
2471 params.reasoning_format = common_reasoning_format_from_name(format: value);
2472 }
2473 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
2474 add_opt(common_arg(
2475 {"--reasoning-budget"}, "N",
2476 "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
2477 [](common_params & params, int value) {
2478 if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
2479 params.reasoning_budget = value;
2480 }
2481 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
2482 add_opt(common_arg(
2483 {"--chat-template"}, "JINJA_TEMPLATE",
2484 string_format(
2485 fmt: "set custom jinja chat template (default: template taken from model's metadata)\n"
2486 "if suffix/prefix are specified, template will be disabled\n"
2487 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
2488 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
2489 ),
2490 [](common_params & params, const std::string & value) {
2491 params.chat_template = value;
2492 }
2493 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
2494 add_opt(common_arg(
2495 {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
2496 string_format(
2497 fmt: "set custom jinja chat template file (default: template taken from model's metadata)\n"
2498 "if suffix/prefix are specified, template will be disabled\n"
2499 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
2500 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
2501 ),
2502 [](common_params & params, const std::string & value) {
2503 params.chat_template = read_file(fname: value);
2504 }
2505 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
2506 add_opt(common_arg(
2507 {"--no-prefill-assistant"},
2508 string_format(
2509 fmt: "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
2510 "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
2511 ),
2512 [](common_params & params) {
2513 params.prefill_assistant = false;
2514 }
2515 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
2516 add_opt(common_arg(
2517 {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
2518 string_format(fmt: "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
2519 [](common_params & params, const std::string & value) {
2520 params.slot_prompt_similarity = std::stof(str: value);
2521 }
2522 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2523 add_opt(common_arg(
2524 {"--lora-init-without-apply"},
2525 string_format(fmt: "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
2526 [](common_params & params) {
2527 params.lora_init_without_apply = true;
2528 }
2529 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2530 add_opt(common_arg(
2531 {"--simple-io"},
2532 "use basic IO for better compatibility in subprocesses and limited consoles",
2533 [](common_params & params) {
2534 params.simple_io = true;
2535 }
2536 ).set_examples({LLAMA_EXAMPLE_MAIN}));
2537 add_opt(common_arg(
2538 {"--positive-file"}, "FNAME",
2539 string_format(fmt: "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
2540 [](common_params & params, const std::string & value) {
2541 params.cvector_positive_file = value;
2542 }
2543 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2544 add_opt(common_arg(
2545 {"--negative-file"}, "FNAME",
2546 string_format(fmt: "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
2547 [](common_params & params, const std::string & value) {
2548 params.cvector_negative_file = value;
2549 }
2550 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2551 add_opt(common_arg(
2552 {"--pca-batch"}, "N",
2553 string_format(fmt: "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
2554 [](common_params & params, int value) {
2555 params.n_pca_batch = value;
2556 }
2557 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2558 add_opt(common_arg(
2559 {"--pca-iter"}, "N",
2560 string_format(fmt: "number of iterations used for PCA (default: %d)", params.n_pca_iterations),
2561 [](common_params & params, int value) {
2562 params.n_pca_iterations = value;
2563 }
2564 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2565 add_opt(common_arg(
2566 {"--method"}, "{pca, mean}",
2567 "dimensionality reduction method to be used (default: pca)",
2568 [](common_params & params, const std::string & value) {
2569 /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
2570 else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
2571 else { throw std::invalid_argument("invalid value"); }
2572 }
2573 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
2574 add_opt(common_arg(
2575 {"--output-format"}, "{md,jsonl}",
2576 "output format for batched-bench results (default: md)",
2577 [](common_params & params, const std::string & value) {
2578 /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
2579 else if (value == "md") { params.batched_bench_output_jsonl = false; }
2580 else { throw std::invalid_argument("invalid value"); }
2581 }
2582 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2583 add_opt(common_arg(
2584 {"--log-disable"},
2585 "Log disable",
2586 [](common_params &) {
2587 common_log_pause(log: common_log_main());
2588 }
2589 ));
2590 add_opt(common_arg(
2591 {"--log-file"}, "FNAME",
2592 "Log to file",
2593 [](common_params &, const std::string & value) {
2594 common_log_set_file(log: common_log_main(), file: value.c_str());
2595 }
2596 ));
2597 add_opt(common_arg(
2598 {"--log-colors"}, "[on|off|auto]",
2599 "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
2600 "'auto' enables colors when output is to a terminal",
2601 [](common_params &, const std::string & value) {
2602 if (is_truthy(value)) {
2603 common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_ENABLED);
2604 } else if (is_falsey(value)) {
2605 common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_DISABLED);
2606 } else if (is_autoy(value)) {
2607 common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_AUTO);
2608 } else {
2609 throw std::invalid_argument(
2610 string_format(fmt: "error: unkown value for --log-colors: '%s'\n", value.c_str()));
2611 }
2612 }
2613 ).set_env("LLAMA_LOG_COLORS"));
2614 add_opt(common_arg(
2615 {"-v", "--verbose", "--log-verbose"},
2616 "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
2617 [](common_params & params) {
2618 params.verbosity = INT_MAX;
2619 common_log_set_verbosity_thold(INT_MAX);
2620 }
2621 ));
2622 add_opt(common_arg(
2623 {"--offline"},
2624 "Offline mode: forces use of cache, prevents network access",
2625 [](common_params & params) {
2626 params.offline = true;
2627 }
2628 ).set_env("LLAMA_OFFLINE"));
2629 add_opt(common_arg(
2630 {"-lv", "--verbosity", "--log-verbosity"}, "N",
2631 "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
2632 [](common_params & params, int value) {
2633 params.verbosity = value;
2634 common_log_set_verbosity_thold(verbosity: value);
2635 }
2636 ).set_env("LLAMA_LOG_VERBOSITY"));
2637 add_opt(common_arg(
2638 {"--log-prefix"},
2639 "Enable prefix in log messages",
2640 [](common_params &) {
2641 common_log_set_prefix(log: common_log_main(), prefix: true);
2642 }
2643 ).set_env("LLAMA_LOG_PREFIX"));
2644 add_opt(common_arg(
2645 {"--log-timestamps"},
2646 "Enable timestamps in log messages",
2647 [](common_params &) {
2648 common_log_set_timestamps(log: common_log_main(), timestamps: true);
2649 }
2650 ).set_env("LLAMA_LOG_TIMESTAMPS"));
2651
2652 // speculative parameters
2653 add_opt(common_arg(
2654 {"-td", "--threads-draft"}, "N",
2655 "number of threads to use during generation (default: same as --threads)",
2656 [](common_params & params, int value) {
2657 params.speculative.cpuparams.n_threads = value;
2658 if (params.speculative.cpuparams.n_threads <= 0) {
2659 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
2660 }
2661 }
2662 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2663 add_opt(common_arg(
2664 {"-tbd", "--threads-batch-draft"}, "N",
2665 "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
2666 [](common_params & params, int value) {
2667 params.speculative.cpuparams_batch.n_threads = value;
2668 if (params.speculative.cpuparams_batch.n_threads <= 0) {
2669 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
2670 }
2671 }
2672 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2673 add_opt(common_arg(
2674 {"-Cd", "--cpu-mask-draft"}, "M",
2675 "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
2676 [](common_params & params, const std::string & mask) {
2677 params.speculative.cpuparams.mask_valid = true;
2678 if (!parse_cpu_mask(mask, boolmask&: params.speculative.cpuparams.cpumask)) {
2679 throw std::invalid_argument("invalid cpumask");
2680 }
2681 }
2682 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2683 add_opt(common_arg(
2684 {"-Crd", "--cpu-range-draft"}, "lo-hi",
2685 "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
2686 [](common_params & params, const std::string & range) {
2687 params.speculative.cpuparams.mask_valid = true;
2688 if (!parse_cpu_range(range, boolmask&: params.speculative.cpuparams.cpumask)) {
2689 throw std::invalid_argument("invalid range");
2690 }
2691 }
2692 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2693 add_opt(common_arg(
2694 {"--cpu-strict-draft"}, "<0|1>",
2695 "Use strict CPU placement for draft model (default: same as --cpu-strict)",
2696 [](common_params & params, int value) {
2697 params.speculative.cpuparams.strict_cpu = value;
2698 }
2699 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2700 add_opt(common_arg(
2701 {"--prio-draft"}, "N",
2702 string_format(fmt: "set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
2703 [](common_params & params, int prio) {
2704 if (prio < 0 || prio > 3) {
2705 throw std::invalid_argument("invalid value");
2706 }
2707 params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
2708 }
2709 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2710 add_opt(common_arg(
2711 {"--poll-draft"}, "<0|1>",
2712 "Use polling to wait for draft model work (default: same as --poll])",
2713 [](common_params & params, int value) {
2714 params.speculative.cpuparams.poll = value;
2715 }
2716 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2717 add_opt(common_arg(
2718 {"-Cbd", "--cpu-mask-batch-draft"}, "M",
2719 "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
2720 [](common_params & params, const std::string & mask) {
2721 params.speculative.cpuparams_batch.mask_valid = true;
2722 if (!parse_cpu_mask(mask, boolmask&: params.speculative.cpuparams_batch.cpumask)) {
2723 throw std::invalid_argument("invalid cpumask");
2724 }
2725 }
2726 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2727 add_opt(common_arg(
2728 {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
2729 "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
2730 [](common_params & params, const std::string & range) {
2731 params.speculative.cpuparams_batch.mask_valid = true;
2732 if (!parse_cpu_range(range, boolmask&: params.speculative.cpuparams_batch.cpumask)) {
2733 throw std::invalid_argument("invalid cpumask");
2734 }
2735 }
2736 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2737 add_opt(common_arg(
2738 {"--cpu-strict-batch-draft"}, "<0|1>",
2739 "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
2740 [](common_params & params, int value) {
2741 params.speculative.cpuparams_batch.strict_cpu = value;
2742 }
2743 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2744 add_opt(common_arg(
2745 {"--prio-batch-draft"}, "N",
2746 string_format(fmt: "set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
2747 [](common_params & params, int prio) {
2748 if (prio < 0 || prio > 3) {
2749 throw std::invalid_argument("invalid value");
2750 }
2751 params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
2752 }
2753 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2754 add_opt(common_arg(
2755 {"--poll-batch-draft"}, "<0|1>",
2756 "Use polling to wait for draft model work (default: --poll-draft)",
2757 [](common_params & params, int value) {
2758 params.speculative.cpuparams_batch.poll = value;
2759 }
2760 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
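    // The draft-model CPU options above mirror the main-model equivalents (--cpu-mask, --cpu-range,
    // --cpu-strict, --prio, --poll). A hedged usage sketch - file names and mask/range values are
    // illustrative only, not recommendations:
    //
    //   llama-speculative -m target.gguf -md draft.gguf \
    //       -Cd 0x0F -Crbd 4-7 --prio-draft 1 --poll-draft 0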
    add_opt(common_arg(
        {"--draft-max", "--draft", "--draft-n"}, "N",
        string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
        [](common_params & params, int value) {
            params.speculative.n_max = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
    add_opt(common_arg(
        {"--draft-min", "--draft-n-min"}, "N",
        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
        [](common_params & params, int value) {
            params.speculative.n_min = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
    add_opt(common_arg(
        {"--draft-p-split"}, "P",
        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
        [](common_params & params, const std::string & value) {
            params.speculative.p_split = std::stof(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
    add_opt(common_arg(
        {"--draft-p-min"}, "P",
        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
        [](common_params & params, const std::string & value) {
            params.speculative.p_min = std::stof(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
    add_opt(common_arg(
        {"-cd", "--ctx-size-draft"}, "N",
        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
        [](common_params & params, int value) {
            params.speculative.n_ctx = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
    add_opt(common_arg(
        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
        "use --list-devices to see a list of available devices",
        [](common_params & params, const std::string & value) {
            params.speculative.devices = parse_device_list(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
        "number of layers to store in VRAM for the draft model",
        [](common_params & params, int value) {
            params.speculative.n_gpu_layers = value;
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
    add_opt(common_arg(
        {"-md", "--model-draft"}, "FNAME",
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
    add_opt(common_arg(
        {"--spec-replace"}, "TARGET", "DRAFT",
        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
        [](common_params & params, const std::string & tgt, const std::string & dft) {
            params.speculative.replacements.push_back({ tgt, dft });
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
        string_format(
            "KV cache data type for K for the draft model\n"
            "allowed values: %s\n"
            "(default: %s)",
            get_all_kv_cache_types().c_str(),
            ggml_type_name(params.speculative.cache_type_k)
        ),
        [](common_params & params, const std::string & value) {
            params.speculative.cache_type_k = kv_cache_type_from_str(value);
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
    add_opt(common_arg(
        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
        string_format(
            "KV cache data type for V for the draft model\n"
            "allowed values: %s\n"
            "(default: %s)",
            get_all_kv_cache_types().c_str(),
            ggml_type_name(params.speculative.cache_type_v)
        ),
        [](common_params & params, const std::string & value) {
            params.speculative.cache_type_v = kv_cache_type_from_str(value);
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
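    // Hedged usage sketch for the speculative decoding options above (model files and values are
    // placeholders, not tuned recommendations):
    //
    //   llama-server -m target.gguf -md draft.gguf \
    //       --draft-max 16 --draft-min 4 --draft-p-min 0.8 -ngld 99 -cd 4096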

    add_opt(common_arg(
        {"-mv", "--model-vocoder"}, "FNAME",
        "vocoder model for audio generation (default: unused)",
        [](common_params & params, const std::string & value) {
            params.vocoder.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--tts-use-guide-tokens"},
        "Use guide tokens to improve TTS word recall",
        [](common_params & params) {
            params.vocoder.use_guide_tokens = true;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--tts-speaker-file"}, "FNAME",
        "speaker file path for audio generation",
        [](common_params & params, const std::string & value) {
            params.vocoder.speaker_file = value;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));
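    // Hedged usage sketch for the TTS options above (file names are placeholders):
    //
    //   llama-tts -m outetts.gguf -mv wavtokenizer.gguf --tts-use-guide-tokens -p "hello world"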

    add_opt(common_arg(
        {"--diffusion-steps"}, "N",
        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
        [](common_params & params, int value) { params.diffusion.steps = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-visual"},
        string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
        [](common_params & params) { params.diffusion.visual_mode = true; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-eps"}, "F",
        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-algorithm"}, "N",
        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-alg-temp"}, "F",
        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-block-length"}, "N",
        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
        [](common_params & params, int value) { params.diffusion.block_length = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-cfg-scale"}, "F",
        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        {"--diffusion-add-gumbel-noise"}, "F",
        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
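    // Hedged usage sketch for the diffusion options above (tool name and values are illustrative
    // and may differ by build):
    //
    //   llama-diffusion-cli -m llada-8b.gguf -p "write a haiku" \
    //       --diffusion-steps 128 --diffusion-block-length 32 --diffusion-visual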
    add_opt(common_arg(
        {"-lr", "--learning-rate"}, "ALPHA",
        string_format("adamw or sgd optimizer alpha (default: %.2g); note: for sgd (no momentum) an alpha ~10x larger is recommended", (double) params.lr.lr0),
        [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-lr-min", "--learning-rate-min"}, "ALPHA",
        string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)", (double) params.lr.lr_min),
        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-decay-epochs", "--learning-rate-decay-epochs"}, "N",
        string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
        [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-wd", "--weight-decay"}, "WD",
        string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-val-split", "--val-split"}, "FRACTION",
        string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-epochs", "--epochs"}, "N",
        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
        [](common_params & params, int epochs) { params.lr.epochs = epochs; }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
        [](common_params & params, const std::string & name) {
            params.optimizer = common_opt_get_optimizer(name.c_str());
            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
            }
        }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
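    // Hedged usage sketch for the finetune options above (file names and values are illustrative,
    // not tuned recommendations):
    //
    //   llama-finetune -m base-model.gguf -f train.txt \
    //       -opt sgd -lr 1e-4 -wd 1e-9 -epochs 2 -val-split 0.05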

    // presets
    add_opt(common_arg(
        {"--tts-oute-default"},
        string_format("use default OuteTTS models (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));

    add_opt(common_arg(
        {"--embd-gemma-default"},
        string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
            params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
            params.port = 8011;
            params.n_ubatch = 2048;
            params.n_batch = 2048;
            params.n_parallel = 32;
            params.n_ctx = 2048*params.n_parallel;
            params.verbose_prompt = true;
            params.embedding = true;
        }
    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
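    // Hedged example for the preset above: start an embedding server on port 8011 and query the
    // OpenAI-compatible endpoint (request body is illustrative):
    //
    //   llama-server --embd-gemma-default
    //   curl http://localhost:8011/v1/embeddings -H "Content-Type: application/json" \
    //       -d '{"input": "hello world"}'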

    add_opt(common_arg(
        {"--fim-qwen-1.5b-default"},
        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-3b-default"},
        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-7b-default"},
        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-7b-spec"},
        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-14b-spec"},
        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--fim-qwen-30b-default"},
        string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
            params.port = 8012;
            params.n_ubatch = 1024;
            params.n_batch = 1024;
            params.n_ctx = 0;
            params.n_cache_reuse = 256;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
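    // Hedged example for the FIM presets above: each configures llama-server for fill-in-the-middle
    // completion on port 8012, e.g.
    //
    //   llama-server --fim-qwen-7b-spec
    //
    // and is typically paired with an editor integration (e.g. llama.vim or llama.vscode) pointed at that port.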

    add_opt(common_arg(
        {"--gpt-oss-20b-default"},
        string_format("use gpt-oss-20b (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
            params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
            params.port = 8013;
            params.n_ubatch = 2048;
            params.n_batch = 32768;
            params.n_parallel = 2;
            params.n_ctx = 131072*params.n_parallel;
            params.sampling.temp = 1.0f;
            params.sampling.top_p = 1.0f;
            params.sampling.top_k = 0;
            params.sampling.min_p = 0.01f;
            params.use_jinja = true;
            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--gpt-oss-120b-default"},
        string_format("use gpt-oss-120b (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
            params.port = 8013;
            params.n_ubatch = 2048;
            params.n_batch = 32768;
            params.n_parallel = 2;
            params.n_ctx = 131072*params.n_parallel;
            params.sampling.temp = 1.0f;
            params.sampling.top_p = 1.0f;
            params.sampling.top_k = 0;
            params.sampling.min_p = 0.01f;
            params.use_jinja = true;
            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
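    // Hedged example for the gpt-oss presets above: they serve on port 8013 with the jinja chat
    // template enabled (request body is illustrative):
    //
    //   llama-server --gpt-oss-20b-default
    //   curl http://localhost:8013/v1/chat/completions -H "Content-Type: application/json" \
    //       -d '{"messages": [{"role": "user", "content": "hello"}]}'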

    add_opt(common_arg(
        {"--vision-gemma-4b-default"},
        string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
            params.port = 8014;
            params.n_ctx = 0;
            params.use_jinja = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    add_opt(common_arg(
        {"--vision-gemma-12b-default"},
        string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
        [](common_params & params) {
            params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
            params.port = 8014;
            params.n_ctx = 0;
            params.use_jinja = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
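    // Hedged example for the vision presets above (they serve on port 8014):
    //
    //   llama-server --vision-gemma-4b-default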

    return ctx_arg;
}