// Various helper functions and utilities

#pragma once

#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <cmath>

#include "ggml-opt.h"
#include "llama-cpp.h"

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32

#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do { \
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct common_adapter_lora_info {
    std::string path;
    float scale;

    std::string task_name;
    std::string prompt_prefix;

    struct llama_adapter_lora * ptr;
};

using llama_tokens = std::vector<llama_token>;

// build info
extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;

struct common_control_vector_load_info;

//
// CPU utils
//

struct cpu_params {
    int n_threads = -1;
    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool mask_valid = false; // Default: any CPU
    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool strict_cpu = false; // Use strict CPU placement
    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
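
//
// Illustrative sketch (not part of the API): pinning work to the first 8 logical
// CPUs with strict placement. All field names are the ones declared above; the
// thread count and mask are example values.
//
//   cpu_params cp;
//   cp.n_threads  = 8;
//   cp.mask_valid = true;            // use the affinity mask below
//   for (int i = 0; i < 8; ++i) {
//       cp.cpumask[i] = true;
//   }
//   cp.strict_cpu = true;            // request strict CPU placement
//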

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

//
// Common params
//

enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,

    LLAMA_EXAMPLE_COUNT,
};

enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE = 0,
    COMMON_SAMPLER_TYPE_DRY = 1,
    COMMON_SAMPLER_TYPE_TOP_K = 2,
    COMMON_SAMPLER_TYPE_TOP_P = 3,
    COMMON_SAMPLER_TYPE_MIN_P = 4,
    //COMMON_SAMPLER_TYPE_TFS_Z = 5,
    COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC = 8,
    COMMON_SAMPLER_TYPE_INFILL = 9,
    COMMON_SAMPLER_TYPE_PENALTIES = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
};

// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
};

enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED = 1,
    COMMON_CONVERSATION_MODE_AUTO = 2,
};

enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
};

struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;
    llama_token token = LLAMA_TOKEN_NULL;
};
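
//
// Illustrative sketch (not part of the API): a lazy-grammar trigger that only
// activates the grammar once the literal word "<tool_call>" has been generated.
// The trigger word used here is just an example value.
//
//   common_grammar_trigger trigger;
//   trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
//   trigger.value = "<tool_call>";
//   // for COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN, set trigger.token instead
//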

// sampling parameters
struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev = 64; // number of previous tokens to remember
    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k = 40; // <= 0 to use vocab size
    float top_p = 0.95f; // 1.0 = disabled
    float min_p = 0.05f; // 0.0 = disabled
    float xtc_probability = 0.00f; // 0.0 = disabled
    float xtc_threshold = 0.10f; // > 0.5 disables XTC
    float typ_p = 1.00f; // typical_p, 1.0 = disabled
    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
    float dynatemp_range = 0.00f; // 0.0 = disabled
    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float penalty_repeat = 1.00f; // 1.0 = disabled
    float penalty_freq = 0.00f; // 0.0 = disabled
    float penalty_present = 0.00f; // 0.0 = disabled
    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float top_n_sigma = -1.00f; // -1.0 = disabled
    float mirostat_tau = 5.00f; // target entropy
    float mirostat_eta = 0.10f; // learning rate
    bool ignore_eos = false;
    bool no_perf = false; // disable performance metrics
    bool timing_per_token = false;

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token> preserved_tokens;

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    // print the parameters into a string
    std::string print() const;
};
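
//
// Illustrative sketch (not part of the API): two common ways to configure the
// sampling parameters above. Values are examples only.
//
//   common_params_sampling sp;
//
//   // near-greedy decoding: temp <= 0.0 samples greedily (see the comment on `temp`)
//   sp.temp = 0.0f;
//
//   // or: restrict the candidate set and reorder the sampler chain
//   sp.top_k    = 20;
//   sp.min_p    = 0.1f;
//   sp.samplers = { COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_MIN_P, COMMON_SAMPLER_TYPE_TEMPERATURE };
//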

struct common_params_model {
    std::string path = ""; // model local path // NOLINT
    std::string url = ""; // model url to download // NOLINT
    std::string hf_repo = ""; // HF repo // NOLINT
    std::string hf_file = ""; // HF file // NOLINT
    std::string docker_repo = ""; // Docker repo // NOLINT
};

struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_ctx = 0; // draft context size
    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f; // minimum speculative decoding probability (greedy)
    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    struct common_params_model model;
};

struct common_params_vocoder {
    struct common_params_model model;

    std::string speaker_file = ""; // speaker file path // NOLINT

    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};

struct common_params_diffusion {
    int32_t steps = 128;
    bool visual_mode = false;

    float eps = 0; // epsilon for timesteps
    int32_t block_length = 0; // block length for generation

    int32_t algorithm = 4; // default algorithm: low-confidence
    float alg_temp = 0.0f; // algorithm temperature

    float cfg_scale = 0; // classifier-free guidance scale
    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};

// reasoning API response format (not to be confused with the chat template's reasoning format)
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    // do not extend this enum unless you absolutely have to
    // in most cases, use COMMON_REASONING_FORMAT_AUTO
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
};

struct lr_opt {
    float lr0 = 1e-5; // learning rate at first epoch
    float lr_min = -1;
    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float scale_epoch = 0;
    float wd = 0;
    unsigned epochs = 2;

    unsigned epoch; // set by optimizer outer (epochs) loop
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); }
    // must call after arg parse, before get_lr
    void init();
};
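
//
// Illustrative sketch (not part of the API): a typical fine-tuning schedule using
// the fields above - start at lr0 and decay towards lr_min over decay_epochs.
// Values are examples only; init() must run after argument parsing and before get_lr().
//
//   lr_opt lr;
//   lr.lr0          = 1e-4f;
//   lr.lr_min       = 1e-5f;
//   lr.decay_epochs = 4;
//   lr.epochs       = 8;
//   lr.init();
//   lr.epoch = 0;                 // set by the optimizer's outer loop
//   float cur_lr = lr.get_lr();   // learning rate for the current epoch
//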

struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

struct common_params {
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_ctx = 4096; // context size
    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel = 1; // number of parallel sequences to decode
    int32_t n_sequences = 1; // number of sequences to decode
    int32_t grp_attn_n = 1; // group-attention factor
    int32_t grp_attn_w = 512; // group-attention width
    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
    float rope_freq_base = 0.0f; // RoPE base frequency
    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
    float yarn_beta_fast = -1.0f; // YaRN low correction dim
    float yarn_beta_slow = -1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx = 0; // YaRN original context length

    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder vocoder;
    struct common_params_diffusion diffusion;

    struct common_params_model model;

    std::string model_alias = ""; // model alias // NOLINT
    std::string hf_token = ""; // HF token // NOLINT
    std::string prompt = ""; // NOLINT
    std::string system_prompt = ""; // NOLINT
    std::string prompt_file = ""; // store the external prompt file name // NOLINT
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
    std::string logits_file = ""; // file for saving *all* logits // NOLINT

    std::vector<std::string> in_files; // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end = -1; // layer range for control vector
    bool offline = false;

    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                 // (which is more convenient to use for plotting)

    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

    bool kl_divergence = false; // compute KL divergence

    bool usage = false; // print usage
    bool completion = false; // print source-able completion script
    bool use_color = false; // use color to distinguish generations and inputs
    bool special = false; // enable special token output
    bool interactive = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool prompt_cache_all = false; // save user input and generations to prompt cache
    bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input = false; // reverse the usage of `\`
    bool simple_io = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching = true; // insert new sequences for decoding on-the-fly
    bool no_perf = false; // disable performance metrics
    bool ctx_shift = false; // context shift on infinite text generation
    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified = false; // enable unified KV cache

    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap = true; // use mmap for faster loads
    bool use_mlock = false; // use mlock to keep model in memory
    bool verbose_prompt = false; // print prompt tokens before generation
    bool display_prompt = true; // print prompt before generation
    bool no_kv_offload = false; // disable KV offloading
    bool warmup = true; // warmup run
    bool check_tensors = false; // validate tensor data
    bool no_op_offload = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
    bool no_host = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn = false; // single turn chat conversation

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true; // use GPU for multimodal model
    bool no_mmproj = false; // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1;
    int image_max_tokens = -1;

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool embedding = false; // get only sentence embedding
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep = "\n"; // separator of embeddings
    std::string cls_sep = "\t"; // separator of classification sequences

    // server params
    int32_t port = 8080; // server listens on this network port
    int32_t timeout_read = 600; // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname = "127.0.0.1";
    std::string public_path = ""; // NOLINT
    std::string api_prefix = ""; // NOLINT
    std::string chat_template = ""; // NOLINT
    bool use_jinja = false; // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

    std::vector<std::string> api_keys;

    std::string ssl_file_key = ""; // NOLINT
    std::string ssl_file_cert = ""; // NOLINT

    std::map<std::string, std::string> default_template_kwargs;

    // "advanced" endpoints are disabled by default for better security
    bool webui = true;
    bool endpoint_slots = true;
    bool endpoint_props = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

    bool log_json = false;

    std::string slot_save_path;

    float slot_prompt_similarity = 0.1f;

    // batched-bench params
    bool is_pp_shared = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
    std::vector<int32_t> n_pl;

    // retrieval params
    std::vector<std::string> context_files; // context files to embed

    int32_t chunk_size = 64; // chunk size for context embedding

    std::string chunk_separator = "\n"; // chunk separator for context embedding

    // passkey params
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos = -1; // position of the passkey in the junk text

    // imatrix params
    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk = 0; // start processing from this chunk
    int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl = true; // whether to compute perplexity
    bool show_statistics = false; // show imatrix statistics per tensor
    bool parse_special = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    // batched-bench params
    bool batched_bench_output_jsonl = false;

    // common params
    std::string out_file; // output filename for all example programs
    // optional callback for model loading progress and cancellation:
    // called with a progress value between 0.0 and 1.0.
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data = NULL;

    bool has_speculative() const {
        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
    }
};
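
//
// Illustrative sketch (not part of the API): a minimal common_params setup for
// loading a local model and creating a context, assuming the defaults above are
// acceptable for everything else. The path and numeric values are examples only.
//
//   common_params params;
//   params.model.path    = "models/7B/ggml-model-f16.gguf";
//   params.n_ctx         = 8192;
//   params.n_gpu_layers  = 99;
//   params.sampling.temp = 0.7f;
//   common_init_result init = common_init_from_params(params);  // declared below
//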

// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void common_init();

std::string common_params_get_system_info(const common_params & params);

bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
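
//
// Illustrative sketch (not part of the API): filling a cpu_params affinity mask
// from a user-supplied string. "0-7" and "0xff" are example inputs only; on
// success the corresponding entries of the mask are set.
//
//   cpu_params cp;
//   if (parse_cpu_range("0-7", cp.cpumask) || parse_cpu_mask("0xff", cp.cpumask)) {
//       cp.mask_valid = true;
//   }
//   postprocess_cpu_params(cp);
//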

//
// String utils
//

#ifdef __GNUC__
# if defined(__MINGW32__) && !defined(__clang__)
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif

LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...);

std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();

std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);

void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

std::string regex_escape(const std::string & s);

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}
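
//
// Illustrative sketch (not part of the API): splitting a comma-separated list of
// numbers with the template above. The input string is an example value.
//
//   std::vector<int> ids = string_split<int>("1,2,3", ',');   // -> {1, 2, 3}
//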

template<>
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
        parts.emplace_back(part);
        begin_pos = separator_pos + 1;
        separator_pos = input.find(separator, begin_pos);
    }
    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
    return parts;
}

static bool string_starts_with(const std::string & str,
                               const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
    return str.rfind(prefix, 0) == 0;
}

// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);

//
// Filesystem utils
//

bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);

std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

//
// Model utils
//

// note: defines object's lifetime
struct common_init_result {
    llama_model_ptr model;
    llama_context_ptr context;

    std::vector<llama_adapter_lora_ptr> lora;
};

struct common_init_result common_init_from_params(common_params & params);

struct llama_model_params common_model_params_to_llama(common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

std::string get_model_endpoint();

//
// Batch utils
//

void common_batch_clear(struct llama_batch & batch);

void common_batch_add(
    struct llama_batch & batch,
    llama_token id,
    llama_pos pos,
    const std::vector<llama_seq_id> & seq_ids,
    bool logits);
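
//
// Illustrative sketch (not part of the API): building a batch that evaluates a
// prompt on sequence 0 and requests logits only for the last token. `batch` and
// `prompt_tokens` are assumed to exist; the names are examples.
//
//   common_batch_clear(batch);
//   for (size_t i = 0; i < prompt_tokens.size(); ++i) {
//       const bool is_last = (i == prompt_tokens.size() - 1);
//       common_batch_add(batch, prompt_tokens[i], (llama_pos) i, { 0 }, is_last);
//   }
//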

//
// Token utils
//

// longest common prefix
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);

// longest common subsequence
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);

//
// Vocab utils
//

// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> common_tokenize(
    const struct llama_context * ctx,
    const std::string & text,
    bool add_special,
    bool parse_special = false);

std::vector<llama_token> common_tokenize(
    const struct llama_vocab * vocab,
    const std::string & text,
    bool add_special,
    bool parse_special = false);

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string common_token_to_piece(
    const struct llama_context * ctx,
    llama_token token,
    bool special = true);

std::string common_token_to_piece(
    const struct llama_vocab * vocab,
    llama_token token,
    bool special = true);

// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string common_detokenize(
    const struct llama_context * ctx,
    const std::vector<llama_token> & tokens,
    bool special = true);

std::string common_detokenize(
    const struct llama_vocab * vocab,
    const std::vector<llama_token> & tokens,
    bool special = true);
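
//
// Illustrative sketch (not part of the API): a tokenize/detokenize round trip.
// `ctx` is assumed to be a valid llama_context; the text is an example value.
//
//   std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", /*add_special=*/true);
//   std::string text = common_detokenize(ctx, toks, /*special=*/false);
//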

//
// Embedding utils
//

// TODO: replace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
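
//
// Illustrative sketch (not part of the API): normalizing two raw embeddings and
// comparing them. `n_embd`, `raw_a` and `raw_b` are assumed to be provided by the
// caller; 2 selects euclidean normalization (see embd_normalize in common_params).
//
//   std::vector<float> a(n_embd), b(n_embd);
//   common_embd_normalize(raw_a, a.data(), n_embd, 2);
//   common_embd_normalize(raw_b, b.data(), n_embd, 2);
//   float sim = common_embd_similarity_cos(a.data(), b.data(), n_embd);
//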

//
// Control vector utils
//

struct common_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};

struct common_control_vector_load_info {
    float strength;

    std::string fname;
};

// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
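
//
// Illustrative sketch (not part of the API): loading two control vectors with
// user-defined strengths and checking for failure. The file names are example values.
//
//   std::vector<common_control_vector_load_info> infos = {
//       {  0.8f, "happy.gguf" },
//       { -0.4f, "sad.gguf"   },
//   };
//   common_control_vector_data cvec = common_control_vector_load(infos);
//   if (cvec.n_embd == -1) { /* load failed */ }
//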

//
// Split utils
//

namespace {

const char * const LLM_KV_SPLIT_NO            = "split.no";
const char * const LLM_KV_SPLIT_COUNT         = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}

//
// MoE utils
//

const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

static std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}

static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
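
//
// Illustrative sketch (not part of the API): keeping the per-expert FFN tensors on
// the CPU (e.g. to fit a large MoE model in limited VRAM) by adding the override
// above to a common_params instance; `params` is an assumed variable name.
//
//   params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
//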

//
// training utils
//

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);

// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);