// Various helper functions and utilities

#pragma once

#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <cmath>

#include "ggml-opt.h"
#include "llama-cpp.h"

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32

#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do { \
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct common_adapter_lora_info {
    std::string path;
    float scale;

    std::string task_name;
    std::string prompt_prefix;

    struct llama_adapter_lora * ptr;
};

using llama_tokens = std::vector<llama_token>;

// build info
extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;

struct common_control_vector_load_info;

//
// CPU utils
//

struct cpu_params {
    int n_threads = -1;
    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool mask_valid = false; // Default: any CPU
    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool strict_cpu = false; // Use strict CPU placement
    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
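
//
// Illustrative sketch (not part of the API): pinning work to the first 8 logical
// CPUs with strict placement. All field names are the ones declared above; the
// thread count and mask are example values.
//
//   cpu_params cp;
//   cp.n_threads  = 8;
//   cp.mask_valid = true;            // use the affinity mask below
//   for (int i = 0; i < 8; ++i) {
//       cp.cpumask[i] = true;
//   }
//   cp.strict_cpu = true;            // request strict CPU placement
//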

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

//
// Common params
//

enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,

    LLAMA_EXAMPLE_COUNT,
};

enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE = 0,
    COMMON_SAMPLER_TYPE_DRY = 1,
    COMMON_SAMPLER_TYPE_TOP_K = 2,
    COMMON_SAMPLER_TYPE_TOP_P = 3,
    COMMON_SAMPLER_TYPE_MIN_P = 4,
    //COMMON_SAMPLER_TYPE_TFS_Z = 5,
    COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC = 8,
    COMMON_SAMPLER_TYPE_INFILL = 9,
    COMMON_SAMPLER_TYPE_PENALTIES = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
};

// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
};

enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED = 1,
    COMMON_CONVERSATION_MODE_AUTO = 2,
};

enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
};

struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;
    llama_token token = LLAMA_TOKEN_NULL;
};
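
//
// Illustrative sketch (not part of the API): a lazy-grammar trigger that only
// activates the grammar once the literal word "<tool_call>" has been generated.
// The trigger word used here is just an example value.
//
//   common_grammar_trigger trigger;
//   trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
//   trigger.value = "<tool_call>";
//   // for COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN, set trigger.token instead
//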

// sampling parameters
struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev = 64; // number of previous tokens to remember
    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k = 40; // <= 0 to use vocab size
    float top_p = 0.95f; // 1.0 = disabled
    float min_p = 0.05f; // 0.0 = disabled
    float xtc_probability = 0.00f; // 0.0 = disabled
    float xtc_threshold = 0.10f; // > 0.5 disables XTC
    float typ_p = 1.00f; // typical_p, 1.0 = disabled
    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
    float dynatemp_range = 0.00f; // 0.0 = disabled
    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float penalty_repeat = 1.00f; // 1.0 = disabled
    float penalty_freq = 0.00f; // 0.0 = disabled
    float penalty_present = 0.00f; // 0.0 = disabled
    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float top_n_sigma = -1.00f; // -1.0 = disabled
    float mirostat_tau = 5.00f; // target entropy
    float mirostat_eta = 0.10f; // learning rate
    bool ignore_eos = false;
    bool no_perf = false; // disable performance metrics
    bool timing_per_token = false;

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token> preserved_tokens;

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    // print the parameters into a string
    std::string print() const;
};
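
//
// Illustrative sketch (not part of the API): two common ways to configure the
// sampling parameters above. Values are examples only.
//
//   common_params_sampling sp;
//
//   // near-greedy decoding: temp <= 0.0 samples greedily (see the comment on `temp`)
//   sp.temp = 0.0f;
//
//   // or: restrict the candidate set and reorder the sampler chain
//   sp.top_k    = 20;
//   sp.min_p    = 0.1f;
//   sp.samplers = { COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_MIN_P, COMMON_SAMPLER_TYPE_TEMPERATURE };
//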

struct common_params_model {
    std::string path = ""; // model local path // NOLINT
    std::string url = ""; // model url to download // NOLINT
    std::string hf_repo = ""; // HF repo // NOLINT
    std::string hf_file = ""; // HF file // NOLINT
    std::string docker_repo = ""; // Docker repo // NOLINT
};

struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_ctx = 0; // draft context size
    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f; // minimum speculative decoding probability (greedy)
    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    struct common_params_model model;
};

struct common_params_vocoder {
    struct common_params_model model;

    std::string speaker_file = ""; // speaker file path // NOLINT

    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};

struct common_params_diffusion {
    int32_t steps = 128;
    bool visual_mode = false;

    float eps = 0; // epsilon for timesteps
    int32_t block_length = 0; // block length for generation

    int32_t algorithm = 4; // default algorithm: low-confidence
    float alg_temp = 0.0f; // algorithm temperature

    float cfg_scale = 0; // classifier-free guidance scale
    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};

// reasoning API response format (not to be confused with the chat template's reasoning format)
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    // do not extend this enum unless you absolutely have to
    // in most cases, use COMMON_REASONING_FORMAT_AUTO
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
};

struct lr_opt {
    float lr0 = 1e-5; // learning rate at first epoch
    float lr_min = -1;
    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float scale_epoch = 0;
    float wd = 0;
    unsigned epochs = 2;

    unsigned epoch; // set by optimizer outer (epochs) loop
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); }
    // must call after arg parse, before get_lr
    void init();
};
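
//
// Illustrative sketch (not part of the API): a typical fine-tuning schedule using
// the fields above - start at lr0 and decay towards lr_min over decay_epochs.
// Values are examples only; init() must run after argument parsing and before get_lr().
//
//   lr_opt lr;
//   lr.lr0          = 1e-4f;
//   lr.lr_min       = 1e-5f;
//   lr.decay_epochs = 4;
//   lr.epochs       = 8;
//   lr.init();
//   lr.epoch = 0;                 // set by the optimizer's outer loop
//   float cur_lr = lr.get_lr();   // learning rate for the current epoch
//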

struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

struct common_params {
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_ctx = 4096; // context size
    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel = 1; // number of parallel sequences to decode
    int32_t n_sequences = 1; // number of sequences to decode
    int32_t grp_attn_n = 1; // group-attention factor
    int32_t grp_attn_w = 512; // group-attention width
    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
    float rope_freq_base = 0.0f; // RoPE base frequency
    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
    float yarn_beta_fast = -1.0f; // YaRN low correction dim
    float yarn_beta_slow = -1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx = 0; // YaRN original context length

    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder vocoder;
    struct common_params_diffusion diffusion;

    struct common_params_model model;

    std::string model_alias = ""; // model alias // NOLINT
    std::string hf_token = ""; // HF token // NOLINT
    std::string prompt = ""; // NOLINT
    std::string system_prompt = ""; // NOLINT
    std::string prompt_file = ""; // store the external prompt file name // NOLINT
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
    std::string logits_file = ""; // file for saving *all* logits // NOLINT

    std::vector<std::string> in_files; // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end = -1; // layer range for control vector
    bool offline = false;

    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                 // (which is more convenient to use for plotting)

    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

    bool kl_divergence = false; // compute KL divergence

    bool usage = false; // print usage
    bool completion = false; // print source-able completion script
    bool use_color = false; // use color to distinguish generations and inputs
    bool special = false; // enable special token output
    bool interactive = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool prompt_cache_all = false; // save user input and generations to prompt cache
    bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input = false; // reverse the usage of `\`
    bool simple_io = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching = true; // insert new sequences for decoding on-the-fly
    bool no_perf = false; // disable performance metrics
    bool ctx_shift = false; // context shift on infinite text generation
    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified = false; // enable unified KV cache

    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap = true; // use mmap for faster loads
    bool use_mlock = false; // use mlock to keep model in memory
    bool verbose_prompt = false; // print prompt tokens before generation
    bool display_prompt = true; // print prompt before generation
    bool no_kv_offload = false; // disable KV offloading
    bool warmup = true; // warmup run
    bool check_tensors = false; // validate tensor data
    bool no_op_offload = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
    bool no_host = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn = false; // single turn chat conversation

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true; // use GPU for multimodal model
    bool no_mmproj = false; // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1;
    int image_max_tokens = -1;

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool embedding = false; // get only sentence embedding
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep = "\n"; // separator of embeddings
    std::string cls_sep = "\t"; // separator of classification sequences

    // server params
    int32_t port = 8080; // server listens on this network port
    int32_t timeout_read = 600; // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname = "127.0.0.1";
    std::string public_path = ""; // NOLINT
    std::string api_prefix = ""; // NOLINT
    std::string chat_template = ""; // NOLINT
    bool use_jinja = false; // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

    std::vector<std::string> api_keys;

    std::string ssl_file_key = ""; // NOLINT
    std::string ssl_file_cert = ""; // NOLINT

    std::map<std::string, std::string> default_template_kwargs;

    // "advanced" endpoints are disabled by default for better security
    bool webui = true;
    bool endpoint_slots = true;
    bool endpoint_props = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

    bool log_json = false;

    std::string slot_save_path;

    float slot_prompt_similarity = 0.1f;

    // batched-bench params
    bool is_pp_shared = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
    std::vector<int32_t> n_pl;

    // retrieval params
    std::vector<std::string> context_files; // context files to embed

    int32_t chunk_size = 64; // chunk size for context embedding

    std::string chunk_separator = "\n"; // chunk separator for context embedding

    // passkey params
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos = -1; // position of the passkey in the junk text

    // imatrix params
    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk = 0; // start processing from this chunk
    int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl = true; // whether to compute perplexity
    bool show_statistics = false; // show imatrix statistics per tensor
    bool parse_special = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    // batched-bench params
    bool batched_bench_output_jsonl = false;

    // common params
    std::string out_file; // output filename for all example programs
    // optional callback for model loading progress and cancellation:
    // called with a progress value between 0.0 and 1.0.
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data = NULL;

    bool has_speculative() const {
        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
    }
};
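
//
// Illustrative sketch (not part of the API): a minimal common_params setup for
// loading a local model and creating a context, assuming the defaults above are
// acceptable for everything else. The path and numeric values are examples only.
//
//   common_params params;
//   params.model.path    = "models/7B/ggml-model-f16.gguf";
//   params.n_ctx         = 8192;
//   params.n_gpu_layers  = 99;
//   params.sampling.temp = 0.7f;
//   common_init_result init = common_init_from_params(params);  // declared below
//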

// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void common_init();

std::string common_params_get_system_info(const common_params & params);

bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
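
//
// Illustrative sketch (not part of the API): filling a cpu_params affinity mask
// from a user-supplied string. "0-7" and "0xff" are example inputs only; on
// success the corresponding entries of the mask are set.
//
//   cpu_params cp;
//   if (parse_cpu_range("0-7", cp.cpumask) || parse_cpu_mask("0xff", cp.cpumask)) {
//       cp.mask_valid = true;
//   }
//   postprocess_cpu_params(cp);
//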

//
// String utils
//

#ifdef __GNUC__
# if defined(__MINGW32__) && !defined(__clang__)
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif

LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...);

std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();

std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);

void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

std::string regex_escape(const std::string & s);

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}
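
//
// Illustrative sketch (not part of the API): splitting a comma-separated list of
// numbers with the template above. The input string is an example value.
//
//   std::vector<int> ids = string_split<int>("1,2,3", ',');   // -> {1, 2, 3}
//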

template<>
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
        parts.emplace_back(part);
        begin_pos = separator_pos + 1;
        separator_pos = input.find(separator, begin_pos);
    }
    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
    return parts;
}

static bool string_starts_with(const std::string & str,
                               const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
    return str.rfind(prefix, 0) == 0;
}

// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);

//
// Filesystem utils
//

bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);

std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

//
// Model utils
//

// note: defines object's lifetime
struct common_init_result {
    llama_model_ptr model;
    llama_context_ptr context;

    std::vector<llama_adapter_lora_ptr> lora;
};

struct common_init_result common_init_from_params(common_params & params);

struct llama_model_params common_model_params_to_llama(common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

std::string get_model_endpoint();

//
// Batch utils
//

void common_batch_clear(struct llama_batch & batch);

void common_batch_add(
    struct llama_batch & batch,
    llama_token id,
    llama_pos pos,
    const std::vector<llama_seq_id> & seq_ids,
    bool logits);
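
//
// Illustrative sketch (not part of the API): building a batch that evaluates a
// prompt on sequence 0 and requests logits only for the last token. `batch` and
// `prompt_tokens` are assumed to exist; the names are examples.
//
//   common_batch_clear(batch);
//   for (size_t i = 0; i < prompt_tokens.size(); ++i) {
//       const bool is_last = (i == prompt_tokens.size() - 1);
//       common_batch_add(batch, prompt_tokens[i], (llama_pos) i, { 0 }, is_last);
//   }
//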

//
// Token utils
//

// longest common prefix
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);

// longest common subsequence
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);

//
// Vocab utils
//

// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> common_tokenize(
    const struct llama_context * ctx,
    const std::string & text,
    bool add_special,
    bool parse_special = false);

std::vector<llama_token> common_tokenize(
    const struct llama_vocab * vocab,
    const std::string & text,
    bool add_special,
    bool parse_special = false);

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string common_token_to_piece(
    const struct llama_context * ctx,
    llama_token token,
    bool special = true);

std::string common_token_to_piece(
    const struct llama_vocab * vocab,
    llama_token token,
    bool special = true);

// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string common_detokenize(
    const struct llama_context * ctx,
    const std::vector<llama_token> & tokens,
    bool special = true);

std::string common_detokenize(
    const struct llama_vocab * vocab,
    const std::vector<llama_token> & tokens,
    bool special = true);
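
//
// Illustrative sketch (not part of the API): a tokenize/detokenize round trip.
// `ctx` is assumed to be a valid llama_context; the text is an example value.
//
//   std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", /*add_special=*/true);
//   std::string text = common_detokenize(ctx, toks, /*special=*/false);
//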

//
// Embedding utils
//

// TODO: replace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
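
//
// Illustrative sketch (not part of the API): normalizing two raw embeddings and
// comparing them. `n_embd`, `raw_a` and `raw_b` are assumed to be provided by the
// caller; 2 selects euclidean normalization (see embd_normalize in common_params).
//
//   std::vector<float> a(n_embd), b(n_embd);
//   common_embd_normalize(raw_a, a.data(), n_embd, 2);
//   common_embd_normalize(raw_b, b.data(), n_embd, 2);
//   float sim = common_embd_similarity_cos(a.data(), b.data(), n_embd);
//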

//
// Control vector utils
//

struct common_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};

struct common_control_vector_load_info {
    float strength;

    std::string fname;
};

// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
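
//
// Illustrative sketch (not part of the API): loading two control vectors with
// user-defined strengths and checking for failure. The file names are example values.
//
//   std::vector<common_control_vector_load_info> infos = {
//       {  0.8f, "happy.gguf" },
//       { -0.4f, "sad.gguf"   },
//   };
//   common_control_vector_data cvec = common_control_vector_load(infos);
//   if (cvec.n_embd == -1) { /* load failed */ }
//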

//
// Split utils
//

namespace {

const char * const LLM_KV_SPLIT_NO            = "split.no";
const char * const LLM_KV_SPLIT_COUNT         = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}

//
// MoE utils
//

const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

static std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}

static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
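
//
// Illustrative sketch (not part of the API): keeping the per-expert FFN tensors on
// the CPU (e.g. to fit a large MoE model in limited VRAM) by adding the override
// above to a common_params instance; `params` is an assumed variable name.
//
//   params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
//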

//
// training utils
//

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);

// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);