| 1 | #include "arg.h" |
| 2 | |
| 3 | #include "chat.h" |
| 4 | #include "common.h" |
| 5 | #include "json-schema-to-grammar.h" |
| 6 | #include "log.h" |
| 7 | #include "sampling.h" |
| 8 | #include "download.h" |
| 9 | |
| 10 | // fix problem with std::min and std::max |
| 11 | #if defined(_WIN32) |
| 12 | #define WIN32_LEAN_AND_MEAN |
| 13 | #ifndef NOMINMAX |
| 14 | # define NOMINMAX |
| 15 | #endif |
| 16 | #include <windows.h> |
| 17 | #endif |
| 18 | |
| 19 | #define JSON_ASSERT GGML_ASSERT |
| 20 | #include <nlohmann/json.hpp> |
| 21 | |
| 22 | #include <algorithm> |
| 23 | #include <climits> |
| 24 | #include <cstdarg> |
| 25 | #include <fstream> |
| 26 | #include <list> |
| 27 | #include <regex> |
| 28 | #include <set> |
| 29 | #include <string> |
| 30 | #include <thread> // for hardware_concurrency |
| 31 | #include <vector> |
| 32 | |
| 33 | #ifdef __linux__ |
| 34 | #include <linux/limits.h> |
| 35 | #elif defined(_WIN32) |
| 36 | # if !defined(PATH_MAX) |
| 37 | # define PATH_MAX MAX_PATH |
| 38 | # endif |
| 39 | #elif defined(_AIX) |
| 40 | #include <sys/limits.h> |
| 41 | #else |
| 42 | #include <sys/syslimits.h> |
| 43 | #endif |
| 44 | #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 |
| 45 | |
| 46 | using json = nlohmann::ordered_json; |
| 47 | |
| 48 | static std::initializer_list<enum llama_example> mmproj_examples = { |
| 49 | LLAMA_EXAMPLE_MTMD, |
| 50 | LLAMA_EXAMPLE_SERVER, |
| 51 | }; |
| 52 | |
| 53 | static std::string read_file(const std::string & fname) { |
| 54 | std::ifstream file(fname); |
| 55 | if (!file) { |
| 56 | throw std::runtime_error(string_format(fmt: "error: failed to open file '%s'\n" , fname.c_str())); |
| 57 | } |
| 58 | std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); |
| 59 | file.close(); |
| 60 | return content; |
| 61 | } |
| 62 | |
| 63 | common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) { |
| 64 | this->examples = examples; |
| 65 | return *this; |
| 66 | } |
| 67 | |
| 68 | common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) { |
| 69 | this->excludes = excludes; |
| 70 | return *this; |
| 71 | } |
| 72 | |
| 73 | common_arg & common_arg::set_env(const char * env) { |
| 74 | help = help + "\n(env: " + env + ")" ; |
| 75 | this->env = env; |
| 76 | return *this; |
| 77 | } |
| 78 | |
| 79 | common_arg & common_arg::set_sparam() { |
| 80 | is_sparam = true; |
| 81 | return *this; |
| 82 | } |
| 83 | |
| 84 | bool common_arg::in_example(enum llama_example ex) { |
| 85 | return examples.find(x: ex) != examples.end(); |
| 86 | } |
| 87 | |
| 88 | bool common_arg::is_exclude(enum llama_example ex) { |
| 89 | return excludes.find(x: ex) != excludes.end(); |
| 90 | } |
| 91 | |
| 92 | bool common_arg::get_value_from_env(std::string & output) const { |
| 93 | if (env == nullptr) return false; |
| 94 | char * value = std::getenv(name: env); |
| 95 | if (value) { |
| 96 | output = value; |
| 97 | return true; |
| 98 | } |
| 99 | return false; |
| 100 | } |
| 101 | |
| 102 | bool common_arg::has_value_from_env() const { |
| 103 | return env != nullptr && std::getenv(name: env); |
| 104 | } |
| 105 | |
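// greedy word-wrap helper used when rendering --help text:
// splits `input` on newlines, then wraps each resulting line at word boundaries
// so that no output line exceeds max_char_per_line characters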
static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
    std::vector<std::string> result;
    std::istringstream iss(input);
    std::string line;
    auto add_line = [&](const std::string & l) {
        if (l.length() <= max_char_per_line) {
            result.push_back(l);
        } else {
            std::istringstream line_stream(l);
            std::string word, current_line;
            while (line_stream >> word) {
                if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
                    if (!current_line.empty()) result.push_back(current_line);
                    current_line = word;
                } else {
                    current_line += (!current_line.empty() ? " " : "") + word;
                }
            }
            if (!current_line.empty()) result.push_back(current_line);
        }
    };
    while (std::getline(iss, line)) {
        add_line(line);
    }
    return result;
}

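// format this option for --help output: the argument names (and value hints) form a left-hand
// column of n_leading_spaces characters, and the help text is word-wrapped to
// n_char_per_line_help characters and aligned to the right of that column;
// an overly long argument list pushes the help text onto the next line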
std::string common_arg::to_string() {
    // params for printing to console
    const static int n_leading_spaces = 40;
    const static int n_char_per_line_help = 70; // TODO: detect this based on current console
    std::string leading_spaces(n_leading_spaces, ' ');

    std::ostringstream ss;
    for (const auto arg : args) {
        if (arg == args.front()) {
            if (args.size() == 1) {
                ss << arg;
            } else {
                // first arg is usually abbreviation, we need padding to make it more beautiful
                auto tmp = std::string(arg) + ", ";
                auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' ');
                ss << tmp << spaces;
            }
        } else {
            ss << arg << (arg != args.back() ? ", " : "");
        }
    }
    if (value_hint) ss << " " << value_hint;
    if (value_hint_2) ss << " " << value_hint_2;
    if (ss.tellp() > n_leading_spaces - 3) {
        // current line is too long, add new line
        ss << "\n" << leading_spaces;
    } else {
        // padding between arg and help, same line
        ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
    }
    const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
    for (const auto & line : help_lines) {
        ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
    }
    return ss.str();
}

//
// utils
//

// Helper function to parse tensor buffer override strings
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        auto * dev = ggml_backend_dev_get(i);
        auto * buft = ggml_backend_dev_buffer_type(dev);
        if (buft) {
            buft_list[ggml_backend_buft_name(buft)] = buft;
        }
    }

    for (const auto & override : string_split<std::string>(value, ',')) {
        std::string::size_type pos = override.find('=');
        if (pos == std::string::npos) {
            throw std::invalid_argument("invalid value");
        }
        std::string tensor_name = override.substr(0, pos);
        std::string buffer_type = override.substr(pos + 1);

        if (buft_list.find(buffer_type) == buft_list.end()) {
            printf("Available buffer types:\n");
            for (const auto & it : buft_list) {
                printf("  %s\n", ggml_backend_buft_name(it.second));
            }
            throw std::invalid_argument("unknown buffer type");
        }
        // keep strings alive and avoid leaking memory by storing them in a static vector
        static std::list<std::string> buft_overrides;
        buft_overrides.push_back(tensor_name);
        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
    }
}

struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
};

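// resolve the final path/url for `model` and download it if needed;
// precedence: model.docker_repo > model.hf_repo (with hf_file auto-detected when missing)
// > model.url > an existing model.path > model_path_default;
// when the HF file is auto-detected, an associated mmproj file (if any) is reported back
// to the caller via handle_model_result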
static handle_model_result common_params_handle_model(
        struct common_params_model & model,
        const std::string & bearer_token,
        const std::string & model_path_default,
        bool offline) {
    handle_model_result result;
    // handle pre-fill default model path and url based on hf_repo and hf_file
    {
        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
            model.path = common_docker_resolve_model(model.docker_repo);
        } else if (!model.hf_repo.empty()) {
            // short-hand to avoid specifying --hf-file -> default it to --model
            if (model.hf_file.empty()) {
                if (model.path.empty()) {
                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                        exit(1); // built without CURL, error message already printed
                    }
                    model.hf_repo = auto_detected.repo;
                    model.hf_file = auto_detected.ggufFile;
                    if (!auto_detected.mmprojFile.empty()) {
                        result.found_mmproj   = true;
                        result.mmproj.hf_repo = model.hf_repo;
                        result.mmproj.hf_file = auto_detected.mmprojFile;
                    }
                } else {
                    model.hf_file = model.path;
                }
            }

            std::string model_endpoint = get_model_endpoint();
            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
            // make sure model path is present (for caching purposes)
            if (model.path.empty()) {
                // this is to avoid different repo having same file name, or same file name in different subdirs
                std::string filename = model.hf_repo + "_" + model.hf_file;
                // to make sure we don't have any slashes in the filename
                string_replace_all(filename, "/", "_");
                model.path = fs_get_cache_file(filename);
            }

        } else if (!model.url.empty()) {
            if (model.path.empty()) {
                auto f = string_split<std::string>(model.url, '#').front();
                f = string_split<std::string>(f, '?').front();
                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
            }

        } else if (model.path.empty()) {
            model.path = model_path_default;
        }
    }

    // then, download it if needed
    if (!model.url.empty()) {
        bool ok = common_download_model(model, bearer_token, offline);
        if (!ok) {
            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
            exit(1);
        }
    }

    return result;
}

const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_BF16,
    GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
};

static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (const auto & type : kv_cache_types) {
        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
    }
    return msg.str();
}

//
// CLI argument parsing functions
//

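// core parser: first applies values taken from environment variables, then walks argv;
// command-line arguments take precedence over (and warn about) a conflicting environment variable,
// and "_" in arguments starting with "--" is normalized to "-" before option lookup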
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

    std::unordered_map<std::string, common_arg *> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
            arg_to_options[arg] = &opt;
        }
    }

    // handle environment variables
    for (auto & opt : ctx_arg.options) {
        std::string value;
        if (opt.get_value_from_env(value)) {
            try {
                if (opt.handler_void && (value == "1" || value == "true")) {
                    opt.handler_void(params);
                }
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(value));
                }
                if (opt.handler_string) {
                    opt.handler_string(params, value);
                    continue;
                }
            } catch (std::exception & e) {
                throw std::invalid_argument(string_format(
                    "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
            }
        }
    }

    // handle command line arguments
    auto check_arg = [&](int i) {
        if (i+1 >= argc) {
            throw std::invalid_argument("expected value for argument");
        }
    };

    for (int i = 1; i < argc; i++) {
        const std::string arg_prefix = "--";

        std::string arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
        if (arg_to_options.find(arg) == arg_to_options.end()) {
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
        auto opt = *arg_to_options[arg];
        if (opt.has_value_from_env()) {
            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
        }
        try {
            if (opt.handler_void) {
                opt.handler_void(params);
                continue;
            }

            // arg with single value
            check_arg(i);
            std::string val = argv[++i];
            if (opt.handler_int) {
                opt.handler_int(params, std::stoi(val));
                continue;
            }
            if (opt.handler_string) {
                opt.handler_string(params, val);
                continue;
            }

            // arg with 2 values
            check_arg(i);
            std::string val2 = argv[++i];
            if (opt.handler_str_str) {
                opt.handler_str_str(params, val, val2);
                continue;
            }
        } catch (std::exception & e) {
            throw std::invalid_argument(string_format(
                "error while handling argument \"%s\": %s\n\n"
                "usage:\n%s\n\nto show complete usage, run with -h",
                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
        }
    }

    postprocess_cpu_params(params.cpuparams, nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

    postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

    if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

    // handle model and download
    {
        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
        if (params.no_mmproj) {
            params.mmproj = {};
        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
            // optionally, handle mmproj model when -hf is specified
            params.mmproj = res.mmproj;
        }
        // only download mmproj if the current example is using it
        for (auto & ex : mmproj_examples) {
            if (ctx_arg.ex == ex) {
                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                break;
            }
        }
        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
    }

    if (params.escape) {
        string_process_escapes(params.prompt);
        string_process_escapes(params.input_prefix);
        string_process_escapes(params.input_suffix);
        for (auto & antiprompt : params.antiprompt) {
            string_process_escapes(antiprompt);
        }
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
        for (auto & pair : params.speculative.replacements) {
            string_process_escapes(pair.first);
            string_process_escapes(pair.second);
        }
    }

    if (!params.kv_overrides.empty()) {
        params.kv_overrides.emplace_back();
        params.kv_overrides.back().key[0] = 0;
    }

    if (!params.tensor_buft_overrides.empty()) {
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.speculative.tensor_buft_overrides.empty()) {
        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
        throw std::runtime_error(string_format(
            "error: the supplied chat template is not supported: %s%s\n",
            params.chat_template.c_str(),
            params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
        ));
    }

    return true;
}

static void common_params_print_usage(common_params_context & ctx_arg) {
    auto print_options = [](std::vector<common_arg *> & options) {
        for (common_arg * opt : options) {
            printf("%s", opt->to_string().c_str());
        }
    };

    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;
    for (auto & opt : ctx_arg.options) {
        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }
    printf("----- common params -----\n\n");
    print_options(common_options);
    printf("\n\n----- sampling params -----\n\n");
    print_options(sparam_options);
    // TODO: maybe convert enum llama_example to string
    printf("\n\n----- example-specific params -----\n\n");
    print_options(specific_options);
}

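// print a bash completion script to stdout; the output is meant to be source'd by the user
// (see --completion-bash) and registers the _llama_completions function for the known
// llama-* executables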
static void common_params_print_completion(common_params_context & ctx_arg) {
    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;

    for (auto & opt : ctx_arg.options) {
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }

    printf("_llama_completions() {\n");
    printf("    local cur prev opts\n");
    printf("    COMPREPLY=()\n");
    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");

    printf("    opts=\"");
    auto print_options = [](const std::vector<common_arg *> & options) {
        for (const common_arg * opt : options) {
            for (const char * arg : opt->args) {
                printf("%s ", arg);
            }
        }
    };

    print_options(common_options);
    print_options(sparam_options);
    print_options(specific_options);
    printf("\"\n\n");

    printf("    case \"$prev\" in\n");
    printf("        --model|-m)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --grammar-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --chat-template-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        *)\n");
    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("    esac\n");
    printf("}\n\n");

    std::set<std::string> executables = {
        "llama-batched",
        "llama-batched-bench",
        "llama-bench",
        "llama-cli",
        "llama-convert-llama2c-to-ggml",
        "llama-cvector-generator",
        "llama-embedding",
        "llama-eval-callback",
        "llama-export-lora",
        "llama-gen-docs",
        "llama-gguf",
        "llama-gguf-hash",
        "llama-gguf-split",
        "llama-gritlm",
        "llama-imatrix",
        "llama-infill",
        "llama-mtmd-cli",
        "llama-llava-clip-quantize-cli",
        "llama-lookahead",
        "llama-lookup",
        "llama-lookup-create",
        "llama-lookup-merge",
        "llama-lookup-stats",
        "llama-parallel",
        "llama-passkey",
        "llama-perplexity",
        "llama-q8dot",
        "llama-quantize",
        "llama-qwen2vl-cli",
        "llama-retrieval",
        "llama-run",
        "llama-save-load-state",
        "llama-server",
        "llama-simple",
        "llama-simple-chat",
        "llama-speculative",
        "llama-speculative-simple",
        "llama-tokenize",
        "llama-tts",
        "llama-vdot"
    };

    for (const auto & exe : executables) {
        printf("complete -F _llama_completions %s\n", exe.c_str());
    }
}

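// parse a comma-separated device list into ggml backend device handles;
// "none" selects no devices, CPU devices are rejected, and the returned vector
// always ends with a trailing nullptr sentinel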
static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    auto dev_names = string_split<std::string>(value, ',');
    if (dev_names.empty()) {
        throw std::invalid_argument("no devices specified");
    }
    if (dev_names.size() == 1 && dev_names[0] == "none") {
        devices.push_back(nullptr);
    } else {
        for (const auto & device : dev_names) {
            auto * dev = ggml_backend_dev_by_name(device.c_str());
            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
            }
            devices.push_back(dev);
        }
        devices.push_back(nullptr);
    }
    return devices;
}

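// register a comma-separated list of RPC server endpoints as ggml backend devices;
// the "ggml_backend_rpc_add_server" entry point is resolved at runtime through
// ggml_backend_reg_get_proc_address, so there is no hard link-time dependency on the RPC backend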
static void add_rpc_devices(const std::string & servers) {
    auto rpc_servers = string_split<std::string>(servers, ',');
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
    }
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
    }
    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
    if (!ggml_backend_rpc_add_server_fn) {
        throw std::invalid_argument("failed to find RPC add server function");
    }
    for (const auto & server : rpc_servers) {
        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
        ggml_backend_register(reg);
    }
}

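// public entry point used by the examples: parses argv (and environment variables) into `params`;
// returns false and restores the original params on invalid input, prints usage or the completion
// script and exits with status 0 for -h/--help and --completion-bash, and exits with status 1 on
// any other exception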
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

    try {
        if (!common_params_parse_ex(argc, argv, ctx_arg)) {
            ctx_arg.params = params_org;
            return false;
        }
        if (ctx_arg.params.usage) {
            common_params_print_usage(ctx_arg);
            if (ctx_arg.print_usage) {
                ctx_arg.print_usage(argc, argv);
            }
            exit(0);
        }
        if (ctx_arg.params.completion) {
            common_params_print_completion(ctx_arg);
            exit(0);
        }
        params.lr.init();
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
        return false;
    } catch (std::exception & ex) {
        fprintf(stderr, "%s\n", ex.what());
        exit(1); // for other exceptions, we exit with status code 1
    }

    return true;
}

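// build a comma-separated list of the chat templates built into libllama;
// llama_chat_builtin_templates is called twice: first with a null output buffer to get the count,
// then again to fill the list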
static std::string list_builtin_chat_templates() {
    std::vector<const char *> supported_tmpl;
    int32_t res = llama_chat_builtin_templates(nullptr, 0);
    supported_tmpl.resize(res);
    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
    std::ostringstream msg;
    for (auto & tmpl : supported_tmpl) {
        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
    }
    return msg.str();
}

static bool is_truthy(const std::string & value) {
    return value == "on" || value == "enabled" || value == "1";
}

static bool is_falsey(const std::string & value) {
    return value == "off" || value == "disabled" || value == "0";
}

static bool is_autoy(const std::string & value) {
    return value == "auto" || value == "-1";
}

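// build the option list for the given example: loads the dynamic ggml backends, then registers
// every common_arg via add_opt(), which keeps only the options relevant to `ex`
// (options tagged LLAMA_EXAMPLE_COMMON are shared by all examples)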
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    // load dynamic backends
    ggml_backend_load_all();

    common_params_context ctx_arg(params);
    ctx_arg.print_usage = print_usage;
    ctx_arg.ex = ex;

    std::string sampler_type_chars;
    std::string sampler_type_names;
    for (const auto & sampler : params.sampling.samplers) {
        sampler_type_chars += common_sampler_type_to_chr(sampler);
        sampler_type_names += common_sampler_type_to_str(sampler) + ";";
    }
    sampler_type_names.pop_back();


    /**
     * filter options by example
     * rules:
     * - all examples inherit options from LLAMA_EXAMPLE_COMMON
     * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };


    add_opt(common_arg(
        {"-h", "--help", "--usage"},
        "print usage and exit",
        [](common_params & params) {
            params.usage = true;
        }
    ));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
        [](common_params &) {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        }
    ));
    add_opt(common_arg(
        {"--completion-bash"},
        "print source-able bash completion script for llama.cpp",
        [](common_params & params) {
            params.completion = true;
        }
    ));
    add_opt(common_arg(
        {"--verbose-prompt"},
        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
        [](common_params & params) {
            params.verbose_prompt = true;
        }
    ));
    add_opt(common_arg(
        {"--no-display-prompt"},
        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
        [](common_params & params) {
            params.display_prompt = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-co", "--color"},
        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
        [](common_params & params) {
            params.use_color = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-t", "--threads"}, "N",
        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
        [](common_params & params, int value) {
            params.cpuparams.n_threads = value;
            if (params.cpuparams.n_threads <= 0) {
                params.cpuparams.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_env("LLAMA_ARG_THREADS"));
    add_opt(common_arg(
        {"-tb", "--threads-batch"}, "N",
        "number of threads to use during batch and prompt processing (default: same as --threads)",
        [](common_params & params, int value) {
            params.cpuparams_batch.n_threads = value;
            if (params.cpuparams_batch.n_threads <= 0) {
                params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
            }
        }
    ));
    add_opt(common_arg(
        {"-C", "--cpu-mask"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
        [](common_params & params, const std::string & mask) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Cr", "--cpu-range"}, "lo-hi",
        "range of CPUs for affinity. Complements --cpu-mask",
        [](common_params & params, const std::string & range) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict"}, "<0|1>",
        string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
        [](common_params & params, const std::string & value) {
            params.cpuparams.strict_cpu = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"--prio"}, "N",
        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
        [](common_params & params, int prio) {
            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll"}, "<0...100>",
        string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
        [](common_params & params, const std::string & value) {
            params.cpuparams.poll = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"-Cb", "--cpu-mask-batch"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
        [](common_params & params, const std::string & mask) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Crb", "--cpu-range-batch"}, "lo-hi",
        "ranges of CPUs for affinity. Complements --cpu-mask-batch",
        [](common_params & params, const std::string & range) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict-batch"}, "<0|1>",
        "use strict CPU placement (default: same as --cpu-strict)",
        [](common_params & params, int value) {
            params.cpuparams_batch.strict_cpu = value;
        }
    ));
    add_opt(common_arg(
        {"--prio-batch"}, "N",
        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
        [](common_params & params, int prio) {
            if (prio < 0 || prio > 3) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll-batch"}, "<0|1>",
        "use polling to wait for work (default: same as --poll)",
        [](common_params & params, int value) {
            params.cpuparams_batch.poll = value;
        }
    ));
    add_opt(common_arg(
        {"-lcs", "--lookup-cache-static"}, "FNAME",
        "path to static lookup cache to use for lookup decoding (not updated by generation)",
        [](common_params & params, const std::string & value) {
            params.lookup_cache_static = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
        "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
        [](common_params & params, const std::string & value) {
            params.lookup_cache_dynamic = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-c", "--ctx-size"}, "N",
        string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
        [](common_params & params, int value) {
            params.n_ctx = value;
        }
    ).set_env("LLAMA_ARG_CTX_SIZE"));
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        string_format(
            ex == LLAMA_EXAMPLE_MAIN
                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                : "number of tokens to predict (default: %d, -1 = infinity)",
            params.n_predict),
        [](common_params & params, int value) {
            params.n_predict = value;
        }
    ).set_env("LLAMA_ARG_N_PREDICT"));
    add_opt(common_arg(
        {"-b", "--batch-size"}, "N",
        string_format("logical maximum batch size (default: %d)", params.n_batch),
        [](common_params & params, int value) {
            params.n_batch = value;
        }
    ).set_env("LLAMA_ARG_BATCH"));
    add_opt(common_arg(
        {"-ub", "--ubatch-size"}, "N",
        string_format("physical maximum batch size (default: %d)", params.n_ubatch),
        [](common_params & params, int value) {
            params.n_ubatch = value;
        }
    ).set_env("LLAMA_ARG_UBATCH"));
    add_opt(common_arg(
        {"--keep"}, "N",
        string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
        [](common_params & params, int value) {
            params.n_keep = value;
        }
    ));
    add_opt(common_arg(
        {"--swa-full"},
        string_format("use full-size SWA cache (default: %s)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
        [](common_params & params) {
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
        string_format("max number of context checkpoints to create per slot (default: %d)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cache-ram", "-cram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
        [](common_params & params, int value) {
            params.cache_ram_mib = value;
        }
    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
        [](common_params & params) {
            params.kv_unified = true;
        }
    ).set_env("LLAMA_ARG_KV_SPLIT"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
        [](common_params & params) {
            params.ctx_shift = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--context-shift"},
        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
        [](common_params & params) {
            params.ctx_shift = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
        [](common_params & params, int value) {
            params.n_chunks = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
| 985 | add_opt(common_arg({ "-fa" , "--flash-attn" }, "[on|off|auto]" , |
| 986 | string_format(fmt: "set Flash Attention use ('on', 'off', or 'auto', default: '%s')" , |
| 987 | llama_flash_attn_type_name(flash_attn_type: params.flash_attn_type)), |
| 988 | [](common_params & params, const std::string & value) { |
| 989 | if (is_truthy(value)) { |
| 990 | params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED; |
| 991 | } else if (is_falsey(value)) { |
| 992 | params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; |
| 993 | } else if (is_autoy(value)) { |
| 994 | params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; |
| 995 | } else { |
| 996 | throw std::runtime_error( |
| 997 | string_format(fmt: "error: unkown value for --flash-attn: '%s'\n" , value.c_str())); |
| 998 | } |
| 999 | }).set_env("LLAMA_ARG_FLASH_ATTN" )); |
    add_opt(common_arg(
        {"-p", "--prompt"}, "PROMPT",
        "prompt to start generation with; for system message, use -sys",
        [](common_params & params, const std::string & value) {
            params.prompt = value;
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-sys", "--system-prompt"}, "PROMPT",
        "system prompt to use with model (if applicable, depending on chat template)",
        [](common_params & params, const std::string & value) {
            params.system_prompt = value;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
        {"--no-perf"},
        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
        [](common_params & params) {
            params.no_perf = true;
            params.sampling.no_perf = true;
        }
    ).set_env("LLAMA_ARG_NO_PERF"));
    add_opt(common_arg(
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
        [](common_params & params, const std::string & value) {
            params.prompt = read_file(value);
            // store the external file name in params
            params.prompt_file = value;
            if (!params.prompt.empty() && params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-sysf", "--system-prompt-file"}, "FNAME",
        "a file containing the system prompt (default: none)",
        [](common_params & params, const std::string & value) {
            params.system_prompt = read_file(value);
            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
                params.system_prompt.pop_back();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
        {"--in-file"}, "FNAME",
        "an input file (repeat to specify multiple files)",
        [](common_params & params, const std::string & value) {
            std::ifstream file(value);
            if (!file) {
                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
            }
            params.in_files.push_back(value);
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"-bf", "--binary-file"}, "FNAME",
        "binary file containing the prompt (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream file(value, std::ios::binary);
            if (!file) {
                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
            }
            // store the external file name in params
            params.prompt_file = value;
            std::ostringstream ss;
            ss << file.rdbuf();
            params.prompt = ss.str();
            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-e", "--escape"},
        string_format("process escape sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
        [](common_params & params) {
            params.escape = true;
        }
    ));
    add_opt(common_arg(
        {"--no-escape"},
        "do not process escape sequences",
        [](common_params & params) {
            params.escape = false;
        }
    ));
    add_opt(common_arg(
        {"-ptc", "--print-token-count"}, "N",
        string_format("print token count every N tokens (default: %d)", params.n_print),
        [](common_params & params, int value) {
            params.n_print = value;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--prompt-cache"}, "FNAME",
        "file to cache prompt state for faster startup (default: none)",
        [](common_params & params, const std::string & value) {
            params.path_prompt_cache = value;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--prompt-cache-all"},
        "if specified, saves user input and generations to cache as well\n",
        [](common_params & params) {
            params.prompt_cache_all = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--prompt-cache-ro"},
        "if specified, uses the prompt cache but does not update it",
        [](common_params & params) {
            params.prompt_cache_ro = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-r", "--reverse-prompt"}, "PROMPT",
        "halt generation at PROMPT, return control in interactive mode\n",
        [](common_params & params, const std::string & value) {
            params.antiprompt.emplace_back(value);
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-sp", "--special"},
        string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
        [](common_params & params) {
            params.special = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-cnv", "--conversation"},
        "run in conversation mode:\n"
        "- does not print special tokens and suffix/prefix\n"
        "- interactive mode is also enabled\n"
        "(default: auto enabled if chat template is available)",
        [](common_params & params) {
            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-no-cnv", "--no-conversation"},
        "force disable conversation mode (default: false)",
        [](common_params & params) {
            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-st", "--single-turn"},
        "run conversation for a single turn only, then exit when done\n"
        "will not be interactive if first turn is predefined with --prompt\n"
        "(default: false)",
        [](common_params & params) {
            params.single_turn = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-i", "--interactive"},
        string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
        [](common_params & params) {
            params.interactive = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-if", "--interactive-first"},
        string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
        [](common_params & params) {
            params.interactive_first = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"-mli", "--multiline-input"},
        "allows you to write or paste multiple lines without ending each in '\\'",
        [](common_params & params) {
            params.multiline_input = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--in-prefix-bos"},
        "prefix BOS to user inputs, preceding the `--in-prefix` string",
        [](common_params & params) {
            params.input_prefix_bos = true;
            params.enable_chat_template = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--in-prefix"}, "STRING",
        "string to prefix user inputs with (default: empty)",
        [](common_params & params, const std::string & value) {
            params.input_prefix = value;
            params.enable_chat_template = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--in-suffix"}, "STRING",
        "string to suffix after user inputs with (default: empty)",
        [](common_params & params, const std::string & value) {
            params.input_suffix = value;
            params.enable_chat_template = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--no-warmup"},
        "skip warming up the model with an empty run",
        [](common_params & params) {
            params.warmup = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
| 1205 | add_opt(common_arg( |
| 1206 | {"--spm-infill" }, |
| 1207 | string_format( |
| 1208 | fmt: "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)" , |
| 1209 | params.spm_infill ? "enabled" : "disabled" |
| 1210 | ), |
| 1211 | [](common_params & params) { |
| 1212 | params.spm_infill = true; |
| 1213 | } |
| 1214 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 1215 | add_opt(common_arg( |
| 1216 | {"--samplers" }, "SAMPLERS" , |
| 1217 | string_format(fmt: "samplers that will be used for generation in the order, separated by \';\'\n(default: %s)" , sampler_type_names.c_str()), |
| 1218 | [](common_params & params, const std::string & value) { |
| 1219 | const auto sampler_names = string_split<std::string>(input: value, separator: ';'); |
| 1220 | params.sampling.samplers = common_sampler_types_from_names(names: sampler_names, allow_alt_names: true); |
| 1221 | } |
| 1222 | ).set_sparam()); |
| 1223 | add_opt(common_arg( |
| 1224 | {"-s" , "--seed" }, "SEED" , |
| 1225 | string_format(fmt: "RNG seed (default: %d, use random seed for %d)" , params.sampling.seed, LLAMA_DEFAULT_SEED), |
| 1226 | [](common_params & params, const std::string & value) { |
| 1227 | params.sampling.seed = std::stoul(str: value); |
| 1228 | } |
| 1229 | ).set_sparam()); |
| 1230 | add_opt(common_arg( |
| 1231 | {"--sampling-seq" , "--sampler-seq" }, "SEQUENCE" , |
| 1232 | string_format(fmt: "simplified sequence for samplers that will be used (default: %s)" , sampler_type_chars.c_str()), |
| 1233 | [](common_params & params, const std::string & value) { |
| 1234 | params.sampling.samplers = common_sampler_types_from_chars(chars: value); |
| 1235 | } |
| 1236 | ).set_sparam()); |
| 1237 | add_opt(common_arg( |
| 1238 | {"--ignore-eos" }, |
| 1239 | "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" , |
| 1240 | [](common_params & params) { |
| 1241 | params.sampling.ignore_eos = true; |
| 1242 | } |
| 1243 | ).set_sparam()); |
| 1244 | add_opt(common_arg( |
| 1245 | {"--temp" }, "N" , |
| 1246 | string_format(fmt: "temperature (default: %.1f)" , (double)params.sampling.temp), |
| 1247 | [](common_params & params, const std::string & value) { |
| 1248 | params.sampling.temp = std::stof(str: value); |
| 1249 | params.sampling.temp = std::max(a: params.sampling.temp, b: 0.0f); |
| 1250 | } |
| 1251 | ).set_sparam()); |
| 1252 | add_opt(common_arg( |
| 1253 | {"--top-k" }, "N" , |
| 1254 | string_format(fmt: "top-k sampling (default: %d, 0 = disabled)" , params.sampling.top_k), |
| 1255 | [](common_params & params, int value) { |
| 1256 | params.sampling.top_k = value; |
| 1257 | } |
| 1258 | ).set_sparam()); |
| 1259 | add_opt(common_arg( |
| 1260 | {"--top-p" }, "N" , |
| 1261 | string_format(fmt: "top-p sampling (default: %.1f, 1.0 = disabled)" , (double)params.sampling.top_p), |
| 1262 | [](common_params & params, const std::string & value) { |
| 1263 | params.sampling.top_p = std::stof(str: value); |
| 1264 | } |
| 1265 | ).set_sparam()); |
| 1266 | add_opt(common_arg( |
| 1267 | {"--min-p" }, "N" , |
| 1268 | string_format(fmt: "min-p sampling (default: %.1f, 0.0 = disabled)" , (double)params.sampling.min_p), |
| 1269 | [](common_params & params, const std::string & value) { |
| 1270 | params.sampling.min_p = std::stof(str: value); |
| 1271 | } |
| 1272 | ).set_sparam()); |
| 1273 | add_opt(common_arg( |
| 1274 | {"--top-nsigma" }, "N" , |
| 1275 | string_format(fmt: "top-n-sigma sampling (default: %.1f, -1.0 = disabled)" , params.sampling.top_n_sigma), |
| 1276 | [](common_params & params, const std::string & value) { |
| 1277 | params.sampling.top_n_sigma = std::stof(str: value); |
| 1278 | } |
| 1279 | ).set_sparam()); |
| 1280 | add_opt(common_arg( |
| 1281 | {"--xtc-probability" }, "N" , |
| 1282 | string_format(fmt: "xtc probability (default: %.1f, 0.0 = disabled)" , (double)params.sampling.xtc_probability), |
| 1283 | [](common_params & params, const std::string & value) { |
| 1284 | params.sampling.xtc_probability = std::stof(str: value); |
| 1285 | } |
| 1286 | ).set_sparam()); |
| 1287 | add_opt(common_arg( |
| 1288 | {"--xtc-threshold" }, "N" , |
| 1289 | string_format(fmt: "xtc threshold (default: %.1f, 1.0 = disabled)" , (double)params.sampling.xtc_threshold), |
| 1290 | [](common_params & params, const std::string & value) { |
| 1291 | params.sampling.xtc_threshold = std::stof(str: value); |
| 1292 | } |
| 1293 | ).set_sparam()); |
| 1294 | add_opt(common_arg( |
| 1295 | {"--typical" }, "N" , |
| 1296 | string_format(fmt: "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)" , (double)params.sampling.typ_p), |
| 1297 | [](common_params & params, const std::string & value) { |
| 1298 | params.sampling.typ_p = std::stof(str: value); |
| 1299 | } |
| 1300 | ).set_sparam()); |
| 1301 | add_opt(common_arg( |
| 1302 | {"--repeat-last-n" }, "N" , |
| 1303 | string_format(fmt: "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)" , params.sampling.penalty_last_n), |
| 1304 | [](common_params & params, int value) { |
| 1305 | if (value < -1) { |
| 1306 | throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
| 1307 | } |
| 1308 | params.sampling.penalty_last_n = value; |
| 1309 | params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
| 1310 | } |
| 1311 | ).set_sparam()); |
| 1312 | add_opt(common_arg( |
| 1313 | {"--repeat-penalty" }, "N" , |
| 1314 | string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
| 1315 | [](common_params & params, const std::string & value) { |
| 1316 | params.sampling.penalty_repeat = std::stof(value);
| 1317 | } |
| 1318 | ).set_sparam()); |
| 1319 | add_opt(common_arg( |
| 1320 | {"--presence-penalty" }, "N" , |
| 1321 | string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
| 1322 | [](common_params & params, const std::string & value) { |
| 1323 | params.sampling.penalty_present = std::stof(value);
| 1324 | } |
| 1325 | ).set_sparam()); |
| 1326 | add_opt(common_arg( |
| 1327 | {"--frequency-penalty" }, "N" , |
| 1328 | string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
| 1329 | [](common_params & params, const std::string & value) { |
| 1330 | params.sampling.penalty_freq = std::stof(value);
| 1331 | } |
| 1332 | ).set_sparam()); |
| 1333 | add_opt(common_arg( |
| 1334 | {"--dry-multiplier" }, "N" , |
| 1335 | string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
| 1336 | [](common_params & params, const std::string & value) { |
| 1337 | params.sampling.dry_multiplier = std::stof(value);
| 1338 | } |
| 1339 | ).set_sparam()); |
| 1340 | add_opt(common_arg( |
| 1341 | {"--dry-base" }, "N" , |
| 1342 | string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
| 1343 | [](common_params & params, const std::string & value) { |
| 1344 | float potential_base = std::stof(value);
| 1345 | if (potential_base >= 1.0f) |
| 1346 | { |
| 1347 | params.sampling.dry_base = potential_base; |
| 1348 | } |
| 1349 | } |
| 1350 | ).set_sparam()); |
| 1351 | add_opt(common_arg( |
| 1352 | {"--dry-allowed-length" }, "N" , |
| 1353 | string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
| 1354 | [](common_params & params, int value) { |
| 1355 | params.sampling.dry_allowed_length = value; |
| 1356 | } |
| 1357 | ).set_sparam()); |
| 1358 | add_opt(common_arg( |
| 1359 | {"--dry-penalty-last-n" }, "N" , |
| 1360 | string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
| 1361 | [](common_params & params, int value) { |
| 1362 | if (value < -1) { |
| 1363 | throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
| 1364 | } |
| 1365 | params.sampling.dry_penalty_last_n = value; |
| 1366 | } |
| 1367 | ).set_sparam()); |
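|      | // the --dry-sequence-breaker help text below previews the current default breakers via std::accumulate,
|      | // escaping "\n" for display; the handler clears the defaults the first time the flag is seen.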
| 1368 | add_opt(common_arg( |
| 1369 | {"--dry-sequence-breaker" }, "STRING" , |
| 1370 | string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
| 1371 | params.sampling.dry_sequence_breakers.empty() ? "none" :
| 1372 | std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
| 1373 | params.sampling.dry_sequence_breakers.end(),
| 1374 | std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
| 1375 | [](const std::string& a, const std::string& b) {
| 1376 | std::string formatted_b = (b == "\n" ) ? "\\n" : b; |
| 1377 | return a + ", '" + formatted_b + "'" ; |
| 1378 | }).c_str()), |
| 1379 | [](common_params & params, const std::string & value) { |
| 1380 | static bool defaults_cleared = false; |
| 1381 | |
| 1382 | if (!defaults_cleared) { |
| 1383 | params.sampling.dry_sequence_breakers.clear(); |
| 1384 | defaults_cleared = true; |
| 1385 | } |
| 1386 | |
| 1387 | if (value == "none" ) { |
| 1388 | params.sampling.dry_sequence_breakers.clear(); |
| 1389 | } else { |
| 1390 | params.sampling.dry_sequence_breakers.emplace_back(value);
| 1391 | } |
| 1392 | } |
| 1393 | ).set_sparam()); |
| 1394 | add_opt(common_arg( |
| 1395 | {"--dynatemp-range" }, "N" , |
| 1396 | string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
| 1397 | [](common_params & params, const std::string & value) { |
| 1398 | params.sampling.dynatemp_range = std::stof(value);
| 1399 | } |
| 1400 | ).set_sparam()); |
| 1401 | add_opt(common_arg( |
| 1402 | {"--dynatemp-exp" }, "N" , |
| 1403 | string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
| 1404 | [](common_params & params, const std::string & value) { |
| 1405 | params.sampling.dynatemp_exponent = std::stof(value);
| 1406 | } |
| 1407 | ).set_sparam()); |
| 1408 | add_opt(common_arg( |
| 1409 | {"--mirostat" }, "N" , |
| 1410 | string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
| 1411 | "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)" , params.sampling.mirostat), |
| 1412 | [](common_params & params, int value) { |
| 1413 | params.sampling.mirostat = value; |
| 1414 | } |
| 1415 | ).set_sparam()); |
| 1416 | add_opt(common_arg( |
| 1417 | {"--mirostat-lr" }, "N" , |
| 1418 | string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
| 1419 | [](common_params & params, const std::string & value) { |
| 1420 | params.sampling.mirostat_eta = std::stof(value);
| 1421 | } |
| 1422 | ).set_sparam()); |
| 1423 | add_opt(common_arg( |
| 1424 | {"--mirostat-ent" }, "N" , |
| 1425 | string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
| 1426 | [](common_params & params, const std::string & value) { |
| 1427 | params.sampling.mirostat_tau = std::stof(value);
| 1428 | } |
| 1429 | ).set_sparam()); |
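|      | // --logit-bias parses "TOKEN_ID+BIAS" or "TOKEN_ID-BIAS"; any other input throws std::invalid_argument.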
| 1430 | add_opt(common_arg( |
| 1431 | {"-l" , "--logit-bias" }, "TOKEN_ID(+/-)BIAS" , |
| 1432 | "modifies the likelihood of token appearing in the completion,\n" |
| 1433 | "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" |
| 1434 | "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" , |
| 1435 | [](common_params & params, const std::string & value) { |
| 1436 | std::stringstream ss(value); |
| 1437 | llama_token key; |
| 1438 | char sign; |
| 1439 | std::string value_str; |
| 1440 | try { |
| 1441 | if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
| 1442 | const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
| 1443 | params.sampling.logit_bias.push_back({key, bias});
| 1444 | } else { |
| 1445 | throw std::invalid_argument("invalid input format" ); |
| 1446 | } |
| 1447 | } catch (const std::exception&) { |
| 1448 | throw std::invalid_argument("invalid input format" ); |
| 1449 | } |
| 1450 | } |
| 1451 | ).set_sparam()); |
| 1452 | add_opt(common_arg( |
| 1453 | {"--grammar" }, "GRAMMAR" , |
| 1454 | string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
| 1455 | [](common_params & params, const std::string & value) { |
| 1456 | params.sampling.grammar = value; |
| 1457 | } |
| 1458 | ).set_sparam()); |
| 1459 | add_opt(common_arg( |
| 1460 | {"--grammar-file" }, "FNAME" , |
| 1461 | "file to read grammar from" , |
| 1462 | [](common_params & params, const std::string & value) { |
| 1463 | params.sampling.grammar = read_file(value);
| 1464 | } |
| 1465 | ).set_sparam()); |
| 1466 | add_opt(common_arg( |
| 1467 | {"-j" , "--json-schema" }, "SCHEMA" , |
| 1468 | "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" , |
| 1469 | [](common_params & params, const std::string & value) { |
| 1470 | params.sampling.grammar = json_schema_to_grammar(json::parse(value));
| 1471 | } |
| 1472 | ).set_sparam()); |
| 1473 | add_opt(common_arg( |
| 1474 | {"-jf" , "--json-schema-file" }, "FILE" , |
| 1475 | "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" , |
| 1476 | [](common_params & params, const std::string & value) { |
| 1477 | std::ifstream file(value); |
| 1478 | if (!file) { |
| 1479 | throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
| 1480 | } |
| 1481 | std::string schema; |
| 1482 | std::copy( |
| 1483 | std::istreambuf_iterator<char>(file),
| 1484 | std::istreambuf_iterator<char>(),
| 1485 | std::back_inserter(schema)
| 1486 | );
| 1487 | params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
| 1488 | } |
| 1489 | ).set_sparam()); |
| 1490 | add_opt(common_arg( |
| 1491 | {"--pooling" }, "{none,mean,cls,last,rank}" , |
| 1492 | "pooling type for embeddings, use model default if unspecified" , |
| 1493 | [](common_params & params, const std::string & value) { |
| 1494 | /**/ if (value == "none" ) { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } |
| 1495 | else if (value == "mean" ) { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } |
| 1496 | else if (value == "cls" ) { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } |
| 1497 | else if (value == "last" ) { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } |
| 1498 | else if (value == "rank" ) { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } |
| 1499 | else { throw std::invalid_argument("invalid value" ); } |
| 1500 | } |
| 1501 | ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING" )); |
| 1502 | add_opt(common_arg( |
| 1503 | {"--attention" }, "{causal,non-causal}" , |
| 1504 | "attention type for embeddings, use model default if unspecified" , |
| 1505 | [](common_params & params, const std::string & value) { |
| 1506 | /**/ if (value == "causal" ) { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } |
| 1507 | else if (value == "non-causal" ) { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } |
| 1508 | else { throw std::invalid_argument("invalid value" ); } |
| 1509 | } |
| 1510 | ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); |
| 1511 | add_opt(common_arg( |
| 1512 | {"--rope-scaling" }, "{none,linear,yarn}" , |
| 1513 | "RoPE frequency scaling method, defaults to linear unless specified by the model" , |
| 1514 | [](common_params & params, const std::string & value) { |
| 1515 | /**/ if (value == "none" ) { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } |
| 1516 | else if (value == "linear" ) { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } |
| 1517 | else if (value == "yarn" ) { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } |
| 1518 | else { throw std::invalid_argument("invalid value" ); } |
| 1519 | } |
| 1520 | ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE" )); |
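|      | // note: --rope-scale stores the reciprocal (rope_freq_scale = 1/N), while --rope-freq-scale below sets the factor directly.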
| 1521 | add_opt(common_arg( |
| 1522 | {"--rope-scale" }, "N" , |
| 1523 | "RoPE context scaling factor, expands context by a factor of N" , |
| 1524 | [](common_params & params, const std::string & value) { |
| 1525 | params.rope_freq_scale = 1.0f / std::stof(value);
| 1526 | } |
| 1527 | ).set_env("LLAMA_ARG_ROPE_SCALE" )); |
| 1528 | add_opt(common_arg( |
| 1529 | {"--rope-freq-base" }, "N" , |
| 1530 | "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" , |
| 1531 | [](common_params & params, const std::string & value) { |
| 1532 | params.rope_freq_base = std::stof(value);
| 1533 | } |
| 1534 | ).set_env("LLAMA_ARG_ROPE_FREQ_BASE" )); |
| 1535 | add_opt(common_arg( |
| 1536 | {"--rope-freq-scale" }, "N" , |
| 1537 | "RoPE frequency scaling factor, expands context by a factor of 1/N" , |
| 1538 | [](common_params & params, const std::string & value) { |
| 1539 | params.rope_freq_scale = std::stof(value);
| 1540 | } |
| 1541 | ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE" )); |
| 1542 | add_opt(common_arg( |
| 1543 | {"--yarn-orig-ctx" }, "N" , |
| 1544 | string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
| 1545 | [](common_params & params, int value) { |
| 1546 | params.yarn_orig_ctx = value; |
| 1547 | } |
| 1548 | ).set_env("LLAMA_ARG_YARN_ORIG_CTX" )); |
| 1549 | add_opt(common_arg( |
| 1550 | {"--yarn-ext-factor" }, "N" , |
| 1551 | string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
| 1552 | [](common_params & params, const std::string & value) { |
| 1553 | params.yarn_ext_factor = std::stof(value);
| 1554 | } |
| 1555 | ).set_env("LLAMA_ARG_YARN_EXT_FACTOR" )); |
| 1556 | add_opt(common_arg( |
| 1557 | {"--yarn-attn-factor" }, "N" , |
| 1558 | string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
| 1559 | [](common_params & params, const std::string & value) { |
| 1560 | params.yarn_attn_factor = std::stof(value);
| 1561 | } |
| 1562 | ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR" )); |
| 1563 | add_opt(common_arg( |
| 1564 | {"--yarn-beta-slow" }, "N" , |
| 1565 | string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
| 1566 | [](common_params & params, const std::string & value) { |
| 1567 | params.yarn_beta_slow = std::stof(value);
| 1568 | } |
| 1569 | ).set_env("LLAMA_ARG_YARN_BETA_SLOW" )); |
| 1570 | add_opt(common_arg( |
| 1571 | {"--yarn-beta-fast" }, "N" , |
| 1572 | string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
| 1573 | [](common_params & params, const std::string & value) { |
| 1574 | params.yarn_beta_fast = std::stof(value);
| 1575 | } |
| 1576 | ).set_env("LLAMA_ARG_YARN_BETA_FAST" )); |
| 1577 | add_opt(common_arg( |
| 1578 | {"-gan" , "--grp-attn-n" }, "N" , |
| 1579 | string_format("group-attention factor (default: %d)", params.grp_attn_n),
| 1580 | [](common_params & params, int value) { |
| 1581 | params.grp_attn_n = value; |
| 1582 | } |
| 1583 | ).set_env("LLAMA_ARG_GRP_ATTN_N" ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); |
| 1584 | add_opt(common_arg( |
| 1585 | {"-gaw" , "--grp-attn-w" }, "N" , |
| 1586 | string_format("group-attention width (default: %d)", params.grp_attn_w),
| 1587 | [](common_params & params, int value) { |
| 1588 | params.grp_attn_w = value; |
| 1589 | } |
| 1590 | ).set_env("LLAMA_ARG_GRP_ATTN_W" ).set_examples({LLAMA_EXAMPLE_MAIN})); |
| 1591 | add_opt(common_arg( |
| 1592 | {"-nkvo" , "--no-kv-offload" }, |
| 1593 | "disable KV offload" , |
| 1594 | [](common_params & params) { |
| 1595 | params.no_kv_offload = true; |
| 1596 | } |
| 1597 | ).set_env("LLAMA_ARG_NO_KV_OFFLOAD" )); |
| 1598 | add_opt(common_arg( |
| 1599 | {"-nr" , "--no-repack" }, |
| 1600 | "disable weight repacking" , |
| 1601 | [](common_params & params) { |
| 1602 | params.no_extra_bufts = true; |
| 1603 | } |
| 1604 | ).set_env("LLAMA_ARG_NO_REPACK" )); |
| 1605 | add_opt(common_arg( |
| 1606 | {"--no-host" }, |
| 1607 | "bypass host buffer allowing extra buffers to be used" , |
| 1608 | [](common_params & params) { |
| 1609 | params.no_host = true; |
| 1610 | } |
| 1611 | ).set_env("LLAMA_ARG_NO_HOST" )); |
| 1612 | add_opt(common_arg( |
| 1613 | {"-ctk" , "--cache-type-k" }, "TYPE" , |
| 1614 | string_format( |
| 1615 | fmt: "KV cache data type for K\n" |
| 1616 | "allowed values: %s\n" |
| 1617 | "(default: %s)" , |
| 1618 | get_all_kv_cache_types().c_str(), |
| 1619 | ggml_type_name(params.cache_type_k)
| 1620 | ), |
| 1621 | [](common_params & params, const std::string & value) { |
| 1622 | params.cache_type_k = kv_cache_type_from_str(value);
| 1623 | } |
| 1624 | ).set_env("LLAMA_ARG_CACHE_TYPE_K" )); |
| 1625 | add_opt(common_arg( |
| 1626 | {"-ctv" , "--cache-type-v" }, "TYPE" , |
| 1627 | string_format( |
| 1628 | fmt: "KV cache data type for V\n" |
| 1629 | "allowed values: %s\n" |
| 1630 | "(default: %s)" , |
| 1631 | get_all_kv_cache_types().c_str(), |
| 1632 | ggml_type_name(params.cache_type_v)
| 1633 | ), |
| 1634 | [](common_params & params, const std::string & value) { |
| 1635 | params.cache_type_v = kv_cache_type_from_str(value);
| 1636 | } |
| 1637 | ).set_env("LLAMA_ARG_CACHE_TYPE_V" )); |
| 1638 | add_opt(common_arg( |
| 1639 | {"--hellaswag" }, |
| 1640 | "compute HellaSwag score over random tasks from datafile supplied with -f" , |
| 1641 | [](common_params & params) { |
| 1642 | params.hellaswag = true; |
| 1643 | } |
| 1644 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1645 | add_opt(common_arg( |
| 1646 | {"--hellaswag-tasks" }, "N" , |
| 1647 | string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
| 1648 | [](common_params & params, int value) { |
| 1649 | params.hellaswag_tasks = value; |
| 1650 | } |
| 1651 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1652 | add_opt(common_arg( |
| 1653 | {"--winogrande" }, |
| 1654 | "compute Winogrande score over random tasks from datafile supplied with -f" , |
| 1655 | [](common_params & params) { |
| 1656 | params.winogrande = true; |
| 1657 | } |
| 1658 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1659 | add_opt(common_arg( |
| 1660 | {"--winogrande-tasks" }, "N" , |
| 1661 | string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
| 1662 | [](common_params & params, int value) { |
| 1663 | params.winogrande_tasks = value; |
| 1664 | } |
| 1665 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1666 | add_opt(common_arg( |
| 1667 | {"--multiple-choice" }, |
| 1668 | "compute multiple choice score over random tasks from datafile supplied with -f" , |
| 1669 | [](common_params & params) { |
| 1670 | params.multiple_choice = true; |
| 1671 | } |
| 1672 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1673 | add_opt(common_arg( |
| 1674 | {"--multiple-choice-tasks" }, "N" , |
| 1675 | string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
| 1676 | [](common_params & params, int value) { |
| 1677 | params.multiple_choice_tasks = value; |
| 1678 | } |
| 1679 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1680 | add_opt(common_arg( |
| 1681 | {"--kl-divergence" }, |
| 1682 | "computes KL-divergence to logits provided via --kl-divergence-base" , |
| 1683 | [](common_params & params) { |
| 1684 | params.kl_divergence = true; |
| 1685 | } |
| 1686 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1687 | add_opt(common_arg( |
| 1688 | {"--save-all-logits" , "--kl-divergence-base" }, "FNAME" , |
| 1689 | "set logits file" , |
| 1690 | [](common_params & params, const std::string & value) { |
| 1691 | params.logits_file = value; |
| 1692 | } |
| 1693 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1694 | add_opt(common_arg( |
| 1695 | {"--ppl-stride" }, "N" , |
| 1696 | string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
| 1697 | [](common_params & params, int value) { |
| 1698 | params.ppl_stride = value; |
| 1699 | } |
| 1700 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1701 | add_opt(common_arg( |
| 1702 | {"--ppl-output-type" }, "<0|1>" , |
| 1703 | string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
| 1704 | [](common_params & params, int value) { |
| 1705 | params.ppl_output_type = value; |
| 1706 | } |
| 1707 | ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |
| 1708 | add_opt(common_arg( |
| 1709 | {"-dt" , "--defrag-thold" }, "N" , |
| 1710 | string_format("KV cache defragmentation threshold (DEPRECATED)"),
| 1711 | [](common_params & params, const std::string & value) { |
| 1712 | GGML_UNUSED(params); |
| 1713 | GGML_UNUSED(value); |
| 1714 | LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n" ); |
| 1715 | } |
| 1716 | ).set_env("LLAMA_ARG_DEFRAG_THOLD" )); |
| 1717 | add_opt(common_arg( |
| 1718 | {"-np" , "--parallel" }, "N" , |
| 1719 | string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
| 1720 | [](common_params & params, int value) { |
| 1721 | params.n_parallel = value; |
| 1722 | } |
| 1723 | ).set_env("LLAMA_ARG_N_PARALLEL" )); |
| 1724 | add_opt(common_arg( |
| 1725 | {"-ns" , "--sequences" }, "N" , |
| 1726 | string_format("number of sequences to decode (default: %d)", params.n_sequences),
| 1727 | [](common_params & params, int value) { |
| 1728 | params.n_sequences = value; |
| 1729 | } |
| 1730 | ).set_examples({LLAMA_EXAMPLE_PARALLEL})); |
| 1731 | add_opt(common_arg( |
| 1732 | {"-cb" , "--cont-batching" }, |
| 1733 | string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
| 1734 | [](common_params & params) { |
| 1735 | params.cont_batching = true; |
| 1736 | } |
| 1737 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING" )); |
| 1738 | add_opt(common_arg( |
| 1739 | {"-nocb" , "--no-cont-batching" }, |
| 1740 | "disable continuous batching" , |
| 1741 | [](common_params & params) { |
| 1742 | params.cont_batching = false; |
| 1743 | } |
| 1744 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING" )); |
| 1745 | add_opt(common_arg( |
| 1746 | {"--mmproj" }, "FILE" , |
| 1747 | "path to a multimodal projector file. see tools/mtmd/README.md\n" |
| 1748 | "note: if -hf is used, this argument can be omitted" , |
| 1749 | [](common_params & params, const std::string & value) { |
| 1750 | params.mmproj.path = value; |
| 1751 | } |
| 1752 | ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ" )); |
| 1753 | add_opt(common_arg( |
| 1754 | {"--mmproj-url" }, "URL" , |
| 1755 | "URL to a multimodal projector file. see tools/mtmd/README.md" , |
| 1756 | [](common_params & params, const std::string & value) { |
| 1757 | params.mmproj.url = value; |
| 1758 | } |
| 1759 | ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL" )); |
| 1760 | add_opt(common_arg( |
| 1761 | {"--no-mmproj" }, |
| 1762 | "explicitly disable multimodal projector, useful when using -hf" , |
| 1763 | [](common_params & params) { |
| 1764 | params.no_mmproj = true; |
| 1765 | } |
| 1766 | ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ" )); |
| 1767 | add_opt(common_arg( |
| 1768 | {"--no-mmproj-offload" }, |
| 1769 | "do not offload multimodal projector to GPU" , |
| 1770 | [](common_params & params) { |
| 1771 | params.mmproj_use_gpu = false; |
| 1772 | } |
| 1773 | ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD" )); |
| 1774 | add_opt(common_arg( |
| 1775 | {"--image" , "--audio" }, "FILE" , |
| 1776 | "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n" , |
| 1777 | [](common_params & params, const std::string & value) { |
| 1778 | params.image.emplace_back(value);
| 1779 | } |
| 1780 | ).set_examples({LLAMA_EXAMPLE_MTMD})); |
| 1781 | add_opt(common_arg( |
| 1782 | {"--image-min-tokens" }, "N" , |
| 1783 | "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)" , |
| 1784 | [](common_params & params, int value) { |
| 1785 | params.image_min_tokens = value; |
| 1786 | } |
| 1787 | ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS" )); |
| 1788 | add_opt(common_arg( |
| 1789 | {"--image-max-tokens" }, "N" , |
| 1790 | "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)" , |
| 1791 | [](common_params & params, int value) { |
| 1792 | params.image_max_tokens = value; |
| 1793 | } |
| 1794 | ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS" )); |
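|      | // the --rpc option is only registered when llama_supports_rpc() reports that RPC support was compiled in.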
| 1795 | if (llama_supports_rpc()) { |
| 1796 | add_opt(common_arg( |
| 1797 | {"--rpc" }, "SERVERS" , |
| 1798 | "comma separated list of RPC servers" , |
| 1799 | [](common_params & params, const std::string & value) { |
| 1800 | add_rpc_devices(value);
| 1801 | GGML_UNUSED(params); |
| 1802 | } |
| 1803 | ).set_env("LLAMA_ARG_RPC" )); |
| 1804 | } |
| 1805 | add_opt(common_arg( |
| 1806 | {"--mlock" }, |
| 1807 | "force system to keep model in RAM rather than swapping or compressing" , |
| 1808 | [](common_params & params) { |
| 1809 | params.use_mlock = true; |
| 1810 | } |
| 1811 | ).set_env("LLAMA_ARG_MLOCK" )); |
| 1812 | add_opt(common_arg( |
| 1813 | {"--no-mmap" }, |
| 1814 | "do not memory-map model (slower load but may reduce pageouts if not using mlock)" , |
| 1815 | [](common_params & params) { |
| 1816 | params.use_mmap = false; |
| 1817 | } |
| 1818 | ).set_env("LLAMA_ARG_NO_MMAP" )); |
| 1819 | add_opt(common_arg( |
| 1820 | {"--numa" }, "TYPE" , |
| 1821 | "attempt optimizations that help on some NUMA systems\n" |
| 1822 | "- distribute: spread execution evenly over all nodes\n" |
| 1823 | "- isolate: only spawn threads on CPUs on the node that execution started on\n" |
| 1824 | "- numactl: use the CPU map provided by numactl\n" |
| 1825 | "if run without this previously, it is recommended to drop the system page cache before using this\n" |
| 1826 | "see https://github.com/ggml-org/llama.cpp/issues/1437" , |
| 1827 | [](common_params & params, const std::string & value) { |
| 1828 | /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } |
| 1829 | else if (value == "isolate" ) { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } |
| 1830 | else if (value == "numactl" ) { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } |
| 1831 | else { throw std::invalid_argument("invalid value" ); } |
| 1832 | } |
| 1833 | ).set_env("LLAMA_ARG_NUMA" )); |
| 1834 | add_opt(common_arg( |
| 1835 | {"-dev" , "--device" }, "<dev1,dev2,..>" , |
| 1836 | "comma-separated list of devices to use for offloading (none = don't offload)\n" |
| 1837 | "use --list-devices to see a list of available devices" , |
| 1838 | [](common_params & params, const std::string & value) { |
| 1839 | params.devices = parse_device_list(value); |
| 1840 | } |
| 1841 | ).set_env("LLAMA_ARG_DEVICE" )); |
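|      | // --list-devices enumerates all non-CPU backend devices, prints their total/free memory, and exits.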
| 1842 | add_opt(common_arg( |
| 1843 | {"--list-devices" }, |
| 1844 | "print list of available devices and exit" , |
| 1845 | [](common_params &) { |
| 1846 | std::vector<ggml_backend_dev_t> devices; |
| 1847 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
| 1848 | auto * dev = ggml_backend_dev_get(i);
| 1849 | if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
| 1850 | devices.push_back(dev);
| 1851 | } |
| 1852 | } |
| 1853 | printf("Available devices:\n");
| 1854 | for (auto * dev : devices) {
| 1855 | size_t free, total;
| 1856 | ggml_backend_dev_memory(dev, &free, &total);
| 1857 | printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
| 1858 | }
| 1859 | exit(0);
| 1860 | } |
| 1861 | )); |
| 1862 | add_opt(common_arg( |
| 1863 | {"--override-tensor" , "-ot" }, "<tensor name pattern>=<buffer type>,..." , |
| 1864 | "override tensor buffer type" , [](common_params & params, const std::string & value) { |
| 1865 | parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
| 1866 | } |
| 1867 | )); |
| 1868 | add_opt(common_arg( |
| 1869 | {"--override-tensor-draft" , "-otd" }, "<tensor name pattern>=<buffer type>,..." , |
| 1870 | "override tensor buffer type for draft model" , [](common_params & params, const std::string & value) { |
| 1871 | parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
| 1872 | } |
| 1873 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); |
| 1874 | add_opt(common_arg( |
| 1875 | {"--cpu-moe" , "-cmoe" }, |
| 1876 | "keep all Mixture of Experts (MoE) weights in the CPU" , |
| 1877 | [](common_params & params) { |
| 1878 | params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
| 1879 | } |
| 1880 | ).set_env("LLAMA_ARG_CPU_MOE" )); |
| 1881 | add_opt(common_arg( |
| 1882 | {"--n-cpu-moe" , "-ncmoe" }, "N" , |
| 1883 | "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU" , |
| 1884 | [](common_params & params, int value) { |
| 1885 | if (value < 0) { |
| 1886 | throw std::invalid_argument("invalid value" ); |
| 1887 | } |
| 1888 | for (int i = 0; i < value; ++i) { |
| 1889 | // keep strings alive and avoid leaking memory by storing them in a static vector |
| 1890 | static std::list<std::string> buft_overrides; |
| 1891 | buft_overrides.push_back(llm_ffn_exps_block_regex(i));
| 1892 | params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
| 1893 | } |
| 1894 | } |
| 1895 | ).set_env("LLAMA_ARG_N_CPU_MOE" )); |
| 1896 | add_opt(common_arg( |
| 1897 | {"--cpu-moe-draft" , "-cmoed" }, |
| 1898 | "keep all Mixture of Experts (MoE) weights in the CPU for the draft model" , |
| 1899 | [](common_params & params) { |
| 1900 | params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
| 1901 | } |
| 1902 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT" )); |
| 1903 | add_opt(common_arg( |
| 1904 | {"--n-cpu-moe-draft" , "-ncmoed" }, "N" , |
| 1905 | "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model" , |
| 1906 | [](common_params & params, int value) { |
| 1907 | if (value < 0) { |
| 1908 | throw std::invalid_argument("invalid value" ); |
| 1909 | } |
| 1910 | for (int i = 0; i < value; ++i) { |
| 1911 | static std::list<std::string> buft_overrides_draft; |
| 1912 | buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
| 1913 | params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
| 1914 | } |
| 1915 | } |
| 1916 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT" )); |
| 1917 | add_opt(common_arg( |
| 1918 | {"-ngl" , "--gpu-layers" , "--n-gpu-layers" }, "N" , |
| 1919 | string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
| 1920 | [](common_params & params, int value) { |
| 1921 | params.n_gpu_layers = value; |
| 1922 | if (!llama_supports_gpu_offload()) { |
| 1923 | fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
| 1924 | fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
| 1925 | fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
| 1926 | } |
| 1927 | } |
| 1928 | ).set_env("LLAMA_ARG_N_GPU_LAYERS" )); |
| 1929 | add_opt(common_arg( |
| 1930 | {"-sm" , "--split-mode" }, "{none,layer,row}" , |
| 1931 | "how to split the model across multiple GPUs, one of:\n" |
| 1932 | "- none: use one GPU only\n" |
| 1933 | "- layer (default): split layers and KV across GPUs\n" |
| 1934 | "- row: split rows across GPUs" , |
| 1935 | [](common_params & params, const std::string & value) { |
| 1936 | std::string arg_next = value; |
| 1937 | if (arg_next == "none" ) { |
| 1938 | params.split_mode = LLAMA_SPLIT_MODE_NONE; |
| 1939 | } else if (arg_next == "layer" ) { |
| 1940 | params.split_mode = LLAMA_SPLIT_MODE_LAYER; |
| 1941 | } else if (arg_next == "row" ) { |
| 1942 | params.split_mode = LLAMA_SPLIT_MODE_ROW; |
| 1943 | } else { |
| 1944 | throw std::invalid_argument("invalid value" ); |
| 1945 | } |
| 1946 | if (!llama_supports_gpu_offload()) { |
| 1947 | fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
| 1948 | } |
| 1949 | } |
| 1950 | ).set_env("LLAMA_ARG_SPLIT_MODE" )); |
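|      | // --tensor-split accepts proportions separated by ',' or '/'; devices beyond the supplied list get a weight of 0.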
| 1951 | add_opt(common_arg( |
| 1952 | {"-ts" , "--tensor-split" }, "N0,N1,N2,..." , |
| 1953 | "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" , |
| 1954 | [](common_params & params, const std::string & value) { |
| 1955 | std::string arg_next = value; |
| 1956 | |
| 1957 | // split string by , and / |
| 1958 | const std::regex regex{ R"([,/]+)" }; |
| 1959 | std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; |
| 1960 | std::vector<std::string> split_arg{ it, {} }; |
| 1961 | if (split_arg.size() >= llama_max_devices()) { |
| 1962 | throw std::invalid_argument( |
| 1963 | string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
| 1964 | ); |
| 1965 | } |
| 1966 | for (size_t i = 0; i < llama_max_devices(); ++i) { |
| 1967 | if (i < split_arg.size()) { |
| 1968 | params.tensor_split[i] = std::stof(split_arg[i]);
| 1969 | } else { |
| 1970 | params.tensor_split[i] = 0.0f; |
| 1971 | } |
| 1972 | } |
| 1973 | if (!llama_supports_gpu_offload()) { |
| 1974 | fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
| 1975 | } |
| 1976 | } |
| 1977 | ).set_env("LLAMA_ARG_TENSOR_SPLIT" )); |
| 1978 | add_opt(common_arg( |
| 1979 | {"-mg" , "--main-gpu" }, "INDEX" , |
| 1980 | string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
| 1981 | [](common_params & params, int value) { |
| 1982 | params.main_gpu = value; |
| 1983 | if (!llama_supports_gpu_offload()) { |
| 1984 | fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
| 1985 | } |
| 1986 | } |
| 1987 | ).set_env("LLAMA_ARG_MAIN_GPU" )); |
| 1988 | add_opt(common_arg( |
| 1989 | {"--check-tensors" }, |
| 1990 | string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
| 1991 | [](common_params & params) { |
| 1992 | params.check_tensors = true; |
| 1993 | } |
| 1994 | )); |
| 1995 | add_opt(common_arg( |
| 1996 | {"--override-kv" }, "KEY=TYPE:VALUE" , |
| 1997 | "advanced option to override model metadata by key. may be specified multiple times.\n" |
| 1998 | "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" , |
| 1999 | [](common_params & params, const std::string & value) { |
| 2000 | if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
| 2001 | throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
| 2002 | } |
| 2003 | } |
| 2004 | )); |
| 2005 | add_opt(common_arg( |
| 2006 | {"--no-op-offload" }, |
| 2007 | string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
| 2008 | [](common_params & params) { |
| 2009 | params.no_op_offload = true; |
| 2010 | } |
| 2011 | )); |
| 2012 | add_opt(common_arg( |
| 2013 | {"--lora" }, "FNAME" , |
| 2014 | "path to LoRA adapter (can be repeated to use multiple adapters)" , |
| 2015 | [](common_params & params, const std::string & value) { |
| 2016 | params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
| 2017 | } |
| 2018 | // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg |
| 2019 | ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); |
| 2020 | add_opt(common_arg( |
| 2021 | {"--lora-scaled" }, "FNAME" , "SCALE" , |
| 2022 | "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)" , |
| 2023 | [](common_params & params, const std::string & fname, const std::string & scale) { |
| 2024 | params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
| 2025 | } |
| 2026 | // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg |
| 2027 | ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); |
| 2028 | add_opt(common_arg( |
| 2029 | {"--control-vector" }, "FNAME" , |
| 2030 | "add a control vector\nnote: this argument can be repeated to add multiple control vectors" , |
| 2031 | [](common_params & params, const std::string & value) { |
| 2032 | params.control_vectors.push_back({ 1.0f, value, });
| 2033 | } |
| 2034 | )); |
| 2035 | add_opt(common_arg( |
| 2036 | {"--control-vector-scaled" }, "FNAME" , "SCALE" , |
| 2037 | "add a control vector with user defined scaling SCALE\n" |
| 2038 | "note: this argument can be repeated to add multiple scaled control vectors" , |
| 2039 | [](common_params & params, const std::string & fname, const std::string & scale) { |
| 2040 | params.control_vectors.push_back({ std::stof(scale), fname });
| 2041 | } |
| 2042 | )); |
| 2043 | add_opt(common_arg( |
| 2044 | {"--control-vector-layer-range" }, "START" , "END" , |
| 2045 | "layer range to apply the control vector(s) to, start and end inclusive" , |
| 2046 | [](common_params & params, const std::string & start, const std::string & end) { |
| 2047 | params.control_vector_layer_start = std::stoi(start);
| 2048 | params.control_vector_layer_end = std::stoi(end);
| 2049 | } |
| 2050 | )); |
| 2051 | add_opt(common_arg( |
| 2052 | {"-a" , "--alias" }, "STRING" , |
| 2053 | "set alias for model name (to be used by REST API)" , |
| 2054 | [](common_params & params, const std::string & value) { |
| 2055 | params.model_alias = value; |
| 2056 | } |
| 2057 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS" )); |
| 2058 | add_opt(common_arg( |
| 2059 | {"-m" , "--model" }, "FNAME" , |
| 2060 | ex == LLAMA_EXAMPLE_EXPORT_LORA |
| 2061 | ? std::string("model path from which to load base model" ) |
| 2062 | : string_format( |
| 2063 | fmt: "model path (default: `models/$filename` with filename from `--hf-file` " |
| 2064 | "or `--model-url` if set, otherwise %s)" , DEFAULT_MODEL_PATH |
| 2065 | ), |
| 2066 | [](common_params & params, const std::string & value) { |
| 2067 | params.model.path = value; |
| 2068 | } |
| 2069 | ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL" )); |
| 2070 | add_opt(common_arg( |
| 2071 | {"-mu" , "--model-url" }, "MODEL_URL" , |
| 2072 | "model download url (default: unused)" , |
| 2073 | [](common_params & params, const std::string & value) { |
| 2074 | params.model.url = value; |
| 2075 | } |
| 2076 | ).set_env("LLAMA_ARG_MODEL_URL" )); |
| 2077 | add_opt(common_arg( |
| 2078 | { "-dr" , "--docker-repo" }, "[<repo>/]<model>[:quant]" , |
| 2079 | "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n" |
| 2080 | "example: gemma3\n" |
| 2081 | "(default: unused)" , |
| 2082 | [](common_params & params, const std::string & value) { |
| 2083 | params.model.docker_repo = value; |
| 2084 | } |
| 2085 | ).set_env("LLAMA_ARG_DOCKER_REPO" )); |
| 2086 | add_opt(common_arg( |
| 2087 | {"-hf" , "-hfr" , "--hf-repo" }, "<user>/<model>[:quant]" , |
| 2088 | "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" |
| 2089 | "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" |
| 2090 | "example: unsloth/phi-4-GGUF:q4_k_m\n" |
| 2091 | "(default: unused)" , |
| 2092 | [](common_params & params, const std::string & value) { |
| 2093 | params.model.hf_repo = value; |
| 2094 | } |
| 2095 | ).set_env("LLAMA_ARG_HF_REPO" )); |
| 2096 | add_opt(common_arg( |
| 2097 | {"-hfd" , "-hfrd" , "--hf-repo-draft" }, "<user>/<model>[:quant]" , |
| 2098 | "Same as --hf-repo, but for the draft model (default: unused)" , |
| 2099 | [](common_params & params, const std::string & value) { |
| 2100 | params.speculative.model.hf_repo = value; |
| 2101 | } |
| 2102 | ).set_env("LLAMA_ARG_HFD_REPO" )); |
| 2103 | add_opt(common_arg( |
| 2104 | {"-hff" , "--hf-file" }, "FILE" , |
| 2105 | "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)" , |
| 2106 | [](common_params & params, const std::string & value) { |
| 2107 | params.model.hf_file = value; |
| 2108 | } |
| 2109 | ).set_env("LLAMA_ARG_HF_FILE" )); |
| 2110 | add_opt(common_arg( |
| 2111 | {"-hfv" , "-hfrv" , "--hf-repo-v" }, "<user>/<model>[:quant]" , |
| 2112 | "Hugging Face model repository for the vocoder model (default: unused)" , |
| 2113 | [](common_params & params, const std::string & value) { |
| 2114 | params.vocoder.model.hf_repo = value; |
| 2115 | } |
| 2116 | ).set_env("LLAMA_ARG_HF_REPO_V" )); |
| 2117 | add_opt(common_arg( |
| 2118 | {"-hffv" , "--hf-file-v" }, "FILE" , |
| 2119 | "Hugging Face model file for the vocoder model (default: unused)" , |
| 2120 | [](common_params & params, const std::string & value) { |
| 2121 | params.vocoder.model.hf_file = value; |
| 2122 | } |
| 2123 | ).set_env("LLAMA_ARG_HF_FILE_V" )); |
| 2124 | add_opt(common_arg( |
| 2125 | {"-hft" , "--hf-token" }, "TOKEN" , |
| 2126 | "Hugging Face access token (default: value from HF_TOKEN environment variable)" , |
| 2127 | [](common_params & params, const std::string & value) { |
| 2128 | params.hf_token = value; |
| 2129 | } |
| 2130 | ).set_env("HF_TOKEN" )); |
| 2131 | add_opt(common_arg( |
| 2132 | {"--context-file" }, "FNAME" , |
| 2133 | "file to load context from (repeat to specify multiple files)" , |
| 2134 | [](common_params & params, const std::string & value) { |
| 2135 | std::ifstream file(value, std::ios::binary); |
| 2136 | if (!file) { |
| 2137 | throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
| 2138 | }
| 2139 | params.context_files.push_back(value);
| 2140 | } |
| 2141 | ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); |
| 2142 | add_opt(common_arg( |
| 2143 | {"--chunk-size" }, "N" , |
| 2144 | string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
| 2145 | [](common_params & params, int value) { |
| 2146 | params.chunk_size = value; |
| 2147 | } |
| 2148 | ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); |
| 2149 | add_opt(common_arg( |
| 2150 | {"--chunk-separator" }, "STRING" , |
| 2151 | string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
| 2152 | [](common_params & params, const std::string & value) { |
| 2153 | params.chunk_separator = value; |
| 2154 | } |
| 2155 | ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); |
| 2156 | add_opt(common_arg( |
| 2157 | {"--junk" }, "N" , |
| 2158 | string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
| 2159 | [](common_params & params, int value) { |
| 2160 | params.n_junk = value; |
| 2161 | } |
| 2162 | ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL})); |
| 2163 | add_opt(common_arg( |
| 2164 | {"--pos" }, "N" , |
| 2165 | string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
| 2166 | [](common_params & params, int value) { |
| 2167 | params.i_pos = value; |
| 2168 | } |
| 2169 | ).set_examples({LLAMA_EXAMPLE_PASSKEY})); |
| 2170 | add_opt(common_arg( |
| 2171 | {"-o" , "--output" , "--output-file" }, "FNAME" , |
| 2172 | string_format("output file (default: '%s')", params.out_file.c_str()),
| 2173 | [](common_params & params, const std::string & value) { |
| 2174 | params.out_file = value; |
| 2175 | } |
| 2176 | ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); |
| 2177 | add_opt(common_arg( |
| 2178 | {"-ofreq" , "--output-frequency" }, "N" , |
| 2179 | string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
| 2180 | [](common_params & params, int value) { |
| 2181 | params.n_out_freq = value; |
| 2182 | } |
| 2183 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2184 | add_opt(common_arg( |
| 2185 | {"--output-format" }, "{gguf,dat}" , |
| 2186 | string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
| 2187 | [](common_params & params, const std::string & value) { |
| 2188 | /**/ if (value == "gguf" ) { params.imat_dat = -1; } |
| 2189 | else if (value == "dat" ) { params.imat_dat = 1; } |
| 2190 | else { throw std::invalid_argument("invalid output format" ); } |
| 2191 | } |
| 2192 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2193 | add_opt(common_arg( |
| 2194 | {"--save-frequency" }, "N" , |
| 2195 | string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
| 2196 | [](common_params & params, int value) { |
| 2197 | params.n_save_freq = value; |
| 2198 | } |
| 2199 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2200 | add_opt(common_arg( |
| 2201 | {"--process-output" }, |
| 2202 | string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
| 2203 | [](common_params & params) { |
| 2204 | params.process_output = true; |
| 2205 | } |
| 2206 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2207 | add_opt(common_arg( |
| 2208 | {"--no-ppl" }, |
| 2209 | string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
| 2210 | [](common_params & params) { |
| 2211 | params.compute_ppl = false; |
| 2212 | } |
| 2213 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2214 | add_opt(common_arg( |
| 2215 | {"--chunk" , "--from-chunk" }, "N" , |
| 2216 | string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
| 2217 | [](common_params & params, int value) { |
| 2218 | params.i_chunk = value; |
| 2219 | } |
| 2220 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2221 | add_opt(common_arg( |
| 2222 | {"--show-statistics" }, |
| 2223 | string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
| 2224 | [](common_params & params) { |
| 2225 | params.show_statistics = true; |
| 2226 | } |
| 2227 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2228 | add_opt(common_arg( |
| 2229 | {"--parse-special" }, |
| 2230 | string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
| 2231 | [](common_params & params) { |
| 2232 | params.parse_special = true; |
| 2233 | } |
| 2234 | ).set_examples({LLAMA_EXAMPLE_IMATRIX})); |
| 2235 | add_opt(common_arg( |
| 2236 | {"-pps" }, |
| 2237 | string_format("whether the prompt is shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
| 2238 | [](common_params & params) { |
| 2239 | params.is_pp_shared = true; |
| 2240 | } |
| 2241 | ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL})); |
| 2242 | add_opt(common_arg( |
| 2243 | {"-npp" }, "n0,n1,..." , |
| 2244 | "number of prompt tokens" , |
| 2245 | [](common_params & params, const std::string & value) { |
| 2246 | auto p = string_split<int>(value, ',');
| 2247 | params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
| 2248 | } |
| 2249 | ).set_examples({LLAMA_EXAMPLE_BENCH})); |
| 2250 | add_opt(common_arg( |
| 2251 | {"-ntg" }, "n0,n1,..." , |
| 2252 | "number of text generation tokens" , |
| 2253 | [](common_params & params, const std::string & value) { |
| 2254 | auto p = string_split<int>(value, ',');
| 2255 | params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
| 2256 | } |
| 2257 | ).set_examples({LLAMA_EXAMPLE_BENCH})); |
| 2258 | add_opt(common_arg( |
| 2259 | {"-npl" }, "n0,n1,..." , |
| 2260 | "number of parallel prompts" , |
| 2261 | [](common_params & params, const std::string & value) { |
| 2262 | auto p = string_split<int>(value, ',');
| 2263 | params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
| 2264 | } |
| 2265 | ).set_examples({LLAMA_EXAMPLE_BENCH})); |
| 2266 | add_opt(common_arg( |
| 2267 | {"--embd-normalize" }, "N" , |
| 2268 | string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
| 2269 | [](common_params & params, int value) { |
| 2270 | params.embd_normalize = value; |
| 2271 | } |
| 2272 | ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); |
| 2273 | add_opt(common_arg( |
| 2274 | {"--embd-output-format" }, "FORMAT" , |
| 2275 | "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)" , |
| 2276 | [](common_params & params, const std::string & value) { |
| 2277 | params.embd_out = value; |
| 2278 | } |
| 2279 | ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); |
| 2280 | add_opt(common_arg( |
| 2281 | {"--embd-separator" }, "STRING" , |
| 2282 | "separator of embeddings (default \\n) for example \"<#sep#>\"" , |
| 2283 | [](common_params & params, const std::string & value) { |
| 2284 | params.embd_sep = value; |
| 2285 | } |
| 2286 | ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); |
| 2287 | add_opt(common_arg( |
| 2288 | {"--cls-separator" }, "STRING" , |
| 2289 | "separator of classification sequences (default \\t) for example \"<#seq#>\"" , |
| 2290 | [](common_params & params, const std::string & value) { |
| 2291 | params.cls_sep = value; |
| 2292 | } |
| 2293 | ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); |
| 2294 | add_opt(common_arg( |
| 2295 | {"--host" }, "HOST" , |
| 2296 | string_format("IP address to listen on, or bind to a UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
| 2297 | [](common_params & params, const std::string & value) { |
| 2298 | params.hostname = value; |
| 2299 | } |
| 2300 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST" )); |
| 2301 | add_opt(common_arg( |
| 2302 | {"--port" }, "PORT" , |
| 2303 | string_format("port to listen on (default: %d)", params.port),
| 2304 | [](common_params & params, int value) { |
| 2305 | params.port = value; |
| 2306 | } |
| 2307 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT" )); |
| 2308 | add_opt(common_arg( |
| 2309 | {"--path" }, "PATH" , |
| 2310 | string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
| 2311 | [](common_params & params, const std::string & value) { |
| 2312 | params.public_path = value; |
| 2313 | } |
| 2314 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH" )); |
| 2315 | add_opt(common_arg( |
| 2316 | {"--api-prefix" }, "PREFIX" , |
| 2317 | string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
| 2318 | [](common_params & params, const std::string & value) { |
| 2319 | params.api_prefix = value; |
| 2320 | } |
| 2321 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX" )); |
| 2322 | add_opt(common_arg( |
| 2323 | {"--no-webui" }, |
| 2324 | string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
| 2325 | [](common_params & params) { |
| 2326 | params.webui = false; |
| 2327 | } |
| 2328 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI" )); |
| 2329 | add_opt(common_arg( |
| 2330 | {"--embedding" , "--embeddings" }, |
| 2331 | string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
| 2332 | [](common_params & params) { |
| 2333 | params.embedding = true; |
| 2334 | } |
| 2335 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS" )); |
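|      | // --reranking implies embeddings mode and forces RANK pooling on the server.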
| 2336 | add_opt(common_arg( |
| 2337 | {"--reranking" , "--rerank" }, |
| 2338 | string_format("enable reranking endpoint on server (default: %s)", "disabled"),
| 2339 | [](common_params & params) { |
| 2340 | params.embedding = true; |
| 2341 | params.pooling_type = LLAMA_POOLING_TYPE_RANK; |
| 2342 | } |
| 2343 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING" )); |
| 2344 | add_opt(common_arg( |
| 2345 | {"--api-key" }, "KEY" , |
| 2346 | "API key to use for authentication (default: none)" , |
| 2347 | [](common_params & params, const std::string & value) { |
| 2348 | params.api_keys.push_back(value);
| 2349 | } |
| 2350 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY" )); |
| 2351 | add_opt(common_arg( |
| 2352 | {"--api-key-file" }, "FNAME" , |
| 2353 | "path to file containing API keys (default: none)" , |
| 2354 | [](common_params & params, const std::string & value) { |
| 2355 | std::ifstream key_file(value); |
| 2356 | if (!key_file) { |
| 2357 | throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
| 2358 | } |
| 2359 | std::string key; |
| 2360 | while (std::getline(key_file, key)) {
| 2361 | if (!key.empty()) {
| 2362 | params.api_keys.push_back(key);
| 2363 | } |
| 2364 | } |
| 2365 | key_file.close(); |
| 2366 | } |
| 2367 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 2368 | add_opt(common_arg( |
| 2369 | {"--ssl-key-file" }, "FNAME" , |
| 2370 | "path to file a PEM-encoded SSL private key" , |
| 2371 | [](common_params & params, const std::string & value) { |
| 2372 | params.ssl_file_key = value; |
| 2373 | } |
| 2374 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE" )); |
| 2375 | add_opt(common_arg( |
| 2376 | {"--ssl-cert-file" }, "FNAME" , |
| 2377 | "path to file a PEM-encoded SSL certificate" , |
| 2378 | [](common_params & params, const std::string & value) { |
| 2379 | params.ssl_file_cert = value; |
| 2380 | } |
| 2381 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE" )); |
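|      | // --chat-template-kwargs expects a JSON object; each entry is stored as a serialized string in default_template_kwargs.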
| 2382 | add_opt(common_arg( |
| 2383 | {"--chat-template-kwargs" }, "STRING" , |
| 2384 | string_format("sets additional params for the json template parser"),
| 2385 | [](common_params & params, const std::string & value) {
| 2386 | auto parsed = json::parse(value);
| 2387 | for (const auto & item : parsed.items()) { |
| 2388 | params.default_template_kwargs[item.key()] = item.value().dump(); |
| 2389 | } |
| 2390 | } |
| 2391 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS" )); |
| 2392 | add_opt(common_arg( |
| 2393 | {"-to" , "--timeout" }, "N" , |
| 2394 | string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
| 2395 | [](common_params & params, int value) { |
| 2396 | params.timeout_read = value; |
| 2397 | params.timeout_write = value; |
| 2398 | } |
| 2399 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT" )); |
| 2400 | add_opt(common_arg( |
| 2401 | {"--threads-http" }, "N" , |
| 2402 | string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
| 2403 | [](common_params & params, int value) { |
| 2404 | params.n_threads_http = value; |
| 2405 | } |
| 2406 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP" )); |
| 2407 | add_opt(common_arg( |
| 2408 | {"--cache-reuse" }, "N" , |
| 2409 | string_format( |
| 2410 | fmt: "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n" |
| 2411 | "[(card)](https://ggml.ai/f0.png)" , params.n_cache_reuse |
| 2412 | ), |
| 2413 | [](common_params & params, int value) { |
| 2414 | params.n_cache_reuse = value; |
| 2415 | } |
| 2416 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE" )); |
| 2417 | add_opt(common_arg( |
| 2418 | {"--metrics" }, |
| 2419 | string_format(fmt: "enable prometheus compatible metrics endpoint (default: %s)" , params.endpoint_metrics ? "enabled" : "disabled" ), |
| 2420 | [](common_params & params) { |
| 2421 | params.endpoint_metrics = true; |
| 2422 | } |
| 2423 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS" )); |
| 2424 | add_opt(common_arg( |
| 2425 | {"--props" }, |
| 2426 | string_format(fmt: "enable changing global properties via POST /props (default: %s)" , params.endpoint_props ? "enabled" : "disabled" ), |
| 2427 | [](common_params & params) { |
| 2428 | params.endpoint_props = true; |
| 2429 | } |
| 2430 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS" )); |
| 2431 | add_opt(common_arg( |
| 2432 | {"--slots" }, |
| 2433 | string_format(fmt: "enable slots monitoring endpoint (default: %s)" , params.endpoint_slots ? "enabled" : "disabled" ), |
| 2434 | [](common_params & params) { |
| 2435 | params.endpoint_slots = true; |
| 2436 | } |
| 2437 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS" )); |
| 2438 | add_opt(common_arg( |
| 2439 | {"--no-slots" }, |
| 2440 | "disables slots monitoring endpoint" , |
| 2441 | [](common_params & params) { |
| 2442 | params.endpoint_slots = false; |
| 2443 | } |
| 2444 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS" )); |
| 2445 | add_opt(common_arg( |
| 2446 | {"--slot-save-path" }, "PATH" , |
| 2447 | "path to save slot kv cache (default: disabled)" , |
| 2448 | [](common_params & params, const std::string & value) { |
| 2449 | params.slot_save_path = value; |
| 2450 | // if doesn't end with DIRECTORY_SEPARATOR, add it |
| 2451 | if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { |
| 2452 | params.slot_save_path += DIRECTORY_SEPARATOR; |
| 2453 | } |
| 2454 | } |
| 2455 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 2456 | add_opt(common_arg( |
| 2457 | {"--jinja" }, |
| 2458 | "use jinja template for chat (default: disabled)" , |
| 2459 | [](common_params & params) { |
| 2460 | params.use_jinja = true; |
| 2461 | } |
| 2462 | ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA" )); |
| 2463 | add_opt(common_arg( |
| 2464 | {"--reasoning-format" }, "FORMAT" , |
| 2465 | "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" |
| 2466 | "- none: leaves thoughts unparsed in `message.content`\n" |
| 2467 | "- deepseek: puts thoughts in `message.reasoning_content`\n" |
| 2468 | "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n" |
| 2469 | "(default: auto)" , |
| 2470 | [](common_params & params, const std::string & value) { |
| 2471 | params.reasoning_format = common_reasoning_format_from_name(format: value); |
| 2472 | } |
| 2473 | ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK" )); |
| 2474 | add_opt(common_arg( |
| 2475 | {"--reasoning-budget" }, "N" , |
| 2476 | "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)" , |
| 2477 | [](common_params & params, int value) { |
| 2478 | if (value != 0 && value != -1) { throw std::invalid_argument("invalid value" ); } |
| 2479 | params.reasoning_budget = value; |
| 2480 | } |
| 2481 | ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET" )); |
| 2482 | add_opt(common_arg( |
| 2483 | {"--chat-template" }, "JINJA_TEMPLATE" , |
| 2484 | string_format( |
| 2485 | fmt: "set custom jinja chat template (default: template taken from model's metadata)\n" |
| 2486 | "if suffix/prefix are specified, template will be disabled\n" |
| 2487 | "only commonly used templates are accepted (unless --jinja is set before this flag):\n" |
| 2488 | "list of built-in templates:\n%s" , list_builtin_chat_templates().c_str() |
| 2489 | ), |
| 2490 | [](common_params & params, const std::string & value) { |
| 2491 | params.chat_template = value; |
| 2492 | } |
| 2493 | ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE" )); |
| 2494 | add_opt(common_arg( |
| 2495 | {"--chat-template-file" }, "JINJA_TEMPLATE_FILE" , |
| 2496 | string_format( |
| 2497 | fmt: "set custom jinja chat template file (default: template taken from model's metadata)\n" |
| 2498 | "if suffix/prefix are specified, template will be disabled\n" |
| 2499 | "only commonly used templates are accepted (unless --jinja is set before this flag):\n" |
| 2500 | "list of built-in templates:\n%s" , list_builtin_chat_templates().c_str() |
| 2501 | ), |
| 2502 | [](common_params & params, const std::string & value) { |
| 2503 | params.chat_template = read_file(fname: value); |
| 2504 | } |
| 2505 | ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE" )); |
| 2506 | add_opt(common_arg( |
| 2507 | {"--no-prefill-assistant" }, |
| 2508 | string_format( |
| 2509 | fmt: "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n" |
| 2510 | "when this flag is set, a trailing assistant message is treated as a complete message and is not prefilled\n" |
| 2511 | ), |
| 2512 | [](common_params & params) { |
| 2513 | params.prefill_assistant = false; |
| 2514 | } |
| 2515 | ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT" )); |
| 2516 | add_opt(common_arg( |
| 2517 | {"-sps" , "--slot-prompt-similarity" }, "SIMILARITY" , |
| 2518 | string_format(fmt: "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n" , params.slot_prompt_similarity), |
| 2519 | [](common_params & params, const std::string & value) { |
| 2520 | params.slot_prompt_similarity = std::stof(str: value); |
| 2521 | } |
| 2522 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 2523 | add_opt(common_arg( |
| 2524 | {"--lora-init-without-apply" }, |
| 2525 | string_format(fmt: "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)" , params.lora_init_without_apply ? "enabled" : "disabled" ), |
| 2526 | [](common_params & params) { |
| 2527 | params.lora_init_without_apply = true; |
| 2528 | } |
| 2529 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 2530 | add_opt(common_arg( |
| 2531 | {"--simple-io" }, |
| 2532 | "use basic IO for better compatibility in subprocesses and limited consoles" , |
| 2533 | [](common_params & params) { |
| 2534 | params.simple_io = true; |
| 2535 | } |
| 2536 | ).set_examples({LLAMA_EXAMPLE_MAIN})); |
| 2537 | add_opt(common_arg( |
| 2538 | {"--positive-file" }, "FNAME" , |
| 2539 | string_format(fmt: "positive prompts file, one prompt per line (default: '%s')" , params.cvector_positive_file.c_str()), |
| 2540 | [](common_params & params, const std::string & value) { |
| 2541 | params.cvector_positive_file = value; |
| 2542 | } |
| 2543 | ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); |
| 2544 | add_opt(common_arg( |
| 2545 | {"--negative-file" }, "FNAME" , |
| 2546 | string_format(fmt: "negative prompts file, one prompt per line (default: '%s')" , params.cvector_negative_file.c_str()), |
| 2547 | [](common_params & params, const std::string & value) { |
| 2548 | params.cvector_negative_file = value; |
| 2549 | } |
| 2550 | ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); |
| 2551 | add_opt(common_arg( |
| 2552 | {"--pca-batch" }, "N" , |
| 2553 | string_format(fmt: "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)" , params.n_pca_batch), |
| 2554 | [](common_params & params, int value) { |
| 2555 | params.n_pca_batch = value; |
| 2556 | } |
| 2557 | ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); |
| 2558 | add_opt(common_arg( |
| 2559 | {"--pca-iter" }, "N" , |
| 2560 | string_format(fmt: "number of iterations used for PCA (default: %d)" , params.n_pca_iterations), |
| 2561 | [](common_params & params, int value) { |
| 2562 | params.n_pca_iterations = value; |
| 2563 | } |
| 2564 | ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); |
| 2565 | add_opt(common_arg( |
| 2566 | {"--method" }, "{pca, mean}" , |
| 2567 | "dimensionality reduction method to be used (default: pca)" , |
| 2568 | [](common_params & params, const std::string & value) { |
| 2569 | /**/ if (value == "pca" ) { params.cvector_dimre_method = DIMRE_METHOD_PCA; } |
| 2570 | else if (value == "mean" ) { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } |
| 2571 | else { throw std::invalid_argument("invalid value" ); } |
| 2572 | } |
| 2573 | ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); |
| 2574 | add_opt(common_arg( |
| 2575 | {"--output-format" }, "{md,jsonl}" , |
| 2576 | "output format for batched-bench results (default: md)" , |
| 2577 | [](common_params & params, const std::string & value) { |
| 2578 | /**/ if (value == "jsonl" ) { params.batched_bench_output_jsonl = true; } |
| 2579 | else if (value == "md" ) { params.batched_bench_output_jsonl = false; } |
| 2580 | else { throw std::invalid_argument("invalid value" ); } |
| 2581 | } |
| 2582 | ).set_examples({LLAMA_EXAMPLE_BENCH})); |
| 2583 | add_opt(common_arg( |
| 2584 | {"--log-disable" }, |
| 2585 | "Disable logging" , |
| 2586 | [](common_params &) { |
| 2587 | common_log_pause(log: common_log_main()); |
| 2588 | } |
| 2589 | )); |
| 2590 | add_opt(common_arg( |
| 2591 | {"--log-file" }, "FNAME" , |
| 2592 | "Log to file" , |
| 2593 | [](common_params &, const std::string & value) { |
| 2594 | common_log_set_file(log: common_log_main(), file: value.c_str()); |
| 2595 | } |
| 2596 | )); |
| 2597 | add_opt(common_arg( |
| 2598 | {"--log-colors" }, "[on|off|auto]" , |
| 2599 | "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n" |
| 2600 | "'auto' enables colors when output is to a terminal" , |
| 2601 | [](common_params &, const std::string & value) { |
| 2602 | if (is_truthy(value)) { |
| 2603 | common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_ENABLED); |
| 2604 | } else if (is_falsey(value)) { |
| 2605 | common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_DISABLED); |
| 2606 | } else if (is_autoy(value)) { |
| 2607 | common_log_set_colors(log: common_log_main(), colors: LOG_COLORS_AUTO); |
| 2608 | } else { |
| 2609 | throw std::invalid_argument( |
| 2610 | string_format(fmt: "error: unknown value for --log-colors: '%s'\n" , value.c_str())); |
| 2611 | } |
| 2612 | } |
| 2613 | ).set_env("LLAMA_LOG_COLORS" )); |
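| | // Illustrative env-based usage (values follow the [on|off|auto] metavar above; command is a placeholder): |
| | //   LLAMA_LOG_COLORS=off llama-server ... |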
| 2614 | add_opt(common_arg( |
| 2615 | {"-v" , "--verbose" , "--log-verbose" }, |
| 2616 | "Set verbosity level to infinity (i.e. log all messages, useful for debugging)" , |
| 2617 | [](common_params & params) { |
| 2618 | params.verbosity = INT_MAX; |
| 2619 | common_log_set_verbosity_thold(INT_MAX); |
| 2620 | } |
| 2621 | )); |
| 2622 | add_opt(common_arg( |
| 2623 | {"--offline" }, |
| 2624 | "Offline mode: forces use of cache, prevents network access" , |
| 2625 | [](common_params & params) { |
| 2626 | params.offline = true; |
| 2627 | } |
| 2628 | ).set_env("LLAMA_OFFLINE" )); |
| 2629 | add_opt(common_arg( |
| 2630 | {"-lv" , "--verbosity" , "--log-verbosity" }, "N" , |
| 2631 | "Set the verbosity threshold. Messages with a higher verbosity will be ignored." , |
| 2632 | [](common_params & params, int value) { |
| 2633 | params.verbosity = value; |
| 2634 | common_log_set_verbosity_thold(verbosity: value); |
| 2635 | } |
| 2636 | ).set_env("LLAMA_LOG_VERBOSITY" )); |
| 2637 | add_opt(common_arg( |
| 2638 | {"--log-prefix" }, |
| 2639 | "Enable prefix in log messages" , |
| 2640 | [](common_params &) { |
| 2641 | common_log_set_prefix(log: common_log_main(), prefix: true); |
| 2642 | } |
| 2643 | ).set_env("LLAMA_LOG_PREFIX" )); |
| 2644 | add_opt(common_arg( |
| 2645 | {"--log-timestamps" }, |
| 2646 | "Enable timestamps in log messages" , |
| 2647 | [](common_params &) { |
| 2648 | common_log_set_timestamps(log: common_log_main(), timestamps: true); |
| 2649 | } |
| 2650 | ).set_env("LLAMA_LOG_TIMESTAMPS" )); |
| 2651 | |
| 2652 | // speculative parameters |
| 2653 | add_opt(common_arg( |
| 2654 | {"-td" , "--threads-draft" }, "N" , |
| 2655 | "number of threads to use during generation for the draft model (default: same as --threads)" , |
| 2656 | [](common_params & params, int value) { |
| 2657 | params.speculative.cpuparams.n_threads = value; |
| 2658 | if (params.speculative.cpuparams.n_threads <= 0) { |
| 2659 | params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); |
| 2660 | } |
| 2661 | } |
| 2662 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); |
| 2663 | add_opt(common_arg( |
| 2664 | {"-tbd" , "--threads-batch-draft" }, "N" , |
| 2665 | "number of threads to use during batch and prompt processing for the draft model (default: same as --threads-draft)" , |
| 2666 | [](common_params & params, int value) { |
| 2667 | params.speculative.cpuparams_batch.n_threads = value; |
| 2668 | if (params.speculative.cpuparams_batch.n_threads <= 0) { |
| 2669 | params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); |
| 2670 | } |
| 2671 | } |
| 2672 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); |
| 2673 | add_opt(common_arg( |
| 2674 | {"-Cd" , "--cpu-mask-draft" }, "M" , |
| 2675 | "Draft model CPU affinity mask. Complements --cpu-range-draft (default: same as --cpu-mask)" , |
| 2676 | [](common_params & params, const std::string & mask) { |
| 2677 | params.speculative.cpuparams.mask_valid = true; |
| 2678 | if (!parse_cpu_mask(mask, boolmask&: params.speculative.cpuparams.cpumask)) { |
| 2679 | throw std::invalid_argument("invalid cpumask" ); |
| 2680 | } |
| 2681 | } |
| 2682 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2683 | add_opt(common_arg( |
| 2684 | {"-Crd" , "--cpu-range-draft" }, "lo-hi" , |
| 2685 | "Ranges of CPUs for affinity. Complements --cpu-mask-draft" , |
| 2686 | [](common_params & params, const std::string & range) { |
| 2687 | params.speculative.cpuparams.mask_valid = true; |
| 2688 | if (!parse_cpu_range(range, boolmask&: params.speculative.cpuparams.cpumask)) { |
| 2689 | throw std::invalid_argument("invalid range" ); |
| 2690 | } |
| 2691 | } |
| 2692 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2693 | add_opt(common_arg( |
| 2694 | {"--cpu-strict-draft" }, "<0|1>" , |
| 2695 | "Use strict CPU placement for draft model (default: same as --cpu-strict)" , |
| 2696 | [](common_params & params, int value) { |
| 2697 | params.speculative.cpuparams.strict_cpu = value; |
| 2698 | } |
| 2699 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2700 | add_opt(common_arg( |
| 2701 | {"--prio-draft" }, "N" , |
| 2702 | string_format(fmt: "set draft process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n" , params.speculative.cpuparams.priority), |
| 2703 | [](common_params & params, int prio) { |
| 2704 | if (prio < 0 || prio > 3) { |
| 2705 | throw std::invalid_argument("invalid value" ); |
| 2706 | } |
| 2707 | params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; |
| 2708 | } |
| 2709 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2710 | add_opt(common_arg( |
| 2711 | {"--poll-draft" }, "<0|1>" , |
| 2712 | "Use polling to wait for draft model work (default: same as --poll)" , |
| 2713 | [](common_params & params, int value) { |
| 2714 | params.speculative.cpuparams.poll = value; |
| 2715 | } |
| 2716 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2717 | add_opt(common_arg( |
| 2718 | {"-Cbd" , "--cpu-mask-batch-draft" }, "M" , |
| 2719 | "Draft model CPU affinity mask for batch processing. Complements --cpu-range-batch-draft (default: same as --cpu-mask-draft)" , |
| 2720 | [](common_params & params, const std::string & mask) { |
| 2721 | params.speculative.cpuparams_batch.mask_valid = true; |
| 2722 | if (!parse_cpu_mask(mask, boolmask&: params.speculative.cpuparams_batch.cpumask)) { |
| 2723 | throw std::invalid_argument("invalid cpumask" ); |
| 2724 | } |
| 2725 | } |
| 2726 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2727 | add_opt(common_arg( |
| 2728 | {"-Crbd" , "--cpu-range-batch-draft" }, "lo-hi" , |
| 2729 | "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft" , |
| 2730 | [](common_params & params, const std::string & range) { |
| 2731 | params.speculative.cpuparams_batch.mask_valid = true; |
| 2732 | if (!parse_cpu_range(range, boolmask&: params.speculative.cpuparams_batch.cpumask)) { |
| 2733 | throw std::invalid_argument("invalid range" ); |
| 2734 | } |
| 2735 | } |
| 2736 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2737 | add_opt(common_arg( |
| 2738 | {"--cpu-strict-batch-draft" }, "<0|1>" , |
| 2739 | "Use strict CPU placement for draft model (default: --cpu-strict-draft)" , |
| 2740 | [](common_params & params, int value) { |
| 2741 | params.speculative.cpuparams_batch.strict_cpu = value; |
| 2742 | } |
| 2743 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2744 | add_opt(common_arg( |
| 2745 | {"--prio-batch-draft" }, "N" , |
| 2746 | string_format(fmt: "set draft process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n" , params.speculative.cpuparams_batch.priority), |
| 2747 | [](common_params & params, int prio) { |
| 2748 | if (prio < 0 || prio > 3) { |
| 2749 | throw std::invalid_argument("invalid value" ); |
| 2750 | } |
| 2751 | params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; |
| 2752 | } |
| 2753 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2754 | add_opt(common_arg( |
| 2755 | {"--poll-batch-draft" }, "<0|1>" , |
| 2756 | "Use polling to wait for draft model work (default: --poll-draft)" , |
| 2757 | [](common_params & params, int value) { |
| 2758 | params.speculative.cpuparams_batch.poll = value; |
| 2759 | } |
| 2760 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); |
| 2761 | add_opt(common_arg( |
| 2762 | {"--draft-max" , "--draft" , "--draft-n" }, "N" , |
| 2763 | string_format(fmt: "number of tokens to draft for speculative decoding (default: %d)" , params.speculative.n_max), |
| 2764 | [](common_params & params, int value) { |
| 2765 | params.speculative.n_max = value; |
| 2766 | } |
| 2767 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX" )); |
| 2768 | add_opt(common_arg( |
| 2769 | {"--draft-min" , "--draft-n-min" }, "N" , |
| 2770 | string_format(fmt: "minimum number of draft tokens to use for speculative decoding (default: %d)" , params.speculative.n_min), |
| 2771 | [](common_params & params, int value) { |
| 2772 | params.speculative.n_min = value; |
| 2773 | } |
| 2774 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN" )); |
| 2775 | add_opt(common_arg( |
| 2776 | {"--draft-p-split" }, "P" , |
| 2777 | string_format(fmt: "speculative decoding split probability (default: %.1f)" , (double)params.speculative.p_split), |
| 2778 | [](common_params & params, const std::string & value) { |
| 2779 | params.speculative.p_split = std::stof(str: value); |
| 2780 | } |
| 2781 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT" )); |
| 2782 | add_opt(common_arg( |
| 2783 | {"--draft-p-min" }, "P" , |
| 2784 | string_format(fmt: "minimum speculative decoding probability (greedy) (default: %.1f)" , (double)params.speculative.p_min), |
| 2785 | [](common_params & params, const std::string & value) { |
| 2786 | params.speculative.p_min = std::stof(str: value); |
| 2787 | } |
| 2788 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN" )); |
| 2789 | add_opt(common_arg( |
| 2790 | {"-cd" , "--ctx-size-draft" }, "N" , |
| 2791 | string_format(fmt: "size of the prompt context for the draft model (default: %d, 0 = loaded from model)" , params.speculative.n_ctx), |
| 2792 | [](common_params & params, int value) { |
| 2793 | params.speculative.n_ctx = value; |
| 2794 | } |
| 2795 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT" )); |
| 2796 | add_opt(common_arg( |
| 2797 | {"-devd" , "--device-draft" }, "<dev1,dev2,..>" , |
| 2798 | "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" |
| 2799 | "use --list-devices to see a list of available devices" , |
| 2800 | [](common_params & params, const std::string & value) { |
| 2801 | params.speculative.devices = parse_device_list(value); |
| 2802 | } |
| 2803 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); |
| 2804 | add_opt(common_arg( |
| 2805 | {"-ngld" , "--gpu-layers-draft" , "--n-gpu-layers-draft" }, "N" , |
| 2806 | "number of layers to store in VRAM for the draft model" , |
| 2807 | [](common_params & params, int value) { |
| 2808 | params.speculative.n_gpu_layers = value; |
| 2809 | if (!llama_supports_gpu_offload()) { |
| 2810 | fprintf(stderr, format: "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n" ); |
| 2811 | fprintf(stderr, format: "warning: one possible reason is that llama.cpp was compiled without GPU support\n" ); |
| 2812 | fprintf(stderr, format: "warning: consult docs/build.md for compilation instructions\n" ); |
| 2813 | } |
| 2814 | } |
| 2815 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT" )); |
| 2816 | add_opt(common_arg( |
| 2817 | {"-md" , "--model-draft" }, "FNAME" , |
| 2818 | "draft model for speculative decoding (default: unused)" , |
| 2819 | [](common_params & params, const std::string & value) { |
| 2820 | params.speculative.model.path = value; |
| 2821 | } |
| 2822 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT" )); |
| 2823 | add_opt(common_arg( |
| 2824 | {"--spec-replace" }, "TARGET" , "DRAFT" , |
| 2825 | "translate the string in TARGET into DRAFT if the draft model and main model are not compatible" , |
| 2826 | [](common_params & params, const std::string & tgt, const std::string & dft) { |
| 2827 | params.speculative.replacements.push_back(x: { tgt, dft }); |
| 2828 | } |
| 2829 | ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); |
| 2830 | add_opt(common_arg( |
| 2831 | {"-ctkd" , "--cache-type-k-draft" }, "TYPE" , |
| 2832 | string_format( |
| 2833 | fmt: "KV cache data type for K for the draft model\n" |
| 2834 | "allowed values: %s\n" |
| 2835 | "(default: %s)" , |
| 2836 | get_all_kv_cache_types().c_str(), |
| 2837 | ggml_type_name(type: params.speculative.cache_type_k) |
| 2838 | ), |
| 2839 | [](common_params & params, const std::string & value) { |
| 2840 | params.speculative.cache_type_k = kv_cache_type_from_str(s: value); |
| 2841 | } |
| 2842 | ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT" )); |
| 2843 | add_opt(common_arg( |
| 2844 | {"-ctvd" , "--cache-type-v-draft" }, "TYPE" , |
| 2845 | string_format( |
| 2846 | fmt: "KV cache data type for V for the draft model\n" |
| 2847 | "allowed values: %s\n" |
| 2848 | "(default: %s)" , |
| 2849 | get_all_kv_cache_types().c_str(), |
| 2850 | ggml_type_name(type: params.speculative.cache_type_v) |
| 2851 | ), |
| 2852 | [](common_params & params, const std::string & value) { |
| 2853 | params.speculative.cache_type_v = kv_cache_type_from_str(s: value); |
| 2854 | } |
| 2855 | ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT" )); |
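| | // Illustrative speculative-decoding invocation combining the flags above (paths and values are placeholders): |
| | //   llama-server -m target-model.gguf -md draft-model.gguf --draft-max 16 --draft-min 1 --draft-p-min 0.8 |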
| 2856 | |
| 2857 | add_opt(common_arg( |
| 2858 | {"-mv" , "--model-vocoder" }, "FNAME" , |
| 2859 | "vocoder model for audio generation (default: unused)" , |
| 2860 | [](common_params & params, const std::string & value) { |
| 2861 | params.vocoder.model.path = value; |
| 2862 | } |
| 2863 | ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); |
| 2864 | add_opt(common_arg( |
| 2865 | {"--tts-use-guide-tokens" }, |
| 2866 | "Use guide tokens to improve TTS word recall" , |
| 2867 | [](common_params & params) { |
| 2868 | params.vocoder.use_guide_tokens = true; |
| 2869 | } |
| 2870 | ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); |
| 2871 | add_opt(common_arg( |
| 2872 | {"--tts-speaker-file" }, "FNAME" , |
| 2873 | "speaker file path for audio generation" , |
| 2874 | [](common_params & params, const std::string & value) { |
| 2875 | params.vocoder.speaker_file = value; |
| 2876 | } |
| 2877 | ).set_examples({LLAMA_EXAMPLE_TTS})); |
| 2878 | |
| 2879 | add_opt(common_arg( |
| 2880 | {"--diffusion-steps" }, "N" , |
| 2881 | string_format(fmt: "number of diffusion steps (default: %d)" , params.diffusion.steps), |
| 2882 | [](common_params & params, int value) { params.diffusion.steps = value; } |
| 2883 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2884 | add_opt(common_arg( |
| 2885 | {"--diffusion-visual" }, |
| 2886 | string_format(fmt: "enable visual diffusion mode (show progressive generation) (default: %s)" , params.diffusion.visual_mode ? "true" : "false" ), |
| 2887 | [](common_params & params) { params.diffusion.visual_mode = true; } |
| 2888 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2889 | add_opt(common_arg( |
| 2890 | {"--diffusion-eps" }, "F" , |
| 2891 | string_format(fmt: "epsilon for timesteps (default: %.6f)" , (double) params.diffusion.eps), |
| 2892 | [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(str: value); } |
| 2893 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2894 | add_opt(common_arg( |
| 2895 | {"--diffusion-algorithm" }, "N" , |
| 2896 | string_format(fmt: "diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)" , params.diffusion.algorithm), |
| 2897 | [](common_params & params, int value) { params.diffusion.algorithm = value; } |
| 2898 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2899 | add_opt(common_arg( |
| 2900 | {"--diffusion-alg-temp" }, "F" , |
| 2901 | string_format(fmt: "dream algorithm temperature (default: %.3f)" , (double) params.diffusion.alg_temp), |
| 2902 | [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(str: value); } |
| 2903 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2904 | add_opt(common_arg( |
| 2905 | {"--diffusion-block-length" }, "N" , |
| 2906 | string_format(fmt: "LLaDA block length for generation (default: %d)" , params.diffusion.block_length), |
| 2907 | [](common_params & params, int value) { params.diffusion.block_length = value; } |
| 2908 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2909 | add_opt(common_arg( |
| 2910 | {"--diffusion-cfg-scale" }, "F" , |
| 2911 | string_format(fmt: "LLaDA classifier-free guidance scale (default: %.3f)" , (double) params.diffusion.cfg_scale), |
| 2912 | [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(str: value); } |
| 2913 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2914 | add_opt(common_arg( |
| 2915 | {"--diffusion-add-gumbel-noise" }, "F" , |
| 2916 | string_format(fmt: "add Gumbel noise to the logits if temp > 0.0 (default: %s)" , params.diffusion.add_gumbel_noise ? "true" : "false" ), |
| 2917 | [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(str: value); } |
| 2918 | ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); |
| 2919 | add_opt(common_arg( |
| 2920 | { "-lr" , "--learning-rate" }, "ALPHA" , |
| 2921 | string_format(fmt: "adamw or sgd optimizer learning rate (alpha) (default: %.2g); note: sgd has no momentum, so a roughly 10x larger alpha is recommended" , (double) params.lr.lr0), |
| 2922 | [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(str: value); } |
| 2923 | ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); |
| 2924 | add_opt(common_arg( |
| 2925 | {"-lr-min" , "--learning-rate-min" }, "ALPHA" , |
| 2926 | string_format(fmt: "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)" , (double) params.lr.lr_min), |
| 2927 | [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(str: value); } |
| 2928 | ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); |
| 2929 | add_opt(common_arg( |
| 2930 | {"-decay-epochs" , "--learning-rate-decay-epochs" }, "ALPHA" , |
| 2931 | string_format(fmt: "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)" , (double) params.lr.decay_epochs), |
| 2932 | [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(str: value); } |
| 2933 | ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); |
| 2934 | add_opt(common_arg( |
| 2935 | {"-wd" , "--weight-decay" }, "WD" , |
| 2936 | string_format(fmt: "adamw or sgd optimizer weight decay (0 = off; a very small value such as 1e-9 is recommended) (default: %.2g)" , (double) params.lr.wd), |
| 2937 | [](common_params & params, const std::string & value) { params.lr.wd = std::stof(str: value); } |
| 2938 | ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); |
| 2939 | add_opt(common_arg( |
| 2940 | {"-val-split" , "--val-split" }, "FRACTION" , |
| 2941 | string_format(fmt: "fraction of data to use as validation set for training (default: %.2g)." , (double) params.val_split), |
| 2942 | [](common_params & params, const std::string & value) { params.val_split = std::stof(str: value); } |
| 2943 | ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); |
| 2944 | add_opt(common_arg( |
| 2945 | {"-epochs" , "--epochs" }, "N" , |
| 2946 | string_format(fmt: "optimizer max # of epochs (default: %d)" , params.lr.epochs), |
| 2947 | [](common_params & params, int epochs) { params.lr.epochs = epochs; } |
| 2948 | ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); |
| 2949 | add_opt(common_arg( |
| 2950 | {"-opt" , "--optimizer" }, "sgd|adamw" , "optimizer to use: adamw or sgd" , |
| 2951 | [](common_params & params, const std::string & name) { |
| 2952 | params.optimizer = common_opt_get_optimizer(name.c_str()); |
| 2953 | if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) { |
| 2954 | throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd" ); |
| 2955 | } |
| 2956 | } |
| 2957 | ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); |
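| | // Illustrative finetune settings using the options above (binary name and values are placeholders): |
| | //   llama-finetune ... -opt sgd -lr 1e-3 -wd 1e-9 -epochs 2 -val-split 0.05 |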
| 2958 | |
| 2959 | // presets |
| 2960 | add_opt(common_arg( |
| 2961 | {"--tts-oute-default" }, |
| 2962 | string_format(fmt: "use default OuteTTS models (note: can download weights from the internet)" ), |
| 2963 | [](common_params & params) { |
| 2964 | params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF" ; |
| 2965 | params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf" ; |
| 2966 | params.vocoder.model.hf_repo = "ggml-org/WavTokenizer" ; |
| 2967 | params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf" ; |
| 2968 | } |
| 2969 | ).set_examples({LLAMA_EXAMPLE_TTS})); |
| 2970 | |
| 2971 | add_opt(common_arg( |
| 2972 | {"--embd-gemma-default" }, |
| 2973 | string_format(fmt: "use default EmbeddingGemma model (note: can download weights from the internet)" ), |
| 2974 | [](common_params & params) { |
| 2975 | params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF" ; |
| 2976 | params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf" ; |
| 2977 | params.port = 8011; |
| 2978 | params.n_ubatch = 2048; |
| 2979 | params.n_batch = 2048; |
| 2980 | params.n_parallel = 32; |
| 2981 | params.n_ctx = 2048*params.n_parallel; |
| 2982 | params.verbose_prompt = true; |
| 2983 | params.embedding = true; |
| 2984 | } |
| 2985 | ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); |
| 2986 | |
| 2987 | add_opt(common_arg( |
| 2988 | {"--fim-qwen-1.5b-default" }, |
| 2989 | string_format(fmt: "use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)" ), |
| 2990 | [](common_params & params) { |
| 2991 | params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF" ; |
| 2992 | params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf" ; |
| 2993 | params.port = 8012; |
| 2994 | params.n_ubatch = 1024; |
| 2995 | params.n_batch = 1024; |
| 2996 | params.n_ctx = 0; |
| 2997 | params.n_cache_reuse = 256; |
| 2998 | } |
| 2999 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3000 | |
| 3001 | add_opt(common_arg( |
| 3002 | {"--fim-qwen-3b-default" }, |
| 3003 | string_format(fmt: "use default Qwen 2.5 Coder 3B (note: can download weights from the internet)" ), |
| 3004 | [](common_params & params) { |
| 3005 | params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF" ; |
| 3006 | params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf" ; |
| 3007 | params.port = 8012; |
| 3008 | params.n_ubatch = 1024; |
| 3009 | params.n_batch = 1024; |
| 3010 | params.n_ctx = 0; |
| 3011 | params.n_cache_reuse = 256; |
| 3012 | } |
| 3013 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3014 | |
| 3015 | add_opt(common_arg( |
| 3016 | {"--fim-qwen-7b-default" }, |
| 3017 | string_format(fmt: "use default Qwen 2.5 Coder 7B (note: can download weights from the internet)" ), |
| 3018 | [](common_params & params) { |
| 3019 | params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" ; |
| 3020 | params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf" ; |
| 3021 | params.port = 8012; |
| 3022 | params.n_ubatch = 1024; |
| 3023 | params.n_batch = 1024; |
| 3024 | params.n_ctx = 0; |
| 3025 | params.n_cache_reuse = 256; |
| 3026 | } |
| 3027 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3028 | |
| 3029 | add_opt(common_arg( |
| 3030 | {"--fim-qwen-7b-spec" }, |
| 3031 | string_format(fmt: "use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)" ), |
| 3032 | [](common_params & params) { |
| 3033 | params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" ; |
| 3034 | params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf" ; |
| 3035 | params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ; |
| 3036 | params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf" ; |
| 3037 | params.port = 8012; |
| 3038 | params.n_ubatch = 1024; |
| 3039 | params.n_batch = 1024; |
| 3040 | params.n_ctx = 0; |
| 3041 | params.n_cache_reuse = 256; |
| 3042 | } |
| 3043 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3044 | |
| 3045 | add_opt(common_arg( |
| 3046 | {"--fim-qwen-14b-spec" }, |
| 3047 | string_format(fmt: "use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)" ), |
| 3048 | [](common_params & params) { |
| 3049 | params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF" ; |
| 3050 | params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf" ; |
| 3051 | params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ; |
| 3052 | params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf" ; |
| 3053 | params.port = 8012; |
| 3054 | params.n_ubatch = 1024; |
| 3055 | params.n_batch = 1024; |
| 3056 | params.n_ctx = 0; |
| 3057 | params.n_cache_reuse = 256; |
| 3058 | } |
| 3059 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3060 | |
| 3061 | add_opt(common_arg( |
| 3062 | {"--fim-qwen-30b-default" }, |
| 3063 | string_format(fmt: "use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)" ), |
| 3064 | [](common_params & params) { |
| 3065 | params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF" ; |
| 3066 | params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf" ; |
| 3067 | params.port = 8012; |
| 3068 | params.n_ubatch = 1024; |
| 3069 | params.n_batch = 1024; |
| 3070 | params.n_ctx = 0; |
| 3071 | params.n_cache_reuse = 256; |
| 3072 | } |
| 3073 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3074 | |
| 3075 | add_opt(common_arg( |
| 3076 | {"--gpt-oss-20b-default" }, |
| 3077 | string_format(fmt: "use gpt-oss-20b (note: can download weights from the internet)" ), |
| 3078 | [](common_params & params) { |
| 3079 | params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF" ; |
| 3080 | params.model.hf_file = "gpt-oss-20b-mxfp4.gguf" ; |
| 3081 | params.port = 8013; |
| 3082 | params.n_ubatch = 2048; |
| 3083 | params.n_batch = 32768; |
| 3084 | params.n_parallel = 2; |
| 3085 | params.n_ctx = 131072*params.n_parallel; |
| 3086 | params.sampling.temp = 1.0f; |
| 3087 | params.sampling.top_p = 1.0f; |
| 3088 | params.sampling.top_k = 0; |
| 3089 | params.sampling.min_p = 0.01f; |
| 3090 | params.use_jinja = true; |
| 3091 | //params.default_template_kwargs["reasoning_effort"] = "\"high\""; |
| 3092 | } |
| 3093 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3094 | |
| 3095 | add_opt(common_arg( |
| 3096 | {"--gpt-oss-120b-default" }, |
| 3097 | string_format(fmt: "use gpt-oss-120b (note: can download weights from the internet)" ), |
| 3098 | [](common_params & params) { |
| 3099 | params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF" ; |
| 3100 | params.port = 8013; |
| 3101 | params.n_ubatch = 2048; |
| 3102 | params.n_batch = 32768; |
| 3103 | params.n_parallel = 2; |
| 3104 | params.n_ctx = 131072*params.n_parallel; |
| 3105 | params.sampling.temp = 1.0f; |
| 3106 | params.sampling.top_p = 1.0f; |
| 3107 | params.sampling.top_k = 0; |
| 3108 | params.sampling.min_p = 0.01f; |
| 3109 | params.use_jinja = true; |
| 3110 | //params.default_template_kwargs["reasoning_effort"] = "\"high\""; |
| 3111 | } |
| 3112 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3113 | |
| 3114 | add_opt(common_arg( |
| 3115 | {"--vision-gemma-4b-default" }, |
| 3116 | string_format(fmt: "use Gemma 3 4B QAT (note: can download weights from the internet)" ), |
| 3117 | [](common_params & params) { |
| 3118 | params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF" ; |
| 3119 | params.port = 8014; |
| 3120 | params.n_ctx = 0; |
| 3121 | params.use_jinja = true; |
| 3122 | } |
| 3123 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3124 | |
| 3125 | add_opt(common_arg( |
| 3126 | {"--vision-gemma-12b-default" }, |
| 3127 | string_format(fmt: "use Gemma 3 12B QAT (note: can download weights from the internet)" ), |
| 3128 | [](common_params & params) { |
| 3129 | params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF" ; |
| 3130 | params.port = 8014; |
| 3131 | params.n_ctx = 0; |
| 3132 | params.use_jinja = true; |
| 3133 | } |
| 3134 | ).set_examples({LLAMA_EXAMPLE_SERVER})); |
| 3135 | |
| 3136 | return ctx_arg; |
| 3137 | } |
| 3138 | |