#pragma once

#include "llama.h"

#include <cstdint>

#define LLAMA_MAX_SEQ 256
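// compile-time upper bound on the number of parallel sequences;
// cparams.n_seq_max below must not exceed this value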

struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_ctx_seq;       // context size available to a single sequence
    uint32_t n_batch;         // logical maximum batch size submitted to llama_decode
    uint32_t n_ubatch;        // physical maximum batch size (micro-batch)
    uint32_t n_seq_max;       // max number of sequences (i.e. distinct states for recurrent models)
    int32_t  n_threads;       // number of threads to use for generation
    int32_t  n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;  // RoPE base frequency
    float rope_freq_scale; // RoPE frequency scaling factor

    uint32_t n_ctx_orig_yarn; // original context size used during training (for YaRN scaling)
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;
    float yarn_attn_factor;
    float yarn_beta_fast;
    float yarn_beta_slow;
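    // For reference, the defaults in llama_context_default_params() at the time
    // of writing are: yarn_ext_factor = -1.0f (auto), yarn_attn_factor = 1.0f,
    // yarn_beta_fast = 32.0f, yarn_beta_slow = 1.0f.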

    bool embeddings;  // if true, extract embeddings (together with logits)
    bool causal_attn; // whether to use causal attention
    bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    bool flash_attn;  // whether to use flash attention
    bool no_perf;     // disable internal performance timing measurements
    bool warmup;      // internal: context is performing a warmup run (see llama_set_warmup())
    bool op_offload;  // whether to offload host tensor operations to device
    bool kv_unified;  // use a unified KV buffer shared across the input sequences

    enum llama_pooling_type pooling_type; // pooling strategy used for embeddings

    ggml_backend_sched_eval_callback cb_eval; // called for each graph node during evaluation (e.g. for debugging)
    void * cb_eval_user_data;
};
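
// Illustrative sketch (an assumption for documentation, not part of this header):
// when a context is created via llama_init_from_model(), the public
// llama_context_params are resolved into this struct roughly as follows,
// with 0 meaning "take the model's training-time default":
//
//   llama_cparams cparams;
//   cparams.n_ctx     = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
//   cparams.n_batch   = params.n_batch;
//   cparams.n_ubatch  = params.n_ubatch == 0 ? params.n_batch : params.n_ubatch;
//   cparams.n_seq_max = params.n_seq_max; // must be in [1, LLAMA_MAX_SEQ]
//
//   cparams.rope_freq_base  = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
//   cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
//
//   // with a unified KV cache all sequences share the full context window,
//   // otherwise each sequence gets an equal slice of it
//   cparams.n_ctx_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max;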