1#pragma once
2
3#include "llama.h"
4
5#include <cstdint>
6
7#define LLAMA_MAX_SEQ 256
8
// Runtime (per-context) compute parameters, as opposed to model hyperparameters.
// Plain aggregate: all members are set by the context-creation code elsewhere.
struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_ctx_seq;       // context for a single sequence
    uint32_t n_batch;         // logical batch size (tokens submitted per llama_decode call) — presumably; confirm against callers
    uint32_t n_ubatch;        // physical micro-batch size (tokens processed per compute-graph run) — presumably; confirm against callers
    uint32_t n_seq_max;       // maximum number of parallel sequences; NOTE(review): presumably bounded by LLAMA_MAX_SEQ above — verify
    int32_t  n_threads;       // number of threads to use for generation
    int32_t  n_threads_batch; // number of threads to use for batch processing

    // RoPE frequency scaling parameters
    float rope_freq_base;
    float rope_freq_scale;

    uint32_t n_ctx_orig_yarn; // original training context size, used by YaRN context extension
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;
    float yarn_attn_factor;
    float yarn_beta_fast;
    float yarn_beta_slow;

    bool embeddings;  // whether the context produces embeddings (vs. logits) — presumably; confirm semantics at use site
    bool causal_attn; // use a causal attention mask
    bool offload_kqv; // offload KQV ops (and KV cache) to a backend device — TODO confirm exact scope
    bool flash_attn;  // use flash-attention kernels
    bool no_perf;     // disable performance timings/counters — presumably; verify
    bool warmup;      // context is in warmup mode (e.g. a dummy first run) — NOTE(review): confirm
    bool op_offload;  // offload individual ops to accelerator backends — TODO confirm exact scope
    bool kv_unified;  // use a unified KV cache across sequences — presumably; verify

    enum llama_pooling_type pooling_type; // pooling applied to embeddings (declared in llama.h)

    // optional callback invoked during graph evaluation by the backend scheduler
    ggml_backend_sched_eval_callback cb_eval;
    void * cb_eval_user_data; // opaque user pointer passed through to cb_eval
};
43