#pragma once

#include "llama.h"

#include <array>

// bump if necessary
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 384 // Kimi-K2

enum llama_expert_gating_func_type {
    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
};

enum llama_swa_type {
    LLAMA_SWA_TYPE_NONE = 0,
    LLAMA_SWA_TYPE_STANDARD = 1,
    LLAMA_SWA_TYPE_CHUNKED = 2,
    LLAMA_SWA_TYPE_SYMMETRIC = 3,
};

struct llama_hparams_posnet {
    uint32_t n_embd;
    uint32_t n_layer;
};

struct llama_hparams_convnext {
    uint32_t n_embd;
    uint32_t n_layer;
};

struct llama_hparams {
    bool vocab_only;
    bool rope_finetuned;
    bool use_par_res;
    bool swin_norm;

    uint32_t n_ctx_train; // context size the model was trained on
    uint32_t n_embd;
    uint32_t n_embd_features = 0;
    uint32_t n_layer;
    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
    uint32_t n_rot;
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
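    // e.g. n_head = 32 with n_head_kv = 8 means each group of 4 query heads shares one K/V head (GQA ratio n_head/n_head_kv = 4)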
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla = 0;
    uint32_t n_embd_head_v_mla = 0;

    // for WavTokenizer
    struct llama_hparams_posnet posnet;
    struct llama_hparams_convnext convnext;

    uint32_t n_shortconv_l_cache = 0;

    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

    uint32_t n_layer_dense_lead = 0;
    uint32_t n_lora_q = 0;
    uint32_t n_lora_kv = 0;
    uint32_t n_ff_exp = 0;
    uint32_t n_ff_shexp = 0;
    uint32_t n_ff_chexp = 0;
    uint32_t n_expert_shared = 0;
    uint32_t n_norm_groups = 0;
    uint32_t n_expert_groups = 0;
    uint32_t n_group_used = 0;
    uint32_t n_group_experts = 0;

    float expert_group_scale = 0.05f;
    float expert_weights_scale = 0.0f;
    bool expert_weights_norm = false;
    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
    uint32_t moe_every_n_layers = 0;
    uint32_t nextn_predict_layers = 0;

    float f_norm_eps;
    float f_norm_rms_eps;
    float f_norm_group_eps;

    float f_attn_logit_softcapping = 50.0f;
    float f_router_logit_softcapping = 30.0f;
    float f_final_logit_softcapping = 30.0f;
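    // note: soft-capping squashes a logit x to cap * tanh(x / cap), so the values above are caps, not scale factors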

    // for RWKV
    uint32_t rescale_every_n_layers = 0;
    uint32_t time_mix_extra_dim = 0;
    uint32_t time_decay_extra_dim = 0;
    uint32_t wkv_head_size = 0;
    uint32_t token_shift_count = 2;
    uint32_t n_lora_decay = 0;
    uint32_t n_lora_iclr = 0;
    uint32_t n_lora_value_res_mix = 0;
    uint32_t n_lora_gate = 0;

    float rope_attn_factor = 1.0f;
    float rope_freq_base_train;
    float rope_freq_base_train_swa;
    float rope_freq_scale_train;
    float rope_freq_scale_train_swa;
    uint32_t n_ctx_orig_yarn;
    float rope_yarn_log_mul = 0.0f;

    float yarn_ext_factor = -1.0f;
    float yarn_attn_factor = 1.0f;
    float yarn_beta_fast = 32.0f;
    float yarn_beta_slow = 1.0f;

    std::array<int, 4> rope_sections;

    // Sliding Window Attention (SWA)
    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
    // the size of the sliding window (0 - no SWA)
    uint32_t n_swa = 0;
    // if swa_layers[il] == true, then layer il is SWA
    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
    // by default, all layers are dense
    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;

    // for State Space Models
    uint32_t ssm_d_conv = 0;
    uint32_t ssm_d_inner = 0;
    uint32_t ssm_d_state = 0;
    uint32_t ssm_dt_rank = 0;
    uint32_t ssm_n_group = 0;

    // for hybrid state space models
    std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;

    bool ssm_dt_b_c_rms = false;

    float f_clamp_kqv = 0.0f;
    float f_max_alibi_bias = 0.0f;
    float f_logit_scale = 0.0f;

    // Additional scale factors (Granite/Granite MoE)
    float f_residual_scale = 0.0f;
    float f_embedding_scale = 0.0f;
    float f_attention_scale = 0.0f;

    // grok-2
    float f_attn_out_scale = 0.0f;
    uint32_t attn_temp_length = 0;

    bool causal_attn = true;
    bool use_alibi = false;
    bool attn_soft_cap = false;
    bool use_kq_norm = false;

    // for Classifiers
    uint32_t n_cls_out = 1;

    // llama4 smallthinker
    uint32_t n_moe_layer_step = 0;
    uint32_t n_no_rope_layer_step = 4;
    uint32_t n_attn_temp_floor_scale = 8192;
    float f_attn_temp_scale = 0.1f;

    // gemma3n altup
    uint32_t n_altup = 4; // altup_num_inputs
    uint32_t i_altup_act = 0; // altup_active_idx
    uint32_t laurel_rank = 64;
    uint32_t n_embd_altup = 256;

    // needed for sentence-transformers dense layers
    uint32_t dense_2_feat_in = 0; // in_features of the 2_Dense
    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
    uint32_t dense_3_feat_in = 0; // in_features of the 3_Dense
    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense

    // xIELU
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;

    // qwen3vl deepstack
    uint32_t n_deepstack_layers = 0;

    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
    uint32_t dec_n_layer = 0;

    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
    enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

    // a value of n_pattern means that every n_pattern-th layer is dense (i.e. non-SWA)
    // dense_first indicates whether the pattern starts with a dense layer
    // note that if n_pattern == 0, all layers are SWA
    //           if n_pattern == 1, all layers are dense
    // example 1: n_pattern = 3, dense_first = false
    //   il == 0: swa
    //   il == 1: swa
    //   il == 2: dense
    //   il == 3: swa
    //   il == 4: swa
    //   il == 5: dense
    //   il == 6: swa
    //   etc ...
    // example 2: n_pattern = 2, dense_first = true
    //   il == 0: dense
    //   il == 1: swa
    //   il == 2: dense
    //   il == 3: swa
    //   etc ...
    void set_swa_pattern(uint32_t n_pattern, bool dense_first = false);
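    // e.g. (per the examples above) set_swa_pattern(3) makes layers 2, 5, 8, ... dense and the rest SWA;
    //      set_swa_pattern(2, true) makes layers 0, 2, 4, ... dense and the rest SWA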

    // return true if one of the layers is SWA
    bool is_swa_any() const;

    uint32_t n_head(uint32_t il = 0) const;

    uint32_t n_head_kv(uint32_t il = 0) const;

    uint32_t n_ff(uint32_t il = 0) const;

    uint32_t n_gqa(uint32_t il = 0) const;

    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

    // dimension of value embeddings across all k-v heads
    uint32_t n_embd_v_gqa(uint32_t il = 0) const;
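    // (for standard grouped-query attention these are typically n_embd_head_k * n_head_kv(il) and n_embd_head_v * n_head_kv(il), respectively)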

    // true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
    bool is_n_embd_k_gqa_variable() const;
    bool is_n_embd_v_gqa_variable() const;

    // return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
    uint32_t n_embd_k_gqa_max() const;
    uint32_t n_embd_v_gqa_max() const;

    // dimension of the rolling state embeddings
    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
    uint32_t n_embd_r() const;

    // dimension of the recurrent state embeddings
    uint32_t n_embd_s() const;

    // whether or not the given layer is recurrent (for hybrid models)
    bool is_recurrent(uint32_t il) const;

    uint32_t n_pos_per_embd() const;

    bool is_swa(uint32_t il) const;

    bool has_kv(uint32_t il) const;

    // number of layers for which has_kv() returns true
    uint32_t n_layer_kv() const;

    // note that this function uses different SWA parameters from those in the hparams
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
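
// Illustrative sketch (an assumption, not part of llama.cpp's API): the SWA pattern documented
// above for set_swa_pattern() reduces to a modular check on the layer index. The hypothetical
// helper below mirrors that documented behavior, purely for reference.
inline bool llama_hparams_example_layer_is_swa(uint32_t il, uint32_t n_pattern, bool dense_first) {
    if (n_pattern == 0) {
        return true; // n_pattern == 0: all layers are SWA
    }
    if (dense_first) {
        return il % n_pattern != 0; // dense layers at il == 0, n_pattern, 2*n_pattern, ...
    }
    return il % n_pattern < n_pattern - 1; // dense layers at il % n_pattern == n_pattern - 1
}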