| 1 | #pragma once |
| 2 | |
| 3 | #include "llama.h" |
| 4 | #include "llama-arch.h" |
| 5 | #include "llama-graph.h" |
| 6 | #include "llama-hparams.h" |
| 7 | #include "llama-memory.h" |
| 8 | #include "llama-vocab.h" |
| 9 | |
| 10 | #include <map> |
| 11 | #include <memory> |
| 12 | #include <string> |
| 13 | #include <unordered_map> |
| 14 | #include <vector> |
| 15 | |
| 16 | struct llama_cparams; |
| 17 | struct llama_ubatch; |
| 18 | struct llama_model_loader; |
| 19 | |
| 20 | // available models |
// Known model size/variant tags, set during hparam loading and used for logging
// and per-variant tweaks.
// NOTE(review): enumerator values are implicit and sequential — do not reorder
// or insert in the middle if any code relies on relative ordering; append new
// entries next to similarly-sized ones instead. TODO confirm no persisted use.
enum llm_type {
    LLM_TYPE_UNKNOWN,
    LLM_TYPE_14M,
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_140M,
    LLM_TYPE_160M,
    LLM_TYPE_190M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_256M,
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_350M,
    LLM_TYPE_360M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
    LLM_TYPE_558M,
    LLM_TYPE_700M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_950M,
    LLM_TYPE_0_3B,
    LLM_TYPE_0_5B,
    LLM_TYPE_0_6B,
    LLM_TYPE_1B,
    LLM_TYPE_1_2B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
    LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_6B,
    LLM_TYPE_2_8B,
    LLM_TYPE_2_9B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
    LLM_TYPE_6_9B,
    LLM_TYPE_7B,
    LLM_TYPE_8B,
    LLM_TYPE_9B,
    LLM_TYPE_11B,
    LLM_TYPE_12B,
    LLM_TYPE_13B,
    LLM_TYPE_14B,
    LLM_TYPE_15B,
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_27B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
    LLM_TYPE_35B,
    LLM_TYPE_36B,
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
    LLM_TYPE_120B,
    LLM_TYPE_142B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
    LLM_TYPE_405B,
    LLM_TYPE_671B,
    // named (non-numeric) sizes, e.g. GPT-2 style small/medium/large/xl
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
    LLM_TYPE_LARGE,
    LLM_TYPE_XL,
    // MoE variants: "A<x>B" denotes active parameters, "<n>x<m>B" denotes expert count x expert size
    LLM_TYPE_A1_7B,
    LLM_TYPE_A2_7B,
    LLM_TYPE_8x7B,
    LLM_TYPE_8x22B,
    LLM_TYPE_16x12B,
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_17B_16E,  // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
    LLM_TYPE_A13B,
    LLM_TYPE_7B_A1B,
    LLM_TYPE_8B_A1B, // lfm2moe
    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
    LLM_TYPE_100B_A6B,
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_230B_A10B, // Minimax M2
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big
    LLM_TYPE_355B_A32B, // GLM-4.5
    // Gemma 3n effective sizes
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
};
| 124 | |
// human-readable name for a RoPE scaling type (for logging / printing hparams)
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
| 126 | |
// Non-owning tensor handles for one PosNet layer (used by audio/TTS-style
// models, e.g. WavTokenizer — TODO confirm). All pointers are nullptr when the
// corresponding tensor is absent from the loaded model; *_b are biases.
struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1   = nullptr;
    struct ggml_tensor * norm1_b = nullptr;

    struct ggml_tensor * conv1   = nullptr;
    struct ggml_tensor * conv1_b = nullptr;

    struct ggml_tensor * norm2   = nullptr;
    struct ggml_tensor * norm2_b = nullptr;

    struct ggml_tensor * conv2   = nullptr;
    struct ggml_tensor * conv2_b = nullptr;

    // attention
    struct ggml_tensor * attn_norm   = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;

    struct ggml_tensor * attn_q   = nullptr;
    struct ggml_tensor * attn_q_b = nullptr;

    struct ggml_tensor * attn_k   = nullptr;
    struct ggml_tensor * attn_k_b = nullptr;

    struct ggml_tensor * attn_v   = nullptr;
    struct ggml_tensor * attn_v_b = nullptr;

    struct ggml_tensor * attn_o   = nullptr;
    struct ggml_tensor * attn_o_b = nullptr;

    // normalize
    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;
};
| 161 | |
// Non-owning tensor handles for one ConvNeXt block: depthwise conv (dw),
// norm, two pointwise convs (pw1/pw2) and a learnable scale (gamma).
// Pointers are nullptr when the tensor is absent; *_b are biases.
struct llama_layer_convnext {
    struct ggml_tensor * dw   = nullptr;
    struct ggml_tensor * dw_b = nullptr;

    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;

    struct ggml_tensor * pw1   = nullptr;
    struct ggml_tensor * pw1_b = nullptr;

    struct ggml_tensor * pw2   = nullptr;
    struct ggml_tensor * pw2_b = nullptr;

    struct ggml_tensor * gamma = nullptr;
};
| 177 | |
// Non-owning tensor handles for one short-convolution layer (LFM2-style
// models — TODO confirm): input projection, conv kernel, output projection.
struct llama_layer_shortconv {
    struct ggml_tensor * in_proj  = nullptr;
    struct ggml_tensor * conv     = nullptr;
    struct ggml_tensor * out_proj = nullptr;
};
| 183 | |
// Non-owning tensor handles for a NextN / multi-token-prediction layer
// (presumably GLM-4.5-style MTP — TODO confirm). nullptr when absent.
struct llama_layer_nextn {
    struct ggml_tensor * eh_proj          = nullptr;
    struct ggml_tensor * embed_tokens     = nullptr;
    struct ggml_tensor * enorm            = nullptr;
    struct ggml_tensor * hnorm            = nullptr;
    struct ggml_tensor * shared_head_head = nullptr;
    struct ggml_tensor * shared_head_norm = nullptr;
};
| 192 | |
// Per-layer weights for all supported architectures, unioned into one struct:
// each model populates only the members its architecture uses and leaves the
// rest nullptr. All pointers are non-owning views into loader-managed buffers.
// Naming conventions: *_b = bias, *_enc = encoder-side, *_cross = cross-attention,
// *_exps = (MoE) experts, *_shexp = shared expert.
struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm       = nullptr;
    struct ggml_tensor * attn_norm_b     = nullptr;
    struct ggml_tensor * attn_norm_2     = nullptr;
    struct ggml_tensor * attn_norm_2_b   = nullptr;
    struct ggml_tensor * attn_q_norm     = nullptr;
    struct ggml_tensor * attn_q_norm_b   = nullptr;
    struct ggml_tensor * attn_k_norm     = nullptr;
    struct ggml_tensor * attn_k_norm_b   = nullptr;
    struct ggml_tensor * attn_out_norm   = nullptr;
    struct ggml_tensor * attn_out_norm_b = nullptr;
    struct ggml_tensor * attn_q_a_norm   = nullptr;
    struct ggml_tensor * attn_kv_a_norm  = nullptr;
    struct ggml_tensor * attn_sub_norm   = nullptr;
    struct ggml_tensor * attn_post_norm  = nullptr;
    struct ggml_tensor * ffn_sub_norm    = nullptr;
    struct ggml_tensor * attn_norm_cross = nullptr;
    struct ggml_tensor * attn_norm_enc   = nullptr;
    struct ggml_tensor * ssm_norm        = nullptr;
    struct ggml_tensor * ssm_dt_norm     = nullptr;
    struct ggml_tensor * ssm_b_norm      = nullptr;
    struct ggml_tensor * ssm_c_norm      = nullptr;

    // attention
    struct ggml_tensor * wq        = nullptr;
    struct ggml_tensor * wk        = nullptr;
    struct ggml_tensor * wv        = nullptr;
    struct ggml_tensor * wo        = nullptr;
    struct ggml_tensor * wqkv      = nullptr; // fused QKV projection (alternative to separate wq/wk/wv)
    struct ggml_tensor * wq_a      = nullptr; // MLA-style low-rank Q/KV factors (wq_a/wq_b, wkv_a_mqa/wkv_b)
    struct ggml_tensor * wq_b      = nullptr;
    struct ggml_tensor * wkv_a_mqa = nullptr;
    struct ggml_tensor * wkv_b     = nullptr;
    struct ggml_tensor * wk_b      = nullptr;
    struct ggml_tensor * wv_b      = nullptr;
    struct ggml_tensor * wq_cross  = nullptr;
    struct ggml_tensor * wk_cross  = nullptr;
    struct ggml_tensor * wv_cross  = nullptr;
    struct ggml_tensor * wo_cross  = nullptr;
    struct ggml_tensor * wq_enc    = nullptr;
    struct ggml_tensor * wk_enc    = nullptr;
    struct ggml_tensor * wv_enc    = nullptr;
    struct ggml_tensor * wo_enc    = nullptr;

    // attention bias
    struct ggml_tensor * bq   = nullptr;
    struct ggml_tensor * bk   = nullptr;
    struct ggml_tensor * bv   = nullptr;
    struct ggml_tensor * bo   = nullptr;
    struct ggml_tensor * bqkv = nullptr;

    // relative position bias (T5-style)
    struct ggml_tensor * attn_rel_b       = nullptr;
    struct ggml_tensor * attn_rel_b_enc   = nullptr;
    struct ggml_tensor * attn_rel_b_cross = nullptr;

    // normalization
    struct ggml_tensor * ffn_norm         = nullptr;
    struct ggml_tensor * ffn_norm_b       = nullptr;
    struct ggml_tensor * ffn_post_norm    = nullptr;
    struct ggml_tensor * layer_out_norm   = nullptr;
    struct ggml_tensor * layer_out_norm_b = nullptr;
    struct ggml_tensor * ffn_norm_exps    = nullptr;
    struct ggml_tensor * ffn_norm_enc     = nullptr;

    // ff
    struct ggml_tensor * ffn_gate     = nullptr; // w1
    struct ggml_tensor * ffn_down     = nullptr; // w2
    struct ggml_tensor * ffn_up       = nullptr; // w3
    struct ggml_tensor * ffn_gate_enc = nullptr;
    struct ggml_tensor * ffn_down_enc = nullptr;
    struct ggml_tensor * ffn_up_enc   = nullptr;

    // ff MoE
    struct ggml_tensor * ffn_gate_inp    = nullptr; // expert router
    struct ggml_tensor * ffn_gate_exps   = nullptr;
    struct ggml_tensor * ffn_down_exps   = nullptr;
    struct ggml_tensor * ffn_up_exps     = nullptr;
    struct ggml_tensor * ffn_gate_inp_b  = nullptr;
    struct ggml_tensor * ffn_gate_exps_b = nullptr;
    struct ggml_tensor * ffn_down_exps_b = nullptr;
    struct ggml_tensor * ffn_up_exps_b   = nullptr;

    // ff shared expert (shexp)
    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
    struct ggml_tensor * ffn_gate_shexp     = nullptr;
    struct ggml_tensor * ffn_down_shexp     = nullptr;
    struct ggml_tensor * ffn_up_shexp       = nullptr;

    // ff adjugate experts (chexps)
    struct ggml_tensor * ffn_gate_chexps = nullptr;
    struct ggml_tensor * ffn_down_chexps = nullptr;
    struct ggml_tensor * ffn_up_chexps   = nullptr;

    // ff bias
    struct ggml_tensor * ffn_gate_b      = nullptr;
    struct ggml_tensor * ffn_down_b      = nullptr; // b2
    struct ggml_tensor * ffn_up_b        = nullptr; // b3
    struct ggml_tensor * ffn_act         = nullptr;
    struct ggml_tensor * ffn_exp_probs_b = nullptr;

    // mamba proj
    struct ggml_tensor * ssm_in  = nullptr;
    struct ggml_tensor * ssm_x   = nullptr;
    struct ggml_tensor * ssm_dt  = nullptr;
    struct ggml_tensor * ssm_out = nullptr;

    // mamba
    struct ggml_tensor * ssm_conv1d = nullptr;
    struct ggml_tensor * ssm_a      = nullptr;
    struct ggml_tensor * ssm_d      = nullptr;

    // mamba bias
    struct ggml_tensor * ssm_conv1d_b = nullptr;
    struct ggml_tensor * ssm_dt_b     = nullptr;

    // rwkv
    struct ggml_tensor * time_mix_w1         = nullptr;
    struct ggml_tensor * time_mix_w2         = nullptr;
    struct ggml_tensor * time_mix_lerp_x     = nullptr;
    struct ggml_tensor * time_mix_lerp_w     = nullptr;
    struct ggml_tensor * time_mix_lerp_k     = nullptr;
    struct ggml_tensor * time_mix_lerp_v     = nullptr;
    struct ggml_tensor * time_mix_lerp_r     = nullptr;
    struct ggml_tensor * time_mix_lerp_g     = nullptr;
    struct ggml_tensor * time_mix_lerp_fused = nullptr;

    struct ggml_tensor * time_mix_first        = nullptr;
    struct ggml_tensor * time_mix_decay        = nullptr;
    struct ggml_tensor * time_mix_decay_w1     = nullptr;
    struct ggml_tensor * time_mix_decay_w2     = nullptr;
    struct ggml_tensor * time_mix_key          = nullptr;
    struct ggml_tensor * time_mix_key_b        = nullptr;
    struct ggml_tensor * time_mix_value        = nullptr;
    struct ggml_tensor * time_mix_value_b      = nullptr;
    struct ggml_tensor * time_mix_receptance   = nullptr;
    struct ggml_tensor * time_mix_receptance_b = nullptr;
    struct ggml_tensor * time_mix_gate         = nullptr;

    // rwkv7
    struct ggml_tensor * time_mix_w0  = nullptr;
    struct ggml_tensor * time_mix_a0  = nullptr;
    struct ggml_tensor * time_mix_a1  = nullptr;
    struct ggml_tensor * time_mix_a2  = nullptr;
    struct ggml_tensor * time_mix_v0  = nullptr;
    struct ggml_tensor * time_mix_v1  = nullptr;
    struct ggml_tensor * time_mix_v2  = nullptr;
    struct ggml_tensor * time_mix_g1  = nullptr;
    struct ggml_tensor * time_mix_g2  = nullptr;
    struct ggml_tensor * time_mix_k_k = nullptr;
    struct ggml_tensor * time_mix_k_a = nullptr;
    struct ggml_tensor * time_mix_r_k = nullptr;

    struct ggml_tensor * time_mix_ln     = nullptr;
    struct ggml_tensor * time_mix_ln_b   = nullptr;
    struct ggml_tensor * time_mix_output = nullptr;

    struct ggml_tensor * channel_mix_lerp_k = nullptr;
    struct ggml_tensor * channel_mix_lerp_r = nullptr;

    struct ggml_tensor * channel_mix_key        = nullptr;
    struct ggml_tensor * channel_mix_receptance = nullptr;
    struct ggml_tensor * channel_mix_value      = nullptr;

    // long rope factors
    struct ggml_tensor * rope_long  = nullptr;
    struct ggml_tensor * rope_short = nullptr;
    struct ggml_tensor * rope_freqs = nullptr;

    // bitnet scale
    struct ggml_tensor * wq_scale       = nullptr;
    struct ggml_tensor * wk_scale       = nullptr;
    struct ggml_tensor * wv_scale       = nullptr;
    struct ggml_tensor * wo_scale       = nullptr;
    struct ggml_tensor * ffn_gate_scale = nullptr;
    struct ggml_tensor * ffn_up_scale   = nullptr;
    struct ggml_tensor * ffn_down_scale = nullptr;

    // altup & laurel (gemma3n)
    struct ggml_tensor * per_layer_inp_gate   = nullptr;
    struct ggml_tensor * per_layer_proj       = nullptr;
    struct ggml_tensor * per_layer_post_norm  = nullptr;
    struct ggml_tensor * altup_correct_coef   = nullptr;
    struct ggml_tensor * altup_correct_scale  = nullptr;
    struct ggml_tensor * altup_predict_coef   = nullptr;
    struct ggml_tensor * altup_router         = nullptr;
    struct ggml_tensor * altup_router_norm    = nullptr;
    struct ggml_tensor * laurel_l             = nullptr;
    struct ggml_tensor * laurel_r             = nullptr;
    struct ggml_tensor * laurel_post_norm     = nullptr;

    // openai-moe
    struct ggml_tensor * attn_sinks = nullptr;

    // cogvlm (vision-expert branch)
    struct ggml_tensor * visexp_attn_wqkv = nullptr;
    struct ggml_tensor * visexp_attn_wo   = nullptr;
    struct ggml_tensor * visexp_ffn_gate  = nullptr;
    struct ggml_tensor * visexp_ffn_down  = nullptr;
    struct ggml_tensor * visexp_ffn_up    = nullptr;

    // xIELU activation parameters for Apertus
    struct ggml_tensor * ffn_act_alpha_n = nullptr;
    struct ggml_tensor * ffn_act_alpha_p = nullptr;
    struct ggml_tensor * ffn_act_beta    = nullptr;
    struct ggml_tensor * ffn_act_eps     = nullptr;

    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;

    struct llama_layer_shortconv shortconv;

    struct llama_layer_nextn nextn;
};
| 409 | |
| 410 | struct llama_model { |
| 411 | llm_type type = LLM_TYPE_UNKNOWN; |
| 412 | llm_arch arch = LLM_ARCH_UNKNOWN; |
| 413 | |
| 414 | std::string name = "n/a" ; |
| 415 | |
| 416 | llama_hparams hparams = {}; |
| 417 | llama_vocab vocab; |
| 418 | |
| 419 | // for classifier models |
| 420 | std::vector<std::string> classifier_labels; |
| 421 | |
| 422 | struct ggml_tensor * tok_embd = nullptr; |
| 423 | struct ggml_tensor * type_embd = nullptr; |
| 424 | struct ggml_tensor * pos_embd = nullptr; |
| 425 | struct ggml_tensor * tok_norm = nullptr; |
| 426 | struct ggml_tensor * tok_norm_b = nullptr; |
| 427 | |
| 428 | struct ggml_tensor * output_norm = nullptr; |
| 429 | struct ggml_tensor * output_norm_b = nullptr; |
| 430 | struct ggml_tensor * output = nullptr; |
| 431 | struct ggml_tensor * output_b = nullptr; |
| 432 | struct ggml_tensor * output_norm_enc = nullptr; |
| 433 | |
| 434 | // classifier |
| 435 | struct ggml_tensor * cls = nullptr; |
| 436 | struct ggml_tensor * cls_b = nullptr; |
| 437 | struct ggml_tensor * cls_out = nullptr; |
| 438 | struct ggml_tensor * cls_out_b = nullptr; |
| 439 | |
| 440 | struct ggml_tensor * conv1d = nullptr; |
| 441 | struct ggml_tensor * conv1d_b = nullptr; |
| 442 | |
| 443 | // gemma3n altup |
| 444 | struct ggml_tensor * tok_embd_per_layer = nullptr; |
| 445 | struct ggml_tensor * altup_proj = nullptr; |
| 446 | struct ggml_tensor * altup_unembd_proj = nullptr; |
| 447 | struct ggml_tensor * per_layer_model_proj = nullptr; |
| 448 | struct ggml_tensor * per_layer_proj_norm = nullptr; |
| 449 | |
| 450 | std::vector<llama_layer> layers; |
| 451 | |
| 452 | //Dense linear projections for SentenceTransformers models like embeddinggemma |
| 453 | // For Sentence Transformers models structure see |
| 454 | // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models |
| 455 | struct ggml_tensor * dense_2_out_layers = nullptr; |
| 456 | struct ggml_tensor * dense_3_out_layers = nullptr; |
| 457 | |
| 458 | llama_model_params params; |
| 459 | |
| 460 | // gguf metadata |
| 461 | std::unordered_map<std::string, std::string> gguf_kv; |
| 462 | |
| 463 | // list of devices used in this model |
| 464 | std::vector<ggml_backend_dev_t> devices; |
| 465 | |
| 466 | // for quantize-stats only |
| 467 | std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name; |
| 468 | |
| 469 | int64_t t_load_us = 0; |
| 470 | int64_t t_start_us = 0; |
| 471 | |
| 472 | explicit llama_model(const struct llama_model_params & params); |
| 473 | ~llama_model(); |
| 474 | |
| 475 | void load_stats (llama_model_loader & ml); |
| 476 | void load_arch (llama_model_loader & ml); |
| 477 | void load_hparams(llama_model_loader & ml); |
| 478 | void load_vocab (llama_model_loader & ml); |
| 479 | bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback |
| 480 | |
| 481 | std::string arch_name() const; |
| 482 | std::string type_name() const; |
| 483 | |
| 484 | std::string desc() const; |
| 485 | |
| 486 | size_t size() const; // file size |
| 487 | size_t n_tensors() const; |
| 488 | size_t n_devices() const; |
| 489 | |
| 490 | std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const; |
| 491 | |
| 492 | // total number of parameters in the model |
| 493 | uint64_t n_elements() const; |
| 494 | |
| 495 | void print_info() const; |
| 496 | |
| 497 | ggml_backend_dev_t dev_layer(int il) const; |
| 498 | ggml_backend_dev_t dev_output() const; |
| 499 | |
| 500 | ggml_backend_buffer_type_t select_buft(int il) const; |
| 501 | |
| 502 | bool has_tensor_overrides() const; |
| 503 | |
| 504 | const struct ggml_tensor * get_tensor(const char * name) const; |
| 505 | |
| 506 | float get_rope_freq_base (const llama_cparams & cparams, int il) const; |
| 507 | float get_rope_freq_scale(const llama_cparams & cparams, int il) const; |
| 508 | |
| 509 | ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const; |
| 510 | |
| 511 | // TODO: move this to new llm_arch_model_i interface |
| 512 | llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const; |
| 513 | |
| 514 | // TODO: move this to new llm_arch_model_i interface |
| 515 | ggml_cgraph * build_graph(const llm_graph_params & params) const; |
| 516 | |
| 517 | private: |
| 518 | struct impl; |
| 519 | std::unique_ptr<impl> pimpl; |
| 520 | }; |
| 521 | |
// human-readable name for a model type tag (for logging / desc())
const char * llm_type_name(llm_type type);
| 523 | |
// For internal test use only: exposes the model's (name, tensor) list
// (see tensors_by_name) without transferring ownership.
// TODO: remove
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
| 527 | |