#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"

#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

#include "ggml-cpp.h"

#include "models/models.h"

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cstring>
#include <cmath>
#include <functional>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>

const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_6B: return "2.6B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_36B: return "36B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        case LLM_TYPE_7B_A1B: return "7B.A1B";
        case LLM_TYPE_8B_A1B: return "8B.A1B";
        case LLM_TYPE_16B_A1B: return "16B.A1B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_100B_A6B: return "100B.A6B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_230B_A10B: return "230B.A10B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_355B_A32B: return "355B.A32B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

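// note: std::map::at() throws std::out_of_range if the given scaling type is not
// present in the map above (e.g. LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)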
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

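    // each case below builds a small dummy op that mirrors how this weight is used at
    // runtime (with a representative row/token count, typically 512), so that
    // ggml_backend_dev_supports_op() can be queried for the given buffer type and device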
    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head = w->ne[1];
                const int64_t head_dim = hparams.ssm_d_inner / n_head;
                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd_inp = hparams.n_embd_inp();
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}

// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
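// each list is ordered by preference: select_weight_buft() below returns the first entry
// whose device supports the required op; the lists are built by make_cpu_buft_list() and
// make_gpu_buft_list() and later assigned to individual layers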

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip if this device reuses the plain CPU buffer type - it is added last as the final fallback
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    if (!no_host) {
        for (auto * dev : devices) {
            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
            if (buft) {
                buft_list.emplace_back(dev, buft);
                break;
            }
        }
    }

    // add extra buffer types
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_list.emplace_back(cpu_dev, *extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}

// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    // add the device extra buffer type (if any)
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");

    if (ggml_backend_dev_get_extra_bufts_fn) {
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
        while (extra_bufts && *extra_bufts) {
            buft_list.emplace_back(dev, *extra_bufts);
            ++extra_bufts;
        }
    }

    return buft_list;
}

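// private implementation of llama_model: holds the loader statistics, memory-mapped
// file handles, mlock state and the per-layer device/buffer-type assignments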
struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;

    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

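    // device (and buffer-type list) assignments for the input layer, the output layer
    // and each repeating layer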
    layer_dev dev_input  = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};

llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}

llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
    // for CLIP models, we only need to load tensors, no hparams
    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
        if (hparams.n_expert_groups > 1) {
            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
            GGML_ASSERT(hparams.n_group_used > 0);
            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
        }
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
        GGML_ASSERT(hparams.n_expert_groups == 0);
    }

    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
    std::fill(
        hparams.recurrent_layer_arr.begin(),
        hparams.recurrent_layer_arr.end(),
        llm_arch_is_recurrent(ml.get_arch()));

    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);

    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
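    // e.g. a GGUF rope scaling factor of 4.0 results in rope_freq_scale_train = 0.25,
    // while a missing or zero factor leaves the scale at 1.0 (no scaling)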

    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim

        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;

        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    // for differentiating model types
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // for classifier models
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
    if (!classifier_labels.empty()) {
        hparams.n_cls_out = classifier_labels.size();
    }

    // arch-specific KVs
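    // each case below reads its architecture-specific GGUF keys and maps layer/embedding/expert
    // counts to an LLM_TYPE_* label; the label is mostly informational, but a few cases also key
    // behavior off it (e.g. the Gemma attention scale further down)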
626 switch (arch) {
627 case LLM_ARCH_LLAMA:
628 {
629 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
630
631 if (hparams.n_expert == 8) {
632 switch (hparams.n_layer) {
633 case 32: type = LLM_TYPE_8x7B; break;
634 case 56: type = LLM_TYPE_8x22B; break;
635 default: type = LLM_TYPE_UNKNOWN;
636 }
637 } else {
638 switch (hparams.n_layer) {
639 case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
640 case 22: type = LLM_TYPE_1B; break;
641 case 26: type = LLM_TYPE_3B; break;
642 case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
643 case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
644 // granite uses a vocab with len 49152
645 case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
646 case 36: type = LLM_TYPE_8B; break; // granite
647 case 40: type = LLM_TYPE_13B; break;
648 case 48: type = LLM_TYPE_34B; break;
649 case 60: type = LLM_TYPE_30B; break;
650 case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
651 default: type = LLM_TYPE_UNKNOWN;
652 }
653 }
654 } break;
655 case LLM_ARCH_LLAMA4:
656 {
657 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
658 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
659 ml.get_key(kid: LLM_KV_INTERLEAVE_MOE_LAYER_STEP, result&: hparams.n_moe_layer_step);
660
661 const bool found_swa = ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa, required: false);
662 if (found_swa && hparams.n_swa == 0) {
663 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
664 hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
665 } else {
666 hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
667 hparams.n_swa = 8192;
668 hparams.set_swa_pattern(n_pattern: 4); // pattern: 3 chunked - 1 full
669 }
670
671 switch (hparams.n_expert) {
672 case 0: {
673 // MobileLLM (no MoE)
674 switch (hparams.n_embd) {
675 case 2048: type = LLM_TYPE_140M; break;
676 case 4096: type = LLM_TYPE_360M; break;
677 case 6144: type = LLM_TYPE_950M; break;
678 default: type = LLM_TYPE_UNKNOWN;
679 }
680 } break;
681 case 16: type = LLM_TYPE_17B_16E; break;
682 case 128: type = LLM_TYPE_17B_128E; break;
683 default: type = LLM_TYPE_UNKNOWN;
684 }
685
686 hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
687 } break;
688 case LLM_ARCH_ARCEE:
689 {
690 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
691
692 // Arcee uses the same structure as Llama
693 switch (hparams.n_layer) {
694 case 36: type = LLM_TYPE_4B; break;
695 default: type = LLM_TYPE_UNKNOWN;
696 }
697 } break;
698 case LLM_ARCH_DECI:
699 {
700 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
701 switch (hparams.n_layer) {
702 case 32: type = LLM_TYPE_7B; break;
703 case 80: type = LLM_TYPE_70B; break;
704 case 162: type = LLM_TYPE_405B; break;
705 default: type = LLM_TYPE_UNKNOWN;
706 }
707 } break;
708 case LLM_ARCH_MINICPM:
709 {
710 // Backward-compatible defaults for older MiniCPM GGUFs
711 hparams.f_embedding_scale = 12.0f;
712 hparams.f_residual_scale = 1.4f / sqrtf(x: float(hparams.n_layer));
713 hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
714
715 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
716
717 // Optional KV reads, override defaults if present in newer GGUF exports
718 ml.get_key(kid: LLM_KV_EMBEDDING_SCALE, result&: hparams.f_embedding_scale, /*required=*/false);
719 ml.get_key(kid: LLM_KV_RESIDUAL_SCALE, result&: hparams.f_residual_scale, /*required=*/false);
720 ml.get_key(kid: LLM_KV_LOGIT_SCALE, result&: hparams.f_logit_scale, /*required=*/false);
721
722 // MiniCPM uses rope by default, unlike Granite which uses it as a switch
723 hparams.rope_finetuned = true;
724
725 switch (hparams.n_layer) {
726 case 52: type = LLM_TYPE_1B; break;
727 case 40: type = LLM_TYPE_2B; break;
728 default: type = LLM_TYPE_UNKNOWN;
729 }
730 } break;
731 case LLM_ARCH_MINICPM3:
732 {
733 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
734 ml.get_key(kid: LLM_KV_ATTENTION_Q_LORA_RANK, result&: hparams.n_lora_q);
735 ml.get_key(kid: LLM_KV_ATTENTION_KV_LORA_RANK, result&: hparams.n_lora_kv);
736
737 switch (hparams.n_layer) {
738 case 62: type = LLM_TYPE_4B; break;
739 default: type = LLM_TYPE_UNKNOWN;
740 }
741 } break;
742 case LLM_ARCH_GROK:
743 {
744 // defaults for old GGUFs
745 hparams.yarn_beta_fast = 8.0f;
746 hparams.f_logit_scale = 0.5773502691896257f;
747 hparams.f_embedding_scale = 78.38367176906169f;
748 hparams.f_attn_out_scale = 0.08838834764831845f;
749 hparams.f_attn_logit_softcapping = 30.0f;
750 hparams.f_router_logit_softcapping = 30.0f;
751 // no final_logit_softcapping in grok-1
752 hparams.f_final_logit_softcapping = 0.0f;
753
754 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
755 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp, required: false);
756 ml.get_key(kid: LLM_KV_LOGIT_SCALE, result&: hparams.f_logit_scale, required: false);
757 ml.get_key(kid: LLM_KV_EMBEDDING_SCALE, result&: hparams.f_embedding_scale, required: false);
758 ml.get_key(kid: LLM_KV_ATTENTION_OUTPUT_SCALE, result&: hparams.f_attn_out_scale, required: false);
759 ml.get_key(kid: LLM_KV_ATTN_LOGIT_SOFTCAPPING, result&: hparams.f_attn_logit_softcapping, required: false);
760 ml.get_key(kid: LLM_KV_ROUTER_LOGIT_SOFTCAPPING, result&: hparams.f_router_logit_softcapping, required: false);
761 ml.get_key(kid: LLM_KV_FINAL_LOGIT_SOFTCAPPING, result&: hparams.f_final_logit_softcapping, required: false);
762
763 ml.get_key(kid: LLM_KV_ATTENTION_TEMPERATURE_LENGTH, result&: hparams.attn_temp_length, required: false);
764 ml.get_key(kid: LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, result&: hparams.yarn_ext_factor, required: false);
765 ml.get_key(kid: LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, result&: hparams.yarn_attn_factor, required: false);
766 ml.get_key(kid: LLM_KV_ROPE_SCALING_YARN_BETA_FAST, result&: hparams.yarn_beta_fast, required: false);
767 ml.get_key(kid: LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, result&: hparams.yarn_beta_slow, required: false);
768
769 switch (hparams.n_layer) {
770 case 64: type = LLM_TYPE_314B; break;
771 default: type = LLM_TYPE_UNKNOWN;
772 }
773 } break;
774 case LLM_ARCH_FALCON:
775 {
776 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
777
778 switch (hparams.n_layer) {
779 case 32: type = LLM_TYPE_7B; break;
780 case 60: type = LLM_TYPE_40B; break;
781 default: type = LLM_TYPE_UNKNOWN;
782 }
783 } break;
784 case LLM_ARCH_BAICHUAN:
785 {
786 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
787 switch (hparams.n_layer) {
788 case 32: type = LLM_TYPE_7B; break;
789 case 40: type = LLM_TYPE_13B; break;
790 default: type = LLM_TYPE_UNKNOWN;
791 }
792
793 if (type == LLM_TYPE_13B) {
794 // TODO: become GGUF KV parameter
795 hparams.f_max_alibi_bias = 8.0f;
796 }
797 } break;
798 case LLM_ARCH_STARCODER:
799 {
800 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
801 switch (hparams.n_layer) {
802 case 24: type = LLM_TYPE_1B; break;
803 case 36: type = LLM_TYPE_3B; break;
804 case 42: type = LLM_TYPE_7B; break;
805 case 40: type = LLM_TYPE_15B; break;
806 default: type = LLM_TYPE_UNKNOWN;
807 }
808 } break;
809 case LLM_ARCH_REFACT:
810 {
811 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
812 switch (hparams.n_layer) {
813 case 32: type = LLM_TYPE_1B; break;
814 default: type = LLM_TYPE_UNKNOWN;
815 }
816
817 // TODO: become GGUF KV parameter
818 hparams.f_max_alibi_bias = 8.0f;
819 } break;
820 case LLM_ARCH_BERT:
821 {
822 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
823 ml.get_key(kid: LLM_KV_ATTENTION_CAUSAL, result&: hparams.causal_attn);
824 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type, required: false);
825
826 switch (hparams.n_layer) {
827 case 3:
828 type = LLM_TYPE_17M; break; // bge-micro
829 case 6:
830 type = LLM_TYPE_22M; break; // MiniLM-L6
831 case 12:
832 switch (hparams.n_embd) {
833 case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
834 case 768: type = LLM_TYPE_109M; break; // bge-base
835 default: type = LLM_TYPE_UNKNOWN;
836 } break;
837 case 24:
838 type = LLM_TYPE_335M; break; // bge-large
839 default: type = LLM_TYPE_UNKNOWN;
840 }
841 } break;
842 case LLM_ARCH_JINA_BERT_V2:
843 {
844 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
845 ml.get_key(kid: LLM_KV_ATTENTION_CAUSAL, result&: hparams.causal_attn);
846 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type, required: false);
847 hparams.f_max_alibi_bias = 8.0f;
848
849 switch (hparams.n_layer) {
850 case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
851 case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
852 default: type = LLM_TYPE_UNKNOWN;
853 }
854 } break;
855 case LLM_ARCH_JINA_BERT_V3:
856 {
857 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
858 ml.get_key(kid: LLM_KV_ATTENTION_CAUSAL, result&: hparams.causal_attn);
859 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type, required: false);
860
861 switch (hparams.n_layer) {
862 case 24:
863 type = LLM_TYPE_558M; break;
864 default: type = LLM_TYPE_UNKNOWN;
865 }
866 } break;
867 case LLM_ARCH_NOMIC_BERT:
868 case LLM_ARCH_NOMIC_BERT_MOE:
869 {
870 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
871 ml.get_key(kid: LLM_KV_ATTENTION_CAUSAL, result&: hparams.causal_attn);
872 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type);
873 ml.get_key(kid: LLM_KV_MOE_EVERY_N_LAYERS, result&: hparams.moe_every_n_layers, required: 0);
874
875 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
876 if (arch == LLM_ARCH_NOMIC_BERT) {
877 type = LLM_TYPE_137M;
878 } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
879 type = LLM_TYPE_475M;
880 }
881 }
882 } break;
883 case LLM_ARCH_NEO_BERT:
884 {
885 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
886 ml.get_key(kid: LLM_KV_ATTENTION_CAUSAL, result&: hparams.causal_attn);
887 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type);
888
889 if (hparams.n_layer == 28) {
890 type = LLM_TYPE_250M;
891 }
892 } break;
893 case LLM_ARCH_BLOOM:
894 {
895 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
896
897 switch (hparams.n_layer) {
898 case 24: type = LLM_TYPE_1B; break;
899 case 30:
900 switch (hparams.n_embd) {
901 case 2560: type = LLM_TYPE_3B; break;
902 case 4096: type = LLM_TYPE_7B; break;
903 default: type = LLM_TYPE_UNKNOWN;
904 } break;
905 default: type = LLM_TYPE_UNKNOWN;
906 }
907
908 // TODO: become GGUF KV parameter
909 hparams.f_max_alibi_bias = 8.0f;
910 } break;
911 case LLM_ARCH_MPT:
912 {
913 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
914 ml.get_key(kid: LLM_KV_ATTENTION_CLAMP_KQV, result&: hparams.f_clamp_kqv, required: false);
915 ml.get_key(kid: LLM_KV_ATTENTION_MAX_ALIBI_BIAS, result&: hparams.f_max_alibi_bias);
916
917 switch (hparams.n_layer) {
918 case 32: type = LLM_TYPE_7B; break;
919 case 48: type = LLM_TYPE_30B; break;
920 default: type = LLM_TYPE_UNKNOWN;
921 }
922 } break;
923 case LLM_ARCH_STABLELM:
924 {
925 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
926
927 switch (hparams.n_layer) {
928 case 24: type = LLM_TYPE_1B; break;
929 case 32: type = LLM_TYPE_3B; break;
930 case 40: type = LLM_TYPE_12B; break;
931 default: type = LLM_TYPE_UNKNOWN;
932 }
933 } break;
934 case LLM_ARCH_QWEN:
935 {
936 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
937
938 switch (hparams.n_layer) {
939 case 32: type = LLM_TYPE_7B; break;
940 case 40: type = LLM_TYPE_13B; break;
941 default: type = LLM_TYPE_UNKNOWN;
942 }
943 } break;
944 case LLM_ARCH_QWEN2VL:
945 {
946 ml.get_key_or_arr(kid: LLM_KV_ROPE_DIMENSION_SECTIONS, result&: hparams.rope_sections, n: 4, required: true);
947 }
948 // fall through
949 case LLM_ARCH_QWEN2:
950 {
951 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type, required: false);
952 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
953 switch (hparams.n_layer) {
954 case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
955 case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
956 case 32: type = LLM_TYPE_7B; break;
957 case 36: type = LLM_TYPE_3B; break;
958 case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
959 case 48: type = LLM_TYPE_14B; break;
960 case 64: type = LLM_TYPE_32B; break;
961 case 80: type = LLM_TYPE_70B; break;
962 default: type = LLM_TYPE_UNKNOWN;
963 }
964 } break;
965 case LLM_ARCH_DREAM:
966 {
967 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
968 // Dream models are primarily 7B with 28 layers
969 switch (hparams.n_layer) {
970 case 28:
971 type = LLM_TYPE_7B;
972 break;
973 default:
974 type = LLM_TYPE_UNKNOWN;
975 }
976 // Set non-causal attention for diffusion models
977 hparams.causal_attn = false;
978 }
979 break;
980 case LLM_ARCH_LLADA:
981 {
982 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
983 // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
984 switch (hparams.n_layer) {
985 case 32:
986 type = LLM_TYPE_8B;
987 break;
988 default:
989 type = LLM_TYPE_UNKNOWN;
990 }
991 // Set non-causal attention for diffusion models
992 hparams.causal_attn = false;
993 }
994 break;
995 case LLM_ARCH_LLADA_MOE:
996 {
997 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp, required: false);
998
999 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1000 // diffusion language model uses non-causal attention
1001 hparams.causal_attn = false;
1002 switch (hparams.n_layer) {
1003 case 16: type = LLM_TYPE_A1_7B; break;
1004 default: type = LLM_TYPE_UNKNOWN;
1005 }
1006 } break;
1007 case LLM_ARCH_QWEN2MOE:
1008 {
1009 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp, required: false);
1010 ml.get_key(kid: LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, result&: hparams.n_ff_shexp, required: false);
1011
1012 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1013 switch (hparams.n_layer) {
1014 case 24: type = LLM_TYPE_A2_7B; break;
1015 case 28: type = LLM_TYPE_57B_A14B; break;
1016 default: type = LLM_TYPE_UNKNOWN;
1017 }
1018 } break;
1019 case LLM_ARCH_QWEN3:
1020 {
1021 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type, required: false);
1022 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1023 switch (hparams.n_layer) {
1024 case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
1025 case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1026 case 40: type = LLM_TYPE_14B; break;
1027 case 64: type = LLM_TYPE_32B; break;
1028 default: type = LLM_TYPE_UNKNOWN;
1029 }
1030 } break;
1031 case LLM_ARCH_QWEN3VL:
1032 {
1033 ml.get_key(kid: LLM_KV_NUM_DEEPSTACK_LAYERS, result&: hparams.n_deepstack_layers, required: false);
1034 ml.get_key_or_arr(kid: LLM_KV_ROPE_DIMENSION_SECTIONS, result&: hparams.rope_sections, n: 4, required: true);
1035 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1036 switch (hparams.n_layer) {
1037 case 28: type = LLM_TYPE_1_7B; break;
1038 case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
1039 case 64: type = LLM_TYPE_32B; break;
1040 default: type = LLM_TYPE_UNKNOWN;
1041 }
1042 } break;
1043 case LLM_ARCH_QWEN3MOE:
1044 {
1045 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp, required: false);
1046
1047 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1048 switch (hparams.n_layer) {
1049 case 48: type = LLM_TYPE_30B_A3B; break;
1050 case 94: type = LLM_TYPE_235B_A22B; break;
1051 default: type = LLM_TYPE_UNKNOWN;
1052 }
1053 } break;
1054 case LLM_ARCH_QWEN3VLMOE:
1055 {
1056 ml.get_key(kid: LLM_KV_NUM_DEEPSTACK_LAYERS, result&: hparams.n_deepstack_layers, required: false);
1057 ml.get_key_or_arr(kid: LLM_KV_ROPE_DIMENSION_SECTIONS, result&: hparams.rope_sections, n: 4, required: true);
1058 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp, required: false);
1059 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1060 switch (hparams.n_layer) {
1061 case 48: type = LLM_TYPE_30B_A3B; break;
1062 case 94: type = LLM_TYPE_235B_A22B; break;
1063 default: type = LLM_TYPE_UNKNOWN;
1064 }
1065 } break;
1066 case LLM_ARCH_PHI2:
1067 {
1068 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1069
1070 switch (hparams.n_layer) {
1071 case 24: type = LLM_TYPE_1B; break;
1072 case 32: type = LLM_TYPE_3B; break;
1073 default: type = LLM_TYPE_UNKNOWN;
1074 }
1075 } break;
1076 case LLM_ARCH_PHI3:
1077 {
1078 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1079
1080 switch (hparams.n_layer) {
1081 case 24: type = LLM_TYPE_1B; break;
1082 case 32: type = LLM_TYPE_3B; break;
1083 case 40: type = LLM_TYPE_14B; break;
1084 default: type = LLM_TYPE_UNKNOWN;
1085 }
1086
1087 const bool found_swa = ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa, required: false);
1088
1089 if (found_swa && hparams.n_swa > 0) {
1090 LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
1091 __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
1092
1093 // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
1094 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1095
1096 hparams.n_swa = 0;
1097 hparams.set_swa_pattern(n_pattern: 1);
1098 }
1099 } break;
1100 case LLM_ARCH_PHIMOE:
1101 {
1102 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1103
1104 switch (hparams.n_layer) {
1105 case 32: type = LLM_TYPE_16x3_8B; break;
1106 default: type = LLM_TYPE_UNKNOWN;
1107 }
1108 } break;
1109 case LLM_ARCH_PLAMO:
1110 {
1111 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1112
1113 switch (hparams.n_layer) {
1114 case 40: type = LLM_TYPE_13B; break;
1115 default: type = LLM_TYPE_UNKNOWN;
1116 }
1117 } break;
1118 case LLM_ARCH_PLAMO2:
1119 {
1120 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1121
1122 // Load Mamba SSM parameters
1123 ml.get_key(kid: LLM_KV_SSM_CONV_KERNEL, result&: hparams.ssm_d_conv);
1124 ml.get_key(kid: LLM_KV_SSM_INNER_SIZE, result&: hparams.ssm_d_inner);
1125 ml.get_key(kid: LLM_KV_SSM_STATE_SIZE, result&: hparams.ssm_d_state);
1126 ml.get_key(kid: LLM_KV_SSM_TIME_STEP_RANK, result&: hparams.ssm_dt_rank);
1127 ml.get_key(kid: LLM_KV_SSM_GROUP_COUNT, result&: hparams.ssm_n_group);
1128
1129 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1130 hparams.recurrent_layer_arr[i] = hparams.n_head_kv(il: i) == 0;
1131 }
1132
1133 switch (hparams.n_layer) {
1134 case 16: type = LLM_TYPE_1B; break;
1135 case 32:
1136 if (hparams.n_embd == 2048) {
1137 type = LLM_TYPE_2B;
1138 } else if (hparams.n_embd == 4096) {
1139 type = LLM_TYPE_8B;
1140 }
1141 break;
1142 default: type = LLM_TYPE_UNKNOWN;
1143 }
1144
1145 // Load attention parameters
1146 ml.get_key(kid: LLM_KV_ATTENTION_KEY_LENGTH, result&: hparams.n_embd_head_k, required: false);
1147 ml.get_key(kid: LLM_KV_ATTENTION_VALUE_LENGTH, result&: hparams.n_embd_head_v, required: false);
1148 } break;
1149 case LLM_ARCH_GPT2:
1150 {
1151 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1152 switch (hparams.n_layer) {
1153 case 12: type = LLM_TYPE_SMALL; break;
1154 case 24: type = LLM_TYPE_MEDIUM; break;
1155 case 36: type = LLM_TYPE_LARGE; break;
1156 case 48: type = LLM_TYPE_XL; break;
1157 default: type = LLM_TYPE_UNKNOWN;
1158 }
1159 } break;
1160 case LLM_ARCH_CODESHELL:
1161 {
1162 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1163 switch (hparams.n_layer) {
1164 case 42: type = LLM_TYPE_7B; break;
1165 default: type = LLM_TYPE_UNKNOWN;
1166 }
1167 } break;
1168 case LLM_ARCH_ORION:
1169 {
1170 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1171
1172 switch (hparams.n_layer) {
1173 case 40: type = LLM_TYPE_14B; break;
1174 default: type = LLM_TYPE_UNKNOWN;
1175 }
1176 } break;
1177 case LLM_ARCH_INTERNLM2:
1178 {
1179 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1180 switch (hparams.n_layer) {
1181 case 32: type = LLM_TYPE_7B; break;
1182 case 48: type = LLM_TYPE_20B; break;
1183 default: type = LLM_TYPE_UNKNOWN;
1184 }
1185 } break;
1186 case LLM_ARCH_GEMMA:
1187 {
1188 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1189
1190 switch (hparams.n_layer) {
1191 case 18: type = LLM_TYPE_2B; break;
1192 case 28: type = LLM_TYPE_7B; break;
1193 default: type = LLM_TYPE_UNKNOWN;
1194 }
1195 } break;
1196 case LLM_ARCH_GEMMA2:
1197 {
1198 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1199 hparams.n_swa = 4096; // default value of gemma 2
1200 hparams.set_swa_pattern(n_pattern: 2);
1201 hparams.attn_soft_cap = true;
1202
1203 ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa, required: false);
1204 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1205 ml.get_key(kid: LLM_KV_ATTN_LOGIT_SOFTCAPPING, result&: hparams.f_attn_logit_softcapping, required: false);
1206 ml.get_key(kid: LLM_KV_FINAL_LOGIT_SOFTCAPPING, result&: hparams.f_final_logit_softcapping, required: false);
1207
1208 switch (hparams.n_layer) {
1209 case 26: type = LLM_TYPE_2B; break;
1210 case 42: type = LLM_TYPE_9B; break;
1211 case 46: type = LLM_TYPE_27B; break;
1212 default: type = LLM_TYPE_UNKNOWN;
1213 }
1214
1215 // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
1216 hparams.f_attention_scale = type == LLM_TYPE_27B
1217 ? 1.0f / std::sqrt(x: float(hparams.n_embd / hparams.n_head(il: 0)))
1218 : 1.0f / std::sqrt(x: float(hparams.n_embd_head_k));
1219 } break;
1220 case LLM_ARCH_GEMMA3:
1221 {
1222 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1223 hparams.set_swa_pattern(n_pattern: 6);
1224
1225 hparams.rope_freq_base_train_swa = 10000.0f;
1226 hparams.rope_freq_scale_train_swa = 1.0f;
1227
1228 ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa);
1229 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1230
1231 switch (hparams.n_layer) {
1232 case 18: type = LLM_TYPE_270M; break;
1233 case 26: type = LLM_TYPE_1B; break;
1234 case 34: type = LLM_TYPE_4B; break;
1235 case 48: type = LLM_TYPE_12B; break;
1236 case 62: type = LLM_TYPE_27B; break;
1237 default: type = LLM_TYPE_UNKNOWN;
1238 }
1239
1240 // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
1241 hparams.f_attention_scale = type == LLM_TYPE_27B
1242 ? 1.0f / std::sqrt(x: float(hparams.n_embd / hparams.n_head(il: 0)))
1243 : 1.0f / std::sqrt(x: float(hparams.n_embd_head_k));
1244 } break;
1245 case LLM_ARCH_GEMMA3N:
1246 {
1247 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1248 hparams.set_swa_pattern(n_pattern: 5);
1249
1250 hparams.n_layer_kv_from_start = 20;
1251 hparams.rope_freq_base_train_swa = 10000.0f;
1252 hparams.rope_freq_scale_train_swa = 1.0f;
1253 hparams.f_attention_scale = 1.0f;
1254
1255 ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa);
1256 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1257
1258 switch (hparams.n_layer) {
1259 case 30: type = LLM_TYPE_E2B; break;
1260 case 35: type = LLM_TYPE_E4B; break;
1261 default: type = LLM_TYPE_UNKNOWN;
1262 }
1263 } break;
1264 case LLM_ARCH_GEMMA_EMBEDDING:
1265 {
1266 hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
1267 hparams.set_swa_pattern(n_pattern: 6);
1268
1269 hparams.causal_attn = false; // embeddings do not use causal attention
1270 hparams.rope_freq_base_train_swa = 10000.0f;
1271 hparams.rope_freq_scale_train_swa = 1.0f;
1272
1273 ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa);
1274 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1275 ml.get_key(kid: LLM_KV_POOLING_TYPE, result&: hparams.pooling_type);
1276
1277 //applied only if model converted with --sentence-transformers-dense-modules
1278 ml.get_key(kid: LLM_KV_DENSE_2_FEAT_IN, result&: hparams.dense_2_feat_in, required: false);
1279 ml.get_key(kid: LLM_KV_DENSE_2_FEAT_OUT, result&: hparams.dense_2_feat_out, required: false);
1280 ml.get_key(kid: LLM_KV_DENSE_3_FEAT_IN, result&: hparams.dense_3_feat_in, required: false);
1281 ml.get_key(kid: LLM_KV_DENSE_3_FEAT_OUT, result&: hparams.dense_3_feat_out, required: false);
1282
1283 GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
1284 GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
1285
1286 switch (hparams.n_layer) {
1287 case 24: type = LLM_TYPE_0_3B; break;
1288 default: type = LLM_TYPE_UNKNOWN;
1289 }
1290 hparams.f_attention_scale = 1.0f / std::sqrt(x: float(hparams.n_embd_head_k));
1291
1292 } break;
1293 case LLM_ARCH_STARCODER2:
1294 {
1295 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1296 switch (hparams.n_layer) {
1297 case 30: type = LLM_TYPE_3B; break;
1298 case 32: type = LLM_TYPE_7B; break;
1299 case 40: type = LLM_TYPE_15B; break;
1300 case 52: type = LLM_TYPE_20B; break; // granite
1301 case 88: type = LLM_TYPE_34B; break; // granite
1302 default: type = LLM_TYPE_UNKNOWN;
1303 }
1304 } break;
1305 case LLM_ARCH_MAMBA:
1306 {
1307 ml.get_key(kid: LLM_KV_SSM_CONV_KERNEL, result&: hparams.ssm_d_conv);
1308 ml.get_key(kid: LLM_KV_SSM_INNER_SIZE, result&: hparams.ssm_d_inner);
1309 ml.get_key(kid: LLM_KV_SSM_STATE_SIZE, result&: hparams.ssm_d_state);
1310 ml.get_key(kid: LLM_KV_SSM_TIME_STEP_RANK, result&: hparams.ssm_dt_rank);
1311 ml.get_key(kid: LLM_KV_SSM_DT_B_C_RMS, result&: hparams.ssm_dt_b_c_rms, required: false);
1312
1313 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1314
1315 switch (hparams.n_layer) {
1316 case 24:
1317 switch (hparams.n_embd) {
1318 case 768: type = LLM_TYPE_SMALL; break;
1319 default: type = LLM_TYPE_UNKNOWN;
1320 } break;
1321 case 48:
1322 switch (hparams.n_embd) {
1323 case 1024: type = LLM_TYPE_MEDIUM; break;
1324 case 1536: type = LLM_TYPE_LARGE; break;
1325 case 2048: type = LLM_TYPE_XL; break;
1326 default: type = LLM_TYPE_UNKNOWN;
1327 } break;
1328 case 64:
1329 switch (hparams.n_embd) {
1330 case 2560: type = LLM_TYPE_3B; break;
1331 default: type = LLM_TYPE_UNKNOWN;
1332 } break;
1333 default: type = LLM_TYPE_UNKNOWN;
1334 }
1335 } break;
1336 case LLM_ARCH_MAMBA2:
1337 {
1338 ml.get_key(kid: LLM_KV_SSM_CONV_KERNEL, result&: hparams.ssm_d_conv);
1339 ml.get_key(kid: LLM_KV_SSM_INNER_SIZE, result&: hparams.ssm_d_inner);
1340 ml.get_key(kid: LLM_KV_SSM_STATE_SIZE, result&: hparams.ssm_d_state);
1341 ml.get_key(kid: LLM_KV_SSM_TIME_STEP_RANK, result&: hparams.ssm_dt_rank);
1342 ml.get_key(kid: LLM_KV_SSM_GROUP_COUNT, result&: hparams.ssm_n_group);
1343
1344 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1345
1346 switch (hparams.n_layer) {
1347 case 24:
1348 switch (hparams.n_embd) {
1349 case 768: type = LLM_TYPE_SMALL; break;
1350 default: type = LLM_TYPE_UNKNOWN;
1351 } break;
1352 case 48:
1353 switch (hparams.n_embd) {
1354 case 1024: type = LLM_TYPE_MEDIUM; break;
1355 case 1536: type = LLM_TYPE_LARGE; break;
1356 case 2048: type = LLM_TYPE_XL; break;
1357 default: type = LLM_TYPE_UNKNOWN;
1358 } break;
1359 case 64:
1360 switch (hparams.n_embd) {
1361 case 2560: type = LLM_TYPE_3B; break;
1362 case 4096: type = LLM_TYPE_7B; break;
1363 default: type = LLM_TYPE_UNKNOWN;
1364 } break;
1365 default: type = LLM_TYPE_UNKNOWN;
1366 }
1367 } break;
1368 case LLM_ARCH_JAMBA:
1369 {
1370 ml.get_key(kid: LLM_KV_SSM_CONV_KERNEL, result&: hparams.ssm_d_conv);
1371 ml.get_key(kid: LLM_KV_SSM_INNER_SIZE, result&: hparams.ssm_d_inner);
1372 ml.get_key(kid: LLM_KV_SSM_STATE_SIZE, result&: hparams.ssm_d_state);
1373 ml.get_key(kid: LLM_KV_SSM_TIME_STEP_RANK, result&: hparams.ssm_dt_rank);
1374
1375 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1376
1377 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1378 hparams.recurrent_layer_arr[i] = hparams.n_head_kv(il: i) == 0;
1379 }
1380
1381 switch (hparams.n_layer) {
                        // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
1383 case 12: // 900M 8x???M
1384 case 32: // 51B 16x?B
1385 default: type = LLM_TYPE_UNKNOWN;
1386 }
1387 } break;
1388 case LLM_ARCH_XVERSE:
1389 {
1390 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1391 switch (hparams.n_layer) {
1392 case 32: type = LLM_TYPE_7B; break;
1393 case 40: type = LLM_TYPE_13B; break;
1394 case 80: type = LLM_TYPE_65B; break;
1395 default: type = LLM_TYPE_UNKNOWN;
1396 }
1397 } break;
1398 case LLM_ARCH_COMMAND_R:
1399 {
1400 ml.get_key(kid: LLM_KV_LOGIT_SCALE, result&: hparams.f_logit_scale);
1401 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1402 switch (hparams.n_layer) {
1403 case 40: type = LLM_TYPE_35B; break;
1404 default: type = LLM_TYPE_UNKNOWN;
1405 }
1406 } break;
1407 case LLM_ARCH_COHERE2:
1408 {
1409 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1410 hparams.set_swa_pattern(n_pattern: 4);
1411
1412 ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa);
1413 ml.get_key(kid: LLM_KV_LOGIT_SCALE, result&: hparams.f_logit_scale);
1414 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1415 switch (hparams.n_layer) {
1416 case 32: type = LLM_TYPE_8B; break;
1417 default: type = LLM_TYPE_UNKNOWN;
1418 }
1419 } break;
1420 case LLM_ARCH_DBRX:
1421 {
1422 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1423 ml.get_key(kid: LLM_KV_ATTENTION_CLAMP_KQV, result&: hparams.f_clamp_kqv);
1424
1425 switch (hparams.n_layer) {
1426 case 40: type = LLM_TYPE_16x12B; break;
1427 default: type = LLM_TYPE_UNKNOWN;
1428 }
1429 } break;
1430 case LLM_ARCH_OLMO:
1431 {
1432 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1433 ml.get_key(kid: LLM_KV_ATTENTION_CLAMP_KQV, result&: hparams.f_clamp_kqv, required: false);
1434
1435 switch (hparams.n_layer) {
1436 case 22: type = LLM_TYPE_1B; break;
1437 case 32: type = LLM_TYPE_7B; break;
1438 case 80: type = LLM_TYPE_70B; break;
1439 default: type = LLM_TYPE_UNKNOWN;
1440 }
1441 } break;
1442 case LLM_ARCH_OLMO2:
1443 {
1444 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1445
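                // sliding-window attention is optional here: enable it only when the key is present and non-zero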
1446 const bool found_swa = ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa, required: false);
1447 if (found_swa && hparams.n_swa > 0) {
1448 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1449 hparams.set_swa_pattern(n_pattern: 4);
1450 } else {
1451 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1452 }
1453
1454 switch (hparams.n_layer) {
1455 case 16: type = LLM_TYPE_1B; break;
1456 case 32: type = LLM_TYPE_7B; break;
1457 case 40: type = LLM_TYPE_13B; break;
1458 case 64: type = LLM_TYPE_32B; break;
1459 default: type = LLM_TYPE_UNKNOWN;
1460 }
1461 } break;
1462 case LLM_ARCH_SEED_OSS:
1463 {
1464 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1465 switch (hparams.n_layer) {
1466 case 64: type = LLM_TYPE_36B; break;
1467 default: type = LLM_TYPE_UNKNOWN;
1468 }
1469 } break;
1470 case LLM_ARCH_OLMOE:
1471 {
1472 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1473 switch (hparams.n_layer) {
1474 case 16: type = LLM_TYPE_A1_7B; break;
1475 default: type = LLM_TYPE_UNKNOWN;
1476 }
1477 } break;
1478 case LLM_ARCH_OPENELM:
1479 {
1480 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1481
1482 switch (hparams.n_layer) {
1483 case 16: type = LLM_TYPE_270M; break;
1484 case 20: type = LLM_TYPE_450M; break;
1485 case 28: type = LLM_TYPE_1B; break;
1486 case 36: type = LLM_TYPE_3B; break;
1487 default: type = LLM_TYPE_UNKNOWN;
1488 }
1489 } break;
1490 case LLM_ARCH_GPTNEOX:
1491 {
1492 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1493 ml.get_key(kid: LLM_KV_USE_PARALLEL_RESIDUAL, result&: hparams.use_par_res);
1494 switch (hparams.n_layer) {
1495 case 6:
1496 switch (hparams.n_ff()) {
1497 case 512: type = LLM_TYPE_14M; break;
1498 case 2048: type = LLM_TYPE_70M; break;
1499 default: type = LLM_TYPE_UNKNOWN;
1500 } break;
1501 case 12:
1502 switch (hparams.n_ff()) {
1503 case 3072: type = LLM_TYPE_160M; break;
1504 default: type = LLM_TYPE_UNKNOWN;
1505 } break;
1506 case 16:
1507 switch (hparams.n_ff()) {
1508 case 8192: type = LLM_TYPE_1B; break;
1509 default: type = LLM_TYPE_UNKNOWN;
1510 } break;
1511 case 24:
1512 switch (hparams.n_ff()) {
1513 case 4096: type = LLM_TYPE_410M; break;
1514 case 8192: type = LLM_TYPE_1_4B; break;
1515 default: type = LLM_TYPE_UNKNOWN;
1516 } break;
1517 case 32:
1518 switch (hparams.n_ff()) {
1519 case 10240: type = LLM_TYPE_2_8B; break;
1520 case 16384: type = LLM_TYPE_6_9B; break;
1521 default: type = LLM_TYPE_UNKNOWN;
1522 } break;
1523 case 36:
1524 switch (hparams.n_ff()) {
1525 case 20480: type = LLM_TYPE_12B; break;
1526 default: type = LLM_TYPE_UNKNOWN;
1527 } break;
1528 case 44:
1529 switch (hparams.n_ff()) {
1530 case 24576: type = LLM_TYPE_20B; break;
1531 default: type = LLM_TYPE_UNKNOWN;
1532 } break;
1533 default: type = LLM_TYPE_UNKNOWN;
1534 }
1535 } break;
1536 case LLM_ARCH_ARCTIC:
1537 {
1538 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1539
1540 if (hparams.n_expert == 128) {
1541 switch (hparams.n_layer) {
1542 case 35: type = LLM_TYPE_10B_128x3_66B; break;
1543 default: type = LLM_TYPE_UNKNOWN;
1544 }
1545 } else {
1546 type = LLM_TYPE_UNKNOWN;
1547 }
1548 } break;
1549 case LLM_ARCH_DEEPSEEK:
1550 {
1551 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1552 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead);
1553 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
1554 ml.get_key(kid: LLM_KV_EXPERT_SHARED_COUNT, result&: hparams.n_expert_shared);
1555 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_SCALE, result&: hparams.expert_weights_scale);
1556
1557 switch (hparams.n_layer) {
1558 case 28: type = LLM_TYPE_20B; break;
1559 default: type = LLM_TYPE_UNKNOWN;
1560 }
1561 } break;
1562 case LLM_ARCH_DEEPSEEK2:
1563 {
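                // note: the 27-layer "lite" variant does not use a low-rank Q projection,
                // so the Q LoRA rank is only read for the larger models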
1564 bool is_lite = (hparams.n_layer == 27);
1565 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1566 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead);
1567 if (!is_lite) {
1568 ml.get_key(kid: LLM_KV_ATTENTION_Q_LORA_RANK, result&: hparams.n_lora_q);
1569 }
1570 ml.get_key(kid: LLM_KV_ATTENTION_KV_LORA_RANK, result&: hparams.n_lora_kv);
1571 ml.get_key(kid: LLM_KV_ATTENTION_KEY_LENGTH_MLA, result&: hparams.n_embd_head_k_mla, required: false);
1572 ml.get_key(kid: LLM_KV_ATTENTION_VALUE_LENGTH_MLA, result&: hparams.n_embd_head_v_mla, required: false);
1573 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
1574 ml.get_key(kid: LLM_KV_EXPERT_SHARED_COUNT, result&: hparams.n_expert_shared);
1575 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_SCALE, result&: hparams.expert_weights_scale);
1576 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_NORM, result&: hparams.expert_weights_norm, required: false);
1577 ml.get_key(kid: LLM_KV_EXPERT_GATING_FUNC, result&: hparams.expert_gating_func, required: false);
1578 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1579 // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
1580 // that have no expert_gating_func model parameter set
1581 hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1582 }
1583 ml.get_key(kid: LLM_KV_ROPE_SCALING_YARN_LOG_MUL, result&: hparams.rope_yarn_log_mul, required: false);
1584
1585 switch (hparams.n_layer) {
1586 case 27: type = LLM_TYPE_16B; break;
1587 case 60: type = LLM_TYPE_236B; break;
1588 case 61: type = LLM_TYPE_671B; break;
1589 default: type = LLM_TYPE_UNKNOWN;
1590 }
1591 } break;
1592 case LLM_ARCH_PLM:
1593 {
1594 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1595 ml.get_key(kid: LLM_KV_ATTENTION_KV_LORA_RANK, result&: hparams.n_lora_kv);
1596 switch (hparams.n_layer) {
1597 case 32: type = LLM_TYPE_1_8B; break;
1598 default: type = LLM_TYPE_UNKNOWN;
1599 }
1600 } break;
1601 case LLM_ARCH_CHATGLM:
1602 {
1603 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1604 switch (hparams.n_layer) {
1605 case 28: {
1606 if (hparams.n_head(il: 0) == 16) {
1607 type = LLM_TYPE_1_5B;
1608 } else {
1609 type = LLM_TYPE_6B;
1610 }
1611 } break;
1612 case 40: {
1613 if (hparams.n_head(il: 0) == 24) {
1614 type = LLM_TYPE_4B;
1615 } else {
1616 type = LLM_TYPE_9B;
1617 }
1618 } break;
1619 default: type = LLM_TYPE_UNKNOWN;
1620 }
1621 } break;
1622 case LLM_ARCH_GLM4:
1623 {
1624 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1625 switch (hparams.n_layer) {
1626 case 40: type = LLM_TYPE_9B; break;
1627 case 61: type = LLM_TYPE_32B; break;
1628 default: type = LLM_TYPE_UNKNOWN;
1629 }
1630 } break;
1631 case LLM_ARCH_GLM4_MOE:
1632 {
1633 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
1634 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1635
1636 // MoE parameters
1637 ml.get_key(kid: LLM_KV_EXPERT_COUNT, result&: hparams.n_expert);
1638 ml.get_key(kid: LLM_KV_EXPERT_USED_COUNT, result&: hparams.n_expert_used);
1639 ml.get_key(kid: LLM_KV_EXPERT_SHARED_COUNT, result&: hparams.n_expert_shared);
1640 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead, required: false);
1641 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_SCALE, result&: hparams.expert_weights_scale);
1642 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_NORM, result&: hparams.expert_weights_norm, required: false);
1643
1644 // Expert gating function (GLM-4.5 uses sigmoid)
1645 ml.get_key(kid: LLM_KV_EXPERT_GATING_FUNC, result&: hparams.expert_gating_func, required: false);
1646 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1647 hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1648 }
1649
1650 // NextN/MTP parameters
1651 ml.get_key(kid: LLM_KV_NEXTN_PREDICT_LAYERS, result&: hparams.nextn_predict_layers, required: false);
1652
1653 // TODO: when MTP is implemented, this should probably be updated if needed
1654 hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1655
1656 switch (hparams.n_layer) {
1657 case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1658 case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
1659 default: type = LLM_TYPE_UNKNOWN;
1660 }
1661 } break;
1662 case LLM_ARCH_BITNET:
1663 {
1664 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1665
1666 switch (hparams.n_layer) {
1667 case 26: type = LLM_TYPE_3B; break;
1668 default: type = LLM_TYPE_UNKNOWN;
1669 }
1670 } break;
1671 case LLM_ARCH_T5:
1672 {
1673 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1674 ml.get_key(kid: LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, result&: hparams.n_rel_attn_bkts);
1675
1676 uint32_t dec_start_token_id;
1677 if (ml.get_key(kid: LLM_KV_DECODER_START_TOKEN_ID, result&: dec_start_token_id, required: false)) {
1678 hparams.dec_start_token_id = dec_start_token_id;
1679 }
1680
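                // the decoder defaults to the same number of blocks as the encoder,
                // unless the GGUF provides an explicit decoder block count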
1681 hparams.dec_n_layer = hparams.n_layer;
1682 ml.get_key(kid: LLM_KV_DECODER_BLOCK_COUNT, result&: hparams.dec_n_layer, required: false);
1683
1684 switch (hparams.n_layer) {
1685 case 6: type = LLM_TYPE_60M; break; // t5-small
1686 case 8: type = LLM_TYPE_80M; break; // flan-t5-small
1687 case 12:
1688 switch (hparams.n_ff()) {
1689 case 3072: type = LLM_TYPE_220M; break; // t5-base
1690 case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
1691 default: type = LLM_TYPE_UNKNOWN;
1692 } break;
1693 case 24:
1694 switch (hparams.n_ff()) {
1695 case 4096: type = LLM_TYPE_770M; break; // t5-large
1696 case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
1697 case 16384: type = LLM_TYPE_3B; break; // t5-3b
1698 case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
1699 case 65536: type = LLM_TYPE_11B; break; // t5-11b
1700 case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
1701 default: type = LLM_TYPE_UNKNOWN;
1702 } break;
1703 default: type = LLM_TYPE_UNKNOWN;
1704 }
1705 } break;
1706 case LLM_ARCH_T5ENCODER:
1707 {
1708 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1709 ml.get_key(kid: LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, result&: hparams.n_rel_attn_bkts);
1710 type = LLM_TYPE_UNKNOWN;
1711 } break;
1712 case LLM_ARCH_JAIS:
1713 {
1714 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1715 ml.get_key(kid: LLM_KV_ATTENTION_MAX_ALIBI_BIAS, result&: hparams.f_max_alibi_bias);
1716
1717 switch (hparams.n_layer) {
1718 case 24: type = LLM_TYPE_1_3B; break;
1719 case 40: type = LLM_TYPE_13B; break;
1720 /* TODO: add variants */
1721 default: type = LLM_TYPE_UNKNOWN;
1722 }
1723 } break;
1724 case LLM_ARCH_NEMOTRON:
1725 {
1726 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1727 switch (hparams.n_layer) {
1728 case 32: type = LLM_TYPE_4B; break;
1729 default: type = LLM_TYPE_UNKNOWN;
1730 }
1731 } break;
1732 case LLM_ARCH_NEMOTRON_H:
1733 {
1734 ml.get_key(kid: LLM_KV_SSM_CONV_KERNEL, result&: hparams.ssm_d_conv);
1735 ml.get_key(kid: LLM_KV_SSM_INNER_SIZE, result&: hparams.ssm_d_inner);
1736 ml.get_key(kid: LLM_KV_SSM_STATE_SIZE, result&: hparams.ssm_d_state);
1737 ml.get_key(kid: LLM_KV_SSM_TIME_STEP_RANK, result&: hparams.ssm_dt_rank);
1738 ml.get_key(kid: LLM_KV_SSM_GROUP_COUNT, result&: hparams.ssm_n_group);
1739
1740 // A layer is recurrent IFF the n_head_kv value is set to 0 and
1741 // the n_ff value is set to 0
1742 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1743 hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(il: i) == 0 && hparams.n_ff(il: i) == 0);
1744 }
1745
1746 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1747
1748 switch (hparams.n_layer) {
1749 case 56: type = LLM_TYPE_9B; break;
1750 default: type = LLM_TYPE_UNKNOWN;
1751 }
1752 } break;
1753 case LLM_ARCH_EXAONE:
1754 {
1755 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1756
1757 switch (hparams.n_layer) {
1758 case 32: type = LLM_TYPE_8B; break;
1759 default: type = LLM_TYPE_UNKNOWN;
1760 }
1761 } break;
1762 case LLM_ARCH_EXAONE4:
1763 {
1764 if (hparams.n_layer == 64) { // 32B
1765 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1766 hparams.n_swa = 4096;
1767 hparams.set_swa_pattern(n_pattern: 4);
1768 }
1769
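                // a sliding-window value in the GGUF, if present, overrides the default set above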
1770 ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa, required: false);
1771 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1772
1773 switch (hparams.n_layer) {
1774 case 30: type = LLM_TYPE_1_2B; break;
1775 case 64: type = LLM_TYPE_32B; break;
1776 default: type = LLM_TYPE_UNKNOWN;
1777 }
1778 } break;
1779 case LLM_ARCH_RWKV6:
1780 case LLM_ARCH_RWKV6QWEN2:
1781 {
1782 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps, required: false);
1783 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps, required: false);
1784 ml.get_key(kid: LLM_KV_WKV_HEAD_SIZE, result&: hparams.wkv_head_size);
1785 ml.get_key(kid: LLM_KV_TIME_MIX_EXTRA_DIM, result&: hparams.time_mix_extra_dim);
1786 ml.get_key(kid: LLM_KV_TIME_DECAY_EXTRA_DIM, result&: hparams.time_decay_extra_dim);
1787 ml.get_key(kid: LLM_KV_RESCALE_EVERY_N_LAYERS, result&: hparams.rescale_every_n_layers, required: false);
1788 ml.get_key(kid: LLM_KV_TOKEN_SHIFT_COUNT, result&: hparams.token_shift_count, required: false);
1789
1790 switch (hparams.n_layer) {
1791 case 24: type = LLM_TYPE_1_6B; break;
1792 case 32:
1793 switch (hparams.n_embd) {
1794 case 2560: type = LLM_TYPE_3B; break;
1795 case 4096: type = LLM_TYPE_7B; break;
1796 default: type = LLM_TYPE_UNKNOWN;
1797 } break;
1798 case 61: type = LLM_TYPE_14B; break;
1799 case 64: type = LLM_TYPE_32B; break;
1800 default: type = LLM_TYPE_UNKNOWN;
1801 }
1802 } break;
1803 case LLM_ARCH_RWKV7:
1804 case LLM_ARCH_ARWKV7:
1805 {
1806 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps, required: false);
1807 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps, required: false);
1808 ml.get_key(kid: LLM_KV_WKV_HEAD_SIZE, result&: hparams.wkv_head_size);
1809 ml.get_key(kid: LLM_KV_ATTENTION_DECAY_LORA_RANK, result&: hparams.n_lora_decay);
1810 ml.get_key(kid: LLM_KV_ATTENTION_ICLR_LORA_RANK, result&: hparams.n_lora_iclr);
1811 ml.get_key(kid: LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, result&: hparams.n_lora_value_res_mix);
1812 ml.get_key(kid: LLM_KV_ATTENTION_GATE_LORA_RANK, result&: hparams.n_lora_gate, required: false);
1813 ml.get_key(kid: LLM_KV_TOKEN_SHIFT_COUNT, result&: hparams.token_shift_count, required: false);
1814
1815 switch (hparams.n_layer) {
1816 case 12:
1817 switch (hparams.n_embd) {
1818 case 768: type = LLM_TYPE_190M; break;
1819 default: type = LLM_TYPE_UNKNOWN;
1820 } break;
1821 case 24:
1822 switch (hparams.n_embd) {
1823 case 1024: type = LLM_TYPE_450M; break;
1824 case 2048: type = LLM_TYPE_1_5B; break;
1825 default: type = LLM_TYPE_UNKNOWN;
1826 } break;
1827 case 28:
1828 switch (hparams.n_embd) {
1829 case 1536: type = LLM_TYPE_1_5B; break;
1830 case 3584: type = LLM_TYPE_7B; break;
1831 default: type = LLM_TYPE_UNKNOWN;
1832 } break;
1833 case 32:
1834 switch (hparams.n_embd) {
1835 case 2560: type = LLM_TYPE_2_9B; break;
1836 case 4096: type = LLM_TYPE_7B; break;
1837 default: type = LLM_TYPE_UNKNOWN;
1838 } break;
1839 case 61:
1840 switch (hparams.n_embd) {
1841 case 4096: type = LLM_TYPE_14B; break;
1842 default: type = LLM_TYPE_UNKNOWN;
1843 } break;
1844 default: type = LLM_TYPE_UNKNOWN;
1845 }
1846 } break;
1847 case LLM_ARCH_GRANITE:
1848 case LLM_ARCH_GRANITE_MOE:
1849 {
1850 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1851 ml.get_key(kid: LLM_KV_LOGIT_SCALE, result&: hparams.f_logit_scale);
1852 ml.get_key(kid: LLM_KV_RESIDUAL_SCALE, result&: hparams.f_residual_scale);
1853 ml.get_key(kid: LLM_KV_EMBEDDING_SCALE, result&: hparams.f_embedding_scale);
1854 ml.get_key(kid: LLM_KV_ATTENTION_SCALE, result&: hparams.f_attention_scale);
1855
1856 // Granite uses rope_finetuned as a switch for rope, so default to true
1857 bool rope_finetuned = true;
1858 ml.get_key(kid: LLM_KV_ROPE_SCALING_FINETUNED, result&: rope_finetuned, required: false);
1859 hparams.rope_finetuned = rope_finetuned;
1860
1861 switch (hparams.n_layer) {
1862 case 32: type = LLM_TYPE_3B; break;
1863 case 40: type = LLM_TYPE_3B; break;
1864 // Add additional layer/vocab/etc checks here for other model sizes
1865 default: type = LLM_TYPE_UNKNOWN;
1866 }
1867
1868 // For Granite MoE Shared
1869 ml.get_key(kid: LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, result&: hparams.n_ff_shexp, /* required */ false);
1870 } break;
1871 case LLM_ARCH_GRANITE_HYBRID:
1872 {
1873 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1874 ml.get_key(kid: LLM_KV_LOGIT_SCALE, result&: hparams.f_logit_scale, /* required */ false);
1875 ml.get_key(kid: LLM_KV_RESIDUAL_SCALE, result&: hparams.f_residual_scale, /* required */ false);
1876 ml.get_key(kid: LLM_KV_EMBEDDING_SCALE, result&: hparams.f_embedding_scale, /* required */ false);
1877 ml.get_key(kid: LLM_KV_ATTENTION_SCALE, result&: hparams.f_attention_scale, /* required */ false);
1878
1879 ml.get_key(kid: LLM_KV_SSM_CONV_KERNEL, result&: hparams.ssm_d_conv);
1880 ml.get_key(kid: LLM_KV_SSM_INNER_SIZE, result&: hparams.ssm_d_inner);
1881 ml.get_key(kid: LLM_KV_SSM_STATE_SIZE, result&: hparams.ssm_d_state);
1882 ml.get_key(kid: LLM_KV_SSM_TIME_STEP_RANK, result&: hparams.ssm_dt_rank);
1883 ml.get_key(kid: LLM_KV_SSM_GROUP_COUNT, result&: hparams.ssm_n_group);
1884
1885 // Granite uses rope_finetuned as a switch for rope, so default to true
1886 bool rope_finetuned = true;
1887 ml.get_key(kid: LLM_KV_ROPE_SCALING_FINETUNED, result&: rope_finetuned, required: false);
1888 hparams.rope_finetuned = rope_finetuned;
1889
1890 // A layer is recurrent IFF the n_head_kv value is set to 0
1891 for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1892 hparams.recurrent_layer_arr[i] = hparams.n_head_kv(il: i) == 0;
1893 }
1894
1897 switch (hparams.n_embd) {
1898 case 768: type = LLM_TYPE_350M; break;
1899                     case 1536: type = (hparams.n_expert > 0 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break; // the MoE and dense variants share n_embd == 1536 (assumes the MoE variant sets n_expert > 0)
1900 case 2048: case 2560: type = LLM_TYPE_3B; break;
1901 case 4096: type = LLM_TYPE_32B; break;
1902 default: type = LLM_TYPE_UNKNOWN;
1903 }
1904
1905 // For Granite MoE Shared
1906 ml.get_key(kid: LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, result&: hparams.n_ff_shexp, /* required */ false);
1907 } break;
1908 case LLM_ARCH_CHAMELEON:
1909 {
1910 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1911 hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
1912 ml.get_key(kid: LLM_KV_SWIN_NORM, result&: hparams.swin_norm);
1913
1914 switch (hparams.n_layer) {
1915 case 32: type = LLM_TYPE_7B; break;
1916 case 48: type = LLM_TYPE_34B; break;
1917 default: type = LLM_TYPE_UNKNOWN;
1918 }
1919 } break;
1920 case LLM_ARCH_WAVTOKENIZER_DEC:
1921 {
1922 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_EPS, result&: hparams.f_norm_eps);
1923 ml.get_key(kid: LLM_KV_ATTENTION_GROUPNORM_EPS, result&: hparams.f_norm_group_eps);
1924 ml.get_key(kid: LLM_KV_ATTENTION_GROUPNORM_GROUPS, result&: hparams.n_norm_groups);
1925 ml.get_key(kid: LLM_KV_ATTENTION_CAUSAL, result&: hparams.causal_attn);
1926 } break;
1927 case LLM_ARCH_BAILINGMOE:
1928 {
1929 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1930 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead);
1931 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
1932 ml.get_key(kid: LLM_KV_EXPERT_SHARED_COUNT, result&: hparams.n_expert_shared);
1933 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_SCALE, result&: hparams.expert_weights_scale);
1934 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_NORM, result&: hparams.expert_weights_norm, required: false);
1935
1936 switch (hparams.n_layer) {
1937 case 28: type = LLM_TYPE_16B; break;
1938 case 88: type = LLM_TYPE_290B; break;
1939 default: type = LLM_TYPE_UNKNOWN;
1940 }
1941 } break;
1942 case LLM_ARCH_BAILINGMOE2:
1943 {
1944 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1945 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead);
1946 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
1947 ml.get_key(kid: LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, result&: hparams.n_ff_shexp);
1948 ml.get_key(kid: LLM_KV_EXPERT_SHARED_COUNT, result&: hparams.n_expert_shared);
1949 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_SCALE, result&: hparams.expert_weights_scale);
1950 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_NORM, result&: hparams.expert_weights_norm, required: false);
1951 ml.get_key(kid: LLM_KV_EXPERT_GATING_FUNC, result&: hparams.expert_gating_func);
1952 ml.get_key(kid: LLM_KV_NEXTN_PREDICT_LAYERS, result&: hparams.nextn_predict_layers, required: false);
1953
1954 // TODO: when MTP is implemented, this should probably be updated if needed
1955 hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1956
1957 switch (hparams.n_layer) {
1958 case 20: type = LLM_TYPE_16B_A1B; break;
1959 case 21: type = LLM_TYPE_16B_A1B; break;
1960 case 32: type = LLM_TYPE_100B_A6B; break;
1961 case 33: type = LLM_TYPE_100B_A6B; break;
1962 default: type = LLM_TYPE_UNKNOWN;
1963 }
1964 } break;
1965 case LLM_ARCH_DOTS1:
1966 {
1967 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1968 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead);
1969 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
1970 ml.get_key(kid: LLM_KV_EXPERT_SHARED_COUNT, result&: hparams.n_expert_shared);
1971 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_SCALE, result&: hparams.expert_weights_scale);
1972 ml.get_key(kid: LLM_KV_EXPERT_WEIGHTS_NORM, result&: hparams.expert_weights_norm, required: false);
1973 ml.get_key(kid: LLM_KV_EXPERT_GATING_FUNC, result&: hparams.expert_gating_func, required: false);
1974 switch (hparams.n_layer) {
1975 case 62: type = LLM_TYPE_142B; break;
1976 default: type = LLM_TYPE_UNKNOWN;
1977 }
1978 } break;
1979 case LLM_ARCH_ERNIE4_5:
1980 case LLM_ARCH_ERNIE4_5_MOE:
1981 {
1982 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
1983 if (arch == LLM_ARCH_ERNIE4_5_MOE) {
1984 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
1985 ml.get_key(kid: LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, result&: hparams.n_ff_shexp, required: false);
1986 ml.get_key(kid: LLM_KV_INTERLEAVE_MOE_LAYER_STEP, result&: hparams.n_moe_layer_step);
1987 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead);
1988 }
1989
1990 switch (hparams.n_layer) {
1991 case 18: type = LLM_TYPE_0_3B; break;
1992 case 28: type = LLM_TYPE_21B_A3B; break;
1993 case 54: type = LLM_TYPE_300B_A47B; break;
1994 default: type = LLM_TYPE_UNKNOWN;
1995 }
1996 } break;
1997 case LLM_ARCH_FALCON_H1:
1998 {
1999 // Common parameters
2000 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2001
2002 // SSM parameters
2003 ml.get_key(kid: LLM_KV_SSM_CONV_KERNEL, result&: hparams.ssm_d_conv);
2004 ml.get_key(kid: LLM_KV_SSM_INNER_SIZE, result&: hparams.ssm_d_inner);
2005 ml.get_key(kid: LLM_KV_SSM_STATE_SIZE, result&: hparams.ssm_d_state);
2006 ml.get_key(kid: LLM_KV_SSM_TIME_STEP_RANK, result&: hparams.ssm_dt_rank);
2007 ml.get_key(kid: LLM_KV_SSM_GROUP_COUNT, result&: hparams.ssm_n_group);
2008
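                // every Falcon-H1 layer carries SSM state, so mark all layers as recurrent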
2009 std::fill(first: hparams.recurrent_layer_arr.begin(), last: hparams.recurrent_layer_arr.end(), value: true);
2010
2011 switch (hparams.n_layer) {
2012 case 36:
2013 type = LLM_TYPE_0_5B; break;
2014 case 24:
2015 type = LLM_TYPE_1_5B; break;
2016 case 66:
2017 type = LLM_TYPE_1B; break;
2018 case 32:
2019 type = LLM_TYPE_3B; break;
2020 case 44:
2021 type = LLM_TYPE_7B; break;
2022 case 72:
2023 type = LLM_TYPE_34B; break;
2024 default:
2025 type = LLM_TYPE_UNKNOWN;
2026 }
2027 } break;
2028 case LLM_ARCH_HUNYUAN_MOE:
2029 {
2030 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2031 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
2032 ml.get_key(kid: LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, result&: hparams.n_ff_shexp);
2033
2034 switch (hparams.n_layer) {
2035 case 32: type = LLM_TYPE_A13B; break;
2036 default: type = LLM_TYPE_UNKNOWN;
2037 }
2038 } break;
2039 case LLM_ARCH_HUNYUAN_DENSE:
2040 {
2041 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2042
2043 switch (hparams.n_embd) {
2044 case 1024: type = LLM_TYPE_0_5B; break;
2045 case 2048: type = LLM_TYPE_1_8B; break;
2046 case 3072: type = LLM_TYPE_4B; break;
2047 case 4096: type = LLM_TYPE_7B; break;
2048 default: type = LLM_TYPE_UNKNOWN;
2049 }
2050 } break;
2051 case LLM_ARCH_SMOLLM3:
2052 {
2053 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
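                // NoPE: every 4th layer skips the rotary embedding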
2054 hparams.n_no_rope_layer_step = 4;
2055
2056 switch (hparams.n_layer) {
2057 case 36: type = LLM_TYPE_3B; break;
2058 default: type = LLM_TYPE_UNKNOWN;
2059 }
2060 } break;
2061 case LLM_ARCH_OPENAI_MOE:
2062 {
2063 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2064 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
2065 ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa);
2066
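                // alternate sliding-window and full-attention layers (pattern of 2)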
2067 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2068 hparams.set_swa_pattern(n_pattern: 2);
2069
2070 switch (hparams.n_layer) {
2071 case 24: type = LLM_TYPE_20B; break;
2072 case 36: type = LLM_TYPE_120B; break;
2073 default: type = LLM_TYPE_UNKNOWN;
2074 }
2075 } break;
2076 case LLM_ARCH_LFM2:
2077 {
2078 ml.get_key(kid: LLM_KV_SHORTCONV_L_CACHE, result&: hparams.n_shortconv_l_cache);
2079 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2080 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2081 hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2082 }
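                // no MoE layers in LFM2, so treat the entire stack as dense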
2083 hparams.n_layer_dense_lead = hparams.n_layer;
2084 switch (hparams.n_ff()) {
2085 case 4608: type = LLM_TYPE_350M; break;
2086 case 6912: type = LLM_TYPE_700M; break;
2087 case 8192: type = LLM_TYPE_1_2B; break;
2088 case 10752: type = LLM_TYPE_2_6B; break;
2089 default: type = LLM_TYPE_UNKNOWN;
2090 }
2091 } break;
2092 case LLM_ARCH_LFM2MOE:
2093 {
2094 ml.get_key(kid: LLM_KV_SHORTCONV_L_CACHE, result&: hparams.n_shortconv_l_cache);
2095 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2096 ml.get_key(kid: LLM_KV_LEADING_DENSE_BLOCK_COUNT, result&: hparams.n_layer_dense_lead);
2097 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
2098 ml.get_key(kid: LLM_KV_EXPERT_GATING_FUNC, result&: hparams.expert_gating_func);
2099
2100 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2101 hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2102 }
2103
2104 type = LLM_TYPE_8B_A1B;
2105 } break;
2106 case LLM_ARCH_SMALLTHINKER:
2107 {
2108 const bool found_swa = ml.get_key(kid: LLM_KV_ATTENTION_SLIDING_WINDOW, result&: hparams.n_swa, required: false);
2109
2110 if (found_swa && hparams.n_swa > 0) {
2111 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2112 hparams.n_swa = 4096;
2113 hparams.set_swa_pattern(n_pattern: 4, dense_first: true);
2114 } else {
2115 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
2116 hparams.n_no_rope_layer_step = hparams.n_layer;
2117 }
2118
2119 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp, required: false);
2120 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2121 ml.get_key(kid: LLM_KV_EXPERT_GATING_FUNC, result&: hparams.expert_gating_func, required: false);
2122
2123 switch (hparams.n_layer) {
2124 case 32: type = LLM_TYPE_4B; break;
2125 case 52: type = LLM_TYPE_20B; break;
2126 default: type = LLM_TYPE_UNKNOWN;
2127 }
2128 } break;
2129 case LLM_ARCH_GROVEMOE:
2130 {
2131 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
2132 ml.get_key(kid: LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, result&: hparams.n_ff_chexp);
2133 ml.get_key(kid: LLM_KV_EXPERT_GROUP_SCALE, result&: hparams.expert_group_scale);
2134 ml.get_key(kid: LLM_KV_EXPERTS_PER_GROUP, result&: hparams.n_group_experts);
2135 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2136
2137 switch (hparams.n_layer) {
2138 case 48: type = LLM_TYPE_30B_A3B; break;
2139 default: type = LLM_TYPE_UNKNOWN;
2140 }
2141 } break;
2142 case LLM_ARCH_APERTUS:
2143 {
2144 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2145 ml.get_key_or_arr(kid: LLM_KV_XIELU_ALPHA_N, result&: hparams.xielu_alpha_n, n: hparams.n_layer);
2146 ml.get_key_or_arr(kid: LLM_KV_XIELU_ALPHA_P, result&: hparams.xielu_alpha_p, n: hparams.n_layer);
2147 ml.get_key_or_arr(kid: LLM_KV_XIELU_BETA, result&: hparams.xielu_beta, n: hparams.n_layer);
2148 ml.get_key_or_arr(kid: LLM_KV_XIELU_EPS, result&: hparams.xielu_eps, n: hparams.n_layer);
2149
2150 switch (hparams.n_layer) {
2151 case 32: type = LLM_TYPE_8B; break;
2152 default: type = LLM_TYPE_UNKNOWN;
2153 }
2154 } break;
2155 case LLM_ARCH_MINIMAX_M2:
2156 {
2157 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2158 ml.get_key(kid: LLM_KV_EXPERT_FEED_FORWARD_LENGTH, result&: hparams.n_ff_exp);
2159 ml.get_key(kid: LLM_KV_EXPERT_GATING_FUNC, result&: hparams.expert_gating_func, required: false);
2160
2161 switch (hparams.n_layer) {
2162 case 62: type = LLM_TYPE_230B_A10B; break;
2163 default: type = LLM_TYPE_UNKNOWN;
2164 }
2165 } break;
2166 case LLM_ARCH_COGVLM:
2167 {
2168 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2169 switch (hparams.n_layer) {
2170 case 32: type = LLM_TYPE_13B; break;
2171 default: type = LLM_TYPE_UNKNOWN;
2172 }
2173 } break;
2174 case LLM_ARCH_PANGU_EMBED:
2175 {
2176 ml.get_key(kid: LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, result&: hparams.f_norm_rms_eps);
2177 switch (hparams.n_layer) {
2178 case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
2179 case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
2180 default: type = LLM_TYPE_UNKNOWN;
2181 }
2182 } break;
2183 default: throw std::runtime_error("unsupported model architecture");
2184 }
2185
2186 pimpl->n_bytes = ml.n_bytes;
2187
2188 pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
2189
2190 if (hparams.f_max_alibi_bias > 0.0f) {
2191 hparams.use_alibi = true;
2192 }
2193
2194 hparams.rope_type = llama_model_rope_type(model: this);
2195}
2196
2197void llama_model::load_vocab(llama_model_loader & ml) {
2198 const auto kv = LLM_KV(arch);
2199
2200 vocab.load(ml, kv);
2201}
2202
2203bool llama_model::load_tensors(llama_model_loader & ml) {
2204 const auto & split_mode = params.split_mode;
2205 const auto & n_gpu_layers = params.n_gpu_layers;
2206 const auto & use_mlock = params.use_mlock;
2207 const auto & tensor_split = params.tensor_split;
2208
2209 const int n_layer = hparams.n_layer;
2210
2211 const bool use_mmap_buffer = true;
2212
2213 LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
2214
2215 // build a list of buffer types for the CPU and GPU devices
2216 pimpl->cpu_buft_list = make_cpu_buft_list(devices, use_extra_bufts: params.use_extra_bufts, no_host: params.no_host);
2217 for (auto * dev : devices) {
2218 buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
2219 // add CPU buffer types as a fallback
2220 buft_list.insert(position: buft_list.end(), first: pimpl->cpu_buft_list.begin(), last: pimpl->cpu_buft_list.end());
2221 pimpl->gpu_buft_list.emplace(args&: dev, args: std::move(buft_list));
2222 }
2223
2224 // calculate the split points
2225 bool all_zero = tensor_split == nullptr || std::all_of(first: tensor_split, last: tensor_split + n_devices(), pred: [](float x) { return x == 0.0f; });
2226 std::vector<float> splits(n_devices());
2227 if (all_zero) {
2228 // default split, by free memory
2229 for (size_t i = 0; i < n_devices(); ++i) {
2230 ggml_backend_dev_t dev = devices[i];
2231 size_t total;
2232 size_t free;
2233 ggml_backend_dev_memory(device: dev, free: &free, total: &total);
2234 splits[i] = free;
2235 }
2236 } else {
2237 std::copy(first: tensor_split, last: tensor_split + n_devices(), result: splits.begin());
2238 }
2239
2240 // sum and normalize the splits to get the split points
2241 float split_sum = 0.0f;
2242 for (size_t i = 0; i < n_devices(); ++i) {
2243 split_sum += splits[i];
2244 splits[i] = split_sum;
2245 }
2246 for (size_t i = 0; i < n_devices(); ++i) {
2247 splits[i] /= split_sum;
2248 }
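    // splits[] now holds cumulative fractions in (0, 1]; they are used below as the
    // thresholds that decide which device each repeating layer is assigned to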
2249
2250 ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(type: GGML_BACKEND_DEVICE_TYPE_CPU);
2251 if (cpu_dev == nullptr) {
2252 throw std::runtime_error(format(fmt: "%s: no CPU backend found", __func__));
2253 }
2254 const int i_gpu_start = std::max(a: (int) hparams.n_layer - n_gpu_layers, b: (int) 0);
2255 const int act_gpu_layers = devices.empty() ? 0 : std::min(a: n_gpu_layers, b: (int)n_layer + 1);
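    // map a layer index to its device: layers outside the offloaded range stay on the CPU,
    // the rest are assigned according to the split points computed above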
2256 auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
2257 const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
2258 if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
2259 LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
2260 return {.dev: cpu_dev, .buft_list: &pimpl->cpu_buft_list};
2261 }
2262 const int layer_gpu = std::upper_bound(first: splits.begin(), last: splits.begin() + n_devices(), val: float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
2263 auto * dev = devices.at(n: layer_gpu);
2264 LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
2265 return {.dev: dev, .buft_list: &pimpl->gpu_buft_list.at(k: dev)};
2266 };
2267
2268 // assign the input layer
2269 // there is very little benefit to offloading the input layer, so always keep it on the CPU
2270 pimpl->dev_input = { .dev: cpu_dev, .buft_list: &pimpl->cpu_buft_list };
2271
2272 // assign the repeating layers to the devices according to the splits
2273 pimpl->dev_layer.resize(new_size: n_layer);
2274 for (int il = 0; il < n_layer; ++il) {
2275 pimpl->dev_layer[il] = get_layer_buft_list(il);
2276 }
2277
2278 // assign the output layer
2279 pimpl->dev_output = get_layer_buft_list(n_layer);
2280
2281 // one ggml context per buffer type
2282 int max_n_tensors = ml.n_tensors;
2283 max_n_tensors += 1; // duplicated output tensor
2284 max_n_tensors += n_layer*2; // duplicated rope freq tensors
2285 const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
2286
2287 // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2288 struct ggml_backend_buft_comparator {
2289 bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2290 return strcmp(s1: ggml_backend_buft_name(buft: lhs), s2: ggml_backend_buft_name(buft: rhs)) < 0;
2291 }
2292 };
2293 std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
2294
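    // return the ggml context associated with a buffer type, creating a new no-alloc context on first use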
2295 auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
2296 auto it = ctx_map.find(x: buft);
2297 if (it == ctx_map.end()) {
2298 ggml_init_params params = {
2299 /*.mem_size =*/ ctx_size,
2300 /*.mem_buffer =*/ NULL,
2301 /*.no_alloc =*/ true,
2302 };
2303
2304 ggml_context * ctx = ggml_init(params);
2305 if (!ctx) {
2306 throw std::runtime_error(format(fmt: "failed to create ggml context"));
2307 }
2308
2309 ctx_map.emplace(args&: buft, args&: ctx);
2310
2311 return ctx;
2312 }
2313 return it->second.get();
2314 };
2315
2316 const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
2317 const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2318 const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
2319
2320 // create tensors for the weights
2321 {
2322 // note: cast to int64_t since we will use these for the tensor dimensions
2323 const int64_t n_head = hparams.n_head();
2324 const int64_t n_head_kv = hparams.n_head_kv();
2325 const int64_t n_embd = hparams.n_embd;
2326 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
2327 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
2328 const int64_t n_embd_head_k = hparams.n_embd_head_k;
2329 const int64_t n_embd_head_v = hparams.n_embd_head_v;
2330 const int64_t n_ff = hparams.n_ff();
2331 const int64_t n_embd_gqa = n_embd_v_gqa;
2332 const int64_t n_vocab = vocab.n_tokens();
2333 const int64_t n_token_types = vocab.n_token_types();
2334 const int64_t n_rot = hparams.n_rot;
2335 const int64_t n_expert = hparams.n_expert;
2336 const int64_t n_expert_used = hparams.n_expert_used;
2337 const int64_t n_ctx_train = hparams.n_ctx_train;
2338
2339 if (n_expert > 0 && hparams.n_expert_used == 0) {
2340 throw std::runtime_error("model has expert layers but no expert layers are used");
2341 }
2342
2343 int n_moved_tensors = 0;
2344 ggml_tensor * first_moved_tensor = nullptr;
2345 ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
2346 ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
2347
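        // helper to create a tensor: resolves the name, validates the layer usage, selects a
        // buffer type (honoring user overrides and op support), and creates the tensor metadata
        // in the context that matches the chosen buffer type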
2348 auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
2349 ggml_tensor * t_meta = ml.get_tensor_meta(name: tn.str().c_str());
2350
2351 if (!t_meta) {
2352 if (flags & TENSOR_NOT_REQUIRED) {
2353 return nullptr;
2354 }
2355 throw std::runtime_error(format(fmt: "missing tensor '%s'", tn.str().c_str()));
2356 }
2357
2358 // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
2359 // the tensor is duplicated
2360 // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
2361 llm_tensor tn_tensor = tn.tensor;
2362 if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
2363 tn_tensor = LLM_TENSOR_OUTPUT;
2364 }
2365
2366 llm_tensor_info info;
2367 try {
2368 info = llm_tensor_info_for(tensor: tn_tensor);
2369 } catch (const std::out_of_range & e) {
2370 throw std::runtime_error(format(fmt: "missing tensor info mapping for %s", tn.str().c_str()));
2371 }
2372
2373 // skip unused tensors
2374 if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
2375 const size_t nbytes = ggml_nbytes(tensor: t_meta);
2376 LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
2377
2378 ml.size_data -= nbytes;
2379 ml.n_created++;
2380
2381 return nullptr;
2382 }
2383
2384 // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
2385 ggml_op op;
2386 bool bias = tn.suffix != nullptr && strcmp(s1: tn.suffix, s2: "bias") == 0;
2387 if (bias) {
2388 if (info.op == GGML_OP_MUL_MAT_ID) {
2389 op = GGML_OP_ADD_ID;
2390 } else {
2391 op = GGML_OP_ADD;
2392 }
2393 } else {
2394 op = info.op;
2395 }
2396
2397 // sanity checks
2398 if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
2399 if (tn.bid != -1) {
2400 GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
2401 }
2402 } else {
2403 if (tn.bid == -1) {
2404 GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
2405 }
2406 }
2407
2408 // select the buffer type for this tensor
2409 buft_list_t * buft_list;
2410 switch (info.layer) {
2411 case LLM_TENSOR_LAYER_INPUT:
2412 buft_list = pimpl->dev_input.buft_list;
2413 break;
2414 case LLM_TENSOR_LAYER_OUTPUT:
2415 buft_list = pimpl->dev_output.buft_list;
2416 break;
2417 case LLM_TENSOR_LAYER_REPEATING:
2418 buft_list = pimpl->dev_layer.at(n: tn.bid).buft_list;
2419 break;
2420 default:
2421 GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
2422 }
2423
2424 ggml_backend_buffer_type_t buft = nullptr;
2425
2426 // check overrides
2427 if (ml.tensor_buft_overrides) {
2428 std::string tensor_name = tn.str();
2429 for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
2430 std::regex pattern(overrides->pattern);
2431 if (std::regex_search(s: tensor_name, e: pattern)) {
2432 if (overrides->buft == ggml_backend_cpu_buffer_type()) {
2433 // when overriding to a CPU buffer, consider the extra buffer types
2434 buft = select_weight_buft(hparams, tensor: t_meta, op, buft_list: pimpl->cpu_buft_list);
2435 } else {
2436 buft = overrides->buft;
2437 }
2438
2439 LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
2440 tensor_name.c_str(),
2441 ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
2442 ggml_backend_buft_name(buft));
2443 break;
2444 }
2445 }
2446 }
2447
2448 if (!buft) {
2449 buft = select_weight_buft(hparams, tensor: t_meta, op, buft_list: *buft_list);
2450 if (!buft) {
2451 throw std::runtime_error(format(fmt: "failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
2452 }
2453 }
2454
2455 // avoid using a host buffer when using mmap
2456 auto * buft_dev = ggml_backend_buft_get_device(buft);
2457 if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(device: buft_dev)) {
2458 auto * cpu_dev = ggml_backend_dev_by_type(type: GGML_BACKEND_DEVICE_TYPE_CPU);
2459 if (!cpu_dev) {
2460 throw std::runtime_error("no CPU backend found");
2461 }
2462 buft = ggml_backend_dev_buffer_type(device: cpu_dev);
2463 }
2464
2465 if (buft != buft_list->front().second) {
2466 n_moved_tensors++;
2467 if (!first_moved_tensor) {
2468 first_moved_tensor = t_meta;
2469 first_moved_from_buft = buft_list->front().second;
2470 first_moved_to_buft = buft;
2471 }
2472 }
2473
2474 ggml_context * ctx = ctx_for_buft(buft);
2475
2476 // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
2477 if (flags & TENSOR_DUPLICATED) {
2478 ggml_tensor * t = ggml_get_tensor(ctx, name: tn.str().c_str());
2479 if (t) {
2480 return t;
2481 }
2482 }
2483 return ml.create_tensor(ctx, name: tn, ne, flags);
2484 };
2485
2486 layers.resize(new_size: n_layer);
2487
2488 // TODO: move to a separate function
2489 const auto tn = LLM_TN(arch);
2490 switch (arch) {
2491 case LLM_ARCH_LLAMA:
2492 case LLM_ARCH_REFACT:
2493 case LLM_ARCH_MINICPM:
2494 case LLM_ARCH_GRANITE:
2495 case LLM_ARCH_GRANITE_MOE:
2496 {
2497 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2498
2499 // output
2500 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2501 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2502
2503 // if output is NULL, init from the input tok embed
2504 if (output == NULL) {
2505 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2506 }
2507
2508 for (int i = 0; i < n_layer; ++i) {
2509 auto & layer = layers[i];
2510
2511 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2512
2513 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2514 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2515 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2516 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2517
2518 // optional bias tensors
2519 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2520 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2521 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2522 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2523
2524 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2525
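                        // rope factor tensors are shared across layers; for i > 0 they reuse
                        // the tensor created for layer 0 (flagged as duplicated)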
2526 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
2527 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2528 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2529 }
2530 else {
2531 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2532 }
2533
2534 if (n_expert == 0) {
2535 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2536 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2537 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2538
2539 // optional MLP bias
2540 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2541 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2542 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2543 } else {
2544 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2545 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
2546 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
2547 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
2548
2549 // For Granite MoE Shared
2550 if (hparams.n_ff_shexp > 0) {
2551 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2552 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
2553 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
2554 }
2555 }
2556 }
2557 } break;
2558 case LLM_ARCH_LLADA:
2559 {
2560 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2561
2562 // output
2563 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2564 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
2565
2566 // if output is NULL, init from the input tok embed
2567 if (output == NULL) {
2568 output =
2569 create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
2570 }
2571
2572 for (int i = 0; i < n_layer; ++i) {
2573 auto & layer = layers[i];
2574
2575 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2576
2577 // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
2578 layer.wq =
2579 create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
2580 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
2581 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
2582 // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
2583 layer.wo =
2584 create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
2585 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2586
2587 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2588
2589 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
2590 TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2591
2592 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
2593 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2594 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
2595
2596 // optional MLP bias
2597 layer.ffn_gate_b =
2598 create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2599 layer.ffn_down_b =
2600 create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2601 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2602 }
2603 }
2604 break;
2605 case LLM_ARCH_LLADA_MOE:
2606 {
2607 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2608
2609 // output
2610 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2611 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2612
2613 GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
2614 GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
2615
2616 for (int i = 0; i < n_layer; ++i) {
2617 auto & layer = layers[i];
2618
2619 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2620
2621 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2622 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2623 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2624 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2625 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
2626 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
2627
2628 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2629
2630 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2631
2632 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
2633
2634 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2635 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
2636 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2637 }
2638 } break;
2639 case LLM_ARCH_LLAMA4:
2640 {
2641 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2642
2643 // output
2644 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2645 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2646
2647 // if output is NULL, init from the input tok embed
2648 if (output == NULL) {
2649 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2650 }
2651
2652 for (int i = 0; i < n_layer; ++i) {
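                        // with n_moe_layer_step == N, every N-th layer (1-based) uses the MoE FFN;
                        // the other layers use a dense FFN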
2653 bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
2654
2655 auto & layer = layers[i];
2656
2657 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2658
2659 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2660 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2661 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2662 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2663
2664 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2665
2666 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2667
2668 if (is_moe_layer) {
2669 int n_ff_exp = hparams.n_ff_exp;
2670
2671 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2672 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
2673 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
2674 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
2675
2676 // Shared expert
2677 const int64_t n_ff_shexp = n_ff_exp;
2678 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
2679 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
2680 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
2681 } else {
2682 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2683 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2684 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2685 }
2686 }
2687 } break;
2688 case LLM_ARCH_DECI:
2689 {
2690 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2691
2692 // output
2693 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2694 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2695
2696 // if output is NULL, init from the input tok embed
2697 if (output == NULL) {
2698 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2699 }
2700
2701 for (int i = 0; i < n_layer; ++i) {
2702 auto & layer = layers[i];
2703 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il: i);
2704 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il: i);
2705 const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il: i);
2706 const int64_t n_ff = hparams.n_ff(il: i);
2707 const int64_t n_head = hparams.n_head(il: i);
2708 const int64_t n_head_kv = hparams.n_head_kv(il: i);
2709
2710 if (n_head_kv == 0 && n_head > 0) {
2711 // linear attention for DeciLMCausalModel
2712 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2713 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2714 }
2715 else if (n_head_kv > 0) {
2716 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2717
2718 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2719 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2720 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2721 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2722 }
2723
2724 // optional bias tensors
2725 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2726 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2727 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2728 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2729
2730 if (n_ff > 0) {
2731 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2732 }
2733
2734 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
2735 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2736 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2737 }
2738 else {
2739 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2740 }
2741
2742 if (n_ff > 0) {
2743 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2744 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2745 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2746 }
2747
2748 // optional MLP bias
2749 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2750 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2751 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2752 }
2753 } break;
2754 case LLM_ARCH_MINICPM3:
2755 {
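// MiniCPM3 attention factors Q and KV through low-rank projections (q_lora_rank /
// kv_lora_rank) and splits each head into a RoPE part of size n_rot and a non-RoPE part of
// size n_embd_head_k - n_rot, which is what the tensor shapes below encode.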
2756 const int64_t n_embd_head_qk_rope = hparams.n_rot;
2757 const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
2758
2759 const int64_t q_lora_rank = hparams.n_lora_q;
2760 const int64_t kv_lora_rank = hparams.n_lora_kv;
2761 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2762
2763 // output
2764 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2765 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2766
2767 // if output is NULL, init from the input tok embed
2768 if (output == NULL) {
2769 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2770 }
2771
2772 for (int i = 0; i < n_layer; ++i) {
2773 auto & layer = layers[i];
2774
2775 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2776 layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
2777
2778 layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
2779
2780 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
2781 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
2782
2783 layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
2784 layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
2785 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
2786
2787 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2788
2789 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2790 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2791 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2792
2793 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2794 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2795 }
2796 } break;
2797 case LLM_ARCH_GROK:
2798 {
2799 if (n_expert == 0) {
2800 throw std::runtime_error("Grok model cannot have zero experts");
2801 }
2802
2803 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2804
2805 // output
2806 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2807 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2808
2809 // if output is NULL, init from the input tok embed
2810 if (output == NULL) {
2811 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2812 }
2813
2814 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
2815 for (int i = 0; i < n_layer; ++i) {
2816 auto & layer = layers[i];
2817
2818 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2819
2820 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2821 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2822 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2823 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2824
2825 layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
2826
2827 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2828
2829 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2830 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
2831 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2832
2833 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2834 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
2835 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
2836 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
2837
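// accept either name for the post-FFN norm: prefer layer_out_norm if present, otherwise
// require ffn_post_norm.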
2838 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2839 if (!layer.ffn_post_norm) {
2840 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
2841 }
2842 }
2843 } break;
2844 case LLM_ARCH_DBRX:
2845 {
2846 if (n_expert == 0) {
2847 throw std::runtime_error("DBRX model cannot have zero experts");
2848 }
2849
2850 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2851
2852 // output
2853 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2854 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2855
2856 for (int i = 0; i < n_layer; ++i) {
2857 auto & layer = layers[i];
2858
2859 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2860
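// DBRX stores attention as one fused QKV projection: the output dimension is n_embd (Q)
// plus 2*n_embd_gqa (grouped K and V).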
2861 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2862 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2863
2864 layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
2865
2866 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2867 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
2868 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
2869 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
2870 }
2871 } break;
2872 case LLM_ARCH_BAICHUAN:
2873 {
2874 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2875 {
2876 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2877 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2878 }
2879
2880 for (int i = 0; i < n_layer; ++i) {
2881 auto & layer = layers[i];
2882
2883 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2884
2885 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2886 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2887 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2888 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2889
2890 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2891
2892 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2893 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2894 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2895 }
2896 } break;
2897 case LLM_ARCH_FALCON:
2898 {
2899 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2900
2901 // output
2902 {
2903 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2904 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2905
2906 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2907 if (!output) {
2908 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
2909 }
2910 }
2911
2912 for (int i = 0; i < n_layer; ++i) {
2913 auto & layer = layers[i];
2914
2915 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2916 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2917
2918 layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2919 layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2920
2921 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2922 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2923
2924 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2925 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2926 }
2927 } break;
2928 case LLM_ARCH_STARCODER:
2929 {
2930 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2931 pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
2932
2933 // output
2934 {
2935 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2936 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2937 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2938 if (!output) {
2939 // needs to be on GPU
2940 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2941 }
2942
2943 }
2944
2945 for (int i = 0; i < n_layer; ++i) {
2946 auto & layer = layers[i];
2947
2948 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2949 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2950
2951 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2952 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2953
2954 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2955 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2956
2957 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2958 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2959
2960 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2961 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2962
2963 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2964 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
2965 }
2966 } break;
2967 case LLM_ARCH_BERT:
2968 case LLM_ARCH_NOMIC_BERT:
2969 case LLM_ARCH_NOMIC_BERT_MOE:
2970 case LLM_ARCH_JINA_BERT_V3:
2971 {
2972 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2973 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
2974
2975 if (arch == LLM_ARCH_BERT) {
2976 pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
2977
2978 cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
2979 cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
2980
2981 cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2982 cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2983 }
2984
2985 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
2986 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
2987
2988 for (int i = 0; i < n_layer; ++i) {
2989 auto & layer = layers[i];
2990
2991 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
2992 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
2993
2994 if (!layer.wqkv) {
2995 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2996 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
2997
2998 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2999 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
3000
3001 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3002 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
3003 }
3004
3005 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3006 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3007
3008 layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
3009 layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
3010
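// MoE variants (nomic-bert-moe) set moe_every_n_layers; layers whose index satisfies
// i % moe_every_n_layers == 1 load expert FFN tensors instead of the dense up/down (+gate).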
3011 if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
3012 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
3013 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
3014 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3015 } else {
3016 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3017 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3018 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3019 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3020
3021 if (arch == LLM_ARCH_NOMIC_BERT) {
3022 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3023 }
3024 }
3025
3026 layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3027 layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
3028 }
3029 } break;
3030 case LLM_ARCH_NEO_BERT:
3031 {
3032 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3033
3034 cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3035 cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
3036
3037 cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3038 cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3039
3040 output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
3041
3042 for (int i = 0; i < n_layer; ++i) {
3043 auto & layer = layers[i];
3044
3045 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3046
3047 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3048 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3049
3050 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3051
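// the up projection is stored with width n_ff*2, which suggests gate and up are fused into a
// single tensor for this architecture.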
3052 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
3053 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3054 }
3055 } break;
3056 case LLM_ARCH_JINA_BERT_V2:
3057 {
3058 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
3059 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
3060
3061 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
3062 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
3063
3064 cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
3065 cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
3066 for (int i = 0; i < n_layer; ++i) {
3067 auto & layer = layers[i]; // JinaBertLayer
3068
3069 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3070 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
3071
3072 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3073 layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3074
3075 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3076 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
3077
3078 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3079 layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3080
3081 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3082 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
3083
3084 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output dense
3085 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output dense bias
3086
3087 layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output norm
3088 layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
3089
3090 layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3091 layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3092
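// the gate tensor is optional here: when it is present the up projection is n_ff wide,
// otherwise the up tensor is 2*n_ff wide (gate and up presumably fused).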
3093 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
3094 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
3095
3096 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3097 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3098
3099 layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
3100 layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
3101 }
3102 } break;
3103 case LLM_ARCH_BLOOM:
3104 {
3105 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3106 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3107 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
3108
3109 // output
3110 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3111 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3112 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3113
3114 // if output is NULL, init from the input tok embed
3115 if (output == NULL) {
3116 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3117 }
3118
3119 for (int i = 0; i < n_layer; ++i) {
3120 auto & layer = layers[i];
3121
3122 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3123 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3124
3125 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3126 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
3127
3128 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3129 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
3130
3131 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3132 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3133
3134 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3135 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3136
3137 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3138 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
3139 }
3140 } break;
3141 case LLM_ARCH_MPT:
3142 {
3143 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3144 pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
3145
3146 // output
3147 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3148 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
3149
3150 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3151 if (!output) {
3152 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
3153 }
3154
3155 for (int i = 0; i < n_layer; ++i) {
3156 auto & layer = layers[i];
3157
3158 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3159 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3160
3161 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3162 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3163
3164 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3165 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3166
3167 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3168 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3169
3170 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3171 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3172
3173 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3174 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3175
3176 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3177 layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3178
3179 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3180 layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3181
3182 // AWQ ScaleActivation layer
3183 layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
3184 }
3185 } break;
3186 case LLM_ARCH_STABLELM:
3187 {
3188 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3189
3190 // output
3191 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3192 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3193 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3194
3195 for (int i = 0; i < n_layer; ++i) {
3196 auto & layer = layers[i];
3197
3198 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3199 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3200
3201 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3202 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3203 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3204 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3205
3206 // optional bias tensors, present in Stable LM 2 1.6B
3207 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3208 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3209 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3210
3211 // optional q and k layernorms, present in StableLM 2 12B
3212 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
3213 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
3214
3215 // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
3216 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3217 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3218
3219 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3220 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3221 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3222 }
3223 } break;
3224 case LLM_ARCH_QWEN:
3225 {
3226 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3227
3228 // output
3229 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3230 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3231
3232 for (int i = 0; i < n_layer; ++i) {
3233 auto & layer = layers[i];
3234
3235 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3236
3237 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
3238 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
3239 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3240
3241 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3242
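// n_ff as recorded for the original Qwen checkpoints counts gate and up together, so each
// individual projection below uses n_ff/2 (assumption inferred from the shapes used here).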
3243 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
3244 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
3245 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
3246 }
3247 } break;
3248 case LLM_ARCH_QWEN2:
3249 case LLM_ARCH_QWEN2VL:
3250 case LLM_ARCH_DREAM:
3251 {
3252 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3253
3254 // output
3255 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3256 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3257 output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
3258 // if output is NULL, init from the input tok embed
3259 if (output == NULL) {
3260 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3261 }
3262
3263 for (int i = 0; i < n_layer; ++i) {
3264 auto & layer = layers[i];
3265
3266 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3267
3268 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3269 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3270 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3271 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3272
3273 // attention bias tensors (loaded as required for these architectures)
3274 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
3275 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
3276 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
3277
3278 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3279
3280 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3281 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3282 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3283 }
3284 } break;
3285 case LLM_ARCH_QWEN2MOE:
3286 {
3287 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3288
3289 // output
3290 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3291 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3292
3293 for (int i = 0; i < n_layer; ++i) {
3294 auto & layer = layers[i];
3295
3296 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3297
3298 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3299 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3300 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3301 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3302
3303 // optional bias tensors
3304 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3305 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3306 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3307
3308 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3309
3310 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3311
3312 if (n_expert == 0) {
3313 throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
3314 }
3315 if (n_expert_used == 0) {
3316 throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
3317 }
3318
3319 // MoE branch
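// n_ff_exp may be missing from older conversions; in that case fall back to splitting the
// total FFN width evenly across the experts used per token.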
3320 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3321
3322 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
3323 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
3324 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
3325
3326 // Shared expert branch
3327 const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
3328
3329 layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
3330 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
3331 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
3332 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
3333 }
3334 } break;
3335 case LLM_ARCH_QWEN3:
3336 case LLM_ARCH_QWEN3VL:
3337 {
3338 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3339
3340 // output
3341 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3342 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3343 // if output is NULL, init from the input tok embed
3344 if (output == NULL) {
3345 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3346 }
3347
3348 // output rerank head
3349 cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3350
3351 for (int i = 0; i < n_layer; ++i) {
3352 auto & layer = layers[i];
3353
3354 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3355
3356 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3357 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3358 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3359 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3360
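// Qwen3 applies RMSNorm to Q and K per attention head, hence the {n_embd_head_k} weights.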
3361 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3362 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3363
3364 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3365 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3366 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3367 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3368 }
3369 } break;
3370 case LLM_ARCH_QWEN3MOE:
3371 case LLM_ARCH_QWEN3VLMOE:
3372 {
3373 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3374
3375 // output
3376 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3377 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3378 // if output is NULL, init from the input tok embed
3379 if (output == NULL) {
3380 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3381 }
3382
3383 for (int i = 0; i < n_layer; ++i) {
3384 auto & layer = layers[i];
3385
3386 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3387
3388 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3389 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3390 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3391 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3392
3393 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3394 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3395
3396 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3397
3398 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3399
3400 if (n_expert == 0) {
3401 throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
3402 }
3403 if (n_expert_used == 0) {
3404 throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
3405 }
3406
3407 // MoE branch
3408 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
3409
3410 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
3411 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
3412 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
3413 }
3414 } break;
3415 case LLM_ARCH_PHI2:
3416 {
3417 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3418
3419 // output
3420 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3421 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3422 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3423 output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
3424
3425 for (int i = 0; i < n_layer; ++i) {
3426 auto & layer = layers[i];
3427
3428 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3429 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3430
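// Phi-2 conversions may store attention either as a fused QKV tensor or as separate Q/K/V
// projections; try the fused form first and fall back to the split tensors below.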
3431 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3432 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3433
3434 if (layer.wqkv == nullptr) {
3435 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3436 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
3437
3438 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3439 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
3440
3441 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3442 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
3443 }
3444
3445 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3446 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
3447
3448 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3449 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3450
3451 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3452 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
3453 }
3454 } break;
3455 case LLM_ARCH_PHI3:
3456 {
3457 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3458
3459 // output
3460 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3461 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3462
3463 // if output is NULL, init from the input tok embed
3464 if (output == NULL) {
3465 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3466 }
3467
3468 for (int i = 0; i < n_layer; ++i) {
3469 auto & layer = layers[i];
3470
3471 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3472
3473 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3474 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3475
3476 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3477
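// Phi-3 stores the FFN gate and up projections fused into one 2*n_ff-wide tensor.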
3478 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
3479 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
3480
3481 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3482 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3483 }
3484 } break;
3485 case LLM_ARCH_PHIMOE:
3486 {
3487 const int64_t n_embd_head = n_embd / n_head;
3488
3489 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
3490
3491 // output
3492 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
3493 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3494 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
3495 output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
3496
3497 for (int i = 0; i < n_layer; ++i) {
3498 auto & layer = layers[i];
3499
3500 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
3501 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
3502
3503 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
3504 if (layer.wqkv == nullptr) {
3505 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3506 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
3507
3508 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3509 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
3510
3511 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3512 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
3513 }
3514 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
3515 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
3516
3517 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
3518 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
3519
3520 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3521 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3522 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
3523 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3524
3525 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3526 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3527 }
3528 } break;
3529 case LLM_ARCH_PLAMO:
3530 {
3531 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3532
3533 // output
3534 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3535 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3536
3537 for (int i = 0; i < n_layer; ++i) {
3538 auto & layer = layers[i];
3539
3540 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3541
3542 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3543 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3544 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3545 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3546
3547 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3548 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3549 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3550 }
3551 } break;
3552 case LLM_ARCH_PLAMO2:
3553 {
3554 // mamba parameters
3555 const uint32_t d_conv = hparams.ssm_d_conv;
3556 const uint32_t d_state = hparams.ssm_d_state;
3557 const uint32_t num_heads = hparams.ssm_dt_rank;
3558 const uint32_t intermediate_size = hparams.ssm_d_inner;
3559 const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
3560
3561 // attention parameters
3562 const uint32_t qk_dim = hparams.n_embd_head_k;
3563 const uint32_t v_dim = hparams.n_embd_head_v;
3564
3565 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3566
3567 // output
3568 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3569 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3570 // if output is NULL, init from the input tok embed
3571 if (output == NULL) {
3572 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3573 }
3574
3575 for (int i = 0; i < n_layer; ++i) {
3576 auto & layer = layers[i];
3577 bool is_mamba_layer = hparams.is_recurrent(i);
3578
3579 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3580
3581 if (is_mamba_layer) {
3582 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
3583 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
3584
3585 layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
3586 layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
3587 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
3588
3589 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
3590 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
3591
3592 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
3593
3594 layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
3595 layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
3596 layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
3597 } else {
3598 const int64_t num_attention_heads = hparams.n_head(i);
3599 const int64_t q_num_heads = num_attention_heads;
3600 const int64_t num_key_value_heads = hparams.n_head_kv(i);
3601 const int64_t k_num_heads = num_key_value_heads;
3602 const int64_t v_num_heads = num_key_value_heads;
3603 const int64_t q_proj_dim = q_num_heads * qk_dim;
3604 const int64_t k_proj_dim = k_num_heads * qk_dim;
3605 const int64_t v_proj_dim = v_num_heads * v_dim;
3606
3607 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
3608 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
3609 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
3610 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
3611 }
3612
3613 // All layers have post-attention norm, FFN norm, and FFN tensors
3614 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
3615 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3616 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3617 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
3618 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
3619 }
3620 } break;
3621 case LLM_ARCH_GPT2:
3622 {
3623 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3624 pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
3625
3626 // output
3627 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3628 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3629 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3630
3631 // if output is NULL, init from the input tok embed
3632 if (output == NULL) {
3633 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3634 }
3635
3636 for (int i = 0; i < n_layer; ++i) {
3637 auto & layer = layers[i];
3638
3639 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3640 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3641
3642 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3643 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
3644
3645 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3646 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
3647
3648 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3649 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3650
3651 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3652 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3653
3654 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3655 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
3656 }
3657 } break;
3658 case LLM_ARCH_CODESHELL:
3659 {
3660 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3661
3662 // if tok embd is NULL, init from output
3663 if (tok_embd == NULL) {
3664 tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3665 }
3666
3667 // output
3668 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3669 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3670 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3671
3672 for (int i = 0; i < n_layer; ++i) {
3673 auto & layer = layers[i];
3674
3675 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3676 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3677
3678 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3679 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
3680
3681 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3682 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
3683
3684 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3685 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3686
3687 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3688 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3689
3690 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3691 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
3692 }
3693 } break;
3694 case LLM_ARCH_ORION:
3695 {
3696 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3697
3698 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3699 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3700 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3701
3702 for (int i = 0; i < n_layer; ++i) {
3703 auto & layer = layers[i];
3704
3705 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3706 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3707
3708 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3709 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3710 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3711 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3712
3713 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3714 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3715
3716 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3717 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3718 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3719 }
3720 } break;
3721 case LLM_ARCH_INTERNLM2:
3722 {
3723 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3724
3725 // output
3726 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3727 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3728
3729 for (int i = 0; i < n_layer; ++i) {
3730 auto & layer = layers[i];
3731
3732 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3733 // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3734 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3735 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3736 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3737
3738 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3739 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3740 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3741 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3742 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3743 }
3744 } break;
3745 case LLM_ARCH_GEMMA:
3746 {
3747 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3748
3749 // output
3750 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3751 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
3752
3753 for (int i = 0; i < n_layer; ++i) {
3754 auto & layer = layers[i];
3755
3756 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3757
3758 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3759 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3760 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3761 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3762
3763 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3764 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3765 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3766 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3767 }
3768 } break;
3769 case LLM_ARCH_GEMMA2:
3770 {
3771 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3772
3773 // output
3774 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3775 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
3776
3777 for (int i = 0; i < n_layer; ++i) {
3778 auto & layer = layers[i];
3779
3780 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3781
3782 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3783 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3784 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3785 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3786 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3787
3788 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3789 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3790 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3791 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3792 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3793 }
3794 } break;
3795 case LLM_ARCH_GEMMA3:
3796 case LLM_ARCH_GEMMA_EMBEDDING:
3797 {
3798 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3799
3800 // output
3801 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3802 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3803
3804 // if output is NULL, init from the input tok embed
3805 if (output == NULL) {
3806 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3807 }
3808
3809 // Dense linear weights
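// (optional: only expected for the embedding variant, LLM_ARCH_GEMMA_EMBEDDING)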
3810 dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
3811 dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
3812
3813
3814 for (int i = 0; i < n_layer; ++i) {
3815 auto & layer = layers[i];
3816
3817 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3818
3819 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3820 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3821 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3822 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3823
3824 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3825 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3826 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3827
3828 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3829 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3830 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3831 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3832 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3833 }
3834 } break;
3835 case LLM_ARCH_GEMMA3N:
3836 {
3837 const int64_t n_altup = hparams.n_altup;
3838 const int64_t laurel_rank = hparams.laurel_rank;
3839 const int64_t n_embd_altup = hparams.n_embd_altup;
3840
3841 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3842 // if output is NULL, init from the input tok embed
3843 if (output == NULL) {
3844 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3845 }
3846
3847 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3848 tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
3849
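// AltUp (alternating updates) keeps n_altup parallel hidden streams; these projections map the main stream to and from the (n_altup - 1) auxiliary streams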
3850 altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
3851 altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
3852 per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
3853 per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
3854
3855 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3856
3857 for (int i = 0; i < n_layer; ++i) {
3858 auto & layer = layers[i];
3859
3860 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3861
3862 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3863 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3864 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3865 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3866
3867 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3868 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3869 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3870
3871 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3872 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3873 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3874 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3875 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3876
3877 // altup & laurel
3878 layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
3879 layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
3880 layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
3881 layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
3882 layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
3883 layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
3884 layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
3885 layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
3886 layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
3887 layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
3888 layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
3889 }
3890 } break;
3891 case LLM_ARCH_STARCODER2:
3892 {
3893 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3894
3895 // output
3896 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3897 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3898
3899 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3900 // if output is NULL, init from the input tok embed
3901 if (output == NULL) {
3902 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3903 }
3904
3905 for (int i = 0; i < n_layer; ++i) {
3906 auto & layer = layers[i];
3907
3908 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3909 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3910
3911 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3912 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3913 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3914 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3915
3916 // optional bias tensors
3917 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
3918 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
3919 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
3920 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
3921
3922 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3923 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3924
3925 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3926 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3927
3928 // optional bias tensors
3929 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3930 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
3931 }
3932 } break;
3933 case LLM_ARCH_MAMBA:
3934 {
3935 const int64_t d_conv = hparams.ssm_d_conv;
3936 const int64_t d_inner = hparams.ssm_d_inner;
3937 const int64_t d_state = hparams.ssm_d_state;
3938 const int64_t dt_rank = hparams.ssm_dt_rank;
3939
3940 // only an expansion factor of 2 is supported for now
3941 if (2 * n_embd != d_inner) {
3942 throw std::runtime_error("only an expansion factor of 2 is supported for now");
3943 }
3944
3945 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3946
3947 // output
3948 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3949
3950 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3951 // if output is NULL, init from the input tok embed, duplicated to allow offloading
3952 if (output == NULL) {
3953 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3954 }
3955
3956 for (int i = 0; i < n_layer; ++i) {
3957 auto & layer = layers[i];
3958
3959 // norm
3960 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3961
3962 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
3963
3964 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
3965 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
3966
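// x_proj output splits into dt (dt_rank) plus the B and C SSM inputs (d_state each)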
3967 layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
3968
3969 layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
3970 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
3971
3972 // no "weight" suffix for these
3973 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
3974 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
3975
3976 // out_proj
3977 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3978 }
3979 } break;
3980 case LLM_ARCH_MAMBA2:
3981 {
3982 const int64_t d_conv = hparams.ssm_d_conv;
3983 const int64_t d_inner = hparams.ssm_d_inner;
3984 const int64_t d_state = hparams.ssm_d_state;
3985 const int64_t n_head = hparams.ssm_dt_rank;
3986 const int64_t n_group = hparams.ssm_n_group;
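// d_in_proj packs z and x (d_inner each), B and C (n_group*d_state each), and one dt per SSM head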
3987 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
3988
3989 // only an expansion factor of 2 is supported for now
3990 GGML_ASSERT(2 * n_embd == d_inner);
3991
3992 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3993
3994 // output
3995 {
3996 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3997
3998 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3999 // if output is NULL, init from the input tok embed, duplicated to allow offloading
4000 if (output == NULL) {
4001 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4002 }
4003 }
4004
4005 for (int i = 0; i < n_layer; ++i) {
4006 auto & layer = layers[i];
4007
4008 // norm
4009 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4010
4011 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4012
4013 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4014 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
4015
4016 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
4017
4018 // no "weight" suffix for these
4019 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
4020 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
4021
4022 layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4023
4024 // out_proj
4025 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4026 }
4027 } break;
4028 case LLM_ARCH_JAMBA:
4029 {
4030 const int64_t d_conv = hparams.ssm_d_conv;
4031 const int64_t d_inner = hparams.ssm_d_inner;
4032 const int64_t d_state = hparams.ssm_d_state;
4033 const int64_t dt_rank = hparams.ssm_dt_rank;
4034
4035 // only an expansion factor of 2 is supported for now
4036 GGML_ASSERT(2 * n_embd == d_inner);
4037
4038 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4039
4040 // output
4041 {
4042 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4043
4044 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4045 // if output is NULL, init from the input tok embed, duplicated to allow offloading
4046 if (output == NULL) {
4047 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4048 }
4049 }
4050
4051 for (int i = 0; i < n_layer; ++i) {
const int64_t n_head_kv = hparams.n_head_kv(i);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
4054
4055 auto & layer = layers[i];
4056
4057 // norm
4058 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4059
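// Jamba interleaves SSM and attention blocks; layers with zero KV heads are Mamba layers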
4060 if (n_head_kv == 0) {
4061 // Mamba layer
4062 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
4063
4064 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
4065 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
4066
4067 layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
4068
4069 layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
4070
4071 layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
4072 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
4073
4074 layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
4075 layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
4076
4077 // no "weight" suffix for these
4078 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
4079 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
4080
4081 // out_proj
4082 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4083 } else {
4084 // Attention layers
4085
4086 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4087 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4088 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4089 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4090 }
4091
4092 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4093
4094 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
4095
4096 if (layer.ffn_gate_inp) {
4097 // MoE
4098 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4099 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
4100 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4101 } else {
4102 // FFN (no MoE)
4103 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4104 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4105 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4106 }
4107 }
4108 } break;
4109 case LLM_ARCH_GRANITE_HYBRID:
4110 {
4111 // mamba2 Mixer SSM params
4112 // NOTE: int64_t for tensor dimensions
4113 const int64_t d_conv = hparams.ssm_d_conv;
4114 const int64_t d_inner = hparams.ssm_d_inner;
4115 const int64_t d_state = hparams.ssm_d_state;
4116 const int64_t n_ssm_head = hparams.ssm_dt_rank;
4117 const int64_t n_group = hparams.ssm_n_group;
4118 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
4119
4120 // only an expansion factor of 2 is supported for now
4121 GGML_ASSERT(2 * n_embd == d_inner);
4122
4123 // embeddings
4124 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4125
4126 // output
4127 {
4128 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4129 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4130 // if output is NULL, init from the input tok embed, duplicated to allow offloading
4131 if (output == NULL) {
4132 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4133 }
4134 }
4135
4136 for (int i = 0; i < n_layer; ++i) {
4137 auto & layer = layers[i];
4138
4139 // norm
4140 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4141
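// hybrid model: each layer is either a recurrent (Mamba-2) block or an attention block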
if (hparams.is_recurrent(i)) {
4143 // ssm layers
4144 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4145
4146 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4147 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
4148
4149 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
4150
4151 // no "weight" suffix for these
4152 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
4153 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
4154
4155 layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4156
4157 // out_proj
4158 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4159 } else {
4160 // attention layers (with optional bias)
const int64_t n_head_i = hparams.n_head(i);
const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
4164 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
4165 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
4166 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
4167 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
4168 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4169 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
4170 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
4171 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4172 }
4173
4174 // feed forward (w/ optional biases)
4175 if (n_expert > 0) {
4176 // MoE FFN
4177 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4178 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4179 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4180 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
4181 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
4182 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4183
4184 // For Granite MoE Shared
4185 if (hparams.n_ff_shexp > 0) {
4186 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4187 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4188 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
4189 }
4190 } else {
4191 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4192 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4193 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4194 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4195 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4196 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4197 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4198 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4199 }
4200 }
4201 } break;
4202 case LLM_ARCH_XVERSE:
4203 {
4204 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4205
4206 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4207 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4208
4209 for (int i = 0; i < n_layer; ++i) {
4210 auto & layer = layers[i];
4211
4212 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4213
4214 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4215 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4216 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4217 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4218
4219 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4220 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4221 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4222 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4223 }
4224 } break;
4225 case LLM_ARCH_COMMAND_R:
4226 {
4227 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4228
4229 // output
4230 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4231 // init output from the input tok embed
4232 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4233
4234 for (int i = 0; i < n_layer; ++i) {
4235 auto & layer = layers[i];
4236
4237 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4238
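// only the larger Command R variants have per-head Q/K norms; detect them via the layer count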
if (n_layer >= 64) {
4240 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
4241 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
4242 }
4243
4244 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4245 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4246 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4247 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4248
4249 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4250 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4251 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4252 }
4253 } break;
4254 case LLM_ARCH_COHERE2:
4255 {
4256 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4257
4258 // output
4259 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4260 // init output from the input tok embed
4261 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
4262 TENSOR_DUPLICATED);
4263
4264 for (int i = 0; i < n_layer; ++i) {
4265 auto & layer = layers[i];
4266
4267 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
4268
4269 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
4270 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
4271 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
4272 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4273
4274 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
4275 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
4276 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
4277 }
4278 }
4279 break;
4280 case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
4281 {
4282 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4283
4284 // output
4285 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4286 // if output is NULL, init from the input tok embed
4287 if (output == NULL) {
4288 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4289 }
4290
4291 for (int i = 0; i < n_layer; ++i) {
4292 auto & layer = layers[i];
4293
4294 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4295 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4296 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4297 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4298
4299 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4300 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4301 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4302 }
4303 } break;
4304 case LLM_ARCH_OLMO2:
4305 {
4306 const int64_t n_embd_head = n_embd / n_head;
4307
4308 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4309
4310 // output
4311 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4312 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4313
4314 for (int i = 0; i < n_layer; ++i) {
4315 auto & layer = layers[i];
4316
4317 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4318 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4319 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4320 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4321 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4322 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
4323 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4324
4325 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4326 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4327 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4328 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4329 }
4330 } break;
4331 case LLM_ARCH_SEED_OSS:
4332 {
4333 const uint32_t head_dim = hparams.n_embd_head_k;
4334 const int64_t n_qo_dim = n_head * head_dim;
4335 const int64_t n_kv_dim = n_head_kv * head_dim;
4336
4337 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4338
4339 // output
4340 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4341 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4342 // if output is NULL, init from the input tok embed
4343 if (output == NULL) {
4344 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4345 }
4346
4347 for (int i = 0; i < n_layer; ++i) {
4348 auto & layer = layers[i];
4349
4350 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
4351 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
4352 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
4353 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
4354
4355 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
4356 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4357 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4358
4359 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4360 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4361
4362 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4363 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4364 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4365 }
4366 } break;
4367
4368 case LLM_ARCH_OLMOE:
4369 {
4370 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4371
4372 // output
4373 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4374 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4375
4376 for (int i = 0; i < n_layer; ++i) {
4377 auto & layer = layers[i];
4378
4379 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4380
4381 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4382 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4383 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4384 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4385 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
4386 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
4387
4388 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4389
4390 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4391
4392 if (n_expert == 0) {
4393 throw std::runtime_error("n_expert must be > 0");
4394 }
4395 if (n_expert_used == 0) {
4396 throw std::runtime_error("n_expert_used must be > 0");
4397 }
4398
4399 // MoE branch
4400 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4401 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
4402 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4403 }
4404 } break;
4405 case LLM_ARCH_OPENELM:
4406 {
4407 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4408
4409 // output
4410 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4411 // init output from the input tok embed
4412 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4413
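// OpenELM varies the number of heads, KV heads and the FFN size per layer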
4414 for (int i = 0; i < n_layer; ++i) {
const int64_t n_head = hparams.n_head(i);
const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
const int64_t n_ff = hparams.n_ff(i);
4418
4419 auto & layer = layers[i];
4420
4421 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4422
4423 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
4424 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4425 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4426 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
4427
4428 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4429 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4430 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4431 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4432 }
4433 } break;
4434 case LLM_ARCH_GPTNEOX:
4435 {
4436 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4437
4438 // output
4439 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4440 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
4441 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4442
4443 for (int i = 0; i < n_layer; ++i) {
4444 auto & layer = layers[i];
4445
4446 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4447 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
4448
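// fused QKV: Q (n_embd) followed by K and V (n_embd_gqa each)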
4449 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4450 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
4451
4452 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4453 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
4454
4455 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4456 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
4457
4458 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4459 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
4460
4461 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4462 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
4463 }
4464 } break;
4465 case LLM_ARCH_ARCTIC:
4466 {
4467 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4468
4469 // output
4470 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4471 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4472
4473 // if output is NULL, init from the input tok embed
4474 if (output == NULL) {
4475 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4476 }
4477
4478 for (int i = 0; i < n_layer; ++i) {
4479 auto & layer = layers[i];
4480
4481 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4482
4483 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4484 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4485 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4486 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4487
4488 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4489
4490 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
4491 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
4492 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
4493
4494 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4495 layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4497 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
4498 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4499 }
4500 } break;
4501 case LLM_ARCH_DEEPSEEK:
4502 {
4503
4504 const int64_t n_ff_exp = hparams.n_ff_exp;
4505 const int64_t n_expert_shared = hparams.n_expert_shared;
4506
4507 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4508
4509 // output
4510 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4511 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4512
4513 for (int i = 0; i < n_layer; ++i) {
4514 auto & layer = layers[i];
4515
4516 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4517
4518 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4519 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4520 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4521 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4522 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4523
4524 if (i < (int) hparams.n_layer_dense_lead) {
4525 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4526 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4527 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4528 } else {
4529 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4530
4531 if (n_expert == 0) {
4532 throw std::runtime_error("n_expert must be > 0");
4533 }
4534 if (n_expert_used == 0) {
4535 throw std::runtime_error("n_expert_used must be > 0");
4536 }
4537
4538 // MoE branch
4539 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4540 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4541 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4542
4543 // Shared expert branch
4544 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4545 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
4546 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4547 }
4548 }
4549 } break;
4550 case LLM_ARCH_DEEPSEEK2:
4551 {
4552 const bool is_lite = (hparams.n_layer == 27);
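// the Lite variant skips the low-rank Q projection (wq_a/wq_b) and uses a plain wq instead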
4553
4554 const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
4555
4556 // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
4557 const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
4558 const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
4559
4560 const int64_t n_embd_head_qk_rope = hparams.n_rot;
4561 const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
4562
4563 const int64_t q_lora_rank = hparams.n_lora_q;
4564 const int64_t kv_lora_rank = hparams.n_lora_kv;
4565
4566 const int64_t n_ff_exp = hparams.n_ff_exp;
4567 const int64_t n_expert_shared = hparams.n_expert_shared;
4568
4569 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4570
4571 // output
4572 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4573 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4574
4575 for (int i = 0; i < n_layer; ++i) {
4576 auto & layer = layers[i];
4577
4578 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4579 if (!is_lite) {
4580 layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
4581 }
4582
4583 layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
4584
4585 if (!is_lite) {
4586 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
4587 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
4588 } else {
4589 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
4590 }
4591
4592 layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
4593
// note: only old legacy GGUF files contain the unsplit wkv_b tensor
4595 if (is_mla) {
4596 layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
4597 layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
4598 } else {
4599 layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
4600 }
4601
4602 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
4603
4604 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4605
4606 if (i < (int) hparams.n_layer_dense_lead) {
4607 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4608 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4609 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4610 } else {
4611 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4612 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
4613
4614 if (n_expert == 0) {
4615 throw std::runtime_error("n_expert must be > 0");
4616 }
4617 if (n_expert_used == 0) {
4618 throw std::runtime_error("n_expert_used must be > 0");
4619 }
4620
4621 // MoE branch
4622 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4623 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4624 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4625
4626 // Shared expert branch
4627 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4628 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
4629 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4630 }
4631 }
4632 } break;
4633 case LLM_ARCH_PLM:
4634 {
4635 const int64_t n_embd_head_qk_rope = hparams.n_rot;
4636 const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
4637 const int64_t kv_lora_rank = hparams.n_lora_kv;
4638
4639 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4640
4641 // output
4642 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4643 // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4644 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4645
4646 for (int i = 0; i < n_layer; ++i) {
4647 auto & layer = layers[i];
4648
4649 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4650
4651 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4652 layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
4653 layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
4654 layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, 0);
4656
4657 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4658 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4659 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4660 }
4661 } break;
4662 case LLM_ARCH_BITNET:
4663 {
4664 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4665
4666 // output
4667 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4668
4669 for (int i = 0; i < n_layer; ++i) {
4670 auto & layer = layers[i];
4671
4672 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4673 layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
4674
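// the "scale" tensors are optional per-tensor weight scales used by BitNet's quantized weights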
4675 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4676 layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
4677 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4678 layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
4679 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4680 layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
4681 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4682 layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
4683
4684 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4685 layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
4686
4687 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4688 layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
4689 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4690 layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
4691 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4692 layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
4693 }
4694 } break;
4695 case LLM_ARCH_T5:
4696 {
4697 const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
4698
4699 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4700
4701 // output
4702 output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4703 output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4704
4705 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4706 // if output is NULL, init from the input tok embed
4707 if (output == NULL) {
4708 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4709 }
4710
4711 // n_layer: number of encoder_layers
4712 // dec_n_layer: number of decoder_layers
4713 const int dec_n_layer = hparams.dec_n_layer;
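// encoder and decoder tensors share the same layers vector, so it may need to grow to hold the extra decoder layers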
4714 if (dec_n_layer > n_layer) {
layers.resize(dec_n_layer);
4716 }
4717
4718 // load encoder layers
4719 for (int i = 0; i < n_layer; ++i) {
4720 auto & layer = layers[i];
4721
4722 layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
4723 layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4724
4725 layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4726 layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4727 layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4728 layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4729
4730 layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
4731 layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
4732 layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4733 layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4734 }
4735
4736 // load decoder layers
4737 for (int i = 0; i < dec_n_layer; ++i) {
4738 auto & layer = layers[i];
4739
4740 layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
4741 layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4742
4743 layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4744 layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4745 layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4746 layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4747
4748 layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
// this tensor appears to be unused in the HF transformers implementation
4750 layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4751
4752 layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4753 layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4754 layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4755 layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4756
4757 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
4758 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
4759 layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4760 layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4761 }
4762 } break;
4763 case LLM_ARCH_T5ENCODER:
4764 {
4765 const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
4766
4767 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4768
4769 // output
4770 output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
4771 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4772 // if output is NULL, init from the input tok embed
4773 if (output == NULL) {
4774 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4775 }
4776
4777 for (int i = 0; i < n_layer; ++i) {
4778 auto & layer = layers[i];
4779
4780 layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
4781 layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
4782
4783 layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4784 layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4785 layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4786 layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
4787
4788 layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
4789 layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
4790 layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4791 layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4792 }
4793 } break;
4794 case LLM_ARCH_JAIS:
4795 {
4796 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4797
4798 // output
4799 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4800 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
4801 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4802
4803 for (int i = 0; i < n_layer; ++i) {
4804 auto & layer = layers[i];
4805
4806 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4807 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
4808
4809 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
4810 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
4811
4812 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4813 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
4814
4815 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4816 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
4817
4818 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4819 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
4820
4821 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4822 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
4823
4824 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4825 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
4826 }
4827 } break;
4828 case LLM_ARCH_CHATGLM:
4829 {
4830 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4831
4832 // output
4833 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4834 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4835 // if output is NULL, init from the input tok embed
4836 if (output == NULL) {
4837 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4838 }
4839
4840 for (int i = 0; i < n_layer; ++i) {
4841 auto & layer = layers[i];
4842
4843 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4844 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4845 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4846
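// the fused QKV tensor is optional; fall back to split Q/K/V (with optional biases) when it is absent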
4847 if (layer.wqkv == nullptr) {
4848 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4849 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4850 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4851 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4852 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4853 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4854 }
4855
4856 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4857
4858 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4859
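                         // ffn_up is stored with width n_ff * 2: it packs the gate and up halves of the gated FFN into a single tensor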
4860 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
4861
4862 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
4863 }
4864 } break;
4865 case LLM_ARCH_GLM4:
4866 {
4867 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4868
4869 // output
4870 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4871 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4872 // if output is NULL, init from the input tok embed
4873 if (output == NULL) {
4874 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4875 }
4876
4877 for (int i = 0; i < n_layer; ++i) {
4878 auto & layer = layers[i];
4879
4880 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4881 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4882 layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
4883
4884 if (layer.wqkv == nullptr) {
4885 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4886 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4887 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4888 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4889 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4890 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4891 }
4892
4893 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4894
4895 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4896
4897 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4898 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
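                         // as in ChatGLM, the gate and up projections are packed into one ffn_up tensor of width n_ff * 2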
4899 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
4900
4901 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4902 }
4903 } break;
4904 case LLM_ARCH_GLM4_MOE:
4905 {
4906 const int64_t n_expert = hparams.n_expert;
4907 const int64_t n_expert_used = hparams.n_expert_used;
4908 const int64_t n_expert_shared = hparams.n_expert_shared;
4909
4910 GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
4911 GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
4912
4913 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4914
4915 // output
4916 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4917 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
4918 // if output is NULL, init from the input tok embed
4919 if (output == NULL) {
4920 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
4921 }
4922
4923                 // load all tensors, including the NextN layers, so the expected tensor count is satisfied;
4924                 // the forward pass only runs the regular layers and skips the trailing NextN layers
4925 for (int i = 0; i < n_layer; ++i) {
4926 int flags = 0;
4927 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4928 // skip all tensors in the NextN layers
4929 flags |= TENSOR_SKIP;
4930 }
4931
4932 auto & layer = layers[i];
4933
4934 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
4935
4936 // GLM-style attention with bias terms
4937 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
4938 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
4939 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
4940 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
4941 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
4942 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
4943
4944 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
4945
4946 // K/Q norm tensors (optional for GLM-4.5 355B variant)
4947 layer.attn_q_norm = create_tensor(
4948 tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4949 layer.attn_k_norm = create_tensor(
4950 tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4951
4952 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
4953
4954 // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
4955 // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
4956 const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
4957
4958 if (use_moe) {
4959 // MoE layers
4960 layer.ffn_gate_inp =
4961 create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
4962 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
4963
4964 // MoE branch
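                             // per-expert FFN width: taken from n_ff_exp when present in the metadata,
                             // otherwise an even split of n_ff across the active experts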
4965 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
4966
4967 layer.ffn_gate_exps = create_tensor(
4968 tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4969 layer.ffn_down_exps = create_tensor(
4970 tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
4971 layer.ffn_up_exps = create_tensor(
4972 tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4973
4974 // Shared expert
4975 if (n_expert_shared > 0) {
4976 const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
4977 layer.ffn_gate_shexp = create_tensor(
4978 tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4979 layer.ffn_down_shexp = create_tensor(
4980 tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
4981 layer.ffn_up_shexp = create_tensor(
4982 tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4983 }
4984 } else {
4985 // Dense layers (first k layers) - GLM uses separate gate/up projections
4986 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
4987 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
4988 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
4989 }
4990
4991 // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
4992 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4993 layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
4994 layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
4995 layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
4996
4997 // Optional tensors
4998 layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
4999 layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5000 layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5001 }
5002 }
5003 }
5004 break;
5005 case LLM_ARCH_NEMOTRON:
5006 {
5007 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5008
5009 // output
5010 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5011 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5012 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5013
5014 for (int i = 0; i < n_layer; ++i) {
5015 auto & layer = layers[i];
5016
5017 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5018 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5019
5020 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5021 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
5022 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
5023 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5024
5025 // optional bias tensors
5026 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5027 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5028 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5029 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5030
5031 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5032 layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
5033
5034 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5035 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5036
5037 // optional MLP bias
5038 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5039 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
5040 }
5041 } break;
5042 case LLM_ARCH_NEMOTRON_H:
5043 {
5044 // mamba2 Mixer SSM params
5045 // NOTE: int64_t for tensor dimensions
5046 const int64_t d_conv = hparams.ssm_d_conv;
5047 const int64_t d_inner = hparams.ssm_d_inner;
5048 const int64_t d_state = hparams.ssm_d_state;
5049 const int64_t n_ssm_head = hparams.ssm_dt_rank;
5050 const int64_t n_group = hparams.ssm_n_group;
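                 // fused SSM input projection width (mamba2 layout): x and z streams (2*d_inner),
                 // B and C for each group (2*n_group*d_state), plus one dt value per SSM head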
5051 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
5052
5053 // embeddings
5054 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5055
5056 // output
5057 {
5058 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5059 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5060 // if output is NULL, init from the input tok embed, duplicated to allow offloading
5061 if (output == NULL) {
5062 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5063 }
5064 }
5065
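                 // Nemotron-H interleaves three block types: recurrent mamba2 (SSM) blocks,
                 // attention blocks (identified by n_ff == 0), and plain MLP blocks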
5066 for (int i = 0; i < n_layer; ++i) {
5067 auto & layer = layers[i];
5068
5069 // all blocks use the attn norm
5070 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5071
5072                     if (hparams.is_recurrent(i)) {
5073 // ssm layers
5074 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
5075
5076 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
5077 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
5078
5079 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
5080
5081 // no "weight" suffix for these
5082 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
5083 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
5084
5085 layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
5086
5087 // out_proj
5088 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
5089                     } else if (hparams.n_ff(i) == 0) {
5090                         // attention layers (with optional bias)
5091                         const int64_t n_head_i = hparams.n_head(i);
5092                         const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
5093                         const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
5094 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
5095 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
5096 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
5097 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
5098 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5099 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
5100 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
5101 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5102 } else {
5103 // mlp layers
5104                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
5105                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
5106                         layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5107                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
5108 }
5109 }
5110 } break;
5111 case LLM_ARCH_EXAONE:
5112 {
5113 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5114
5115 // output
5116 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5117 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5118
5119 // if output is NULL, init from the input tok embed
5120 if (output == NULL) {
5121 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5122 }
5123
5124 for (int i = 0; i < n_layer; ++i) {
5125 auto & layer = layers[i];
5126
5127 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5128
5129 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5130 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5131 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5132 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5133
5134 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
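                         // rope frequency factors are optional and shared across layers; layers after the first
                         // reference the same tensor via TENSOR_DUPLICATED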
5135 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5136 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5137 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5138 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5139 }
5140 } break;
5141 case LLM_ARCH_EXAONE4:
5142 {
5143 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5144
5145 // output
5146 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5147 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5148
5149 // if output is NULL, init from the input tok embed
5150 if (output == NULL) {
5151 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5152 }
5153
5154 for (int i = 0; i < n_layer; ++i) {
5155 auto & layer = layers[i];
5156
5157 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5158 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5159 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5160 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5161
5162 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5163
5164 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5165 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5166 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5167
5168 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5169 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5170 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5171 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5172 }
5173 } break;
5174 case LLM_ARCH_RWKV6:
5175 {
5176 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5177
5178 // Block 0, LN0
5179 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5180 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
5181
5182 // output
5183 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5184 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5185 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5186
5187 const int time_mix_extra_dim = hparams.time_mix_extra_dim;
5188 const int time_decay_extra_dim = hparams.time_decay_extra_dim;
5189 const int head_size = hparams.wkv_head_size;
5190 const int attn_hidden_size = n_embd;
5191 const int ffn_size = hparams.n_ff_arr[0];
5192
5193 for (int i = 0; i < n_layer; ++i) {
5194 auto & layer = layers[i];
5195
5196 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5197 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5198
5199 layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
5200 layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
5201
5202 layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
5203 layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
5204
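                         // token-shift interpolation weights: either individual lerp tensors or a single fused
                         // {n_embd, 1, 1, 5} tensor; the assert below requires at least one of the fused tensor or lerp_w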
5205 layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
5206 layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5207 layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5208 layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5209 layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5210 layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
5211 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
5212 GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
5213
5214 layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
5215 layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
5216 layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
5217 layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
5218 layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
5219 layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
5220 layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5221 layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
5222
5223 layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
5224 layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
5225 layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5226
5227 layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
5228 layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
5229
5230 layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
5231 layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
5232 layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
5233 }
5234
5235 } break;
5236 case LLM_ARCH_RWKV6QWEN2:
5237 {
5238 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5239
5240 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5241 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
5242 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5243
5244 const int time_mix_extra_dim = hparams.time_mix_extra_dim;
5245 const int time_decay_extra_dim = hparams.time_decay_extra_dim;
5246 const int head_size = hparams.wkv_head_size;
5247 const int attn_hidden_size = n_embd;
5248 const int n_head_kv = hparams.n_head_kv();
5249 int attn_key_value_size;
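                 // K/V projection width: the full hidden size unless fewer KV heads are used (GQA-style),
                 // in which case it shrinks to n_head_kv * head_size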
5250 if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
5251 attn_key_value_size = attn_hidden_size;
5252 } else {
5253 attn_key_value_size = n_head_kv * head_size;
5254 }
5255
5256 for (int i = 0; i < n_layer; ++i) {
5257 auto & layer = layers[i];
5258
5259 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5260
5261 layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
5262 layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
5263
5264 layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
5265 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5266
5267 layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
5268 layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
5269 layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
5270 layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
5271 layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
5272 layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
5273 layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5274 layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
5275 // optional bias tensors
5276 layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5277 layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
5278 layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
5279
5280 layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5281
5282 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5283
5284 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5285 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5286 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5287 }
5288 } break;
5289 case LLM_ARCH_RWKV7:
5290 {
5291 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5292
5293 // Block 0, LN0
5294 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5295 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
5296
5297 // output
5298 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5299 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5300 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5301
5302 const int n_lora_decay = hparams.n_lora_decay;
5303 const int n_lora_iclr = hparams.n_lora_iclr;
5304 const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
5305 const int n_lora_gate = hparams.n_lora_gate;
5306 const int attn_hidden_size = n_embd;
5307 const int ffn_size = hparams.n_ff_arr[0];
5308
5309 for (int i = 0; i < n_layer; ++i) {
5310 auto & layer = layers[i];
5311
5312 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5313 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5314
5315 layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
5316 layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
5317
5318 layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
5319 layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
5320 layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
5321
5322 layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
5323 layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
5324 layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
5325
5326 if (i == 0) {
5327 // actually not used
5328 layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5329 layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
5330 layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
5331 } else {
5332 layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5333 layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
5334 layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
5335 }
5336
5337 layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
5338 layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
5339
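                         // RWKV7 fuses six token-shift interpolation coefficients (r, w, k, v, a, g) into a single tensor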
5340 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
5341
5342 layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
5343 layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
5344 layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
5345
5346 layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
5347 layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
5348 layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5349
5350 layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
5351 layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
5352 layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5353
5354 layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
5355
5356 layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
5357 layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
5358 }
5359
5360 } break;
5361 case LLM_ARCH_ARWKV7:
5362 {
5363 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5364
5365 // output
5366 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5367 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5368
5369 const int n_lora_decay = hparams.n_lora_decay;
5370 const int n_lora_iclr = hparams.n_lora_iclr;
5371 const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
5372 const int n_lora_gate = hparams.n_lora_gate;
5373 const int attn_hidden_size = n_embd;
5374
5375 for (int i = 0; i < n_layer; ++i) {
5376 auto & layer = layers[i];
5377
5378 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5379
5380 layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
5381 layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
5382 layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
5383
5384 layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
5385 layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
5386 layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
5387
5388 if (i == 0) {
5389 // actually not used
5390 layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5391 layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
5392 layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
5393 } else {
5394 layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
5395 layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
5396 layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
5397 }
5398
5399 layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
5400 layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
5401
5402 try {
5403 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
5404 } catch(std::runtime_error & e) {
5405                             // ARWKV models without gate tensors fuse only five lerp components instead of six
5406 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
5407 }
5408
5409 layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
5410 layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
5411 layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
5412
5413 layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
5414 layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
5415 layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
5416
5417 layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
5418 layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5419 layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
5420
5421 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5422
5423 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5424 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5425 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5426 }
5427
5428 } break;
5429 case LLM_ARCH_CHAMELEON:
5430 {
5431 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5432
5433 // output
5434 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5435 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5436 // if output is NULL, init from the input tok embed
5437 if (output == NULL) {
5438 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5439 }
5440
5441 for (int i = 0; i < n_layer; ++i) {
5442 auto & layer = layers[i];
5443
5444 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5445 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
5446 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
5447 layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
5448 layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
5449
5450 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5451 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
5452 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
5453 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5454
5455 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5456
5457 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5458 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5459 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5460 }
5461 } break;
5462 case LLM_ARCH_WAVTOKENIZER_DEC:
5463 {
5464 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
5465
5466 conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
5467 conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
5468
5469 // posnet
5470 {
5471 const int64_t n_embd = hparams.posnet.n_embd;
5472
5473 for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
5474 auto & layer = layers[i].posnet;
5475
5476 // posnet:
5477 //
5478 // - resnet
5479 // - resnet
5480 // - attn
5481 // - resnet
5482 // - resnet
5483 // - norm
5484 //
5485 switch (i) {
5486 case 0:
5487 case 1:
5488 case 3:
5489 case 4:
5490 {
5491 layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
5492 layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
5493
5494 layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
5495 layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
5496
5497 layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
5498 layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
5499
5500 layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
5501 layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
5502 } break;
5503 case 2:
5504 {
5505 layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
5506 layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
5507
5508 layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
5509 layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
5510
5511 layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
5512 layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
5513
5514 layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
5515 layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
5516
5517 layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
5518 layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
5519 } break;
5520 case 5:
5521 {
5522 layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
5523 layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
5524 } break;
5525 default: GGML_ABORT("unknown posnet layer");
5526 };
5527 }
5528 }
5529
5530 GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
5531
5532 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
5533 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
5534
5535 // convnext
5536 {
5537 const int64_t n_embd = hparams.convnext.n_embd;
5538
5539 for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
5540 auto & layer = layers[i].convnext;
5541
5542 layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
5543 layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
5544
5545 layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
5546 layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
5547
5548 layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
5549 layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
5550
5551 layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
5552 layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
5553
5554 layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
5555 }
5556
5557 // output
5558 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5559 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5560 }
5561
5562 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
5563 output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
5564 } break;
5565 case LLM_ARCH_BAILINGMOE:
5566 {
5567 const int64_t n_ff_exp = hparams.n_ff_exp;
5568 const int64_t n_expert_shared = hparams.n_expert_shared;
5569
5570 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5571
5572 // output
5573 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5574 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5575
5576 for (int i = 0; i < n_layer; ++i) {
5577 auto & layer = layers[i];
5578
5579 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5580
5581 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
5582 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5583 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5584 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
5585 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5586
5587 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5588
5589 if (n_expert == 0) {
5590 throw std::runtime_error("n_expert must be > 0");
5591 }
5592 if (n_expert_used == 0) {
5593 throw std::runtime_error("n_expert_used must be > 0");
5594 }
5595
5596 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5597 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5598 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5599
5600 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5601 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
5602 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5603 }
5604 } break;
5605 case LLM_ARCH_BAILINGMOE2:
5606 {
5607 const int64_t n_ff_exp = hparams.n_ff_exp;
5608 const int64_t n_expert_shared = hparams.n_expert_shared;
5609
5610 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5611
5612 // output
5613 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5614 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5615
5616 GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
5617 GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
5618
5619 for (int i = 0; i < n_layer; ++i) {
5620 int flags = 0;
5621 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5622 // skip all tensors in the NextN layers
5623 flags |= TENSOR_SKIP;
5624 }
5625
5626 auto & layer = layers[i];
5627
5628 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5629
5630 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
5631 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
5632
5633 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5634 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5635
5636 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5637
5638 if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
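                             // shared-expert FFN width: prefer n_ff_shexp from the metadata, otherwise reuse the
                             // per-expert width, scaled by the number of shared experts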
5639 const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
5640
5641 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5642 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5643
5644 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
5645 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5646 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
5647
5648 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5649 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5650 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5651 } else { // Dense layers
5652 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5653 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
5654 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
5655 }
5656
5657 // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5658 if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5659 layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5660 layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
5661 layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5662 layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5663 layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
5664 layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
5665 layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
5666 }
5667 }
5668 } break;
5669 case LLM_ARCH_DOTS1:
5670 {
5671 const int64_t n_ff_exp = hparams.n_ff_exp;
5672 const int64_t n_expert_shared = hparams.n_expert_shared;
5673
5674 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5675
5676 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5677 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5678
5679 for (int i = 0; i < n_layer; ++i) {
5680 auto & layer = layers[i];
5681
5682 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5683
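                         // dots1 K/V projections use the full head count here (no GQA grouping of the KV heads)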
5684 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5685 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5686 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5687 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5688
5689 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5690 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5691
5692 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5693
5694 if (i < (int) hparams.n_layer_dense_lead) {
5695 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5696 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5697 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5698 } else {
5699 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5700 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5701
5702 if (n_expert == 0) {
5703 throw std::runtime_error("n_expert must be > 0");
5704 }
5705 if (n_expert_used == 0) {
5706 throw std::runtime_error("n_expert_used must be > 0");
5707 }
5708
5709 // MoE branch
5710 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5711 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5712 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5713
5714 // Shared expert branch
5715 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5716 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
5717 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5718 }
5719 }
5720 } break;
5721 case LLM_ARCH_ARCEE:
5722 {
5723 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5724
5725 // output
5726 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5727 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5728
5729 // if output is NULL, init from the input tok embed
5730 if (output == NULL) {
5731 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5732 }
5733
5734 for (int i = 0; i < n_layer; ++i) {
5735 auto & layer = layers[i];
5736
5737 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5738
5739 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5740 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5741 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5742 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5743
5744 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5745
5746 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5747
5748 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5749 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5750 }
5751 } break;
5752 case LLM_ARCH_ERNIE4_5:
5753 case LLM_ARCH_ERNIE4_5_MOE:
5754 {
5755 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5756
5757 // output
5758 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5759 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5760 // if output is NULL, init from the input tok embed
5761 if (output == NULL) {
5762 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5763 }
5764
5765 for (int i = 0; i < n_layer; ++i) {
5766 auto & layer = layers[i];
5767
5768 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5769
5770 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5771 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
5772 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
5773 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5774
5775 // optional bias tensors
5776 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5777 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5778 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5779 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5780
5781 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5782
5783 if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
5784 int n_ff_exp = hparams.n_ff_exp;
5785
5786 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5787 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5788 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
5789 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
5790 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
5791
5792 // Shared expert (if present)
5793 if (hparams.n_ff_shexp > 0) {
5794 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
5795 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
5796 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
5797 }
5798 } else { // Dense layers
5799 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5800 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5801 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5802 }
5803 }
5804 } break;
5805 case LLM_ARCH_FALCON_H1:
5806 {
5807 // Common
5808 const int64_t hidden_size = hparams.n_embd; // hidden_size
5809
5810 // mamba2 Mixer SSM params
5811 const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
5812 const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
5813 const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
5814 const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
5815 const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
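                     // the conv input carries x plus B and C for every group; the full input projection additionally
                     // produces the gate stream and one dt value per SSM head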
5816 const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
5817 const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
5818
5819 // attn params
5820                     const int64_t attn_num_attention_head = hparams.n_head(0);
5821                     const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
5822
5823 // ffn params
5824                     const int64_t ffn_intermediate_size = hparams.n_ff(0);
5825
5826 // embeddings
5827 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
5828
5829 // output
5830 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
5831 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
5832
5833 // if output is NULL, init from the input tok embed
5834 if (output == NULL) {
5835 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
5836 }
5837
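                 // every Falcon-H1 block carries SSM, attention and FFN tensors side by side,
                 // so all three groups are loaded for each layer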
5838 for (int i = 0; i < n_layer; ++i) {
5839 auto & layer = layers[i];
5840
5841 /*SSM LAYERS*/
5842 // ssm in
5843 layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
5844 // ssm 1d conv
5845 layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
5846 layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
5847 // ssm_dt
5848 layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
5849 // no "weight" suffix for these
5850 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
5851 layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
5852 // ssm_norm
5853 layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
5854 // out_proj
5855 layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
5856
5857 /*ATTENTION LAYERS*/
5858 // attention layers (with optional bias)
5859 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
5860 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
5861 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
5862 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
5863 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
5864 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
5865 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
5866 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
5867 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
5868
5869
5870 // feed forward (w/ optional biases)
5871 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
5872 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5873 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
5874 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
5875 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
5876
5877 layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
5878 layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
5879 layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
5880 }
5881 } break;
5882 case LLM_ARCH_HUNYUAN_MOE:
5883 {
5884 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5885
5886 // output
5887 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5888 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5889 // if output is NULL, init from the input tok embed
5890 if (output == NULL) {
5891 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5892 }
5893
5894 for (int i = 0; i < n_layer; ++i) {
5895 auto & layer = layers[i];
5896
5897 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5898
5899 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5900 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5901 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5902 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5903
5904 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5905 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5906
5907 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5908
5909 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
5910 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
5911 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
5912 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
5913
5914 layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
5915 layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
5916 layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
5917 }
5918 } break;
5919 case LLM_ARCH_HUNYUAN_DENSE:
5920 {
5921 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5922
5923 // output
5924 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5925 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5926 // if output is NULL, init from the input tok embed
5927 if (output == NULL) {
5928 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5929 }
5930
5931 for (int i = 0; i < n_layer; ++i) {
5932 auto & layer = layers[i];
5933
5934 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5935
5936 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5937 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5938 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5939 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5940
5941 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5942 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5943
5944 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5945
5946 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5947 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5948 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5949
5950 }
5951 } break;
5952 case LLM_ARCH_SMOLLM3:
5953 {
5954 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5955
5956 // output
5957 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5958 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5959
5960 // if output is NULL, init from the input tok embed
5961 if (output == NULL) {
5962 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5963 }
5964
5965 for (int i = 0; i < n_layer; ++i) {
5966 auto & layer = layers[i];
5967
5968 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5969
5970 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5971 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5972 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5973 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5974
5975 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5976 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5977 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5978 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5979 }
5980 } break;
5981 case LLM_ARCH_OPENAI_MOE:
5982 {
5983 const int64_t n_ff_exp = hparams.n_ff_exp;
5984
5985 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5986
5987 // output
5988 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5989 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5990
5991 for (int i = 0; i < n_layer; ++i) {
5992 auto & layer = layers[i];
5993
5994 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5995 layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5996
5997 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
5998 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5999 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
6000 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
6001
6002 layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
6003
6004 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
6005 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6006 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
6007 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6008
6009 // bias
6010 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
6011 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
6012 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
6013 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
6014
6015 layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
6016 layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
6017 layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
6018 layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
6019 }
6020 } break;
6021 case LLM_ARCH_LFM2:
6022 case LLM_ARCH_LFM2MOE:
6023 {
6024 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6025 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
6026 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6027
6028 if (output == NULL) {
6029 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6030 }
6031
6032 for (int i = 0; i < n_layer; ++i) {
6033 auto & layer = layers[i];
6034
6035 const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
6036
6037 // ffn/moe is same for transformer and conv layers
6038 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6039 if (is_moe_layer) {
6040 GGML_ASSERT(n_expert && n_expert_used);
6041 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6042 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
6043 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
6044 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
6045 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6046 } else { // dense
6047 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6048 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6049 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6050 }
6051
6052 // for operator_norm
6053 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6054
6055 if (!hparams.is_recurrent(i)) {
6056 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6057 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6058 GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
6059
6060 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
6061 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
6062 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
6063
6064 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
6065 } else {
6066 layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
6067 layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
6068 layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
6069 }
6070 }
6071 } break;
6072 case LLM_ARCH_SMALLTHINKER:
6073 {
6074 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6075
6076 // output
6077 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6078 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6079
6080 // if output is NULL, init from the input tok embed
6081 if (output == NULL) {
6082 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6083 }
6084
6085 for (int i = 0; i < n_layer; ++i) {
6086 auto & layer = layers[i];
6087
6088 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6089
6090 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6091 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6092 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6093 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6094
6095 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6096
6097 GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
6098 GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
6099
6100 // MoE branch
6101 const int64_t n_ff_exp = hparams.n_ff_exp;
6102 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
6103 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6104 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
6105 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6106 }
6107 } break;
6108 case LLM_ARCH_GROVEMOE:
6109 {
6110 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6111
6112 // output
6113 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6114 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6115 // if output is NULL, init from the input tok embed
6116 if (output == NULL) {
6117 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6118 }
6119
6120 GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
6121 GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
6122 GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
6123
6124 for (int i = 0; i < n_layer; ++i) {
6125 auto & layer = layers[i];
6126
6127 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6128
6129 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6130 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
6131 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
6132 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6133
6134 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
6135 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
6136
6137 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6138
6139 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6140
6141 // MoE branch
6142 const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
6143 const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
6144 const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
6145
6146 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6147 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
6148 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
6149
6150 layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
6151 layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
6152 layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
6153 }
6154 } break;
6155 case LLM_ARCH_APERTUS:
6156 {
6157 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6158
6159 // output
6160 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
6161 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
6162
6163 for (int i = 0; i < n_layer; ++i) {
6164 auto & layer = layers[i];
6165
6166 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6167
6168 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
6169 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6170 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6171 } else {
6172 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6173 }
6174
6175 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6176 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6177 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6178 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6179
6180 // optional bias tensors
6181 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
6182 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6183 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
6184 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
6185
6186 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
6187 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
6188 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
6189
6190 // Q and K layernorms for Apertus
6191 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
6192 layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6193 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
6194 layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
6195 }
6196 } break;
6197 case LLM_ARCH_MINIMAX_M2:
6198 {
6199 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6200
6201 // output
6202 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6203 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6204
6205 for (int i = 0; i < n_layer; ++i) {
6206 auto & layer = layers[i];
6207
6208 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
6209 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
6210 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
6211 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
6212
6213 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6214 layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
6215 layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
6216
6217 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6218
6219 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
6220 layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
6221 layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
6222 layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
6223 layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6224 }
6225 } break;
6226 case LLM_ARCH_COGVLM:
6227 {
6228 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6229
6230 // output
6231 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6232 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6233
6234 // if output is NULL, init from the input tok embed
6235 if (output == NULL) {
6236 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6237 }
6238
6239 for (int i = 0; i < n_layer; ++i) {
6240 auto & layer = layers[i];
6241
6242 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6243 layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
6244 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6245
6246 layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
6247 layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6248
6249 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6250
6251 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6252 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6253 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6254 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6255
6256 layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6257 layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6258 layer.visexp_ffn_up = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6259 }
6260 } break;
6261 case LLM_ARCH_PANGU_EMBED:
6262 {
6263 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6264
6265 // output
6266 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6267 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
6268
6269 // if output is NULL, init from the input tok embed
6270 if (output == NULL) {
6271 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
6272 }
6273
6274 for (int i = 0; i < n_layer; ++i) {
6275 auto & layer = layers[i];
6276
6277 layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6278
6279 // weight tensors
6280 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
6281 layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
6282 layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
6283 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
6284
6285 // bias tensors
6286 layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head}, 0);
6287 layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
6288 layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
6289 layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
6290
6291 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
6292
6293 if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
6294 layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6295 layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6296 } else {
6297 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
6298 }
6299
6300 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
6301 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
6302 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6303 }
6304 } break;
6305 default:
6306 throw std::runtime_error("unknown architecture");
6307 }
6308
6309 if (n_moved_tensors > 0) {
6310 LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
6311 __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
6312 ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
6313 }
6314 }
6315
6316 ml.done_getting_tensors();
6317
6318 ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
6319 pimpl->mappings.reserve(ml.mappings.size());
6320
6321 // create the backend buffers
6322 std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
6323 ctx_buf_maps.reserve(ctx_map.size());
6324
6325 // Ensure we have enough capacity for the maximum backend buffer we will potentially create
6326 const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6327 pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
6328
6329 for (auto & [buft, ctx_ptr] : ctx_map) {
6330 ggml_context * ctx = ctx_ptr.get();
6331
6332 // skip contexts without tensors
6333 if (ggml_get_first_tensor(ctx) == nullptr) {
6334 continue;
6335 }
6336
6337 llama_buf_map buf_map;
6338 buf_map.reserve(n_max_backend_buffer);
6339
6340 // check if it is possible to use buffer_from_host_ptr with this buffer type
6341 ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
6342 if (!dev) {
6343 // FIXME: workaround for CPU backend buft having a NULL device
6344 dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
6345 if (!dev) {
6346 throw std::runtime_error(format("%s: no CPU backend found", __func__));
6347 }
6348 }
6349 ggml_backend_dev_props props;
6350 ggml_backend_dev_get_props(dev, &props);
6351 bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
6352 bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
6353
6354 std::vector<ggml_backend_buffer_ptr> bufs;
6355 if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
6356 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6357 // only the mmap region containing the tensors in the model is mapped to the backend buffer
6358 // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
6359 // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
6360 void * addr = nullptr;
6361 size_t first, last; // NOLINT
6362 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
6363 if (first >= last) {
6364 continue;
6365 }
6366 const size_t max_size = ggml_get_max_tensor_size(ctx);
6367 ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
6368 if (buf == nullptr) {
6369 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6370 }
6371 bufs.emplace_back(buf);
6372 buf_map.emplace(idx, buf);
6373 }
6374 }
6375 else {
6376 ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
6377 if (buf == nullptr) {
6378 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
6379 }
6380 if (use_mlock && ggml_backend_buffer_is_host(buf)) {
6381 pimpl->mlock_bufs.emplace_back(new llama_mlock);
6382 auto & mlock_buf = pimpl->mlock_bufs.back();
6383 mlock_buf->init(ggml_backend_buffer_get_base(buf));
6384 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
6385 }
6386 bufs.emplace_back(buf);
6387 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
6388 buf_map.emplace(idx, buf);
6389 }
6390 }
6391 pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
6392
6393 for (auto & buf : buf_map) {
6394 // indicate that this buffer contains weights
6395 // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
6396 ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
6397 }
6398
6399 ctx_buf_maps.emplace_back(ctx, buf_map);
6400 }
6401
6402 if (llama_supports_gpu_offload()) {
6403 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
6404
6405 LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
6406 if (n_gpu_layers > (int) hparams.n_layer) {
6407 LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
6408 }
6409
6410 const int max_backend_supported_layers = hparams.n_layer + 1;
6411 const int max_offloadable_layers = hparams.n_layer + 1;
6412
6413 LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
6414 }
6415
6416 // print memory requirements per buffer type
6417 for (auto & [_, bufs] : pimpl->ctxs_bufs) {
6418 for (auto & buf: bufs) {
6419 LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
6420 __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
6421 }
6422 }
6423
6424 // populate tensors_by_name
6425 for (auto & [ctx, _] : pimpl->ctxs_bufs) {
6426 for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
6427 tensors_by_name.emplace_back(ggml_get_name(cur), cur);
6428 }
6429 }
6430
6431 // load tensor data
6432 for (auto & [ctx, buf_map] : ctx_buf_maps) {
6433 if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
6434 return false;
6435 }
6436 }
6437
6438 if (use_mmap_buffer) {
6439 for (auto & mapping : ml.mappings) {
6440 pimpl->mappings.emplace_back(std::move(mapping));
6441 }
6442 }
6443
6444 return true;
6445}
6446
6447std::string llama_model::arch_name() const {
6448 return llm_arch_name(arch);
6449}
6450
6451std::string llama_model::type_name() const {
6452 return llm_type_name(type);
6453}
6454
6455std::string llama_model::desc() const {
6456 return pimpl->desc_str;
6457}
6458
6459size_t llama_model::size() const {
6460 return pimpl->n_bytes;
6461}
6462
6463size_t llama_model::n_tensors() const {
6464 return tensors_by_name.size();
6465}
6466
6467size_t llama_model::n_devices() const {
6468 return devices.size();
6469}
6470
6471std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
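// sum the buffer sizes per buffer type across all model contexts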
6472 std::map<ggml_backend_buffer_type_t, size_t> ret;
6473 for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
6474 for (const auto & buf : bufs) {
6475 ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
6476 }
6477 }
6478 return ret;
6479}
6480
6481uint64_t llama_model::n_elements() const {
6482 return pimpl->n_elements;
6483}
6484
6485void llama_model::print_info() const {
6486 const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
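// helper: prints a per-layer hparam as a single value when it is constant across layers, otherwise as the full list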
6487
6488 auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
6489 bool is_var = false;
6490
6491 std::vector<uint32_t> v;
6492 for (uint32_t i = 0; i < n; ++i) {
6493 v.push_back(f(i));
6494 if (v[i] != v[0]) {
6495 is_var = true;
6496 }
6497 }
6498
6499 std::stringstream ss;
6500
6501 if (is_var) {
6502 ss << "[";
6503 for (uint32_t i = 0; i < n; ++i) {
6504 ss << v[i];
6505 if (i < n - 1) {
6506 ss << ", ";
6507 }
6508 }
6509 ss << "]";
6510 } else {
6511 ss << v[0];
6512 }
6513
6514 return ss.str();
6515 };
6516
6517 // hparams
6518 LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
6519 LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
6520
6521 if (!hparams.vocab_only) {
6522 LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
6523 LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
6524 LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
6525 LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
6526 LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
6527 LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
6528 LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
6529 LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
6530 LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
6531 LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
6532 LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
6533 LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
6534 LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
6535 LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
6536 LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
6537 LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
6538 LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
6539 LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
6540 LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
6541 LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
6542 LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
6543 LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
6544 LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
6545 LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
6546 LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
6547 LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
6548 LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
6549 LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
6550 LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
6551 LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
6552 LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
6553 LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
6554 LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
6555 // MRoPE (Multi-axis Rotary Position Embedding) sections
6556 if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
6557 LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
6558 }
6559 if (!classifier_labels.empty()) {
6560 LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
6561
6562 size_t i = 0;
6563 for (auto label : classifier_labels) {
6564 LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
6565 }
6566 }
6567 }
6568
6569 if (arch == LLM_ARCH_MAMBA ||
6570 arch == LLM_ARCH_MAMBA2 ||
6571 arch == LLM_ARCH_JAMBA ||
6572 arch == LLM_ARCH_FALCON_H1 ||
6573 arch == LLM_ARCH_PLAMO2 ||
6574 arch == LLM_ARCH_GRANITE_HYBRID ||
6575 arch == LLM_ARCH_NEMOTRON_H) {
6576 LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
6577 LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
6578 LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
6579 LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
6580 LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
6581 LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
6582 }
6583
6584 LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
6585 if (pimpl->n_elements >= 1e12) {
6586 LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
6587 } else if (pimpl->n_elements >= 1e9) {
6588 LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
6589 } else if (pimpl->n_elements >= 1e6) {
6590 LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
6591 } else {
6592 LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
6593 }
6594
6595 // general kv
6596 LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
6597
6598 if (arch == LLM_ARCH_DEEPSEEK) {
6599 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6600 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6601 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6602 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6603 }
6604
6605 if (arch == LLM_ARCH_DEEPSEEK2) {
6606 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6607 LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
6608 LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
6609 LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
6610 LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
6611 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6612 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6613 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6614 LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6615 LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6616 LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
6617 }
6618
6619 if (arch == LLM_ARCH_QWEN2MOE) {
6620 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6621 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6622 }
6623
6624 if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
6625 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6626 }
6627
6628 if (arch == LLM_ARCH_MINICPM ||
6629 arch == LLM_ARCH_GRANITE ||
6630 arch == LLM_ARCH_GRANITE_MOE ||
6631 arch == LLM_ARCH_GRANITE_HYBRID) {
6632 LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
6633 LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
6634 LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
6635 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6636 }
6637
6638 if (arch == LLM_ARCH_BAILINGMOE) {
6639 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6640 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6641 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6642 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6643 LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6644 }
6645
6646 if (arch == LLM_ARCH_BAILINGMOE2) {
6647 LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6648 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6649 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6650 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6651 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6652 LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6653 LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6654 LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
6655 }
6656
6657 if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
6658 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6659 LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6660 }
6661
6662 if (arch == LLM_ARCH_GROVEMOE) {
6663 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6664 LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
6665 LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
6666 LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
6667 }
6668
6669 vocab.print_info();
6670}
6671
6672ggml_backend_dev_t llama_model::dev_layer(int il) const {
6673 return pimpl->dev_layer.at(il).dev;
6674}
6675
6676ggml_backend_dev_t llama_model::dev_output() const {
6677 return pimpl->dev_output.dev;
6678}
6679
6680template<typename F>
6681static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
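// build a dummy op with fn in a no-alloc context and check whether dev supports it when its inputs live in buft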
6682 ggml_init_params params = {
6683 /*.mem_size =*/ ggml_tensor_overhead()*8,
6684 /*.mem_buffer =*/ NULL,
6685 /*.no_alloc =*/ true,
6686 };
6687
6688 ggml_context_ptr ctx { ggml_init(params) };
6689 if (!ctx) {
6690 throw std::runtime_error(format("failed to create ggml context"));
6691 }
6692
6693 ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
6694 ggml_tensor * op_tensor = fn(ctx.get());
6695 for (int i = 0; i < GGML_MAX_SRC; i++) {
6696 if (op_tensor->src[i] != nullptr) {
6697 assert(op_tensor->src[i]->buffer == nullptr);
6698 op_tensor->src[i]->buffer = buf.get();
6699 }
6700 }
6701
6702 bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
6703
6704 return op_supported;
6705}
6706
6707template<typename F>
6708static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
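// return the first buffer type in the list whose device supports the op built by fn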
6709 for (const auto & cur : buft_list) {
6710 ggml_backend_dev_t cur_dev = cur.first;
6711 ggml_backend_buffer_type_t cur_buft = cur.second;
6712 if (buft_supported(cur_buft, cur_dev, fn)) {
6713 return cur_buft;
6714 }
6715 }
6716
6717 throw std::runtime_error(format("no suitable buffer type found"));
6718}
6719
6720ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
6721 return ::select_buft(
6722 *pimpl->dev_layer.at(il).buft_list,
6723 [&](ggml_context * ctx) {
6724 ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
6725 ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
6726 return ggml_add(ctx, cur, layer_dir);
6727 });
6728}
6729
6730bool llama_model::has_tensor_overrides() const {
6731 return pimpl->has_tensor_overrides;
6732}
6733
6734const ggml_tensor * llama_model::get_tensor(const char * name) const {
6735 auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
6736 [name](const std::pair<std::string, ggml_tensor *> & it) {
6737 return it.first == name;
6738 });
6739 if (it == tensors_by_name.end()) {
6740 return nullptr;
6741 }
6742
6743 return it->second;
6744}
6745
6746float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
6747 return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
6748}
6749
6750float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
6751 return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
6752}
6753
6754ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
6755 const uint32_t n_ctx_seq = cparams.n_ctx_seq;
6756
6757 // choose long/short freq factors based on the context size
6758 if (layers[il].rope_freqs != nullptr) {
6759 return layers[il].rope_freqs;
6760 }
6761
6762 if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
6763 return layers[il].rope_long;
6764 }
6765
6766 return layers[il].rope_short;
6767}
6768
6769llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
6770 llama_memory_i * res;
6771
6772 switch (arch) {
6773 // Models that need specific instantiation should be handled in the
6774 // switch statement
6775 case LLM_ARCH_BERT:
6776 case LLM_ARCH_JINA_BERT_V2:
6777 case LLM_ARCH_JINA_BERT_V3:
6778 case LLM_ARCH_NOMIC_BERT:
6779 case LLM_ARCH_NOMIC_BERT_MOE:
6780 case LLM_ARCH_NEO_BERT:
6781 case LLM_ARCH_WAVTOKENIZER_DEC:
6782 case LLM_ARCH_GEMMA_EMBEDDING:
6783 case LLM_ARCH_DREAM:
6784 case LLM_ARCH_LLADA:
6785 case LLM_ARCH_LLADA_MOE:
6786 {
6787 res = nullptr;
6788 } break;
6789 // Models that need standard caching should rely on recurrent/hybrid
6790 // checks
6791 default:
6792 {
6793 if (llm_arch_is_recurrent(arch)) {
6794 res = new llama_memory_recurrent(
6795 *this,
6796 GGML_TYPE_F32,
6797 GGML_TYPE_F32,
6798 cparams.offload_kqv,
6799 std::max((uint32_t) 1, cparams.n_seq_max),
6800 cparams.n_seq_max,
6801 nullptr);
6802 } else if (llm_arch_is_hybrid(arch)) {
6803
6804 // The main difference between hybrid architectures is the
6805 // layer filters, so pick the right one here
6806 llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
6807 llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
6808 if (arch == LLM_ARCH_FALCON_H1) {
6809 filter_attn = [&](int32_t) { return true; };
6810 filter_recr = [&](int32_t) { return true; };
6811 } else if (arch == LLM_ARCH_NEMOTRON_H) {
6812 filter_attn = [&](int32_t il) {
6813 return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
6814 };
6815 filter_recr = [&](int32_t il) {
6816 return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
6817 };
6818 }
6819
6820 res = new llama_memory_hybrid(
6821 /* model */ *this,
6822 /* attn_type_k */ params.type_k,
6823 /* attn_type_v */ params.type_v,
6824 /* attn_v_trans */ !cparams.flash_attn,
6825 /* attn_kv_size */ cparams.n_ctx,
6826 /* attn_n_pad */ 1,
6827 /* attn_n_swa */ hparams.n_swa,
6828 /* attn_swa_type */ hparams.swa_type,
6829 /* recurrent_type_k */ GGML_TYPE_F32,
6830 /* recurrent_type_v */ GGML_TYPE_F32,
6831 /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
6832 /* n_seq_max */ cparams.n_seq_max,
6833 /* offload */ cparams.offload_kqv,
6834 /* unified */ cparams.kv_unified,
6835 /* filter_attn */ std::move(filter_attn),
6836 /* filter_recr */ std::move(filter_recr));
6837 } else {
6838 llama_memory_i::layer_reuse_cb reuse = nullptr;
6839
6840 if (arch == LLM_ARCH_GEMMA3N) {
6841 reuse = [&](int32_t il) {
6842 if (il >= (int32_t) hparams.n_layer_kv_from_start) {
6843 return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
6844 }
6845
6846 return -1;
6847 };
6848 }
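// standard attention: use the interleaved SWA cache when any layer uses sliding-window attention, otherwise a regular KV cache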
6849
6850 if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
6851 GGML_ASSERT(hparams.is_swa_any());
6852
6853 res = new llama_kv_cache_iswa(
6854 *this,
6855 params.type_k,
6856 params.type_v,
6857 !cparams.flash_attn,
6858 cparams.offload_kqv,
6859 params.swa_full,
6860 cparams.kv_unified,
6861 cparams.n_ctx_seq,
6862 cparams.n_seq_max,
6863 cparams.n_ubatch,
6864 1,
6865 nullptr,
6866 reuse);
6867 } else {
6868 GGML_ASSERT(!hparams.is_swa_any());
6869
6870 res = new llama_kv_cache(
6871 *this,
6872 params.type_k,
6873 params.type_v,
6874 !cparams.flash_attn,
6875 cparams.offload_kqv,
6876 cparams.kv_unified,
6877 cparams.n_ctx_seq,
6878 cparams.n_seq_max,
6879 1,
6880 hparams.n_swa,
6881 hparams.swa_type,
6882 nullptr,
6883 nullptr);
6884 }
6885 }
6886 }
6887 }
6888
6889 return res;
6890}
6891
6892ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
6893 std::unique_ptr<llm_graph_context> llm;
6894
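// each llm_build_* helper constructs the compute graph for its architecture family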
6895 switch (arch) {
6896 case LLM_ARCH_LLAMA:
6897 {
6898 llm = std::make_unique<llm_build_llama>(*this, params);
6899 } break;
6900 case LLM_ARCH_LLAMA4:
6901 {
6902 if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
6903 llm = std::make_unique<llm_build_llama>(*this, params);
6904 } else {
6905 llm = std::make_unique<llm_build_llama_iswa>(*this, params);
6906 }
6907 } break;
6908 case LLM_ARCH_DECI:
6909 {
6910 llm = std::make_unique<llm_build_deci>(*this, params);
6911 } break;
6912 case LLM_ARCH_BAICHUAN:
6913 {
6914 llm = std::make_unique<llm_build_baichuan>(*this, params);
6915 } break;
6916 case LLM_ARCH_FALCON:
6917 {
6918 llm = std::make_unique<llm_build_falcon>(*this, params);
6919 } break;
6920 case LLM_ARCH_GROK:
6921 {
6922 llm = std::make_unique<llm_build_grok>(*this, params);
6923 } break;
6924 case LLM_ARCH_STARCODER:
6925 {
6926 llm = std::make_unique<llm_build_starcoder>(*this, params);
6927 } break;
6928 case LLM_ARCH_REFACT:
6929 {
6930 llm = std::make_unique<llm_build_refact>(*this, params);
6931 } break;
6932 case LLM_ARCH_BERT:
6933 case LLM_ARCH_JINA_BERT_V2:
6934 case LLM_ARCH_JINA_BERT_V3:
6935 case LLM_ARCH_NOMIC_BERT:
6936 case LLM_ARCH_NOMIC_BERT_MOE:
6937 {
6938 llm = std::make_unique<llm_build_bert>(*this, params);
6939 } break;
6940 case LLM_ARCH_NEO_BERT:
6941 {
6942 llm = std::make_unique<llm_build_neo_bert>(*this, params);
6943 } break;
6944 case LLM_ARCH_BLOOM:
6945 {
6946 llm = std::make_unique<llm_build_bloom>(*this, params);
6947 } break;
6948 case LLM_ARCH_MPT:
6949 {
6950 llm = std::make_unique<llm_build_mpt>(*this, params);
6951 } break;
6952 case LLM_ARCH_STABLELM:
6953 {
6954 llm = std::make_unique<llm_build_stablelm>(*this, params);
6955 } break;
6956 case LLM_ARCH_QWEN:
6957 {
6958 llm = std::make_unique<llm_build_qwen>(*this, params);
6959 } break;
6960 case LLM_ARCH_QWEN2:
6961 {
6962 llm = std::make_unique<llm_build_qwen2>(*this, params);
6963 } break;
6964 case LLM_ARCH_DREAM:
6965 {
6966 llm = std::make_unique<llm_build_dream>(*this, params);
6967 }
6968 break;
6969 case LLM_ARCH_LLADA:
6970 {
6971 llm = std::make_unique<llm_build_llada>(*this, params);
6972 }
6973 break;
6974 case LLM_ARCH_LLADA_MOE:
6975 {
6976 llm = std::make_unique<llm_build_llada_moe>(*this, params);
6977 }
6978 break;
6979 case LLM_ARCH_QWEN2VL:
6980 {
6981 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
6982 } break;
6983 case LLM_ARCH_QWEN2MOE:
6984 {
6985 llm = std::make_unique<llm_build_qwen2moe>(*this, params);
6986 } break;
6987 case LLM_ARCH_QWEN3:
6988 {
6989 llm = std::make_unique<llm_build_qwen3>(*this, params);
6990 } break;
6991 case LLM_ARCH_QWEN3MOE:
6992 {
6993 llm = std::make_unique<llm_build_qwen3moe>(*this, params);
6994 } break;
6995 case LLM_ARCH_QWEN3VL:
6996 {
6997 llm = std::make_unique<llm_build_qwen3vl>(*this, params);
6998 } break;
6999 case LLM_ARCH_QWEN3VLMOE:
7000 {
7001 llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
7002 } break;
7003 case LLM_ARCH_PHI2:
7004 {
7005 llm = std::make_unique<llm_build_phi2>(*this, params);
7006 } break;
7007 case LLM_ARCH_PHI3:
7008 case LLM_ARCH_PHIMOE:
7009 {
7010 if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7011 llm = std::make_unique<llm_build_phi3<true>> (*this, params);
7012 } else {
7013 llm = std::make_unique<llm_build_phi3<false>>(*this, params);
7014 }
7015 } break;
7016 case LLM_ARCH_PLAMO:
7017 {
7018 llm = std::make_unique<llm_build_plamo>(*this, params);
7019 } break;
7020 case LLM_ARCH_PLAMO2:
7021 {
7022 llm = std::make_unique<llm_build_plamo2>(*this, params);
7023 } break;
7024 case LLM_ARCH_GPT2:
7025 {
7026 llm = std::make_unique<llm_build_gpt2>(*this, params);
7027 } break;
7028 case LLM_ARCH_CODESHELL:
7029 {
7030 llm = std::make_unique<llm_build_codeshell>(*this, params);
7031 } break;
7032 case LLM_ARCH_ORION:
7033 {
7034 llm = std::make_unique<llm_build_orion>(*this, params);
7035 } break;
7036 case LLM_ARCH_INTERNLM2:
7037 {
7038 llm = std::make_unique<llm_build_internlm2>(*this, params);
7039 } break;
7040 case LLM_ARCH_MINICPM3:
7041 {
7042 llm = std::make_unique<llm_build_minicpm3>(*this, params);
7043 } break;
7044 case LLM_ARCH_GEMMA:
7045 {
7046 llm = std::make_unique<llm_build_gemma>(*this, params);
7047 } break;
7048 case LLM_ARCH_GEMMA2:
7049 {
7050 llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
7051 } break;
7052 case LLM_ARCH_GEMMA3:
7053 {
7054 llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
7055 } break;
7056 case LLM_ARCH_GEMMA3N:
7057 {
7058 llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
7059 } break;
7060 case LLM_ARCH_GEMMA_EMBEDDING:
7061 {
7062 llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
7063 } break;
7064 case LLM_ARCH_STARCODER2:
7065 {
7066 llm = std::make_unique<llm_build_starcoder2>(args: *this, args: params);
7067 } break;
7068 case LLM_ARCH_MAMBA:
7069 case LLM_ARCH_MAMBA2:
7070 {
7071 llm = std::make_unique<llm_build_mamba>(args: *this, args: params);
7072 } break;
7073 case LLM_ARCH_JAMBA:
7074 {
7075 llm = std::make_unique<llm_build_jamba>(args: *this, args: params);
7076 } break;
7077 case LLM_ARCH_XVERSE:
7078 {
7079 llm = std::make_unique<llm_build_xverse>(args: *this, args: params);
7080 } break;
7081 case LLM_ARCH_COMMAND_R:
7082 {
7083 llm = std::make_unique<llm_build_command_r>(args: *this, args: params);
7084 } break;
7085 case LLM_ARCH_COHERE2:
7086 {
7087 llm = std::make_unique<llm_build_cohere2_iswa>(args: *this, args: params);
7088 } break;
7089 case LLM_ARCH_DBRX:
7090 {
7091 llm = std::make_unique<llm_build_dbrx>(args: *this, args: params);
7092 } break;
7093 case LLM_ARCH_OLMO:
7094 {
7095 llm = std::make_unique<llm_build_olmo>(args: *this, args: params);
7096 } break;
7097 case LLM_ARCH_OLMO2:
7098 {
7099 if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7100 llm = std::make_unique<llm_build_olmo2<true>>(args: *this, args: params);
7101 } else {
7102 llm = std::make_unique<llm_build_olmo2<false>>(args: *this, args: params);
7103 }
7104 } break;
7105 case LLM_ARCH_OLMOE:
7106 {
7107 llm = std::make_unique<llm_build_olmoe>(args: *this, args: params);
7108 } break;
7109 case LLM_ARCH_OPENELM:
7110 {
7111 llm = std::make_unique<llm_build_openelm>(args: *this, args: params);
7112 } break;
7113 case LLM_ARCH_GPTNEOX:
7114 {
7115 llm = std::make_unique<llm_build_gptneox>(args: *this, args: params);
7116 } break;
7117 case LLM_ARCH_ARCTIC:
7118 {
7119 llm = std::make_unique<llm_build_arctic>(args: *this, args: params);
7120 } break;
7121 case LLM_ARCH_DEEPSEEK:
7122 {
7123 llm = std::make_unique<llm_build_deepseek>(args: *this, args: params);
7124 } break;
7125 case LLM_ARCH_DEEPSEEK2:
7126 {
7127 llm = std::make_unique<llm_build_deepseek2>(args: *this, args: params);
7128 } break;
7129 case LLM_ARCH_CHATGLM:
7130 {
7131 llm = std::make_unique<llm_build_chatglm>(args: *this, args: params);
7132 } break;
7133 case LLM_ARCH_GLM4:
7134 {
7135 llm = std::make_unique<llm_build_glm4>(args: *this, args: params);
7136 } break;
7137 case LLM_ARCH_GLM4_MOE:
7138 {
7139 llm = std::make_unique<llm_build_glm4_moe>(args: *this, args: params);
7140 } break;
7141 case LLM_ARCH_BITNET:
7142 {
7143 llm = std::make_unique<llm_build_bitnet>(args: *this, args: params);
7144 } break;
7145 case LLM_ARCH_T5:
7146 {
7147 switch (params.gtype) {
7148 case LLM_GRAPH_TYPE_ENCODER:
7149 llm = std::make_unique<llm_build_t5_enc>(args: *this, args: params);
7150 break;
7151 case LLM_GRAPH_TYPE_DEFAULT:
7152 case LLM_GRAPH_TYPE_DECODER:
7153 llm = std::make_unique<llm_build_t5_dec>(args: *this, args: params);
7154 break;
7155 default:
7156 GGML_ABORT("invalid graph type");
7157 };
7158 } break;
7159 case LLM_ARCH_T5ENCODER:
7160 {
7161 llm = std::make_unique<llm_build_t5_enc>(args: *this, args: params);
7162 }
7163 break;
7164 case LLM_ARCH_JAIS:
7165 {
7166 llm = std::make_unique<llm_build_jais>(args: *this, args: params);
7167 } break;
7168 case LLM_ARCH_NEMOTRON:
7169 {
7170 llm = std::make_unique<llm_build_nemotron>(args: *this, args: params);
7171 } break;
7172 case LLM_ARCH_NEMOTRON_H:
7173 {
7174 llm = std::make_unique<llm_build_nemotron_h>(args: *this, args: params);
7175 } break;
7176 case LLM_ARCH_EXAONE:
7177 {
7178 llm = std::make_unique<llm_build_exaone>(args: *this, args: params);
7179 } break;
7180 case LLM_ARCH_EXAONE4:
7181 {
7182 if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7183 llm = std::make_unique<llm_build_exaone4<true>>(args: *this, args: params);
7184 } else {
7185 llm = std::make_unique<llm_build_exaone4<false>>(args: *this, args: params);
7186 }
7187 } break;
7188 case LLM_ARCH_RWKV6:
7189 {
7190 llm = std::make_unique<llm_build_rwkv6>(args: *this, args: params);
7191 } break;
7192 case LLM_ARCH_RWKV6QWEN2:
7193 {
7194 llm = std::make_unique<llm_build_rwkv6qwen2>(args: *this, args: params);
7195 } break;
7196 case LLM_ARCH_RWKV7:
7197 {
7198 llm = std::make_unique<llm_build_rwkv7>(args: *this, args: params);
7199 } break;
7200 case LLM_ARCH_ARWKV7:
7201 {
7202 llm = std::make_unique<llm_build_arwkv7>(args: *this, args: params);
7203 } break;
7204 case LLM_ARCH_GRANITE:
7205 case LLM_ARCH_GRANITE_MOE:
7206 case LLM_ARCH_MINICPM:
7207 {
7208 llm = std::make_unique<llm_build_granite>(args: *this, args: params);
7209 } break;
7210 case LLM_ARCH_GRANITE_HYBRID:
7211 {
7212 llm = std::make_unique<llm_build_granite_hybrid>(args: *this, args: params);
7213 } break;
7214 case LLM_ARCH_CHAMELEON:
7215 {
7216 llm = std::make_unique<llm_build_chameleon>(args: *this, args: params);
7217 } break;
7218 case LLM_ARCH_WAVTOKENIZER_DEC:
7219 {
7220 llm = std::make_unique<llm_build_wavtokenizer_dec>(args: *this, args: params);
7221 } break;
7222 case LLM_ARCH_PLM:
7223 {
7224 llm = std::make_unique<llm_build_plm>(args: *this, args: params);
7225 } break;
7226 case LLM_ARCH_BAILINGMOE:
7227 {
7228 llm = std::make_unique<llm_build_bailingmoe>(args: *this, args: params);
7229 } break;
7230 case LLM_ARCH_BAILINGMOE2:
7231 {
7232 llm = std::make_unique<llm_build_bailingmoe2>(args: *this, args: params);
7233 } break;
7234 case LLM_ARCH_SEED_OSS:
7235 {
7236 llm = std::make_unique<llm_build_seed_oss>(args: *this, args: params);
7237 } break;
7238 case LLM_ARCH_DOTS1:
7239 {
7240 llm = std::make_unique<llm_build_dots1>(args: *this, args: params);
7241 } break;
7242 case LLM_ARCH_ARCEE:
7243 {
7244 llm = std::make_unique<llm_build_arcee>(args: *this, args: params);
7245 } break;
7246 case LLM_ARCH_ERNIE4_5:
7247 {
7248 llm = std::make_unique<llm_build_ernie4_5>(args: *this, args: params);
7249 } break;
7250 case LLM_ARCH_ERNIE4_5_MOE:
7251 {
7252 llm = std::make_unique<llm_build_ernie4_5_moe>(args: *this, args: params);
7253 } break;
7254 case LLM_ARCH_HUNYUAN_MOE:
7255 {
7256 llm = std::make_unique<llm_build_hunyuan_moe>(args: *this, args: params);
7257 } break;
7258 case LLM_ARCH_HUNYUAN_DENSE:
7259 {
7260 llm = std::make_unique<llm_build_hunyuan_dense>(args: *this, args: params);
7261 } break;
7262 case LLM_ARCH_SMOLLM3:
7263 {
7264 llm = std::make_unique<llm_build_smollm3>(args: *this, args: params);
7265 } break;
7266 case LLM_ARCH_OPENAI_MOE:
7267 {
7268 llm = std::make_unique<llm_build_openai_moe_iswa>(args: *this, args: params);
7269 } break;
7270 case LLM_ARCH_FALCON_H1:
7271 {
7272 llm = std::make_unique<llm_build_falcon_h1>(args: *this, args: params);
7273 } break;
7274 case LLM_ARCH_LFM2:
7275 case LLM_ARCH_LFM2MOE:
7276 {
7277 llm = std::make_unique<llm_build_lfm2>(args: *this, args: params);
7278 } break;
7279 case LLM_ARCH_SMALLTHINKER:
7280 {
7281 if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
7282 llm = std::make_unique<llm_build_smallthinker<true>> (args: *this, args: params);
7283 } else {
7284 llm = std::make_unique<llm_build_smallthinker<false>>(args: *this, args: params);
7285 }
7286 } break;
7287 case LLM_ARCH_GROVEMOE:
7288 {
7289 llm = std::make_unique<llm_build_grovemoe>(args: *this, args: params);
7290 } break;
7291 case LLM_ARCH_APERTUS:
7292 {
7293 llm = std::make_unique<llm_build_apertus>(args: *this, args: params);
7294 } break;
7295 case LLM_ARCH_MINIMAX_M2:
7296 {
7297 llm = std::make_unique<llm_build_minimax_m2>(args: *this, args: params);
7298 } break;
7299 case LLM_ARCH_COGVLM:
7300 {
7301 llm = std::make_unique<llm_build_cogvlm>(args: *this, args: params);
7302 } break;
7303 case LLM_ARCH_PANGU_EMBED:
7304 {
7305 llm = std::make_unique<llm_build_pangu_embedded>(args: *this, args: params);
7306 }break;
7307 default:
7308 GGML_ABORT("fatal error");
7309 }
7310
7311 // add on pooling layer
7312 llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
7313
7314 // if the gguf model was converted with --sentence-transformers-dense-modules
7315 // there will be two additional dense projection layers
7316 // dense linear projections are applied after pooling
7317 // TODO: move reranking logic here and generalize
7318 llm->build_dense_out(dense_2: dense_2_out_layers, dense_3: dense_3_out_layers);
7319
7320 return llm->res->get_gf();
7321}
7322
7323
7324//
7325// interface implementation
7326//
7327
7328llama_model_params llama_model_default_params() {
7329 llama_model_params result = {
7330 /*.devices =*/ nullptr,
7331 /*.tensor_buft_overrides =*/ nullptr,
7332 /*.n_gpu_layers =*/ 999,
7333 /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
7334 /*.main_gpu =*/ 0,
7335 /*.tensor_split =*/ nullptr,
7336 /*.progress_callback =*/ nullptr,
7337 /*.progress_callback_user_data =*/ nullptr,
7338 /*.kv_overrides =*/ nullptr,
7339 /*.vocab_only =*/ false,
7340 /*.use_mmap =*/ true,
7341 /*.use_mlock =*/ false,
7342 /*.check_tensors =*/ false,
7343 /*.use_extra_bufts =*/ true,
7344 /*.no_host =*/ false,
7345 };
7346
7347 return result;
7348}
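
// usage sketch (illustrative only, not part of this file's logic): obtain the
// defaults above and override selected fields before loading; the model path and
// the llama_model_load_from_file() call are assumptions based on the public llama.h API
//
//     llama_model_params mparams = llama_model_default_params();
//     mparams.n_gpu_layers = 0;     // e.g. force CPU-only execution
//     mparams.use_mmap     = false; // e.g. load fully into memory instead of mmap
//     llama_model * model  = llama_model_load_from_file("/path/to/model.gguf", mparams);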

const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}

int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_embd_inp(const llama_model * model) {
    return model->hparams.n_embd_inp();
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}
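
// usage sketch (illustrative only): enumerate classifier output labels using the
// two accessors above; printing via printf is an assumption of the example
//
//     for (uint32_t j = 0; j < llama_model_n_cls_out(model); ++j) {
//         const char * label = llama_model_cls_label(model, j);
//         printf("class %u: %s\n", j, label ? label : "(unnamed)");
//     }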

// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}

llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_COGVLM:
        case LLM_ARCH_PANGU_EMBED:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;
        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
            return LLAMA_ROPE_TYPE_IMROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}

float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
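
// usage sketch (illustrative only): dump all GGUF metadata via the indexed
// accessors above; the buffer sizes are arbitrary and values longer than the
// buffer are truncated by snprintf
//
//     char key[256];
//     char val[512];
//     for (int32_t j = 0; j < llama_model_meta_count(model); ++j) {
//         llama_model_meta_key_by_index    (model, j, key, sizeof(key));
//         llama_model_meta_val_str_by_index(model, j, val, sizeof(val));
//         printf("%s = %s\n", key, val);
//     }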

int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
                          : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not ship with a built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}
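
// usage sketch (illustrative only): the lookup above returns nullptr when the GGUF
// has no chat template; falling back to a named built-in such as "chatml" is a
// convention of the caller, not something enforced here
//
//     const char * tmpl = llama_model_chat_template(model, /*name=*/nullptr);
//     if (tmpl == nullptr) {
//         tmpl = "chatml"; // assumed fallback, resolved by the caller's template handling
//     }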

uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5: return true;
        case LLM_ARCH_T5ENCODER: return true;
        default: return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default: return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}
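
// usage sketch (illustrative only): for encoder-decoder models such as T5 the caller
// is expected to run the encoder pass first and then seed decoding with the decoder
// start token reported above; llama_encode()/llama_decode() are assumed from the
// public llama.h API
//
//     if (llama_model_has_encoder(model)) {
//         llama_encode(ctx, batch);                                       // encoder pass over the prompt
//         llama_token dec_start = llama_model_decoder_start_token(model); // first decoder input
//         // ... continue generation with llama_decode() starting from dec_start
//     }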

bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

bool llama_model_is_hybrid(const llama_model * model) {
    return llm_arch_is_hybrid(model->arch);
}

bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}