#include "models.h"

llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    // TODO: if the model varies, these parameters need to be read from the model
    const int64_t n_embd_base = 256;
    const float scale_embd  = 12.0f;
    const float scale_depth = 1.4f;
    const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
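    // note: these constants implement MiniCPM-style depth/width scaling, as applied below:
    //   - token embeddings are multiplied by scale_embd
    //   - each residual branch is multiplied by scale_depth/sqrt(n_layer)
    //   - the hidden state is multiplied by n_embd_base/n_embd before the lm_head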

    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
    const uint32_t kv_lora_rank = hparams.n_lora_kv;
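    // note: the attention below is MLA-style (multi-head latent attention): each Q/K head is
    // split into a RoPE part (n_embd_head_qk_rope = n_rot dims) and a position-free part
    // (n_embd_head_qk_nope dims), and K/V are produced through a low-rank bottleneck of
    // width kv_lora_rank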

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    // scale the input embeddings
    inpL = ggml_scale(ctx0, inpL, scale_embd);
    cb(inpL, "inp_scaled", -1);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();
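    // inp_out_ids lists the token positions whose outputs are actually needed; it is used at the
    // last layer to drop all other rows before the final norm and lm_head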

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self_attention
        {
            ggml_tensor * q = NULL;
            // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
            q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
            cb(q, "q", il);

            q = build_norm(q,
                    model.layers[il].attn_q_a_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(q, "q", il);

            // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
            q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
            cb(q, "q", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    0);
            cb(q_nope, "q_nope", il);

            // and {n_head * n_embd_head_qk_rope, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);
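            // note: q_nope and q_pe are strided views into the same q buffer; both step over a
            // full head row (n_embd_head_k elements), so per head the first n_embd_head_qk_nope
            // dims stay position-free and the trailing n_rot dims receive RoPE below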

            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
            ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
            cb(kv_pe_compresseed, "kv_pe_compresseed", il);

            // split into {kv_lora_rank, n_tokens}
            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
                    kv_pe_compresseed->nb[1],
                    0);
            cb(kv_compressed, "kv_compressed", il);

            // and {n_embd_head_qk_rope, n_tokens}
            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
                    kv_pe_compresseed->nb[1],
                    kv_pe_compresseed->nb[1],
                    ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
            cb(k_pe, "k_pe", il);

            kv_compressed = build_norm(kv_compressed,
                    model.layers[il].attn_kv_a_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(kv_compressed, "kv_compressed", il);

            // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
            ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
            cb(kv, "kv", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    0);
            cb(k_nope, "k_nope", il);

            // and {n_head * n_embd_head_v, n_tokens}
            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
            cb(v_states, "v_states", il);

            v_states = ggml_cont(ctx0, v_states);
            cb(v_states, "v_states", il);
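            // the V view above is non-contiguous (it skips the k_nope part of each head row),
            // so materialize it with ggml_cont before handing it to the attention op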

            q_pe = ggml_rope_ext(
                    ctx0, q_pe, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );
            cb(q_pe, "q_pe", il);

            // shared RoPE key
            k_pe = ggml_rope_ext(
                    ctx0, k_pe, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );
            cb(k_pe, "k_pe", il);

            ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
            cb(q_states, "q_states", il);

            ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
            cb(k_states, "k_states", il);
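            // k_pe holds a single RoPE'd key head shared by all heads; ggml_repeat broadcasts it
            // to n_head (using q_pe only as a shape template) before concatenating with k_nope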

            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
        }

        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        // scale_res - scale the hidden states for residual connection
        const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
        cur = ggml_scale(ctx0, cur, scale_res);
        cb(cur, "hidden_scaled", il);
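        // note: the same depth-dependent factor scale_depth/sqrt(n_layer) is applied again to
        // the FFN output below, so every residual branch in the block is scaled identically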

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
        {
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }

        // scale the hidden states for residual connection
        cur = ggml_scale(ctx0, cur, scale_res);
        cb(cur, "hidden_scaled_ffn", il);

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head scaling
    const float scale_lmhead = float(n_embd_base)/float(n_embd);
    cur = ggml_scale(ctx0, cur, scale_lmhead);
    cb(cur, "lmhead_scaling", -1);
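    // note: shrinking the hidden state by n_embd_base/n_embd before the output projection is
    // MiniCPM's width-scaled lm_head, with n_embd_base acting as the reference model width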

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}