#include "models.h"

llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
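    // the 27-layer "lite" variant (DeepSeek-V2-Lite) has no low-rank Q projection:
    // it uses wq directly instead of the wq_a/wq_b pair (see the is_lite branch below)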
    bool is_lite = (hparams.n_layer == 27);

    const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);

    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
    const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
    const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;

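    // each Q/K head is split into a rotary part (n_rot dims, rotated by RoPE below) and a
    // non-rotary "nope" part that carries no position information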
    const int64_t n_embd_head_qk_rope = hparams.n_rot;
    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;

    const uint32_t kv_lora_rank = hparams.n_lora_kv;

    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
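    // note: the mscale expression uses the attn_factor member inherited from llm_graph_context;
    // the local attn_factor declared above shadows it from this point on, so the
    // ggml_rope_ext calls below receive the re-derived value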

    ggml_tensor * cur;
    ggml_tensor * inpL;

    // {n_embd, n_tokens}
    inpL = build_inp_embd(model.tok_embd);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self_attention
        {
            ggml_tensor * q = NULL;
            if (!is_lite) {
                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                cb(q, "q", il);

                q = build_norm(q, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
                cb(q, "q", il);

                q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
                cb(q, "q", il);
            } else {
                q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                cb(q, "q", il);
            }

            // split into {n_embd_head_qk_nope, n_head, n_tokens}
            ggml_tensor * q_nope =
                ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                             ggml_row_size(q->type, n_embd_head_k),
                             ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
            cb(q_nope, "q_nope", il);

            // and {n_embd_head_qk_rope, n_head, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(
                ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                ggml_row_size(q->type, n_embd_head_k),
                ggml_row_size(q->type, n_embd_head_k) * n_head,
                ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);

            ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
            cb(kv_cmpr_pe, "kv_cmpr_pe", il);

            // split into {kv_lora_rank, n_tokens}
            ggml_tensor * kv_cmpr =
                ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
                             ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
            cb(kv_cmpr, "kv_cmpr", il);

            // and {n_embd_head_qk_rope, 1, n_tokens}
            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
            cb(k_pe, "k_pe", il);

            q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);
            cb(q_pe, "q_pe", il);

            k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);
            cb(k_pe, "k_pe", il);

            kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
            cb(kv_cmpr, "kv_cmpr", il);

            if (is_mla) {
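                // "absorption" trick: the attention scores only need q_nope^T * (wk_b * kv_cmpr),
                // so wk_b can be applied to the query instead of decompressing K; attention then
                // runs in the kv_lora_rank latent space and only the compressed KV is cached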
                // {n_embd_head_qk_nope, n_tokens, n_head}
                q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
                cb(q_nope, "q_nope_perm", il);

                // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
                cb(q_nope_absorbed, "q_nope_absorbed", il);

                // {kv_lora_rank, n_head, n_tokens}
                q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
                cb(q_nope_absorbed, "q_nope_absorbed_perm", il);

                // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
                // note: rope must go first for in-place context shifting in build_rope_shift()
                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
                cb(Qcur, "Qcur", il);

                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
                cb(kv_cmpr, "kv_cmpr_reshape", il);

                // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
                ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
                cb(Kcur, "Kcur", il);

                // {kv_lora_rank, 1, n_tokens}
                ggml_tensor * Vcur = kv_cmpr;
                cb(Vcur, "Vcur", il);

                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
            } else {
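                // without absorption: decompress the latent with wkv_b into full per-head
                // k_nope and v tensors and run regular multi-head attention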
                ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
                cb(kv, "kv", il);

                // split into {n_embd_head_qk_nope, n_head, n_tokens}
                ggml_tensor * k_nope =
                    ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                                 ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
                                 ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, 0);
                cb(k_nope, "k_nope_view", il);

                // and {n_embd_head_v, n_head, n_tokens}
                ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v, n_head, n_tokens,
                                                  ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
                                                  ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
                                                  ggml_row_size(kv->type, n_embd_head_qk_nope));
                cb(Vcur, "Vcur_view", il);

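                // the V view is strided (v rows are interleaved with k_nope rows), so make it contiguous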
                Vcur = ggml_cont(ctx0, Vcur);
                cb(Vcur, "Vcur_cont", il);

                // note: rope must go first for in-place context shifting in build_rope_shift()
                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
                cb(Qcur, "Qcur", il);

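                // ggml_repeat broadcasts the single shared k_pe head across all n_head query heads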
                ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
                cb(Kcur, "Kcur", il);

                // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
            }
        }

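        // on the final layer, keep only the rows of the tokens whose outputs were requested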
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

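        // the first n_layer_dense_lead layers use a dense FFN; all later layers use the MoE branch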
        if ((uint32_t) il < hparams.n_layer_dense_lead) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // MoE branch
            ggml_tensor * moe_out = build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps,
                    model.layers[il].ffn_down_exps,
                    model.layers[il].ffn_exp_probs_b,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU, hparams.expert_weights_norm,
                    true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(moe_out, "ffn_moe_out", il);

            // FFN shared expert
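            // note: the shared experts run on every token (no routing) and their output is
            // added on top of the routed expert output below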
            {
                ggml_tensor * ffn_shexp =
                    build_ffn(cur,
                            model.layers[il].ffn_up_shexp,   NULL, NULL,
                            model.layers[il].ffn_gate_shexp, NULL, NULL,
                            model.layers[il].ffn_down_shexp, NULL, NULL,
                            NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(ffn_shexp, "ffn_shexp", il);

                cur = ggml_add(ctx0, moe_out, ffn_shexp);
                cb(cur, "ffn_out", il);
            }
        }

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = ggml_mul_mat(ctx0, model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}