#include "models.h"

llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
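    // the 27-layer "lite" variant (DeepSeek-V2-Lite) has no low-rank Q projection:
    // it uses wq directly instead of the wq_a/wq_b pair (see the is_lite branch below)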
    bool is_lite = (hparams.n_layer == 27);

    const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);

    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
    const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
    const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;

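    // each Q/K head is split into a rotary part (n_rot dims, rotated by RoPE below) and a
    // non-rotary "nope" part that carries no position information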
    const int64_t n_embd_head_qk_rope = hparams.n_rot;
    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;

    const uint32_t kv_lora_rank = hparams.n_lora_kv;

    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
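    // note: the mscale expression uses the attn_factor member inherited from llm_graph_context;
    // the local attn_factor declared above shadows it from this point on, so the
    // ggml_rope_ext calls below receive the re-derived value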

    ggml_tensor * cur;
    ggml_tensor * inpL;

    // {n_embd, n_tokens}
    inpL = build_inp_embd(model.tok_embd);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self_attention
        {
            ggml_tensor * q = NULL;
            if (!is_lite) {
                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                cb(q, "q", il);

                q = build_norm(q, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
                cb(q, "q", il);

                q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
                cb(q, "q", il);
            } else {
                q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                cb(q, "q", il);
            }

            // split into {n_embd_head_qk_nope, n_head, n_tokens}
            ggml_tensor * q_nope =
                ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                             ggml_row_size(q->type, n_embd_head_k),
                             ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
            cb(q_nope, "q_nope", il);

            // and {n_embd_head_qk_rope, n_head, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(
                ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                ggml_row_size(q->type, n_embd_head_k),
                ggml_row_size(q->type, n_embd_head_k) * n_head,
                ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);

            ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
            cb(kv_cmpr_pe, "kv_cmpr_pe", il);

            // split into {kv_lora_rank, n_tokens}
            ggml_tensor * kv_cmpr =
                ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
                             ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
            cb(kv_cmpr, "kv_cmpr", il);

            // and {n_embd_head_qk_rope, 1, n_tokens}
            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
            cb(k_pe, "k_pe", il);

            q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);
            cb(q_pe, "q_pe", il);

            k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                 ext_factor, attn_factor, beta_fast, beta_slow);
            cb(k_pe, "k_pe", il);

            kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
            cb(kv_cmpr, "kv_cmpr", il);

            if (is_mla) {
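                // "absorption" trick: the attention scores only need q_nope^T * (wk_b * kv_cmpr),
                // so wk_b can be applied to the query instead of decompressing K; attention then
                // runs in the kv_lora_rank latent space and only the compressed KV is cached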
                // {n_embd_head_qk_nope, n_tokens, n_head}
                q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
                cb(q_nope, "q_nope_perm", il);

                // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
                cb(q_nope_absorbed, "q_nope_absorbed", il);

                // {kv_lora_rank, n_head, n_tokens}
                q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
                cb(q_nope_absorbed, "q_nope_absorbed_perm", il);

                // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
                // note: rope must go first for in-place context shifting in build_rope_shift()
                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
                cb(Qcur, "Qcur", il);

                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
                cb(kv_cmpr, "kv_cmpr_reshape", il);

                // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
                ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
                cb(Kcur, "Kcur", il);

                // {kv_lora_rank, 1, n_tokens}
                ggml_tensor * Vcur = kv_cmpr;
                cb(Vcur, "Vcur", il);

                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
            } else {
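                // without absorption: decompress the latent with wkv_b into full per-head
                // k_nope and v tensors and run regular multi-head attention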
                ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
                cb(kv, "kv", il);

                // split into {n_embd_head_qk_nope, n_head, n_tokens}
                ggml_tensor * k_nope =
                    ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                                 ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
                                 ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, 0);
                cb(k_nope, "k_nope_view", il);

                // and {n_embd_head_v, n_head, n_tokens}
                ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v, n_head, n_tokens,
                                                  ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
                                                  ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
                                                  ggml_row_size(kv->type, n_embd_head_qk_nope));
                cb(Vcur, "Vcur_view", il);

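                // the V view is strided (v rows are interleaved with k_nope rows), so make it contiguous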
                Vcur = ggml_cont(ctx0, Vcur);
                cb(Vcur, "Vcur_cont", il);

                // note: rope must go first for in-place context shifting in build_rope_shift()
                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
                cb(Qcur, "Qcur", il);

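                // ggml_repeat broadcasts the single shared k_pe head across all n_head query heads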
                ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
                cb(Kcur, "Kcur", il);

                // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
            }
        }

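        // on the final layer, keep only the rows of the tokens whose outputs were requested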
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

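        // the first n_layer_dense_lead layers use a dense FFN; all later layers use the MoE branch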
        if ((uint32_t) il < hparams.n_layer_dense_lead) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // MoE branch
            ggml_tensor * moe_out = build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps,
                    model.layers[il].ffn_down_exps,
                    model.layers[il].ffn_exp_probs_b,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU, hparams.expert_weights_norm,
                    true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(moe_out, "ffn_moe_out", il);

            // FFN shared expert
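            // note: the shared experts run on every token (no routing) and their output is
            // added on top of the routed expert output below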
            {
                ggml_tensor * ffn_shexp =
                    build_ffn(cur,
                            model.layers[il].ffn_up_shexp,   NULL, NULL,
                            model.layers[il].ffn_gate_shexp, NULL, NULL,
                            model.layers[il].ffn_down_shexp, NULL, NULL,
                            NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(ffn_shexp, "ffn_shexp", il);

                cur = ggml_add(ctx0, moe_out, ffn_shexp);
                cb(cur, "ffn_out", il);
            }
        }

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = ggml_mul_mat(ctx0, model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}