#include "models.h"

llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    // TODO: if the model varies, these parameters need to be read from the model
    const int64_t n_embd_base = 256;
    const float scale_embd  = 12.0f;
    const float scale_depth = 1.4f;
    const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
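    // note: these constants implement MiniCPM-style depth/width scaling, as applied below:
    //   - token embeddings are multiplied by scale_embd
    //   - each residual branch is multiplied by scale_depth/sqrt(n_layer)
    //   - the hidden state is multiplied by n_embd_base/n_embd before the lm_head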

    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
    const uint32_t kv_lora_rank = hparams.n_lora_kv;
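    // note: the attention below is MLA-style (multi-head latent attention): each Q/K head is
    // split into a RoPE part (n_embd_head_qk_rope = n_rot dims) and a position-free part
    // (n_embd_head_qk_nope dims), and K/V are produced through a low-rank bottleneck of
    // width kv_lora_rank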

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    // scale the input embeddings
    inpL = ggml_scale(ctx0, inpL, scale_embd);
    cb(inpL, "inp_scaled", -1);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();
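    // inp_out_ids lists the token positions whose outputs are actually needed; it is used at the
    // last layer to drop all other rows before the final norm and lm_head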

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self_attention
        {
            ggml_tensor * q = NULL;
            // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
            q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
            cb(q, "q", il);

            q = build_norm(q,
                    model.layers[il].attn_q_a_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(q, "q", il);

            // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
            q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
            cb(q, "q", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    0);
            cb(q_nope, "q_nope", il);

            // and {n_head * n_embd_head_qk_rope, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);
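            // note: q_nope and q_pe are strided views into the same q buffer; both step over a
            // full head row (n_embd_head_k elements), so per head the first n_embd_head_qk_nope
            // dims stay position-free and the trailing n_rot dims receive RoPE below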

            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
            ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
            cb(kv_pe_compresseed, "kv_pe_compresseed", il);

            // split into {kv_lora_rank, n_tokens}
            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
                    kv_pe_compresseed->nb[1],
                    0);
            cb(kv_compressed, "kv_compressed", il);

            // and {n_embd_head_qk_rope, n_tokens}
            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
                    kv_pe_compresseed->nb[1],
                    kv_pe_compresseed->nb[1],
                    ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
            cb(k_pe, "k_pe", il);

            kv_compressed = build_norm(kv_compressed,
                    model.layers[il].attn_kv_a_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(kv_compressed, "kv_compressed", il);

            // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
            ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
            cb(kv, "kv", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    0);
            cb(k_nope, "k_nope", il);

            // and {n_head * n_embd_head_v, n_tokens}
            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
            cb(v_states, "v_states", il);

            v_states = ggml_cont(ctx0, v_states);
            cb(v_states, "v_states", il);
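            // the V view above is non-contiguous (it skips the k_nope part of each head row),
            // so materialize it with ggml_cont before handing it to the attention op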

            q_pe = ggml_rope_ext(
                    ctx0, q_pe, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );
            cb(q_pe, "q_pe", il);

            // shared RoPE key
            k_pe = ggml_rope_ext(
                    ctx0, k_pe, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );
            cb(k_pe, "k_pe", il);

            ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
            cb(q_states, "q_states", il);

            ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
            cb(k_states, "k_states", il);
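            // k_pe holds a single RoPE'd key head shared by all heads; ggml_repeat broadcasts it
            // to n_head (using q_pe only as a shape template) before concatenating with k_nope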

            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
        }

        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        // scale_res - scale the hidden states for residual connection
        const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
        cur = ggml_scale(ctx0, cur, scale_res);
        cb(cur, "hidden_scaled", il);
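        // note: the same depth-dependent factor scale_depth/sqrt(n_layer) is applied again to
        // the FFN output below, so every residual branch in the block is scaled identically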

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
        {
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }

        // scale the hidden states for residual connection
        cur = ggml_scale(ctx0, cur, scale_res);
        cb(cur, "hidden_scaled_ffn", il);

        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head scaling
    const float scale_lmhead = float(n_embd_base)/float(n_embd);
    cur = ggml_scale(ctx0, cur, scale_lmhead);
    cb(cur, "lmhead_scaling", -1);
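    // note: shrinking the hidden state by n_embd_base/n_embd before the output projection is
    // MiniCPM's width-scaled lm_head, with n_embd_base acting as the reference model width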

    // lm_head
    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}