#include "models.h"

llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));

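    // multi-head latent attention: each Q/K head is split into a rotary part
    // (n_rot dims) and a non-rotary part, while K/V are reconstructed from a
    // compressed latent of width kv_lora_rank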
    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
    const uint32_t kv_lora_rank = hparams.n_lora_kv;

    ggml_tensor * cur;
    ggml_tensor * inpL;

    // {n_embd, n_tokens}
    inpL = build_inp_embd(model.tok_embd);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self_attention
        {
            ggml_tensor * q = NULL;
            q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            cb(q, "q", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    0);
            cb(q_nope, "q_nope", il);

            // and {n_head * n_embd_head_qk_rope, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);

            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
            ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
            cb(kv_pe_compresseed, "kv_pe_compresseed", il);

            // split into {kv_lora_rank, n_tokens}
            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
                    kv_pe_compresseed->nb[1],
                    0);
            cb(kv_compressed, "kv_compressed", il);

            // and {n_embd_head_qk_rope, n_tokens}
            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
                    kv_pe_compresseed->nb[1],
                    kv_pe_compresseed->nb[1],
                    ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
            cb(k_pe, "k_pe", il);
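            // note: k_pe has a single head (ne1 == 1); it is broadcast across
            // all query heads further below via ggml_repeat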

            kv_compressed = build_norm(kv_compressed,
                    model.layers[il].attn_kv_a_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(kv_compressed, "kv_compressed", il);

            // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
            ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
            cb(kv, "kv", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    0);
            cb(k_nope, "k_nope", il);

            // and {n_head * n_embd_head_v, n_tokens}
            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
            cb(v_states, "v_states", il);

            v_states = ggml_cont(ctx0, v_states);
            cb(v_states, "v_states", il);

            v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
                    ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
                    0);
            cb(v_states, "v_states", il);

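            // apply RoPE to the rotary part of the query only; the non-rotary
            // part is concatenated back in unchanged below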
            q_pe = ggml_rope_ext(
                ctx0, q_pe, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(q_pe, "q_pe", il);

            // shared RoPE key
            k_pe = ggml_rope_ext(
                ctx0, k_pe, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(k_pe, "k_pe", il);

            ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
            cb(q_states, "q_states", il);

            ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
            cb(k_states, "k_states", il);

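            // standard attention over the reconstructed per-head K/V
            // (no MLA weight absorption here: v_mla is nullptr)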
            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
        }
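
        // skip computing output for unused tokens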
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

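        // gateless feed-forward with squared-ReLU activation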
        cur = build_ffn(cur,
                model.layers[il].ffn_up,   NULL, NULL,
                NULL,                      NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
        cb(cur, "ffn_out", il);

        cur = ggml_add(ctx0, cur, ffn_inp);

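        // apply per-layer control vector, if one is loaded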
        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}