#include "models.h"

llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));

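    // multi-head latent attention: each Q/K head is split into a rotary part
    // (n_rot dims) and a non-rotary part, while K/V are reconstructed from a
    // compressed latent of width kv_lora_rank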
    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
    const uint32_t kv_lora_rank = hparams.n_lora_kv;

    ggml_tensor * cur;
    ggml_tensor * inpL;

    // {n_embd, n_tokens}
    inpL = build_inp_embd(model.tok_embd);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self_attention
        {
            ggml_tensor * q = NULL;
            q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            cb(q, "q", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    0);
            cb(q_nope, "q_nope", il);

            // and {n_head * n_embd_head_qk_rope, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                    ggml_row_size(q->type, hparams.n_embd_head_k),
                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                    ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);

            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
            ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
            cb(kv_pe_compresseed, "kv_pe_compresseed", il);

            // split into {kv_lora_rank, n_tokens}
            ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
                    kv_pe_compresseed->nb[1],
                    0);
            cb(kv_compressed, "kv_compressed", il);

            // and {n_embd_head_qk_rope, n_tokens}
            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
                    kv_pe_compresseed->nb[1],
                    kv_pe_compresseed->nb[1],
                    ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
            cb(k_pe, "k_pe", il);
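            // note: k_pe has a single head (ne1 == 1); it is broadcast across
            // all query heads further below via ggml_repeat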

            kv_compressed = build_norm(kv_compressed,
                    model.layers[il].attn_kv_a_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(kv_compressed, "kv_compressed", il);

            // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
            ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
            cb(kv, "kv", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    0);
            cb(k_nope, "k_nope", il);

            // and {n_head * n_embd_head_v, n_tokens}
            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
            cb(v_states, "v_states", il);

            v_states = ggml_cont(ctx0, v_states);
            cb(v_states, "v_states", il);

            v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
                    ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
                    0);
            cb(v_states, "v_states", il);

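            // apply RoPE to the rotary part of the query only; the non-rotary
            // part is concatenated back in unchanged below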
            q_pe = ggml_rope_ext(
                ctx0, q_pe, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(q_pe, "q_pe", il);

            // shared RoPE key
            k_pe = ggml_rope_ext(
                ctx0, k_pe, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(k_pe, "k_pe", il);

            ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
            cb(q_states, "q_states", il);

            ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
            cb(k_states, "k_states", il);

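            // standard attention over the reconstructed per-head K/V
            // (no MLA weight absorption here: v_mla is nullptr)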
            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
        }
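
        // skip computing output for unused tokens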
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

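        // gateless feed-forward with squared-ReLU activation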
        cur = build_ffn(cur,
                model.layers[il].ffn_up,   NULL, NULL,
                NULL,                      NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
        cb(cur, "ffn_out", il);

        cur = ggml_add(ctx0, cur, ffn_inp);

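        // apply per-layer control vector, if one is loaded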
        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    cur = build_lora_mm(model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}