#include "models.h"

llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    ggml_tensor * cur;
    ggml_tensor * inpL;
    ggml_tensor * inp_pos = nullptr;

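    // position ids are used below for the learned absolute embeddings (BERT) and for RoPE (NOMIC_BERT/NOMIC_BERT_MOE/JINA_BERT_V3)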
    if (model.arch != LLM_ARCH_JINA_BERT_V2) {
        inp_pos = build_inp_pos();
    }

    // construct input embeddings (token, type, position)
    inpL = build_inp_embd(model.tok_embd);

    // token types are hardcoded to zero ("Sentence A")
    if (model.type_embd) {
        ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
        inpL = ggml_add(ctx0, inpL, type_row0);
    }
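    // learned absolute position embeddings are only added for the original BERT architecture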
    if (model.arch == LLM_ARCH_BERT) {
        inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
    }
    cb(inpL, "inp_embd", -1);

    // embed layer norm
    inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
    cb(inpL, "inp_norm", -1);

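    // BERT is a bidirectional encoder: attention is non-causal and does not use a KV cache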
    auto * inp_attn = build_attn_inp_no_cache();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * cur = inpL;

        {
            ggml_tensor * Qcur;
            ggml_tensor * Kcur;
            ggml_tensor * Vcur;

            // self-attention
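            // some checkpoints store a fused QKV projection, others store separate Q/K/V weights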
            if (model.layers[il].wqkv) {
                cur = build_lora_mm(model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

                if (model.layers[il].bqkv) {
                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                    cb(cur, "bqkv", il);
                }

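                // slice the fused QKV result into Q (n_embd) and K/V (n_embd_gqa each) views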
                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
            } else {
                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
            }

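            // optional LayerNorm on Q and K, applied over the flattened head dimension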
            if (model.layers[il].attn_q_norm) {
                Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);

                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            }

            if (model.layers[il].attn_k_norm) {
                Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head * n_head_kv, n_tokens);

                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);

                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            }

            // RoPE
            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
                model.arch == LLM_ARCH_JINA_BERT_V3) {
                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);

                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
            }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
            cb(cur, "kqv_out", il);
        }

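        // on the last layer, keep only the rows that are actually needed for the output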
        if (il == n_layer - 1 && inp_out_ids) {
            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
        }

        // re-add the layer input
        cur = ggml_add(ctx0, cur, inpL);

        // attention layer norm
        cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);

        if (model.layers[il].attn_norm_2 != nullptr) {
            cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
            cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
        }

        ggml_tensor * ffn_inp = cur;
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
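        // when moe_every_n_layers is set, layers with il % moe_every_n_layers == 1 use the MoE branch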
        if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
            // MoE branch
            cur = build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    nullptr,
                    model.layers[il].ffn_down_exps,
                    nullptr,
                    hparams.n_expert, hparams.n_expert_used,
                    LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
            cb(cur, "ffn_moe_out", il);
        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
                   model.arch == LLM_ARCH_JINA_BERT_V3) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                    NULL,                      NULL,                        NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);
        } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL,                        NULL,
                    model.layers[il].ffn_gate, NULL,                        NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }

        // attentions bypass the intermediate layer
        cur = ggml_add(ctx0, cur, ffn_inp);

        // output layer norm
        cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);

        // input for next layer
        inpL = cur;
    }

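    // the result is the per-token embedding tensor; this graph has no output head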
    cur = inpL;

    cb(cur, "result_embd", -1);
    res->t_embd = cur;

    ggml_build_forward_expand(gf, cur);
}