#include "models.h"

llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    ggml_tensor * cur;
    ggml_tensor * inpL;
    ggml_tensor * inp_pos = nullptr;

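    // position ids are used below for the learned absolute embeddings (BERT) and for RoPE (NOMIC_BERT/NOMIC_BERT_MOE/JINA_BERT_V3)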
    if (model.arch != LLM_ARCH_JINA_BERT_V2) {
        inp_pos = build_inp_pos();
    }

    // construct input embeddings (token, type, position)
    inpL = build_inp_embd(model.tok_embd);

    // token types are hardcoded to zero ("Sentence A")
    if (model.type_embd) {
        ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
        inpL = ggml_add(ctx0, inpL, type_row0);
    }
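    // learned absolute position embeddings are only added for the original BERT architecture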
    if (model.arch == LLM_ARCH_BERT) {
        inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
    }
    cb(inpL, "inp_embd", -1);

    // embed layer norm
    inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
    cb(inpL, "inp_norm", -1);

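    // BERT is a bidirectional encoder: attention is non-causal and does not use a KV cache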
    auto * inp_attn = build_attn_inp_no_cache();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * cur = inpL;

        {
            ggml_tensor * Qcur;
            ggml_tensor * Kcur;
            ggml_tensor * Vcur;

            // self-attention
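            // some checkpoints store a fused QKV projection, others store separate Q/K/V weights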
            if (model.layers[il].wqkv) {
                cur = build_lora_mm(model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

                if (model.layers[il].bqkv) {
                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                    cb(cur, "bqkv", il);
                }

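                // slice the fused QKV result into Q (n_embd) and K/V (n_embd_gqa each) views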
                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
            } else {
                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
            }

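            // optional LayerNorm on Q and K, applied over the flattened head dimension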
            if (model.layers[il].attn_q_norm) {
                Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);

                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            }

            if (model.layers[il].attn_k_norm) {
                Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head * n_head_kv, n_tokens);

                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);

                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            }

            // RoPE
            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
                model.arch == LLM_ARCH_JINA_BERT_V3) {
                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);

                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
            }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
            cb(cur, "kqv_out", il);
        }

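        // on the last layer, keep only the rows that are actually needed for the output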
        if (il == n_layer - 1 && inp_out_ids) {
            cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
        }

        // re-add the layer input
        cur = ggml_add(ctx0, cur, inpL);

        // attention layer norm
        cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);

        if (model.layers[il].attn_norm_2 != nullptr) {
            cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
            cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
        }

        ggml_tensor * ffn_inp = cur;
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
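        // when moe_every_n_layers is set, layers with il % moe_every_n_layers == 1 use the MoE branch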
        if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
            // MoE branch
            cur = build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    nullptr,
                    model.layers[il].ffn_down_exps,
                    nullptr,
                    hparams.n_expert, hparams.n_expert_used,
                    LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
            cb(cur, "ffn_moe_out", il);
        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
                   model.arch == LLM_ARCH_JINA_BERT_V3) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                    NULL,                      NULL,                        NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);
        } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL,                        NULL,
                    model.layers[il].ffn_gate, NULL,                        NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }

        // attentions bypass the intermediate layer
        cur = ggml_add(ctx0, cur, ffn_inp);

        // output layer norm
        cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);

        // input for next layer
        inpL = cur;
    }

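    // the result is the per-token embedding tensor; this graph has no output head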
    cur = inpL;

    cb(cur, "result_embd", -1);
    res->t_embd = cur;

    ggml_build_forward_expand(gf, cur);
}