chameleon.cpp source code [llama.cpp/src/models/chameleon.cpp]

1	#include "models.h"
2
3	#include <float.h>
4
5	llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context (params) {
6	const int64_t n_embd_head = hparams.n_embd_head_v;
7
8	GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
9	GGML_ASSERT(n_embd_head == hparams.n_rot);
10
11	ggml_tensor * cur;
12	ggml_tensor * inpL;
13
14	inpL = build_inp_embd(tok_embd: model.tok_embd);
15
16	// inp_pos - contains the positions
17	ggml_tensor * inp_pos = build_inp_pos();
18
19	auto * inp_attn = build_attn_inp_kv();
20
21	ggml_tensor * inp_out_ids = build_inp_out_ids();
22
23	for (int il = `0`; il < n_layer; ++il) {
24	ggml_tensor * inpSA = inpL;
25
26	// norm
27	if (hparams.swin_norm) {
28	cur = inpL;
29	} else {
30	cur = build_norm(cur: inpL,
31	mw: model.layers [il].attn_norm, NULL,
32	type: LLM_NORM_RMS, il);
33	cb(cur, name: "attn_norm", il);
34	}
35
36	// self-attention
37	{
38	// compute Q and K and RoPE them
39	ggml_tensor * Qcur = build_lora_mm(w: model.layers [il].wq, cur);
40	cb(cur: Qcur, name: "Qcur", il);
41
42	ggml_tensor * Kcur = build_lora_mm(w: model.layers [il].wk, cur);
43	cb(cur: Kcur, name: "Kcur", il);
44
45	ggml_tensor * Vcur = build_lora_mm(w: model.layers [il].wv, cur);
46	cb(cur: Vcur, name: "Vcur", il);
47
48	if (model.layers [il].attn_q_norm) {
49	Qcur = ggml_view_3d(ctx: ctx0, a: Qcur, ne0: n_embd_head, ne1: n_head, ne2: n_tokens,
50	nb1: ggml_element_size(tensor: Qcur) * n_embd_head,
51	nb2: ggml_element_size(tensor: Qcur) * n_embd_head * n_head,
52	offset: `0`);
53	cb(cur: Qcur, name: "Qcur", il);
54
55	Qcur = build_norm(cur: Qcur,
56	mw: model.layers [il].attn_q_norm,
57	mb: model.layers [il].attn_q_norm_b,
58	type: LLM_NORM, il);
59	cb(cur: Qcur, name: "Qcur", il);
60	}
61
62	if (model.layers [il].attn_k_norm) {
63	Kcur = ggml_view_3d(ctx: ctx0, a: Kcur, ne0: n_embd_head, ne1: n_head_kv, ne2: n_tokens,
64	nb1: ggml_element_size(tensor: Kcur) * n_embd_head,
65	nb2: ggml_element_size(tensor: Kcur) * n_embd_head * n_head_kv,
66	offset: `0`);
67	cb(cur: Kcur, name: "Kcur", il);
68
69	Kcur = build_norm(cur: Kcur,
70	mw: model.layers [il].attn_k_norm,
71	mb: model.layers [il].attn_k_norm_b,
72	type: LLM_NORM, il);
73	cb(cur: Kcur, name: "Kcur", il);
74	}
75
76	Qcur = ggml_reshape_3d(ctx: ctx0, a: Qcur, ne0: n_embd_head, ne1: n_head, ne2: n_tokens);
77	Kcur = ggml_reshape_3d(ctx: ctx0, a: Kcur, ne0: n_embd_head, ne1: n_head_kv, ne2: n_tokens);
78	Vcur = ggml_reshape_3d(ctx: ctx0, a: Vcur, ne0: n_embd_head, ne1: n_head_kv, ne2: n_tokens);
79
80	Qcur = ggml_rope_ext(
81	ctx: ctx0, a: Qcur, b: inp_pos, c: nullptr,
82	n_dims: n_rot, mode: rope_type, n_ctx_orig, freq_base, freq_scale,
83	ext_factor, attn_factor, beta_fast, beta_slow
84	);
85
86	Kcur = ggml_rope_ext(
87	ctx: ctx0, a: Kcur, b: inp_pos, c: nullptr,
88	n_dims: n_rot, mode: rope_type, n_ctx_orig, freq_base, freq_scale,
89	ext_factor, attn_factor, beta_fast, beta_slow
90	);
91
92	cb(cur: Qcur, name: "Qcur", il);
93	cb(cur: Kcur, name: "Kcur", il);
94	cb(cur: Vcur, name: "Vcur", il);
95
96	cur = build_attn(inp: inp_attn,
97	wo: model.layers [il].wo, wo_b: nullptr,
98	q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_b: nullptr, sinks: nullptr, v_mla: nullptr, kq_scale: `1.0f`/sqrtf(x: float(n_embd_head)), il);
99	}
100
101	if (il == n_layer - `1` && inp_out_ids) {
102	cur = ggml_get_rows(ctx: ctx0, a: cur, b: inp_out_ids);
103	inpSA = ggml_get_rows(ctx: ctx0, a: inpSA, b: inp_out_ids);
104	}
105
106	if (hparams.swin_norm) {
107	cur = build_norm(cur,
108	mw: model.layers [il].attn_norm, NULL,
109	type: LLM_NORM_RMS, il);
110	}
111
112	ggml_tensor * ffn_inp = ggml_add(ctx: ctx0, a: cur, b: inpSA);
113	cb(cur: ffn_inp, name: "ffn_inp", il);
114
115	// feed-forward network
116	if (!hparams.swin_norm) {
117	cur = build_norm(cur: ffn_inp,
118	mw: model.layers [il].ffn_norm, NULL,
119	type: LLM_NORM_RMS, il);
120	cb(cur, name: "ffn_norm", il);
121	}
122
123	cur = build_ffn(cur,
124	up: model.layers [il].ffn_up, NULL, NULL,
125	gate: model.layers [il].ffn_gate, NULL, NULL,
126	down: model.layers [il].ffn_down, NULL, NULL,
127	NULL,
128	type_op: LLM_FFN_SILU, type_gate: LLM_FFN_PAR, il);
129	cb(cur, name: "ffn_out", il);
130
131	if (hparams.swin_norm) {
132	cur = build_norm(cur,
133	mw: model.layers [il].ffn_norm, NULL,
134	type: LLM_NORM_RMS, il);
135	cb(cur, name: "ffn_norm", il);
136	}
137
138	cur = ggml_add(ctx: ctx0, a: cur, b: ffn_inp);
139	cb(cur, name: "ffn_out", il);
140
141	cur = build_cvec(cur, il);
142	cb(cur, name: "l_out", il);
143
144	// input for next layer
145	inpL = cur;
146	}
147
148	cur = inpL;
149
150	cur = build_norm(cur,
151	mw: model.output_norm, NULL,
152	type: LLM_NORM_RMS, il: -`1`);
153
154	cb(cur, name: "result_norm", il: -`1`);
155	res->t_embd = cur;
156
157	// lm_head
158	cur = build_lora_mm(w: model.output, cur);
159	cb(cur, name: "result_output_with_img_logits", il: -`1`);
160
161	// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
162	// Needs to be removed once image outputs are supported.
163	int img_token_end_idx = `8196`;
164	int img_token_start_idx = `4`;
165	int num_img_tokens = img_token_end_idx - img_token_start_idx;
166	// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
167	// which ensures that text token values are always at least larger than image token values
168	ggml_tensor * img_logits = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_F32, ne0: num_img_tokens);
169	img_logits = ggml_clamp(ctx: ctx0, a: img_logits, min: -FLT_MAX, max: -FLT_MAX);
170	cb(cur: img_logits, name: "img_logits", il: -`1`);
171
172	cur = ggml_set_1d(ctx: ctx0, a: cur, b: img_logits, offset: ggml_element_size(tensor: cur) * img_token_start_idx);
173
174	cb(cur, name: "result_output", il: -`1`);
175	res->t_logits = cur;
176
177	ggml_build_forward_expand(cgraph: gf, tensor: cur);
178	}
179

Browse the source code of llama.cpp/src/models/chameleon.cpp