passkey.cpp source code [llama.cpp/examples/passkey/passkey.cpp]

1	#include "arg.h"
2	#include "common.h"
3	#include "log.h"
4	#include "llama.h"
5
6	#include <cmath>
7	#include <cstdio>
8	#include <string>
9	#include <vector>
10	#include <algorithm>
11
12	static void print_usage(int, char ** argv) {
13	LOG("\nexample usage:\n");
14	LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[`0`]);
15	LOG("\n");
16	}
17
18	int main(int argc, char ** argv) {
19	common_params params;
20
21	params.n_junk = `250`;
22	params.n_keep = `32`;
23	params.i_pos = -`1`;
24
25	if (!common_params_parse(argc, argv, params, ex: LLAMA_EXAMPLE_PASSKEY, print_usage)) {
26	return `1`;
27	}
28
29	common_init();
30
31	int n_junk = params.n_junk;
32	int n_keep = params.n_keep;
33	int n_grp = params.grp_attn_n;
34	int i_pos = params.i_pos;
35
36	if (i_pos == -`1`) {
37	i_pos = rand() % n_junk;
38	}
39
40	const std::string prompt_prefix = "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.";
41	const std::string prompt_suffix = " What is the pass key? The pass key is";
42
43	// generate junk text
44	params.prompt = prompt_prefix;
45
46	const int passkey = rand() % `50000` + `1`;
47
48	for (int i = `0`; i < n_junk; i++) {
49	if (i % n_junk == i_pos) {
50	params.prompt += " The pass key is " + std::to_string(val: passkey) + ". Remember it. " + std::to_string(val: passkey) + " is the pass key.";
51	}
52
53	params.prompt += " The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.";
54	}
55
56	params.prompt += prompt_suffix;
57
58	// init LLM
59
60	llama_backend_init();
61	llama_numa_init(numa: params.numa);
62
63	// initialize the model
64
65	llama_model_params model_params = common_model_params_to_llama(params);
66
67	llama_model * model = llama_model_load_from_file(path_model: params.model.path.c_str(), params: model_params);
68
69	if (model == NULL) {
70	LOG_ERR("%s: unable to load model\n" , __func__);
71	return `1`;
72	}
73
74	const llama_vocab * vocab = llama_model_get_vocab(model);
75
76	// initialize the context
77
78	llama_context_params ctx_params = common_context_params_to_llama(params);
79
80	ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep;
81
82	GGML_ASSERT(ctx_params.n_batch % n_grp == `0` && "n_batch must be divisible by n_grp");
83
84	llama_context * ctx = llama_init_from_model(model, params: ctx_params);
85	if (ctx == NULL) {
86	LOG_ERR("%s: failed to create the llama_context\n" , __func__);
87	return `1`;
88	}
89
90	auto sparams = llama_sampler_chain_default_params();
91
92	llama_sampler * smpl = llama_sampler_chain_init(params: sparams);
93
94	llama_sampler_chain_add(chain: smpl, smpl: llama_sampler_init_greedy());
95
96	// tokenize the prompt
97	std::vector<llama_token> tokens_list;
98	tokens_list = common_tokenize(ctx, text: params.prompt, add_special: true);
99
100	// tokenize the prefix and use it as a sink
101	const int n_tokens_prefix = common_tokenize(ctx, text: prompt_prefix, add_special: true).size();
102
103	const int n_tokens_all = tokens_list.size();
104
105	// we leave a margin of 16 tokens for the generated text - it should contain just the passkey
106	const int n_predict = `16`;
107
108	// total length of the sequences including the prompt
109	const int n_len = n_tokens_all + n_predict;
110
111	const int n_ctx = llama_n_ctx(ctx) - n_keep;
112	const int n_kv_req = llama_n_ctx(ctx);
113	const int n_batch = ctx_params.n_batch;
114	const int n_batch_grp = ctx_params.n_batch/n_grp;
115
116	LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
117
118	// print the prompt token-by-token
119
120	LOG_INF("\n");
121	LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
122	LOG_INF("prompt tokens: %d\n", n_tokens_all);
123	//LOG_INF("prompt: %s\n", params.prompt.c_str());
124
125	llama_batch batch = llama_batch_init(n_tokens: params.n_batch, embd: `0`, n_seq_max: `1`);
126
127	int n_past = `0`;
128
129	auto * mem = llama_get_memory(ctx);
130
131	// fill the KV cache
132	for (int i = `0`; i < n_ctx; i += n_batch) {
133	if (i > `0` && n_grp > `1`) {
134	// if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp
135	const int ib = i/n_batch - `1`;
136	const int bd = n_batch_grp*(n_grp - `1`);
137
138	llama_memory_seq_add(mem, seq_id: `0`, p0: n_past - n_batch, p1: n_past, delta: ib*bd);
139	llama_memory_seq_div(mem, seq_id: `0`, p0: n_past - n_batch + ibbd, p1: n_past + ibbd, d: n_grp);
140
141	n_past = llama_memory_seq_pos_max(mem, seq_id: `0`) + `1`;
142	}
143
144	common_batch_clear(batch);
145
146	for (int j = `0`; j < n_batch && i + j < n_tokens_all; j++) {
147	common_batch_add(batch, id: tokens_list [i + j], pos: n_past++, seq_ids: { `0` }, logits: false);
148	}
149
150	if (i + n_batch >= n_tokens_all) {
151	batch.logits[batch.n_tokens - `1`] = true;
152	}
153
154	if (llama_decode(ctx, batch) != `0`) {
155	LOG_INF("%s: llama_decode() failed\n", __func__);
156	return `1`;
157	}
158
159	LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
160
161	if (i + n_batch >= n_tokens_all) {
162	break;
163	}
164	}
165
166	for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
167	const int n_discard = n_batch;
168
169	LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
170
171	llama_memory_seq_rm (mem, seq_id: `0`, p0: n_keep , p1: n_keep + n_discard);
172	llama_memory_seq_add(mem, seq_id: `0`, p0: n_keep + n_discard, p1: n_ctx, delta: -n_discard);
173
174	n_past = llama_memory_seq_pos_max(mem, seq_id: `0`) + `1`;
175
176	common_batch_clear(batch);
177
178	for (int j = `0`; j < n_batch && i + j < n_tokens_all; j++) {
179	common_batch_add(batch, id: tokens_list [i + j], pos: n_past++, seq_ids: { `0` }, logits: false);
180	}
181
182	if (i + n_batch >= n_tokens_all) {
183	batch.logits[batch.n_tokens - `1`] = true;
184	}
185
186	if (llama_decode(ctx, batch) != `0`) {
187	LOG_ERR("%s: llama_decode() failed\n", __func__);
188	return `1`;
189	}
190
191	LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
192	}
193
194	{
195	const int n_discard = n_past - n_ctx + n_predict;
196
197	if (n_discard > `0`) {
198	LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
199
200	llama_memory_seq_rm (mem, seq_id: `0`, p0: n_keep , p1: n_keep + n_discard);
201	llama_memory_seq_add(mem, seq_id: `0`, p0: n_keep + n_discard, p1: n_ctx, delta: -n_discard);
202
203	n_past = llama_memory_seq_pos_max(mem, seq_id: `0`) + `1`;
204	}
205	}
206
207	LOG_INF("\n");
208	LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
209	LOG_INF("\n");
210
211	// main loop
212
213	int n_cur = n_tokens_all;
214	int n_decode = `0`;
215
216	LOG_INF("%s", prompt_suffix.c_str());
217
218	const auto t_main_start = ggml_time_us();
219
220	while (n_cur <= n_len) {
221	// sample the next token
222	{
223	const llama_token new_token_id = llama_sampler_sample(smpl, ctx, idx: batch.n_tokens - `1`);
224
225	// is it an end of generation?
226	if (llama_vocab_is_eog(vocab, token: new_token_id) \|\| n_cur == n_len) {
227	LOG("\n");
228
229	break;
230	}
231
232	LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
233
234	n_decode += `1`;
235
236	// prepare the next batch
237	common_batch_clear(batch);
238
239	// push this new token for next evaluation
240	common_batch_add(batch, id: new_token_id, pos: n_past++, seq_ids: { `0` }, logits: true);
241	}
242
243	n_cur += `1`;
244
245	// evaluate the current batch with the transformer model
246	if (llama_decode(ctx, batch)) {
247	LOG_ERR("%s : failed to eval, return code %d\n", __func__, `1`);
248	return `1`;
249	}
250	}
251
252	LOG("\n");
253
254	const auto t_main_end = ggml_time_us();
255
256	LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
257	__func__, n_decode, (t_main_end - t_main_start) / `1000000.0f`, n_decode / ((t_main_end - t_main_start) / `1000000.0f`));
258
259	LOG("\n");
260	llama_perf_context_print(ctx);
261
262	LOG("\n");
263
264	llama_sampler_free(smpl);
265
266	llama_batch_free(batch);
267
268	llama_free(ctx);
269	llama_model_free(model);
270
271	llama_backend_free();
272
273	return `0`;
274	}
275

Browse the source code of llama.cpp/examples/passkey/passkey.cpp