lookup.cpp source code [llama.cpp/examples/lookup/lookup.cpp]

1	#include "arg.h"
2	#include "ggml.h"
3	#include "common.h"
4	#include "ngram-cache.h"
5	#include "sampling.h"
6	#include "log.h"
7	#include "llama.h"
8
9	#include <cstdint>
10	#include <cstdio>
11	#include <fstream>
12	#include <string>
13	#include <vector>
14
15	int main(int argc, char ** argv){
16	common_params params;
17
18	if (!common_params_parse(argc, argv, params, ex: LLAMA_EXAMPLE_LOOKUP)) {
19	return `1`;
20	}
21
22	common_init();
23
24	// max. number of additional tokens to draft if match is found
25	const int n_draft = params.speculative.n_max;
26
27	// init llama.cpp
28	llama_backend_init();
29	llama_numa_init(numa: params.numa);
30
31	// load the model
32	common_init_result llama_init = common_init_from_params(params);
33
34	llama_model * model = llama_init.model.get();
35	llama_context * ctx = llama_init.context.get();
36
37	const llama_vocab * vocab = llama_model_get_vocab(model);
38
39	// tokenize the prompt
40	std::vector<llama_token> inp;
41	inp = common_tokenize(ctx, text: params.prompt, add_special: true, parse_special: true);
42
43	common_ngram_cache ngram_cache_context;
44	common_ngram_cache ngram_cache_dynamic;
45	common_ngram_cache ngram_cache_static;
46	int64_t t_draft_flat_us = `0`;
47	int64_t t_draft_us = `0`;
48
49	{
50	// Fill up context ngram cache with tokens from user input:
51	const int64_t t_start_draft_us = ggml_time_us();
52	common_ngram_cache_update(ngram_cache&: ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp_data&: inp, nnew: inp.size(), print_progress: false);
53
54	if (!params.lookup_cache_static.empty()) {
55	try {
56	ngram_cache_static = common_ngram_cache_load(filename&: params.lookup_cache_static);
57	} catch (std::ifstream::failure const &) {
58	LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
59	exit(status: `1`);
60	}
61	}
62
63	if (!params.lookup_cache_dynamic.empty()) {
64	try {
65	ngram_cache_dynamic = common_ngram_cache_load(filename&: params.lookup_cache_dynamic);
66	} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
67	}
68
69	t_draft_flat_us += ggml_time_us() - t_start_draft_us;
70	}
71
72	const int max_context_size = llama_n_ctx(ctx);
73	const int max_tokens_list_size = max_context_size - `4`;
74
75	if ((int) inp.size() > max_tokens_list_size) {
76	LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
77	return `1`;
78	}
79
80	LOG("\n\n");
81
82	for (auto id : inp) {
83	LOG("%s", common_token_to_piece(ctx, id).c_str());
84	}
85
86	fflush(stderr);
87
88	const int n_input = inp.size();
89
90	const auto t_enc_start = ggml_time_us();
91
92	llama_decode(ctx, batch: llama_batch_get_one( tokens: inp.data(), n_tokens: n_input - `1`));
93	llama_decode(ctx, batch: llama_batch_get_one(tokens: &inp.back(), n_tokens: `1`));
94
95	const auto t_enc_end = ggml_time_us();
96
97	int n_predict = `0`;
98	int n_drafted = `0`;
99	int n_accept = `0`;
100
101	int n_past = inp.size();
102
103	bool has_eos = false;
104
105	struct common_sampler * smpl = common_sampler_init(model, params: params.sampling);
106
107	std::vector<llama_token> draft;
108
109	llama_batch batch_tgt = llama_batch_init(n_tokens: params.n_ctx, embd: `0`, n_seq_max: `1`);
110
111	const auto t_dec_start = ggml_time_us();
112
113	while (true) {
114	// print current draft sequence
115	LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
116
117	int i_dft = `0`;
118	while (true) {
119	// sample from the target model
120	llama_token id = common_sampler_sample(gsmpl: smpl, ctx, idx: i_dft);
121
122	common_sampler_accept(gsmpl: smpl, token: id, accept_grammar: true);
123
124	const std::string token_str = common_token_to_piece(ctx, token: id);
125
126	if (!params.use_color) {
127	LOG("%s", token_str.c_str());
128	}
129
130	if (llama_vocab_is_eog(vocab, token: id)) {
131	has_eos = true;
132	}
133
134	++n_predict;
135
136	// check if the target token matches the draft
137	if (i_dft < (int) draft.size() && id == draft [i_dft]) {
138	LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
139	++n_accept;
140	++n_past;
141	++i_dft;
142	inp.push_back(x: id);
143	{
144	// Update context ngram cache with the newly accepted token:
145	const int64_t t_start_draft_us = ggml_time_us();
146	common_ngram_cache_update(ngram_cache&: ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp_data&: inp, nnew: `1`, print_progress: false);
147	t_draft_us += ggml_time_us() - t_start_draft_us;
148	}
149
150	if (params.use_color) {
151	// color accepted draft token
152	LOG("\033[34m%s\033[0m", token_str.c_str());
153	fflush(stdout);
154	}
155	continue;
156	}
157
158	if (params.use_color) {
159	LOG("%s", token_str.c_str());
160	}
161	fflush(stdout);
162
163
164	LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
165
166	draft.clear();
167	draft.push_back(x: id);
168	inp.push_back(x: id);
169	{
170	// Update context ngram cache with the newly accepted token:
171	const int64_t t_start_draft_us = ggml_time_us();
172	common_ngram_cache_update(ngram_cache&: ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp_data&: inp, nnew: `1`, print_progress: false);
173	t_draft_us += ggml_time_us() - t_start_draft_us;
174	}
175	break;
176	}
177
178	if ((params.n_predict > `0` && n_predict > params.n_predict) \|\| has_eos) {
179	break;
180	}
181
182	// KV cache management
183	// clean the cache of draft tokens that weren't accepted
184	llama_memory_seq_rm(mem: llama_get_memory(ctx), seq_id: `0`, p0: n_past, p1: -`1`);
185
186	common_batch_clear(batch&: batch_tgt);
187	common_batch_add(batch&: batch_tgt, id: draft [`0`], pos: n_past, seq_ids: { `0` }, logits: true);
188
189	// Draft already contains a single token sampled from the model:
190	GGML_ASSERT(draft.size() == `1`);
191	GGML_ASSERT(draft[`0`] == inp.back());
192	const int64_t t_start_draft_us = ggml_time_us();
193
194	common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, nc_context&: ngram_cache_context, nc_dynamic&: ngram_cache_dynamic, nc_static&: ngram_cache_static);
195
196	for (size_t i = `1`; i < draft.size(); ++i) {
197	common_batch_add(batch&: batch_tgt, id: draft [i], pos: n_past + i, seq_ids: { `0` }, logits: true);
198	}
199
200	t_draft_us += ggml_time_us() - t_start_draft_us;
201	n_drafted += draft.size() - `1`;
202
203	llama_decode(ctx, batch: batch_tgt);
204	++n_past;
205
206	draft.erase(position: draft.begin());
207	}
208
209	auto t_dec_end = ggml_time_us();
210
211	// Update dynamic ngram cache with context ngram cache and save it to disk:
212	common_ngram_cache_merge(ngram_cache_target&: ngram_cache_dynamic, ngram_cache_add&: ngram_cache_context);
213	common_ngram_cache_save(ngram_cache&: ngram_cache_dynamic, filename&: params.lookup_cache_dynamic);
214
215	LOG("\n\n");
216
217	LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / `1e6f`, inp.size() / ((t_enc_end - t_enc_start) / `1e6f`));
218	LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / `1e6f`, n_predict / ((t_dec_end - t_dec_start) / `1e6f`));
219
220	LOG_INF("\n");
221	LOG_INF("n_draft = %d\n", n_draft);
222	LOG_INF("n_predict = %d\n", n_predict);
223	LOG_INF("n_drafted = %d\n", n_drafted);
224	LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*`1e-3`);
225	LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
226	t_draft_us`1e-3`, `1.0f`t_draft_us/n_drafted, n_drafted/(`1e-6`*t_draft_us));
227	LOG_INF("n_accept = %d\n", n_accept);
228	LOG_INF("accept = %.3f%%\n", `100.0f` * n_accept / n_drafted);
229
230	LOG_INF("\ntarget:\n\n");
231	common_perf_print(ctx, gsmpl: smpl);
232
233	common_sampler_free(gsmpl: smpl);
234
235	llama_batch_free(batch: batch_tgt);
236
237	llama_backend_free();
238
239	LOG("\n\n");
240
241	return `0`;
242	}
243

Browse the source code of llama.cpp/examples/lookup/lookup.cpp