// speculative.cpp source code [llama.cpp/common/speculative.cpp]

#include "speculative.h"

#include "ggml.h"
#include "llama.h"
#include "log.h"
#include "common.h"
#include "sampling.h"

#include <algorithm>
#include <cstring>
#include <map>
#include <string>

// maximum difference between the target and draft vocab sizes (in tokens) that is still accepted
// by common_speculative_are_compatible()
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128

// first token id whose text is compared by common_speculative_are_compatible();
// lower token ids are not compared
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

16	struct common_speculative {
17	struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
18	struct llama_context * ctx_dft;
19	struct common_sampler * smpl;
20
21	llama_batch batch;
22	llama_tokens prompt_dft;
23	bool vocab_dft_compatible = true; // whether retokenization is needed
24	std::map<std::string, std::string> tgt_dft_replacements = {};
25	};
26
27	struct common_speculative * common_speculative_init(
28	struct llama_context * ctx_tgt,
29	struct llama_context * ctx_dft) {
30	auto * result = new common_speculative {
31	/ .ctx_tgt = / ctx_tgt,
32	/ .ctx_dft = / ctx_dft,
33	/ .smpl = / nullptr,
34	/ .batch = / llama_batch_init(n_tokens: llama_n_batch(ctx: ctx_dft), embd: `0`, n_seq_max: `1`),
35	/ .prompt_dft = / {},
36	/ .vocab_dft_compatible = / false,
37	};
38
39	// TODO: optimize or pass from outside?
40	#if 0
41	{
42	common_params_sampling params;
43	params.no_perf = false;
44
45	params.top_k = `40`;
46	params.top_p = `0.9`;
47
48	params.samplers = {
49	COMMON_SAMPLER_TYPE_TOP_K,
50	COMMON_SAMPLER_TYPE_TOP_P,
51	COMMON_SAMPLER_TYPE_INFILL,
52	};
53
54	result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
55	}
56	#else
57	{
58	common_params_sampling params;
59	params.no_perf = false;
60
61	params.top_k = `10`;
62
63	params.samplers = {
64	COMMON_SAMPLER_TYPE_TOP_K,
65	};
66
67	result->smpl = common_sampler_init(model: llama_get_model(ctx: ctx_dft), params);
68	}
69	#endif
70
71	result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
72	LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);
73
74	return result;
75	}
76
77	void common_speculative_free(struct common_speculative * spec) {
78	if (spec == nullptr) {
79	return;
80	}
81
82	common_sampler_free(gsmpl: spec->smpl);
83
84	llama_batch_free(batch: spec->batch);
85
86	delete spec;
87	}
88
89	bool common_speculative_are_compatible(
90	const struct llama_context * ctx_tgt,
91	const struct llama_context * ctx_dft) {
92	const struct llama_model * model_tgt = llama_get_model(ctx: ctx_tgt);
93	const struct llama_model * model_dft = llama_get_model(ctx: ctx_dft);
94
95	const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model: model_tgt);
96	const struct llama_vocab * vocab_dft = llama_model_get_vocab(model: model_dft);
97
98	const bool vocab_type_tgt = llama_vocab_type(vocab: vocab_tgt);
99	LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
100
101	const bool vocab_type_dft = llama_vocab_type(vocab: vocab_dft);
102	LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
103
104	if (vocab_type_tgt != vocab_type_dft) {
105	LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
106	LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
107	return false;
108	}
109
110	if (
111	llama_vocab_get_add_bos(vocab: vocab_tgt) != llama_vocab_get_add_bos(vocab: vocab_dft) \|\|
112	llama_vocab_get_add_eos(vocab: vocab_tgt) != llama_vocab_get_add_eos(vocab: vocab_dft) \|\|
113	llama_vocab_bos(vocab: vocab_tgt) != llama_vocab_bos(vocab: vocab_dft) \|\|
114	llama_vocab_eos(vocab: vocab_tgt) != llama_vocab_eos(vocab: vocab_dft)
115	) {
116	LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
117	return false;
118	}
119
120	{
121	const int n_vocab_tgt = llama_vocab_n_tokens(vocab: vocab_tgt);
122	const int n_vocab_dft = llama_vocab_n_tokens(vocab: vocab_dft);
123	const int vocab_diff = n_vocab_tgt > n_vocab_dft
124	? n_vocab_tgt - n_vocab_dft
125	: n_vocab_dft - n_vocab_tgt;
126
127	if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
128	LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
129	LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
130	n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
131	return false;
132	}
133
134	for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(a: n_vocab_tgt, b: n_vocab_dft); ++i) {
135	const char * token_text_tgt = llama_vocab_get_text(vocab: vocab_tgt, token: i);
136	const char * token_text_dft = llama_vocab_get_text(vocab: vocab_dft, token: i);
137	if (std::strcmp(s1: token_text_tgt, s2: token_text_dft) != `0`) {
138	LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
139	LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
140	common_token_to_piece(ctx_tgt, i).c_str(),
141	common_token_to_piece(ctx_dft, i).c_str());
142	return false;
143	}
144	}
145	}
146
147	return true;
148	}
149
150	void common_speculative_add_replacement_tgt_dft(
151	struct common_speculative * spec,
152	const char source, const* char *dest) {
153	spec->tgt_dft_replacements [source] = dest;
154	}
155
156	static std::string replace_to_dft(
157	struct common_speculative * spec,
158	const std::string& input) {
159	std::string result = input;
160	for (const auto & pair : spec->tgt_dft_replacements) {
161	size_t pos = result.find(str: pair.first);
162	while (pos != std::string::npos) {
163	result.replace(pos: pos, n: pair.first.length(), str: pair.second);
164	pos = result.find(str: pair.first, pos: pos + pair.second.length());
165	}
166	}
167	return result;
168	}
169
170	static std::string replace_to_tgt(
171	struct common_speculative * spec,
172	const std::string& input) {
173	std::string result = input;
174	for (const auto& pair : spec->tgt_dft_replacements) {
175	size_t pos = result.find(str: pair.second);
176	while (pos != std::string::npos) {
177	result.replace(pos: pos, n: pair.second.length(), str: pair.first);
178	pos = result.find(str: pair.second, pos: pos + pair.first.length());
179	}
180	}
181	return result;
182	}
183
184
185	llama_tokens common_speculative_gen_draft(
186	struct common_speculative * spec,
187	struct common_speculative_params params,
188	const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
189	llama_token id_last) {
190	auto & batch = spec->batch;
191	auto & ctx_tgt = spec->ctx_tgt;
192	auto & ctx_dft = spec->ctx_dft;
193	auto & smpl = spec->smpl;
194	auto & prompt_dft = spec->prompt_dft;
195
196	auto * mem_dft = llama_get_memory(ctx: ctx_dft);
197
198	int reuse_i = `0`;
199	int reuse_n = `0`;
200
201	const int n_ctx = llama_n_ctx(ctx: ctx_dft) - params.n_draft;
202
203	llama_tokens prompt_tgt_draft_model;
204	if (!spec->vocab_dft_compatible) {
205	std::string text;
206	text = common_detokenize(ctx: ctx_tgt, tokens: prompt_tgt_main_model, special: true);
207	text = replace_to_dft(spec, input: text);
208	LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
209	prompt_tgt_draft_model = common_tokenize(ctx: ctx_dft, text, add_special: false, parse_special: true);
210
211	// convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
212	const auto * model_tgt = llama_get_model(ctx: ctx_tgt);
213	const auto * vocab_tgt = llama_model_get_vocab(model: model_tgt);
214
215	int32_t n_chars = llama_detokenize(vocab: vocab_tgt, tokens: &id_last, n_tokens: `1`, text: nullptr, text_len_max: `0`, remove_special: false, unparse_special: false);
216	GGML_ASSERT(n_chars < `0` && "failed to detokenize id_last");
217	text.resize(n: -n_chars);
218	llama_detokenize(vocab: vocab_tgt, tokens: &id_last, n_tokens: `1`, text: text.data(), text_len_max: text.size(), remove_special: false, unparse_special: false);
219	text = replace_to_dft(spec, input: text);
220
221	LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
222	id_last = common_tokenize(ctx: ctx_dft, text, add_special: false, parse_special: true)[`0`];
223	}
224	// prompt_tgt's tokens will always be compatible with ctx_dft
225	const llama_tokens &prompt_tgt =
226	spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;
227
228	const int i_start = std::max<int>(a: `0`, b: (int) prompt_tgt.size() - n_ctx);
229
230	// reuse as much as possible from the old draft context
231	// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
232	for (int i = `0`; i < (int) prompt_dft.size(); ++i) {
233	int cur = `0`;
234	while (i_start + cur < (int) prompt_tgt.size() &&
235	i + cur < (int) prompt_dft.size() &&
236	prompt_tgt [i_start + cur] == prompt_dft [i + cur]) {
237	cur++;
238	}
239
240	if ((cur >= params.n_reuse \|\| n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
241	reuse_i = i;
242	reuse_n = cur;
243	}
244	}
245
246	LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
247
248	llama_tokens result;
249	result.reserve(n: params.n_draft);
250
251	if (reuse_n == `0`) {
252	llama_memory_clear(mem: mem_dft, data: false);
253	prompt_dft.clear();
254	} else {
255	// this happens when a previous draft has been discarded (for example, due to being too small), but the
256	// target model agreed with it. in this case, we simply pass back the previous results to save compute
257	if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft [reuse_i + reuse_n] == id_last) {
258	for (int i = reuse_i + reuse_n + `1`; i < (int) prompt_dft.size(); ++i) {
259	result.push_back(x: prompt_dft [i]);
260
261	if (params.n_draft <= (int) result.size()) {
262	break;
263	}
264	}
265
266	return result;
267	}
268
269	if (reuse_i > `0`) {
270	llama_memory_seq_rm (mem: mem_dft, seq_id: `0`, p0: `0`, p1: reuse_i);
271	llama_memory_seq_add(mem: mem_dft, seq_id: `0`, p0: reuse_i, p1: -`1`, delta: -reuse_i);
272
273	prompt_dft.erase(first: prompt_dft.begin(), last: prompt_dft.begin() + reuse_i);
274	}
275
276	if (reuse_n < (int) prompt_dft.size()) {
277	llama_memory_seq_rm (mem: mem_dft, seq_id: `0`, p0: reuse_n, p1: -`1`);
278	prompt_dft.erase(first: prompt_dft.begin() + reuse_n, last: prompt_dft.end());
279	}
280	}
281
282	// prepare a batch to evaluate any new tokens in the prompt
283	common_batch_clear(batch);
284
285	for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
286	//LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
287	common_batch_add(batch, id: prompt_tgt [i], pos: i - i_start, seq_ids: { `0` }, logits: false);
288
289	prompt_dft.push_back(x: prompt_tgt [i]);
290	}
291
292	// we should rarely end-up here during normal decoding
293	if (batch.n_tokens > `0`) {
294	//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
295
296	llama_decode(ctx: ctx_dft, batch);
297	}
298
299	const llama_pos n_past = prompt_dft.size();
300
301	LOG_DBG("%s: n_past = %d\n", __func__, n_past);
302
303	common_batch_clear(batch);
304	common_batch_add (batch, id: id_last, pos: n_past, seq_ids: { `0` }, logits: true);
305
306	prompt_dft.push_back(x: id_last);
307
308	LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
309
310	llama_decode(ctx: ctx_dft, batch);
311
312	common_sampler_reset(gsmpl: smpl);
313
314	// sample n_draft tokens from the draft model
315	for (int i = `0`; i < params.n_draft; ++i) {
316	common_batch_clear(batch);
317
318	common_sampler_sample(gsmpl: smpl, ctx: ctx_dft, idx: `0`, grammar_first: true);
319
320	const auto * cur_p = common_sampler_get_candidates(gsmpl: smpl, do_sort: true);
321
322	for (int k = `0`; k < std::min(a: `3`, b: (int) cur_p->size); ++k) {
323	LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
324	k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
325	}
326
327	// add drafted token for each sequence
328	const llama_token id = cur_p->data[`0`].id;
329
330	common_sampler_accept(gsmpl: smpl, token: id, accept_grammar: true);
331
332	result.push_back(x: id);
333
334	if (params.n_draft <= (int) result.size()) {
335	break;
336	}
337
338	// only collect very high-confidence draft tokens
339	if (cur_p->data[`0`].p < params.p_min) {
340	break;
341	}
342
343	common_batch_add(batch, id, pos: n_past + i + `1`, seq_ids: { `0` }, logits: true);
344
345	// evaluate the drafted tokens on the draft model
346	llama_decode(ctx: ctx_dft, batch);
347
348	prompt_dft.push_back(x: id);
349	}
350
351	if (!spec->vocab_dft_compatible) {
352	std::string detokenized = common_detokenize(ctx: ctx_dft, tokens: result, special: true);
353	detokenized = replace_to_tgt(spec, input: detokenized);
354	LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
355	result = common_tokenize(ctx: ctx_tgt, text: detokenized, add_special: false, parse_special: true);
356	if (result.size() > (size_t)params.n_draft) {
357	result.resize(new_size: params.n_draft);
358	}
359	}
360	return result;
361	}
362

// Browse the source code of llama.cpp/common/speculative.cpp