#include "ngram-cache.h"
#include "common.h"
#include "log.h"

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <thread>
#include <algorithm>

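// Builds/refreshes n-gram statistics from a token sequence: for every n-gram
// length in [ngram_min, ngram_max] it counts which token followed each n-gram
// among the last nnew tokens of inp. A minimal usage sketch (hypothetical
// variable names, assuming the LLAMA_NGRAM_MIN/LLAMA_NGRAM_MAX constants from
// ngram-cache.h and that inp_tokens already holds the tokenized context):
//
//   common_ngram_cache nc_context;
//   common_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
//                             inp_tokens, inp_tokens.size(), false);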
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
        std::vector<llama_token> & inp, int nnew, bool print_progress) {
    const int64_t t_start_ms = ggml_time_ms();
    const int64_t inp_size = inp.size();

    const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
    int64_t n_done = 0;

    for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
        // Only the last nnew tokens need to be counted, and an n-gram must fit entirely before position i:
        const int64_t i_start = std::max(inp_size - nnew, ngram_size);
        for (int64_t i = i_start; i < inp_size; ++i) {
            const int64_t ngram_start = i - ngram_size;
            common_ngram ngram(&inp[ngram_start], ngram_size);
            const llama_token token = inp[i];

            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
            if (part_it == ngram_cache.end()) {
                common_ngram_cache_part part;
                part.emplace(token, 1);
                ngram_cache.emplace(ngram, part);
            } else {
                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                if (token_count_it == part_it->second.end()) {
                    part_it->second.emplace(token, 1);
                } else {
                    token_count_it->second++;
                }
            }
            ++n_done;

            if (print_progress && n_done % 10000000 == 0) {
                const int64_t t_now_ms = ggml_time_ms();
                const int64_t eta_ms   = (n_todo - n_done) * (t_now_ms - t_start_ms) / n_done;
                const int64_t eta_min  = eta_ms / (60*1000);
                const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;

                fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
            }
        }
    }
}

// Helper function to get a token from the combined, speculative sequence of inp and draft.
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
    return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
}
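// Indices below inp.size() read from inp; index inp.size() + k maps to draft[1 + k],
// i.e. the speculative continuation starts at draft[1] (draft[0] holds the token
// drafting started from and is never read here). For example, with inp.size() == 10,
// get_token(inp, draft, 9) returns inp[9] and get_token(inp, draft, 10) returns draft[1].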

// If the sample size or the top token's percentage is below these thresholds, the draft is aborted early:
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX]    = { 2,  2,  1,  1};
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX]        = {66, 50, 50, 50};
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX]     = {75, 66, 66, 66};

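// Worked example: the static cache below is checked with the lax thresholds at
// index LLAMA_NGRAM_STATIC-1. Assuming LLAMA_NGRAM_STATIC == 2 (as declared in
// ngram-cache.h), a static n-gram needs a total sample of at least 2 and its most
// frequent continuation must hold at least 50% of all observations: counts {3, 1}
// pass (sample 4, top share 75%) while {1, 1, 1} are rejected (top share 33%).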
// Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
    if (part_static_it == nc_static.end()) {
        return LLAMA_TOKEN_NULL;
    }
    const common_ngram_cache_part part_static = part_static_it->second;

    int max_count_static  = 0;
    int sum_count_static  = 0;
    llama_token max_token = LLAMA_TOKEN_NULL;

    // Find the most frequent continuation of the static n-gram and the total sample size:
    for (std::pair<llama_token, int> token_count_static : part_static) {
        const llama_token token    = token_count_static.first;
        const int32_t count_static = token_count_static.second;

        if (count_static > max_count_static) {
            max_token        = token;
            max_count_static = count_static;
        }
        sum_count_static += count_static;
    }

    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
        return LLAMA_TOKEN_NULL;
    }
    // Cross-multiplied percentage check, equivalent to max/sum < min_percent/100 without integer division:
    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
        return LLAMA_TOKEN_NULL;
    }
    return max_token;
}

// Try to draft a token from the primary cache (context/dynamic), validate with the static cache:
static llama_token try_draft(
    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
    const int * min_sample_size, const int * min_percent) {

    llama_token drafted_token = LLAMA_TOKEN_NULL;

    // Iterate from the longest candidate n-gram to the shortest; the first one that passes the thresholds wins:
    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
        const common_ngram ngram_primary = ngrams_primary[i];

        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
        if (part_primary_it == nc_primary.end()) {
            continue;
        }
        const common_ngram_cache_part part_primary = part_primary_it->second;

        int max_count_primary = 0;
        int max_count_static  = 0;
        int sum_count_primary = 0;
        llama_token max_token = LLAMA_TOKEN_NULL;

        for (std::pair<llama_token, int> token_count_primary : part_primary) {
            const llama_token token = token_count_primary.first;

            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);

            // Weight primary counts by the static counts (scaled by 100); tokens unseen
            // in the static cache get a pseudo-count of 1 so they are not ruled out entirely:
            const int32_t count_primary = token_count_primary.second;
            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;

            if (count_primary*count_static > max_count_primary*max_count_static) {
                max_token         = token;
                max_count_primary = count_primary;
                max_count_static  = count_static;
            }
            sum_count_primary += count_primary;
        }

        if (sum_count_primary < min_sample_size[i]) {
            continue;
        }
        if (100*max_count_primary < min_percent[i]*sum_count_primary) {
            continue;
        }
        drafted_token = max_token;
    }

    return drafted_token;
}

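// Extends draft with up to n_draft speculative tokens, trying the context cache
// first (lax thresholds), then the dynamic cache (strict thresholds), then the
// static cache, and stopping at the first position where no cache produces a
// candidate. A minimal usage sketch (hypothetical variable names; assumes the
// caches were filled via common_ngram_cache_update or common_ngram_cache_load):
//
//   std::vector<llama_token> draft = {last_sampled_token};
//   common_ngram_cache_draft(inp_tokens, draft, 16, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
//                            nc_context, nc_dynamic, nc_static);
//   // draft[1:] now holds at most 16 drafted tokens.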
void common_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
) {
    GGML_ASSERT(draft.size() == 1);
    const int inp_size = inp.size();

    if (inp_size < LLAMA_NGRAM_STATIC) {
        return;
    }

    while ((int) draft.size()-1 < n_draft) {
        llama_token drafted_token = LLAMA_TOKEN_NULL;

        // Build the static n-gram from the last LLAMA_NGRAM_STATIC tokens of the combined sequence:
        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
        common_ngram ngram_static;
        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
        }
        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
        common_ngram_cache_part part_static;
        if (part_static_it != nc_static.end()) {
            part_static = part_static_it->second;
        }

        // cd = context + dynamic
        std::vector<common_ngram> ngrams_cd;
        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
            const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
            common_ngram ngram_cd;
            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
            }
            ngrams_cd.push_back(ngram_cd);
        }
        if (drafted_token == LLAMA_TOKEN_NULL) {
            drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
        }
        if (drafted_token == LLAMA_TOKEN_NULL) {
            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
        }
        if (drafted_token == LLAMA_TOKEN_NULL) {
            drafted_token = try_draft(nc_static, ngram_static);
        }

        if (drafted_token == LLAMA_TOKEN_NULL) {
            break;
        }

        LOG(" - draft candidate: token=%d\n", drafted_token);
        draft.push_back(drafted_token);
    }
}

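// Writes the cache to a binary file. The on-disk layout (also what
// common_ngram_cache_load below expects) is a flat sequence of records with no header:
//
//   [common_ngram][int32_t ntokens] ([llama_token][int32_t count]) * ntokens
//
// The structs are written as raw bytes, so files are only portable between
// machines with the same endianness and struct layout.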
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
    std::ofstream file_out(filename, std::ios::binary);
    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
        const common_ngram      ngram        = item.first;
        common_ngram_cache_part token_counts = item.second;
        GGML_ASSERT(!token_counts.empty());
        const int32_t ntokens = token_counts.size();
        GGML_ASSERT(ntokens > 0);

        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(common_ngram));
        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
        for (std::pair<llama_token, int32_t> item2 : token_counts) {
            const llama_token token = item2.first;
            const int32_t     count = item2.second;
            GGML_ASSERT(count > 0);

            file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
            file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
        }
    }
}

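// Round-trip sketch (hypothetical path; note that both functions take a
// non-const std::string &, so the filename must be an lvalue):
//
//   std::string fname = "lookup.bin";
//   common_ngram_cache_save(nc_dynamic, fname);
//   common_ngram_cache nc = common_ngram_cache_load(fname);  // throws std::ifstream::failure if the file cannot be opened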
common_ngram_cache common_ngram_cache_load(std::string & filename) {
    std::ifstream hashmap_file(filename, std::ios::binary);
    if (!hashmap_file) {
        throw std::ifstream::failure("Unable to open file " + filename);
    }
    common_ngram_cache ngram_cache;

    common_ngram ngram;
    int32_t     ntokens;
    llama_token token;
    int32_t     count;

    // Read the raw bytes of each field directly into the local variables:
    char * ngramc   = reinterpret_cast<char*>(&ngram);
    char * ntokensc = reinterpret_cast<char*>(&ntokens);
    char * tokenc   = reinterpret_cast<char*>(&token);
    char * countc   = reinterpret_cast<char*>(&count);
    while (hashmap_file.read(ngramc, sizeof(common_ngram))) {
        GGML_ASSERT(!hashmap_file.eof());
        GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
        GGML_ASSERT(ntokens > 0);
        common_ngram_cache_part token_counts;

        for (int i = 0; i < ntokens; ++i) {
            GGML_ASSERT(!hashmap_file.eof());
            GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
            GGML_ASSERT(!hashmap_file.eof());
            GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
            GGML_ASSERT(count > 0);
            token_counts.emplace(token, count);
        }

        ngram_cache.emplace(ngram, token_counts);
    }
    GGML_ASSERT(hashmap_file.eof());

    return ngram_cache;
}

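// Accumulates every count from ngram_cache_add into ngram_cache_target;
// ngram_cache_add itself is left unchanged. Usage sketch (hypothetical names):
//
//   common_ngram_cache_merge(nc_dynamic, nc_session);  // fold session stats into the dynamic cache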
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
        const common_ngram      ngram = ngram_part.first;
        common_ngram_cache_part part  = ngram_part.second;

        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
        if (part_merged_it == ngram_cache_target.end()) {
            ngram_cache_target.emplace(ngram, part);
            continue;
        }

        for (std::pair<llama_token, int32_t> token_count : part) {
            const llama_token token = token_count.first;
            const int32_t     count = token_count.second;
            GGML_ASSERT(count > 0);

            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
            if (token_count_merged_it == part_merged_it->second.end()) {
                part_merged_it->second.emplace(token, count);
                continue;
            }

            token_count_merged_it->second += count;
        }
    }
}