#include "llama-vocab.h"

#include "ggml.h"
#include "gguf.h"
#include "llama-impl.h"
#include "llama-model-loader.h"

#include "unicode.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cmath>
#include <cstdarg>
#include <cstring>
#include <forward_list>
#include <limits>
#include <map>
#include <queue>
#include <set>
#include <unordered_map>

//
// helpers
//

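// A simple uncompressed trie used by the UGM and RWKV tokenizers to match the
// longest vocabulary entry for a given input prefix. Each node stores an
// optional token id and its children keyed by the next byte.
//
// Usage sketch (illustrative only):
//
//   naive_trie t;
//   t.insert("abc", 3, /*value =*/ 42);
//   const naive_trie * node = t.traverse('a'); // child node for 'a', or NULL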
struct naive_trie {
    naive_trie() : has_value(false), value(0) {
    }
    void insert(const char * key, size_t len, int32_t value = 0) {
        if (len == 0) {
            this->has_value = true;
            this->value = value;
            return;
        }
        char c = key[0];
        auto res = children.find(c);
        if (res != children.end()) {
            res->second.insert(key + 1, len - 1, value);
        } else {
            auto res = children.insert(std::make_pair(c, naive_trie()));
            res.first->second.insert(key + 1, len - 1, value);
        }
    }
    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
        char c = key[offset];
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
        }

        return std::make_pair(key, offset);
    }
    const struct naive_trie * traverse(const char c) const {
        auto res = children.find(c);
        if (res != children.end()) {
            return &res->second;
        }

        return NULL;
    }
    std::map<char, struct naive_trie> children;
    bool has_value;
    llama_token value;
};

//
// tokenizers
//

struct llm_tokenizer {
    llm_tokenizer() {}
    virtual ~llm_tokenizer() = default;
};

struct llm_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");

//
// SPM tokenizer
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
//

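// SentencePiece-style tokenization: split the input into UTF-8 characters,
// then repeatedly merge the adjacent pair whose merged text is the
// highest-scoring vocabulary entry, until no more merges apply.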
struct llm_bigram_spm {
    struct comparator {
        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
        }
    };
    using queue_storage = std::vector<llm_bigram_spm>;
    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    float score;
    size_t size;
};

struct llm_tokenizer_spm : llm_tokenizer {
    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};

struct llm_tokenizer_spm_session {
    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = unicode_len_utf8(text[offs]);
            sym.text = text.c_str() + offs;
            sym.n = std::min(len, text.size() - offs);
            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
        for (int i = 1; i < (int) symbols.size(); ++i) {
            try_add_bigram(i - 1, i);
        }

        // keep substituting the highest-scoring pairs for as long as we can.
        while (!work_queue.empty()) {
            auto bigram = work_queue.top();
            work_queue.pop();

            auto & left_sym = symbols[bigram.left];
            auto & right_sym = symbols[bigram.right];

            // if one of the symbols already got merged, skip it.
            if (left_sym.n == 0 || right_sym.n == 0 ||
                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
            left_sym.n += right_sym.n;
            right_sym.n = 0;

            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
            if (right_sym.next >= 0) {
                symbols[right_sym.next].prev = bigram.left;
            }

            // find more substitutions
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
        }

        for (int i = 0; i != -1; i = symbols[i].next) {
            auto & symbol = symbols[i];
            resegment(symbol, output);
        }
    }

private:
    void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
        auto text = std::string(symbol.text, symbol.n);
        auto token = vocab.text_to_token(text);

        // Do we need to support is_unused?
        if (token != LLAMA_TOKEN_NULL) {
            output.push_back(token);
            return;
        }

        const auto p = rev_merge.find(text);

        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int) symbol.n; ++j) {
                llama_token id = vocab.byte_to_token(symbol.text[j]);
                output.push_back(id);
            }
            return;
        }

        resegment(symbols[p->second.first],  output);
        resegment(symbols[p->second.second], output);
    }

    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
        auto token = vocab.text_to_token(text);

        if (token == LLAMA_TOKEN_NULL) {
            return;
        }

        if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
            return;
        }

        const auto & tok_data = vocab.get_token_data(token);

        llm_bigram_spm bigram;
        bigram.left  = left;
        bigram.right = right;
        bigram.score = tok_data.score;
        bigram.size  = text.size();

        work_queue.push(bigram);

        // Do we need to support is_unused?
        rev_merge[text] = std::make_pair(left, right);
    }

    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_spm * spm_tokenizer;

    std::vector<llm_symbol> symbols;
    llm_bigram_spm::queue work_queue;
    std::map<std::string, std::pair<int, int>> rev_merge;
};

//
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
//

// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused

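// std::priority_queue with a move-enabled pop: pop_move() moves the top
// element out before re-heapifying, avoiding a copy of the std::string-bearing
// bigrams; the copying pop() is deleted so it cannot be used by accident.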
template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    T pop_move() {
        T item = std::move(this->c.front());
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
        this->c.pop_back();
        return item;
    }

    void pop() = delete;
};

struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
        }
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    std::string text;
    int rank;
    size_t size;
};

struct llm_tokenizer_bpe : llm_tokenizer {
    llm_tokenizer_bpe(const llama_vocab & vocab) {
        GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
        switch (vocab.get_pre_type()) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                regex_exprs = {
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DBRX:
            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                regex_exprs = {
                    // same as llama3
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
                    "\\s+$",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥぀-ゟ゠-ヿ]+",
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?\\p{L}+",
                    "\\s?\\p{P}+",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_FALCON:
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "[0-9][0-9][0-9]",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT2:
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
                regex_exprs = {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_VIKING:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
                // original regex from tokenizer.json
                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                regex_exprs = {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                // Note: in theory, the special token (sentinel and image token) regex_exprs below
                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
                // However, since the upstream pre-tokenizer uses them, they are also
                // included here (see https://huggingface.co/facebook/chameleon-7b).
                regex_exprs = {
                    "<sentinel:[0-9]+>",  // Sentinel tokens
                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens
                    "([\\t\\n]|    |  )",  // directly from tokenizer.json
                    "\\p{N}",  // Individual digits
                    "[\\p{P}!-/:-@\\[-`{-~]",  // Punctuation, Isolated
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
            case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
                regex_exprs = {
                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
                    // The custom handler implements all K2 patterns with proper Han character exclusion
                    "\\p{Han}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                regex_exprs = {
                    "\\p{N}+",
                    "(?=(\\d{3})+(?!\\d))",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]",
                };
                break;
        }
    }

    std::vector<std::string> regex_exprs;
};

struct llm_tokenizer_bpe_session {
    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    static void append(const llama_token token_id, std::vector<llama_token> & output) {
        output.push_back(token_id);
    }

    bool append_bos(std::vector<llama_token> & output) const {
        if (vocab.get_add_bos()) {
            GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_bos());
            return true;
        }
        return false;
    }

    bool append_eos(std::vector<llama_token> & output) const {
        if (vocab.get_add_eos()) {
            GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_eos());
            return true;
        }
        return false;
    }

    void check_double_bos_eos(const std::vector<llama_token> & output) const {
        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model, but the prompt "
                "also starts with a BOS token, so the final prompt now starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end() - 2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added an EOS token to the prompt as specified by the model, but the prompt "
                "also ends with an EOS token, so the final prompt now ends with 2 EOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
    }

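    // Tokenize one chunk of text: pre-split it with the model-specific regexes,
    // run rank-based BPE merges inside each resulting word, then map the merged
    // symbols to token ids, falling back to single-byte tokens when needed.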
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

        symbols_final.clear();

        for (const auto & word : word_collection) {
            work_queue = llm_bigram_bpe::queue();
            symbols.clear();

            int index = 0;
            size_t offset = 0;

            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
            }

            while (offset < word.size()) {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
                sym.next = offset == word.size() ? -1 : index + 1;
                index++;
                symbols.emplace_back(sym);
            }
            for (int i = 1; i < (int) symbols.size(); ++i) {
                add_new_bigram(i - 1, i);
            }

            // build token(s)
            while (!work_queue.empty()) {
                auto bigram = work_queue.pop_move();

                auto & left_symbol = symbols[bigram.left];
                auto & right_symbol = symbols[bigram.right];

                if (left_symbol.n == 0 || right_symbol.n == 0) {
                    continue;
                }
                std::string left_token = std::string(left_symbol.text, left_symbol.n);
                std::string right_token = std::string(right_symbol.text, right_symbol.n);
                if (left_token + right_token != bigram.text) {
                    continue;  // Skip this bigram if it's outdated
                }

                // merge the right sym into the left one
                left_symbol.n += right_symbol.n;
                right_symbol.n = 0;

                // remove the right sym from the chain
                left_symbol.next = right_symbol.next;
                if (right_symbol.next >= 0) {
                    symbols[right_symbol.next].prev = bigram.left;
                }

                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
            }

            // add the finished tokens to the final list keeping correct order for next and prev
            for (auto & sym : symbols) {
                if (sym.n > 0) {
                    sym.prev = final_prev_index;
                    sym.next = -1;
                    if (final_prev_index != -1) {
                        symbols_final[final_prev_index].next = symbols_final.size();
                    }
                    symbols_final.emplace_back(sym);
                    final_prev_index = symbols_final.size() - 1;
                }
            }
        }

        symbols = symbols_final;

        if (!symbols.empty()) {
            for (int i = 0; i != -1; i = symbols[i].next) {
                auto & symbol = symbols[i];
                if (symbol.n == 0) {
                    continue;
                }

                const std::string str = std::string(symbol.text, symbol.n);
                const auto token = vocab.text_to_token(str);

                if (token == LLAMA_TOKEN_NULL) {
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.text_to_token(byte_str);
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
                    }
                } else {
                    output.push_back(token);
                }
            }
        }
    }

private:
    void add_new_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        std::string left_token = std::string(symbols[left].text, symbols[left].n);
        std::string right_token = std::string(symbols[right].text, symbols[right].n);

        int rank_found = -1;

        rank_found = vocab.find_bpe_rank(left_token, right_token);

        if (rank_found < 0) {
            return;
        }

        llm_bigram_bpe bigram;

        bigram.left = left;
        bigram.right = right;
        bigram.text = left_token + right_token;
        bigram.size = left_token.size() + right_token.size();
        bigram.rank = rank_found;

        work_queue.push(bigram);
    }

    const llama_vocab & vocab;
    const llm_tokenizer_bpe & tokenizer;

    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;
    llm_bigram_bpe::queue work_queue;
};

//
// WPM tokenizer
//

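// WordPiece tokenization: NFD-normalize and lowercase the input, split it into
// words on whitespace, punctuation and CJK characters, then greedily match the
// longest vocabulary entries within each word; a word with no match becomes UNK.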
struct llm_tokenizer_wpm : llm_tokenizer {
    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};

struct llm_tokenizer_wpm_session {
    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already

        // find the longest tokens that form the words
        for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
            }

            // prepend phantom space
            const std::string word1 = "\xe2\x96\x81" + word;
            const int n = word1.size();

            const size_t current_tokens = output.size();

            // we're at the start of a new word
            // move through character position in word
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                bool match = false;
                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
                    auto id = vocab.text_to_token(word1.substr(i, j - i));
                    if (id != LLAMA_TOKEN_NULL) {
                        output.push_back(id);
                        match = true;
                        i = j - 1;
                        break;
                    }
                }

                if (!match) { // discard all
                    output.resize(current_tokens);
                    break;    // and discard next tokens
                }
            }

            // we didn't find any matches for this word
            if (current_tokens == output.size()) {
                output.push_back(vocab.token_unk());
            }
        }
    }

    // TODO: reduce string copies by using cpts_offs array
    static std::vector<std::string> preprocess(const std::string & text) {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags_from_cpt(cpt);

            if (flags.is_whitespace) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                continue;
            }

            assert(!flags.is_separator);
            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
                continue;
            }

            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
            if (flags.is_punctuation || (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt)) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                words.back() = s;      // single char word
                words.emplace_back();  // start a new word
            } else {
                words.back() += s;  // append char to word
            }
        }

        if (!words.back().size()) {
            words.pop_back();
        }

        return words;
    }

    static bool is_chinese_char(uint32_t cpt) {
        return
            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
            //(cpt >= 0x3000 && cpt <= 0x303F) ||
            //(cpt >= 0xFF00 && cpt <= 0xFFEF);
    }

private:
    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_wpm * wpm_tokenizer;
};

//
// UGM tokenizer
//

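// Unigram (UGM) tokenization, as in SentencePiece: the input is first
// normalized using the model's precompiled character map, then a Viterbi
// search over the token trie picks the segmentation with the highest total
// score (see the detailed comment on tokenize() below).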
struct llm_tokenizer_ugm : llm_tokenizer {
    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
        if (precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contains length of binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
        }

        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & token_data = vocab.get_token_data(id);

            if (vocab.is_normal(id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (vocab.is_normal(id) ||
                vocab.is_user_defined(id) ||
                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        unknown_token_score = min_score - unknown_token_score_penalty;
    }

    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    struct naive_trie user_defined_token_matcher;

    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    struct naive_trie token_matcher;
};

struct llm_tokenizer_ugm_session {
    llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
     * unigram language models. The general idea is to:
     * - move along the input sequence in steps of one UTF code point,
     * - at each step find all possible tokenizations of the prefix by
     *   traversing the tokens trie,
     * - for each tokenization store the best one so far (by higher score)
     * - use the position in sequence after given token as an index to store
     *   results
     * - if there was no valid tokenization of the current UTF code point
     *   then use unknown token with additional score penalty
     * After processing the whole sequence we backtrack from the end to get
     * the best tokenization.
    */
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // get current size of output (for reversal later)
        size_t output_size = output.size();

        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
        size_t input_len = normalized.size();
        if (input_len == 0) {
            return;
        }

        // initialize score_sum to -DBL_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { vocab.token_unk(), 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
            // calculate how many code units are in the currently processed UTF code point
            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);

            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);

            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
                if (node->has_value) {
                    // check if it corresponds to the whole UTF code point
                    if (prefix_offset - input_offset == n_utf8_code_units) {
                        single_codepoint_token_found = true;
                    }
                    llama_token token_id = node->value;
                    const auto & token_data = vocab.get_token_data(token_id);

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                        current_champ = challenger;
                    }
                }
                node = node->traverse(normalized[prefix_offset++]);
            }

            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
                const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                    current_champ = challenger;
                }
            }

            // move to the next UTF code point
            input_offset += n_utf8_code_units;
        }

        // now backtrack from the end to gather token ids of the best tokenization
        // merge sequences of consecutive unknown tokens into single unknown tokens
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
            bool is_unknown = tokenization.token_id == vocab.token_unk();
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
            if (tokenization.input_offset == 0) {
                break;
            }
            is_prev_unknown = is_unknown;
        }

        // reverse the output since we added tokens starting from the end of the input
        std::reverse(output.begin() + output_size, output.end());
    }

private:

    // helper structure for returning normalization results
    struct normalization_result {
        const char * normalized;
        size_t normalized_len;
        size_t consumed_input;
    };

    void normalize(const std::string & input, std::string * normalized) {
        normalized->clear();
        normalized->reserve(input.size() * 3);

        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";

        const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();

        bool is_space_prepended = false;
        bool processing_non_ws = false;

        size_t input_len = input.size();

        for (size_t input_offset = 0; input_offset < input_len; ) {
            auto norm_res = normalize_prefix(input, input_offset);
            for (size_t i = 0; i < norm_res.normalized_len; i++) {
                char c = norm_res.normalized[i];
                if (c != ' ') {
                    if (!processing_non_ws) {
                        processing_non_ws = true;
                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
                    }
                }
            }

            input_offset += norm_res.consumed_input;
        }

        if (shall_append_space) {
            normalized->append(space);
        }
    }

    /*
     * This structure is a view wrapper for XOR-compressed double array (XCDA)
     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
     * Each bit-packed entry contains:
     * - BASE array value in bits 10-30
     * - LCHECK array value in bits 0-7
     * - LEAF array value in bit 9
     * Entries containing indexes of replacement sequences have set bit 31
     */
    struct xcda_array_view {
    public:
        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
        }
        uint32_t get_base(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
        }
        uint32_t get_lcheck(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) | 0xff);
        }
        bool get_leaf(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 8) & 1;
        }
        uint32_t get_value(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) - 1);
        }
    private:
        uint32_t get_node(size_t index) {
            if (index >= xcda_array_size) {
                throw std::runtime_error("Index out of array bounds in XCDA array!");
            }
            return xcda_array[index];
        }
        const uint32_t * xcda_array;
        size_t xcda_array_size;
    };

    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
        llama_token token_id;
        size_t input_offset;
        double score_sum;
    };

    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
        if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }

        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match =
            tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }

        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

        if (tokenizer.xcda_array_size > 0) {
            struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
            // We find the index of the next node by calculating BASE[s] ^ c where s is
            // the index of the previous node and c is a numerical character value
            uint32_t node_index = 0;
            // get BASE of the root node
            node_index = xcda_view.get_base(node_index);
            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
                unsigned char c = input[prefix_offset];
                if (c == 0) {
                    break;
                }
                node_index ^= c;
                // if value of LCHECK is not c it means that this is not a child of
                // the previous node, so we stop matching
                if (xcda_view.get_lcheck(node_index) != c) {
                    break;
                }
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing index of replacement sequence for currently matched input prefix
                if (is_leaf) {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        }

        // check if the input prefix contains a valid sequence of UTF-8 code units
        try {
            // if yes, return this sequence unmodified
            size_t prefix_offset = input_offset;
            unicode_cpt_from_utf8(input, prefix_offset);
            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
        } catch (std::invalid_argument & /*ex*/) {
            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
            return { "\xEF\xBF\xBD", 3, 1 };
        }
    }

    const llama_vocab & vocab;
    const llm_tokenizer_ugm & tokenizer;
};

//
// RWKV tokenizer
//

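// The RWKV vocab stores tokens as escaped byte strings. The unescape helper
// below decodes \t, \n, \r, \xHH and backslash escapes back into raw bytes so
// they can be inserted into the matching trie.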
static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
    std::vector<uint8_t> output;
    output.reserve(escaped.size());

    // Parser state
    bool escaping = false;
    uint8_t hex_remaining = 0;
    uint8_t hex_acc = 0;

    // Step through characters, performing parsing
    for (const char & c : escaped) {
        // If we're parsing a hex code, interpret the next character
        if (hex_remaining != 0) {
            uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
            hex_acc = (hex_acc << 4) + value;

            hex_remaining -= 1;
            if (hex_remaining == 0) {
                output.push_back(hex_acc);
                hex_acc = 0;
            }

            continue;
        }

        // If we got an escape character, interpret it
        if (escaping) {
            if (c == 't') {
                output.push_back('\t');
            } else if (c == 'n') {
                output.push_back('\n');
            } else if (c == 'r') {
                output.push_back('\r');
            } else if (c == 'x') {
                hex_remaining = 2;
            } else {
                output.push_back(c);
            }

            escaping = false;
            continue;
        }

        if (c == '\\') {
            escaping = true;
            continue;
        }

        output.push_back(c);
    }

    return output;
}

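// For example, the escaped token "\n\x41" unescapes to the two bytes
// { 0x0A, 0x41 }.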
struct llm_tokenizer_rwkv : llm_tokenizer {
    llm_tokenizer_rwkv(const llama_vocab & vocab) {
        // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
        // For now, we decode the vocab here into the lookup we'll use for tokenization.

        // build trie
        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & data = vocab.get_token_data(id);
            const auto text = llama_unescape_rwkv_token(data.text);
            token_matcher.insert((const char *) text.data(), text.size(), id);
        }
    }

    struct naive_trie token_matcher;
};

struct llm_tokenizer_rwkv_session {
    llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        uint32_t position = 0;
        while (position < text.size()) {
            const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
            if (node == NULL) {
                // no matching token found, add unknown token
                output.push_back(vocab.token_unk());
                position += 1;
                continue;
            }

            // traverse the trie to find the longest matching token
            uint32_t token_id = 0;
            uint32_t token_length = 0;
            while (node != NULL) {
                if (node->has_value) {
                    token_id = node->value;
                    token_length = position + 1;
                }
                node = node->traverse(text[++position]);
            }

            // add the longest matching token
            output.push_back(token_id);
            position = token_length;
        }
    }

private:
    const llama_vocab & vocab;
    const llm_tokenizer_rwkv & tokenizer;
};

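// PLaMo-2 tokenizer: builds a flattened trie-like table over all token
// suffixes, then encodes with dynamic programming from the end of the input,
// picking at each position the piece that minimizes the accumulated cost
// (negated score); unknown code points fall back to byte tokens (<0xXX>).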
struct llm_tokenizer_plamo2 : llm_tokenizer {
    llm_tokenizer_plamo2(const llama_vocab & vocab) {
        build(vocab);
    }

    void build(const llama_vocab & vocab) {
        // Reset internal structures
        tokens_.clear();
        bytes_.assign(256, 0);
        to_suffix_id_.clear();
        table_.clear();

        // Build token list and byte mapping
        std::unordered_map<std::string, float> suffix_to_score;
        std::unordered_map<std::string, llama_token> token_to_id;

        for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
            const auto & entry = vocab.get_token_data(token_id);
            tokens_.push_back(entry.text);
            token_to_id[entry.text] = static_cast<llama_token>(token_id);

            // Handle byte tokens
            if (vocab.is_byte(token_id)) {
                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
                    std::string hex_str = entry.text.substr(3, 2);
                    int byte_val = std::stoi(hex_str, nullptr, 16);
                    bytes_[byte_val] = static_cast<llama_token>(token_id);
                }
                continue;
            }

            // Add token and all its suffixes to suffix_to_score
            suffix_to_score[entry.text] = entry.score;

            // Extract suffixes character by character (UTF-8 aware)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
            for (size_t i = 1; i < cpts.size(); ++i) {
                std::string suffix;
                for (size_t j = i; j < cpts.size(); ++j) {
                    suffix += unicode_cpt_to_utf8(cpts[j]);
                }
                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
                    suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
                }
            }
        }

        // Check that all byte tokens are set
        for (int i = 0; i < 256; ++i) {
            if (bytes_[i] == 0) {
                throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
            }
        }

        // Build suffix list in lexicographical order of reversed strings
        std::vector<std::string> suffixes;
        for (const auto & pair : suffix_to_score) {
            suffixes.push_back(pair.first);
        }
        suffixes.push_back("");  // Empty suffix

        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
            std::string rev_a(a.rbegin(), a.rend());
            std::string rev_b(b.rbegin(), b.rend());
            return rev_a < rev_b;
        });

        // Build suffix_to_id and to_suffix_id_
        std::unordered_map<std::string, int32_t> suffix_to_id;
        int32_t num_pieces = 0;

        for (const auto & suffix : suffixes) {
            suffix_to_id[suffix] = num_pieces;
            if (!suffix.empty()) {
                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);

                std::string remaining;
                for (size_t i = 1; i < cpts.size(); ++i) {
                    remaining += unicode_cpt_to_utf8(cpts[i]);
                }

                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
                to_suffix_id_[piece_code] = num_pieces;

                // Count number of pieces for this suffix
                int32_t pieces_for_suffix = 1;  // sentinel row
                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                    std::string piece;
                    for (int32_t i = 0; i < piece_length; ++i) {
                        piece += unicode_cpt_to_utf8(cpts[i]);
                    }
                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
                        pieces_for_suffix++;
                    }
                }
                num_pieces += pieces_for_suffix;
            } else {
                num_pieces++;  // Empty suffix contributes one piece (sentinel row)
            }
        }

        // Build flattened table
        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
        int32_t table_idx = 0;

        for (const auto & suffix : suffixes) {
            // Add all prefixes of the suffix to the table (in decreasing order of length)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                std::string piece;
                for (int32_t i = 0; i < piece_length; ++i) {
                    piece += unicode_cpt_to_utf8(cpts[i]);
                }

                auto score_it = suffix_to_score.find(piece);
                if (score_it == suffix_to_score.end()) {
                    continue;
                }

                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
                auto token_it = token_to_id.find(piece);
                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;

                float score = score_it->second;
                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];

                table_idx++;
            }

            // Add sentinel row
            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
            table_[table_idx][TABLE_TOKEN_ID] = -1;
            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
            table_idx++;
        }
    }

    std::vector<llama_token> encode(const std::string & text) const {
        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
        // Skip the first code point if it is a BOM (Byte Order Mark)
        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
            unicode_data.erase(unicode_data.begin());
        }

        if (unicode_data.empty()) {
            return {};
        }

        const size_t data_len = unicode_data.size();

        // Initialize scores array (dynamic programming)
        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
        scores[data_len] = 0;

        // Path array to track best tokenization
        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));

        int32_t suffix_id = 0;

        // Process from end to beginning
        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
            uint32_t c = unicode_data[i];

            // Find next suffix ID
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
                auto it = to_suffix_id_.find(piece_code);
                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;

                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
                    break;
                }
            }

            // Update best path
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int32_t score = table_[p][TABLE_SCORE];
                if (score > INVALID_SCORE) {
                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
                    int64_t s = scores[i + piece_length] - score;

                    if (s < scores[i]) {
                        scores[i] = s;
                        path[i][PATH_TOKEN_LENGTH] = piece_length;
                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;

                        if (score == UNKNOWN_SCORE) {
                            // Add UTF-8 byte count
                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
                        }
                    }
                }

                if (score == UNKNOWN_SCORE) {
                    break;
                }
            }
        }

        // Decode the best path
        std::vector<llama_token> token_ids;
        token_ids.reserve(path[0][PATH_NUM_TOKENS]);

        int pos = 0;
        while (pos < static_cast<int>(data_len)) {
            if (path[pos][PATH_TOKEN_ID] >= 0) {
                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
            } else {
                // Fall back to byte tokens
                uint32_t c = unicode_data[pos];
                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);

                for (int i = 0; i < s; ++i) {
                    uint8_t b;
                    if (s == 1) {
                        b = c;
                    } else {
                        if (i == 0) {
                            b = (0xF00 >> s) & 0xFF;
                        } else {
                            b = 0x80;
                        }
                    }
                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
                }
            }

            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
            pos += path[pos][PATH_TOKEN_LENGTH];
        }

        return token_ids;
    }
private:
    // Constants for table structure
    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
    static constexpr int32_t TABLE_TOKEN_ID = 1;
    static constexpr int32_t TABLE_SCORE = 2;
    static constexpr int32_t TABLE_PIECE_ID = 3;

    // Constants for path array
    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
    static constexpr int32_t PATH_TOKEN_ID = 1;
    static constexpr int32_t PATH_NUM_TOKENS = 2;

    // Score constants
    static constexpr int32_t INVALID_SCORE = -20000000;
    static constexpr int32_t UNKNOWN_SCORE = -10000000;

    // List of tokens in the vocabulary
    std::vector<std::string> tokens_;

    // Mapping from byte code point to token ID (for byte fallback)
    std::vector<llama_token> bytes_;

    // Mapping from piece code to suffix ID
    std::unordered_map<int64_t, int32_t> to_suffix_id_;

    // Flattened table representing the Trie structure
    // Each row contains: [piece_length, token_id, score, piece_id]
    std::vector<std::vector<int32_t>> table_;
};

struct llm_tokenizer_plamo2_session {
    llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        std::vector<llama_token> tokens = tokenizer.encode(text);
        output.insert(output.end(), tokens.begin(), tokens.end());
    }

private:
    const llm_tokenizer_plamo2 & tokenizer;
};

//
// impl
//

typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;

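// A tokenizer input fragment: either an already-resolved special token id, or
// a view (offset/length) into a span of the raw input text that still needs
// to be tokenized.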
1504struct fragment_buffer_variant {
1505 fragment_buffer_variant(llama_token _token)
1506 :
1507 type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
1508 token(_token),
1509 raw_text(_dummy),
1510 offset(0),
1511 length(0) {}
1512
1513 fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
1514 :
1515 type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
1516 token((llama_token) - 1),
1517 raw_text(_raw_text),
1518 offset(_offset),
1519 length(_length){
1520 GGML_ASSERT(_offset >= 0);
1521 GGML_ASSERT(_length >= 1);
1522 GGML_ASSERT(offset + length <= raw_text.length());
1523 }
1524
1525 const FRAGMENT_BUFFER_VARIANT_TYPE type;
1526 const llama_token token;
1527 const std::string _dummy;
1528 const std::string & raw_text;
1529 const uint64_t offset;
1530 const uint64_t length;
1531};
1532
1533struct llama_vocab::impl {
1534 uint32_t n_token_types = 0; // for BERT-style token types
1535
1536 std::string tokenizer_model;
1537 std::string tokenizer_pre;
1538
1539 enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
1540 enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1541
1542 int max_token_len = 0; // used for optimizing longest token search
1543
1544 // default LLaMA special tokens
1545 // TODO: should we set all of these to LLAMA_TOKEN_NULL?
1546 llama_token special_bos_id = 1;
1547 llama_token special_eos_id = 2;
1548 llama_token special_eot_id = LLAMA_TOKEN_NULL;
1549 llama_token special_eom_id = LLAMA_TOKEN_NULL;
1550 llama_token special_unk_id = 0;
1551 llama_token special_sep_id = LLAMA_TOKEN_NULL;
1552 llama_token special_pad_id = LLAMA_TOKEN_NULL;
1553 llama_token special_mask_id = LLAMA_TOKEN_NULL;
1554
1555 llama_token linefeed_id = 13;
1556
1557 // fim tokens
1558 llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
1559 llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
1560 llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
1561 llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
1562 llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
1563 llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
1564
1565 // tokenizer flags
1566 bool add_space_prefix = false;
1567 bool add_bos = false;
1568 bool add_eos = false;
1569 bool add_sep = false;
1570 bool ignore_merges = false;
1571 bool clean_spaces = false; // clean_up_tokenization_spaces
1572 bool remove_extra_whitespaces = false;
1573 bool escape_whitespaces = true;
1574 bool treat_whitespace_as_suffix = false;
1575
1576 std::unordered_map<std::string, llama_token> token_to_id;
1577 std::vector<token_data> id_to_token;
1578
1579 std::vector<llama_token> cache_special_tokens;
1580 std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
1581 struct pair_hash {
1582 size_t operator()(const std::pair<std::string, std::string> & p) const {
1583 return std::hash<std::string>{}(p.first) ^ //create some hash for pair
1584 (std::hash<std::string>{}(p.second) << 1);
1585 }
1586 };
1587 std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
1588
1589 // set of all tokens that cause "end of generation"
1590 std::set<llama_token> special_eog_ids;
1591
1592 std::unique_ptr<llm_tokenizer> tokenizer;
1593
1594 std::vector<char> precompiled_charsmap;
1595
1596 impl(const llama_vocab & vocab) : vocab(vocab) {
1597 }
1598
1599 ~impl() = default;
1600
1601 void load(llama_model_loader & ml, const LLM_KV & kv);
1602
1603 enum llama_vocab_type get_type() const;
1604
1605 std::string type_name() const;
1606
1607 bool is_normal (llama_token id) const;
1608 bool is_unknown (llama_token id) const;
1609 bool is_control (llama_token id) const;
1610 bool is_byte (llama_token id) const;
1611 bool is_user_defined(llama_token id) const;
1612 bool is_unused (llama_token id) const;
1613 bool is_eog (llama_token id) const;
1614
1615 uint8_t token_to_byte(llama_token id) const;
1616
1617 llama_token_attr token_get_attr(llama_token id) const;
1618
1619 void init_tokenizer(enum llama_vocab_type type);
1620
1621 void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
1622
1623 std::string token_to_piece_for_cache(
1624 llama_token token,
1625 bool special) const;
1626
1627
1628 std::vector<llama_token> tokenize(
1629 const std::string & raw_text,
1630 bool add_special,
1631 bool parse_special = false) const;
1632
1633 int32_t tokenize(
1634 const char * text,
1635 int32_t text_len,
1636 llama_token * tokens,
1637 int32_t n_tokens_max,
1638 bool add_special,
1639 bool parse_special) const;
1640
1641 // does not write null-terminator to buf
1642 int32_t token_to_piece(
1643 llama_token token,
1644 char * buf,
1645 int32_t length,
1646 int32_t lstrip,
1647 bool special) const;
1648
1649 // use cached data
1650 const std::string & token_to_piece(llama_token token) const;
1651
1652 int32_t detokenize(
1653 const llama_token * tokens,
1654 int32_t n_tokens,
1655 char * text,
1656 int32_t text_len_max,
1657 bool remove_special,
1658 bool unparse_special) const;
1659
1660 std::string detokenize(
1661 const std::vector<llama_token> & tokens,
1662 bool special) const;
1663
1664 void print_info() const;
1665
1666private:
1667 const llama_vocab & vocab;
1668};
1669
1670void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1671 struct gguf_context * ctx = ml.meta.get();
1672
1673 // determine vocab type
1674 {
1675 ml.get_key(kid: LLM_KV_TOKENIZER_MODEL, result&: tokenizer_model);
1676 ml.get_key(kid: LLM_KV_TOKENIZER_PRE, result&: tokenizer_pre, required: false);
1677
1678 ml.get_key(kid: LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, result&: n_token_types, required: false);
1679
1680 if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
1681 type = LLAMA_VOCAB_TYPE_NONE;
1682
1683 // default special tokens
1684 special_bos_id = LLAMA_TOKEN_NULL;
1685 special_eos_id = LLAMA_TOKEN_NULL;
1686 special_unk_id = LLAMA_TOKEN_NULL;
1687 special_sep_id = LLAMA_TOKEN_NULL;
1688 special_pad_id = LLAMA_TOKEN_NULL;
1689 special_mask_id = LLAMA_TOKEN_NULL;
1690 linefeed_id = LLAMA_TOKEN_NULL;
1691
1692 // read vocab size from metadata
1693 uint32_t n_tokens = 0;
1694 if (ml.get_key(kid: LLM_KV_VOCAB_SIZE, result&: n_tokens, required: false)) {
1695 LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
1696 id_to_token.resize(new_size: n_tokens);
1697 }
1698
1699 return;
1700 }
1701
1702 if (tokenizer_model == "llama") {
1703 type = LLAMA_VOCAB_TYPE_SPM;
1704
1705 // default special tokens
1706 special_bos_id = 1;
1707 special_eos_id = 2;
1708 special_unk_id = 0;
1709 special_sep_id = LLAMA_TOKEN_NULL;
1710 special_pad_id = LLAMA_TOKEN_NULL;
1711 special_mask_id = LLAMA_TOKEN_NULL;
1712 } else if (tokenizer_model == "bert") {
1713 type = LLAMA_VOCAB_TYPE_WPM;
1714
1715 // default special tokens
1716 special_bos_id = 101;
1717 special_eos_id = LLAMA_TOKEN_NULL;
1718 special_unk_id = 100;
1719 special_sep_id = 102;
1720 special_pad_id = 0;
1721 special_mask_id = 103;
1722
1723 add_sep = true;
1724 } else if (tokenizer_model == "gpt2") {
1725 type = LLAMA_VOCAB_TYPE_BPE;
1726
1727 // read bpe merges and populate bpe ranks
1728 const int merges_keyidx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_MERGES).c_str());
1729 if (merges_keyidx == -1) {
1730 throw std::runtime_error("cannot find tokenizer merges in model file\n");
1731 }
1732
1733 const int n_merges = gguf_get_arr_n(ctx, key_id: merges_keyidx);
1734 for (int i = 0; i < n_merges; i++) {
1735 const std::string word = gguf_get_arr_str(ctx, key_id: merges_keyidx, i);
1736 //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
1737
1738 std::string first;
1739 std::string second;
1740
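                    // the search starts at position 1, presumably so that a merge whose
                    // first element is itself a space does not produce an empty first part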
1741 const size_t pos = word.find(c: ' ', pos: 1);
1742
1743 if (pos != std::string::npos) {
1744 first = word.substr(pos: 0, n: pos);
1745 second = word.substr(pos: pos + 1);
1746 }
1747
1748 bpe_ranks.emplace(args: std::make_pair(x&: first, y&: second), args&: i);
1749 }
1750
1751 // default special tokens
1752 special_bos_id = 11;
1753 special_eos_id = 11;
1754 special_unk_id = LLAMA_TOKEN_NULL;
1755 special_sep_id = LLAMA_TOKEN_NULL;
1756 special_pad_id = LLAMA_TOKEN_NULL;
1757 special_mask_id = LLAMA_TOKEN_NULL;
1758 } else if (tokenizer_model == "t5") {
1759 type = LLAMA_VOCAB_TYPE_UGM;
1760
1761 // default special tokens
1762 special_bos_id = LLAMA_TOKEN_NULL;
1763 special_eos_id = 1;
1764 special_unk_id = 2;
1765 special_sep_id = LLAMA_TOKEN_NULL;
1766 special_pad_id = 0;
1767 special_mask_id = LLAMA_TOKEN_NULL;
1768
1769 const int precompiled_charsmap_keyidx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
1770 if (precompiled_charsmap_keyidx != -1) {
1771 const gguf_type pc_type = gguf_get_arr_type(ctx, key_id: precompiled_charsmap_keyidx);
1772 GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
1773
1774 const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, key_id: precompiled_charsmap_keyidx);
1775 const char * pc = (const char *) gguf_get_arr_data(ctx, key_id: precompiled_charsmap_keyidx);
1776 precompiled_charsmap.assign(first: pc, last: pc + n_precompiled_charsmap);
1777#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1778                // correct endianness of data in precompiled_charsmap binary blob
1779 uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1780 *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
1781 assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
1782 size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
1783 uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
1784 for (size_t i = 0; i < xcda_array_size; ++i) {
1785 xcda_array[i] = __builtin_bswap32(xcda_array[i]);
1786 }
1787#endif
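                // blob layout, as read above: a uint32 xcda blob size followed by the XCDA
                // double-array and then, presumably, the replacement data used by the UGM tokenizer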
1788 }
1789 } else if (tokenizer_model == "rwkv") {
1790 type = LLAMA_VOCAB_TYPE_RWKV;
1791
1792 // default special tokens
1793 special_bos_id = LLAMA_TOKEN_NULL;
1794 special_eos_id = LLAMA_TOKEN_NULL;
1795 special_unk_id = LLAMA_TOKEN_NULL;
1796 special_sep_id = LLAMA_TOKEN_NULL;
1797 special_pad_id = LLAMA_TOKEN_NULL;
1798 } else if (tokenizer_model == "plamo2") {
1799 type = LLAMA_VOCAB_TYPE_PLAMO2;
1800
1801 // PLaMo-2 default special tokens (these will be overridden by model config)
1802 special_bos_id = 1; // <|plamo:bos|>
1803 special_eos_id = 2; // <|plamo:eos|>
1804 special_unk_id = 0; // <|plamo:unk|>
1805 special_sep_id = LLAMA_TOKEN_NULL;
1806 special_pad_id = 3; // <|plamo:pad|>
1807 special_mask_id = LLAMA_TOKEN_NULL;
1808 } else {
1809 throw std::runtime_error(format(fmt: "unknown tokenizer: '%s'", tokenizer_model.c_str()));
1810 }
1811
1812 // for now, only BPE models have pre-tokenizers
1813 if (type == LLAMA_VOCAB_TYPE_BPE) {
1814 add_space_prefix = false;
1815 clean_spaces = true;
1816 if (tokenizer_pre.empty()) {
1817 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
1818 LLAMA_LOG_WARN("%s: \n", __func__);
1819 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1820 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
1821 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
1822 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1823 LLAMA_LOG_WARN("%s: \n", __func__);
1824 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1825 } else if (tokenizer_pre == "default") {
1826 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1827 } else if (
1828 tokenizer_pre == "llama3" ||
1829 tokenizer_pre == "llama-v3" ||
1830 tokenizer_pre == "llama-bpe"||
1831 tokenizer_pre == "falcon3" ||
1832 tokenizer_pre == "falcon-h1" ||
1833 tokenizer_pre == "pixtral" ||
1834 tokenizer_pre == "midm-2.0" ||
1835 tokenizer_pre == "lfm2") {
1836 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1837 ignore_merges = true;
1838 add_bos = true;
1839 } else if (
1840 tokenizer_pre == "deepseek-llm") {
1841 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
1842 clean_spaces = false;
1843 } else if (
1844 tokenizer_pre == "deepseek-coder") {
1845 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
1846 clean_spaces = false;
1847 } else if (
1848 tokenizer_pre == "deepseek-v3") {
1849 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
1850 clean_spaces = false;
1851 } else if (
1852 tokenizer_pre == "falcon") {
1853 pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
1854 } else if (
1855 tokenizer_pre == "mpt") {
1856 pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
1857 } else if (
1858 tokenizer_pre == "starcoder") {
1859 pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
1860 } else if (
1861 tokenizer_pre == "gpt-2" ||
1862 tokenizer_pre == "phi-2" ||
1863 tokenizer_pre == "jina-es" ||
1864 tokenizer_pre == "jina-de" ||
1865 tokenizer_pre == "gigachat" ||
1866 tokenizer_pre == "jina-v2-es" ||
1867 tokenizer_pre == "jina-v2-de" ||
1868 tokenizer_pre == "a.x-4.0" ||
1869 tokenizer_pre == "mellum") {
1870 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1871 } else if (
1872 tokenizer_pre == "jina-v1-en" ||
1873 tokenizer_pre == "jina-v2-code" ||
1874 tokenizer_pre == "roberta-bpe") {
1875 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1876 add_sep = true;
1877 } else if (
1878 tokenizer_pre == "refact") {
1879 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
1880 } else if (
1881 tokenizer_pre == "command-r") {
1882 pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
1883 clean_spaces = false;
1884 } else if (
1885 tokenizer_pre == "qwen2" ||
1886 tokenizer_pre == "deepseek-r1-qwen") {
1887 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1888 clean_spaces = false;
1889 } else if (
1890 tokenizer_pre == "stablelm2") {
1891 pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
1892 } else if (
1893 tokenizer_pre == "olmo") {
1894 pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
1895 } else if (
1896 tokenizer_pre == "dbrx") {
1897 pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
1898 } else if (
1899 tokenizer_pre == "smaug-bpe") {
1900 pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
1901 } else if (
1902 tokenizer_pre == "poro-chat") {
1903 pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
1904 clean_spaces = false;
1905 } else if (
1906 tokenizer_pre == "glm4" ||
1907 tokenizer_pre == "chatglm-bpe") {
1908 pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
1909 special_bos_id = LLAMA_TOKEN_NULL;
1910 } else if (
1911 tokenizer_pre == "viking") {
1912 pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
1913 clean_spaces = false;
1914 } else if (
1915 tokenizer_pre == "jais") {
1916 pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
1917 } else if (
1918 tokenizer_pre == "tekken") {
1919 pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
1920 clean_spaces = false;
1921 ignore_merges = true;
1922 add_bos = true;
1923 } else if (
1924 tokenizer_pre == "smollm") {
1925 pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
1926 clean_spaces = false;
1927 } else if (
1928 tokenizer_pre == "codeshell") {
1929 pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
1930 } else if (
1931 tokenizer_pre == "bloom") {
1932 pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
1933 } else if (
1934 tokenizer_pre == "gpt3-finnish") {
1935 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
1936 } else if (
1937 tokenizer_pre == "exaone") {
1938 pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1939 } else if (
1940 tokenizer_pre == "exaone4") {
1941 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1942 } else if (
1943 tokenizer_pre == "chameleon") {
1944 pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
1945 add_bos = true;
1946 clean_spaces = false;
1947 } else if (
1948 tokenizer_pre == "minerva-7b") {
1949 pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
1950 } else if (
1951 tokenizer_pre == "megrez") {
1952 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1953 } else if (
1954 tokenizer_pre == "gpt-4o" ||
1955 tokenizer_pre == "llama4") {
1956 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
1957 clean_spaces = false;
1958 } else if (
1959 tokenizer_pre == "superbpe") {
1960 pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
1961 clean_spaces = false;
1962 } else if (
1963 tokenizer_pre == "trillion") {
1964 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
1965 clean_spaces = false;
1966 } else if (
1967 tokenizer_pre == "granite-docling") {
1968 pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
1969 clean_spaces = false;
1970 } else if (
1971 tokenizer_pre == "bailingmoe" ||
1972 tokenizer_pre == "bailingmoe2" ||
1973 tokenizer_pre == "llada-moe") {
1974 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
1975 clean_spaces = false;
1976 } else if (
1977 tokenizer_pre == "seed-coder") {
1978 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
1979 clean_spaces = false;
1980 } else if (
1981 tokenizer_pre == "hunyuan") {
1982 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
1983 clean_spaces = false;
1984 } else if (
1985 tokenizer_pre == "hunyuan-dense") {
1986 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
1987 clean_spaces = false;
1988 } else if (
1989 tokenizer_pre == "kimi-k2") {
1990 pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
1991 clean_spaces = false;
1992 } else if (
1993 tokenizer_pre == "grok-2") {
1994 pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
1995 clean_spaces = false;
1996 } else if (
1997 tokenizer_pre == "minimax-m2") {
1998 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
1999 clean_spaces = false;
2000 } else {
2001 throw std::runtime_error(format(fmt: "unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
2002 }
2003 } else if (type == LLAMA_VOCAB_TYPE_SPM) {
2004 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2005 add_space_prefix = true;
2006 clean_spaces = false;
2007 add_bos = true;
2008 add_eos = false;
2009 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2010 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2011 add_space_prefix = false;
2012 clean_spaces = true;
2013 add_bos = true;
2014 add_eos = false;
2015 add_sep = true;
2016 } else if (type == LLAMA_VOCAB_TYPE_UGM) {
2017 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2018 add_bos = false;
2019 add_eos = true;
2020 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2021 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2022 add_space_prefix = false;
2023 clean_spaces = false;
2024 add_bos = false;
2025 add_eos = false;
2026 } else {
2027 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2028 }
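        // note: the per-type defaults chosen above can still be overridden by explicit
        // GGUF keys, both right below and in the special-token handling further down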
2029
2030 ml.get_key(kid: LLM_KV_TOKENIZER_ADD_PREFIX, result&: add_space_prefix, required: false);
2031 ml.get_key(kid: LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, result&: remove_extra_whitespaces, required: false);
2032 }
2033
2034 const int token_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_LIST).c_str());
2035 if (token_idx == -1) {
2036 throw std::runtime_error("cannot find tokenizer vocab in model file\n");
2037 }
2038
2039 const float * scores = nullptr;
2040 const int score_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_SCORES).c_str());
2041 if (score_idx != -1) {
2042 scores = (const float * ) gguf_get_arr_data(ctx, key_id: score_idx);
2043 }
2044
2045 const int * toktypes = nullptr;
2046 const int toktype_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
2047 if (toktype_idx != -1) {
2048 toktypes = (const int * ) gguf_get_arr_data(ctx, key_id: toktype_idx);
2049 }
2050
2051 uint32_t n_tokens = gguf_get_arr_n(ctx, key_id: token_idx);
2052 id_to_token.resize(new_size: n_tokens);
2053
2054 for (uint32_t i = 0; i < n_tokens; i++) {
2055 std::string word = gguf_get_arr_str(ctx, key_id: token_idx, i);
2056 if (word.empty()) {
2057 LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
2058 word = "[EMPTY_" + std::to_string(val: i) + "]";
2059 }
2060
2061 token_to_id[word] = i;
2062 max_token_len = std::max(a: max_token_len, b: (int) word.size());
2063
2064 auto & token_data = id_to_token[i];
2065 token_data.text = std::move(word);
2066 token_data.score = scores ? scores[i] : 0.0f;
2067 token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
2068
2069 if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
2070 switch(toktypes[i]) {
2071 case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
2072 case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
2073 case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
2074 case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
2075 case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
2076 case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
2077 case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2078 default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2079 }
2080 }
2081 }
2082 GGML_ASSERT(id_to_token.size() == token_to_id.size());
2083
2084 init_tokenizer(type);
2085
2086 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2087 if (type == LLAMA_VOCAB_TYPE_SPM) {
2088 try {
2089 linefeed_id = vocab.byte_to_token(ch: '\n');
2090 } catch (const std::exception & e) {
2091            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, e.what());
2092 linefeed_id = special_pad_id;
2093 }
2094 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2095 linefeed_id = special_pad_id;
2096 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2097 const std::vector<int> ids = tokenize(raw_text: "\n", add_special: false);
2098 GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2099 linefeed_id = ids[0];
2100 } else {
2101 const std::vector<int> ids = tokenize(raw_text: "\n", add_special: false);
2102
2103 //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2104 if (ids.empty()) {
2105 LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
2106 linefeed_id = special_pad_id;
2107 } else {
2108 linefeed_id = ids[0];
2109 }
2110 }
2111
2112 // special tokens
2113 {
2114 const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
2115 { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
2116 { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
2117 { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
2118 { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
2119 { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
2120 { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
2121 { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
2122 { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
2123 { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
2124 { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
2125 { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
2126 { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
2127 { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
2128 { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
2129
2130 // deprecated
2131 { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
2132 { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
2133 { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
2134 };
2135
2136 for (const auto & it : special_token_types) {
2137 const std::string & key = kv(std::get<0>(in: it));
2138 int32_t & id = std::get<1>(in: it);
2139
2140 uint32_t new_id;
2141 if (!ml.get_key(kid: std::get<0>(in: it), result&: new_id, required: false)) {
2142 continue;
2143 }
2144 if (new_id >= id_to_token.size()) {
2145 LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
2146 __func__, key.c_str(), new_id, id);
2147 } else {
2148 id = new_id;
2149 }
2150 }
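        // e.g. a GGUF key such as tokenizer.ggml.bos_token_id = 128000 replaces the
        // per-model default, subject to the range check against the vocab size above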
2151
2152 // Handle add_bos, add_eos and add_sep
2153 {
2154 bool temp = true;
2155
2156 if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_BOS, result&: temp, required: false)) {
2157 add_bos = temp;
2158 }
2159 if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_EOS, result&: temp, required: false)) {
2160 add_eos = temp;
2161 }
2162 if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_SEP, result&: temp, required: false)) {
2163 add_sep = temp;
2164 }
2165 }
2166
2167 // auto-detect special tokens by text
2168 // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
2169 // for now, we apply this workaround to find the tokens based on their text
2170
2171 for (const auto & t : token_to_id) {
2172 // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
2173 if (special_eot_id == LLAMA_TOKEN_NULL) {
2174 if (false
2175 || t.first == "<|eot_id|>"
2176 || t.first == "<|im_end|>"
2177 || t.first == "<|end|>"
2178 || t.first == "<end_of_turn>"
2179 || t.first == "<|endoftext|>"
2180 || t.first == "<|end_of_text|>" // granite
2181 || t.first == "<EOT>"
2182 || t.first == "_<EOT>"
2183 || t.first == "<|end▁of▁sentence|>" // DeepSeek
2184 || t.first == "<end_of_utterance>" // smoldocling
2185 ) {
2186 special_eot_id = t.second;
2187 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2188                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2189 __func__, t.second, t.first.c_str());
2190 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2191 }
2192 }
2193 }
2194
2195 // find EOM token: "<|eom_id|>"
2196 if (special_eom_id == LLAMA_TOKEN_NULL) {
2197 if (false
2198 || t.first == "<|eom_id|>"
2199 ) {
2200 special_eom_id = t.second;
2201 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2202                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2203 __func__, t.second, t.first.c_str());
2204 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2205 }
2206 }
2207 }
2208
2209 // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
2210 if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
2211 if (false
2212 || t.first == "<|fim_prefix|>" // Qwen
2213 || t.first == "<fim-prefix>"
2214 || t.first == "<fim_prefix>" // Granite
2215 || t.first == "<|fim▁begin|>" // DeepSeek
2216 || t.first == "<PRE>"
2217 || t.first == "▁<PRE>" // CodeLlama
2218 || t.first == "<|code_prefix|>" // GLM-4.5
2219 ) {
2220 special_fim_pre_id = t.second;
2221 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2222                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2223 __func__, t.second, t.first.c_str());
2224 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2225 }
2226 }
2227 }
2228
2229 // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
2230 if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
2231 if (false
2232 || t.first == "<|fim_suffix|>" // Qwen
2233 || t.first == "<fim-suffix>"
2234 || t.first == "<fim_suffix>" // Granite
2235 || t.first == "<|fim▁hole|>" // DeepSeek
2236 || t.first == "<SUF>"
2237 || t.first == "▁<SUF>" // CodeLlama
2238 || t.first == "<|code_suffix|>" // GLM-4.5
2239 ) {
2240 special_fim_suf_id = t.second;
2241 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2242                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2243 __func__, t.second, t.first.c_str());
2244 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2245 }
2246 }
2247 }
2248
2249 // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
2250 if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
2251 if (false
2252 || t.first == "<|fim_middle|>" // Qwen
2253 || t.first == "<fim-middle>"
2254 || t.first == "<fim_middle>" // Granite
2255 || t.first == "<|fim▁end|>" // DeepSeek
2256 || t.first == "<MID>"
2257 || t.first == "▁<MID>" // CodeLlama
2258 || t.first == "<|code_middle|>" // GLM-4.5
2259 ) {
2260 special_fim_mid_id = t.second;
2261 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2262                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2263 __func__, t.second, t.first.c_str());
2264 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2265 }
2266 }
2267 }
2268
2269 // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
2270 if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
2271 if (false
2272 || t.first == "<|fim_pad|>" // Qwen
2273 || t.first == "<fim-pad>"
2274 || t.first == "<fim_pad>" // Granite
2275 || t.first == "<PAD>"
2276 ) {
2277 special_fim_pad_id = t.second;
2278 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2279                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2280 __func__, t.second, t.first.c_str());
2281 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2282 }
2283 }
2284 }
2285
2286 // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
2287 if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
2288 if (false
2289 || t.first == "<|fim_repo|>" // Qwen
2290 || t.first == "<|repo_name|>"
2291 || t.first == "<fim-repo>"
2292 || t.first == "<REPO>"
2293 || t.first == "<reponame>" // Granite
2294 ) {
2295 special_fim_rep_id = t.second;
2296 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2297                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2298 __func__, t.second, t.first.c_str());
2299 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2300 }
2301 }
2302 }
2303
2304 // find FIM_SEP token: "<|file_sep|>"
2305 if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
2306 if (false
2307 || t.first == "<|file_sep|>" // Qwen
2308 ) {
2309 special_fim_sep_id = t.second;
2310 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2311                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2312 __func__, t.second, t.first.c_str());
2313 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2314 }
2315 }
2316 }
2317 }
2318
2319 // maintain a list of tokens that cause end-of-generation
2320 // this is currently determined based on the token text, which is obviously not ideal
2321 // ref: https://github.com/ggerganov/llama.cpp/issues/9606
2322 special_eog_ids.clear();
2323
2324 if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_pad_id) == 0) {
2325 special_eog_ids.insert(x: special_fim_pad_id);
2326 }
2327
2328 if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_rep_id) == 0) {
2329 special_eog_ids.insert(x: special_fim_rep_id);
2330 }
2331
2332 if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_sep_id) == 0) {
2333 special_eog_ids.insert(x: special_fim_sep_id);
2334 }
2335
2336 for (const auto & t : token_to_id) {
2337 if (false
2338 || t.first == "<|eot_id|>"
2339 || t.first == "<|im_end|>"
2340 || t.first == "<|end|>"
2341 || t.first == "<|return|>" // o200k_harmony
2342 || t.first == "<|call|>" // o200k_harmony
2343 || t.first == "<end_of_turn>"
2344 || t.first == "<|endoftext|>"
2345 || t.first == "<|eom_id|>"
2346 || t.first == "<EOT>"
2347 || t.first == "_<EOT>"
2348 || t.first == "<|end_of_text|>"
2349 || t.first == "<end_of_utterance>" // smoldocling
2350 ) {
2351 special_eog_ids.insert(x: t.second);
2352 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2353                LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2354 __func__, t.second, t.first.c_str());
2355 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2356 }
2357 } else {
2358 // token is control, but not marked as EOG -> print a debug log
2359 if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(x: t.second) == 0) {
2360 LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
2361 __func__, t.second, t.first.c_str());
2362 }
2363 }
2364 }
2365
2366 // @ngxson : quick hack for gpt-oss, always render these tokens
2367 for (const auto & t : token_to_id) {
2368 if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2369 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2370 }
2371 }
2372
2373 // sanity checks
2374 if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eos_id) == 0) {
2375 special_eog_ids.insert(x: special_eos_id);
2376 LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2377 }
2378
2379 if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eot_id) == 0) {
2380 special_eog_ids.insert(x: special_eot_id);
2381 LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2382 }
2383
2384 if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eom_id) == 0) {
2385 special_eog_ids.insert(x: special_eom_id);
2386 LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2387 }
2388
2389 // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
2390 // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
2391 // we remove the "<|end|>" token from the EOG list
2392 {
2393 bool has_return = false;
2394 bool has_call = false;
2395 bool has_end = false;
2396
2397 llama_token end_id = LLAMA_TOKEN_NULL;
2398
2399 LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
2400 for (auto tid : special_eog_ids) {
2401 LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
2402
2403 if (id_to_token[tid].text == "<|return|>") {
2404 has_return = true;
2405 } else if (id_to_token[tid].text == "<|call|>") {
2406 has_call = true;
2407 } else if (id_to_token[tid].text == "<|end|>") {
2408 has_end = true;
2409 end_id = tid;
2410 }
2411 }
2412
2413 if (has_return && has_call && has_end) {
2414 special_eog_ids.erase(x: end_id);
2415 id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2416 LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2417 }
2418 }
2419 }
2420
2421 // build special tokens cache
2422 {
2423 for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
2424 if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
2425 cache_special_tokens.push_back(x: id);
2426 }
2427 }
2428
2429 std::sort(first: cache_special_tokens.begin(), last: cache_special_tokens.end(),
2430 comp: [&] (const llama_token a, const llama_token b) {
2431 return id_to_token[a].text.size() > id_to_token[b].text.size();
2432 }
2433 );
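        // longest-first order matters: tokenizer_st_partition() scans special tokens in
        // this order, so a longer token wins over a shorter one it contains
        // (e.g. "▁<PRE>" is matched before "<PRE>")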
2434
2435 LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
2436 }
2437
2438 // build token to piece cache
2439 {
2440 size_t size_cache = 0;
2441
2442 std::vector<std::string> cache(n_tokens);
2443
2444 for (uint32_t id = 0; id < n_tokens; ++id) {
2445 cache[id] = token_to_piece_for_cache(token: id, special: true);
2446
2447 size_cache += cache[id].size();
2448 }
2449
2450 std::swap(x&: cache_token_to_piece, y&: cache);
2451
2452 LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
2453 }
2454
2455 // Handle per token attributes
2456 //NOTE: Each model customizes per token attributes.
2457 //NOTE: Per token attributes are missing from the GGUF file.
2458 //TODO: Extract attributes from GGUF file.
2459 {
2460 auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2461 for (const auto & substr : substrs) {
2462 if (str.find(svt: substr) != std::string::npos) {
2463 return true;
2464 }
2465 }
2466 return false;
2467 };
2468
2469 auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
2470 uint32_t current = id_to_token.at(n: id).attr;
2471 current = value ? (current | attr) : (current & ~attr);
2472 id_to_token[id].attr = (llama_token_attr) current;
2473 };
2474
2475 auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
2476 _set_tokenid_attr(token_to_id.at(k: token), attr, value);
2477 };
2478
2479 std::string model_name;
2480 std::string tokenizer_pre;
2481 std::string general_arch;
2482
2483 ml.get_key(kid: LLM_KV_GENERAL_NAME, result&: model_name, required: false);
2484 ml.get_key(kid: LLM_KV_TOKENIZER_PRE, result&: tokenizer_pre, required: false);
2485 ml.get_key(kid: LLM_KV_GENERAL_ARCHITECTURE, result&: general_arch, required: false);
2486
2487 // model name to lowercase
2488 std::transform(first: model_name.begin(), last: model_name.end(), result: model_name.begin(),
2489 unary_op: [] (const std::string::value_type x) {
2490 return std::tolower(c: x);
2491 }
2492 );
2493
2494 // set attributes by model/tokenizer/architecture name
2495 if (false
2496 || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2497 || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
2498 ) {
2499 if (token_to_id.count(x: "<mask>") == 0) {
2500 LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2501 } else {
2502 _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2503 }
2504 } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2505 for (auto id : cache_special_tokens) {
2506 _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
2507 }
2508 for (const auto * token : {"</s>"}) {
2509 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
2510 }
2511 for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
2512 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
2513 }
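            // RSTRIP makes tokenizer_st_partition() consume whitespace to the right of a
            // special-token match, e.g. "</s>  \nfoo" tokenizes as the token followed by "foo"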
2514 }
2515 }
2516}
2517
2518enum llama_vocab_type llama_vocab::impl::get_type() const {
2519 return type;
2520}
2521
2522std::string llama_vocab::impl::type_name() const {
2523 switch (type) {
2524 case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2525 case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2526 case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2527 case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2528 case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2529 case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2530 case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
2531 default: return "unknown";
2532 }
2533}
2534
2535bool llama_vocab::impl::is_normal(llama_token id) const {
2536 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2537 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
2538}
2539
2540bool llama_vocab::impl::is_unknown(llama_token id) const {
2541 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2542 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
2543}
2544
2545bool llama_vocab::impl::is_control(llama_token id) const {
2546 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2547 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
2548}
2549
2550bool llama_vocab::impl::is_byte(llama_token id) const {
2551 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2552 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
2553}
2554
2555bool llama_vocab::impl::is_user_defined(llama_token id) const {
2556 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2557 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
2558}
2559
2560bool llama_vocab::impl::is_unused(llama_token id) const {
2561 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2562 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
2563}
2564
2565bool llama_vocab::impl::is_eog(llama_token id) const {
2566 return id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: id) > 0;
2567}
2568
2569uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
2570 GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
2571 GGML_ASSERT(is_byte(id));
2572 const auto & token_data = id_to_token.at(n: id);
2573 switch (get_type()) {
2574 case LLAMA_VOCAB_TYPE_SPM:
2575 case LLAMA_VOCAB_TYPE_UGM: {
2576 auto buf = token_data.text.substr(pos: 3, n: 2);
2577 return strtol(nptr: buf.c_str(), NULL, base: 16);
2578 }
2579 case LLAMA_VOCAB_TYPE_BPE: {
2580 GGML_ABORT("fatal error");
2581 }
2582 case LLAMA_VOCAB_TYPE_WPM: {
2583 GGML_ABORT("fatal error");
2584 }
2585 default:
2586 GGML_ABORT("fatal error");
2587 }
2588}
2589
2590llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
2591 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2592 return id_to_token.at(n: id).attr;
2593}
2594
2595void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
2596 LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
2597
2598 switch (type) {
2599 case LLAMA_VOCAB_TYPE_SPM:
2600 tokenizer = std::make_unique<llm_tokenizer_spm>(args: vocab);
2601 break;
2602 case LLAMA_VOCAB_TYPE_BPE:
2603 tokenizer = std::make_unique<llm_tokenizer_bpe>(args: vocab);
2604 break;
2605 case LLAMA_VOCAB_TYPE_WPM:
2606 tokenizer = std::make_unique<llm_tokenizer_wpm>(args: vocab);
2607 break;
2608 case LLAMA_VOCAB_TYPE_UGM:
2609 tokenizer = std::make_unique<llm_tokenizer_ugm>(args: vocab, args&: precompiled_charsmap);
2610 break;
2611 case LLAMA_VOCAB_TYPE_RWKV:
2612 tokenizer = std::make_unique<llm_tokenizer_rwkv>(args: vocab);
2613 break;
2614 case LLAMA_VOCAB_TYPE_PLAMO2:
2615 tokenizer = std::make_unique<llm_tokenizer_plamo2>(args: vocab);
2616 break;
2617 default:
2618 GGML_ABORT("unsupported vocab type");
2619 }
2620}
2621
2622//
2623// (de-) tokenize
2624//
2625
2626// #define PRETOKENIZERDEBUG
2627
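// partitions the fragment buffer around occurrences of special tokens
// example: with special token "<|im_end|>", the single fragment "Hi<|im_end|>there"
// becomes [RAW_TEXT "Hi"][TOKEN <|im_end|>][RAW_TEXT "there"]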
2628void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
2629 // for each special token
2630 for (const llama_token special_id : cache_special_tokens) {
2631 const auto & data = vocab.get_token_data(id: special_id);
2632 const auto & text = data.text;
2633
2634        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
2635            // Ignore control and unknown tokens when parse_special == false.
2636            // User-defined tokens are still pre-tokenized before everything else
2637            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
2638            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
2639            continue;
2640        }
2641
2642 // for each text fragment
2643 std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
2644 while (it != buffer.end()) {
2645 auto & fragment = (*it);
2646
2647 // if a fragment is text ( not yet processed )
2648 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2649 const auto & raw_text = fragment.raw_text;
2650
2651 auto raw_text_base_offset = fragment.offset;
2652 auto raw_text_base_length = fragment.length;
2653
2654 // loop over the text
2655 while (true) {
2656 // find the first occurrence of a given special token in this fragment
2657                // passing the offset argument only limits the "search area", but match
2658                // coordinates are still relative to the full source raw_text
2659 // string_view begins at pos 0 for the same reason
2660 auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(str: text, pos: raw_text_base_offset);
2661
2662 // no occurrences found, stop processing this fragment for a given special token
2663 if (match == std::string::npos) break;
2664
2665#ifdef PRETOKENIZERDEBUG
2666 LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2667#endif
2668 auto source = std::distance(first: buffer.begin(), last: it);
2669
2670 // if match is further than base offset
2671 // then we have some text to the left of it
2672 if (match > raw_text_base_offset) {
2673 // left
2674                        const int64_t left_remainder_offset = raw_text_base_offset + 0;
2675                        int64_t left_remainder_length = match - raw_text_base_offset;
2676
2677                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
2678                            while (left_remainder_length > 0 && isspace(raw_text[left_remainder_offset + left_remainder_length - 1])) {
2679                                left_remainder_length--;
2680 }
2681 }
2682
2683                        if (left_remainder_length > 0) {
2684                            buffer.emplace_after(pos: it, args: raw_text, args: left_remainder_offset, args&: left_remainder_length);
2685 it++;
2686 }
2687
2688#ifdef PRETOKENIZERDEBUG
2689 LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
2690#endif
2691 }
2692
2693 // special token
2694 buffer.emplace_after(pos: it, args: special_id);
2695 it++;
2696
2697 // right
2698 if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
2699                        int64_t right_remainder_offset = match + text.length();
2700                        int64_t right_remainder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
2701
2702                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
2703                            while (right_remainder_length > 0 && isspace(raw_text[right_remainder_offset])) {
2704                                right_remainder_offset++;
2705                                right_remainder_length--;
2706 }
2707 }
2708
2709                        if (right_remainder_length > 0) {
2710                            buffer.emplace_after(pos: it, args: raw_text, args&: right_remainder_offset, args&: right_remainder_length);
2711 it++;
2712 }
2713
2714#ifdef PRETOKENIZERDEBUG
2715 LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
2716#endif
2717
2718 if (source == 0) {
2719 buffer.erase_after(pos: buffer.before_begin());
2720 } else {
2721 buffer.erase_after(pos: std::next(x: buffer.begin(), n: (source - 1)));
2722 }
2723
2724 // repeat for the right side
2725                            raw_text_base_offset = right_remainder_offset;
2726                            raw_text_base_length = right_remainder_length;
2727
2728#ifdef PRETOKENIZERDEBUG
2729 LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2730#endif
2731 } else {
2732 if (source == 0) {
2733 buffer.erase_after(pos: buffer.before_begin());
2734 } else {
2735 buffer.erase_after(pos: std::next(x: buffer.begin(), n: (source - 1)));
2736 }
2737 break;
2738 }
2739 }
2740 }
2741 it++;
2742 }
2743 }
2744}
2745
2746// NOTE: avoid ever using this except for building the token_to_piece caches
2747std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
2748 std::string piece;
2749    piece.resize(n: piece.capacity()); // start from the string's existing capacity to avoid an extra allocation in the common case
2750 const int n_chars = vocab.token_to_piece(token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
2751 if (n_chars < 0) {
2752 piece.resize(n: -n_chars);
2753 int check = vocab.token_to_piece(token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
2754 GGML_ASSERT(check == -n_chars);
2755 }
2756 else {
2757 piece.resize(n: n_chars);
2758 }
2759
2760 return piece;
2761}
2762
2763static void llama_escape_whitespace(std::string & text) {
2764 replace_all(s&: text, search: " ", replace: "\xe2\x96\x81");
2765}
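
// example: "Hello world" -> "Hello\xe2\x96\x81world"; the sequence is U+2581
// (LOWER ONE EIGHTH BLOCK), the marker sentencepiece uses for whitespace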
2766
2767static void llama_unescape_whitespace(std::string & word) {
2768 replace_all(s&: word, search: "\xe2\x96\x81", replace: " ");
2769}
2770
2771static std::string llama_decode_text(const std::string & text) {
2772 std::string decoded_text;
2773
2774 const auto cpts = unicode_cpts_from_utf8(utf8: text);
2775 for (const auto cpt : cpts) {
2776 const auto utf8 = unicode_cpt_to_utf8(cpt);
2777 try {
2778 decoded_text += unicode_utf8_to_byte(utf8);
2779 } catch (const std::out_of_range & /*e*/) {
2780 decoded_text += "[UNK_BYTE_0x";
2781 for (const auto c : utf8) {
2782 decoded_text += format(fmt: "%02x", (uint8_t) c);
2783 }
2784 decoded_text += text + "]";
2785 }
2786 }
2787
2788 return decoded_text;
2789}
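
// example: BPE pieces store raw bytes via the GPT-2 byte-to-unicode mapping, so the
// piece "Ġhello" decodes to " hello" ('Ġ' == U+0120 maps back to byte 0x20)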
2790
2791std::vector<llama_token> llama_vocab::impl::tokenize(
2792 const std::string & raw_text,
2793 bool add_special,
2794 bool parse_special) const {
2795 GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
2796
2797 std::vector<llama_token> output;
2798 std::forward_list<fragment_buffer_variant> fragment_buffer;
2799
2800 if (!raw_text.empty()) {
2801 fragment_buffer.emplace_front(args: raw_text, args: 0, args: raw_text.length());
2802 tokenizer_st_partition(buffer&: fragment_buffer, parse_special);
2803 }
2804
2805 switch (get_type()) {
2806 case LLAMA_VOCAB_TYPE_SPM:
2807 {
2808 // OG tokenizer behavior:
2809 //
2810 // tokenizer.encode('', add_special_tokens=True) returns [1]
2811 // tokenizer.encode('', add_special_tokens=False) returns []
2812
2813 bool is_prev_special = true; // prefix with space if first token
2814
2815 if (add_special && add_bos) {
2816 GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2817 output.push_back(x: special_bos_id);
2818 is_prev_special = true;
2819 }
2820
2821 for (const auto & fragment : fragment_buffer) {
2822 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2823 std::string text;
2824
2825 // prefix with space if previous is special
2826 if (add_space_prefix && is_prev_special) {
2827 text = ' ';
2828 }
2829
2830 text += fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2831
2832#ifdef PRETOKENIZERDEBUG
2833 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2834#endif
2835 llama_escape_whitespace(text);
2836 llm_tokenizer_spm_session session(vocab);
2837 session.tokenize(text, output);
2838 is_prev_special = false;
2839 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2840 output.push_back(x: fragment.token);
2841 is_prev_special = true;
2842 }
2843 }
2844
2845 if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
2846 LLAMA_LOG_WARN(
2847 "%s: Added a BOS token to the prompt as specified by the model but the prompt "
2848 "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
2849 "Are you sure this is what you want?\n", __FUNCTION__);
2850 }
2851
2852 if (add_special && add_eos) {
2853 GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
2854 output.push_back(x: special_eos_id);
2855 }
2856 } break;
2857 case LLAMA_VOCAB_TYPE_BPE:
2858 {
2859 llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
2860                // the session calls methods that do not exist on the base llm_tokenizer,
2861                // hence the cast above to the BPE tokenizer type
2862 if (add_special) {
2863 session.append_bos(output);
2864 }
2865 for (const auto & fragment : fragment_buffer) {
2866 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2867 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2868
2869#ifdef PRETOKENIZERDEBUG
2870 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2871#endif
2872 session.tokenize(text, output);
2873 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2874 session.append(token_id: fragment.token, output);
2875 }
2876 }
2877
2878 if (add_special) {
2879 session.append_eos(output);
2880 session.check_double_bos_eos(output);
2881 }
2882 } break;
2883 case LLAMA_VOCAB_TYPE_WPM:
2884 {
2885 if (add_special) {
2886 GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2887 output.push_back(x: special_bos_id);
2888 }
2889
2890 llm_tokenizer_wpm_session session(vocab);
2891
2892 for (const auto & fragment : fragment_buffer) {
2893 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2894 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2895
2896#ifdef PRETOKENIZERDEBUG
2897 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2898#endif
2899 session.tokenize(text, output);
2900 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2901 output.push_back(x: fragment.token);
2902 }
2903 }
2904
2905 if (add_special) {
2906 GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
2907 output.push_back(x: special_sep_id);
2908 }
2909 } break;
2910 case LLAMA_VOCAB_TYPE_UGM:
2911 {
2912 if (add_special && add_bos) {
2913 GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2914 output.push_back(x: special_bos_id);
2915 }
2916 llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
2917
2918 for (const auto & fragment : fragment_buffer) {
2919 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2920 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2921#ifdef PRETOKENIZERDEBUG
2922 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2923#endif
2924 session.tokenize(text, output);
2925 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2926 output.push_back(x: fragment.token);
2927 }
2928 }
2929
2930 if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
2931 LLAMA_LOG_WARN(
2932 "%s: Added a BOS token to the prompt as specified by the model but the prompt "
2933 "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
2934 "Are you sure this is what you want?\n", __FUNCTION__);
2935 }
2936
2937 if (add_special && add_eos) {
2938 GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
2939 output.push_back(x: special_eos_id);
2940 }
2941 } break;
2942 case LLAMA_VOCAB_TYPE_RWKV:
2943 {
2944 llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
2945 for (const auto & fragment : fragment_buffer) {
2946 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2947 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2948
2949#ifdef PRETOKENIZERDEBUG
2950 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2951#endif
2952
2953 session.tokenize(text, output);
2954 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2955 output.push_back(x: fragment.token);
2956 }
2957 }
2958 } break;
2959 case LLAMA_VOCAB_TYPE_PLAMO2:
2960 {
2961 llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
2962 for (const auto & fragment : fragment_buffer) {
2963 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2964 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2965
2966#ifdef PRETOKENIZERDEBUG
2967 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2968#endif
2969
2970 session.tokenize(text, output);
2971 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2972 output.push_back(x: fragment.token);
2973 }
2974 }
2975 } break;
2976 case LLAMA_VOCAB_TYPE_NONE:
2977 GGML_ABORT("fatal error");
2978 }
2979
2980 return output;
2981}
2982
2983int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
2984 // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
2985 static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
2986 const llama_token_attr attr = token_get_attr(id: token);
2987 if (!special && (attr & attr_special)) {
2988 return 0;
2989 }
2990
2991 // copy piece chars to output text buffer
2992 // skip up to 'lstrip' leading spaces before copying
2993 auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
2994 if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
2995 GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
2996 }
2997
2998 for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
2999 token++;
3000 size--;
3001 }
3002 if (length < (int32_t)size) {
3003 return -(int32_t) size;
3004 }
3005 memcpy(dest: buf, src: token, n: size);
3006 return (int32_t) size;
3007 };
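    // note: as in the public API, a negative return from _try_copy is the negated
    // number of bytes required, letting the caller resize the buffer and retry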
3008
3009 // if we have a cache - use it
3010 {
3011 const auto & cache = cache_token_to_piece;
3012
3013 if (!cache.empty()) {
3014 const auto & result = cache.at(n: token);
3015 return _try_copy(result.data(), result.size());
3016 }
3017 }
3018
3019 if (0 <= token && token < (int32_t) id_to_token.size()) {
3020 const std::string & token_text = id_to_token[token].text;
3021 switch (get_type()) {
3022 case LLAMA_VOCAB_TYPE_WPM:
3023 case LLAMA_VOCAB_TYPE_SPM:
3024 case LLAMA_VOCAB_TYPE_UGM: {
3025 // NOTE: we accept all unsupported token types,
3026 // suppressing them like CONTROL tokens.
3027 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3028 return _try_copy(token_text.data(), token_text.size());
3029 }
3030 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3031 std::string result = token_text;
3032 llama_unescape_whitespace(word&: result);
3033 return _try_copy(result.data(), result.size());
3034 }
3035 if (attr & LLAMA_TOKEN_ATTR_BYTE) {
3036 char byte = (char) token_to_byte(id: token);
3037 return _try_copy((char*) &byte, 1);
3038 }
3039 break;
3040 }
3041 case LLAMA_VOCAB_TYPE_BPE: {
3042 // NOTE: we accept all unsupported token types,
3043 // suppressing them like CONTROL tokens.
3044 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3045 return _try_copy(token_text.data(), token_text.size());
3046 }
3047 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3048 std::string result = llama_decode_text(text: token_text);
3049 return _try_copy(result.data(), result.size());
3050 }
3051 break;
3052 }
3053 case LLAMA_VOCAB_TYPE_RWKV: {
3054 std::vector<uint8_t> result = llama_unescape_rwkv_token(escaped: token_text);
3055
3056 // If we don't have enough space, return an error
3057 if (result.size() > (size_t)length) {
3058 return -(int)result.size();
3059 }
3060
3061 memcpy(dest: buf, src: result.data(), n: result.size());
3062 return (int)result.size();
3063 }
3064 case LLAMA_VOCAB_TYPE_PLAMO2: {
3065 // PLaMo-2 uses similar token handling as BPE/SPM
3066 if (vocab.is_byte(id: token)) {
3067 // Handle byte tokens like <0xXX>
3068 if (token_text.length() == 6 && token_text.substr(pos: 0, n: 3) == "<0x" && token_text.back() == '>') {
3069 int hex_val = std::stoi(str: token_text.substr(pos: 3, n: 2), idx: nullptr, base: 16);
3070 if (length < 1) {
3071 return -1;
3072 }
3073 buf[0] = static_cast<char>(hex_val);
3074 return 1;
3075 }
3076 }
3077
3078 // Normal token - just copy the text
3079 std::string result = token_text;
3080 return _try_copy(result.data(), result.size());
3081 }
3082 default:
3083 GGML_ABORT("fatal error");
3084 }
3085 }
3086
3087 return 0;
3088}
3089
3090const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
3091 return cache_token_to_piece.at(n: token);
3092}
3093
3094int32_t llama_vocab::impl::detokenize(
3095 const llama_token * tokens,
3096 int32_t n_tokens,
3097 char * text,
3098 int32_t text_len_max,
3099 bool remove_special,
3100 bool unparse_special) const {
3101 if (type == LLAMA_VOCAB_TYPE_NONE) {
3102 return 0;
3103 }
3104
3105 GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
3106
3107 int32_t avail = text_len_max;
3108 int32_t total = 0;
3109
3110 // remove the leading space
3111 bool remove_space = add_space_prefix;
3112
3113 if (remove_special && add_bos) {
3114 if (n_tokens > 0 && tokens[0] == special_bos_id) {
3115 remove_space = false;
3116 n_tokens--;
3117 tokens++;
3118 }
3119 }
3120
3121 if (remove_special && add_eos) {
3122 if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
3123 n_tokens--;
3124 }
3125 }
3126
3127 for (int32_t i = 0; i < n_tokens; ++i) {
3128 GGML_ASSERT(avail >= 0);
3129 int32_t n_chars = token_to_piece(token: tokens[i], buf: text, length: avail, lstrip: remove_space, special: unparse_special);
3130 remove_space = false;
3131 if (n_chars < 0) {
3132 avail = 0;
3133 total -= n_chars;
3134 } else if (n_chars > 0) {
3135 avail -= n_chars;
3136 text += n_chars;
3137 total += n_chars;
3138 }
3139 }
3140
3141 if (total > text_len_max) {
3142 return -total;
3143 }
3144
3145 if (clean_spaces) {
3146 text -= total; // restart text
3147
3148 // first pass: characters ?!., //TODO: where do these characters come from?
3149 const int32_t total1 = total;
3150 total = total ? 1 : 0;
3151 for (int32_t i = 1; i < total1; ++i) {
3152 const char x = text[i];
3153 if (text[i - 1] == ' ') {
3154 if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
3155 total--; // remove space
3156 }
3157 }
3158 text[total++] = x;
3159 }
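        // e.g. this pass rewrites "Hello , world !" into "Hello, world!"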
3160
3161 // second pass: strip single apostrophe between spaces
3162 const int32_t total2 = total;
3163 total = total ? 1 : 0;
3164 for (int32_t i = 1; i < total2; ++i) {
3165 const char x = text[i];
3166 if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
3167 total--; // remove prev space
3168 text[++i] = '\0'; // remove next space
3169 }
3170 text[total++] = x;
3171 }
3172
3173        // third pass: apostrophe contractions //NOTE: does this heuristic make sense?
3174 const int32_t total3 = total;
3175 total = total ? 1 : 0;
3176 for (int32_t i = 1; i < total3; ++i) {
3177 const char x = text[i];
3178 if (text[i - 1] == ' ') {
3179 if (x == '\'' && i + 1 < total3) {
3180 const char x1 = text[i + 1];
3181 if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
3182 //total--; // remove space
3183 } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
3184 total--; // remove space
3185 } else if (i + 2 < total3) {
3186 const char x2 = text[i + 2];
3187 if ((x1 == 'l' && x2 == 'l')) { // " 'll"
3188 //total--; // remove space
3189 } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
3190 total--; // remove space
3191 } else {
3192 //total--; // remove space
3193 }
3194 } else {
3195 //total--; // remove space
3196 }
3197 }
3198 }
3199 text[total++] = x;
3200 }
3201 }
3202
3203 return total <= text_len_max ? total : -total;
3204}
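
// A negative return value encodes "buffer too small"; its magnitude is the
// number of chars required. The std::string overload of
// llama_vocab::detokenize() further below relies on exactly this retry
// protocol. A minimal standalone sketch (the initial buffer size is an
// arbitrary assumption):
//
//   std::vector<char> buf(16);
//   int32_t n = vocab.detokenize(toks.data(), (int32_t) toks.size(),
//                                buf.data(), (int32_t) buf.size(),
//                                /*remove_special=*/false, /*unparse_special=*/false);
//   if (n < 0) {
//       buf.resize(-n);
//       n = vocab.detokenize(toks.data(), (int32_t) toks.size(),
//                            buf.data(), (int32_t) buf.size(), false, false);
//   }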

void llama_vocab::impl::print_info() const {
    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
    LLAMA_LOG_INFO("%s: n_vocab    = %u\n", __func__, vocab.n_tokens());
    LLAMA_LOG_INFO("%s: n_merges   = %u\n", __func__, (uint32_t) bpe_ranks.size());

    // special tokens
    if (special_bos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token  = %d '%s'\n", __func__, special_bos_id,  id_to_token.at(special_bos_id).text.c_str() ); }
    if (special_eos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token  = %d '%s'\n", __func__, special_eos_id,  id_to_token.at(special_eos_id).text.c_str() ); }
    if (special_eot_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token  = %d '%s'\n", __func__, special_eot_id,  id_to_token.at(special_eot_id).text.c_str() ); }
    if (special_eom_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token  = %d '%s'\n", __func__, special_eom_id,  id_to_token.at(special_eom_id).text.c_str() ); }
    if (special_unk_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token  = %d '%s'\n", __func__, special_unk_id,  id_to_token.at(special_unk_id).text.c_str() ); }
    if (special_sep_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token  = %d '%s'\n", __func__, special_sep_id,  id_to_token.at(special_sep_id).text.c_str() ); }
    if (special_pad_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token  = %d '%s'\n", __func__, special_pad_id,  id_to_token.at(special_pad_id).text.c_str() ); }
    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }

    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token   = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }

    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }

    for (const auto & id : special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}

llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
}

llama_vocab::~llama_vocab() {
}

void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
}

std::string llama_vocab::get_tokenizer_model() const {
    return pimpl->tokenizer_model;
}

std::string llama_vocab::get_tokenizer_pre() const {
    return pimpl->tokenizer_pre;
}

enum llama_vocab_type llama_vocab::get_type() const {
    return pimpl->type;
}

enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
    return pimpl->pre_type;
}

uint32_t llama_vocab::n_tokens() const {
    return (uint32_t) pimpl->id_to_token.size();
}

uint32_t llama_vocab::n_token_types() const {
    return (uint32_t) pimpl->n_token_types;
}

std::string llama_vocab::type_name() const {
    return pimpl->type_name();
}

bool llama_vocab::is_normal(llama_token id) const {
    return pimpl->is_normal(id);
}

bool llama_vocab::is_unknown(llama_token id) const {
    return pimpl->is_unknown(id);
}

bool llama_vocab::is_control(llama_token id) const {
    return pimpl->is_control(id);
}

bool llama_vocab::is_byte(llama_token id) const {
    return pimpl->is_byte(id);
}

bool llama_vocab::is_user_defined(llama_token id) const {
    return pimpl->is_user_defined(id);
}

bool llama_vocab::is_unused(llama_token id) const {
    return pimpl->is_unused(id);
}

bool llama_vocab::is_eog(llama_token id) const {
    return pimpl->is_eog(id);
}

uint8_t llama_vocab::token_to_byte(llama_token id) const {
    return pimpl->token_to_byte(id);
}

llama_token llama_vocab::byte_to_token(uint8_t ch) const {
    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (get_type()) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            auto token = pimpl->token_to_id.find(buf);
            if (token != pimpl->token_to_id.end()) {
                return (*token).second;
            }
            // Try to fall back to just the byte as a string
            const char buf2[2] = { (char)ch, 0 };
            return pimpl->token_to_id.at(buf2);
        }
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
        }
        case LLAMA_VOCAB_TYPE_PLAMO2: {
            // PLaMo-2 uses byte tokens in the format <0xXX>
            char hex_str[8];
            snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
            return pimpl->token_to_id.at(hex_str);
        }
        default:
            GGML_ABORT("fatal error");
    }
}
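
// Usage sketch (which of the two SPM fallbacks fires depends on the loaded
// vocab; `vocab` is assumed to be a fully loaded llama_vocab):
//
//   llama_token t = vocab.byte_to_token(0x0A); // id of "<0x0A>", or of "\n"
//
// Note that the final token_to_id.at() lookups throw std::out_of_range when
// the vocab defines neither form for the byte.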

llama_token llama_vocab::text_to_token(const std::string & text) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    auto it = pimpl->token_to_id.find(text);
    if (it != pimpl->token_to_id.end()) {
        return (*it).second;
    }
    return LLAMA_TOKEN_NULL;
}

const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id);
}

const char * llama_vocab::token_get_text(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).text.c_str();
}

float llama_vocab::token_get_score(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).score;
}

llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
    return pimpl->token_get_attr(id);
}

llama_token llama_vocab::token_bos() const {
    return pimpl->special_bos_id;
}

llama_token llama_vocab::token_eos() const {
    return pimpl->special_eos_id;
}

llama_token llama_vocab::token_eot() const {
    return pimpl->special_eot_id;
}

llama_token llama_vocab::token_eom() const {
    return pimpl->special_eom_id;
}

llama_token llama_vocab::token_unk() const {
    return pimpl->special_unk_id;
}

llama_token llama_vocab::token_sep() const {
    return pimpl->special_sep_id;
}

llama_token llama_vocab::token_nl() const {
    return pimpl->linefeed_id;
}

llama_token llama_vocab::token_pad() const {
    return pimpl->special_pad_id;
}

llama_token llama_vocab::token_prefix() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_middle() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_suffix() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_pre() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_fim_suf() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_mid() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_fim_pad() const {
    return pimpl->special_fim_pad_id;
}

llama_token llama_vocab::token_fim_rep() const {
    return pimpl->special_fim_rep_id;
}

llama_token llama_vocab::token_fim_sep() const {
    return pimpl->special_fim_sep_id;
}

llama_token llama_vocab::token_mask() const {
    return pimpl->special_mask_id;
}

bool llama_vocab::get_add_space_prefix() const {
    return pimpl->add_space_prefix;
}

bool llama_vocab::get_add_bos() const {
    return pimpl->add_bos;
}

bool llama_vocab::get_add_eos() const {
    return pimpl->add_eos;
}

bool llama_vocab::get_add_sep() const {
    return pimpl->add_sep;
}

bool llama_vocab::get_ignore_merges() const {
    return pimpl->ignore_merges;
}

bool llama_vocab::get_clean_spaces() const {
    return pimpl->clean_spaces;
}

bool llama_vocab::get_remove_extra_whitespaces() const {
    return pimpl->remove_extra_whitespaces;
}

bool llama_vocab::get_escape_whitespaces() const {
    return pimpl->escape_whitespaces;
}

bool llama_vocab::get_treat_whitespace_as_suffix() const {
    return pimpl->treat_whitespace_as_suffix;
}

int llama_vocab::max_token_len() const {
    return pimpl->max_token_len;
}

int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
    GGML_ASSERT(token_left.find(' ') == std::string::npos);
    GGML_ASSERT(token_left.find('\n') == std::string::npos);
    GGML_ASSERT(token_right.find(' ') == std::string::npos);
    GGML_ASSERT(token_right.find('\n') == std::string::npos);

    auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
    if (it == pimpl->bpe_ranks.end()) {
        return -1;
    }

    return it->second;
}
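
// A lower rank means the merge was learned earlier and is applied first during
// BPE tokenization; -1 means the pair is not a merge at all. Sketch (the token
// strings are hypothetical, not guaranteed vocab entries):
//
//   if (vocab.find_bpe_rank("th", "e") != -1) {
//       // "th" + "e" can be merged by this vocab
//   }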

std::vector<std::string> llama_vocab::get_bpe_merges() const {
    std::vector<std::string> result(pimpl->bpe_ranks.size());

    for (const auto & pair : pimpl->bpe_ranks) {
        result[pair.second] = pair.first.first + " " + pair.first.second;
    }

    return result;
}
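
// The result is indexed by rank, with each pair flattened to "left right";
// e.g. if ("th", "e") had rank 0 (a hypothetical vocab), then
// get_bpe_merges()[0] == "th e".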

std::vector<char> llama_vocab::get_precompiled_charsmap() const {
    return pimpl->precompiled_charsmap;
}

int32_t llama_vocab::tokenize(
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_tokens_max,
        bool add_special,
        bool parse_special) const {
    auto res = tokenize(std::string(text, text_len), add_special, parse_special);
    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
        return std::numeric_limits<int32_t>::min();
    }

    if (n_tokens_max < (int) res.size()) {
        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
    }

    for (size_t i = 0; i < res.size(); i++) {
        tokens[i] = res[i];
    }

    return res.size();
}
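
// Typical two-call pattern (a sketch; `vocab` and the input text are
// assumptions): a first call with a zero-sized buffer yields the negated token
// count, which sizes the real buffer for the second call.
//
//   std::string text = "Hello world";
//   int32_t n = vocab.tokenize(text.c_str(), (int32_t) text.size(),
//                              nullptr, 0, /*add_special=*/true, /*parse_special=*/false);
//   std::vector<llama_token> toks(n < 0 ? -n : n);
//   n = vocab.tokenize(text.c_str(), (int32_t) text.size(),
//                      toks.data(), (int32_t) toks.size(), true, false);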

std::vector<llama_token> llama_vocab::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    return pimpl->tokenize(raw_text, add_special, parse_special);
}

const std::string & llama_vocab::token_to_piece(llama_token token) const {
    return pimpl->token_to_piece(token);
}

int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    return pimpl->token_to_piece(token, buf, length, lstrip, special);
}

int32_t llama_vocab::detokenize(
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) const {
    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}

std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size()));
    int32_t n_chars = detokenize(tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(), false, special);
    if (n_chars < 0) {
        text.resize(-n_chars);
        n_chars = detokenize(tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(), false, special);
        GGML_ASSERT(n_chars <= (int32_t) text.size()); // whitespace trimming is performed after per-token detokenization
    }

    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
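
// Round-trip sketch using the std::string conveniences above (`vocab` is
// assumed to be a loaded llama_vocab; the input text is arbitrary):
//
//   std::vector<llama_token> toks = vocab.tokenize("Hello world", /*add_special=*/true, /*parse_special=*/false);
//   std::string text = vocab.detokenize(toks, /*special=*/false);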

void llama_vocab::print_info() const {
    pimpl->print_info();
}

//
// interface implementation
//

int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
    return vocab->n_tokens();
}

// deprecated
int32_t llama_n_vocab(const struct llama_vocab * vocab) {
    return llama_vocab_n_tokens(vocab);
}

enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
    return vocab->get_type();
}

const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_text(token);
}

float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_score(token);
}

enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_attr(token);
}

bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_eog(token);
}

bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_control(token);
}

llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
    return vocab->token_eos();
}

llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
    return vocab->token_eot();
}

// deprecated
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
    return vocab->token_sep();
}

llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
    return vocab->token_nl();
}

llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
    return vocab->token_pad();
}

bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
    return vocab->get_add_bos();
}

bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
    return vocab->get_add_eos();
}

bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
    return vocab->get_add_sep();
}

llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
    return vocab->token_fim_pre();
}

llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
    return vocab->token_fim_suf();
}

llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
    return vocab->token_fim_mid();
}

llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
    return vocab->token_fim_pad();
}

llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
    return vocab->token_fim_rep();
}

llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
    return vocab->token_fim_sep();
}

llama_token llama_vocab_mask(const struct llama_vocab * vocab) {
    return vocab->token_mask();
}
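
// All special-token getters above return LLAMA_TOKEN_NULL when the loaded
// vocab does not define the token, so callers should check before use (sketch):
//
//   llama_token bos = llama_vocab_bos(vocab);
//   if (bos != LLAMA_TOKEN_NULL) {
//       // safe to prepend bos
//   }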

// deprecated
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_text(vocab, token);
}

// deprecated
float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_score(vocab, token);
}

// deprecated
enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_attr(vocab, token);
}

// deprecated
bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_eog(vocab, token);
}

// deprecated
bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_control(vocab, token);
}

// deprecated
llama_token llama_token_bos(const struct llama_vocab * vocab) {
    return llama_vocab_bos(vocab);
}

// deprecated
llama_token llama_token_eos(const struct llama_vocab * vocab) {
    return llama_vocab_eos(vocab);
}

// deprecated
llama_token llama_token_eot(const struct llama_vocab * vocab) {
    return llama_vocab_eot(vocab);
}

// deprecated
llama_token llama_token_cls(const struct llama_vocab * vocab) {
    //return llama_vocab_cls(vocab);
    return llama_vocab_bos(vocab); // avoid deprecation warning
}

// deprecated
llama_token llama_token_sep(const struct llama_vocab * vocab) {
    return llama_vocab_sep(vocab);
}

// deprecated
llama_token llama_token_nl (const struct llama_vocab * vocab) {
    return llama_vocab_nl(vocab);
}

// deprecated
llama_token llama_token_pad(const struct llama_vocab * vocab) {
    return llama_vocab_pad(vocab);
}

// deprecated
bool llama_add_bos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_bos(vocab);
}

// deprecated
bool llama_add_eos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_eos(vocab);
}

// deprecated
llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pre(vocab);
}

// deprecated
llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
    return llama_vocab_fim_suf(vocab);
}

// deprecated
llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
    return llama_vocab_fim_mid(vocab);
}

// deprecated
llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pad(vocab);
}

// deprecated
llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_rep(vocab);
}

// deprecated
llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_sep(vocab);
}

//
// tokenization
//

int32_t llama_tokenize(
        const struct llama_vocab * vocab,
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_tokens_max,
        bool add_special,
        bool parse_special) {
    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}

int32_t llama_token_to_piece(
        const struct llama_vocab * vocab,
        llama_token token,
        char * buf,
        int32_t length,
        int32_t lstrip,
        bool special) {
    return vocab->token_to_piece(token, buf, length, lstrip, special);
}

int32_t llama_detokenize(
        const struct llama_vocab * vocab,
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) {
    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
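
// End-to-end sketch of the C tokenization API (buffer sizes and the input text
// are arbitrary assumptions; `vocab` comes from a loaded model):
//
//   const char * text = "Hello world";
//   llama_token toks[64];
//   int32_t n_tok = llama_tokenize(vocab, text, (int32_t) strlen(text),
//                                  toks, 64, /*add_special=*/true, /*parse_special=*/false);
//   char out[256];
//   int32_t n_out = llama_detokenize(vocab, toks, n_tok, out, (int32_t) sizeof(out),
//                                    /*remove_special=*/true, /*unparse_special=*/false);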