#include "llama-vocab.h"

#include "ggml.h"
#include "gguf.h"
#include "llama-impl.h"
#include "llama-model-loader.h"

#include "unicode.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cmath>
#include <cstdarg>
#include <cstring>
#include <forward_list>
#include <limits>
#include <map>
#include <queue>
#include <set>
#include <unordered_map>

//
// helpers
//

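// A simple uncompressed trie used by the UGM and RWKV tokenizers to match the
// longest vocabulary entry for a given input prefix. Each node stores an
// optional token id and its children keyed by the next byte.
//
// Usage sketch (illustrative only):
//
//   naive_trie t;
//   t.insert("abc", 3, /*value =*/ 42);
//   const naive_trie * node = t.traverse('a'); // child node for 'a', or NULL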
struct naive_trie {
    naive_trie() : has_value(false), value(0) {
    }
    void insert(const char * key, size_t len, int32_t value = 0) {
        if (len == 0) {
            this->has_value = true;
            this->value = value;
            return;
        }
        char c = key[0];
        auto res = children.find(c);
        if (res != children.end()) {
            res->second.insert(key + 1, len - 1, value);
        } else {
            auto res = children.insert(std::make_pair(c, naive_trie()));
            res.first->second.insert(key + 1, len - 1, value);
        }
    }
    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
        char c = key[offset];
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
        }

        return std::make_pair(key, offset);
    }
    const struct naive_trie * traverse(const char c) const {
        auto res = children.find(c);
        if (res != children.end()) {
            return &res->second;
        }

        return NULL;
    }
    std::map<char, struct naive_trie> children;
    bool has_value;
    llama_token value;
};

//
// tokenizers
//

struct llm_tokenizer {
    llm_tokenizer() {}
    virtual ~llm_tokenizer() = default;
};

struct llm_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");

//
// SPM tokenizer
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
//

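// SentencePiece-style tokenization: split the input into UTF-8 characters,
// then repeatedly merge the adjacent pair whose merged text is the
// highest-scoring vocabulary entry, until no more merges apply.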
struct llm_bigram_spm {
    struct comparator {
        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
        }
    };
    using queue_storage = std::vector<llm_bigram_spm>;
    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    float score;
    size_t size;
};

struct llm_tokenizer_spm : llm_tokenizer {
    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};

struct llm_tokenizer_spm_session {
    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = unicode_len_utf8(text[offs]);
            sym.text = text.c_str() + offs;
            sym.n = std::min(len, text.size() - offs);
            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
        for (int i = 1; i < (int) symbols.size(); ++i) {
            try_add_bigram(i - 1, i);
        }

        // keep substituting the highest-scoring pairs for as long as we can.
        while (!work_queue.empty()) {
            auto bigram = work_queue.top();
            work_queue.pop();

            auto & left_sym = symbols[bigram.left];
            auto & right_sym = symbols[bigram.right];

            // if one of the symbols already got merged, skip it.
            if (left_sym.n == 0 || right_sym.n == 0 ||
                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
            left_sym.n += right_sym.n;
            right_sym.n = 0;

            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
            if (right_sym.next >= 0) {
                symbols[right_sym.next].prev = bigram.left;
            }

            // find more substitutions
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
        }

        for (int i = 0; i != -1; i = symbols[i].next) {
            auto & symbol = symbols[i];
            resegment(symbol, output);
        }
    }

private:
    void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
        auto text = std::string(symbol.text, symbol.n);
        auto token = vocab.text_to_token(text);

        // Do we need to support is_unused?
        if (token != LLAMA_TOKEN_NULL) {
            output.push_back(token);
            return;
        }

        const auto p = rev_merge.find(text);

        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int) symbol.n; ++j) {
                llama_token id = vocab.byte_to_token(symbol.text[j]);
                output.push_back(id);
            }
            return;
        }

        resegment(symbols[p->second.first],  output);
        resegment(symbols[p->second.second], output);
    }

    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
        auto token = vocab.text_to_token(text);

        if (token == LLAMA_TOKEN_NULL) {
            return;
        }

        if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
            return;
        }

        const auto & tok_data = vocab.get_token_data(token);

        llm_bigram_spm bigram;
        bigram.left  = left;
        bigram.right = right;
        bigram.score = tok_data.score;
        bigram.size  = text.size();

        work_queue.push(bigram);

        // Do we need to support is_unused?
        rev_merge[text] = std::make_pair(left, right);
    }

    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_spm * spm_tokenizer;

    std::vector<llm_symbol> symbols;
    llm_bigram_spm::queue work_queue;
    std::map<std::string, std::pair<int, int>> rev_merge;
};

//
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
//

// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused

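// std::priority_queue with a move-enabled pop: pop_move() moves the top
// element out before re-heapifying, avoiding a copy of the std::string-bearing
// bigrams; the copying pop() is deleted so it cannot be used by accident.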
template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    T pop_move() {
        T item = std::move(this->c.front());
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
        this->c.pop_back();
        return item;
    }

    void pop() = delete;
};

struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
        }
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    std::string text;
    int rank;
    size_t size;
};

struct llm_tokenizer_bpe : llm_tokenizer {
    llm_tokenizer_bpe(const llama_vocab & vocab) {
        GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
        switch (vocab.get_pre_type()) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                regex_exprs = {
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DBRX:
            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                regex_exprs = {
                    // same as llama3
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
                    "\\s+$",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥぀-ゟ゠-ヿ]+",
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?\\p{L}+",
                    "\\s?\\p{P}+",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_FALCON:
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "[0-9][0-9][0-9]",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT2:
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
                regex_exprs = {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_VIKING:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
                // original regex from tokenizer.json
                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                regex_exprs = {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                // Note: in theory, the special token (sentinel and image token) regex_exprs below
                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
                // However, since the upstream pre-tokenizer uses them, they are also
                // included here (see https://huggingface.co/facebook/chameleon-7b).
                regex_exprs = {
                    "<sentinel:[0-9]+>",  // Sentinel tokens
                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens
                    "([\\t\\n]|    |  )",  // directly from tokenizer.json
                    "\\p{N}",  // Individual digits
                    "[\\p{P}!-/:-@\\[-`{-~]",  // Punctuation, Isolated
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
            case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
                regex_exprs = {
                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
                    // The custom handler implements all K2 patterns with proper Han character exclusion
                    "\\p{Han}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                regex_exprs = {
                    "\\p{N}+",
                    "(?=(\\d{3})+(?!\\d))",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]",
                };
                break;
        }
    }

    std::vector<std::string> regex_exprs;
};

struct llm_tokenizer_bpe_session {
    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    static void append(const llama_token token_id, std::vector<llama_token> & output) {
        output.push_back(token_id);
    }

    bool append_bos(std::vector<llama_token> & output) const {
        if (vocab.get_add_bos()) {
            GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_bos());
            return true;
        }
        return false;
    }

    bool append_eos(std::vector<llama_token> & output) const {
        if (vocab.get_add_eos()) {
            GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_eos());
            return true;
        }
        return false;
    }

    void check_double_bos_eos(const std::vector<llama_token> & output) const {
        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model, but the prompt "
                "also starts with a BOS token, so the final prompt now starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end() - 2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added an EOS token to the prompt as specified by the model, but the prompt "
                "also ends with an EOS token, so the final prompt now ends with 2 EOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
    }

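    // Tokenize one chunk of text: pre-split it with the model-specific regexes,
    // run rank-based BPE merges inside each resulting word, then map the merged
    // symbols to token ids, falling back to single-byte tokens when needed.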
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

        symbols_final.clear();

        for (const auto & word : word_collection) {
            work_queue = llm_bigram_bpe::queue();
            symbols.clear();

            int index = 0;
            size_t offset = 0;

            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
            }

            while (offset < word.size()) {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
                sym.next = offset == word.size() ? -1 : index + 1;
                index++;
                symbols.emplace_back(sym);
            }
            for (int i = 1; i < (int) symbols.size(); ++i) {
                add_new_bigram(i - 1, i);
            }

            // build token(s)
            while (!work_queue.empty()) {
                auto bigram = work_queue.pop_move();

                auto & left_symbol = symbols[bigram.left];
                auto & right_symbol = symbols[bigram.right];

                if (left_symbol.n == 0 || right_symbol.n == 0) {
                    continue;
                }
                std::string left_token = std::string(left_symbol.text, left_symbol.n);
                std::string right_token = std::string(right_symbol.text, right_symbol.n);
                if (left_token + right_token != bigram.text) {
                    continue;  // Skip this bigram if it's outdated
                }

                // merge the right sym into the left one
                left_symbol.n += right_symbol.n;
                right_symbol.n = 0;

                // remove the right sym from the chain
                left_symbol.next = right_symbol.next;
                if (right_symbol.next >= 0) {
                    symbols[right_symbol.next].prev = bigram.left;
                }

                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
            }

            // add the finished tokens to the final list keeping correct order for next and prev
            for (auto & sym : symbols) {
                if (sym.n > 0) {
                    sym.prev = final_prev_index;
                    sym.next = -1;
                    if (final_prev_index != -1) {
                        symbols_final[final_prev_index].next = symbols_final.size();
                    }
                    symbols_final.emplace_back(sym);
                    final_prev_index = symbols_final.size() - 1;
                }
            }
        }

        symbols = symbols_final;

        if (!symbols.empty()) {
            for (int i = 0; i != -1; i = symbols[i].next) {
                auto & symbol = symbols[i];
                if (symbol.n == 0) {
                    continue;
                }

                const std::string str = std::string(symbol.text, symbol.n);
                const auto token = vocab.text_to_token(str);

                if (token == LLAMA_TOKEN_NULL) {
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.text_to_token(byte_str);
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
                    }
                } else {
                    output.push_back(token);
                }
            }
        }
    }

private:
    void add_new_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        std::string left_token = std::string(symbols[left].text, symbols[left].n);
        std::string right_token = std::string(symbols[right].text, symbols[right].n);

        int rank_found = -1;

        rank_found = vocab.find_bpe_rank(left_token, right_token);

        if (rank_found < 0) {
            return;
        }

        llm_bigram_bpe bigram;

        bigram.left = left;
        bigram.right = right;
        bigram.text = left_token + right_token;
        bigram.size = left_token.size() + right_token.size();
        bigram.rank = rank_found;

        work_queue.push(bigram);
    }

    const llama_vocab & vocab;
    const llm_tokenizer_bpe & tokenizer;

    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;
    llm_bigram_bpe::queue work_queue;
};

//
// WPM tokenizer
//

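// WordPiece tokenization: NFD-normalize and lowercase the input, split it into
// words on whitespace, punctuation and CJK characters, then greedily match the
// longest vocabulary entries within each word; a word with no match becomes UNK.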
struct llm_tokenizer_wpm : llm_tokenizer {
    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};

struct llm_tokenizer_wpm_session {
    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already

        // find the longest tokens that form the words
        for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
            }

            // prepend phantom space
            const std::string word1 = "\xe2\x96\x81" + word;
            const int n = word1.size();

            const size_t current_tokens = output.size();

            // we're at the start of a new word
            // move through character position in word
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                bool match = false;
                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
                    auto id = vocab.text_to_token(word1.substr(i, j - i));
                    if (id != LLAMA_TOKEN_NULL) {
                        output.push_back(id);
                        match = true;
                        i = j - 1;
                        break;
                    }
                }

                if (!match) { // discard all
                    output.resize(current_tokens);
                    break;    // and discard next tokens
                }
            }

            // we didn't find any matches for this word
            if (current_tokens == output.size()) {
                output.push_back(vocab.token_unk());
            }
        }
    }

    // TODO: reduce string copies by using cpts_offs array
    static std::vector<std::string> preprocess(const std::string & text) {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags_from_cpt(cpt);

            if (flags.is_whitespace) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                continue;
            }

            assert(!flags.is_separator);
            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
                continue;
            }

            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
            if (flags.is_punctuation || (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt)) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                words.back() = s;      // single char word
                words.emplace_back();  // start a new word
            } else {
                words.back() += s;  // append char to word
            }
        }

        if (!words.back().size()) {
            words.pop_back();
        }

        return words;
    }

    static bool is_chinese_char(uint32_t cpt) {
        return
            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
            //(cpt >= 0x3000 && cpt <= 0x303F) ||
            //(cpt >= 0xFF00 && cpt <= 0xFFEF);
    }

private:
    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_wpm * wpm_tokenizer;
};

//
// UGM tokenizer
//

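// Unigram (UGM) tokenization, as in SentencePiece: the input is first
// normalized using the model's precompiled character map, then a Viterbi
// search over the token trie picks the segmentation with the highest total
// score (see the detailed comment on tokenize() below).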
struct llm_tokenizer_ugm : llm_tokenizer {
    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
        if (precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contains length of binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
        }

        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & token_data = vocab.get_token_data(id);

            if (vocab.is_normal(id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (vocab.is_normal(id) ||
                vocab.is_user_defined(id) ||
                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        unknown_token_score = min_score - unknown_token_score_penalty;
    }

    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    struct naive_trie user_defined_token_matcher;

    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    struct naive_trie token_matcher;
};

struct llm_tokenizer_ugm_session {
    llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
     * unigram language models. The general idea is to:
     * - move along the input sequence in steps of one UTF code point,
     * - at each step find all possible tokenizations of the prefix by
     *   traversing the tokens trie,
     * - for each tokenization store the best one so far (by higher score)
     * - use the position in sequence after given token as an index to store
     *   results
     * - if there was no valid tokenization of the current UTF code point
     *   then use unknown token with additional score penalty
     * After processing the whole sequence we backtrack from the end to get
     * the best tokenization.
    */
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // get current size of output (for reversal later)
        size_t output_size = output.size();

        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
        size_t input_len = normalized.size();
        if (input_len == 0) {
            return;
        }

        // initialize score_sum to -DBL_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { vocab.token_unk(), 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
            // calculate how many code units are in the currently processed UTF code point
            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);

            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);

            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
                if (node->has_value) {
                    // check if it corresponds to the whole UTF code point
                    if (prefix_offset - input_offset == n_utf8_code_units) {
                        single_codepoint_token_found = true;
                    }
                    llama_token token_id = node->value;
                    const auto & token_data = vocab.get_token_data(token_id);

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                        current_champ = challenger;
                    }
                }
                node = node->traverse(normalized[prefix_offset++]);
            }

            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
                const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                    current_champ = challenger;
                }
            }

            // move to the next UTF code point
            input_offset += n_utf8_code_units;
        }

        // now backtrack from the end to gather token ids of the best tokenization
        // merge sequences of consecutive unknown tokens into single unknown tokens
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
            bool is_unknown = tokenization.token_id == vocab.token_unk();
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
            if (tokenization.input_offset == 0) {
                break;
            }
            is_prev_unknown = is_unknown;
        }

        // reverse the output since we added tokens starting from the end of the input
        std::reverse(output.begin() + output_size, output.end());
    }

private:

    // helper structure for returning normalization results
    struct normalization_result {
        const char * normalized;
        size_t normalized_len;
        size_t consumed_input;
    };

    void normalize(const std::string & input, std::string * normalized) {
        normalized->clear();
        normalized->reserve(input.size() * 3);

        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";

        const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();

        bool is_space_prepended = false;
        bool processing_non_ws = false;

        size_t input_len = input.size();

        for (size_t input_offset = 0; input_offset < input_len; ) {
            auto norm_res = normalize_prefix(input, input_offset);
            for (size_t i = 0; i < norm_res.normalized_len; i++) {
                char c = norm_res.normalized[i];
                if (c != ' ') {
                    if (!processing_non_ws) {
                        processing_non_ws = true;
                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
                    }
                }
            }

            input_offset += norm_res.consumed_input;
        }

        if (shall_append_space) {
            normalized->append(space);
        }
    }

    /*
     * This structure is a view wrapper for XOR-compressed double array (XCDA)
     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
     * Each bit-packed entry contains:
     * - BASE array value in bits 10-30
     * - LCHECK array value in bits 0-7
     * - LEAF array value in bit 9
     * Entries containing indexes of replacement sequences have set bit 31
     */
    struct xcda_array_view {
    public:
        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
        }
        uint32_t get_base(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
        }
        uint32_t get_lcheck(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) | 0xff);
        }
        bool get_leaf(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 8) & 1;
        }
        uint32_t get_value(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) - 1);
        }
    private:
        uint32_t get_node(size_t index) {
            if (index >= xcda_array_size) {
                throw std::runtime_error("Index out of array bounds in XCDA array!");
            }
            return xcda_array[index];
        }
        const uint32_t * xcda_array;
        size_t xcda_array_size;
    };

    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
        llama_token token_id;
        size_t input_offset;
        double score_sum;
    };

    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
        if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }

        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match =
            tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }

        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

        if (tokenizer.xcda_array_size > 0) {
            struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
            // We find the index of the next node by calculating BASE[s] ^ c where s is
            // the index of the previous node and c is a numerical character value
            uint32_t node_index = 0;
            // get BASE of the root node
            node_index = xcda_view.get_base(node_index);
            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
                unsigned char c = input[prefix_offset];
                if (c == 0) {
                    break;
                }
                node_index ^= c;
                // if value of LCHECK is not c it means that this is not a child of
                // the previous node, so we stop matching
                if (xcda_view.get_lcheck(node_index) != c) {
                    break;
                }
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing index of replacement sequence for currently matched input prefix
                if (is_leaf) {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        }

        // check if the input prefix contains a valid sequence of UTF-8 code units
        try {
            // if yes, return this sequence unmodified
            size_t prefix_offset = input_offset;
            unicode_cpt_from_utf8(input, prefix_offset);
            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
        } catch (std::invalid_argument & /*ex*/) {
            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
            return { "\xEF\xBF\xBD", 3, 1 };
        }
    }

    const llama_vocab & vocab;
    const llm_tokenizer_ugm & tokenizer;
};

//
// RWKV tokenizer
//

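// The RWKV vocab stores tokens as escaped byte strings. The unescape helper
// below decodes \t, \n, \r, \xHH and backslash escapes back into raw bytes so
// they can be inserted into the matching trie.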
static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
    std::vector<uint8_t> output;
    output.reserve(escaped.size());

    // Parser state
    bool escaping = false;
    uint8_t hex_remaining = 0;
    uint8_t hex_acc = 0;

    // Step through characters, performing parsing
    for (const char & c : escaped) {
        // If we're parsing a hex code, interpret the next character
        if (hex_remaining != 0) {
            uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
            hex_acc = (hex_acc << 4) + value;

            hex_remaining -= 1;
            if (hex_remaining == 0) {
                output.push_back(hex_acc);
                hex_acc = 0;
            }

            continue;
        }

        // If we got an escape character, interpret it
        if (escaping) {
            if (c == 't') {
                output.push_back('\t');
            } else if (c == 'n') {
                output.push_back('\n');
            } else if (c == 'r') {
                output.push_back('\r');
            } else if (c == 'x') {
                hex_remaining = 2;
            } else {
                output.push_back(c);
            }

            escaping = false;
            continue;
        }

        if (c == '\\') {
            escaping = true;
            continue;
        }

        output.push_back(c);
    }

    return output;
}

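// For example, the escaped token "\n\x41" unescapes to the two bytes
// { 0x0A, 0x41 }.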
struct llm_tokenizer_rwkv : llm_tokenizer {
    llm_tokenizer_rwkv(const llama_vocab & vocab) {
        // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
        // For now, we decode the vocab here into the lookup we'll use for tokenization.

        // build trie
        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & data = vocab.get_token_data(id);
            const auto text = llama_unescape_rwkv_token(data.text);
            token_matcher.insert((const char *) text.data(), text.size(), id);
        }
    }

    struct naive_trie token_matcher;
};

struct llm_tokenizer_rwkv_session {
    llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        uint32_t position = 0;
        while (position < text.size()) {
            const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
            if (node == NULL) {
                // no matching token found, add unknown token
                output.push_back(vocab.token_unk());
                position += 1;
                continue;
            }

            // traverse the trie to find the longest matching token
            uint32_t token_id = 0;
            uint32_t token_length = 0;
            while (node != NULL) {
                if (node->has_value) {
                    token_id = node->value;
                    token_length = position + 1;
                }
                node = node->traverse(text[++position]);
            }

            // add the longest matching token
            output.push_back(token_id);
            position = token_length;
        }
    }

private:
    const llama_vocab & vocab;
    const llm_tokenizer_rwkv & tokenizer;
};

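// PLaMo-2 tokenizer: builds a flattened trie-like table over all token
// suffixes, then encodes with dynamic programming from the end of the input,
// picking at each position the piece that minimizes the accumulated cost
// (negated score); unknown code points fall back to byte tokens (<0xXX>).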
struct llm_tokenizer_plamo2 : llm_tokenizer {
    llm_tokenizer_plamo2(const llama_vocab & vocab) {
        build(vocab);
    }

    void build(const llama_vocab & vocab) {
        // Reset internal structures
        tokens_.clear();
        bytes_.assign(256, 0);
        to_suffix_id_.clear();
        table_.clear();

        // Build token list and byte mapping
        std::unordered_map<std::string, float> suffix_to_score;
        std::unordered_map<std::string, llama_token> token_to_id;

        for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
            const auto & entry = vocab.get_token_data(token_id);
            tokens_.push_back(entry.text);
            token_to_id[entry.text] = static_cast<llama_token>(token_id);

            // Handle byte tokens
            if (vocab.is_byte(token_id)) {
                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
                    std::string hex_str = entry.text.substr(3, 2);
                    int byte_val = std::stoi(hex_str, nullptr, 16);
                    bytes_[byte_val] = static_cast<llama_token>(token_id);
                }
                continue;
            }

            // Add token and all its suffixes to suffix_to_score
            suffix_to_score[entry.text] = entry.score;

            // Extract suffixes character by character (UTF-8 aware)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
            for (size_t i = 1; i < cpts.size(); ++i) {
                std::string suffix;
                for (size_t j = i; j < cpts.size(); ++j) {
                    suffix += unicode_cpt_to_utf8(cpts[j]);
                }
                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
                    suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
                }
            }
        }

        // Check that all byte tokens are set
        for (int i = 0; i < 256; ++i) {
            if (bytes_[i] == 0) {
                throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
            }
        }

        // Build suffix list in lexicographical order of reversed strings
        std::vector<std::string> suffixes;
        for (const auto & pair : suffix_to_score) {
            suffixes.push_back(pair.first);
        }
        suffixes.push_back("");  // Empty suffix

        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
            std::string rev_a(a.rbegin(), a.rend());
            std::string rev_b(b.rbegin(), b.rend());
            return rev_a < rev_b;
        });

        // Build suffix_to_id and to_suffix_id_
        std::unordered_map<std::string, int32_t> suffix_to_id;
        int32_t num_pieces = 0;

        for (const auto & suffix : suffixes) {
            suffix_to_id[suffix] = num_pieces;
            if (!suffix.empty()) {
                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);

                std::string remaining;
                for (size_t i = 1; i < cpts.size(); ++i) {
                    remaining += unicode_cpt_to_utf8(cpts[i]);
                }

                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
                to_suffix_id_[piece_code] = num_pieces;

                // Count number of pieces for this suffix
                int32_t pieces_for_suffix = 1;  // sentinel row
                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                    std::string piece;
                    for (int32_t i = 0; i < piece_length; ++i) {
                        piece += unicode_cpt_to_utf8(cpts[i]);
                    }
                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
                        pieces_for_suffix++;
                    }
                }
                num_pieces += pieces_for_suffix;
            } else {
                num_pieces++;  // Empty suffix contributes one piece (sentinel row)
            }
        }

        // Build flattened table
        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
        int32_t table_idx = 0;

        for (const auto & suffix : suffixes) {
            // Add all prefixes of the suffix to the table (in decreasing order of length)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                std::string piece;
                for (int32_t i = 0; i < piece_length; ++i) {
                    piece += unicode_cpt_to_utf8(cpts[i]);
                }

                auto score_it = suffix_to_score.find(piece);
                if (score_it == suffix_to_score.end()) {
                    continue;
                }

                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
                auto token_it = token_to_id.find(piece);
                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;

                float score = score_it->second;
                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];

                table_idx++;
            }

            // Add sentinel row
            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
            table_[table_idx][TABLE_TOKEN_ID] = -1;
            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
            table_idx++;
        }
    }

    std::vector<llama_token> encode(const std::string & text) const {
        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
        // Skip the first code point if it is a BOM (Byte Order Mark)
        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
            unicode_data.erase(unicode_data.begin());
        }

        if (unicode_data.empty()) {
            return {};
        }

        const size_t data_len = unicode_data.size();

        // Initialize scores array (dynamic programming)
        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
        scores[data_len] = 0;

        // Path array to track best tokenization
        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));

        int32_t suffix_id = 0;

        // Process from end to beginning
        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
            uint32_t c = unicode_data[i];

            // Find next suffix ID
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
                auto it = to_suffix_id_.find(piece_code);
                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;

                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
                    break;
                }
            }

            // Update best path
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int32_t score = table_[p][TABLE_SCORE];
                if (score > INVALID_SCORE) {
                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
                    int64_t s = scores[i + piece_length] - score;

                    if (s < scores[i]) {
                        scores[i] = s;
                        path[i][PATH_TOKEN_LENGTH] = piece_length;
                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;

                        if (score == UNKNOWN_SCORE) {
                            // Add UTF-8 byte count
                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
                        }
                    }
                }

                if (score == UNKNOWN_SCORE) {
                    break;
                }
            }
        }

        // Decode the best path
        std::vector<llama_token> token_ids;
        token_ids.reserve(path[0][PATH_NUM_TOKENS]);

        int pos = 0;
        while (pos < static_cast<int>(data_len)) {
            if (path[pos][PATH_TOKEN_ID] >= 0) {
                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
            } else {
                // Fall back to byte tokens
                uint32_t c = unicode_data[pos];
                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);

                for (int i = 0; i < s; ++i) {
                    uint8_t b;
                    if (s == 1) {
                        b = c;
                    } else {
                        if (i == 0) {
                            b = (0xF00 >> s) & 0xFF;
                        } else {
                            b = 0x80;
                        }
                    }
                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
                }
            }

            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
            pos += path[pos][PATH_TOKEN_LENGTH];
        }

        return token_ids;
    }
private:
    // Constants for table structure
    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
    static constexpr int32_t TABLE_TOKEN_ID = 1;
    static constexpr int32_t TABLE_SCORE = 2;
    static constexpr int32_t TABLE_PIECE_ID = 3;

    // Constants for path array
    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
    static constexpr int32_t PATH_TOKEN_ID = 1;
    static constexpr int32_t PATH_NUM_TOKENS = 2;

    // Score constants
    static constexpr int32_t INVALID_SCORE = -20000000;
    static constexpr int32_t UNKNOWN_SCORE = -10000000;

    // List of tokens in the vocabulary
    std::vector<std::string> tokens_;

    // Mapping from byte code point to token ID (for byte fallback)
    std::vector<llama_token> bytes_;

    // Mapping from piece code to suffix ID
    std::unordered_map<int64_t, int32_t> to_suffix_id_;

    // Flattened table representing the Trie structure
    // Each row contains: [piece_length, token_id, score, piece_id]
    std::vector<std::vector<int32_t>> table_;
};

struct llm_tokenizer_plamo2_session {
    llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        std::vector<llama_token> tokens = tokenizer.encode(text);
        output.insert(output.end(), tokens.begin(), tokens.end());
    }

private:
    const llm_tokenizer_plamo2 & tokenizer;
};

//
// impl
//

typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;

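// A tokenizer input fragment: either an already-resolved special token id, or
// a view (offset/length) into a span of the raw input text that still needs
// to be tokenized.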
1504struct fragment_buffer_variant {
1505 fragment_buffer_variant(llama_token _token)
1506 :
1507 type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
1508 token(_token),
1509 raw_text(_dummy),
1510 offset(0),
1511 length(0) {}
1512
1513 fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
1514 :
1515 type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
1516 token((llama_token) - 1),
1517 raw_text(_raw_text),
1518 offset(_offset),
1519 length(_length){
1520 GGML_ASSERT(_offset >= 0);
1521 GGML_ASSERT(_length >= 1);
1522 GGML_ASSERT(offset + length <= raw_text.length());
1523 }
1524
1525 const FRAGMENT_BUFFER_VARIANT_TYPE type;
1526 const llama_token token;
1527 const std::string _dummy;
1528 const std::string & raw_text;
1529 const uint64_t offset;
1530 const uint64_t length;
1531};
1532
1533struct llama_vocab::impl {
1534 uint32_t n_token_types = 0; // for BERT-style token types
1535
1536 std::string tokenizer_model;
1537 std::string tokenizer_pre;
1538
1539 enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
1540 enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1541
1542 int max_token_len = 0; // used for optimizing longest token search
1543
1544 // default LLaMA special tokens
1545 // TODO: should we set all of these to LLAMA_TOKEN_NULL?
1546 llama_token special_bos_id = 1;
1547 llama_token special_eos_id = 2;
1548 llama_token special_eot_id = LLAMA_TOKEN_NULL;
1549 llama_token special_eom_id = LLAMA_TOKEN_NULL;
1550 llama_token special_unk_id = 0;
1551 llama_token special_sep_id = LLAMA_TOKEN_NULL;
1552 llama_token special_pad_id = LLAMA_TOKEN_NULL;
1553 llama_token special_mask_id = LLAMA_TOKEN_NULL;
1554
1555 llama_token linefeed_id = 13;
1556
1557 // fim tokens
1558 llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
1559 llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
1560 llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
1561 llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
1562 llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
1563 llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
1564
1565 // tokenizer flags
1566 bool add_space_prefix = false;
1567 bool add_bos = false;
1568 bool add_eos = false;
1569 bool add_sep = false;
1570 bool ignore_merges = false;
1571 bool clean_spaces = false; // clean_up_tokenization_spaces
1572 bool remove_extra_whitespaces = false;
1573 bool escape_whitespaces = true;
1574 bool treat_whitespace_as_suffix = false;
1575
1576 std::unordered_map<std::string, llama_token> token_to_id;
1577 std::vector<token_data> id_to_token;
1578
1579 std::vector<llama_token> cache_special_tokens;
1580 std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
1581 struct pair_hash {
1582 size_t operator()(const std::pair<std::string, std::string> & p) const {
1583 return std::hash<std::string>{}(p.first) ^ //create some hash for pair
1584 (std::hash<std::string>{}(p.second) << 1);
1585 }
1586 };
1587 std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
1588
1589 // set of all tokens that cause "end of generation"
1590 std::set<llama_token> special_eog_ids;
1591
1592 std::unique_ptr<llm_tokenizer> tokenizer;
1593
1594 std::vector<char> precompiled_charsmap;
1595
1596 impl(const llama_vocab & vocab) : vocab(vocab) {
1597 }
1598
1599 ~impl() = default;
1600
1601 void load(llama_model_loader & ml, const LLM_KV & kv);
1602
1603 enum llama_vocab_type get_type() const;
1604
1605 std::string type_name() const;
1606
1607 bool is_normal (llama_token id) const;
1608 bool is_unknown (llama_token id) const;
1609 bool is_control (llama_token id) const;
1610 bool is_byte (llama_token id) const;
1611 bool is_user_defined(llama_token id) const;
1612 bool is_unused (llama_token id) const;
1613 bool is_eog (llama_token id) const;
1614
1615 uint8_t token_to_byte(llama_token id) const;
1616
1617 llama_token_attr token_get_attr(llama_token id) const;
1618
1619 void init_tokenizer(enum llama_vocab_type type);
1620
1621 void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
1622
1623 std::string token_to_piece_for_cache(
1624 llama_token token,
1625 bool special) const;
1626
1627
1628 std::vector<llama_token> tokenize(
1629 const std::string & raw_text,
1630 bool add_special,
1631 bool parse_special = false) const;
1632
1633 int32_t tokenize(
1634 const char * text,
1635 int32_t text_len,
1636 llama_token * tokens,
1637 int32_t n_tokens_max,
1638 bool add_special,
1639 bool parse_special) const;
1640
1641 // does not write null-terminator to buf
1642 int32_t token_to_piece(
1643 llama_token token,
1644 char * buf,
1645 int32_t length,
1646 int32_t lstrip,
1647 bool special) const;
1648
1649 // use cached data
1650 const std::string & token_to_piece(llama_token token) const;
1651
1652 int32_t detokenize(
1653 const llama_token * tokens,
1654 int32_t n_tokens,
1655 char * text,
1656 int32_t text_len_max,
1657 bool remove_special,
1658 bool unparse_special) const;
1659
1660 std::string detokenize(
1661 const std::vector<llama_token> & tokens,
1662 bool special) const;
1663
1664 void print_info() const;
1665
1666private:
1667 const llama_vocab & vocab;
1668};
1669
1670void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1671 struct gguf_context * ctx = ml.meta.get();
1672
1673 // determine vocab type
1674 {
1675 ml.get_key(kid: LLM_KV_TOKENIZER_MODEL, result&: tokenizer_model);
1676 ml.get_key(kid: LLM_KV_TOKENIZER_PRE, result&: tokenizer_pre, required: false);
1677
1678 ml.get_key(kid: LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, result&: n_token_types, required: false);
1679
1680 if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
1681 type = LLAMA_VOCAB_TYPE_NONE;
1682
1683 // default special tokens
1684 special_bos_id = LLAMA_TOKEN_NULL;
1685 special_eos_id = LLAMA_TOKEN_NULL;
1686 special_unk_id = LLAMA_TOKEN_NULL;
1687 special_sep_id = LLAMA_TOKEN_NULL;
1688 special_pad_id = LLAMA_TOKEN_NULL;
1689 special_mask_id = LLAMA_TOKEN_NULL;
1690 linefeed_id = LLAMA_TOKEN_NULL;
1691
1692 // read vocab size from metadata
1693 uint32_t n_tokens = 0;
1694 if (ml.get_key(kid: LLM_KV_VOCAB_SIZE, result&: n_tokens, required: false)) {
1695 LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
1696 id_to_token.resize(new_size: n_tokens);
1697 }
1698
1699 return;
1700 }
1701
1702 if (tokenizer_model == "llama") {
1703 type = LLAMA_VOCAB_TYPE_SPM;
1704
1705 // default special tokens
1706 special_bos_id = 1;
1707 special_eos_id = 2;
1708 special_unk_id = 0;
1709 special_sep_id = LLAMA_TOKEN_NULL;
1710 special_pad_id = LLAMA_TOKEN_NULL;
1711 special_mask_id = LLAMA_TOKEN_NULL;
1712 } else if (tokenizer_model == "bert") {
1713 type = LLAMA_VOCAB_TYPE_WPM;
1714
1715 // default special tokens
1716 special_bos_id = 101;
1717 special_eos_id = LLAMA_TOKEN_NULL;
1718 special_unk_id = 100;
1719 special_sep_id = 102;
1720 special_pad_id = 0;
1721 special_mask_id = 103;
1722
1723 add_sep = true;
1724 } else if (tokenizer_model == "gpt2") {
1725 type = LLAMA_VOCAB_TYPE_BPE;
1726
1727 // read bpe merges and populate bpe ranks
1728 const int merges_keyidx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_MERGES).c_str());
1729 if (merges_keyidx == -1) {
1730 throw std::runtime_error("cannot find tokenizer merges in model file\n");
1731 }
1732
1733 const int n_merges = gguf_get_arr_n(ctx, key_id: merges_keyidx);
1734 for (int i = 0; i < n_merges; i++) {
1735 const std::string word = gguf_get_arr_str(ctx, key_id: merges_keyidx, i);
1736 //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
1737
1738 std::string first;
1739 std::string second;
1740
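                    // the search starts at position 1, presumably so that a merge whose
                    // first element is itself a space does not produce an empty first part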
1741 const size_t pos = word.find(c: ' ', pos: 1);
1742
1743 if (pos != std::string::npos) {
1744 first = word.substr(pos: 0, n: pos);
1745 second = word.substr(pos: pos + 1);
1746 }
1747
1748 bpe_ranks.emplace(args: std::make_pair(x&: first, y&: second), args&: i);
1749 }
1750
1751 // default special tokens
1752 special_bos_id = 11;
1753 special_eos_id = 11;
1754 special_unk_id = LLAMA_TOKEN_NULL;
1755 special_sep_id = LLAMA_TOKEN_NULL;
1756 special_pad_id = LLAMA_TOKEN_NULL;
1757 special_mask_id = LLAMA_TOKEN_NULL;
1758 } else if (tokenizer_model == "t5") {
1759 type = LLAMA_VOCAB_TYPE_UGM;
1760
1761 // default special tokens
1762 special_bos_id = LLAMA_TOKEN_NULL;
1763 special_eos_id = 1;
1764 special_unk_id = 2;
1765 special_sep_id = LLAMA_TOKEN_NULL;
1766 special_pad_id = 0;
1767 special_mask_id = LLAMA_TOKEN_NULL;
1768
1769 const int precompiled_charsmap_keyidx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
1770 if (precompiled_charsmap_keyidx != -1) {
1771 const gguf_type pc_type = gguf_get_arr_type(ctx, key_id: precompiled_charsmap_keyidx);
1772 GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
1773
1774 const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, key_id: precompiled_charsmap_keyidx);
1775 const char * pc = (const char *) gguf_get_arr_data(ctx, key_id: precompiled_charsmap_keyidx);
1776 precompiled_charsmap.assign(first: pc, last: pc + n_precompiled_charsmap);
1777#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1778                // correct endianness of data in precompiled_charsmap binary blob
1779 uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1780 *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
1781 assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
1782 size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
1783 uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
1784 for (size_t i = 0; i < xcda_array_size; ++i) {
1785 xcda_array[i] = __builtin_bswap32(xcda_array[i]);
1786 }
1787#endif
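                // blob layout, as read above: a uint32 xcda blob size followed by the XCDA
                // double-array and then, presumably, the replacement data used by the UGM tokenizer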
1788 }
1789 } else if (tokenizer_model == "rwkv") {
1790 type = LLAMA_VOCAB_TYPE_RWKV;
1791
1792 // default special tokens
1793 special_bos_id = LLAMA_TOKEN_NULL;
1794 special_eos_id = LLAMA_TOKEN_NULL;
1795 special_unk_id = LLAMA_TOKEN_NULL;
1796 special_sep_id = LLAMA_TOKEN_NULL;
1797 special_pad_id = LLAMA_TOKEN_NULL;
1798 } else if (tokenizer_model == "plamo2") {
1799 type = LLAMA_VOCAB_TYPE_PLAMO2;
1800
1801 // PLaMo-2 default special tokens (these will be overridden by model config)
1802 special_bos_id = 1; // <|plamo:bos|>
1803 special_eos_id = 2; // <|plamo:eos|>
1804 special_unk_id = 0; // <|plamo:unk|>
1805 special_sep_id = LLAMA_TOKEN_NULL;
1806 special_pad_id = 3; // <|plamo:pad|>
1807 special_mask_id = LLAMA_TOKEN_NULL;
1808 } else {
1809 throw std::runtime_error(format(fmt: "unknown tokenizer: '%s'", tokenizer_model.c_str()));
1810 }
1811
1812 // for now, only BPE models have pre-tokenizers
1813 if (type == LLAMA_VOCAB_TYPE_BPE) {
1814 add_space_prefix = false;
1815 clean_spaces = true;
1816 if (tokenizer_pre.empty()) {
1817 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
1818 LLAMA_LOG_WARN("%s: \n", __func__);
1819 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1820 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
1821 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
1822 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1823 LLAMA_LOG_WARN("%s: \n", __func__);
1824 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1825 } else if (tokenizer_pre == "default") {
1826 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1827 } else if (
1828 tokenizer_pre == "llama3" ||
1829 tokenizer_pre == "llama-v3" ||
1830 tokenizer_pre == "llama-bpe"||
1831 tokenizer_pre == "falcon3" ||
1832 tokenizer_pre == "falcon-h1" ||
1833 tokenizer_pre == "pixtral" ||
1834 tokenizer_pre == "midm-2.0" ||
1835 tokenizer_pre == "lfm2") {
1836 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1837 ignore_merges = true;
1838 add_bos = true;
1839 } else if (
1840 tokenizer_pre == "deepseek-llm") {
1841 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
1842 clean_spaces = false;
1843 } else if (
1844 tokenizer_pre == "deepseek-coder") {
1845 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
1846 clean_spaces = false;
1847 } else if (
1848 tokenizer_pre == "deepseek-v3") {
1849 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
1850 clean_spaces = false;
1851 } else if (
1852 tokenizer_pre == "falcon") {
1853 pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
1854 } else if (
1855 tokenizer_pre == "mpt") {
1856 pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
1857 } else if (
1858 tokenizer_pre == "starcoder") {
1859 pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
1860 } else if (
1861 tokenizer_pre == "gpt-2" ||
1862 tokenizer_pre == "phi-2" ||
1863 tokenizer_pre == "jina-es" ||
1864 tokenizer_pre == "jina-de" ||
1865 tokenizer_pre == "gigachat" ||
1866 tokenizer_pre == "jina-v2-es" ||
1867 tokenizer_pre == "jina-v2-de" ||
1868 tokenizer_pre == "a.x-4.0" ||
1869 tokenizer_pre == "mellum") {
1870 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1871 } else if (
1872 tokenizer_pre == "jina-v1-en" ||
1873 tokenizer_pre == "jina-v2-code" ||
1874 tokenizer_pre == "roberta-bpe") {
1875 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1876 add_sep = true;
1877 } else if (
1878 tokenizer_pre == "refact") {
1879 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
1880 } else if (
1881 tokenizer_pre == "command-r") {
1882 pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
1883 clean_spaces = false;
1884 } else if (
1885 tokenizer_pre == "qwen2" ||
1886 tokenizer_pre == "deepseek-r1-qwen") {
1887 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1888 clean_spaces = false;
1889 } else if (
1890 tokenizer_pre == "stablelm2") {
1891 pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
1892 } else if (
1893 tokenizer_pre == "olmo") {
1894 pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
1895 } else if (
1896 tokenizer_pre == "dbrx") {
1897 pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
1898 } else if (
1899 tokenizer_pre == "smaug-bpe") {
1900 pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
1901 } else if (
1902 tokenizer_pre == "poro-chat") {
1903 pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
1904 clean_spaces = false;
1905 } else if (
1906 tokenizer_pre == "glm4" ||
1907 tokenizer_pre == "chatglm-bpe") {
1908 pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
1909 special_bos_id = LLAMA_TOKEN_NULL;
1910 } else if (
1911 tokenizer_pre == "viking") {
1912 pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
1913 clean_spaces = false;
1914 } else if (
1915 tokenizer_pre == "jais") {
1916 pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
1917 } else if (
1918 tokenizer_pre == "tekken") {
1919 pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
1920 clean_spaces = false;
1921 ignore_merges = true;
1922 add_bos = true;
1923 } else if (
1924 tokenizer_pre == "smollm") {
1925 pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
1926 clean_spaces = false;
1927 } else if (
1928 tokenizer_pre == "codeshell") {
1929 pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
1930 } else if (
1931 tokenizer_pre == "bloom") {
1932 pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
1933 } else if (
1934 tokenizer_pre == "gpt3-finnish") {
1935 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
1936 } else if (
1937 tokenizer_pre == "exaone") {
1938 pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1939 } else if (
1940 tokenizer_pre == "exaone4") {
1941 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1942 } else if (
1943 tokenizer_pre == "chameleon") {
1944 pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
1945 add_bos = true;
1946 clean_spaces = false;
1947 } else if (
1948 tokenizer_pre == "minerva-7b") {
1949 pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
1950 } else if (
1951 tokenizer_pre == "megrez") {
1952 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1953 } else if (
1954 tokenizer_pre == "gpt-4o" ||
1955 tokenizer_pre == "llama4") {
1956 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
1957 clean_spaces = false;
1958 } else if (
1959 tokenizer_pre == "superbpe") {
1960 pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
1961 clean_spaces = false;
1962 } else if (
1963 tokenizer_pre == "trillion") {
1964 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
1965 clean_spaces = false;
1966 } else if (
1967 tokenizer_pre == "granite-docling") {
1968 pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
1969 clean_spaces = false;
1970 } else if (
1971 tokenizer_pre == "bailingmoe" ||
1972 tokenizer_pre == "bailingmoe2" ||
1973 tokenizer_pre == "llada-moe") {
1974 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
1975 clean_spaces = false;
1976 } else if (
1977 tokenizer_pre == "seed-coder") {
1978 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
1979 clean_spaces = false;
1980 } else if (
1981 tokenizer_pre == "hunyuan") {
1982 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
1983 clean_spaces = false;
1984 } else if (
1985 tokenizer_pre == "hunyuan-dense") {
1986 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
1987 clean_spaces = false;
1988 } else if (
1989 tokenizer_pre == "kimi-k2") {
1990 pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
1991 clean_spaces = false;
1992 } else if (
1993 tokenizer_pre == "grok-2") {
1994 pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
1995 clean_spaces = false;
1996 } else if (
1997 tokenizer_pre == "minimax-m2") {
1998 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
1999 clean_spaces = false;
2000 } else {
2001 throw std::runtime_error(format(fmt: "unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
2002 }
2003 } else if (type == LLAMA_VOCAB_TYPE_SPM) {
2004 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2005 add_space_prefix = true;
2006 clean_spaces = false;
2007 add_bos = true;
2008 add_eos = false;
2009 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2010 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2011 add_space_prefix = false;
2012 clean_spaces = true;
2013 add_bos = true;
2014 add_eos = false;
2015 add_sep = true;
2016 } else if (type == LLAMA_VOCAB_TYPE_UGM) {
2017 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2018 add_bos = false;
2019 add_eos = true;
2020 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2021 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2022 add_space_prefix = false;
2023 clean_spaces = false;
2024 add_bos = false;
2025 add_eos = false;
2026 } else {
2027 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2028 }
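        // note: the per-type defaults chosen above can still be overridden by explicit
        // GGUF keys, both right below and in the special-token handling further down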
2029
2030 ml.get_key(kid: LLM_KV_TOKENIZER_ADD_PREFIX, result&: add_space_prefix, required: false);
2031 ml.get_key(kid: LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, result&: remove_extra_whitespaces, required: false);
2032 }
2033
2034 const int token_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_LIST).c_str());
2035 if (token_idx == -1) {
2036 throw std::runtime_error("cannot find tokenizer vocab in model file\n");
2037 }
2038
2039 const float * scores = nullptr;
2040 const int score_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_SCORES).c_str());
2041 if (score_idx != -1) {
2042 scores = (const float * ) gguf_get_arr_data(ctx, key_id: score_idx);
2043 }
2044
2045 const int * toktypes = nullptr;
2046 const int toktype_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
2047 if (toktype_idx != -1) {
2048 toktypes = (const int * ) gguf_get_arr_data(ctx, key_id: toktype_idx);
2049 }
2050
2051 uint32_t n_tokens = gguf_get_arr_n(ctx, key_id: token_idx);
2052 id_to_token.resize(new_size: n_tokens);
2053
2054 for (uint32_t i = 0; i < n_tokens; i++) {
2055 std::string word = gguf_get_arr_str(ctx, key_id: token_idx, i);
2056 if (word.empty()) {
2057 LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
2058 word = "[EMPTY_" + std::to_string(val: i) + "]";
2059 }
2060
2061 token_to_id[word] = i;
2062 max_token_len = std::max(a: max_token_len, b: (int) word.size());
2063
2064 auto & token_data = id_to_token[i];
2065 token_data.text = std::move(word);
2066 token_data.score = scores ? scores[i] : 0.0f;
2067 token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
2068
2069 if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
2070 switch(toktypes[i]) {
2071 case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
2072 case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
2073 case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
2074 case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
2075 case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
2076 case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
2077 case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2078 default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2079 }
2080 }
2081 }
2082 GGML_ASSERT(id_to_token.size() == token_to_id.size());
2083
2084 init_tokenizer(type);
2085
2086 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2087 if (type == LLAMA_VOCAB_TYPE_SPM) {
2088 try {
2089 linefeed_id = vocab.byte_to_token(ch: '\n');
2090 } catch (const std::exception & e) {
2091            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, e.what());
2092 linefeed_id = special_pad_id;
2093 }
2094 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2095 linefeed_id = special_pad_id;
2096 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2097 const std::vector<int> ids = tokenize(raw_text: "\n", add_special: false);
2098 GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2099 linefeed_id = ids[0];
2100 } else {
2101 const std::vector<int> ids = tokenize(raw_text: "\n", add_special: false);
2102
2103 //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2104 if (ids.empty()) {
2105 LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
2106 linefeed_id = special_pad_id;
2107 } else {
2108 linefeed_id = ids[0];
2109 }
2110 }
2111
2112 // special tokens
2113 {
2114 const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
2115 { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
2116 { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
2117 { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
2118 { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
2119 { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
2120 { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
2121 { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
2122 { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
2123 { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
2124 { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
2125 { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
2126 { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
2127 { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
2128 { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
2129
2130 // deprecated
2131 { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
2132 { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
2133 { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
2134 };
2135
2136 for (const auto & it : special_token_types) {
2137 const std::string & key = kv(std::get<0>(in: it));
2138 int32_t & id = std::get<1>(in: it);
2139
2140 uint32_t new_id;
2141 if (!ml.get_key(kid: std::get<0>(in: it), result&: new_id, required: false)) {
2142 continue;
2143 }
2144 if (new_id >= id_to_token.size()) {
2145 LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
2146 __func__, key.c_str(), new_id, id);
2147 } else {
2148 id = new_id;
2149 }
2150 }
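        // e.g. a GGUF key such as tokenizer.ggml.bos_token_id = 128000 replaces the
        // per-model default, subject to the range check against the vocab size above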
2151
2152 // Handle add_bos, add_eos and add_sep
2153 {
2154 bool temp = true;
2155
2156 if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_BOS, result&: temp, required: false)) {
2157 add_bos = temp;
2158 }
2159 if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_EOS, result&: temp, required: false)) {
2160 add_eos = temp;
2161 }
2162 if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_SEP, result&: temp, required: false)) {
2163 add_sep = temp;
2164 }
2165 }
2166
2167 // auto-detect special tokens by text
2168 // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
2169 // for now, we apply this workaround to find the tokens based on their text
2170
2171 for (const auto & t : token_to_id) {
2172 // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
2173 if (special_eot_id == LLAMA_TOKEN_NULL) {
2174 if (false
2175 || t.first == "<|eot_id|>"
2176 || t.first == "<|im_end|>"
2177 || t.first == "<|end|>"
2178 || t.first == "<end_of_turn>"
2179 || t.first == "<|endoftext|>"
2180 || t.first == "<|end_of_text|>" // granite
2181 || t.first == "<EOT>"
2182 || t.first == "_<EOT>"
2183 || t.first == "<|end▁of▁sentence|>" // DeepSeek
2184 || t.first == "<end_of_utterance>" // smoldocling
2185 ) {
2186 special_eot_id = t.second;
2187 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2188                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2189 __func__, t.second, t.first.c_str());
2190 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2191 }
2192 }
2193 }
2194
2195 // find EOM token: "<|eom_id|>"
2196 if (special_eom_id == LLAMA_TOKEN_NULL) {
2197 if (false
2198 || t.first == "<|eom_id|>"
2199 ) {
2200 special_eom_id = t.second;
2201 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2202                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2203 __func__, t.second, t.first.c_str());
2204 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2205 }
2206 }
2207 }
2208
2209 // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
2210 if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
2211 if (false
2212 || t.first == "<|fim_prefix|>" // Qwen
2213 || t.first == "<fim-prefix>"
2214 || t.first == "<fim_prefix>" // Granite
2215 || t.first == "<|fim▁begin|>" // DeepSeek
2216 || t.first == "<PRE>"
2217 || t.first == "▁<PRE>" // CodeLlama
2218 || t.first == "<|code_prefix|>" // GLM-4.5
2219 ) {
2220 special_fim_pre_id = t.second;
2221 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2222                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2223 __func__, t.second, t.first.c_str());
2224 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2225 }
2226 }
2227 }
2228
2229 // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
2230 if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
2231 if (false
2232 || t.first == "<|fim_suffix|>" // Qwen
2233 || t.first == "<fim-suffix>"
2234 || t.first == "<fim_suffix>" // Granite
2235 || t.first == "<|fim▁hole|>" // DeepSeek
2236 || t.first == "<SUF>"
2237 || t.first == "▁<SUF>" // CodeLlama
2238 || t.first == "<|code_suffix|>" // GLM-4.5
2239 ) {
2240 special_fim_suf_id = t.second;
2241 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2242                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2243 __func__, t.second, t.first.c_str());
2244 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2245 }
2246 }
2247 }
2248
2249 // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
2250 if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
2251 if (false
2252 || t.first == "<|fim_middle|>" // Qwen
2253 || t.first == "<fim-middle>"
2254 || t.first == "<fim_middle>" // Granite
2255 || t.first == "<|fim▁end|>" // DeepSeek
2256 || t.first == "<MID>"
2257 || t.first == "▁<MID>" // CodeLlama
2258 || t.first == "<|code_middle|>" // GLM-4.5
2259 ) {
2260 special_fim_mid_id = t.second;
2261 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2262                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2263 __func__, t.second, t.first.c_str());
2264 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2265 }
2266 }
2267 }
2268
2269 // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
2270 if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
2271 if (false
2272 || t.first == "<|fim_pad|>" // Qwen
2273 || t.first == "<fim-pad>"
2274 || t.first == "<fim_pad>" // Granite
2275 || t.first == "<PAD>"
2276 ) {
2277 special_fim_pad_id = t.second;
2278 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2279                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2280 __func__, t.second, t.first.c_str());
2281 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2282 }
2283 }
2284 }
2285
2286 // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
2287 if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
2288 if (false
2289 || t.first == "<|fim_repo|>" // Qwen
2290 || t.first == "<|repo_name|>"
2291 || t.first == "<fim-repo>"
2292 || t.first == "<REPO>"
2293 || t.first == "<reponame>" // Granite
2294 ) {
2295 special_fim_rep_id = t.second;
2296 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2297                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2298 __func__, t.second, t.first.c_str());
2299 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2300 }
2301 }
2302 }
2303
2304 // find FIM_SEP token: "<|file_sep|>"
2305 if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
2306 if (false
2307 || t.first == "<|file_sep|>" // Qwen
2308 ) {
2309 special_fim_sep_id = t.second;
2310 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2311                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2312 __func__, t.second, t.first.c_str());
2313 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2314 }
2315 }
2316 }
2317 }
2318
2319 // maintain a list of tokens that cause end-of-generation
2320 // this is currently determined based on the token text, which is obviously not ideal
2321 // ref: https://github.com/ggerganov/llama.cpp/issues/9606
2322 special_eog_ids.clear();
2323
2324 if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_pad_id) == 0) {
2325 special_eog_ids.insert(x: special_fim_pad_id);
2326 }
2327
2328 if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_rep_id) == 0) {
2329 special_eog_ids.insert(x: special_fim_rep_id);
2330 }
2331
2332 if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_sep_id) == 0) {
2333 special_eog_ids.insert(x: special_fim_sep_id);
2334 }
2335
2336 for (const auto & t : token_to_id) {
2337 if (false
2338 || t.first == "<|eot_id|>"
2339 || t.first == "<|im_end|>"
2340 || t.first == "<|end|>"
2341 || t.first == "<|return|>" // o200k_harmony
2342 || t.first == "<|call|>" // o200k_harmony
2343 || t.first == "<end_of_turn>"
2344 || t.first == "<|endoftext|>"
2345 || t.first == "<|eom_id|>"
2346 || t.first == "<EOT>"
2347 || t.first == "_<EOT>"
2348 || t.first == "<|end_of_text|>"
2349 || t.first == "<end_of_utterance>" // smoldocling
2350 ) {
2351 special_eog_ids.insert(x: t.second);
2352 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2353                LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model, so its type will be overridden\n",
2354 __func__, t.second, t.first.c_str());
2355 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2356 }
2357 } else {
2358 // token is control, but not marked as EOG -> print a debug log
2359 if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(x: t.second) == 0) {
2360 LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
2361 __func__, t.second, t.first.c_str());
2362 }
2363 }
2364 }
2365
2366 // @ngxson : quick hack for gpt-oss, always render these tokens
2367 for (const auto & t : token_to_id) {
2368 if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2369 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2370 }
2371 }
2372
2373 // sanity checks
2374 if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eos_id) == 0) {
2375 special_eog_ids.insert(x: special_eos_id);
2376 LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2377 }
2378
2379 if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eot_id) == 0) {
2380 special_eog_ids.insert(x: special_eot_id);
2381 LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2382 }
2383
2384 if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eom_id) == 0) {
2385 special_eog_ids.insert(x: special_eom_id);
2386 LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2387 }
2388
2389 // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
2390 // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
2391 // we remove the "<|end|>" token from the EOG list
2392 {
2393 bool has_return = false;
2394 bool has_call = false;
2395 bool has_end = false;
2396
2397 llama_token end_id = LLAMA_TOKEN_NULL;
2398
2399 LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
2400 for (auto tid : special_eog_ids) {
2401 LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
2402
2403 if (id_to_token[tid].text == "<|return|>") {
2404 has_return = true;
2405 } else if (id_to_token[tid].text == "<|call|>") {
2406 has_call = true;
2407 } else if (id_to_token[tid].text == "<|end|>") {
2408 has_end = true;
2409 end_id = tid;
2410 }
2411 }
2412
2413 if (has_return && has_call && has_end) {
2414 special_eog_ids.erase(x: end_id);
2415 id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2416 LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2417 }
2418 }
2419 }
2420
2421 // build special tokens cache
2422 {
2423 for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
2424 if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
2425 cache_special_tokens.push_back(x: id);
2426 }
2427 }
2428
2429 std::sort(first: cache_special_tokens.begin(), last: cache_special_tokens.end(),
2430 comp: [&] (const llama_token a, const llama_token b) {
2431 return id_to_token[a].text.size() > id_to_token[b].text.size();
2432 }
2433 );
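        // longest-first order matters: tokenizer_st_partition() scans special tokens in
        // this order, so a longer token wins over a shorter one it contains
        // (e.g. "▁<PRE>" is matched before "<PRE>")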
2434
2435 LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
2436 }
2437
2438 // build token to piece cache
2439 {
2440 size_t size_cache = 0;
2441
2442 std::vector<std::string> cache(n_tokens);
2443
2444 for (uint32_t id = 0; id < n_tokens; ++id) {
2445 cache[id] = token_to_piece_for_cache(token: id, special: true);
2446
2447 size_cache += cache[id].size();
2448 }
2449
2450 std::swap(x&: cache_token_to_piece, y&: cache);
2451
2452 LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
2453 }
2454
2455 // Handle per token attributes
2456 //NOTE: Each model customizes per token attributes.
2457 //NOTE: Per token attributes are missing from the GGUF file.
2458 //TODO: Extract attributes from GGUF file.
2459 {
2460 auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2461 for (const auto & substr : substrs) {
2462 if (str.find(svt: substr) != std::string::npos) {
2463 return true;
2464 }
2465 }
2466 return false;
2467 };
2468
2469 auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
2470 uint32_t current = id_to_token.at(n: id).attr;
2471 current = value ? (current | attr) : (current & ~attr);
2472 id_to_token[id].attr = (llama_token_attr) current;
2473 };
2474
2475 auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
2476 _set_tokenid_attr(token_to_id.at(k: token), attr, value);
2477 };
2478
2479 std::string model_name;
2480 std::string tokenizer_pre;
2481 std::string general_arch;
2482
2483 ml.get_key(kid: LLM_KV_GENERAL_NAME, result&: model_name, required: false);
2484 ml.get_key(kid: LLM_KV_TOKENIZER_PRE, result&: tokenizer_pre, required: false);
2485 ml.get_key(kid: LLM_KV_GENERAL_ARCHITECTURE, result&: general_arch, required: false);
2486
2487 // model name to lowercase
2488 std::transform(first: model_name.begin(), last: model_name.end(), result: model_name.begin(),
2489 unary_op: [] (const std::string::value_type x) {
2490 return std::tolower(c: x);
2491 }
2492 );
2493
2494 // set attributes by model/tokenizer/architecture name
2495 if (false
2496 || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2497 || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
2498 ) {
2499 if (token_to_id.count(x: "<mask>") == 0) {
2500 LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2501 } else {
2502 _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2503 }
2504 } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2505 for (auto id : cache_special_tokens) {
2506 _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
2507 }
2508 for (const auto * token : {"</s>"}) {
2509 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
2510 }
2511 for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
2512 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
2513 }
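            // RSTRIP makes tokenizer_st_partition() consume whitespace to the right of a
            // special-token match, e.g. "</s>  \nfoo" tokenizes as the token followed by "foo"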
2514 }
2515 }
2516}
2517
2518enum llama_vocab_type llama_vocab::impl::get_type() const {
2519 return type;
2520}
2521
2522std::string llama_vocab::impl::type_name() const {
2523 switch (type) {
2524 case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2525 case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2526 case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2527 case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2528 case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2529 case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2530 case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
2531 default: return "unknown";
2532 }
2533}
2534
2535bool llama_vocab::impl::is_normal(llama_token id) const {
2536 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2537 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
2538}
2539
2540bool llama_vocab::impl::is_unknown(llama_token id) const {
2541 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2542 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
2543}
2544
2545bool llama_vocab::impl::is_control(llama_token id) const {
2546 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2547 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
2548}
2549
2550bool llama_vocab::impl::is_byte(llama_token id) const {
2551 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2552 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
2553}
2554
2555bool llama_vocab::impl::is_user_defined(llama_token id) const {
2556 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2557 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
2558}
2559
2560bool llama_vocab::impl::is_unused(llama_token id) const {
2561 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2562 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
2563}
2564
2565bool llama_vocab::impl::is_eog(llama_token id) const {
2566 return id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: id) > 0;
2567}
2568
2569uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
2570 GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
2571 GGML_ASSERT(is_byte(id));
2572 const auto & token_data = id_to_token.at(n: id);
2573 switch (get_type()) {
2574 case LLAMA_VOCAB_TYPE_SPM:
2575 case LLAMA_VOCAB_TYPE_UGM: {
2576 auto buf = token_data.text.substr(pos: 3, n: 2);
2577 return strtol(nptr: buf.c_str(), NULL, base: 16);
2578 }
2579 case LLAMA_VOCAB_TYPE_BPE: {
2580 GGML_ABORT("fatal error");
2581 }
2582 case LLAMA_VOCAB_TYPE_WPM: {
2583 GGML_ABORT("fatal error");
2584 }
2585 default:
2586 GGML_ABORT("fatal error");
2587 }
2588}
2589
2590llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
2591 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2592 return id_to_token.at(n: id).attr;
2593}
2594
2595void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
2596 LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
2597
2598 switch (type) {
2599 case LLAMA_VOCAB_TYPE_SPM:
2600 tokenizer = std::make_unique<llm_tokenizer_spm>(args: vocab);
2601 break;
2602 case LLAMA_VOCAB_TYPE_BPE:
2603 tokenizer = std::make_unique<llm_tokenizer_bpe>(args: vocab);
2604 break;
2605 case LLAMA_VOCAB_TYPE_WPM:
2606 tokenizer = std::make_unique<llm_tokenizer_wpm>(args: vocab);
2607 break;
2608 case LLAMA_VOCAB_TYPE_UGM:
2609 tokenizer = std::make_unique<llm_tokenizer_ugm>(args: vocab, args&: precompiled_charsmap);
2610 break;
2611 case LLAMA_VOCAB_TYPE_RWKV:
2612 tokenizer = std::make_unique<llm_tokenizer_rwkv>(args: vocab);
2613 break;
2614 case LLAMA_VOCAB_TYPE_PLAMO2:
2615 tokenizer = std::make_unique<llm_tokenizer_plamo2>(args: vocab);
2616 break;
2617 default:
2618 GGML_ABORT("unsupported vocab type");
2619 }
2620}
2621
2622//
2623// (de-) tokenize
2624//
2625
2626// #define PRETOKENIZERDEBUG
2627
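// partitions the fragment buffer around occurrences of special tokens
// example: with special token "<|im_end|>", the single fragment "Hi<|im_end|>there"
// becomes [RAW_TEXT "Hi"][TOKEN <|im_end|>][RAW_TEXT "there"]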
2628void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
2629 // for each special token
2630 for (const llama_token special_id : cache_special_tokens) {
2631 const auto & data = vocab.get_token_data(id: special_id);
2632 const auto & text = data.text;
2633
2634        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
2635            // Ignore control and unknown tokens when parse_special == false.
2636            // User-defined tokens are still pre-tokenized before everything else
2637            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
2638            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
2639            continue;
2640        }
2641
2642 // for each text fragment
2643 std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
2644 while (it != buffer.end()) {
2645 auto & fragment = (*it);
2646
2647 // if a fragment is text ( not yet processed )
2648 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2649 const auto & raw_text = fragment.raw_text;
2650
2651 auto raw_text_base_offset = fragment.offset;
2652 auto raw_text_base_length = fragment.length;
2653
2654 // loop over the text
2655 while (true) {
2656 // find the first occurrence of a given special token in this fragment
2657                // passing the offset argument only limits the "search area", but match
2658                // coordinates are still relative to the full source raw_text
2659 // string_view begins at pos 0 for the same reason
2660 auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(str: text, pos: raw_text_base_offset);
2661
2662 // no occurrences found, stop processing this fragment for a given special token
2663 if (match == std::string::npos) break;
2664
2665#ifdef PRETOKENIZERDEBUG
2666 LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2667#endif
2668 auto source = std::distance(first: buffer.begin(), last: it);
2669
2670 // if match is further than base offset
2671 // then we have some text to the left of it
2672 if (match > raw_text_base_offset) {
2673 // left
2674                        const int64_t left_remainder_offset = raw_text_base_offset + 0;
2675                        int64_t left_remainder_length = match - raw_text_base_offset;
2676
2677                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
2678                            while (left_remainder_length > 0 && isspace(raw_text[left_remainder_offset + left_remainder_length - 1])) {
2679                                left_remainder_length--;
2680 }
2681 }
2682
2683                        if (left_remainder_length > 0) {
2684                            buffer.emplace_after(pos: it, args: raw_text, args: left_remainder_offset, args&: left_remainder_length);
2685 it++;
2686 }
2687
2688#ifdef PRETOKENIZERDEBUG
2689 LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
2690#endif
2691 }
2692
2693 // special token
2694 buffer.emplace_after(pos: it, args: special_id);
2695 it++;
2696
2697 // right
2698 if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
2699                        int64_t right_remainder_offset = match + text.length();
2700                        int64_t right_remainder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
2701
2702                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
2703                            while (right_remainder_length > 0 && isspace(raw_text[right_remainder_offset])) {
2704                                right_remainder_offset++;
2705                                right_remainder_length--;
2706 }
2707 }
2708
2709                        if (right_remainder_length > 0) {
2710                            buffer.emplace_after(pos: it, args: raw_text, args&: right_remainder_offset, args&: right_remainder_length);
2711 it++;
2712 }
2713
2714#ifdef PRETOKENIZERDEBUG
2715 LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
2716#endif
2717
2718 if (source == 0) {
2719 buffer.erase_after(pos: buffer.before_begin());
2720 } else {
2721 buffer.erase_after(pos: std::next(x: buffer.begin(), n: (source - 1)));
2722 }
2723
2724 // repeat for the right side
2725                            raw_text_base_offset = right_remainder_offset;
2726                            raw_text_base_length = right_remainder_length;
2727
2728#ifdef PRETOKENIZERDEBUG
2729 LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2730#endif
2731 } else {
2732 if (source == 0) {
2733 buffer.erase_after(pos: buffer.before_begin());
2734 } else {
2735 buffer.erase_after(pos: std::next(x: buffer.begin(), n: (source - 1)));
2736 }
2737 break;
2738 }
2739 }
2740 }
2741 it++;
2742 }
2743 }
2744}
2745
2746// NOTE: avoid ever using this except for building the token_to_piece caches
2747std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
2748 std::string piece;
2749    piece.resize(n: piece.capacity()); // start from the string's existing capacity to avoid an extra allocation in the common case
2750 const int n_chars = vocab.token_to_piece(token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
2751 if (n_chars < 0) {
2752 piece.resize(n: -n_chars);
2753 int check = vocab.token_to_piece(token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
2754 GGML_ASSERT(check == -n_chars);
2755 }
2756 else {
2757 piece.resize(n: n_chars);
2758 }
2759
2760 return piece;
2761}
2762
2763static void llama_escape_whitespace(std::string & text) {
2764 replace_all(s&: text, search: " ", replace: "\xe2\x96\x81");
2765}
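
// example: "Hello world" -> "Hello\xe2\x96\x81world"; the sequence is U+2581
// (LOWER ONE EIGHTH BLOCK), the marker sentencepiece uses for whitespace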
2766
2767static void llama_unescape_whitespace(std::string & word) {
2768 replace_all(s&: word, search: "\xe2\x96\x81", replace: " ");
2769}
2770
2771static std::string llama_decode_text(const std::string & text) {
2772 std::string decoded_text;
2773
2774 const auto cpts = unicode_cpts_from_utf8(utf8: text);
2775 for (const auto cpt : cpts) {
2776 const auto utf8 = unicode_cpt_to_utf8(cpt);
2777 try {
2778 decoded_text += unicode_utf8_to_byte(utf8);
2779 } catch (const std::out_of_range & /*e*/) {
2780 decoded_text += "[UNK_BYTE_0x";
2781 for (const auto c : utf8) {
2782 decoded_text += format(fmt: "%02x", (uint8_t) c);
2783 }
2784 decoded_text += text + "]";
2785 }
2786 }
2787
2788 return decoded_text;
2789}
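
// example: BPE pieces store raw bytes via the GPT-2 byte-to-unicode mapping, so the
// piece "Ġhello" decodes to " hello" ('Ġ' == U+0120 maps back to byte 0x20)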
2790
2791std::vector<llama_token> llama_vocab::impl::tokenize(
2792 const std::string & raw_text,
2793 bool add_special,
2794 bool parse_special) const {
2795 GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
2796
2797 std::vector<llama_token> output;
2798 std::forward_list<fragment_buffer_variant> fragment_buffer;
2799
2800 if (!raw_text.empty()) {
2801 fragment_buffer.emplace_front(args: raw_text, args: 0, args: raw_text.length());
2802 tokenizer_st_partition(buffer&: fragment_buffer, parse_special);
2803 }
2804
2805 switch (get_type()) {
2806 case LLAMA_VOCAB_TYPE_SPM:
2807 {
2808 // OG tokenizer behavior:
2809 //
2810 // tokenizer.encode('', add_special_tokens=True) returns [1]
2811 // tokenizer.encode('', add_special_tokens=False) returns []
2812
2813 bool is_prev_special = true; // prefix with space if first token
2814
2815 if (add_special && add_bos) {
2816 GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2817 output.push_back(x: special_bos_id);
2818 is_prev_special = true;
2819 }
2820
2821 for (const auto & fragment : fragment_buffer) {
2822 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2823 std::string text;
2824
2825 // prefix with space if previous is special
2826 if (add_space_prefix && is_prev_special) {
2827 text = ' ';
2828 }
2829
2830 text += fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2831
2832#ifdef PRETOKENIZERDEBUG
2833 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2834#endif
2835 llama_escape_whitespace(text);
2836 llm_tokenizer_spm_session session(vocab);
2837 session.tokenize(text, output);
2838 is_prev_special = false;
2839 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2840 output.push_back(x: fragment.token);
2841 is_prev_special = true;
2842 }
2843 }
2844
2845 if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
2846 LLAMA_LOG_WARN(
2847 "%s: Added a BOS token to the prompt as specified by the model but the prompt "
2848 "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
2849 "Are you sure this is what you want?\n", __FUNCTION__);
2850 }
2851
2852 if (add_special && add_eos) {
2853 GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
2854 output.push_back(x: special_eos_id);
2855 }
2856 } break;
2857 case LLAMA_VOCAB_TYPE_BPE:
2858 {
2859 llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
2860                // the session calls methods that do not exist on the base llm_tokenizer,
2861                // hence the cast above to the BPE tokenizer type
2862 if (add_special) {
2863 session.append_bos(output);
2864 }
2865 for (const auto & fragment : fragment_buffer) {
2866 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2867 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2868
2869#ifdef PRETOKENIZERDEBUG
2870 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2871#endif
2872 session.tokenize(text, output);
2873 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2874 session.append(token_id: fragment.token, output);
2875 }
2876 }
2877
2878 if (add_special) {
2879 session.append_eos(output);
2880 session.check_double_bos_eos(output);
2881 }
2882 } break;
2883 case LLAMA_VOCAB_TYPE_WPM:
2884 {
2885 if (add_special) {
2886 GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2887 output.push_back(x: special_bos_id);
2888 }
2889
2890 llm_tokenizer_wpm_session session(vocab);
2891
2892 for (const auto & fragment : fragment_buffer) {
2893 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2894 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2895
2896#ifdef PRETOKENIZERDEBUG
2897 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2898#endif
2899 session.tokenize(text, output);
2900 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2901 output.push_back(x: fragment.token);
2902 }
2903 }
2904
2905 if (add_special) {
2906 GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
2907 output.push_back(x: special_sep_id);
2908 }
2909 } break;
2910 case LLAMA_VOCAB_TYPE_UGM:
2911 {
2912 if (add_special && add_bos) {
2913 GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2914 output.push_back(x: special_bos_id);
2915 }
2916 llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
2917
2918 for (const auto & fragment : fragment_buffer) {
2919 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2920 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2921#ifdef PRETOKENIZERDEBUG
2922 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2923#endif
2924 session.tokenize(text, output);
2925 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2926 output.push_back(x: fragment.token);
2927 }
2928 }
2929
2930 if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
2931 LLAMA_LOG_WARN(
2932 "%s: Added a BOS token to the prompt as specified by the model but the prompt "
2933 "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
2934 "Are you sure this is what you want?\n", __FUNCTION__);
2935 }
2936
2937 if (add_special && add_eos) {
2938 GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
2939 output.push_back(x: special_eos_id);
2940 }
2941 } break;
2942 case LLAMA_VOCAB_TYPE_RWKV:
2943 {
2944 llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
2945 for (const auto & fragment : fragment_buffer) {
2946 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2947 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2948
2949#ifdef PRETOKENIZERDEBUG
2950 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2951#endif
2952
2953 session.tokenize(text, output);
2954 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2955 output.push_back(x: fragment.token);
2956 }
2957 }
2958 } break;
2959 case LLAMA_VOCAB_TYPE_PLAMO2:
2960 {
2961 llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
2962 for (const auto & fragment : fragment_buffer) {
2963 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2964 std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length);
2965
2966#ifdef PRETOKENIZERDEBUG
2967 LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2968#endif
2969
2970 session.tokenize(text, output);
2971 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2972 output.push_back(x: fragment.token);
2973 }
2974 }
2975 } break;
2976 case LLAMA_VOCAB_TYPE_NONE:
2977 GGML_ABORT("fatal error");
2978 }
2979
2980 return output;
2981}
2982
2983int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
2984 // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
2985 static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
2986 const llama_token_attr attr = token_get_attr(id: token);
2987 if (!special && (attr & attr_special)) {
2988 return 0;
2989 }
2990
2991 // copy piece chars to output text buffer
2992 // skip up to 'lstrip' leading spaces before copying
2993 auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
2994 if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
2995 GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
2996 }
2997
2998 for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
2999 token++;
3000 size--;
3001 }
3002 if (length < (int32_t)size) {
3003 return -(int32_t) size;
3004 }
3005 memcpy(dest: buf, src: token, n: size);
3006 return (int32_t) size;
3007 };
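    // note: as in the public API, a negative return from _try_copy is the negated
    // number of bytes required, letting the caller resize the buffer and retry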
3008
3009 // if we have a cache - use it
3010 {
3011 const auto & cache = cache_token_to_piece;
3012
3013 if (!cache.empty()) {
3014 const auto & result = cache.at(n: token);
3015 return _try_copy(result.data(), result.size());
3016 }
3017 }
3018
3019 if (0 <= token && token < (int32_t) id_to_token.size()) {
3020 const std::string & token_text = id_to_token[token].text;
3021 switch (get_type()) {
3022 case LLAMA_VOCAB_TYPE_WPM:
3023 case LLAMA_VOCAB_TYPE_SPM:
3024 case LLAMA_VOCAB_TYPE_UGM: {
3025 // NOTE: we accept all unsupported token types,
3026 // suppressing them like CONTROL tokens.
3027 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3028 return _try_copy(token_text.data(), token_text.size());
3029 }
3030 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3031 std::string result = token_text;
3032 llama_unescape_whitespace(word&: result);
3033 return _try_copy(result.data(), result.size());
3034 }
3035 if (attr & LLAMA_TOKEN_ATTR_BYTE) {
3036 char byte = (char) token_to_byte(id: token);
3037 return _try_copy((char*) &byte, 1);
3038 }
3039 break;
3040 }
3041 case LLAMA_VOCAB_TYPE_BPE: {
3042 // NOTE: we accept all unsupported token types,
3043 // suppressing them like CONTROL tokens.
3044 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3045 return _try_copy(token_text.data(), token_text.size());
3046 }
3047 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3048 std::string result = llama_decode_text(text: token_text);
3049 return _try_copy(result.data(), result.size());
3050 }
3051 break;
3052 }
3053 case LLAMA_VOCAB_TYPE_RWKV: {
3054 std::vector<uint8_t> result = llama_unescape_rwkv_token(escaped: token_text);
3055
3056 // If we don't have enough space, return an error
3057 if (result.size() > (size_t)length) {
3058 return -(int)result.size();
3059 }
3060
3061 memcpy(dest: buf, src: result.data(), n: result.size());
3062 return (int)result.size();
3063 }
3064 case LLAMA_VOCAB_TYPE_PLAMO2: {
3065 // PLaMo-2 uses similar token handling as BPE/SPM
3066 if (vocab.is_byte(id: token)) {
3067 // Handle byte tokens like <0xXX>
3068 if (token_text.length() == 6 && token_text.substr(pos: 0, n: 3) == "<0x" && token_text.back() == '>') {
3069 int hex_val = std::stoi(str: token_text.substr(pos: 3, n: 2), idx: nullptr, base: 16);
3070 if (length < 1) {
3071 return -1;
3072 }
3073 buf[0] = static_cast<char>(hex_val);
3074 return 1;
3075 }
3076 }
3077
3078 // Normal token - just copy the text
3079 std::string result = token_text;
3080 return _try_copy(result.data(), result.size());
3081 }
3082 default:
3083 GGML_ABORT("fatal error");
3084 }
3085 }
3086
3087 return 0;
3088}
3089
3090const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
3091 return cache_token_to_piece.at(n: token);
3092}
3093
3094int32_t llama_vocab::impl::detokenize(
3095 const llama_token * tokens,
3096 int32_t n_tokens,
3097 char * text,
3098 int32_t text_len_max,
3099 bool remove_special,
3100 bool unparse_special) const {
3101 if (type == LLAMA_VOCAB_TYPE_NONE) {
3102 return 0;
3103 }
3104
3105 GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
3106
3107 int32_t avail = text_len_max;
3108 int32_t total = 0;
3109
3110 // remove the leading space
3111 bool remove_space = add_space_prefix;
3112
3113 if (remove_special && add_bos) {
3114 if (n_tokens > 0 && tokens[0] == special_bos_id) {
3115 remove_space = false;
3116 n_tokens--;
3117 tokens++;
3118 }
3119 }
3120
3121 if (remove_special && add_eos) {
3122 if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
3123 n_tokens--;
3124 }
3125 }
3126
3127 for (int32_t i = 0; i < n_tokens; ++i) {
3128 GGML_ASSERT(avail >= 0);
3129 int32_t n_chars = token_to_piece(token: tokens[i], buf: text, length: avail, lstrip: remove_space, special: unparse_special);
3130 remove_space = false;
3131 if (n_chars < 0) {
3132 avail = 0;
3133 total -= n_chars;
3134 } else if (n_chars > 0) {
3135 avail -= n_chars;
3136 text += n_chars;
3137 total += n_chars;
3138 }
3139 }
3140
3141 if (total > text_len_max) {
3142 return -total;
3143 }
3144
3145 if (clean_spaces) {
3146 text -= total; // restart text
3147
3148 // first pass: characters ?!., //TODO: where do these characters come from?
3149 const int32_t total1 = total;
3150 total = total ? 1 : 0;
3151 for (int32_t i = 1; i < total1; ++i) {
3152 const char x = text[i];
3153 if (text[i - 1] == ' ') {
3154 if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
3155 total--; // remove space
3156 }
3157 }
3158 text[total++] = x;
3159 }
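        // e.g. this pass rewrites "Hello , world !" into "Hello, world!"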
3160
3161 // second pass: strip single apostrophe between spaces
3162 const int32_t total2 = total;
3163 total = total ? 1 : 0;
3164 for (int32_t i = 1; i < total2; ++i) {
3165 const char x = text[i];
3166 if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
3167 total--; // remove prev space
3168 text[++i] = '\0'; // remove next space
3169 }
3170 text[total++] = x;
3171 }
3172
3173        // third pass: apostrophe contractions //NOTE: does this heuristic make sense?
3174 const int32_t total3 = total;
3175 total = total ? 1 : 0;
3176 for (int32_t i = 1; i < total3; ++i) {
3177 const char x = text[i];
3178 if (text[i - 1] == ' ') {
3179 if (x == '\'' && i + 1 < total3) {
3180 const char x1 = text[i + 1];
3181 if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
3182 //total--; // remove space
3183 } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
3184 total--; // remove space
3185 } else if (i + 2 < total3) {
3186 const char x2 = text[i + 2];
3187 if ((x1 == 'l' && x2 == 'l')) { // " 'll"
3188 //total--; // remove space
3189 } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
3190 total--; // remove space
3191 } else {
3192 //total--; // remove space
3193 }
3194 } else {
3195 //total--; // remove space
3196 }
3197 }
3198 }
3199 text[total++] = x;
3200 }
3201 }
3202
3203 return total <= text_len_max ? total : -total;
3204}
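
// A negative return value encodes "buffer too small"; its magnitude is the
// number of chars required. The std::string overload of
// llama_vocab::detokenize() further below relies on exactly this retry
// protocol. A minimal standalone sketch (the initial buffer size is an
// arbitrary assumption):
//
//   std::vector<char> buf(16);
//   int32_t n = vocab.detokenize(toks.data(), (int32_t) toks.size(),
//                                buf.data(), (int32_t) buf.size(),
//                                /*remove_special=*/false, /*unparse_special=*/false);
//   if (n < 0) {
//       buf.resize(-n);
//       n = vocab.detokenize(toks.data(), (int32_t) toks.size(),
//                            buf.data(), (int32_t) buf.size(), false, false);
//   }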

void llama_vocab::impl::print_info() const {
    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
    LLAMA_LOG_INFO("%s: n_vocab    = %u\n", __func__, vocab.n_tokens());
    LLAMA_LOG_INFO("%s: n_merges   = %u\n", __func__, (uint32_t) bpe_ranks.size());

    // special tokens
    if (special_bos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token  = %d '%s'\n", __func__, special_bos_id,  id_to_token.at(special_bos_id).text.c_str() ); }
    if (special_eos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token  = %d '%s'\n", __func__, special_eos_id,  id_to_token.at(special_eos_id).text.c_str() ); }
    if (special_eot_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token  = %d '%s'\n", __func__, special_eot_id,  id_to_token.at(special_eot_id).text.c_str() ); }
    if (special_eom_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token  = %d '%s'\n", __func__, special_eom_id,  id_to_token.at(special_eom_id).text.c_str() ); }
    if (special_unk_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token  = %d '%s'\n", __func__, special_unk_id,  id_to_token.at(special_unk_id).text.c_str() ); }
    if (special_sep_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token  = %d '%s'\n", __func__, special_sep_id,  id_to_token.at(special_sep_id).text.c_str() ); }
    if (special_pad_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token  = %d '%s'\n", __func__, special_pad_id,  id_to_token.at(special_pad_id).text.c_str() ); }
    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }

    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token   = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }

    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }

    for (const auto & id : special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}

llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
}

llama_vocab::~llama_vocab() {
}

void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
}

std::string llama_vocab::get_tokenizer_model() const {
    return pimpl->tokenizer_model;
}

std::string llama_vocab::get_tokenizer_pre() const {
    return pimpl->tokenizer_pre;
}

enum llama_vocab_type llama_vocab::get_type() const {
    return pimpl->type;
}

enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
    return pimpl->pre_type;
}

uint32_t llama_vocab::n_tokens() const {
    return (uint32_t) pimpl->id_to_token.size();
}

uint32_t llama_vocab::n_token_types() const {
    return (uint32_t) pimpl->n_token_types;
}

std::string llama_vocab::type_name() const {
    return pimpl->type_name();
}

bool llama_vocab::is_normal(llama_token id) const {
    return pimpl->is_normal(id);
}

bool llama_vocab::is_unknown(llama_token id) const {
    return pimpl->is_unknown(id);
}

bool llama_vocab::is_control(llama_token id) const {
    return pimpl->is_control(id);
}

bool llama_vocab::is_byte(llama_token id) const {
    return pimpl->is_byte(id);
}

bool llama_vocab::is_user_defined(llama_token id) const {
    return pimpl->is_user_defined(id);
}

bool llama_vocab::is_unused(llama_token id) const {
    return pimpl->is_unused(id);
}

bool llama_vocab::is_eog(llama_token id) const {
    return pimpl->is_eog(id);
}

uint8_t llama_vocab::token_to_byte(llama_token id) const {
    return pimpl->token_to_byte(id);
}

llama_token llama_vocab::byte_to_token(uint8_t ch) const {
    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (get_type()) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            auto token = pimpl->token_to_id.find(buf);
            if (token != pimpl->token_to_id.end()) {
                return (*token).second;
            }
            // Try to fall back to just the byte as a string
            const char buf2[2] = { (char)ch, 0 };
            return pimpl->token_to_id.at(buf2);
        }
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
        }
        case LLAMA_VOCAB_TYPE_PLAMO2: {
            // PLaMo-2 uses byte tokens in the format <0xXX>
            char hex_str[8];
            snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
            return pimpl->token_to_id.at(hex_str);
        }
        default:
            GGML_ABORT("fatal error");
    }
}
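
// Usage sketch (which of the two SPM fallbacks fires depends on the loaded
// vocab; `vocab` is assumed to be a fully loaded llama_vocab):
//
//   llama_token t = vocab.byte_to_token(0x0A); // id of "<0x0A>", or of "\n"
//
// Note that the final token_to_id.at() lookups throw std::out_of_range when
// the vocab defines neither form for the byte.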

llama_token llama_vocab::text_to_token(const std::string & text) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    auto it = pimpl->token_to_id.find(text);
    if (it != pimpl->token_to_id.end()) {
        return (*it).second;
    }
    return LLAMA_TOKEN_NULL;
}

const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id);
}

const char * llama_vocab::token_get_text(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).text.c_str();
}

float llama_vocab::token_get_score(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).score;
}

llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
    return pimpl->token_get_attr(id);
}

llama_token llama_vocab::token_bos() const {
    return pimpl->special_bos_id;
}

llama_token llama_vocab::token_eos() const {
    return pimpl->special_eos_id;
}

llama_token llama_vocab::token_eot() const {
    return pimpl->special_eot_id;
}

llama_token llama_vocab::token_eom() const {
    return pimpl->special_eom_id;
}

llama_token llama_vocab::token_unk() const {
    return pimpl->special_unk_id;
}

llama_token llama_vocab::token_sep() const {
    return pimpl->special_sep_id;
}

llama_token llama_vocab::token_nl() const {
    return pimpl->linefeed_id;
}

llama_token llama_vocab::token_pad() const {
    return pimpl->special_pad_id;
}

llama_token llama_vocab::token_prefix() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_middle() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_suffix() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_pre() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_fim_suf() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_mid() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_fim_pad() const {
    return pimpl->special_fim_pad_id;
}

llama_token llama_vocab::token_fim_rep() const {
    return pimpl->special_fim_rep_id;
}

llama_token llama_vocab::token_fim_sep() const {
    return pimpl->special_fim_sep_id;
}

llama_token llama_vocab::token_mask() const {
    return pimpl->special_mask_id;
}

bool llama_vocab::get_add_space_prefix() const {
    return pimpl->add_space_prefix;
}

bool llama_vocab::get_add_bos() const {
    return pimpl->add_bos;
}

bool llama_vocab::get_add_eos() const {
    return pimpl->add_eos;
}

bool llama_vocab::get_add_sep() const {
    return pimpl->add_sep;
}

bool llama_vocab::get_ignore_merges() const {
    return pimpl->ignore_merges;
}

bool llama_vocab::get_clean_spaces() const {
    return pimpl->clean_spaces;
}

bool llama_vocab::get_remove_extra_whitespaces() const {
    return pimpl->remove_extra_whitespaces;
}

bool llama_vocab::get_escape_whitespaces() const {
    return pimpl->escape_whitespaces;
}

bool llama_vocab::get_treat_whitespace_as_suffix() const {
    return pimpl->treat_whitespace_as_suffix;
}

int llama_vocab::max_token_len() const {
    return pimpl->max_token_len;
}

int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
    GGML_ASSERT(token_left.find(' ') == std::string::npos);
    GGML_ASSERT(token_left.find('\n') == std::string::npos);
    GGML_ASSERT(token_right.find(' ') == std::string::npos);
    GGML_ASSERT(token_right.find('\n') == std::string::npos);

    auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
    if (it == pimpl->bpe_ranks.end()) {
        return -1;
    }

    return it->second;
}
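
// A lower rank means the merge was learned earlier and is applied first during
// BPE tokenization; -1 means the pair is not a merge at all. Sketch (the token
// strings are hypothetical, not guaranteed vocab entries):
//
//   if (vocab.find_bpe_rank("th", "e") != -1) {
//       // "th" + "e" can be merged by this vocab
//   }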

std::vector<std::string> llama_vocab::get_bpe_merges() const {
    std::vector<std::string> result(pimpl->bpe_ranks.size());

    for (const auto & pair : pimpl->bpe_ranks) {
        result[pair.second] = pair.first.first + " " + pair.first.second;
    }

    return result;
}
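
// The result is indexed by rank, with each pair flattened to "left right";
// e.g. if ("th", "e") had rank 0 (a hypothetical vocab), then
// get_bpe_merges()[0] == "th e".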

std::vector<char> llama_vocab::get_precompiled_charsmap() const {
    return pimpl->precompiled_charsmap;
}

int32_t llama_vocab::tokenize(
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_tokens_max,
        bool add_special,
        bool parse_special) const {
    auto res = tokenize(std::string(text, text_len), add_special, parse_special);
    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
        return std::numeric_limits<int32_t>::min();
    }

    if (n_tokens_max < (int) res.size()) {
        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
    }

    for (size_t i = 0; i < res.size(); i++) {
        tokens[i] = res[i];
    }

    return res.size();
}
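
// Typical two-call pattern (a sketch; `vocab` and the input text are
// assumptions): a first call with a zero-sized buffer yields the negated token
// count, which sizes the real buffer for the second call.
//
//   std::string text = "Hello world";
//   int32_t n = vocab.tokenize(text.c_str(), (int32_t) text.size(),
//                              nullptr, 0, /*add_special=*/true, /*parse_special=*/false);
//   std::vector<llama_token> toks(n < 0 ? -n : n);
//   n = vocab.tokenize(text.c_str(), (int32_t) text.size(),
//                      toks.data(), (int32_t) toks.size(), true, false);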

std::vector<llama_token> llama_vocab::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    return pimpl->tokenize(raw_text, add_special, parse_special);
}

const std::string & llama_vocab::token_to_piece(llama_token token) const {
    return pimpl->token_to_piece(token);
}

int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    return pimpl->token_to_piece(token, buf, length, lstrip, special);
}

int32_t llama_vocab::detokenize(
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) const {
    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}

std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size()));
    int32_t n_chars = detokenize(tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(), false, special);
    if (n_chars < 0) {
        text.resize(-n_chars);
        n_chars = detokenize(tokens.data(), (int32_t) tokens.size(), &text[0], (int32_t) text.size(), false, special);
        GGML_ASSERT(n_chars <= (int32_t) text.size()); // whitespace trimming is performed after per-token detokenization
    }

    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
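
// Round-trip sketch using the std::string conveniences above (`vocab` is
// assumed to be a loaded llama_vocab; the input text is arbitrary):
//
//   std::vector<llama_token> toks = vocab.tokenize("Hello world", /*add_special=*/true, /*parse_special=*/false);
//   std::string text = vocab.detokenize(toks, /*special=*/false);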

void llama_vocab::print_info() const {
    pimpl->print_info();
}

//
// interface implementation
//

int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
    return vocab->n_tokens();
}

// deprecated
int32_t llama_n_vocab(const struct llama_vocab * vocab) {
    return llama_vocab_n_tokens(vocab);
}

enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
    return vocab->get_type();
}

const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_text(token);
}

float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_score(token);
}

enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_attr(token);
}

bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_eog(token);
}

bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_control(token);
}

llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
    return vocab->token_eos();
}

llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
    return vocab->token_eot();
}

// deprecated
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
    return vocab->token_sep();
}

llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
    return vocab->token_nl();
}

llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
    return vocab->token_pad();
}

bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
    return vocab->get_add_bos();
}

bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
    return vocab->get_add_eos();
}

bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
    return vocab->get_add_sep();
}

llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
    return vocab->token_fim_pre();
}

llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
    return vocab->token_fim_suf();
}

llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
    return vocab->token_fim_mid();
}

llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
    return vocab->token_fim_pad();
}

llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
    return vocab->token_fim_rep();
}

llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
    return vocab->token_fim_sep();
}

llama_token llama_vocab_mask(const struct llama_vocab * vocab) {
    return vocab->token_mask();
}
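
// All special-token getters above return LLAMA_TOKEN_NULL when the loaded
// vocab does not define the token, so callers should check before use (sketch):
//
//   llama_token bos = llama_vocab_bos(vocab);
//   if (bos != LLAMA_TOKEN_NULL) {
//       // safe to prepend bos
//   }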

// deprecated
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_text(vocab, token);
}

// deprecated
float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_score(vocab, token);
}

// deprecated
enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_attr(vocab, token);
}

// deprecated
bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_eog(vocab, token);
}

// deprecated
bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_control(vocab, token);
}

// deprecated
llama_token llama_token_bos(const struct llama_vocab * vocab) {
    return llama_vocab_bos(vocab);
}

// deprecated
llama_token llama_token_eos(const struct llama_vocab * vocab) {
    return llama_vocab_eos(vocab);
}

// deprecated
llama_token llama_token_eot(const struct llama_vocab * vocab) {
    return llama_vocab_eot(vocab);
}

// deprecated
llama_token llama_token_cls(const struct llama_vocab * vocab) {
    //return llama_vocab_cls(vocab);
    return llama_vocab_bos(vocab); // avoid deprecation warning
}

// deprecated
llama_token llama_token_sep(const struct llama_vocab * vocab) {
    return llama_vocab_sep(vocab);
}

// deprecated
llama_token llama_token_nl (const struct llama_vocab * vocab) {
    return llama_vocab_nl(vocab);
}

// deprecated
llama_token llama_token_pad(const struct llama_vocab * vocab) {
    return llama_vocab_pad(vocab);
}

// deprecated
bool llama_add_bos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_bos(vocab);
}

// deprecated
bool llama_add_eos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_eos(vocab);
}

// deprecated
llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pre(vocab);
}

// deprecated
llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
    return llama_vocab_fim_suf(vocab);
}

// deprecated
llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
    return llama_vocab_fim_mid(vocab);
}

// deprecated
llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pad(vocab);
}

// deprecated
llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_rep(vocab);
}

// deprecated
llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_sep(vocab);
}

//
// tokenization
//

int32_t llama_tokenize(
        const struct llama_vocab * vocab,
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_tokens_max,
        bool add_special,
        bool parse_special) {
    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}

int32_t llama_token_to_piece(
        const struct llama_vocab * vocab,
        llama_token token,
        char * buf,
        int32_t length,
        int32_t lstrip,
        bool special) {
    return vocab->token_to_piece(token, buf, length, lstrip, special);
}

int32_t llama_detokenize(
        const struct llama_vocab * vocab,
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) {
    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
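
// End-to-end sketch of the C tokenization API (buffer sizes and the input text
// are arbitrary assumptions; `vocab` comes from a loaded model):
//
//   const char * text = "Hello world";
//   llama_token toks[64];
//   int32_t n_tok = llama_tokenize(vocab, text, (int32_t) strlen(text),
//                                  toks, 64, /*add_special=*/true, /*parse_special=*/false);
//   char out[256];
//   int32_t n_out = llama_detokenize(vocab, toks, n_tok, out, (int32_t) sizeof(out),
//                                    /*remove_special=*/true, /*unparse_special=*/false);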