| 1 | #include "llama-vocab.h" |
| 2 | |
| 3 | #include "ggml.h" |
| 4 | #include "gguf.h" |
| 5 | #include "llama-impl.h" |
| 6 | #include "llama-model-loader.h" |
| 7 | |
| 8 | #include "unicode.h" |
| 9 | |
| 10 | #include <algorithm> |
| 11 | #include <cassert> |
| 12 | #include <cctype> |
| 13 | #include <cfloat> |
| 14 | #include <cmath> |
| 15 | #include <cstdarg> |
| 16 | #include <cstring> |
| 17 | #include <forward_list> |
| 18 | #include <limits> |
| 19 | #include <map> |
| 20 | #include <queue> |
| 21 | #include <set> |
| 22 | #include <unordered_map> |
| 23 | |
| 24 | // |
| 25 | // helpers |
| 26 | // |
| 27 | |
| 28 | struct naive_trie { |
| 29 | naive_trie() : has_value(false), value(0) { |
| 30 | } |
    void insert(const char * key, size_t len, int32_t value = 0) {
        if (len == 0) {
            this->has_value = true;
            this->value = value;
            return;
        }
        char c = key[0];
        auto res = children.find(c);
        if (res != children.end()) {
            res->second.insert(key + 1, len - 1, value);
        } else {
            auto res = children.insert(std::make_pair(c, naive_trie()));
            res.first->second.insert(key + 1, len - 1, value);
        }
    }
    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
        char c = key[offset];
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
        }

        return std::make_pair(key, offset);
    }
    const struct naive_trie * traverse(const char c) const {
        auto res = children.find(c);
        if (res != children.end()) {
            return &res->second;
        }

        return NULL;
    }
| 65 | } |
| 66 | std::map<char, struct naive_trie> children; |
| 67 | bool has_value; |
| 68 | llama_token value; |
| 69 | }; |
| 70 | |
| 71 | // |
| 72 | // tokenizers |
| 73 | // |
| 74 | |
| 75 | struct llm_tokenizer { |
| 76 | llm_tokenizer() {} |
| 77 | virtual ~llm_tokenizer() = default; |
| 78 | }; |
| 79 | |
| 80 | struct llm_symbol { |
| 81 | using index = int; |
| 82 | index prev; |
| 83 | index next; |
| 84 | const char * text; |
| 85 | size_t n; |
| 86 | }; |
| 87 | |
static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
| 89 | |
| 90 | // |
| 91 | // SPM tokenizer |
| 92 | // original implementation: |
| 93 | // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 |
| 94 | // |
| 95 | |
| 96 | struct llm_bigram_spm { |
| 97 | struct comparator { |
| 98 | bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) { |
| 99 | return (l.score < r.score) || (l.score == r.score && l.left > r.left); |
| 100 | } |
| 101 | }; |
| 102 | using queue_storage = std::vector<llm_bigram_spm>; |
| 103 | using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>; |
| 104 | llm_symbol::index left; |
| 105 | llm_symbol::index right; |
| 106 | float score; |
| 107 | size_t size; |
| 108 | }; |
| 109 | |
| 110 | struct llm_tokenizer_spm : llm_tokenizer { |
| 111 | llm_tokenizer_spm(const llama_vocab & /*vocab*/) {} |
| 112 | }; |
| 113 | |
| 114 | struct llm_tokenizer_spm_session { |
| 115 | llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {} |
| 116 | |
| 117 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
| 118 | // split string into utf8 chars |
| 119 | int index = 0; |
| 120 | size_t offs = 0; |
| 121 | while (offs < text.size()) { |
| 122 | llm_symbol sym; |
            size_t len = unicode_len_utf8(text[offs]);
            sym.text = text.c_str() + offs;
            sym.n = std::min(len, text.size() - offs);
            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
        for (int i = 1; i < (int) symbols.size(); ++i) {
            try_add_bigram(i - 1, i);
| 136 | } |
| 137 | |
        // keep substituting the highest-scoring pairs for as long as we can.
| 139 | while (!work_queue.empty()) { |
| 140 | auto bigram = work_queue.top(); |
| 141 | work_queue.pop(); |
| 142 | |
| 143 | auto & left_sym = symbols[bigram.left]; |
| 144 | auto & right_sym = symbols[bigram.right]; |
| 145 | |
| 146 | // if one of the symbols already got merged, skip it. |
| 147 | if (left_sym.n == 0 || right_sym.n == 0 || |
| 148 | left_sym.n + right_sym.n != bigram.size) { |
| 149 | continue; |
| 150 | } |
| 151 | |
| 152 | // merge the right sym into the left one |
| 153 | left_sym.n += right_sym.n; |
| 154 | right_sym.n = 0; |
| 155 | |
| 156 | //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size); |
| 157 | |
| 158 | // remove the right sym from the chain |
| 159 | left_sym.next = right_sym.next; |
| 160 | if (right_sym.next >= 0) { |
| 161 | symbols[right_sym.next].prev = bigram.left; |
| 162 | } |
| 163 | |
| 164 | // find more substitutions |
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
| 167 | } |
| 168 | |
| 169 | for (int i = 0; i != -1; i = symbols[i].next) { |
| 170 | auto & symbol = symbols[i]; |
| 171 | resegment(symbol, output); |
| 172 | } |
| 173 | } |
| 174 | |
| 175 | private: |
| 176 | void resegment(llm_symbol & symbol, std::vector<llama_token> & output) { |
| 177 | auto text = std::string(symbol.text, symbol.n); |
| 178 | auto token = vocab.text_to_token(text); |
| 179 | |
| 180 | // Do we need to support is_unused? |
| 181 | if (token != LLAMA_TOKEN_NULL) { |
            output.push_back(token);
            return;
        }

        const auto p = rev_merge.find(text);

        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int)symbol.n; ++j) {
                llama_token id = vocab.byte_to_token(symbol.text[j]);
                output.push_back(id);
            }
            return;
        }

        resegment(symbols[p->second.first],  output);
        resegment(symbols[p->second.second], output);
| 200 | } |
| 201 | |
| 202 | void try_add_bigram(int left, int right) { |
| 203 | if (left == -1 || right == -1) { |
| 204 | return; |
| 205 | } |
| 206 | const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n); |
| 207 | auto token = vocab.text_to_token(text); |
| 208 | |
| 209 | if (token == LLAMA_TOKEN_NULL) { |
| 210 | return; |
| 211 | } |
| 212 | |
| 213 | if (static_cast<uint32_t>(token) >= vocab.n_tokens()) { |
| 214 | return; |
| 215 | } |
| 216 | |
        const auto & tok_data = vocab.get_token_data(token);

        llm_bigram_spm bigram;
        bigram.left  = left;
        bigram.right = right;
        bigram.score = tok_data.score;
        bigram.size  = text.size();

        work_queue.push(bigram);

        // Do we need to support is_unused?
        rev_merge[text] = std::make_pair(left, right);
| 229 | } |
| 230 | |
| 231 | const llama_vocab & vocab; |
| 232 | // currently unused |
| 233 | // const llm_tokenizer_spm * spm_tokenizer; |
| 234 | |
| 235 | std::vector<llm_symbol> symbols; |
| 236 | llm_bigram_spm::queue work_queue; |
| 237 | std::map<std::string, std::pair<int, int>> rev_merge; |
| 238 | }; |
| 239 | |
| 240 | // |
| 241 | // BPE tokenizer |
| 242 | // adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] |
| 243 | // tried to simplify unicode stuff, so most likely does not work 100% correctly! |
| 244 | // |
| 245 | |
| 246 | // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused |
| 247 | |
| 248 | template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>> |
| 249 | class llama_priority_queue : public std::priority_queue<T, Container, Compare> { |
| 250 | public: |
| 251 | using std::priority_queue<T, Container, Compare>::priority_queue; |
| 252 | |
| 253 | T pop_move() { |
| 254 | T item = std::move(this->c.front()); |
| 255 | std::pop_heap(this->c.begin(), this->c.end(), this->comp); |
| 256 | this->c.pop_back(); |
| 257 | return item; |
| 258 | } |
| 259 | |
| 260 | void pop() = delete; |
| 261 | }; |
| 262 | |
| 263 | struct llm_bigram_bpe { |
| 264 | struct comparator { |
| 265 | bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const { |
| 266 | return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); |
| 267 | } |
| 268 | }; |
| 269 | |
| 270 | using queue_storage = std::vector<llm_bigram_bpe>; |
| 271 | using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>; |
| 272 | llm_symbol::index left; |
| 273 | llm_symbol::index right; |
| 274 | std::string text; |
| 275 | int rank; |
| 276 | size_t size; |
| 277 | }; |
| 278 | |
| 279 | struct llm_tokenizer_bpe : llm_tokenizer { |
| 280 | llm_tokenizer_bpe(const llama_vocab & vocab) { |
| 281 | GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE); |
| 282 | switch (vocab.get_pre_type()) { |
| 283 | case LLAMA_VOCAB_PRE_TYPE_LLAMA3: |
| 284 | regex_exprs = { |
| 285 | // original regex from tokenizer.json |
| 286 | //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
| 287 | |
| 288 | // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 |
| 289 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 290 | }; |
| 291 | break; |
| 292 | case LLAMA_VOCAB_PRE_TYPE_DBRX: |
| 293 | case LLAMA_VOCAB_PRE_TYPE_SMAUG: |
| 294 | regex_exprs = { |
| 295 | // same as llama3 |
| 296 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 297 | }; |
| 298 | break; |
| 299 | case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: |
| 300 | regex_exprs = { |
| 301 | "[\r\n]" , |
| 302 | "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+" , |
| 303 | "\\s?[!-/:-~!-/:-~‘-‟ -。]+" , |
| 304 | "\\s+$" , |
| 305 | "[一-龥ࠀ-一가-]+" , |
| 306 | "\\p{N}+" , |
| 307 | }; |
| 308 | break; |
| 309 | case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM: |
| 310 | case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE: |
| 311 | regex_exprs = { |
| 312 | "\\p{N}{1,3}" , |
| 313 | "[一-龥-ゟ゠-ヿ]+" , |
| 314 | "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+" , |
| 315 | }; |
| 316 | break; |
| 317 | case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: |
| 318 | regex_exprs = { |
| 319 | "[\r\n]" , |
| 320 | "\\s?\\p{L}+" , |
| 321 | "\\s?\\p{P}+" , |
| 322 | "[一-龥ࠀ-一가-]+" , |
| 323 | "\\p{N}" , |
| 324 | }; |
| 325 | break; |
| 326 | case LLAMA_VOCAB_PRE_TYPE_FALCON: |
| 327 | regex_exprs = { |
| 328 | "[\\p{P}\\$\\+<=>\\^~\\|`]+" , |
| 329 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" , |
| 330 | "[0-9][0-9][0-9]" , |
| 331 | }; |
| 332 | break; |
| 333 | case LLAMA_VOCAB_PRE_TYPE_STARCODER: |
| 334 | case LLAMA_VOCAB_PRE_TYPE_REFACT: |
| 335 | case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: |
| 336 | case LLAMA_VOCAB_PRE_TYPE_SMOLLM: |
| 337 | case LLAMA_VOCAB_PRE_TYPE_CODESHELL: |
| 338 | case LLAMA_VOCAB_PRE_TYPE_EXAONE: |
| 339 | case LLAMA_VOCAB_PRE_TYPE_MINERVA: |
| 340 | regex_exprs = { |
| 341 | "\\p{N}" , |
| 342 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" , |
| 343 | }; |
| 344 | break; |
| 345 | case LLAMA_VOCAB_PRE_TYPE_GPT2: |
| 346 | case LLAMA_VOCAB_PRE_TYPE_MPT: |
| 347 | case LLAMA_VOCAB_PRE_TYPE_OLMO: |
| 348 | case LLAMA_VOCAB_PRE_TYPE_JAIS: |
| 349 | case LLAMA_VOCAB_PRE_TYPE_TRILLION: |
| 350 | case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING: |
| 351 | regex_exprs = { |
| 352 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" , |
| 353 | }; |
| 354 | break; |
| 355 | case LLAMA_VOCAB_PRE_TYPE_STABLELM2: |
| 356 | case LLAMA_VOCAB_PRE_TYPE_QWEN2: |
| 357 | case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: |
| 358 | regex_exprs = { |
| 359 | // original regex from tokenizer.json |
| 360 | // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" |
| 361 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 362 | }; |
| 363 | break; |
| 364 | case LLAMA_VOCAB_PRE_TYPE_PORO: |
| 365 | case LLAMA_VOCAB_PRE_TYPE_BLOOM: |
| 366 | case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: |
| 367 | regex_exprs = { |
| 368 | " ?[^(\\s|.,!?…。,、।۔،)]+" , |
| 369 | }; |
| 370 | break; |
| 371 | case LLAMA_VOCAB_PRE_TYPE_CHATGLM4: |
| 372 | regex_exprs = { |
| 373 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 374 | }; |
| 375 | break; |
| 376 | case LLAMA_VOCAB_PRE_TYPE_VIKING: |
| 377 | regex_exprs = { |
| 378 | " ?[^(\\s|.,!?…。,、।۔،)]+" , |
| 379 | "\\p{N}" , |
| 380 | }; |
| 381 | break; |
| 382 | case LLAMA_VOCAB_PRE_TYPE_TEKKEN: |
| 383 | // original regex from tokenizer.json |
| 384 | // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" |
| 385 | regex_exprs = { |
| 386 | "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 387 | }; |
| 388 | break; |
| 389 | case LLAMA_VOCAB_PRE_TYPE_CHAMELEON: |
| 390 | // Note: in theory, the special token (sentinel and image token) regex_exprs below |
| 391 | // are unnecessary, as they are split in `tokenizer_st_partition` anyway. |
| 392 | // However, since the upstream pre-tokenizer uses them, they are also |
| 393 | // included here (see https://huggingface.co/facebook/chameleon-7b). |
| 394 | regex_exprs = { |
| 395 | "<sentinel:[0-9]+>" , // Sentinel tokens |
| 396 | "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z" , // Image tokens |
| 397 | "([\\t\\n]| | )" , // directly from tokenizer.json |
| 398 | "\\p{N}" , // Individual digits |
| 399 | "[\\p{P}!-/:-@\\[-`{-~]" , // Punctuation, Isolated |
| 400 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" , |
| 401 | }; |
| 402 | break; |
| 403 | case LLAMA_VOCAB_PRE_TYPE_GPT4O: |
| 404 | case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2: |
| 405 | regex_exprs = { |
| 406 | // original regex from tokenizer.json |
| 407 | // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", |
| 408 | "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 409 | }; |
| 410 | break; |
| 411 | case LLAMA_VOCAB_PRE_TYPE_KIMI_K2: |
| 412 | regex_exprs = { |
| 413 | // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp |
| 414 | // The custom handler implements all K2 patterns with proper Han character exclusion |
| 415 | "\\p{Han}+" , |
| 416 | }; |
| 417 | break; |
| 418 | case LLAMA_VOCAB_PRE_TYPE_SUPERBPE: |
| 419 | regex_exprs = { |
| 420 | "\\p{N}+" , |
| 421 | "(?=(\\d{3})+(?!\\d))" , |
| 422 | }; |
| 423 | break; |
| 424 | case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE: |
| 425 | regex_exprs = { |
| 426 | // original regex from tokenizer.json |
| 427 | // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+" |
| 428 | // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?) |
| 429 | "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+" , |
| 430 | }; |
| 431 | break; |
| 432 | case LLAMA_VOCAB_PRE_TYPE_SEED_CODER: |
| 433 | regex_exprs = { |
| 434 | // original regex from tokenizer.json |
| 435 | // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+" |
| 436 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 437 | }; |
| 438 | break; |
| 439 | case LLAMA_VOCAB_PRE_TYPE_GROK_2: |
| 440 | regex_exprs = { |
| 441 | // original regex from tokenizer.json |
| 442 | // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" |
| 443 | "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" , |
| 444 | }; |
| 445 | break; |
| 446 | default: |
| 447 | // default regex for BPE tokenization pre-processing |
| 448 | regex_exprs = { |
| 449 | "[\\p{P}\\$\\+<=>\\^~\\|]+" , |
| 450 | "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" , |
| 451 | "\\p{N}+" , |
| 452 | "[0-9][0-9][0-9]" , |
| 453 | }; |
| 454 | break; |
| 455 | } |
| 456 | } |
| 457 | |
| 458 | std::vector<std::string> regex_exprs; |
| 459 | }; |
| 460 | |
| 461 | struct llm_tokenizer_bpe_session { |
| 462 | llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} |
| 463 | |
| 464 | static void append(const llama_token token_id, std::vector<llama_token> & output) { |
        output.push_back(token_id);
| 466 | } |
| 467 | |
| 468 | bool append_bos(std::vector<llama_token> & output) const { |
| 469 | if (vocab.get_add_bos()) { |
| 470 | GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL); |
            output.push_back(vocab.token_bos());
| 472 | return true; |
| 473 | } |
| 474 | return false; |
| 475 | } |
| 476 | |
| 477 | bool append_eos(std::vector<llama_token> & output) const { |
| 478 | if (vocab.get_add_eos()) { |
| 479 | GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL); |
            output.push_back(vocab.token_eos());
| 481 | return true; |
| 482 | } |
| 483 | return false; |
| 484 | } |
| 485 | |
| 486 | void check_double_bos_eos(const std::vector<llama_token> & output) const { |
| 487 | if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) { |
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model, but the prompt "
                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added an EOS token to the prompt as specified by the model, but the prompt "
                "also ends with an EOS token. So now the final prompt ends with 2 EOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
| 498 | } |
| 499 | } |
| 500 | |
| 501 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
| 502 | int final_prev_index = -1; |
        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
| 504 | |
| 505 | symbols_final.clear(); |
| 506 | |
| 507 | for (const auto & word : word_collection) { |
| 508 | work_queue = llm_bigram_bpe::queue(); |
| 509 | symbols.clear(); |
| 510 | |
| 511 | int index = 0; |
| 512 | size_t offset = 0; |
| 513 | |
| 514 | //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { |
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
| 517 | offset = word.size(); |
| 518 | } |
| 519 | |
| 520 | while (offset < word.size()) { |
| 521 | llm_symbol sym; |
                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
                sym.next = offset == word.size() ? -1 : index + 1;
                index++;
                symbols.emplace_back(sym);
            }
            for (int i = 1; i < (int) symbols.size(); ++i) {
                add_new_bigram(i - 1, i);
| 533 | } |
| 534 | |
| 535 | // build token(s) |
| 536 | while (!work_queue.empty()) { |
| 537 | auto bigram = work_queue.pop_move(); |
| 538 | |
| 539 | auto & left_symbol = symbols[bigram.left]; |
| 540 | auto & right_symbol = symbols[bigram.right]; |
| 541 | |
| 542 | if (left_symbol.n == 0 || right_symbol.n == 0) { |
| 543 | continue; |
| 544 | } |
| 545 | std::string left_token = std::string(left_symbol.text, left_symbol.n); |
| 546 | std::string right_token = std::string(right_symbol.text, right_symbol.n); |
| 547 | if (left_token + right_token != bigram.text) { |
| 548 | continue; // Skip this bigram if it's outdated |
| 549 | } |
| 550 | |
| 551 | // merge the right sym into the left one |
| 552 | left_symbol.n += right_symbol.n; |
| 553 | right_symbol.n = 0; |
| 554 | |
| 555 | // remove the right sym from the chain |
| 556 | left_symbol.next = right_symbol.next; |
| 557 | if (right_symbol.next >= 0) { |
| 558 | symbols[right_symbol.next].prev = bigram.left; |
| 559 | } |
| 560 | |
                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
| 563 | } |
| 564 | |
| 565 | // add the finished tokens to the final list keeping correct order for next and prev |
| 566 | for (auto & sym : symbols) { |
| 567 | if (sym.n > 0) { |
| 568 | sym.prev = final_prev_index; |
| 569 | sym.next = -1; |
| 570 | if (final_prev_index != -1) { |
| 571 | symbols_final[final_prev_index].next = symbols_final.size(); |
| 572 | } |
| 573 | symbols_final.emplace_back(args&: sym); |
| 574 | final_prev_index = symbols_final.size() - 1; |
| 575 | } |
| 576 | } |
| 577 | } |
| 578 | |
| 579 | symbols = symbols_final; |
| 580 | |
| 581 | if (!symbols.empty()) { |
| 582 | for (int i = 0; i != -1; i = symbols[i].next) { |
| 583 | auto & symbol = symbols[i]; |
| 584 | if (symbol.n == 0) { |
| 585 | continue; |
| 586 | } |
| 587 | |
| 588 | const std::string str = std::string(symbol.text, symbol.n); |
                const auto token = vocab.text_to_token(str);

                if (token == LLAMA_TOKEN_NULL) {
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.text_to_token(byte_str);
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
                    }
                } else {
                    output.push_back(token);
| 601 | } |
| 602 | } |
| 603 | } |
| 604 | } |
| 605 | |
| 606 | private: |
| 607 | void add_new_bigram(int left, int right) { |
| 608 | if (left == -1 || right == -1) { |
| 609 | return; |
| 610 | } |
| 611 | std::string left_token = std::string(symbols[left].text, symbols[left].n); |
| 612 | std::string right_token = std::string(symbols[right].text, symbols[right].n); |
| 613 | |
| 614 | int rank_found = -1; |
| 615 | |
        rank_found = vocab.find_bpe_rank(left_token, right_token);
| 617 | |
| 618 | if (rank_found < 0) { |
| 619 | return; |
| 620 | } |
| 621 | |
| 622 | llm_bigram_bpe bigram; |
| 623 | |
| 624 | bigram.left = left; |
| 625 | bigram.right = right; |
| 626 | bigram.text = left_token + right_token; |
| 627 | bigram.size = left_token.size() + right_token.size(); |
| 628 | bigram.rank = rank_found; |
| 629 | |
        work_queue.push(bigram);
| 631 | } |
| 632 | |
| 633 | const llama_vocab & vocab; |
| 634 | const llm_tokenizer_bpe & tokenizer; |
| 635 | |
| 636 | std::vector<llm_symbol> symbols; |
| 637 | std::vector<llm_symbol> symbols_final; |
| 638 | llm_bigram_bpe::queue work_queue; |
| 639 | }; |
| 640 | |
| 641 | // |
| 642 | // WPM tokenizer |
| 643 | // |
| 644 | |
| 645 | struct llm_tokenizer_wpm : llm_tokenizer { |
| 646 | llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {} |
| 647 | }; |
| 648 | |
| 649 | struct llm_tokenizer_wpm_session { |
| 650 | llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {} |
| 651 | |
| 652 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
| 653 | // normalize and split by whitespace |
| 654 | std::vector<std::string> words = preprocess(text); |
| 655 | // bos token prepended already |
| 656 | |
| 657 | // find the longest tokens that form the words |
| 658 | for (const std::string & word : words) { |
| 659 | // skip empty words |
| 660 | if (word.size() == 0) { |
| 661 | continue; |
| 662 | } |
| 663 | |
| 664 | // prepend phantom space |
| 665 | const std::string word1 = "\xe2\x96\x81" + word; |
| 666 | const int n = word1.size(); |
| 667 | |
| 668 | const size_t current_tokens = output.size(); |
| 669 | |
| 670 | // we're at the start of a new word |
| 671 | // move through character position in word |
| 672 | for (int i = 0; i < n; ++i) { |
| 673 | // loop through possible match length |
| 674 | bool match = false; |
                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
                    auto id = vocab.text_to_token(word1.substr(i, j - i));
                    if (id != LLAMA_TOKEN_NULL) {
                        output.push_back(id);
| 679 | match = true; |
| 680 | i = j - 1; |
| 681 | break; |
| 682 | } |
| 683 | } |
| 684 | |
| 685 | if (!match) { // discard all |
                    output.resize(current_tokens);
| 687 | break; // and discard next tokens |
| 688 | } |
| 689 | } |
| 690 | |
| 691 | // we didn't find any matches for this word |
| 692 | if (current_tokens == output.size()) { |
                output.push_back(vocab.token_unk());
| 694 | } |
| 695 | } |
| 696 | } |
| 697 | |
| 698 | // TODO: reduce string copies by using cpts_offs array |
| 699 | static std::vector<std::string> preprocess(const std::string & text) { |
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");
| 702 | |
| 703 | for (const uint32_t cpt : cpts_nfd) { |
| 704 | const auto flags = unicode_cpt_flags_from_cpt(cpt); |
| 705 | |
| 706 | if (flags.is_whitespace) { |
| 707 | if (words.back().size()) { // finish previous word if any |
| 708 | words.emplace_back(); |
| 709 | } |
| 710 | continue; |
| 711 | } |
| 712 | |
            assert(!flags.is_separator);
            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
                continue;
            }

            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
| 719 | if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { |
| 720 | if (words.back().size()) { // finish previous word if any |
| 721 | words.emplace_back(); |
| 722 | } |
| 723 | words.back() = s; // single char word |
| 724 | words.emplace_back(); // start a new word |
| 725 | } else { |
| 726 | words.back() += s; // append char to word |
| 727 | } |
| 728 | } |
| 729 | |
| 730 | if (!words.back().size()) { |
| 731 | words.pop_back(); |
| 732 | } |
| 733 | |
| 734 | return words; |
| 735 | } |
| 736 | |
| 737 | static bool is_chinese_char(uint32_t cpt) { |
| 738 | return |
| 739 | (cpt >= 0x04E00 && cpt <= 0x09FFF) || |
| 740 | (cpt >= 0x03400 && cpt <= 0x04DBF) || |
| 741 | (cpt >= 0x20000 && cpt <= 0x2A6DF) || |
| 742 | (cpt >= 0x2A700 && cpt <= 0x2B73F) || |
| 743 | (cpt >= 0x2B740 && cpt <= 0x2B81F) || |
| 744 | (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920 |
| 745 | (cpt >= 0x0F900 && cpt <= 0x0FAFF) || |
| 746 | (cpt >= 0x2F800 && cpt <= 0x2FA1F); |
| 747 | //(cpt >= 0x3000 && cpt <= 0x303F) || |
| 748 | //(cpt >= 0xFF00 && cpt <= 0xFFEF); |
| 749 | } |
| 750 | |
| 751 | private: |
| 752 | const llama_vocab & vocab; |
| 753 | // currently unused |
| 754 | // const llm_tokenizer_wpm * wpm_tokenizer; |
| 755 | }; |
| 756 | |
| 757 | // |
| 758 | // UGM tokenizer |
| 759 | // |
| 760 | |
| 761 | struct llm_tokenizer_ugm : llm_tokenizer { |
| 762 | llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) { |
| 763 | if (precompiled_charsmap.size() > 0) { |
| 764 | size_t charsmap_offset = 0; |
| 765 | |
            // The first four bytes of the precompiled charsmap contain the length of the
            // binary blob containing XOR-compressed compact double array (XCDA) entries
            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
| 772 | } |
| 773 | |
| 774 | // Next xcda_blob_size bytes contain entries of XOR-compressed compact |
| 775 | // double array (XCDA). Each entry is bit-packed into a 32-bit integer. |
| 776 | xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset]; |
| 777 | xcda_array_size = xcda_blob_size / sizeof(uint32_t); |
| 778 | charsmap_offset += xcda_blob_size; |
| 779 | |
| 780 | // Remaining bytes of precompiled charsmap contain null-terminated |
| 781 | // replacement strings for prefixes matched by the XCDA. |
| 782 | prefix_replacements = &precompiled_charsmap[charsmap_offset]; |
| 783 | prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset; |
| 784 | } |
| 785 | |
| 786 | for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { |
| 787 | const auto & token_data = vocab.get_token_data(id); |
| 788 | |
| 789 | if (vocab.is_normal(id)) { |
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (vocab.is_normal(id) ||
                vocab.is_user_defined(id) ||
                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
| 802 | } |
| 803 | } |
| 804 | |
| 805 | unknown_token_score = min_score - unknown_token_score_penalty; |
| 806 | } |
| 807 | |
| 808 | // escaped space symbol - U+2581 (Lower One Eighth Block) |
| 809 | const std::string escaped_space = "\xE2\x96\x81" ; |
| 810 | |
| 811 | const char * prefix_replacements = NULL; |
| 812 | size_t prefix_replacements_size = 0; |
| 813 | |
| 814 | const uint32_t * xcda_array = NULL; |
| 815 | size_t xcda_array_size = 0; |
| 816 | |
| 817 | struct naive_trie user_defined_token_matcher; |
| 818 | |
| 819 | float min_score = FLT_MAX; |
| 820 | float max_score = -FLT_MAX; |
| 821 | |
| 822 | float unknown_token_score_penalty = 10.0; |
| 823 | float unknown_token_score; |
| 824 | |
| 825 | struct naive_trie token_matcher; |
| 826 | }; |
| 827 | |
| 828 | struct llm_tokenizer_ugm_session { |
| 829 | llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} |
| 830 | |
| 831 | /* This implementation is based on SentencePiece optimized Viterbi algorithm for |
| 832 | * unigram language models. The general idea is to: |
| 833 | * - move along the input sequence in steps of one UTF code point, |
| 834 | * - at each step find all possible tokenizations of the prefix by |
| 835 | * traversing the tokens trie, |
| 836 | * - for each tokenization store the best one so far (by higher score) |
| 837 | * - use the position in sequence after given token as an index to store |
| 838 | * results |
| 839 | * - if there was no valid tokenization of the current UTF code point |
| 840 | * then use unknown token with additional score penalty |
| 841 | * After processing the whole sequence we backtrack from the end to get |
| 842 | * the best tokenization. |
| 843 | */ |
| 844 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
| 845 | // get current size of output (for reversal later) |
| 846 | size_t output_size = output.size(); |
| 847 | |
| 848 | // normalize the input first |
| 849 | std::string normalized; |
        normalize(text, &normalized);
| 851 | size_t input_len = normalized.size(); |
| 852 | if (input_len == 0) { |
| 853 | return; |
| 854 | } |
| 855 | |
        // initialize score_sum to -DBL_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
| 860 | |
| 861 | for (size_t input_offset = 0; input_offset < input_len;) { |
| 862 | size_t prefix_offset = input_offset; |
| 863 | // calculate how many code units are in the currently processed UTF code point |
            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);
| 865 | |
| 866 | // traverse the token matcher trie to find a matching token |
| 867 | bool single_codepoint_token_found = false; |
| 868 | const struct best_tokenization & current_best = tokenization_results[input_offset]; |
            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
| 870 | |
| 871 | while (prefix_offset <= input_len && node != NULL) { |
| 872 | // check if we found valid token in prefix |
| 873 | if (node->has_value) { |
| 874 | // check if it corresponds to the whole UTF code point |
| 875 | if (prefix_offset - input_offset == n_utf8_code_units) { |
| 876 | single_codepoint_token_found = true; |
| 877 | } |
| 878 | llama_token token_id = node->value; |
                    const auto & token_data = vocab.get_token_data(token_id);

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                        current_champ = challenger;
                    }
                }
                node = node->traverse(normalized[prefix_offset++]);
| 894 | } |
| 895 | |
| 896 | // if we didn't find a valid token corresponding to the whole UTF code point |
| 897 | // then use unknown token as the tokenization of this UTF code point |
| 898 | if (!single_codepoint_token_found) { |
| 899 | const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score; |
| 900 | prefix_offset = input_offset + n_utf8_code_units; |
| 901 | struct best_tokenization & current_champ = tokenization_results[prefix_offset]; |
| 902 | if (challenger_score > current_champ.score_sum) { |
                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
| 904 | current_champ = challenger; |
| 905 | } |
| 906 | } |
| 907 | |
| 908 | // move to the next UTF code point |
| 909 | input_offset += n_utf8_code_units; |
| 910 | } |
| 911 | |
| 912 | // now backtrack from the end to gather token ids of the best tokenization |
| 913 | // merge sequences of consecutive unknown tokens into single unknown tokens |
| 914 | bool is_prev_unknown = false; |
| 915 | for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) { |
| 916 | bool is_unknown = tokenization.token_id == vocab.token_unk(); |
| 917 | if (!(is_prev_unknown && is_unknown)) { |
                output.push_back(tokenization.token_id);
| 919 | } |
| 920 | if (tokenization.input_offset == 0) { |
| 921 | break; |
| 922 | } |
| 923 | is_prev_unknown = is_unknown; |
| 924 | } |
| 925 | |
| 926 | // reverse the output since we added tokens starting from the end of the input |
        std::reverse(output.begin() + output_size, output.end());
| 928 | } |
| 929 | |
| 930 | private: |
| 931 | |
| 932 | // helper structure for returning normalization results |
| 933 | struct normalization_result { |
| 934 | const char * normalized; |
| 935 | size_t normalized_len; |
| 936 | size_t consumed_input; |
| 937 | }; |
| 938 | |
| 939 | void normalize(const std::string& input, std::string * normalized) { |
| 940 | normalized->clear(); |
        normalized->reserve(input.size() * 3);

        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
| 944 | |
| 945 | const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix(); |
| 946 | const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix(); |
| 947 | const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces(); |
| 948 | |
| 949 | bool is_space_prepended = false; |
| 950 | bool processing_non_ws = false; |
| 951 | |
| 952 | size_t input_len = input.size(); |
| 953 | |
| 954 | for (size_t input_offset = 0; input_offset < input_len; ) { |
| 955 | auto norm_res = normalize_prefix(input, input_offset); |
| 956 | for (size_t i = 0; i < norm_res.normalized_len; i++) { |
| 957 | char c = norm_res.normalized[i]; |
| 958 | if (c != ' ') { |
| 959 | if (!processing_non_ws) { |
| 960 | processing_non_ws = true; |
| 961 | if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) { |
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
| 973 | } |
| 974 | } |
| 975 | } |
| 976 | |
| 977 | input_offset += norm_res.consumed_input; |
| 978 | } |
| 979 | |
| 980 | if (shall_append_space) { |
            normalized->append(space);
| 982 | } |
| 983 | } |
| 984 | |
| 985 | /* |
| 986 | * This structure is a view wrapper for XOR-compressed double array (XCDA) |
| 987 | * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries. |
| 988 | * Each bit-packed entry contains: |
| 989 | * - BASE array value in bits 10-30 |
| 990 | * - LCHECK array value in bits 0-7 |
| 991 | * - LEAF array value in bit 9 |
| 992 | * Entries containing indexes of replacement sequences have set bit 31 |
| 993 | */ |
| 994 | struct xcda_array_view { |
| 995 | public: |
| 996 | xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) { |
| 997 | } |
| 998 | uint32_t get_base(size_t index) { |
| 999 | uint32_t packed_node = get_node(index); |
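            // BASE is stored in bits 10-30; when bit 9 is set, the stored value is
            // additionally shifted left by 8 ((packed_node & (1U << 9)) >> 6 == 8)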
| 1000 | return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6); |
| 1001 | } |
| 1002 | uint32_t get_lcheck(size_t index) { |
| 1003 | uint32_t packed_node = get_node(index); |
| 1004 | return packed_node & ((1U << 31) | 0xff); |
| 1005 | } |
| 1006 | bool get_leaf(size_t index) { |
| 1007 | uint32_t packed_node = get_node(index); |
| 1008 | return (packed_node >> 8) & 1; |
| 1009 | } |
| 1010 | uint32_t get_value(size_t index) { |
| 1011 | uint32_t packed_node = get_node(index); |
| 1012 | return packed_node & ((1U << 31) - 1); |
| 1013 | } |
| 1014 | private: |
| 1015 | uint32_t get_node(size_t index) { |
            if (index >= xcda_array_size) {
                throw std::runtime_error("Index out of array bounds in XCDA array!");
| 1018 | } |
| 1019 | return xcda_array[index]; |
| 1020 | } |
| 1021 | const uint32_t * xcda_array; |
| 1022 | size_t xcda_array_size; |
| 1023 | }; |
| 1024 | |
| 1025 | // this structure stores the best tokenization so far at input_offset |
| 1026 | struct best_tokenization { |
| 1027 | llama_token token_id; |
| 1028 | size_t input_offset; |
| 1029 | double score_sum; |
| 1030 | }; |
| 1031 | |
| 1032 | struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) { |
| 1033 | if (input_offset == input.size()) { |
            return { &input[input_offset], 0, 0 };
| 1035 | } |
| 1036 | |
        // if the input prefix matches some user-defined token, return this token as the normalization result
        auto user_defined_token_match =
            tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
| 1042 | } |
| 1043 | |
| 1044 | size_t longest_prefix_length = 0; |
| 1045 | size_t longest_prefix_offset = 0; |
| 1046 | |
| 1047 | if (tokenizer.xcda_array_size > 0) { |
| 1048 | struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size); |
| 1049 | |
| 1050 | // Find the longest normalized sequence matching the input prefix by walking |
| 1051 | // the XOR-compressed compact double array (XCDA) starting from the root node |
| 1052 | // We find the index of the next node by calculating BASE[s] ^ c where s is |
| 1053 | // the index of the previous node and c is a numerical character value |
| 1054 | uint32_t node_index = 0; |
| 1055 | // get BASE of the root node |
            node_index = xcda_view.get_base(node_index);
| 1057 | for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) { |
| 1058 | unsigned char c = input[prefix_offset]; |
| 1059 | if (c == 0) { |
| 1060 | break; |
| 1061 | } |
| 1062 | node_index ^= c; |
| 1063 | // if value of LCHECK is not c it means that this is not a child of |
| 1064 | // the previous node, so we stop matching |
                if (xcda_view.get_lcheck(node_index) != c) {
| 1066 | break; |
| 1067 | } |
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing the index of the replacement sequence for the currently matched input prefix
                if (is_leaf) {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
| 1079 | } |
| 1080 | } |
| 1081 | |
| 1082 | if (longest_prefix_length > 0) { |
| 1083 | // we have a match, so return the replacement sequence |
            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
| 1089 | } |
| 1090 | |
| 1091 | // check if the input prefix contains a valid sequence of UTF-8 code units |
| 1092 | try { |
| 1093 | // if yes, return this sequence unmodified |
            size_t prefix_offset = input_offset;
            unicode_cpt_from_utf8(input, prefix_offset);
            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
        } catch (std::invalid_argument & /*ex*/) {
            // if not, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
            return { "\xEF\xBF\xBD", 3, 1 };
| 1100 | } |
| 1101 | } |
| 1102 | |
| 1103 | const llama_vocab & vocab; |
| 1104 | const llm_tokenizer_ugm & tokenizer; |
| 1105 | }; |
| 1106 | |
| 1107 | // |
| 1108 | // RWKV tokenizer |
| 1109 | // |
| 1110 | |
| 1111 | static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) { |
| 1112 | std::vector<uint8_t> output; |
    output.reserve(escaped.size());
| 1114 | |
| 1115 | // Parser state |
| 1116 | bool escaping = false; |
| 1117 | uint8_t hex_remaining = 0; |
| 1118 | uint8_t hex_acc = 0; |
| 1119 | |
| 1120 | // Step through characters, performing parsing |
| 1121 | for (const char & c : escaped) { |
| 1122 | // If we're parsing a hex code, interpret the next character |
| 1123 | if (hex_remaining != 0) { |
| 1124 | uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0'); |
| 1125 | hex_acc = (hex_acc << 4) + value; |
| 1126 | |
| 1127 | hex_remaining -= 1; |
| 1128 | if (hex_remaining == 0) { |
                output.push_back(hex_acc);
| 1130 | hex_acc = 0; |
| 1131 | } |
| 1132 | |
| 1133 | continue; |
| 1134 | } |
| 1135 | |
| 1136 | // If we got an escape character, interpret it |
| 1137 | if (escaping) { |
            if (c == 't') {
                output.push_back('\t');
            } else if (c == 'n') {
                output.push_back('\n');
            } else if (c == 'r') {
                output.push_back('\r');
            } else if (c == 'x') {
                hex_remaining = 2;
            } else {
                output.push_back(c);
| 1148 | } |
| 1149 | |
| 1150 | escaping = false; |
| 1151 | continue; |
| 1152 | } |
| 1153 | |
| 1154 | if (c == '\\') { |
| 1155 | escaping = true; |
| 1156 | continue; |
| 1157 | } |
| 1158 | |
        output.push_back(c);
| 1160 | } |
| 1161 | |
| 1162 | return output; |
| 1163 | } |
| 1164 | |
| 1165 | struct llm_tokenizer_rwkv : llm_tokenizer { |
| 1166 | llm_tokenizer_rwkv(const llama_vocab & vocab) { |
| 1167 | // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens. |
| 1168 | // For now, we decode the vocab here into the lookup we'll use for tokenization. |
| 1169 | |
| 1170 | // build trie |
| 1171 | for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { |
| 1172 | const auto & data = vocab.get_token_data(id); |
            const auto text = llama_unescape_rwkv_token(data.text);
            token_matcher.insert((const char *) text.data(), text.size(), id);
| 1175 | } |
| 1176 | } |
| 1177 | |
| 1178 | struct naive_trie token_matcher; |
| 1179 | }; |
| 1180 | |
| 1181 | struct llm_tokenizer_rwkv_session { |
| 1182 | llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} |
| 1183 | |
| 1184 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
| 1185 | uint32_t position = 0; |
| 1186 | while (position < text.size()) { |
            const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
            if (node == NULL) {
                // no matching token found, add unknown token
                output.push_back(vocab.token_unk());
| 1191 | position += 1; |
| 1192 | continue; |
| 1193 | } |
| 1194 | |
| 1195 | // traverse the trie to find the longest matching token |
| 1196 | uint32_t token_id = 0; |
| 1197 | uint32_t token_length = 0; |
| 1198 | while (node != NULL) { |
| 1199 | if (node->has_value) { |
| 1200 | token_id = node->value; |
| 1201 | token_length = position + 1; |
| 1202 | } |
                node = node->traverse(text[++position]);
| 1204 | } |
| 1205 | |
| 1206 | // add the longest matching token |
            output.push_back(token_id);
| 1208 | position = token_length; |
| 1209 | } |
| 1210 | } |
| 1211 | |
| 1212 | private: |
| 1213 | const llama_vocab & vocab; |
| 1214 | const llm_tokenizer_rwkv & tokenizer; |
| 1215 | }; |
| 1216 | |
| 1217 | struct llm_tokenizer_plamo2 : llm_tokenizer { |
| 1218 | llm_tokenizer_plamo2(const llama_vocab & vocab) { |
| 1219 | build(vocab); |
| 1220 | } |
| 1221 | |
| 1222 | void build(const llama_vocab & vocab) { |
| 1223 | // Reset internal structures |
| 1224 | tokens_.clear(); |
        bytes_.assign(256, 0);
| 1226 | to_suffix_id_.clear(); |
| 1227 | table_.clear(); |
| 1228 | |
| 1229 | // Build token list and byte mapping |
| 1230 | std::unordered_map<std::string, float> suffix_to_score; |
| 1231 | std::unordered_map<std::string, llama_token> token_to_id; |
| 1232 | |
| 1233 | for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) { |
            const auto & entry = vocab.get_token_data(token_id);
            tokens_.push_back(entry.text);
| 1236 | token_to_id[entry.text] = static_cast<llama_token>(token_id); |
| 1237 | |
| 1238 | // Handle byte tokens |
            if (vocab.is_byte(token_id)) {
                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
                    std::string hex_str = entry.text.substr(3, 2);
                    int byte_val = std::stoi(hex_str, nullptr, 16);
| 1243 | bytes_[byte_val] = static_cast<llama_token>(token_id); |
| 1244 | } |
| 1245 | continue; |
| 1246 | } |
| 1247 | |
| 1248 | // Add token and all its suffixes to suffix_to_score |
| 1249 | suffix_to_score[entry.text] = entry.score; |
| 1250 | |
| 1251 | // Extract suffixes character by character (UTF-8 aware) |
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
            for (size_t i = 1; i < cpts.size(); ++i) {
                std::string suffix;
                for (size_t j = i; j < cpts.size(); ++j) {
                    suffix += unicode_cpt_to_utf8(cpts[j]);
                }
                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
| 1259 | suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN(); |
| 1260 | } |
| 1261 | } |
| 1262 | } |
| 1263 | |
| 1264 | // Check that all byte tokens are set |
| 1265 | for (int i = 0; i < 256; ++i) { |
| 1266 | if (bytes_[i] == 0) { |
| 1267 | throw std::runtime_error("Byte token for <0x" + std::to_string(val: i) + "> is not set" ); |
| 1268 | } |
| 1269 | } |
| 1270 | |
| 1271 | // Build suffix list in lexicographical order of reversed strings |
| 1272 | std::vector<std::string> suffixes; |
| 1273 | for (const auto & pair : suffix_to_score) { |
            suffixes.push_back(pair.first);
        }
        suffixes.push_back("");  // Empty suffix

        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
| 1279 | std::string rev_a(a.rbegin(), a.rend()); |
| 1280 | std::string rev_b(b.rbegin(), b.rend()); |
| 1281 | return rev_a < rev_b; |
| 1282 | }); |
| 1283 | |
| 1284 | // Build suffix_to_id and to_suffix_id_ |
| 1285 | std::unordered_map<std::string, int32_t> suffix_to_id; |
| 1286 | int32_t num_pieces = 0; |
| 1287 | |
| 1288 | for (const auto & suffix : suffixes) { |
| 1289 | suffix_to_id[suffix] = num_pieces; |
| 1290 | if (!suffix.empty()) { |
                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);

                std::string remaining;
                for (size_t i = 1; i < cpts.size(); ++i) {
                    remaining += unicode_cpt_to_utf8(cpts[i]);
                }
| 1297 | |
| 1298 | int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining]; |
| 1299 | to_suffix_id_[piece_code] = num_pieces; |
| 1300 | |
| 1301 | // Count number of pieces for this suffix |
| 1302 | int32_t pieces_for_suffix = 1; // sentinel row |
| 1303 | for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) { |
| 1304 | std::string piece; |
| 1305 | for (int32_t i = 0; i < piece_length; ++i) { |
                        piece += unicode_cpt_to_utf8(cpts[i]);
                    }
                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
| 1309 | pieces_for_suffix++; |
| 1310 | } |
| 1311 | } |
| 1312 | num_pieces += pieces_for_suffix; |
| 1313 | } else { |
| 1314 | num_pieces++; // Empty suffix contributes one piece (sentinel row) |
| 1315 | } |
| 1316 | } |
| 1317 | |
| 1318 | // Build flattened table |
        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
| 1320 | int32_t table_idx = 0; |
| 1321 | |
| 1322 | for (const auto & suffix : suffixes) { |
| 1323 | // Add all prefixes of the suffix to the table (in decreasing order of length) |
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                std::string piece;
                for (int32_t i = 0; i < piece_length; ++i) {
                    piece += unicode_cpt_to_utf8(cpts[i]);
                }

                auto score_it = suffix_to_score.find(piece);
| 1332 | if (score_it == suffix_to_score.end()) { |
| 1333 | continue; |
| 1334 | } |
| 1335 | |
| 1336 | table_[table_idx][TABLE_PIECE_LENGTH] = piece_length; |
                auto token_it = token_to_id.find(piece);
| 1338 | table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1; |
| 1339 | |
| 1340 | float score = score_it->second; |
                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
| 1343 | table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece]; |
| 1344 | |
| 1345 | table_idx++; |
| 1346 | } |
| 1347 | |
| 1348 | // Add sentinel row |
| 1349 | table_[table_idx][TABLE_PIECE_LENGTH] = 1; |
| 1350 | table_[table_idx][TABLE_TOKEN_ID] = -1; |
| 1351 | table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE; |
| 1352 | table_idx++; |
| 1353 | } |
| 1354 | } |
| 1355 | |
| 1356 | std::vector<llama_token> encode(const std::string & text) const { |
        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
        // Skip the first code point if it is a BOM (Byte Order Mark)
        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
            unicode_data.erase(unicode_data.begin());
| 1361 | } |
| 1362 | |
| 1363 | if (unicode_data.empty()) { |
| 1364 | return {}; |
| 1365 | } |
| 1366 | |
| 1367 | const size_t data_len = unicode_data.size(); |
| 1368 | |
| 1369 | // Initialize scores array (dynamic programming) |
| 1370 | std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60); |
| 1371 | scores[data_len] = 0; |
| 1372 | |
| 1373 | // Path array to track best tokenization |
| 1374 | std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0)); |
| 1375 | |
| 1376 | int32_t suffix_id = 0; |
| 1377 | |
| 1378 | // Process from end to beginning |
| 1379 | for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) { |
| 1380 | uint32_t c = unicode_data[i]; |
| 1381 | |
| 1382 | // Find next suffix ID |
| 1383 | for (size_t p = suffix_id; p < table_.size(); ++p) { |
| 1384 | int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID]; |
| 1385 | auto it = to_suffix_id_.find(x: piece_code); |
| 1386 | suffix_id = (it != to_suffix_id_.end()) ? it->second : 0; |
| 1387 | |
| 1388 | if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) { |
| 1389 | break; |
| 1390 | } |
| 1391 | } |
| 1392 | |
| 1393 | // Update best path |
| 1394 | for (size_t p = suffix_id; p < table_.size(); ++p) { |
| 1395 | int32_t score = table_[p][TABLE_SCORE]; |
| 1396 | if (score > INVALID_SCORE) { |
| 1397 | int32_t piece_length = table_[p][TABLE_PIECE_LENGTH]; |
| 1398 | int64_t s = scores[i + piece_length] - score; |
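| | // (table scores are scaled log-probabilities, typically negative, so subtracting
| | //  one accumulates cost; minimizing scores[i] maximizes the segmentation's total score)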
| 1399 | |
| 1400 | if (s < scores[i]) { |
| 1401 | scores[i] = s; |
| 1402 | path[i][PATH_TOKEN_LENGTH] = piece_length; |
| 1403 | path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID]; |
| 1404 | path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1; |
| 1405 | |
| 1406 | if (score == UNKNOWN_SCORE) { |
| 1407 | // Add UTF-8 byte count |
| 1408 | path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); |
| 1409 | } |
| 1410 | } |
| 1411 | } |
| 1412 | |
| 1413 | if (score == UNKNOWN_SCORE) { |
| 1414 | break; |
| 1415 | } |
| 1416 | } |
| 1417 | } |
| 1418 | |
| 1419 | // Decode the best path |
| 1420 | std::vector<llama_token> token_ids; |
| 1421 | token_ids.reserve(n: path[0][PATH_NUM_TOKENS]); |
| 1422 | |
| 1423 | int pos = 0; |
| 1424 | while (pos < static_cast<int>(data_len)) { |
| 1425 | if (path[pos][PATH_TOKEN_ID] >= 0) { |
| 1426 | token_ids.push_back(x: path[pos][PATH_TOKEN_ID]); |
| 1427 | } else { |
| 1428 | // Fall back to byte tokens |
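| | // Worked example of the byte-fallback arithmetic below (editor's note): for
| | // c = U+3042, s = 1 + 1 + 1 + 0 = 3 bytes; the lead-byte mask (0xF00 >> 3) & 0xFF
| | // = 0xE0 is the UTF-8 3-byte prefix 1110xxxx, and each continuation byte ORs
| | // 0x80 with a 6-bit payload, yielding 0xE3 0x81 0x82.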
| 1429 | uint32_t c = unicode_data[pos]; |
| 1430 | int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); |
| 1431 | |
| 1432 | for (int i = 0; i < s; ++i) { |
| 1433 | uint8_t b; |
| 1434 | if (s == 1) { |
| 1435 | b = c; |
| 1436 | } else { |
| 1437 | if (i == 0) { |
| 1438 | b = (0xF00 >> s) & 0xFF; |
| 1439 | } else { |
| 1440 | b = 0x80; |
| 1441 | } |
| 1442 | } |
| 1443 | token_ids.push_back(x: bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]); |
| 1444 | } |
| 1445 | } |
| 1446 | |
| 1447 | assert(path[pos][PATH_TOKEN_LENGTH] > 0); |
| 1448 | pos += path[pos][PATH_TOKEN_LENGTH]; |
| 1449 | } |
| 1450 | |
| 1451 | return token_ids; |
| 1452 | } |
| 1453 | private: |
| 1454 | // Constants for table structure |
| 1455 | static constexpr int32_t TABLE_PIECE_LENGTH = 0; |
| 1456 | static constexpr int32_t TABLE_TOKEN_ID = 1; |
| 1457 | static constexpr int32_t TABLE_SCORE = 2; |
| 1458 | static constexpr int32_t TABLE_PIECE_ID = 3; |
| 1459 | |
| 1460 | // Constants for path array |
| 1461 | static constexpr int32_t PATH_TOKEN_LENGTH = 0; |
| 1462 | static constexpr int32_t PATH_TOKEN_ID = 1; |
| 1463 | static constexpr int32_t PATH_NUM_TOKENS = 2; |
| 1464 | |
| 1465 | // Score constants |
| 1466 | static constexpr int32_t INVALID_SCORE = -20000000; |
| 1467 | static constexpr int32_t UNKNOWN_SCORE = -10000000; |
| 1468 | |
| 1469 | // List of tokens in the vocabulary |
| 1470 | std::vector<std::string> tokens_; |
| 1471 | |
| 1472 | // Mapping from byte code point to token ID (for byte fallback) |
| 1473 | std::vector<llama_token> bytes_; |
| 1474 | |
| 1475 | // Mapping from piece code to suffix ID |
| 1476 | std::unordered_map<int64_t, int32_t> to_suffix_id_; |
| 1477 | |
| 1478 | // Flattened table representing the Trie structure |
| 1479 | // Each row contains: [piece_length, token_id, score, piece_id] |
| 1480 | std::vector<std::vector<int32_t>> table_; |
| 1481 | }; |
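| | // Hedged usage sketch (editor's illustration, not part of the original sources):
| | // given a vocab loaded for a PLaMo-2 model, encoding is a single call; the
| | // Viterbi-style DP in encode() guarantees that any input decomposes into
| | // vocabulary pieces or, failing that, byte-fallback tokens.
| | //
| | //   llm_tokenizer_plamo2 tok(vocab);   // builds the flattened trie table once
| | //   std::vector<llama_token> ids = tok.encode("hello world");
| | //   // ids now holds the minimum-cost segmentation under the piece scores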
| 1482 | |
| 1483 | struct llm_tokenizer_plamo2_session { |
| 1484 | llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {} |
| 1485 | |
| 1486 | void tokenize(const std::string & text, std::vector<llama_token> & output) { |
| 1487 | std::vector<llama_token> tokens = tokenizer.encode(text); |
| 1488 | output.insert(position: output.end(), first: tokens.begin(), last: tokens.end()); |
| 1489 | } |
| 1490 | |
| 1491 | private: |
| 1492 | const llm_tokenizer_plamo2 & tokenizer; |
| 1493 | }; |
| 1494 | |
| 1495 | // |
| 1496 | // impl |
| 1497 | // |
| 1498 | |
| 1499 | typedef enum FRAGMENT_BUFFER_VARIANT_TYPE { |
| 1500 | FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, |
| 1501 | FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT |
| 1502 | } FRAGMENT_BUFFER_VARIANT_TYPE; |
| 1503 | |
| 1504 | struct fragment_buffer_variant { |
| 1505 | fragment_buffer_variant(llama_token _token) |
| 1506 | : |
| 1507 | type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), |
| 1508 | token(_token), |
| 1509 | raw_text(_dummy), |
| 1510 | offset(0), |
| 1511 | length(0) {} |
| 1512 | |
| 1513 | fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) |
| 1514 | : |
| 1515 | type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), |
| 1516 | token((llama_token) - 1), |
| 1517 | raw_text(_raw_text), |
| 1518 | offset(_offset), |
| 1519 | length(_length) {
| 1520 | GGML_ASSERT(_offset >= 0); |
| 1521 | GGML_ASSERT(_length >= 1); |
| 1522 | GGML_ASSERT(offset + length <= raw_text.length()); |
| 1523 | } |
| 1524 | |
| 1525 | const FRAGMENT_BUFFER_VARIANT_TYPE type; |
| 1526 | const llama_token token; |
| 1527 | const std::string _dummy; |
| 1528 | const std::string & raw_text; |
| 1529 | const uint64_t offset; |
| 1530 | const uint64_t length; |
| 1531 | }; |
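| | // Hedged sketch of how this variant is used (editor's illustration): tokenization
| | // seeds the buffer with one RAW_TEXT fragment spanning the whole input, and
| | // tokenizer_st_partition() (below) then splits it around special tokens:
| | //
| | //   std::forward_list<fragment_buffer_variant> buffer;
| | //   buffer.emplace_front(raw_text, /*offset*/ 0, /*length*/ (int64_t) raw_text.size());
| | //   // -> may become e.g. RAW_TEXT | TOKEN | RAW_TEXT after partitioning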
| 1532 | |
| 1533 | struct llama_vocab::impl { |
| 1534 | uint32_t n_token_types = 0; // for BERT-style token types |
| 1535 | |
| 1536 | std::string tokenizer_model; |
| 1537 | std::string tokenizer_pre; |
| 1538 | |
| 1539 | enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; |
| 1540 | enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 1541 | |
| 1542 | int max_token_len = 0; // used for optimizing longest token search |
| 1543 | |
| 1544 | // default LLaMA special tokens |
| 1545 | // TODO: should we set all of these to LLAMA_TOKEN_NULL? |
| 1546 | llama_token special_bos_id = 1; |
| 1547 | llama_token special_eos_id = 2; |
| 1548 | llama_token special_eot_id = LLAMA_TOKEN_NULL; |
| 1549 | llama_token special_eom_id = LLAMA_TOKEN_NULL; |
| 1550 | llama_token special_unk_id = 0; |
| 1551 | llama_token special_sep_id = LLAMA_TOKEN_NULL; |
| 1552 | llama_token special_pad_id = LLAMA_TOKEN_NULL; |
| 1553 | llama_token special_mask_id = LLAMA_TOKEN_NULL; |
| 1554 | |
| 1555 | llama_token linefeed_id = 13; |
| 1556 | |
| 1557 | // fim tokens |
| 1558 | llama_token special_fim_pre_id = LLAMA_TOKEN_NULL; |
| 1559 | llama_token special_fim_suf_id = LLAMA_TOKEN_NULL; |
| 1560 | llama_token special_fim_mid_id = LLAMA_TOKEN_NULL; |
| 1561 | llama_token special_fim_pad_id = LLAMA_TOKEN_NULL; |
| 1562 | llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repository name
| 1563 | llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator |
| 1564 | |
| 1565 | // tokenizer flags |
| 1566 | bool add_space_prefix = false; |
| 1567 | bool add_bos = false; |
| 1568 | bool add_eos = false; |
| 1569 | bool add_sep = false; |
| 1570 | bool ignore_merges = false; |
| 1571 | bool clean_spaces = false; // clean_up_tokenization_spaces |
| 1572 | bool remove_extra_whitespaces = false;
| 1573 | bool escape_whitespaces = true; |
| 1574 | bool treat_whitespace_as_suffix = false; |
| 1575 | |
| 1576 | std::unordered_map<std::string, llama_token> token_to_id; |
| 1577 | std::vector<token_data> id_to_token; |
| 1578 | |
| 1579 | std::vector<llama_token> cache_special_tokens; |
| 1580 | std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true); |
| 1581 | struct pair_hash { |
| 1582 | size_t operator()(const std::pair<std::string, std::string> & p) const { |
| 1583 | return std::hash<std::string>{}(p.first) ^ // combine the two string hashes into a single hash for the pair
| 1584 | (std::hash<std::string>{}(p.second) << 1);
| 1585 | } |
| 1586 | }; |
| 1587 | std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks; |
| 1588 | |
| 1589 | // set of all tokens that cause "end of generation" |
| 1590 | std::set<llama_token> special_eog_ids; |
| 1591 | |
| 1592 | std::unique_ptr<llm_tokenizer> tokenizer; |
| 1593 | |
| 1594 | std::vector<char> precompiled_charsmap; |
| 1595 | |
| 1596 | impl(const llama_vocab & vocab) : vocab(vocab) { |
| 1597 | } |
| 1598 | |
| 1599 | ~impl() = default; |
| 1600 | |
| 1601 | void load(llama_model_loader & ml, const LLM_KV & kv); |
| 1602 | |
| 1603 | enum llama_vocab_type get_type() const; |
| 1604 | |
| 1605 | std::string type_name() const; |
| 1606 | |
| 1607 | bool is_normal (llama_token id) const; |
| 1608 | bool is_unknown (llama_token id) const; |
| 1609 | bool is_control (llama_token id) const; |
| 1610 | bool is_byte (llama_token id) const; |
| 1611 | bool is_user_defined(llama_token id) const; |
| 1612 | bool is_unused (llama_token id) const; |
| 1613 | bool is_eog (llama_token id) const; |
| 1614 | |
| 1615 | uint8_t token_to_byte(llama_token id) const; |
| 1616 | |
| 1617 | llama_token_attr token_get_attr(llama_token id) const; |
| 1618 | |
| 1619 | void init_tokenizer(enum llama_vocab_type type); |
| 1620 | |
| 1621 | void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const; |
| 1622 | |
| 1623 | std::string token_to_piece_for_cache( |
| 1624 | llama_token token, |
| 1625 | bool special) const; |
| 1626 | |
| 1627 | |
| 1628 | std::vector<llama_token> tokenize( |
| 1629 | const std::string & raw_text, |
| 1630 | bool add_special, |
| 1631 | bool parse_special = false) const; |
| 1632 | |
| 1633 | int32_t tokenize( |
| 1634 | const char * text, |
| 1635 | int32_t text_len, |
| 1636 | llama_token * tokens, |
| 1637 | int32_t n_tokens_max, |
| 1638 | bool add_special, |
| 1639 | bool parse_special) const; |
| 1640 | |
| 1641 | // does not write null-terminator to buf |
| 1642 | int32_t token_to_piece( |
| 1643 | llama_token token, |
| 1644 | char * buf, |
| 1645 | int32_t length, |
| 1646 | int32_t lstrip, |
| 1647 | bool special) const; |
| 1648 | |
| 1649 | // use cached data |
| 1650 | const std::string & token_to_piece(llama_token token) const; |
| 1651 | |
| 1652 | int32_t detokenize( |
| 1653 | const llama_token * tokens, |
| 1654 | int32_t n_tokens, |
| 1655 | char * text, |
| 1656 | int32_t text_len_max, |
| 1657 | bool remove_special, |
| 1658 | bool unparse_special) const; |
| 1659 | |
| 1660 | std::string detokenize( |
| 1661 | const std::vector<llama_token> & tokens, |
| 1662 | bool special) const; |
| 1663 | |
| 1664 | void print_info() const; |
| 1665 | |
| 1666 | private: |
| 1667 | const llama_vocab & vocab; |
| 1668 | }; |
| 1669 | |
| 1670 | void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { |
| 1671 | struct gguf_context * ctx = ml.meta.get(); |
| 1672 | |
| 1673 | // determine vocab type |
| 1674 | { |
| 1675 | ml.get_key(kid: LLM_KV_TOKENIZER_MODEL, result&: tokenizer_model); |
| 1676 | ml.get_key(kid: LLM_KV_TOKENIZER_PRE, result&: tokenizer_pre, required: false); |
| 1677 | |
| 1678 | ml.get_key(kid: LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, result&: n_token_types, required: false); |
| 1679 | |
| 1680 | if (tokenizer_model == "no_vocab" || tokenizer_model == "none" ) { |
| 1681 | type = LLAMA_VOCAB_TYPE_NONE; |
| 1682 | |
| 1683 | // default special tokens |
| 1684 | special_bos_id = LLAMA_TOKEN_NULL; |
| 1685 | special_eos_id = LLAMA_TOKEN_NULL; |
| 1686 | special_unk_id = LLAMA_TOKEN_NULL; |
| 1687 | special_sep_id = LLAMA_TOKEN_NULL; |
| 1688 | special_pad_id = LLAMA_TOKEN_NULL; |
| 1689 | special_mask_id = LLAMA_TOKEN_NULL; |
| 1690 | linefeed_id = LLAMA_TOKEN_NULL; |
| 1691 | |
| 1692 | // read vocab size from metadata |
| 1693 | uint32_t n_tokens = 0; |
| 1694 | if (ml.get_key(kid: LLM_KV_VOCAB_SIZE, result&: n_tokens, required: false)) { |
| 1695 | LLAMA_LOG_WARN("%s: adding %u dummy tokens\n" , __func__, n_tokens); |
| 1696 | id_to_token.resize(new_size: n_tokens); |
| 1697 | } |
| 1698 | |
| 1699 | return; |
| 1700 | } |
| 1701 | |
| 1702 | if (tokenizer_model == "llama" ) { |
| 1703 | type = LLAMA_VOCAB_TYPE_SPM; |
| 1704 | |
| 1705 | // default special tokens |
| 1706 | special_bos_id = 1; |
| 1707 | special_eos_id = 2; |
| 1708 | special_unk_id = 0; |
| 1709 | special_sep_id = LLAMA_TOKEN_NULL; |
| 1710 | special_pad_id = LLAMA_TOKEN_NULL; |
| 1711 | special_mask_id = LLAMA_TOKEN_NULL; |
| 1712 | } else if (tokenizer_model == "bert" ) { |
| 1713 | type = LLAMA_VOCAB_TYPE_WPM; |
| 1714 | |
| 1715 | // default special tokens |
| 1716 | special_bos_id = 101; |
| 1717 | special_eos_id = LLAMA_TOKEN_NULL; |
| 1718 | special_unk_id = 100; |
| 1719 | special_sep_id = 102; |
| 1720 | special_pad_id = 0; |
| 1721 | special_mask_id = 103; |
| 1722 | |
| 1723 | add_sep = true; |
| 1724 | } else if (tokenizer_model == "gpt2" ) { |
| 1725 | type = LLAMA_VOCAB_TYPE_BPE; |
| 1726 | |
| 1727 | // read bpe merges and populate bpe ranks |
| 1728 | const int merges_keyidx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_MERGES).c_str()); |
| 1729 | if (merges_keyidx == -1) { |
| 1730 | throw std::runtime_error("cannot find tokenizer merges in model file\n" ); |
| 1731 | } |
| 1732 | |
| 1733 | const int n_merges = gguf_get_arr_n(ctx, key_id: merges_keyidx); |
| 1734 | for (int i = 0; i < n_merges; i++) { |
| 1735 | const std::string word = gguf_get_arr_str(ctx, key_id: merges_keyidx, i); |
| 1736 | //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); |
| 1737 | |
| 1738 | std::string first; |
| 1739 | std::string second; |
| 1740 | |
| 1741 | const size_t pos = word.find(c: ' ', pos: 1); |
| 1742 | |
| 1743 | if (pos != std::string::npos) { |
| 1744 | first = word.substr(pos: 0, n: pos); |
| 1745 | second = word.substr(pos: pos + 1); |
| 1746 | } |
| 1747 | |
| 1748 | bpe_ranks.emplace(args: std::make_pair(x&: first, y&: second), args&: i); |
| 1749 | } |
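| | // Example: a merge entry "a b" splits at the first space past index 1 into
| | // first = "a" and second = "b"; the search starts at index 1, presumably so a
| | // merge whose first symbol is itself a space is not split at position 0. The
| | // array index i becomes the pair's rank, and lower ranks are merged first.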
| 1750 | |
| 1751 | // default special tokens |
| 1752 | special_bos_id = 11; |
| 1753 | special_eos_id = 11; |
| 1754 | special_unk_id = LLAMA_TOKEN_NULL; |
| 1755 | special_sep_id = LLAMA_TOKEN_NULL; |
| 1756 | special_pad_id = LLAMA_TOKEN_NULL; |
| 1757 | special_mask_id = LLAMA_TOKEN_NULL; |
| 1758 | } else if (tokenizer_model == "t5" ) { |
| 1759 | type = LLAMA_VOCAB_TYPE_UGM; |
| 1760 | |
| 1761 | // default special tokens |
| 1762 | special_bos_id = LLAMA_TOKEN_NULL; |
| 1763 | special_eos_id = 1; |
| 1764 | special_unk_id = 2; |
| 1765 | special_sep_id = LLAMA_TOKEN_NULL; |
| 1766 | special_pad_id = 0; |
| 1767 | special_mask_id = LLAMA_TOKEN_NULL; |
| 1768 | |
| 1769 | const int precompiled_charsmap_keyidx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); |
| 1770 | if (precompiled_charsmap_keyidx != -1) { |
| 1771 | const gguf_type pc_type = gguf_get_arr_type(ctx, key_id: precompiled_charsmap_keyidx); |
| 1772 | GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); |
| 1773 | |
| 1774 | const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, key_id: precompiled_charsmap_keyidx); |
| 1775 | const char * pc = (const char *) gguf_get_arr_data(ctx, key_id: precompiled_charsmap_keyidx); |
| 1776 | precompiled_charsmap.assign(first: pc, last: pc + n_precompiled_charsmap); |
| 1777 | #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
| 1778 | // correct endianness of data in precompiled_charsmap binary blob
| 1779 | uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0]; |
| 1780 | *xcda_blob_size = __builtin_bswap32(*xcda_blob_size); |
| 1781 | assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap); |
| 1782 | size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t); |
| 1783 | uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)]; |
| 1784 | for (size_t i = 0; i < xcda_array_size; ++i) { |
| 1785 | xcda_array[i] = __builtin_bswap32(xcda_array[i]); |
| 1786 | } |
| 1787 | #endif |
| 1788 | } |
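| | // (the precompiled_charsmap blob begins with a uint32 giving the byte size of
| | //  the XCDA trie array that follows it, which is why the big-endian branch
| | //  above swaps that header first and then each array entry)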
| 1789 | } else if (tokenizer_model == "rwkv" ) { |
| 1790 | type = LLAMA_VOCAB_TYPE_RWKV; |
| 1791 | |
| 1792 | // default special tokens |
| 1793 | special_bos_id = LLAMA_TOKEN_NULL; |
| 1794 | special_eos_id = LLAMA_TOKEN_NULL; |
| 1795 | special_unk_id = LLAMA_TOKEN_NULL; |
| 1796 | special_sep_id = LLAMA_TOKEN_NULL; |
| 1797 | special_pad_id = LLAMA_TOKEN_NULL; |
| 1798 | } else if (tokenizer_model == "plamo2" ) { |
| 1799 | type = LLAMA_VOCAB_TYPE_PLAMO2; |
| 1800 | |
| 1801 | // PLaMo-2 default special tokens (these will be overridden by model config) |
| 1802 | special_bos_id = 1; // <|plamo:bos|> |
| 1803 | special_eos_id = 2; // <|plamo:eos|> |
| 1804 | special_unk_id = 0; // <|plamo:unk|> |
| 1805 | special_sep_id = LLAMA_TOKEN_NULL; |
| 1806 | special_pad_id = 3; // <|plamo:pad|> |
| 1807 | special_mask_id = LLAMA_TOKEN_NULL; |
| 1808 | } else { |
| 1809 | throw std::runtime_error(format(fmt: "unknown tokenizer: '%s'" , tokenizer_model.c_str())); |
| 1810 | } |
| 1811 | |
| 1812 | // for now, only BPE models have pre-tokenizers |
| 1813 | if (type == LLAMA_VOCAB_TYPE_BPE) { |
| 1814 | add_space_prefix = false; |
| 1815 | clean_spaces = true; |
| 1816 | if (tokenizer_pre.empty()) { |
| 1817 | LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n" , __func__); |
| 1818 | LLAMA_LOG_WARN("%s: \n" , __func__); |
| 1819 | LLAMA_LOG_WARN("%s: ************************************ \n" , __func__); |
| 1820 | LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n" , __func__); |
| 1821 | LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n" , __func__); |
| 1822 | LLAMA_LOG_WARN("%s: ************************************ \n" , __func__); |
| 1823 | LLAMA_LOG_WARN("%s: \n" , __func__); |
| 1824 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 1825 | } else if (tokenizer_pre == "default" ) { |
| 1826 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 1827 | } else if ( |
| 1828 | tokenizer_pre == "llama3" || |
| 1829 | tokenizer_pre == "llama-v3" || |
| 1830 | tokenizer_pre == "llama-bpe" || |
| 1831 | tokenizer_pre == "falcon3" || |
| 1832 | tokenizer_pre == "falcon-h1" || |
| 1833 | tokenizer_pre == "pixtral" || |
| 1834 | tokenizer_pre == "midm-2.0" || |
| 1835 | tokenizer_pre == "lfm2" ) { |
| 1836 | pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; |
| 1837 | ignore_merges = true; |
| 1838 | add_bos = true; |
| 1839 | } else if ( |
| 1840 | tokenizer_pre == "deepseek-llm" ) { |
| 1841 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM; |
| 1842 | clean_spaces = false; |
| 1843 | } else if ( |
| 1844 | tokenizer_pre == "deepseek-coder" ) { |
| 1845 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER; |
| 1846 | clean_spaces = false; |
| 1847 | } else if ( |
| 1848 | tokenizer_pre == "deepseek-v3" ) { |
| 1849 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; |
| 1850 | clean_spaces = false; |
| 1851 | } else if ( |
| 1852 | tokenizer_pre == "falcon" ) { |
| 1853 | pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON; |
| 1854 | } else if ( |
| 1855 | tokenizer_pre == "mpt" ) { |
| 1856 | pre_type = LLAMA_VOCAB_PRE_TYPE_MPT; |
| 1857 | } else if ( |
| 1858 | tokenizer_pre == "starcoder" ) { |
| 1859 | pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER; |
| 1860 | } else if ( |
| 1861 | tokenizer_pre == "gpt-2" || |
| 1862 | tokenizer_pre == "phi-2" || |
| 1863 | tokenizer_pre == "jina-es" || |
| 1864 | tokenizer_pre == "jina-de" || |
| 1865 | tokenizer_pre == "gigachat" || |
| 1866 | tokenizer_pre == "jina-v2-es" || |
| 1867 | tokenizer_pre == "jina-v2-de" || |
| 1868 | tokenizer_pre == "a.x-4.0" || |
| 1869 | tokenizer_pre == "mellum" ) { |
| 1870 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; |
| 1871 | } else if ( |
| 1872 | tokenizer_pre == "jina-v1-en" || |
| 1873 | tokenizer_pre == "jina-v2-code" || |
| 1874 | tokenizer_pre == "roberta-bpe" ) { |
| 1875 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; |
| 1876 | add_sep = true; |
| 1877 | } else if ( |
| 1878 | tokenizer_pre == "refact" ) { |
| 1879 | pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT; |
| 1880 | } else if ( |
| 1881 | tokenizer_pre == "command-r" ) { |
| 1882 | pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; |
| 1883 | clean_spaces = false; |
| 1884 | } else if ( |
| 1885 | tokenizer_pre == "qwen2" || |
| 1886 | tokenizer_pre == "deepseek-r1-qwen" ) { |
| 1887 | pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; |
| 1888 | clean_spaces = false; |
| 1889 | } else if ( |
| 1890 | tokenizer_pre == "stablelm2" ) { |
| 1891 | pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2; |
| 1892 | } else if ( |
| 1893 | tokenizer_pre == "olmo" ) { |
| 1894 | pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO; |
| 1895 | } else if ( |
| 1896 | tokenizer_pre == "dbrx" ) { |
| 1897 | pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX; |
| 1898 | } else if ( |
| 1899 | tokenizer_pre == "smaug-bpe" ) { |
| 1900 | pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG; |
| 1901 | } else if ( |
| 1902 | tokenizer_pre == "poro-chat" ) { |
| 1903 | pre_type = LLAMA_VOCAB_PRE_TYPE_PORO; |
| 1904 | clean_spaces = false; |
| 1905 | } else if ( |
| 1906 | tokenizer_pre == "glm4" || |
| 1907 | tokenizer_pre == "chatglm-bpe" ) { |
| 1908 | pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; |
| 1909 | special_bos_id = LLAMA_TOKEN_NULL; |
| 1910 | } else if ( |
| 1911 | tokenizer_pre == "viking" ) { |
| 1912 | pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING; |
| 1913 | clean_spaces = false; |
| 1914 | } else if ( |
| 1915 | tokenizer_pre == "jais" ) { |
| 1916 | pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS; |
| 1917 | } else if ( |
| 1918 | tokenizer_pre == "tekken" ) { |
| 1919 | pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN; |
| 1920 | clean_spaces = false; |
| 1921 | ignore_merges = true; |
| 1922 | add_bos = true; |
| 1923 | } else if ( |
| 1924 | tokenizer_pre == "smollm" ) { |
| 1925 | pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM; |
| 1926 | clean_spaces = false; |
| 1927 | } else if ( |
| 1928 | tokenizer_pre == "codeshell" ) { |
| 1929 | pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL; |
| 1930 | } else if ( |
| 1931 | tokenizer_pre == "bloom" ) { |
| 1932 | pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM; |
| 1933 | } else if ( |
| 1934 | tokenizer_pre == "gpt3-finnish" ) { |
| 1935 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH; |
| 1936 | } else if ( |
| 1937 | tokenizer_pre == "exaone" ) { |
| 1938 | pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE; |
| 1939 | } else if ( |
| 1940 | tokenizer_pre == "exaone4" ) { |
| 1941 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; |
| 1942 | } else if ( |
| 1943 | tokenizer_pre == "chameleon" ) { |
| 1944 | pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; |
| 1945 | add_bos = true; |
| 1946 | clean_spaces = false; |
| 1947 | } else if ( |
| 1948 | tokenizer_pre == "minerva-7b" ) { |
| 1949 | pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA; |
| 1950 | } else if ( |
| 1951 | tokenizer_pre == "megrez" ) { |
| 1952 | pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; |
| 1953 | } else if ( |
| 1954 | tokenizer_pre == "gpt-4o" || |
| 1955 | tokenizer_pre == "llama4" ) { |
| 1956 | pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; |
| 1957 | clean_spaces = false; |
| 1958 | } else if ( |
| 1959 | tokenizer_pre == "superbpe" ) { |
| 1960 | pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE; |
| 1961 | clean_spaces = false; |
| 1962 | } else if ( |
| 1963 | tokenizer_pre == "trillion" ) { |
| 1964 | pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION; |
| 1965 | clean_spaces = false; |
| 1966 | } else if ( |
| 1967 | tokenizer_pre == "granite-docling" ) { |
| 1968 | pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING; |
| 1969 | clean_spaces = false; |
| 1970 | } else if ( |
| 1971 | tokenizer_pre == "bailingmoe" || |
| 1972 | tokenizer_pre == "bailingmoe2" || |
| 1973 | tokenizer_pre == "llada-moe" ) { |
| 1974 | pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; |
| 1975 | clean_spaces = false; |
| 1976 | } else if ( |
| 1977 | tokenizer_pre == "seed-coder" ) { |
| 1978 | pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; |
| 1979 | clean_spaces = false; |
| 1980 | } else if ( |
| 1981 | tokenizer_pre == "hunyuan" ) { |
| 1982 | pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; |
| 1983 | clean_spaces = false; |
| 1984 | } else if ( |
| 1985 | tokenizer_pre == "hunyuan-dense" ) { |
| 1986 | pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE; |
| 1987 | clean_spaces = false; |
| 1988 | } else if ( |
| 1989 | tokenizer_pre == "kimi-k2" ) { |
| 1990 | pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2; |
| 1991 | clean_spaces = false; |
| 1992 | } else if ( |
| 1993 | tokenizer_pre == "grok-2" ) { |
| 1994 | pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2; |
| 1995 | clean_spaces = false; |
| 1996 | } else if ( |
| 1997 | tokenizer_pre == "minimax-m2" ) { |
| 1998 | pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; |
| 1999 | clean_spaces = false; |
| 2000 | } else { |
| 2001 | throw std::runtime_error(format(fmt: "unknown pre-tokenizer type: '%s'" , tokenizer_pre.c_str())); |
| 2002 | } |
| 2003 | } else if (type == LLAMA_VOCAB_TYPE_SPM) { |
| 2004 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 2005 | add_space_prefix = true; |
| 2006 | clean_spaces = false; |
| 2007 | add_bos = true; |
| 2008 | add_eos = false; |
| 2009 | } else if (type == LLAMA_VOCAB_TYPE_WPM) { |
| 2010 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 2011 | add_space_prefix = false; |
| 2012 | clean_spaces = true; |
| 2013 | add_bos = true; |
| 2014 | add_eos = false; |
| 2015 | add_sep = true; |
| 2016 | } else if (type == LLAMA_VOCAB_TYPE_UGM) { |
| 2017 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 2018 | add_bos = false; |
| 2019 | add_eos = true; |
| 2020 | } else if (type == LLAMA_VOCAB_TYPE_RWKV) { |
| 2021 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 2022 | add_space_prefix = false; |
| 2023 | clean_spaces = false; |
| 2024 | add_bos = false; |
| 2025 | add_eos = false; |
| 2026 | } else { |
| 2027 | pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; |
| 2028 | } |
| 2029 | |
| 2030 | ml.get_key(kid: LLM_KV_TOKENIZER_ADD_PREFIX, result&: add_space_prefix, required: false); |
| 2031 | ml.get_key(kid: LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, result&: remove_extra_whitespaces, required: false); |
| 2032 | } |
| 2033 | |
| 2034 | const int token_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_LIST).c_str()); |
| 2035 | if (token_idx == -1) { |
| 2036 | throw std::runtime_error("cannot find tokenizer vocab in model file\n" ); |
| 2037 | } |
| 2038 | |
| 2039 | const float * scores = nullptr; |
| 2040 | const int score_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_SCORES).c_str()); |
| 2041 | if (score_idx != -1) { |
| 2042 | scores = (const float * ) gguf_get_arr_data(ctx, key_id: score_idx); |
| 2043 | } |
| 2044 | |
| 2045 | const int * toktypes = nullptr; |
| 2046 | const int toktype_idx = gguf_find_key(ctx, key: kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); |
| 2047 | if (toktype_idx != -1) { |
| 2048 | toktypes = (const int * ) gguf_get_arr_data(ctx, key_id: toktype_idx); |
| 2049 | } |
| 2050 | |
| 2051 | uint32_t n_tokens = gguf_get_arr_n(ctx, key_id: token_idx); |
| 2052 | id_to_token.resize(new_size: n_tokens); |
| 2053 | |
| 2054 | for (uint32_t i = 0; i < n_tokens; i++) { |
| 2055 | std::string word = gguf_get_arr_str(ctx, key_id: token_idx, i); |
| 2056 | if (word.empty()) { |
| 2057 | LLAMA_LOG_WARN("%s: empty token at index %u\n" , __func__, i); |
| 2058 | word = "[EMPTY_" + std::to_string(val: i) + "]" ; |
| 2059 | } |
| 2060 | |
| 2061 | token_to_id[word] = i; |
| 2062 | max_token_len = std::max(a: max_token_len, b: (int) word.size()); |
| 2063 | |
| 2064 | auto & token_data = id_to_token[i]; |
| 2065 | token_data.text = std::move(word); |
| 2066 | token_data.score = scores ? scores[i] : 0.0f; |
| 2067 | token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; |
| 2068 | |
| 2069 | if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file |
| 2070 | switch(toktypes[i]) { |
| 2071 | case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break; |
| 2072 | case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break; |
| 2073 | case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break; |
| 2074 | case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break; |
| 2075 | case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break; |
| 2076 | case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break; |
| 2077 | case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; |
| 2078 | default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; |
| 2079 | } |
| 2080 | } |
| 2081 | } |
| 2082 | GGML_ASSERT(id_to_token.size() == token_to_id.size()); |
| 2083 | |
| 2084 | init_tokenizer(type); |
| 2085 | |
| 2086 | // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' |
| 2087 | if (type == LLAMA_VOCAB_TYPE_SPM) { |
| 2088 | try { |
| 2089 | linefeed_id = vocab.byte_to_token(ch: '\n'); |
| 2090 | } catch (const std::exception & e) { |
| 2091 | LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.\n" , __func__, e.what());
| 2092 | linefeed_id = special_pad_id; |
| 2093 | } |
| 2094 | } else if (type == LLAMA_VOCAB_TYPE_WPM) { |
| 2095 | linefeed_id = special_pad_id; |
| 2096 | } else if (type == LLAMA_VOCAB_TYPE_RWKV) { |
| 2097 | const std::vector<int> ids = tokenize(raw_text: "\n" , add_special: false); |
| 2098 | GGML_ASSERT(!ids.empty() && "model vocab missing newline token" ); |
| 2099 | linefeed_id = ids[0]; |
| 2100 | } else { |
| 2101 | const std::vector<int> ids = tokenize(raw_text: "\n" , add_special: false); |
| 2102 | |
| 2103 | //GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); |
| 2104 | if (ids.empty()) { |
| 2105 | LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n" , __func__); |
| 2106 | linefeed_id = special_pad_id; |
| 2107 | } else { |
| 2108 | linefeed_id = ids[0]; |
| 2109 | } |
| 2110 | } |
| 2111 | |
| 2112 | // special tokens |
| 2113 | { |
| 2114 | const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = { |
| 2115 | { LLM_KV_TOKENIZER_BOS_ID, special_bos_id }, |
| 2116 | { LLM_KV_TOKENIZER_EOS_ID, special_eos_id }, |
| 2117 | { LLM_KV_TOKENIZER_EOT_ID, special_eot_id }, |
| 2118 | { LLM_KV_TOKENIZER_EOM_ID, special_eom_id }, |
| 2119 | { LLM_KV_TOKENIZER_UNK_ID, special_unk_id }, |
| 2120 | { LLM_KV_TOKENIZER_SEP_ID, special_sep_id }, |
| 2121 | { LLM_KV_TOKENIZER_PAD_ID, special_pad_id }, |
| 2122 | { LLM_KV_TOKENIZER_MASK_ID, special_mask_id }, |
| 2123 | { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id }, |
| 2124 | { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id }, |
| 2125 | { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id }, |
| 2126 | { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id }, |
| 2127 | { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id }, |
| 2128 | { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id }, |
| 2129 | |
| 2130 | // deprecated |
| 2131 | { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id }, |
| 2132 | { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id }, |
| 2133 | { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id }, |
| 2134 | }; |
| 2135 | |
| 2136 | for (const auto & it : special_token_types) { |
| 2137 | const std::string & key = kv(std::get<0>(in: it)); |
| 2138 | int32_t & id = std::get<1>(in: it); |
| 2139 | |
| 2140 | uint32_t new_id; |
| 2141 | if (!ml.get_key(kid: std::get<0>(in: it), result&: new_id, required: false)) { |
| 2142 | continue; |
| 2143 | } |
| 2144 | if (new_id >= id_to_token.size()) { |
| 2145 | LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n" , |
| 2146 | __func__, key.c_str(), new_id, id); |
| 2147 | } else { |
| 2148 | id = new_id; |
| 2149 | } |
| 2150 | } |
| 2151 | |
| 2152 | // Handle add_bos, add_eos and add_sep |
| 2153 | { |
| 2154 | bool temp = true; |
| 2155 | |
| 2156 | if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_BOS, result&: temp, required: false)) { |
| 2157 | add_bos = temp; |
| 2158 | } |
| 2159 | if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_EOS, result&: temp, required: false)) { |
| 2160 | add_eos = temp; |
| 2161 | } |
| 2162 | if (ml.get_key(kid: LLM_KV_TOKENIZER_ADD_SEP, result&: temp, required: false)) { |
| 2163 | add_sep = temp; |
| 2164 | } |
| 2165 | } |
| 2166 | |
| 2167 | // auto-detect special tokens by text |
| 2168 | // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_... |
| 2169 | // for now, we apply this workaround to find the tokens based on their text |
| 2170 | |
| 2171 | for (const auto & t : token_to_id) { |
| 2172 | // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc. |
| 2173 | if (special_eot_id == LLAMA_TOKEN_NULL) { |
| 2174 | if (false |
| 2175 | || t.first == "<|eot_id|>" |
| 2176 | || t.first == "<|im_end|>" |
| 2177 | || t.first == "<|end|>" |
| 2178 | || t.first == "<end_of_turn>" |
| 2179 | || t.first == "<|endoftext|>" |
| 2180 | || t.first == "<|end_of_text|>" // granite |
| 2181 | || t.first == "<EOT>" |
| 2182 | || t.first == "_<EOT>" |
| 2183 | || t.first == "<|end▁of▁sentence|>" // DeepSeek |
| 2184 | || t.first == "<end_of_utterance>" // smoldocling |
| 2185 | ) { |
| 2186 | special_eot_id = t.second; |
| 2187 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2188 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2189 | __func__, t.second, t.first.c_str()); |
| 2190 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2191 | } |
| 2192 | } |
| 2193 | } |
| 2194 | |
| 2195 | // find EOM token: "<|eom_id|>" |
| 2196 | if (special_eom_id == LLAMA_TOKEN_NULL) { |
| 2197 | if (false |
| 2198 | || t.first == "<|eom_id|>" |
| 2199 | ) { |
| 2200 | special_eom_id = t.second; |
| 2201 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2202 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2203 | __func__, t.second, t.first.c_str()); |
| 2204 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2205 | } |
| 2206 | } |
| 2207 | } |
| 2208 | |
| 2209 | // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc. |
| 2210 | if (special_fim_pre_id == LLAMA_TOKEN_NULL) { |
| 2211 | if (false |
| 2212 | || t.first == "<|fim_prefix|>" // Qwen |
| 2213 | || t.first == "<fim-prefix>" |
| 2214 | || t.first == "<fim_prefix>" // Granite |
| 2215 | || t.first == "<|fim▁begin|>" // DeepSeek |
| 2216 | || t.first == "<PRE>" |
| 2217 | || t.first == "▁<PRE>" // CodeLlama |
| 2218 | || t.first == "<|code_prefix|>" // GLM-4.5 |
| 2219 | ) { |
| 2220 | special_fim_pre_id = t.second; |
| 2221 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2222 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2223 | __func__, t.second, t.first.c_str()); |
| 2224 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2225 | } |
| 2226 | } |
| 2227 | } |
| 2228 | |
| 2229 | // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc. |
| 2230 | if (special_fim_suf_id == LLAMA_TOKEN_NULL) { |
| 2231 | if (false |
| 2232 | || t.first == "<|fim_suffix|>" // Qwen |
| 2233 | || t.first == "<fim-suffix>" |
| 2234 | || t.first == "<fim_suffix>" // Granite |
| 2235 | || t.first == "<|fim▁hole|>" // DeepSeek |
| 2236 | || t.first == "<SUF>" |
| 2237 | || t.first == "▁<SUF>" // CodeLlama |
| 2238 | || t.first == "<|code_suffix|>" // GLM-4.5 |
| 2239 | ) { |
| 2240 | special_fim_suf_id = t.second; |
| 2241 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2242 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2243 | __func__, t.second, t.first.c_str()); |
| 2244 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2245 | } |
| 2246 | } |
| 2247 | } |
| 2248 | |
| 2249 | // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc. |
| 2250 | if (special_fim_mid_id == LLAMA_TOKEN_NULL) { |
| 2251 | if (false |
| 2252 | || t.first == "<|fim_middle|>" // Qwen |
| 2253 | || t.first == "<fim-middle>" |
| 2254 | || t.first == "<fim_middle>" // Granite |
| 2255 | || t.first == "<|fim▁end|>" // DeepSeek |
| 2256 | || t.first == "<MID>" |
| 2257 | || t.first == "▁<MID>" // CodeLlama |
| 2258 | || t.first == "<|code_middle|>" // GLM-4.5 |
| 2259 | ) { |
| 2260 | special_fim_mid_id = t.second; |
| 2261 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2262 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2263 | __func__, t.second, t.first.c_str()); |
| 2264 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2265 | } |
| 2266 | } |
| 2267 | } |
| 2268 | |
| 2269 | // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc. |
| 2270 | if (special_fim_pad_id == LLAMA_TOKEN_NULL) { |
| 2271 | if (false |
| 2272 | || t.first == "<|fim_pad|>" // Qwen |
| 2273 | || t.first == "<fim-pad>" |
| 2274 | || t.first == "<fim_pad>" // Granite |
| 2275 | || t.first == "<PAD>" |
| 2276 | ) { |
| 2277 | special_fim_pad_id = t.second; |
| 2278 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2279 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2280 | __func__, t.second, t.first.c_str()); |
| 2281 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2282 | } |
| 2283 | } |
| 2284 | } |
| 2285 | |
| 2286 | // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc. |
| 2287 | if (special_fim_rep_id == LLAMA_TOKEN_NULL) { |
| 2288 | if (false |
| 2289 | || t.first == "<|fim_repo|>" // Qwen |
| 2290 | || t.first == "<|repo_name|>" |
| 2291 | || t.first == "<fim-repo>" |
| 2292 | || t.first == "<REPO>" |
| 2293 | || t.first == "<reponame>" // Granite |
| 2294 | ) { |
| 2295 | special_fim_rep_id = t.second; |
| 2296 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2297 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2298 | __func__, t.second, t.first.c_str()); |
| 2299 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2300 | } |
| 2301 | } |
| 2302 | } |
| 2303 | |
| 2304 | // find FIM_SEP token: "<|file_sep|>" |
| 2305 | if (special_fim_sep_id == LLAMA_TOKEN_NULL) { |
| 2306 | if (false |
| 2307 | || t.first == "<|file_sep|>" // Qwen |
| 2308 | ) { |
| 2309 | special_fim_sep_id = t.second; |
| 2310 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2311 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2312 | __func__, t.second, t.first.c_str()); |
| 2313 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2314 | } |
| 2315 | } |
| 2316 | } |
| 2317 | } |
| 2318 | |
| 2319 | // maintain a list of tokens that cause end-of-generation |
| 2320 | // this is currently determined based on the token text, which is obviously not ideal |
| 2321 | // ref: https://github.com/ggerganov/llama.cpp/issues/9606 |
| 2322 | special_eog_ids.clear(); |
| 2323 | |
| 2324 | if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_pad_id) == 0) { |
| 2325 | special_eog_ids.insert(x: special_fim_pad_id); |
| 2326 | } |
| 2327 | |
| 2328 | if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_rep_id) == 0) { |
| 2329 | special_eog_ids.insert(x: special_fim_rep_id); |
| 2330 | } |
| 2331 | |
| 2332 | if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_fim_sep_id) == 0) { |
| 2333 | special_eog_ids.insert(x: special_fim_sep_id); |
| 2334 | } |
| 2335 | |
| 2336 | for (const auto & t : token_to_id) { |
| 2337 | if (false |
| 2338 | || t.first == "<|eot_id|>" |
| 2339 | || t.first == "<|im_end|>" |
| 2340 | || t.first == "<|end|>" |
| 2341 | || t.first == "<|return|>" // o200k_harmony |
| 2342 | || t.first == "<|call|>" // o200k_harmony |
| 2343 | || t.first == "<end_of_turn>" |
| 2344 | || t.first == "<|endoftext|>" |
| 2345 | || t.first == "<|eom_id|>" |
| 2346 | || t.first == "<EOT>" |
| 2347 | || t.first == "_<EOT>" |
| 2348 | || t.first == "<|end_of_text|>" |
| 2349 | || t.first == "<end_of_utterance>" // smoldocling |
| 2350 | ) { |
| 2351 | special_eog_ids.insert(x: t.second); |
| 2352 | if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { |
| 2353 | LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n" , |
| 2354 | __func__, t.second, t.first.c_str()); |
| 2355 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; |
| 2356 | } |
| 2357 | } else { |
| 2358 | // token is control, but not marked as EOG -> print a debug log |
| 2359 | if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(x: t.second) == 0) { |
| 2360 | LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n" , |
| 2361 | __func__, t.second, t.first.c_str()); |
| 2362 | } |
| 2363 | } |
| 2364 | } |
| 2365 | |
| 2366 | // @ngxson : quick hack for gpt-oss, always render these tokens |
| 2367 | for (const auto & t : token_to_id) { |
| 2368 | if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>" ) { |
| 2369 | id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED; |
| 2370 | } |
| 2371 | } |
| 2372 | |
| 2373 | // sanity checks |
| 2374 | if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eos_id) == 0) { |
| 2375 | special_eog_ids.insert(x: special_eos_id); |
| 2376 | LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n" , __func__); |
| 2377 | } |
| 2378 | |
| 2379 | if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eot_id) == 0) { |
| 2380 | special_eog_ids.insert(x: special_eot_id); |
| 2381 | LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n" , __func__); |
| 2382 | } |
| 2383 | |
| 2384 | if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: special_eom_id) == 0) { |
| 2385 | special_eog_ids.insert(x: special_eom_id); |
| 2386 | LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n" , __func__); |
| 2387 | } |
| 2388 | |
| 2389 | // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG |
| 2390 | // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens, |
| 2391 | // we remove the "<|end|>" token from the EOG list |
| 2392 | { |
| 2393 | bool has_return = false; |
| 2394 | bool has_call = false; |
| 2395 | bool has_end = false; |
| 2396 | |
| 2397 | llama_token end_id = LLAMA_TOKEN_NULL; |
| 2398 | |
| 2399 | LLAMA_LOG_INFO("%s: printing all EOG tokens:\n" , __func__); |
| 2400 | for (auto tid : special_eog_ids) { |
| 2401 | LLAMA_LOG_INFO("%s: - %d ('%s')\n" , __func__, tid, id_to_token[tid].text.c_str()); |
| 2402 | |
| 2403 | if (id_to_token[tid].text == "<|return|>" ) { |
| 2404 | has_return = true; |
| 2405 | } else if (id_to_token[tid].text == "<|call|>" ) { |
| 2406 | has_call = true; |
| 2407 | } else if (id_to_token[tid].text == "<|end|>" ) { |
| 2408 | has_end = true; |
| 2409 | end_id = tid; |
| 2410 | } |
| 2411 | } |
| 2412 | |
| 2413 | if (has_return && has_call && has_end) { |
| 2414 | special_eog_ids.erase(x: end_id); |
| 2415 | id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED; |
| 2416 | LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n" , __func__); |
| 2417 | } |
| 2418 | } |
| 2419 | } |
| 2420 | |
| 2421 | // build special tokens cache |
| 2422 | { |
| 2423 | for (llama_token id = 0; id < (llama_token) n_tokens; ++id) { |
| 2424 | if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) { |
| 2425 | cache_special_tokens.push_back(x: id); |
| 2426 | } |
| 2427 | } |
| 2428 | |
| 2429 | std::sort(first: cache_special_tokens.begin(), last: cache_special_tokens.end(), |
| 2430 | comp: [&] (const llama_token a, const llama_token b) { |
| 2431 | return id_to_token[a].text.size() > id_to_token[b].text.size(); |
| 2432 | } |
| 2433 | ); |
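| | // (the longest-first order matters: tokenizer_st_partition() scans the cache in
| | //  order, so a special token that is a substring of a longer one, e.g. "<EOT>"
| | //  inside "_<EOT>", cannot split the text before the longer token is matched)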
| 2434 | |
| 2435 | LLAMA_LOG_INFO("%s: special tokens cache size = %u\n" , __func__, (uint32_t) cache_special_tokens.size()); |
| 2436 | } |
| 2437 | |
| 2438 | // build token to piece cache |
| 2439 | { |
| 2440 | size_t size_cache = 0; |
| 2441 | |
| 2442 | std::vector<std::string> cache(n_tokens); |
| 2443 | |
| 2444 | for (uint32_t id = 0; id < n_tokens; ++id) { |
| 2445 | cache[id] = token_to_piece_for_cache(token: id, special: true); |
| 2446 | |
| 2447 | size_cache += cache[id].size(); |
| 2448 | } |
| 2449 | |
| 2450 | std::swap(x&: cache_token_to_piece, y&: cache); |
| 2451 | |
| 2452 | LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n" , __func__, size_cache / 1024.0 / 1024.0); |
| 2453 | } |
| 2454 | |
| 2455 | // Handle per token attributes |
| 2456 | //NOTE: Each model customizes per token attributes. |
| 2457 | //NOTE: Per token attributes are missing from the GGUF file. |
| 2458 | //TODO: Extract attributes from GGUF file. |
| 2459 | { |
| 2460 | auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool { |
| 2461 | for (const auto & substr : substrs) { |
| 2462 | if (str.find(svt: substr) != std::string::npos) { |
| 2463 | return true; |
| 2464 | } |
| 2465 | } |
| 2466 | return false; |
| 2467 | }; |
| 2468 | |
| 2469 | auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) { |
| 2470 | uint32_t current = id_to_token.at(n: id).attr; |
| 2471 | current = value ? (current | attr) : (current & ~attr); |
| 2472 | id_to_token[id].attr = (llama_token_attr) current; |
| 2473 | }; |
| 2474 | |
| 2475 | auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) { |
| 2476 | _set_tokenid_attr(token_to_id.at(k: token), attr, value); |
| 2477 | }; |
| 2478 | |
| 2479 | std::string model_name; |
| 2480 | std::string tokenizer_pre; |
| 2481 | std::string general_arch; |
| 2482 | |
| 2483 | ml.get_key(kid: LLM_KV_GENERAL_NAME, result&: model_name, required: false); |
| 2484 | ml.get_key(kid: LLM_KV_TOKENIZER_PRE, result&: tokenizer_pre, required: false); |
| 2485 | ml.get_key(kid: LLM_KV_GENERAL_ARCHITECTURE, result&: general_arch, required: false); |
| 2486 | |
| 2487 | // model name to lowercase |
| 2488 | std::transform(first: model_name.begin(), last: model_name.end(), result: model_name.begin(), |
| 2489 | unary_op: [] (const std::string::value_type x) { |
| 2490 | return std::tolower(c: x); |
| 2491 | } |
| 2492 | ); |
| 2493 | |
| 2494 | // set attributes by model/tokenizer/architecture name |
| 2495 | if (false |
| 2496 | || _contains_any(tokenizer_pre, {"jina-v2-de" , "jina-v2-es" , "jina-v2-code" }) |
| 2497 | || _contains_any(general_arch, {"nomic-bert-moe" , "jina-bert-v3" }) |
| 2498 | ) { |
| 2499 | if (token_to_id.count(x: "<mask>" ) == 0) { |
| 2500 | LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n" , __func__); |
| 2501 | } else { |
| 2502 | _set_token_attr("<mask>" , LLAMA_TOKEN_ATTR_LSTRIP, true); |
| 2503 | } |
| 2504 | } else if (_contains_any(model_name, {"phi-3" , "phi3" })) { |
| 2505 | for (auto id : cache_special_tokens) { |
| 2506 | _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true); |
| 2507 | } |
| 2508 | for (const auto * token : {"</s>" }) { |
| 2509 | _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true); |
| 2510 | } |
| 2511 | for (const auto * token : {"<unk>" , "<s>" , "<|endoftext|>" }) { |
| 2512 | _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); |
| 2513 | } |
| 2514 | } |
| 2515 | } |
| 2516 | } |
| 2517 | |
| 2518 | enum llama_vocab_type llama_vocab::impl::get_type() const { |
| 2519 | return type; |
| 2520 | } |
| 2521 | |
| 2522 | std::string llama_vocab::impl::type_name() const {
| 2523 | switch (type) { |
| 2524 | case LLAMA_VOCAB_TYPE_NONE: return "no vocab" ; |
| 2525 | case LLAMA_VOCAB_TYPE_SPM: return "SPM" ; |
| 2526 | case LLAMA_VOCAB_TYPE_BPE: return "BPE" ; |
| 2527 | case LLAMA_VOCAB_TYPE_WPM: return "WPM" ; |
| 2528 | case LLAMA_VOCAB_TYPE_UGM: return "UGM" ; |
| 2529 | case LLAMA_VOCAB_TYPE_RWKV: return "RWKV" ; |
| 2530 | case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2" ; |
| 2531 | default: return "unknown" ; |
| 2532 | } |
| 2533 | } |
| 2534 | |
| 2535 | bool llama_vocab::impl::is_normal(llama_token id) const { |
| 2536 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
| 2537 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL; |
| 2538 | } |
| 2539 | |
| 2540 | bool llama_vocab::impl::is_unknown(llama_token id) const { |
| 2541 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
| 2542 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN; |
| 2543 | } |
| 2544 | |
| 2545 | bool llama_vocab::impl::is_control(llama_token id) const { |
| 2546 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
| 2547 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL; |
| 2548 | } |
| 2549 | |
| 2550 | bool llama_vocab::impl::is_byte(llama_token id) const { |
| 2551 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
| 2552 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE; |
| 2553 | } |
| 2554 | |
| 2555 | bool llama_vocab::impl::is_user_defined(llama_token id) const { |
| 2556 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
| 2557 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED; |
| 2558 | } |
| 2559 | |
| 2560 | bool llama_vocab::impl::is_unused(llama_token id) const { |
| 2561 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
| 2562 | return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED; |
| 2563 | } |
| 2564 | |
| 2565 | bool llama_vocab::impl::is_eog(llama_token id) const { |
| 2566 | return id != LLAMA_TOKEN_NULL && special_eog_ids.count(x: id) > 0; |
| 2567 | } |
| 2568 | |
| 2569 | uint8_t llama_vocab::impl::token_to_byte(llama_token id) const { |
| 2570 | GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE); |
| 2571 | GGML_ASSERT(is_byte(id)); |
| 2572 | const auto & token_data = id_to_token.at(n: id); |
| 2573 | switch (get_type()) { |
| 2574 | case LLAMA_VOCAB_TYPE_SPM: |
| 2575 | case LLAMA_VOCAB_TYPE_UGM: { |
| 2576 | auto buf = token_data.text.substr(pos: 3, n: 2); |
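| | // e.g. byte token text "<0x0A>" -> buf == "0A" -> strtol(..., 16) == 10 == '\n'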
| 2577 | return strtol(nptr: buf.c_str(), NULL, base: 16); |
| 2578 | } |
| 2579 | case LLAMA_VOCAB_TYPE_BPE: { |
| 2580 | GGML_ABORT("fatal error" ); |
| 2581 | } |
| 2582 | case LLAMA_VOCAB_TYPE_WPM: { |
| 2583 | GGML_ABORT("fatal error" ); |
| 2584 | } |
| 2585 | default: |
| 2586 | GGML_ABORT("fatal error" ); |
| 2587 | } |
| 2588 | } |
| 2589 | |
| 2590 | llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const { |
| 2591 | GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE); |
| 2592 | return id_to_token.at(n: id).attr; |
| 2593 | } |
| 2594 | |
| 2595 | void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) { |
| 2596 | LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n" , __func__, type); |
| 2597 | |
| 2598 | switch (type) { |
| 2599 | case LLAMA_VOCAB_TYPE_SPM: |
| 2600 | tokenizer = std::make_unique<llm_tokenizer_spm>(args: vocab); |
| 2601 | break; |
| 2602 | case LLAMA_VOCAB_TYPE_BPE: |
| 2603 | tokenizer = std::make_unique<llm_tokenizer_bpe>(args: vocab); |
| 2604 | break; |
| 2605 | case LLAMA_VOCAB_TYPE_WPM: |
| 2606 | tokenizer = std::make_unique<llm_tokenizer_wpm>(args: vocab); |
| 2607 | break; |
| 2608 | case LLAMA_VOCAB_TYPE_UGM: |
| 2609 | tokenizer = std::make_unique<llm_tokenizer_ugm>(args: vocab, args&: precompiled_charsmap); |
| 2610 | break; |
| 2611 | case LLAMA_VOCAB_TYPE_RWKV: |
| 2612 | tokenizer = std::make_unique<llm_tokenizer_rwkv>(args: vocab); |
| 2613 | break; |
| 2614 | case LLAMA_VOCAB_TYPE_PLAMO2: |
| 2615 | tokenizer = std::make_unique<llm_tokenizer_plamo2>(args: vocab); |
| 2616 | break; |
| 2617 | default: |
| 2618 | GGML_ABORT("unsupported vocab type" ); |
| 2619 | } |
| 2620 | } |
| 2621 | |
| 2622 | // |
| 2623 | // (de-) tokenize |
| 2624 | // |
| 2625 | |
| 2626 | // #define PRETOKENIZERDEBUG |
| 2627 | |
| 2628 | void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const { |
| 2629 | // for each special token |
| 2630 | for (const llama_token special_id : cache_special_tokens) { |
| 2631 | const auto & data = vocab.get_token_data(id: special_id); |
| 2632 | const auto & text = data.text; |
| 2633 | |
| 2634 | if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
| 2635 | // Ignore control and unknown tokens when parse_special == false.
| 2636 | // User-defined tokens are still pre-tokenized before everything else
| 2637 | // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
| 2638 | // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
| 2639 | continue;
| 2640 | }
| 2641 | |
| 2642 | // for each text fragment |
| 2643 | std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin(); |
| 2644 | while (it != buffer.end()) { |
| 2645 | auto & fragment = (*it); |
| 2646 | |
| 2647 | // if a fragment is text ( not yet processed ) |
| 2648 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
| 2649 | const auto & raw_text = fragment.raw_text; |
| 2650 | |
| 2651 | auto raw_text_base_offset = fragment.offset; |
| 2652 | auto raw_text_base_length = fragment.length; |
| 2653 | |
| 2654 | // loop over the text |
| 2655 | while (true) { |
| 2656 | // find the first occurrence of a given special token in this fragment |
| 2657 | // passing the offset argument only limits the "search area", but match coordinates
| 2658 | // are still relative to the source full raw_text |
| 2659 | // string_view begins at pos 0 for the same reason |
| 2660 | auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(str: text, pos: raw_text_base_offset); |
| 2661 | |
| 2662 | // no occurrences found, stop processing this fragment for a given special token |
| 2663 | if (match == std::string::npos) break; |
| 2664 | |
| 2665 | #ifdef PRETOKENIZERDEBUG |
| 2666 | LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n" , raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); |
| 2667 | #endif |
| 2668 | auto source = std::distance(first: buffer.begin(), last: it); |
| 2669 | |
| 2670 | // if match is further than base offset |
| 2671 | // then we have some text to the left of it |
| 2672 | if (match > raw_text_base_offset) { |
| 2673 | // left |
| 2674 | const int64_t left_reminder_offset = raw_text_base_offset + 0; |
| 2675 | int64_t left_reminder_length = match - raw_text_base_offset; |
| 2676 | |
| 2677 | if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) { |
| 2678 | while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) { |
| 2679 | left_reminder_length--; |
| 2680 | } |
| 2681 | } |
| 2682 | |
| 2683 | if (left_reminder_length > 0) { |
| 2684 | buffer.emplace_after(pos: it, args: raw_text, args: left_reminder_offset, args&: left_reminder_length); |
| 2685 | it++; |
| 2686 | } |
| 2687 | |
| 2688 | #ifdef PRETOKENIZERDEBUG |
| 2689 | LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n" , left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); |
| 2690 | #endif |
| 2691 | } |
| 2692 | |
| 2693 | // special token |
| 2694 | buffer.emplace_after(pos: it, args: special_id); |
| 2695 | it++; |
| 2696 | |
| 2697 | // right
| 2698 | if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
| 2699 | int64_t right_remainder_offset = match + text.length();
| 2700 | int64_t right_remainder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
| 2701 | 
| 2702 | if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
| 2703 | while (right_remainder_length > 0 && isspace((unsigned char) raw_text[right_remainder_offset])) {
| 2704 | right_remainder_offset++;
| 2705 | right_remainder_length--;
| 2706 | }
| 2707 | }
| 2708 | 
| 2709 | if (right_remainder_length > 0) {
| 2710 | buffer.emplace_after(pos: it, args: raw_text, args&: right_remainder_offset, args&: right_remainder_length);
| 2711 | it++;
| 2712 | }
| 2713 | 
| 2714 | #ifdef PRETOKENIZERDEBUG
| 2715 | LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n" , right_remainder_offset, right_remainder_length, raw_text.substr(right_remainder_offset, right_remainder_length).c_str());
| 2716 | #endif
| 2717 | 
| 2718 | if (source == 0) {
| 2719 | buffer.erase_after(pos: buffer.before_begin());
| 2720 | } else {
| 2721 | buffer.erase_after(pos: std::next(x: buffer.begin(), n: (source - 1)));
| 2722 | }
| 2723 | 
| 2724 | // repeat the search on the right remainder
| 2725 | raw_text_base_offset = right_remainder_offset;
| 2726 | raw_text_base_length = right_remainder_length;
| 2727 | 
| 2728 | #ifdef PRETOKENIZERDEBUG
| 2729 | LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n" , raw_text_base_offset, raw_text_base_length, raw_text.substr(raw_text_base_offset, raw_text_base_length).c_str());
| 2730 | #endif
| 2731 | } else { |
| 2732 | if (source == 0) { |
| 2733 | buffer.erase_after(pos: buffer.before_begin()); |
| 2734 | } else { |
| 2735 | buffer.erase_after(pos: std::next(x: buffer.begin(), n: (source - 1))); |
| 2736 | } |
| 2737 | break; |
| 2738 | } |
| 2739 | } |
| 2740 | } |
| 2741 | it++; |
| 2742 | } |
| 2743 | } |
| 2744 | } |
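
// Illustrative sketch (not part of the build): how the partitioning above
// rewrites the fragment buffer. The token id here is hypothetical.
//
//   input buffer : [ RAW_TEXT "Hello <bos> world" ]
//   special token: "<bos>" (id 1, no LSTRIP/RSTRIP attributes)
//
//   after the match is processed:
//   [ RAW_TEXT "Hello " ] [ TOKEN 1 ] [ RAW_TEXT " world" ]
//
// The right remainder is re-scanned for further occurrences, so every
// special token in the text ends up as its own TOKEN fragment.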
| 2745 | |
| 2746 | // NOTE: avoid ever using this except for building the token_to_piece caches |
| 2747 | std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const { |
| 2748 | std::string piece; |
| 2749 | piece.resize(n: piece.capacity()); // start with the string's existing capacity as a first guess
| 2750 | const int n_chars = vocab.token_to_piece(token, buf: &piece[0], length: piece.size(), lstrip: 0, special); |
| 2751 | if (n_chars < 0) { |
| 2752 | piece.resize(n: -n_chars); |
| 2753 | int check = vocab.token_to_piece(token, buf: &piece[0], length: piece.size(), lstrip: 0, special); |
| 2754 | GGML_ASSERT(check == -n_chars); |
| 2755 | } |
| 2756 | else { |
| 2757 | piece.resize(n: n_chars); |
| 2758 | } |
| 2759 | |
| 2760 | return piece; |
| 2761 | } |
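
// Illustrative sketch (not part of the build): the same resize-and-retry
// convention applies to any caller of token_to_piece(). A negative return
// value is the required buffer size, negated.
//
//   std::string piece(8, '\0');  // deliberately small
//   int n = vocab.token_to_piece(token, &piece[0], (int32_t) piece.size(), 0, false);
//   if (n < 0) {                 // buffer too small: -n bytes are required
//       piece.resize(-n);
//       n = vocab.token_to_piece(token, &piece[0], (int32_t) piece.size(), 0, false);
//   }
//   piece.resize(n);             // trim to the actual length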
| 2762 | |
| 2763 | static void llama_escape_whitespace(std::string & text) { |
| 2764 | replace_all(s&: text, search: " " , replace: "\xe2\x96\x81" ); |
| 2765 | } |
| 2766 | |
| 2767 | static void llama_unescape_whitespace(std::string & word) { |
| 2768 | replace_all(s&: word, search: "\xe2\x96\x81" , replace: " " ); |
| 2769 | } |
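
// Illustrative sketch (not part of the build): SPM-style whitespace escaping
// replaces every space with U+2581 (LOWER ONE EIGHTH BLOCK, "\xe2\x96\x81"):
//
//   std::string s = "Hello world";
//   llama_escape_whitespace(s);    // s == "Hello\xe2\x96\x81world" ("Hello▁world")
//   llama_unescape_whitespace(s);  // s == "Hello world" again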
| 2770 | |
| 2771 | static std::string llama_decode_text(const std::string & text) { |
| 2772 | std::string decoded_text; |
| 2773 | |
| 2774 | const auto cpts = unicode_cpts_from_utf8(utf8: text); |
| 2775 | for (const auto cpt : cpts) { |
| 2776 | const auto utf8 = unicode_cpt_to_utf8(cpt); |
| 2777 | try { |
| 2778 | decoded_text += unicode_utf8_to_byte(utf8); |
| 2779 | } catch (const std::out_of_range & /*e*/) { |
| 2780 | decoded_text += "[UNK_BYTE_0x" ; |
| 2781 | for (const auto c : utf8) { |
| 2782 | decoded_text += format(fmt: "%02x" , (uint8_t) c); |
| 2783 | } |
| 2784 | decoded_text += text + "]" ; |
| 2785 | } |
| 2786 | } |
| 2787 | |
| 2788 | return decoded_text; |
| 2789 | } |
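
// Illustrative sketch (not part of the build): llama_decode_text() undoes the
// codepoint-to-byte mapping used by byte-level BPE vocabularies. For example,
// GPT-2 style vocabs encode the byte 0x20 (space) as U+0120 ("\xc4\xa0"), so:
//
//   llama_decode_text("\xc4\xa0hello") == " hello"
//
// Codepoints outside the byte map land in the catch block and are rendered as
// an "[UNK_BYTE_0x..]" marker instead of throwing.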
| 2790 | |
| 2791 | std::vector<llama_token> llama_vocab::impl::tokenize( |
| 2792 | const std::string & raw_text, |
| 2793 | bool add_special, |
| 2794 | bool parse_special) const { |
| 2795 | GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first." ); |
| 2796 | |
| 2797 | std::vector<llama_token> output; |
| 2798 | std::forward_list<fragment_buffer_variant> fragment_buffer; |
| 2799 | |
| 2800 | if (!raw_text.empty()) { |
| 2801 | fragment_buffer.emplace_front(args: raw_text, args: 0, args: raw_text.length()); |
| 2802 | tokenizer_st_partition(buffer&: fragment_buffer, parse_special); |
| 2803 | } |
| 2804 | |
| 2805 | switch (get_type()) { |
| 2806 | case LLAMA_VOCAB_TYPE_SPM: |
| 2807 | { |
| 2808 | // OG tokenizer behavior: |
| 2809 | // |
| 2810 | // tokenizer.encode('', add_special_tokens=True) returns [1] |
| 2811 | // tokenizer.encode('', add_special_tokens=False) returns [] |
| 2812 | |
| 2813 | bool is_prev_special = true; // prefix with space if first token |
| 2814 | |
| 2815 | if (add_special && add_bos) { |
| 2816 | GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL); |
| 2817 | output.push_back(x: special_bos_id); |
| 2818 | is_prev_special = true; |
| 2819 | } |
| 2820 | |
| 2821 | for (const auto & fragment : fragment_buffer) { |
| 2822 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
| 2823 | std::string text; |
| 2824 | |
| 2825 | // prefix with space if previous is special |
| 2826 | if (add_space_prefix && is_prev_special) { |
| 2827 | text = ' '; |
| 2828 | } |
| 2829 | |
| 2830 | text += fragment.raw_text.substr(pos: fragment.offset, n: fragment.length); |
| 2831 | |
| 2832 | #ifdef PRETOKENIZERDEBUG |
| 2833 | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n" , text.length(), fragment.offset, fragment.length, text.c_str()); |
| 2834 | #endif |
| 2835 | llama_escape_whitespace(text); |
| 2836 | llm_tokenizer_spm_session session(vocab); |
| 2837 | session.tokenize(text, output); |
| 2838 | is_prev_special = false; |
| 2839 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
| 2840 | output.push_back(x: fragment.token); |
| 2841 | is_prev_special = true; |
| 2842 | } |
| 2843 | } |
| 2844 | |
| 2845 | if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) { |
| 2846 | LLAMA_LOG_WARN( |
| 2847 | "%s: Added a BOS token to the prompt as specified by the model but the prompt " |
| 2848 | "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " |
| 2849 | "Are you sure this is what you want?\n" , __FUNCTION__); |
| 2850 | } |
| 2851 | |
| 2852 | if (add_special && add_eos) { |
| 2853 | GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL); |
| 2854 | output.push_back(x: special_eos_id); |
| 2855 | } |
| 2856 | } break; |
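
// Illustrative sketch (not part of the build): with add_bos == true and
// add_space_prefix == true, the fragment "Hello world" first becomes
// " Hello world", is escaped to "\xe2\x96\x81Hello\xe2\x96\x81world", and is
// then segmented by the SPM session's bigram merges; with add_special == true
// the output begins with special_bos_id. Exact token ids depend on the
// model's vocabulary and merge scores.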
| 2857 | case LLAMA_VOCAB_TYPE_BPE: |
| 2858 | { |
| 2859 | // the session calls methods that do not exist on the base llm_tokenizer,
| 2860 | // so cast the stored tokenizer to the BPE-specific type here
| 2861 | llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
| 2862 | if (add_special) { |
| 2863 | session.append_bos(output); |
| 2864 | } |
| 2865 | for (const auto & fragment : fragment_buffer) { |
| 2866 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
| 2867 | std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length); |
| 2868 | |
| 2869 | #ifdef PRETOKENIZERDEBUG |
| 2870 | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n" , text.length(), fragment.offset, fragment.length, text.c_str()); |
| 2871 | #endif |
| 2872 | session.tokenize(text, output); |
| 2873 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
| 2874 | session.append(token_id: fragment.token, output); |
| 2875 | } |
| 2876 | } |
| 2877 | |
| 2878 | if (add_special) { |
| 2879 | session.append_eos(output); |
| 2880 | session.check_double_bos_eos(output); |
| 2881 | } |
| 2882 | } break; |
| 2883 | case LLAMA_VOCAB_TYPE_WPM: |
| 2884 | { |
| 2885 | if (add_special) { |
| 2886 | GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL); |
| 2887 | output.push_back(x: special_bos_id); |
| 2888 | } |
| 2889 | |
| 2890 | llm_tokenizer_wpm_session session(vocab); |
| 2891 | |
| 2892 | for (const auto & fragment : fragment_buffer) { |
| 2893 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
| 2894 | std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length); |
| 2895 | |
| 2896 | #ifdef PRETOKENIZERDEBUG |
| 2897 | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n" , text.length(), fragment.offset, fragment.length, text.c_str()); |
| 2898 | #endif |
| 2899 | session.tokenize(text, output); |
| 2900 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
| 2901 | output.push_back(x: fragment.token); |
| 2902 | } |
| 2903 | } |
| 2904 | |
| 2905 | if (add_special) { |
| 2906 | GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL); |
| 2907 | output.push_back(x: special_sep_id); |
| 2908 | } |
| 2909 | } break; |
| 2910 | case LLAMA_VOCAB_TYPE_UGM: |
| 2911 | { |
| 2912 | if (add_special && add_bos) { |
| 2913 | GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL); |
| 2914 | output.push_back(x: special_bos_id); |
| 2915 | } |
| 2916 | llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get())); |
| 2917 | |
| 2918 | for (const auto & fragment : fragment_buffer) { |
| 2919 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
| 2920 | std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length); |
| 2921 | #ifdef PRETOKENIZERDEBUG |
| 2922 | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n" , text.length(), fragment.offset, fragment.length, text.c_str()); |
| 2923 | #endif |
| 2924 | session.tokenize(text, output); |
| 2925 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
| 2926 | output.push_back(x: fragment.token); |
| 2927 | } |
| 2928 | } |
| 2929 | |
| 2930 | if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) { |
| 2931 | LLAMA_LOG_WARN( |
| 2932 | "%s: Added a BOS token to the prompt as specified by the model but the prompt " |
| 2933 | "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " |
| 2934 | "Are you sure this is what you want?\n" , __FUNCTION__); |
| 2935 | } |
| 2936 | |
| 2937 | if (add_special && add_eos) { |
| 2938 | GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL); |
| 2939 | output.push_back(x: special_eos_id); |
| 2940 | } |
| 2941 | } break; |
| 2942 | case LLAMA_VOCAB_TYPE_RWKV: |
| 2943 | { |
| 2944 | llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get())); |
| 2945 | for (const auto & fragment : fragment_buffer) { |
| 2946 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
| 2947 | std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length); |
| 2948 | |
| 2949 | #ifdef PRETOKENIZERDEBUG |
| 2950 | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n" , text.length(), fragment.offset, fragment.length, text.c_str()); |
| 2951 | #endif |
| 2952 | |
| 2953 | session.tokenize(text, output); |
| 2954 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
| 2955 | output.push_back(x: fragment.token); |
| 2956 | } |
| 2957 | } |
| 2958 | } break; |
| 2959 | case LLAMA_VOCAB_TYPE_PLAMO2: |
| 2960 | { |
| 2961 | llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get())); |
| 2962 | for (const auto & fragment : fragment_buffer) { |
| 2963 | if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { |
| 2964 | std::string text = fragment.raw_text.substr(pos: fragment.offset, n: fragment.length); |
| 2965 | |
| 2966 | #ifdef PRETOKENIZERDEBUG |
| 2967 | LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n" , text.length(), fragment.offset, fragment.length, text.c_str()); |
| 2968 | #endif |
| 2969 | |
| 2970 | session.tokenize(text, output); |
| 2971 | } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) |
| 2972 | output.push_back(x: fragment.token); |
| 2973 | } |
| 2974 | } |
| 2975 | } break; |
| 2976 | case LLAMA_VOCAB_TYPE_NONE: |
| 2977 | GGML_ABORT("fatal error" ); |
| 2978 | } |
| 2979 | |
| 2980 | return output; |
| 2981 | } |
| 2982 | |
| 2983 | int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const { |
| 2984 | // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843 |
| 2985 | static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL; |
| 2986 | const llama_token_attr attr = token_get_attr(id: token); |
| 2987 | if (!special && (attr & attr_special)) { |
| 2988 | return 0; |
| 2989 | } |
| 2990 | |
| 2991 | // copy piece chars to output text buffer |
| 2992 | // skip up to 'lstrip' leading spaces before copying |
| 2993 | auto _try_copy = [=] (const char * token, size_t size) -> int32_t { |
| 2994 | if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) { |
| 2995 | GGML_ABORT("invalid token size: %zu exceeds int32_t limit" , size); |
| 2996 | } |
| 2997 | |
| 2998 | for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { |
| 2999 | token++; |
| 3000 | size--; |
| 3001 | } |
| 3002 | if (length < (int32_t)size) { |
| 3003 | return -(int32_t) size; |
| 3004 | } |
| 3005 | memcpy(dest: buf, src: token, n: size); |
| 3006 | return (int32_t) size; |
| 3007 | }; |
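
// Illustrative sketch (not part of the build): given lstrip == 2 and the
// piece "  foo" with a 16-byte buffer, _try_copy skips the two leading
// spaces and returns 3 after writing "foo"; with a 2-byte buffer it would
// instead return -3 to signal the required size.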
| 3008 | |
| 3009 | // if we have a cache - use it |
| 3010 | { |
| 3011 | const auto & cache = cache_token_to_piece; |
| 3012 | |
| 3013 | if (!cache.empty()) { |
| 3014 | const auto & result = cache.at(n: token); |
| 3015 | return _try_copy(result.data(), result.size()); |
| 3016 | } |
| 3017 | } |
| 3018 | |
| 3019 | if (0 <= token && token < (int32_t) id_to_token.size()) { |
| 3020 | const std::string & token_text = id_to_token[token].text; |
| 3021 | switch (get_type()) { |
| 3022 | case LLAMA_VOCAB_TYPE_WPM: |
| 3023 | case LLAMA_VOCAB_TYPE_SPM: |
| 3024 | case LLAMA_VOCAB_TYPE_UGM: { |
| 3025 | // NOTE: we accept all unsupported token types, |
| 3026 | // suppressing them like CONTROL tokens. |
| 3027 | if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { |
| 3028 | return _try_copy(token_text.data(), token_text.size()); |
| 3029 | } |
| 3030 | if (attr & LLAMA_TOKEN_ATTR_NORMAL) { |
| 3031 | std::string result = token_text; |
| 3032 | llama_unescape_whitespace(word&: result); |
| 3033 | return _try_copy(result.data(), result.size()); |
| 3034 | } |
| 3035 | if (attr & LLAMA_TOKEN_ATTR_BYTE) { |
| 3036 | char byte = (char) token_to_byte(id: token); |
| 3037 | return _try_copy((char*) &byte, 1); |
| 3038 | } |
| 3039 | break; |
| 3040 | } |
| 3041 | case LLAMA_VOCAB_TYPE_BPE: { |
| 3042 | // NOTE: we accept all unsupported token types, |
| 3043 | // suppressing them like CONTROL tokens. |
| 3044 | if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { |
| 3045 | return _try_copy(token_text.data(), token_text.size()); |
| 3046 | } |
| 3047 | if (attr & LLAMA_TOKEN_ATTR_NORMAL) { |
| 3048 | std::string result = llama_decode_text(text: token_text); |
| 3049 | return _try_copy(result.data(), result.size()); |
| 3050 | } |
| 3051 | break; |
| 3052 | } |
| 3053 | case LLAMA_VOCAB_TYPE_RWKV: { |
| 3054 | std::vector<uint8_t> result = llama_unescape_rwkv_token(escaped: token_text); |
| 3055 | |
| 3056 | // If we don't have enough space, return an error |
| 3057 | if (result.size() > (size_t)length) { |
| 3058 | return -(int)result.size(); |
| 3059 | } |
| 3060 | |
| 3061 | memcpy(dest: buf, src: result.data(), n: result.size()); |
| 3062 | return (int)result.size(); |
| 3063 | } |
| 3064 | case LLAMA_VOCAB_TYPE_PLAMO2: { |
| 3065 | // PLaMo-2 uses similar token handling as BPE/SPM |
| 3066 | if (vocab.is_byte(id: token)) { |
| 3067 | // Handle byte tokens like <0xXX> |
| 3068 | if (token_text.length() == 6 && token_text.substr(pos: 0, n: 3) == "<0x" && token_text.back() == '>') { |
| 3069 | int hex_val = std::stoi(str: token_text.substr(pos: 3, n: 2), idx: nullptr, base: 16); |
| 3070 | if (length < 1) { |
| 3071 | return -1; |
| 3072 | } |
| 3073 | buf[0] = static_cast<char>(hex_val); |
| 3074 | return 1; |
| 3075 | } |
| 3076 | } |
| 3077 | |
| 3078 | // Normal token - just copy the text |
| 3079 | std::string result = token_text; |
| 3080 | return _try_copy(result.data(), result.size()); |
| 3081 | } |
| 3082 | default: |
| 3083 | GGML_ABORT("fatal error" ); |
| 3084 | } |
| 3085 | } |
| 3086 | |
| 3087 | return 0; |
| 3088 | } |
| 3089 | |
| 3090 | const std::string & llama_vocab::impl::token_to_piece(llama_token token) const { |
| 3091 | return cache_token_to_piece.at(n: token); |
| 3092 | } |
| 3093 | |
| 3094 | int32_t llama_vocab::impl::detokenize( |
| 3095 | const llama_token * tokens, |
| 3096 | int32_t n_tokens, |
| 3097 | char * text, |
| 3098 | int32_t text_len_max, |
| 3099 | bool remove_special, |
| 3100 | bool unparse_special) const { |
| 3101 | if (type == LLAMA_VOCAB_TYPE_NONE) { |
| 3102 | return 0; |
| 3103 | } |
| 3104 | |
| 3105 | GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first." ); |
| 3106 | |
| 3107 | int32_t avail = text_len_max; |
| 3108 | int32_t total = 0; |
| 3109 | |
| 3110 | // remove the leading space |
| 3111 | bool remove_space = add_space_prefix; |
| 3112 | |
| 3113 | if (remove_special && add_bos) { |
| 3114 | if (n_tokens > 0 && tokens[0] == special_bos_id) { |
| 3115 | remove_space = false; |
| 3116 | n_tokens--; |
| 3117 | tokens++; |
| 3118 | } |
| 3119 | } |
| 3120 | |
| 3121 | if (remove_special && add_eos) { |
| 3122 | if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) { |
| 3123 | n_tokens--; |
| 3124 | } |
| 3125 | } |
| 3126 | |
| 3127 | for (int32_t i = 0; i < n_tokens; ++i) { |
| 3128 | GGML_ASSERT(avail >= 0); |
| 3129 | int32_t n_chars = token_to_piece(token: tokens[i], buf: text, length: avail, lstrip: remove_space, special: unparse_special); |
| 3130 | remove_space = false; |
| 3131 | if (n_chars < 0) { |
| 3132 | avail = 0; |
| 3133 | total -= n_chars; |
| 3134 | } else if (n_chars > 0) { |
| 3135 | avail -= n_chars; |
| 3136 | text += n_chars; |
| 3137 | total += n_chars; |
| 3138 | } |
| 3139 | } |
| 3140 | |
| 3141 | if (total > text_len_max) { |
| 3142 | return -total; |
| 3143 | } |
| 3144 | |
| 3145 | if (clean_spaces) { |
| 3146 | text -= total; // restart text |
| 3147 | |
| 3148 | // first pass: characters ?!., //TODO: where do these characters come from? |
| 3149 | const int32_t total1 = total; |
| 3150 | total = total ? 1 : 0; |
| 3151 | for (int32_t i = 1; i < total1; ++i) { |
| 3152 | const char x = text[i]; |
| 3153 | if (text[i - 1] == ' ') { |
| 3154 | if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ," |
| 3155 | total--; // remove space |
| 3156 | } |
| 3157 | } |
| 3158 | text[total++] = x; |
| 3159 | } |
| 3160 | |
| 3161 | // second pass: strip single apostrophe between spaces |
| 3162 | const int32_t total2 = total; |
| 3163 | total = total ? 1 : 0; |
| 3164 | for (int32_t i = 1; i < total2; ++i) { |
| 3165 | const char x = text[i]; |
| 3166 | if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' " |
| 3167 | total--; // remove prev space |
| 3168 | text[++i] = '\0'; // remove next space |
| 3169 | } |
| 3170 | text[total++] = x; |
| 3171 | } |
| 3172 | |
| 3173 | // third pass: apostrophe contractions //NOTE: does this make sense?
| 3174 | const int32_t total3 = total; |
| 3175 | total = total ? 1 : 0; |
| 3176 | for (int32_t i = 1; i < total3; ++i) { |
| 3177 | const char x = text[i]; |
| 3178 | if (text[i - 1] == ' ') { |
| 3179 | if (x == '\'' && i + 1 < total3) { |
| 3180 | const char x1 = text[i + 1]; |
| 3181 | if (x1 == 't' || x1 == 'd') { // " 't", " 'd" |
| 3182 | //total--; // remove space |
| 3183 | } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm" |
| 3184 | total--; // remove space |
| 3185 | } else if (i + 2 < total3) { |
| 3186 | const char x2 = text[i + 2]; |
| 3187 | if ((x1 == 'l' && x2 == 'l')) { // " 'll" |
| 3188 | //total--; // remove space |
| 3189 | } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've" |
| 3190 | total--; // remove space |
| 3191 | } else { |
| 3192 | //total--; // remove space |
| 3193 | } |
| 3194 | } else { |
| 3195 | //total--; // remove space |
| 3196 | } |
| 3197 | } |
| 3198 | } |
| 3199 | text[total++] = x; |
| 3200 | } |
| 3201 | } |
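
// Illustrative sketch (not part of the build): the three passes above turn
// piece-wise output such as "Hello , I ' m here . We 've left ."
// into "Hello, I'm here. We've left." (" 't", " 'd" and " 'll" are
// deliberately left untouched, per the commented-out branches).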
| 3202 | |
| 3203 | return total <= text_len_max ? total : -total; |
| 3204 | } |
| 3205 | |
| 3206 | void llama_vocab::impl::print_info() const { |
| 3207 | LLAMA_LOG_INFO("%s: vocab type = %s\n" , __func__, type_name().c_str()); |
| 3208 | LLAMA_LOG_INFO("%s: n_vocab = %u\n" , __func__, vocab.n_tokens()); |
| 3209 | LLAMA_LOG_INFO("%s: n_merges = %u\n" , __func__, (uint32_t) bpe_ranks.size()); |
| 3210 | |
| 3211 | // special tokens |
| 3212 | if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n" , __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); } |
| 3213 | if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n" , __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); } |
| 3214 | if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n" , __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); } |
| 3215 | if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n" , __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); } |
| 3216 | if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n" , __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); } |
| 3217 | if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n" , __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); } |
| 3218 | if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n" , __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); } |
| 3219 | if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n" , __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); } |
| 3220 | |
| 3221 | if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n" , __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); } |
| 3222 | |
| 3223 | if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n" , __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); } |
| 3224 | if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n" , __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); } |
| 3225 | if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n" , __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); } |
| 3226 | if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n" , __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); } |
| 3227 | if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n" , __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); } |
| 3228 | if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n" , __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); } |
| 3229 | |
| 3230 | for (const auto & id : special_eog_ids) { |
| 3231 | LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n" , __func__, id, id_to_token.at(id).text.c_str() ); |
| 3232 | } |
| 3233 | |
| 3234 | LLAMA_LOG_INFO("%s: max token length = %d\n" , __func__, max_token_len); |
| 3235 | } |
| 3236 | |
| 3237 | llama_vocab::llama_vocab() : pimpl(new impl(*this)) { |
| 3238 | } |
| 3239 | |
| 3240 | llama_vocab::~llama_vocab() { |
| 3241 | } |
| 3242 | |
| 3243 | void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) { |
| 3244 | pimpl->load(ml, kv); |
| 3245 | } |
| 3246 | |
| 3247 | std::string llama_vocab::get_tokenizer_model() const { |
| 3248 | return pimpl->tokenizer_model; |
| 3249 | } |
| 3250 | |
| 3251 | std::string llama_vocab::get_tokenizer_pre() const { |
| 3252 | return pimpl->tokenizer_pre; |
| 3253 | } |
| 3254 | |
| 3255 | enum llama_vocab_type llama_vocab::get_type() const { |
| 3256 | return pimpl->type; |
| 3257 | } |
| 3258 | |
| 3259 | enum llama_vocab_pre_type llama_vocab::get_pre_type() const { |
| 3260 | return pimpl->pre_type; |
| 3261 | } |
| 3262 | |
| 3263 | uint32_t llama_vocab::n_tokens() const { |
| 3264 | return (uint32_t) pimpl->id_to_token.size(); |
| 3265 | } |
| 3266 | |
| 3267 | uint32_t llama_vocab::n_token_types() const { |
| 3268 | return (uint32_t) pimpl->n_token_types; |
| 3269 | } |
| 3270 | |
| 3271 | std::string llama_vocab::type_name() const {
| 3272 | return pimpl->type_name(); |
| 3273 | } |
| 3274 | |
| 3275 | bool llama_vocab::is_normal(llama_token id) const { |
| 3276 | return pimpl->is_normal(id); |
| 3277 | } |
| 3278 | |
| 3279 | bool llama_vocab::is_unknown(llama_token id) const { |
| 3280 | return pimpl->is_unknown(id); |
| 3281 | } |
| 3282 | |
| 3283 | bool llama_vocab::is_control(llama_token id) const { |
| 3284 | return pimpl->is_control(id); |
| 3285 | } |
| 3286 | |
| 3287 | bool llama_vocab::is_byte(llama_token id) const { |
| 3288 | return pimpl->is_byte(id); |
| 3289 | } |
| 3290 | |
| 3291 | bool llama_vocab::is_user_defined(llama_token id) const { |
| 3292 | return pimpl->is_user_defined(id); |
| 3293 | } |
| 3294 | |
| 3295 | bool llama_vocab::is_unused(llama_token id) const { |
| 3296 | return pimpl->is_unused(id); |
| 3297 | } |
| 3298 | |
| 3299 | bool llama_vocab::is_eog(llama_token id) const { |
| 3300 | return pimpl->is_eog(id); |
| 3301 | } |
| 3302 | |
| 3303 | uint8_t llama_vocab::token_to_byte(llama_token id) const { |
| 3304 | return pimpl->token_to_byte(id); |
| 3305 | } |
| 3306 | |
| 3307 | llama_token llama_vocab::byte_to_token(uint8_t ch) const { |
| 3308 | GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE); |
| 3309 | static const char * hex = "0123456789ABCDEF" ; |
| 3310 | switch (get_type()) { |
| 3311 | case LLAMA_VOCAB_TYPE_SPM: |
| 3312 | case LLAMA_VOCAB_TYPE_UGM: { |
| 3313 | const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; |
| 3314 | auto token = pimpl->token_to_id.find(x: buf); |
| 3315 | if (token != pimpl->token_to_id.end()) { |
| 3316 | return (*token).second; |
| 3317 | } |
| 3318 | // Try to fall back to just the byte as a string |
| 3319 | const char buf2[2] = { (char)ch, 0 }; |
| 3320 | return pimpl->token_to_id.at(k: buf2); |
| 3321 | } |
| 3322 | case LLAMA_VOCAB_TYPE_WPM: |
| 3323 | case LLAMA_VOCAB_TYPE_BPE: { |
| 3324 | return pimpl->token_to_id.at(k: unicode_byte_to_utf8(byte: ch)); |
| 3325 | } |
| 3326 | case LLAMA_VOCAB_TYPE_PLAMO2: { |
| 3327 | // PLaMo-2 uses byte tokens in format <0xXX> |
| 3328 | char hex_str[8]; |
| 3329 | snprintf(s: hex_str, maxlen: sizeof(hex_str), format: "<0x%02X>" , ch); |
| 3330 | return pimpl->token_to_id.at(k: hex_str); |
| 3331 | } |
| 3332 | default: |
| 3333 | GGML_ABORT("fatal error" ); |
| 3334 | } |
| 3335 | } |
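
// Illustrative sketch (not part of the build): for an SPM vocab,
// byte_to_token(0x41) first looks up the piece "<0x41>"; if the vocabulary
// has no byte-fallback tokens, it falls back to the single-character piece
// "A". BPE/WPM vocabs instead go through unicode_byte_to_utf8(), so e.g.
// 0x20 maps to the piece "\xc4\xa0" under the GPT-2 style byte map.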
| 3336 | |
| 3337 | llama_token llama_vocab::text_to_token(const std::string & text) const { |
| 3338 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
| 3339 | auto it = pimpl->token_to_id.find(x: text); |
| 3340 | if (it != pimpl->token_to_id.end()) { |
| 3341 | return (*it).second; |
| 3342 | } |
| 3343 | return LLAMA_TOKEN_NULL; |
| 3344 | } |
| 3345 | |
| 3346 | const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const { |
| 3347 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
| 3348 | return pimpl->id_to_token.at(n: id); |
| 3349 | } |
| 3350 | |
| 3351 | const char * llama_vocab::token_get_text(llama_token id) const { |
| 3352 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
| 3353 | return pimpl->id_to_token.at(n: id).text.c_str(); |
| 3354 | } |
| 3355 | |
| 3356 | float llama_vocab::token_get_score(llama_token id) const { |
| 3357 | GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE); |
| 3358 | return pimpl->id_to_token.at(n: id).score; |
| 3359 | } |
| 3360 | |
| 3361 | llama_token_attr llama_vocab::token_get_attr(llama_token id) const { |
| 3362 | return pimpl->token_get_attr(id); |
| 3363 | } |
| 3364 | |
| 3365 | llama_token llama_vocab::token_bos() const { |
| 3366 | return pimpl->special_bos_id; |
| 3367 | } |
| 3368 | |
| 3369 | llama_token llama_vocab::token_eos() const { |
| 3370 | return pimpl->special_eos_id; |
| 3371 | } |
| 3372 | |
| 3373 | llama_token llama_vocab::token_eot() const { |
| 3374 | return pimpl->special_eot_id; |
| 3375 | } |
| 3376 | |
| 3377 | llama_token llama_vocab::token_eom() const { |
| 3378 | return pimpl->special_eom_id; |
| 3379 | } |
| 3380 | |
| 3381 | llama_token llama_vocab::token_unk() const { |
| 3382 | return pimpl->special_unk_id; |
| 3383 | } |
| 3384 | |
| 3385 | llama_token llama_vocab::token_sep() const { |
| 3386 | return pimpl->special_sep_id; |
| 3387 | } |
| 3388 | |
| 3389 | llama_token llama_vocab::token_nl() const { |
| 3390 | return pimpl->linefeed_id; |
| 3391 | } |
| 3392 | |
| 3393 | llama_token llama_vocab::token_pad() const { |
| 3394 | return pimpl->special_pad_id; |
| 3395 | } |
| 3396 | |
| 3397 | llama_token llama_vocab::token_prefix() const { |
| 3398 | return pimpl->special_fim_pre_id; |
| 3399 | } |
| 3400 | |
| 3401 | llama_token llama_vocab::token_middle() const { |
| 3402 | return pimpl->special_fim_mid_id; |
| 3403 | } |
| 3404 | |
| 3405 | llama_token llama_vocab::token_suffix() const { |
| 3406 | return pimpl->special_fim_suf_id; |
| 3407 | } |
| 3408 | |
| 3409 | llama_token llama_vocab::token_fim_pre() const { |
| 3410 | return pimpl->special_fim_pre_id; |
| 3411 | } |
| 3412 | |
| 3413 | llama_token llama_vocab::token_fim_suf() const { |
| 3414 | return pimpl->special_fim_suf_id; |
| 3415 | } |
| 3416 | |
| 3417 | llama_token llama_vocab::token_fim_mid() const { |
| 3418 | return pimpl->special_fim_mid_id; |
| 3419 | } |
| 3420 | |
| 3421 | llama_token llama_vocab::token_fim_pad() const { |
| 3422 | return pimpl->special_fim_pad_id; |
| 3423 | } |
| 3424 | |
| 3425 | llama_token llama_vocab::token_fim_rep() const { |
| 3426 | return pimpl->special_fim_rep_id; |
| 3427 | } |
| 3428 | |
| 3429 | llama_token llama_vocab::token_fim_sep() const { |
| 3430 | return pimpl->special_fim_sep_id; |
| 3431 | } |
| 3432 | |
| 3433 | llama_token llama_vocab::token_mask() const { |
| 3434 | return pimpl->special_mask_id; |
| 3435 | } |
| 3436 | |
| 3437 | bool llama_vocab::get_add_space_prefix() const { |
| 3438 | return pimpl->add_space_prefix; |
| 3439 | } |
| 3440 | |
| 3441 | bool llama_vocab::get_add_bos() const { |
| 3442 | return pimpl->add_bos; |
| 3443 | } |
| 3444 | |
| 3445 | bool llama_vocab::get_add_eos() const { |
| 3446 | return pimpl->add_eos; |
| 3447 | } |
| 3448 | |
| 3449 | bool llama_vocab::get_add_sep() const { |
| 3450 | return pimpl->add_sep; |
| 3451 | } |
| 3452 | |
| 3453 | bool llama_vocab::get_ignore_merges() const { |
| 3454 | return pimpl->ignore_merges; |
| 3455 | } |
| 3456 | |
| 3457 | bool llama_vocab::get_clean_spaces() const { |
| 3458 | return pimpl->clean_spaces; |
| 3459 | } |
| 3460 | |
| 3461 | bool llama_vocab::get_remove_extra_whitespaces() const {
| 3462 | return pimpl->remove_extra_whitespaces; |
| 3463 | } |
| 3464 | |
| 3465 | bool llama_vocab::get_escape_whitespaces() const { |
| 3466 | return pimpl->escape_whitespaces; |
| 3467 | } |
| 3468 | |
| 3469 | bool llama_vocab::get_treat_whitespace_as_suffix() const { |
| 3470 | return pimpl->treat_whitespace_as_suffix; |
| 3471 | } |
| 3472 | |
| 3473 | int llama_vocab::max_token_len() const { |
| 3474 | return pimpl->max_token_len; |
| 3475 | } |
| 3476 | |
| 3477 | int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const { |
| 3478 | GGML_ASSERT(token_left.find(' ') == std::string::npos); |
| 3479 | GGML_ASSERT(token_left.find('\n') == std::string::npos); |
| 3480 | GGML_ASSERT(token_right.find(' ') == std::string::npos); |
| 3481 | GGML_ASSERT(token_right.find('\n') == std::string::npos); |
| 3482 | |
| 3483 | auto it = pimpl->bpe_ranks.find(x: std::make_pair(x: token_left, y: token_right)); |
| 3484 | if (it == pimpl->bpe_ranks.end()) { |
| 3485 | return -1; |
| 3486 | } |
| 3487 | |
| 3488 | return it->second; |
| 3489 | } |
| 3490 | |
| 3491 | std::vector<std::string> llama_vocab::get_bpe_merges() const { |
| 3492 | std::vector<std::string> result(pimpl->bpe_ranks.size()); |
| 3493 | |
| 3494 | for (const auto & pair : pimpl->bpe_ranks) { |
| 3495 | result[pair.second] = pair.first.first + " " + pair.first.second; |
| 3496 | } |
| 3497 | |
| 3498 | return result; |
| 3499 | } |
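
// Illustrative sketch (not part of the build): bpe_ranks maps token pairs to
// their merge rank, so indexing the result vector by rank reconstructs the
// original ordered merge list:
//
//   bpe_ranks       == { ("h","e") -> 0, ("he","llo") -> 1 }
//   get_bpe_merges() == { "h e", "he llo" }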
| 3500 | |
| 3501 | std::vector<char> llama_vocab::get_precompiled_charsmap() const { |
| 3502 | return pimpl->precompiled_charsmap; |
| 3503 | } |
| 3504 | |
| 3505 | int32_t llama_vocab::tokenize( |
| 3506 | const char * text, |
| 3507 | int32_t text_len, |
| 3508 | llama_token * tokens, |
| 3509 | int32_t n_tokens_max, |
| 3510 | bool add_special, |
| 3511 | bool parse_special) const { |
| 3512 | auto res = tokenize(raw_text: std::string(text, text_len), add_special, parse_special); |
| 3513 | if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) { |
| 3514 | LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n" , __func__, res.size()); |
| 3515 | return std::numeric_limits<int32_t>::min(); |
| 3516 | } |
| 3517 | |
| 3518 | if (n_tokens_max < (int) res.size()) { |
| 3519 | // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); |
| 3520 | return -((int) res.size()); |
| 3521 | } |
| 3522 | |
| 3523 | for (size_t i = 0; i < res.size(); i++) { |
| 3524 | tokens[i] = res[i]; |
| 3525 | } |
| 3526 | |
| 3527 | return res.size(); |
| 3528 | } |
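
// Illustrative sketch (not part of the build): callers can use the negative
// return value to size the token buffer (ignoring the INT32_MIN overflow
// sentinel above, which signals an unrepresentable result):
//
//   std::vector<llama_token> toks(16);
//   int32_t n = vocab.tokenize(text, text_len, toks.data(), (int32_t) toks.size(),
//                              /*add_special=*/true, /*parse_special=*/false);
//   if (n < 0) {               // buffer too small: -n tokens are required
//       toks.resize(-n);
//       n = vocab.tokenize(text, text_len, toks.data(), (int32_t) toks.size(), true, false);
//   }
//   toks.resize(n);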
| 3529 | |
| 3530 | std::vector<llama_token> llama_vocab::tokenize( |
| 3531 | const std::string & raw_text, |
| 3532 | bool add_special, |
| 3533 | bool parse_special) const { |
| 3534 | return pimpl->tokenize(raw_text, add_special, parse_special); |
| 3535 | } |
| 3536 | |
| 3537 | const std::string & llama_vocab::token_to_piece(llama_token token) const { |
| 3538 | return pimpl->token_to_piece(token); |
| 3539 | } |
| 3540 | |
| 3541 | int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const { |
| 3542 | return pimpl->token_to_piece(token, buf, length, lstrip, special); |
| 3543 | } |
| 3544 | |
| 3545 | int32_t llama_vocab::detokenize( |
| 3546 | const llama_token * tokens, |
| 3547 | int32_t n_tokens, |
| 3548 | char * text, |
| 3549 | int32_t text_len_max, |
| 3550 | bool remove_special, |
| 3551 | bool unparse_special) const { |
| 3552 | return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special); |
| 3553 | } |
| 3554 | |
| 3555 | std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const { |
| 3556 | std::string text; |
| 3557 | text.resize(n: std::max(a: text.capacity(), b: tokens.size())); |
| 3558 | int32_t n_chars = detokenize(tokens: tokens.data(), n_tokens: (int32_t)tokens.size(), text: &text[0], text_len_max: (int32_t)text.size(), remove_special: false, unparse_special: special); |
| 3559 | if (n_chars < 0) { |
| 3560 | text.resize(n: -n_chars); |
| 3561 | n_chars = detokenize(tokens: tokens.data(), n_tokens: (int32_t)tokens.size(), text: &text[0], text_len_max: (int32_t)text.size(), remove_special: false, unparse_special: special); |
| 3562 | GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization |
| 3563 | } |
| 3564 | |
| 3565 | text.resize(n: n_chars); |
| 3566 | |
| 3567 | // NOTE: the original tokenizer decodes bytes after collecting the pieces. |
| 3568 | return text; |
| 3569 | } |
| 3570 | |
| 3571 | void llama_vocab::print_info() const { |
| 3572 | pimpl->print_info(); |
| 3573 | } |
| 3574 | |
| 3575 | // |
| 3576 | // interface implementation |
| 3577 | // |
| 3578 | |
| 3579 | int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) { |
| 3580 | return vocab->n_tokens(); |
| 3581 | } |
| 3582 | |
| 3583 | // deprecated |
| 3584 | int32_t llama_n_vocab(const struct llama_vocab * vocab) { |
| 3585 | return llama_vocab_n_tokens(vocab); |
| 3586 | } |
| 3587 | |
| 3588 | enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) { |
| 3589 | return vocab->get_type(); |
| 3590 | } |
| 3591 | |
| 3592 | const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) { |
| 3593 | return vocab->token_get_text(id: token); |
| 3594 | } |
| 3595 | |
| 3596 | float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) { |
| 3597 | return vocab->token_get_score(id: token); |
| 3598 | } |
| 3599 | |
| 3600 | enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) { |
| 3601 | return vocab->token_get_attr(id: token); |
| 3602 | } |
| 3603 | |
| 3604 | bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) { |
| 3605 | return vocab->is_eog(id: token); |
| 3606 | } |
| 3607 | |
| 3608 | bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) { |
| 3609 | return vocab->is_control(id: token); |
| 3610 | } |
| 3611 | |
| 3612 | llama_token llama_vocab_bos(const struct llama_vocab * vocab) { |
| 3613 | return vocab->token_bos(); |
| 3614 | } |
| 3615 | |
| 3616 | llama_token llama_vocab_eos(const struct llama_vocab * vocab) { |
| 3617 | return vocab->token_eos(); |
| 3618 | } |
| 3619 | |
| 3620 | llama_token llama_vocab_eot(const struct llama_vocab * vocab) { |
| 3621 | return vocab->token_eot(); |
| 3622 | } |
| 3623 | |
| 3624 | // deprecated |
| 3625 | llama_token llama_vocab_cls(const struct llama_vocab * vocab) { |
| 3626 | return vocab->token_bos(); |
| 3627 | } |
| 3628 | |
| 3629 | llama_token llama_vocab_sep(const struct llama_vocab * vocab) { |
| 3630 | return vocab->token_sep(); |
| 3631 | } |
| 3632 | |
| 3633 | llama_token llama_vocab_nl (const struct llama_vocab * vocab) { |
| 3634 | return vocab->token_nl(); |
| 3635 | } |
| 3636 | |
| 3637 | llama_token llama_vocab_pad(const struct llama_vocab * vocab) { |
| 3638 | return vocab->token_pad(); |
| 3639 | } |
| 3640 | |
| 3641 | bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) { |
| 3642 | return vocab->get_add_bos(); |
| 3643 | } |
| 3644 | |
| 3645 | bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) { |
| 3646 | return vocab->get_add_eos(); |
| 3647 | } |
| 3648 | |
| 3649 | bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) { |
| 3650 | return vocab->get_add_sep(); |
| 3651 | } |
| 3652 | |
| 3653 | llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) { |
| 3654 | return vocab->token_fim_pre(); |
| 3655 | } |
| 3656 | |
| 3657 | llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) { |
| 3658 | return vocab->token_fim_suf(); |
| 3659 | } |
| 3660 | |
| 3661 | llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) { |
| 3662 | return vocab->token_fim_mid(); |
| 3663 | } |
| 3664 | |
| 3665 | llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) { |
| 3666 | return vocab->token_fim_pad(); |
| 3667 | } |
| 3668 | |
| 3669 | llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) { |
| 3670 | return vocab->token_fim_rep(); |
| 3671 | } |
| 3672 | |
| 3673 | llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) { |
| 3674 | return vocab->token_fim_sep(); |
| 3675 | } |
| 3676 | |
| 3677 | llama_token llama_vocab_mask(const struct llama_vocab * vocab) {
| 3678 | return vocab->token_mask(); |
| 3679 | } |
| 3680 | |
| 3681 | // deprecated |
| 3682 | const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) { |
| 3683 | return llama_vocab_get_text(vocab, token); |
| 3684 | } |
| 3685 | |
| 3686 | // deprecated |
| 3687 | float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) { |
| 3688 | return llama_vocab_get_score(vocab, token); |
| 3689 | } |
| 3690 | |
| 3691 | // deprecated |
| 3692 | enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) { |
| 3693 | return llama_vocab_get_attr(vocab, token); |
| 3694 | } |
| 3695 | |
| 3696 | // deprecated |
| 3697 | bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) { |
| 3698 | return llama_vocab_is_eog(vocab, token); |
| 3699 | } |
| 3700 | |
| 3701 | // deprecated |
| 3702 | bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) { |
| 3703 | return llama_vocab_is_control(vocab, token); |
| 3704 | } |
| 3705 | |
| 3706 | // deprecated |
| 3707 | llama_token llama_token_bos(const struct llama_vocab * vocab) { |
| 3708 | return llama_vocab_bos(vocab); |
| 3709 | } |
| 3710 | |
| 3711 | // deprecated |
| 3712 | llama_token llama_token_eos(const struct llama_vocab * vocab) { |
| 3713 | return llama_vocab_eos(vocab); |
| 3714 | } |
| 3715 | |
| 3716 | // deprecated |
| 3717 | llama_token llama_token_eot(const struct llama_vocab * vocab) { |
| 3718 | return llama_vocab_eot(vocab); |
| 3719 | } |
| 3720 | |
| 3721 | // deprecated |
| 3722 | llama_token llama_token_cls(const struct llama_vocab * vocab) { |
| 3723 | //return llama_vocab_cls(vocab); |
| 3724 | return llama_vocab_bos(vocab); // avoid deprecation warning |
| 3725 | } |
| 3726 | |
| 3727 | // deprecated |
| 3728 | llama_token llama_token_sep(const struct llama_vocab * vocab) { |
| 3729 | return llama_vocab_sep(vocab); |
| 3730 | } |
| 3731 | |
| 3732 | // deprecated |
| 3733 | llama_token llama_token_nl (const struct llama_vocab * vocab) { |
| 3734 | return llama_vocab_nl(vocab); |
| 3735 | } |
| 3736 | |
| 3737 | // deprecated |
| 3738 | llama_token llama_token_pad(const struct llama_vocab * vocab) { |
| 3739 | return llama_vocab_pad(vocab); |
| 3740 | } |
| 3741 | |
| 3742 | // deprecated |
| 3743 | bool llama_add_bos_token(const struct llama_vocab * vocab) { |
| 3744 | return llama_vocab_get_add_bos(vocab); |
| 3745 | } |
| 3746 | |
| 3747 | // deprecated |
| 3748 | bool llama_add_eos_token(const struct llama_vocab * vocab) { |
| 3749 | return llama_vocab_get_add_eos(vocab); |
| 3750 | } |
| 3751 | |
| 3752 | // deprecated |
| 3753 | llama_token llama_token_fim_pre(const struct llama_vocab * vocab) { |
| 3754 | return llama_vocab_fim_pre(vocab); |
| 3755 | } |
| 3756 | |
| 3757 | // deprecated |
| 3758 | llama_token llama_token_fim_suf(const struct llama_vocab * vocab) { |
| 3759 | return llama_vocab_fim_suf(vocab); |
| 3760 | } |
| 3761 | |
| 3762 | // deprecated |
| 3763 | llama_token llama_token_fim_mid(const struct llama_vocab * vocab) { |
| 3764 | return llama_vocab_fim_mid(vocab); |
| 3765 | } |
| 3766 | |
| 3767 | // deprecated |
| 3768 | llama_token llama_token_fim_pad(const struct llama_vocab * vocab) { |
| 3769 | return llama_vocab_fim_pad(vocab); |
| 3770 | } |
| 3771 | |
| 3772 | // deprecated |
| 3773 | llama_token llama_token_fim_rep(const struct llama_vocab * vocab) { |
| 3774 | return llama_vocab_fim_rep(vocab); |
| 3775 | } |
| 3776 | |
| 3777 | // deprecated |
| 3778 | llama_token llama_token_fim_sep(const struct llama_vocab * vocab) { |
| 3779 | return llama_vocab_fim_sep(vocab); |
| 3780 | } |
| 3781 | |
| 3782 | // |
| 3783 | // tokenization |
| 3784 | // |
| 3785 | |
| 3786 | int32_t llama_tokenize( |
| 3787 | const struct llama_vocab * vocab, |
| 3788 | const char * text, |
| 3789 | int32_t text_len, |
| 3790 | llama_token * tokens, |
| 3791 | int32_t n_tokens_max, |
| 3792 | bool add_special, |
| 3793 | bool parse_special) { |
| 3794 | return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special); |
| 3795 | } |
| 3796 | |
| 3797 | int32_t llama_token_to_piece( |
| 3798 | const struct llama_vocab * vocab, |
| 3799 | llama_token token, |
| 3800 | char * buf, |
| 3801 | int32_t length, |
| 3802 | int32_t lstrip, |
| 3803 | bool special) { |
| 3804 | return vocab->token_to_piece(token, buf, length, lstrip, special); |
| 3805 | } |
| 3806 | |
| 3807 | int32_t llama_detokenize( |
| 3808 | const struct llama_vocab * vocab, |
| 3809 | const llama_token * tokens, |
| 3810 | int32_t n_tokens, |
| 3811 | char * text, |
| 3812 | int32_t text_len_max, |
| 3813 | bool remove_special, |
| 3814 | bool unparse_special) { |
| 3815 | return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special); |
| 3816 | } |
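
// Illustrative sketch (not part of the build): a round trip through the
// public C API above. "vocab" is assumed to come from a loaded model,
// e.g. llama_model_get_vocab(model).
//
//   const char * text = "Hello world";
//   llama_token toks[32];
//   int32_t n_tok = llama_tokenize(vocab, text, (int32_t) strlen(text),
//                                  toks, 32, /*add_special=*/true, /*parse_special=*/false);
//
//   char out[64];
//   int32_t n_chr = llama_detokenize(vocab, toks, n_tok, out, (int32_t) sizeof(out),
//                                    /*remove_special=*/true, /*unparse_special=*/false);
//   // out now holds approximately the original text (modulo tokenizer
//   // normalization such as the space cleanup performed in detokenize())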
| 3817 | |