#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif

#include "unicode.h"
#include "unicode-data.h"

#include <algorithm>
#include <cassert>
#include <codecvt>
#include <cstddef>
#include <cstdint>
#include <locale>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

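// number of bytes in a UTF-8 sequence, derived from the high nibble of the first byte:
// 0xxx -> 1 (ASCII), 110x -> 2, 1110 -> 3, 1111 -> 4; continuation bytes (10xx) also map
// to 1 so that malformed input still advances by one byte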
size_t unicode_len_utf8(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    std::string result;
    for (size_t i = 0; i < cps.size(); ++i) {
        result.append(unicode_cpt_to_utf8(cps[i]));
    }
    return result;
}

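// decode a single codepoint from a UTF-8 string starting at `offset`;
// advances `offset` past the decoded sequence and throws std::invalid_argument on
// truncated or malformed input (bad lead byte or missing 0x80 continuation bytes)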
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());
    if (!(utf8[offset + 0] & 0x80)) {
        auto result = utf8[offset + 0];
        offset += 1;
        return result;
    }
    if (!(utf8[offset + 0] & 0x40)) {
        throw std::invalid_argument("invalid character");
    }
    if (!(utf8[offset + 0] & 0x20)) {
        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
        }
        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
        offset += 2;
        return result;
    }
    if (!(utf8[offset + 0] & 0x10)) {
        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
        }
        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
        offset += 3;
        return result;
    }
    if (!(utf8[offset + 0] & 0x08)) {
        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
        }
        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
        offset += 4;
        return result;
    }
    throw std::invalid_argument("failed to convert utf8 to codepoint");
}

//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
//    std::vector<uint16_t> result;
//    if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
//        result.emplace_back(cpt);
//        return result;
//    }
//    if (0x10000 <= cpt && cpt <= 0x10ffff) {
//        result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
//        result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
//        return result;
//    }
//    throw std::invalid_argument("failed to convert codepoint to utf16");
//}

//static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
//    std::vector<uint16_t> result;
//    for (size_t i = 0; i < cps.size(); ++i) {
//        auto temp = unicode_cpt_to_utf16(cps[i]);
//        result.insert(result.end(), temp.begin(), temp.end());
//    }
//    return result;
//}

//static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
//    assert(offset < utf16.size());
//    if (((utf16[0] >> 10) << 10) != 0xd800) {
//        auto result = utf16[offset + 0];
//        offset += 1;
//        return result;
//    }
//
//    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
//        throw std::invalid_argument("invalid character");
//    }
//
//    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
//    offset += 2;
//    return result;
//}

//static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
//    std::vector<uint32_t> result;
//    size_t offset = 0;
//    while (offset < utf16.size()) {
//        result.push_back(unicode_cpt_from_utf16(utf16, offset));
//    }
//    return result;
//}

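// build a flags table for every codepoint in [0, MAX_CODEPOINTS):
// category flags come from the unicode_ranges_flags table (each entry marks the start of a range),
// then the whitespace/lowercase/uppercase/NFD bits are filled in from the generated sets and maps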
static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
    std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);

    assert (unicode_ranges_flags.begin()[0].first == 0);
    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
    for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
        const auto range_ini = unicode_ranges_flags.begin()[i-1];  // codepoint_ini, flags
        const auto range_end = unicode_ranges_flags.begin()[i];    // codepoint_end, flags
        for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
            cpt_flags[cpt] = range_ini.second;
        }
    }

    for (auto cpt : unicode_set_whitespace) {
        cpt_flags[cpt].is_whitespace = true;
    }

    for (auto p : unicode_map_lowercase) {
        cpt_flags[p.second].is_lowercase = true;
    }

    for (auto p : unicode_map_uppercase) {
        cpt_flags[p.second].is_uppercase = true;
    }

    for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
        cpt_flags[range.nfd].is_nfd = true;
    }

    return cpt_flags;
}

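// byte-to-unicode map in the style of GPT-2 byte-level BPE: printable bytes map to themselves,
// while the remaining bytes are remapped to codepoints 256 + n, so every byte has a visible,
// reversible UTF-8 representation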
static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
    std::unordered_map<uint8_t, std::string> map;
    for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
        assert(0 <= ch && ch < 256);
        map[ch] = unicode_cpt_to_utf8(ch);
    }
    for (int ch = 0xA1; ch <= 0xAC; ++ch) {  // u'¡' to u'¬'
        assert(0 <= ch && ch < 256);
        map[ch] = unicode_cpt_to_utf8(ch);
    }
    for (int ch = 0xAE; ch <= 0xFF; ++ch) {  // u'®' to u'ÿ'
        assert(0 <= ch && ch < 256);
        map[ch] = unicode_cpt_to_utf8(ch);
    }
    auto n = 0;
    for (int ch = 0; ch < 256; ++ch) {
        if (map.find(ch) == map.end()) {
            map[ch] = unicode_cpt_to_utf8(256 + n);
            ++n;
        }
    }
    return map;
}

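// inverse of unicode_byte_to_utf8_map(): maps the UTF-8 representation back to the original byte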
static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
    std::unordered_map<std::string, uint8_t> map;
    for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
        assert(0 <= ch && ch < 256);
        map[unicode_cpt_to_utf8(ch)] = ch;
    }
    for (int ch = 0xA1; ch <= 0xAC; ++ch) {  // u'¡' to u'¬'
        assert(0 <= ch && ch < 256);
        map[unicode_cpt_to_utf8(ch)] = ch;
    }
    for (int ch = 0xAE; ch <= 0xFF; ++ch) {  // u'®' to u'ÿ'
        assert(0 <= ch && ch < 256);
        map[unicode_cpt_to_utf8(ch)] = ch;
    }
    auto n = 0;
    for (int ch = 0; ch < 256; ++ch) {
        if (map.find(unicode_cpt_to_utf8(ch)) == map.end()) {
            map[unicode_cpt_to_utf8(256 + n)] = ch;
            ++n;
        }
    }
    return map;
}

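// convert a UTF-8 string to a wide string via std::wstring_convert;
// deprecated since C++17 but still the most portable option here, so the warning is silenced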
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#if defined(__clang__)
    // disable C++17 deprecation warning for std::codecvt_utf8
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
#    pragma GCC diagnostic push
#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif

    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;

#if defined(__clang__)
#    pragma clang diagnostic pop
#elif defined(__GNUC__)
#    pragma GCC diagnostic pop
#endif

    return conv.from_bytes(s);
}

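// apply the byte-level encoding to each word: re-serialize the codepoints to UTF-8,
// then replace every byte with its unicode_byte_to_utf8() representation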
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
    std::vector<std::string> bpe_encoded_words;
    for (const auto & word : bpe_words) {
        std::string text_utf;
        auto utf_word = unicode_cpts_from_utf8(word);
        for (size_t i = 0; i < utf_word.size(); ++i) {
            text_utf += unicode_cpt_to_utf8(utf_word[i]);
        }

        std::string encoded_token;
        for (char & c : text_utf) {
            encoded_token += unicode_byte_to_utf8(c);
        }
        bpe_encoded_words.emplace_back(encoded_token);
    }
    return bpe_encoded_words;
}

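// The custom splitters below re-split text that has already been partitioned:
// `offsets` holds the codepoint length of each existing chunk and the returned vector holds
// the codepoint lengths of the finer-grained tokens. Inside each chunk, the _get_cpt/_get_flags
// helpers return OUT_OF_RANGE / empty flags past the chunk end, and _add_token() records the
// length of the span since the previous token boundary.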
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

    const auto cpts = unicode_cpts_from_utf8(text);

    size_t start = 0;
    for (auto offset : offsets) {
        const size_t offset_ini = start;
        const size_t offset_end = start + offset;
        assert(offset_end <= cpts.size());
        start = offset_end;

        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
        };

        size_t _prev_end = offset_ini;
        auto _add_token = [&] (const size_t end) -> size_t {
            assert(_prev_end <= end && end <= offset_end);
            size_t len = end - _prev_end;
            if (len > 0) {
                bpe_offsets.push_back(len);
            }
            _prev_end = end;
            //if (len > 0) {
            //    std::string s = "";
            //    for(size_t p = end-len; p < end; p++)
            //        s += unicode_cpt_to_utf8(cpts[p]);
            //    printf(">>> '%s'\n", s.c_str());
            //}
            return len;
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
            const uint32_t cpt = _get_cpt(pos);
            const auto flags = _get_flags(pos);

            // regex: 's|'t|'re|'ve|'m|'ll|'d
            if (cpt == '\'' && pos+1 < offset_end) {
                uint32_t cpt_next = _get_cpt(pos+1);
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
                    uint32_t cpt_next_next = _get_cpt(pos+2);
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
                        pos += _add_token(pos+3);
                        continue;
                    }
                }
            }

            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
            // regex: <space>?\p{L}+
            if (flags2.is_letter) {
                pos += (cpt == ' ');
                while (flags2.is_letter) {
                    flags2 = _get_flags(++pos);
                }
                _add_token(pos);
                continue;
            }
            // regex: <space>?\p{N}+
            if (flags2.is_number) {
                pos += (cpt == ' ');
                while (flags2.is_number) {
                    flags2 = _get_flags(++pos);
                }
                _add_token(pos);
                continue;
            }
            // regex: <space>?[^\s\p{L}\p{N}]+
            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                pos += (cpt == ' ');
                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                    flags2 = _get_flags(++pos);
                }
                _add_token(pos);
                continue;
            }

            size_t num_whitespaces = 0;
            while (_get_flags(pos+num_whitespaces).is_whitespace) {
                num_whitespaces++;
            }

            // regex: \s+(?!\S)
            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
            }

            // regex: \s+
            if (num_whitespaces > 0) {
                pos += num_whitespaces;
                _add_token(pos);
                continue;
            }

            // no matches
            _add_token(++pos);
        }
    }

    return bpe_offsets;
}

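// Same overall structure as the GPT-2 splitter above, adapted to the LLaMA-3 pretokenizer:
// contractions are matched case-insensitively, numbers are split into groups of at most three
// digits, and runs of whitespace that end in \r or \n are kept together.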
// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

    const auto cpts = unicode_cpts_from_utf8(text);

    size_t start = 0;
    for (auto offset : offsets) {
        const size_t offset_ini = start;
        const size_t offset_end = start + offset;
        assert(offset_end <= cpts.size());
        start = offset_end;

        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
        };

        size_t _prev_end = offset_ini;
        auto _add_token = [&] (const size_t end) -> size_t {
            assert(_prev_end <= end && end <= offset_end);
            size_t len = end - _prev_end;
            if (len > 0) {
                bpe_offsets.push_back(len);
            }
            _prev_end = end;
            //if (len > 0) {
            //    std::string s = "";
            //    for(size_t p = end-len; p < end; p++)
            //        s += unicode_cpt_to_utf8(cpts[p]);
            //    printf(">>> '%s'\n", s.c_str());
            //}
            return len;
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
            const uint32_t cpt = _get_cpt(pos);
            const auto flags = _get_flags(pos);

            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
            if (cpt == '\'' && pos+1 < offset_end) {
                uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
                    uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
                        pos += _add_token(pos+3);
                        continue;
                    }
                }
            }

            // regex: [^\r\n\p{L}\p{N}]?\p{L}+
            if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
                if (flags.is_letter || _get_flags(pos+1).is_letter) {  // one or more letters
                    pos++;
                    while (_get_flags(pos).is_letter) {
                        pos++;
                    }
                    _add_token(pos);
                    continue;
                }
            }

            // regex: \p{N}{1,3}
            if (flags.is_number) {
                size_t ini = pos;
                while (_get_flags(pos).is_number) {
                    if (++pos - ini >= 3) {
                        _add_token(pos);
                        ini = pos;
                    }
                }
                _add_token(pos);
                continue;
            }

            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
                pos += (cpt == ' ');
                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                    flags2 = _get_flags(++pos);
                }
                uint32_t cpt2 = _get_cpt(pos);
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
                _add_token(pos);
                continue;
            }

            size_t num_whitespaces = 0;
            size_t last_end_r_or_n = 0;
            while (_get_flags(pos+num_whitespaces).is_whitespace) {
                uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
                if (cpt2 == '\r' || cpt2 == '\n') {
                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
                num_whitespaces++;
            }

            // regex: \s*[\r\n]+
            if (last_end_r_or_n > 0) {
                pos = last_end_r_or_n;
                _add_token(pos);
                continue;
            }

            // regex: \s+(?!\S)
            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
            }

            // regex: \s+
            if (num_whitespaces > 0) {
                pos += num_whitespaces;
                _add_token(pos);
                continue;
            }

            // no matches
            _add_token(++pos);
        }
    }

    return bpe_offsets;
}

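// Fallback splitters based on std::regex / std::wregex: each existing chunk is scanned with the
// compiled expression, and both the matches and the unmatched gaps between them are emitted as
// separate offsets, so no input is ever dropped.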
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
    std::wregex expr(regex_expr);
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
    size_t start = 0;
    for (auto offset : offsets) {
        std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
        std::wcregex_iterator end;

        int64_t start_idx = 0;
        while (it != end) {
            std::wcmatch match = *it;
            if (match.position() > start_idx) {
                bpe_offsets.emplace_back(match.position() - start_idx);
            }
            bpe_offsets.emplace_back(match.length());
            start_idx = match.position() + match.length();
            ++it;
        }

        if (start_idx < (int64_t) offset) {
            bpe_offsets.emplace_back(offset - start_idx);
        }
        start += offset;
    }

    return bpe_offsets;
}

// use std::regex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::regex expr(regex_expr);
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
    size_t start = 0;
    for (auto offset : offsets) {
        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
        std::cregex_iterator end;

        int64_t start_idx = 0;
        while (it != end) {
            std::cmatch match = *it;
            if (match.position() > start_idx) {
                bpe_offsets.emplace_back(match.position() - start_idx);
            }
            bpe_offsets.emplace_back(match.length());
            start_idx = match.position() + match.length();
            ++it;
        }

        if (start_idx < (int64_t) offset) {
            bpe_offsets.emplace_back(offset - start_idx);
        }
        start += offset;
    }

    return bpe_offsets;
}

// K2 system regex patterns (from tokenization_kimi.py):
// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;
    bpe_offsets.reserve(offsets.size());

    const auto cpts = unicode_cpts_from_utf8(text);

    size_t start = 0;
    for (auto offset : offsets) {
        const size_t offset_ini = start;
        const size_t offset_end = start + offset;
        assert(offset_end <= cpts.size());
        start = offset_end;

        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
        };

        size_t _prev_end = offset_ini;
        auto _add_token = [&] (const size_t end) -> size_t {
            assert(_prev_end <= end && end <= offset_end);
            size_t len = end - _prev_end;
            if (len > 0) {
                bpe_offsets.push_back(len);
            }
            _prev_end = end;
            return len;
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
            const uint32_t cpt = _get_cpt(pos);
            const auto flags = _get_flags(pos);

            // Pattern 1: [\p{Han}]+ (Chinese characters)
            if (unicode_cpt_is_han(cpt)) {
                while (unicode_cpt_is_han(_get_cpt(pos))) {
                    pos++;
                }
                _add_token(pos);
                continue;
            }

            // Pattern 2 & 3: Letter words excluding Han characters with optional contractions
            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
            // Check if current char is a letter OR if current char could be a leading char and next char is a letter
            bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
                                     (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
                                      _get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));

            if (is_letter_pattern) {
                // Handle optional leading non-letter/non-number character
                bool has_leading_char = false;
                if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {
                    has_leading_char = true;
                    pos++;
                }

                // Match letter sequence (excluding Han characters)
                bool has_letters = false;
                while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
                    has_letters = true;
                    pos++;
                }

                // Only proceed if we found letters (after potentially skipping leading char)
                if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) {
                    if (!has_letters) pos++; // consume the first letter if we didn't already

                    // Continue consuming letters
                    while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
                        pos++;
                    }

                    // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d)
                    if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) {
                        uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1));
                        if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                            pos += 2;
                        } else if (pos + 2 < offset_end) {
                            uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2));
                            if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                                (cpt_next == 'v' && cpt_next_next == 'e') ||
                                (cpt_next == 'l' && cpt_next_next == 'l')) {
                                pos += 3;
                            }
                        }
                    }

                    _add_token(pos);
                    continue;
                } else if (has_leading_char) {
                    // We consumed a leading char but found no letters, backtrack
                    pos--;
                }
            }

            // Pattern 4: \p{N}{1,3} (numbers 1-3 digits)
            if (flags.is_number) {
                size_t ini = pos;
                while (_get_flags(pos).is_number) {
                    if (++pos - ini >= 3) {
                        _add_token(pos);
                        ini = pos;
                    }
                }
                _add_token(pos);
                continue;
            }

            // Pattern 5:  ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
            auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags);
            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
                pos += (cpt == ' ');
                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
                    flags2 = _get_flags(++pos);
                }
                // Match optional [\r\n]*
                uint32_t cpt2 = _get_cpt(pos);
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
                _add_token(pos);
                continue;
            }

            // Count whitespace characters
            size_t num_whitespaces = 0;
            size_t last_end_r_or_n = 0;
            while (_get_flags(pos + num_whitespaces).is_whitespace) {
                uint32_t cpt2 = _get_cpt(pos + num_whitespaces);
                if (cpt2 == '\r' || cpt2 == '\n') {
                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
                num_whitespaces++;
            }

            // Pattern 6: \s*[\r\n]+ (whitespace with newlines)
            if (last_end_r_or_n > 0) {
                pos = last_end_r_or_n;
                _add_token(pos);
                continue;
            }

            // Pattern 7: \s+(?!\S) (trailing whitespace)
            if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
            }

            // Pattern 8: \s+ (general whitespace)
            if (num_whitespaces > 0) {
                pos += num_whitespaces;
                _add_token(pos);
                continue;
            }

            // No matches - consume single character
            _add_token(++pos);
        }
    }

    return bpe_offsets;
}

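// dispatch to a hand-written splitter when the tokenizer regex is one of the known
// GPT-2 / LLaMA-3 / Kimi-K2 pretokenizer patterns; returns an empty vector otherwise
// so the caller can fall back to std::regex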
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
    } else if (
            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {

        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
    } else if (regex_expr == "\\p{Han}+") {
        // K2's first pattern - handle all K2 patterns together
        bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
    }

    return bpe_offsets;
}

//
// interface
//

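// encode a single codepoint as UTF-8 (1-4 bytes); throws std::invalid_argument for
// codepoints above 0x10FFFF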
std::string unicode_cpt_to_utf8(uint32_t cpt) {
    std::string result;

    if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
        result.push_back(cpt);
        return result;
    }
    if (0x80 <= cpt && cpt <= 0x7ff) {
        result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
        result.push_back(0x80 | (cpt & 0x3f));
        return result;
    }
    if (0x800 <= cpt && cpt <= 0xffff) {
        result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
        result.push_back(0x80 | (cpt & 0x3f));
        return result;
    }
    if (0x10000 <= cpt && cpt <= 0x10ffff) {
        result.push_back(0xf0 | ((cpt >> 18) & 0x07));
        result.push_back(0x80 | ((cpt >> 12) & 0x3f));
        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
        result.push_back(0x80 | (cpt & 0x3f));
        return result;
    }

    throw std::invalid_argument("invalid codepoint");
}

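// apply the single-codepoint NFD table: for each codepoint, binary-search unicode_ranges_nfd
// and substitute the mapped decomposition base if the codepoint falls inside a known range,
// otherwise keep it unchanged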
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
    auto comp = [] (const uint32_t cpt, const range_nfd & range) {
        return cpt < range.first;
    };
    std::vector<uint32_t> result(cpts.size());
    for (size_t i = 0; i < cpts.size(); ++i) {
        const uint32_t cpt = cpts[i];
        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
        result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
    }
    return result;
}

std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
    std::vector<uint32_t> result;
    result.reserve(utf8.size());
    size_t offset = 0;
    while (offset < utf8.size()) {
        try {
            result.push_back(unicode_cpt_from_utf8(utf8, offset));
        }
        catch (const std::invalid_argument & /*ex*/) {
            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
            ++offset;
            result.emplace_back(0xFFFD); // replacement character
        }
    }
    return result;
}

unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
    static const auto cpt_flags = unicode_cpt_flags_array();
    return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
}

unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
    if (utf8.empty()) {
        return undef; // undefined
    }
    size_t offset = 0;
    return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
}

std::string unicode_byte_to_utf8(uint8_t byte) {
    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
    return map.at(byte);
}

uint8_t unicode_utf8_to_byte(const std::string & utf8) {
    static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
    return map.at(utf8);
}

uint32_t unicode_tolower(uint32_t cpt) {
    // binary search
    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
            return pair.first < value;
        });
    if (it != unicode_map_lowercase.end() && it->first == cpt) {
        return it->second;
    }
    return cpt; // Return the original code point if no lowercase mapping is found
}

bool unicode_cpt_is_han(uint32_t cpt) {
    // Han character ranges (Chinese/CJK characters)
    // CJK Unified Ideographs (most common)
    if (cpt >= 0x4E00 && cpt <= 0x9FFF) return true;

    // CJK Extension A
    if (cpt >= 0x3400 && cpt <= 0x4DBF) return true;

    // CJK Extension B
    if (cpt >= 0x20000 && cpt <= 0x2A6DF) return true;

    // CJK Extension C
    if (cpt >= 0x2A700 && cpt <= 0x2B73F) return true;

    // CJK Extension D
    if (cpt >= 0x2B740 && cpt <= 0x2B81F) return true;

    // CJK Extension E
    if (cpt >= 0x2B820 && cpt <= 0x2CEAF) return true;

    // CJK Extension F
    if (cpt >= 0x2CEB0 && cpt <= 0x2EBEF) return true;

    // CJK Compatibility Ideographs
    if (cpt >= 0xF900 && cpt <= 0xFAFF) return true;

    // CJK Compatibility Ideographs Supplement
    if (cpt >= 0x2F800 && cpt <= 0x2FA1F) return true;

    return false;
}

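// split `text` with a sequence of tokenizer regexes. Known pretokenizer patterns are handled by
// the custom splitters above; other patterns fall back to std::regex / std::wregex. Since
// std::regex has no \p{...} support, regexes that use unicode categories are run against a
// "collapsed" copy of the text where every non-ASCII codepoint is replaced by a single byte
// encoding its category (see the ref below).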
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
    // unicode categories
    static const std::map<std::string, int> k_ucat_enum = {
        { "\\p{N}", unicode_cpt_flags::NUMBER },
        { "\\p{L}", unicode_cpt_flags::LETTER },
        { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
        { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
        { "\\p{S}", unicode_cpt_flags::SYMBOL },
    };

    static const std::map<int, int> k_ucat_cpt = {
        { unicode_cpt_flags::NUMBER,      0xD1 },
        { unicode_cpt_flags::LETTER,      0xD2 },
        { unicode_cpt_flags::PUNCTUATION, 0xD3 },
        { unicode_cpt_flags::ACCENT_MARK, 0xD4 },
        { unicode_cpt_flags::SYMBOL,      0xD5 },
    };

    static const std::map<int, std::string> k_ucat_map = {
        { unicode_cpt_flags::NUMBER,      "\x30-\x39" },          // 0-9
        { unicode_cpt_flags::LETTER,      "\x41-\x5A\x61-\x7A" }, // A-Za-z
        { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
        { unicode_cpt_flags::ACCENT_MARK, "" },                   // no sub-128 codepoints
        { unicode_cpt_flags::SYMBOL,      "\\\x24\\\x2B\x3C-\x3E\x5E\x60\\\x7C" }, // $+<=>^`|
    };

    // compute collapsed codepoints only if needed by at least one regex
    bool need_collapse = false;
    for (const auto & regex_expr : regex_exprs) {
        // search for unicode categories
        for (const auto & ucat : k_ucat_enum) {
            if (std::string::npos != regex_expr.find(ucat.first)) {
                need_collapse = true;
                break;
            }
        }
    }

    const auto cpts = unicode_cpts_from_utf8(text);

    // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
    std::string text_collapsed;
    if (need_collapse) {
        // collapse all unicode categories
        text_collapsed.resize(cpts.size());

        for (size_t i = 0; i < cpts.size(); ++i) {
            // keep single-byte codepoints as is
            if (cpts[i] < 128) {
                text_collapsed[i] = cpts[i];
                continue;
            }

            const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);

            if (flags.is_whitespace) {
                // NOTE: C++ std::regex \s does not match 0x85, while the Rust and Python regex engines do
                //text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
                text_collapsed[i] = (char) 0x0B;   // <vertical tab> as whitespace fallback
            } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
                text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
            } else {
                text_collapsed[i] = (char) 0xD0; // fallback
            }
        }
    }

    std::vector<size_t> bpe_offsets = { cpts.size() };

    for (const auto & regex_expr : regex_exprs) {
        // first, see if we have an efficient custom regex implementation
        auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);

        if (!tmp.empty()) {
            bpe_offsets = std::move(tmp);
            continue;
        }

        // fallback to general-purpose std::regex / std::wregex
        try {
            // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
            // with the corresponding collapsed representation
            bool use_collapsed = false;
            for (const auto & ucat : k_ucat_enum) {
                if (std::string::npos != regex_expr.find(ucat.first)) {
                    use_collapsed = true;
                    break;
                }
            }

            if (use_collapsed) {
                // sanity-check that the original regex does not contain any non-ASCII characters
                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
                for (size_t i = 0; i < cpts_regex.size(); ++i) {
                    if (cpts_regex[i] >= 128) {
                        throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
                    }
                }

                // generate a collapsed representation of the regex
                std::string regex_expr_collapsed;

                // track if we are inside [], because nested [] are not allowed
                bool inside = false;
                for (size_t i = 0; i < regex_expr.size(); ++i) {
                    if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
                        regex_expr_collapsed += '[';
                        inside = true;
                        continue;
                    }

                    if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
                        regex_expr_collapsed += ']';
                        inside = false;
                        continue;
                    }

                    if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
                        regex_expr[i + 1] == 'p' &&
                        regex_expr[i + 2] == '{' &&
                        regex_expr[i + 4] == '}') {
                        const std::string pat = regex_expr.substr(i, 5);
                        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
                            if (!inside) {
                                regex_expr_collapsed += '[';
                            }
                            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
                            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
                            if (!inside) {
                                regex_expr_collapsed += ']';
                            }
                            i += 4;
                            continue;
                        }
                    }

                    regex_expr_collapsed += regex_expr[i];
                }

                //printf("text_collapsed: %s\n", text_collapsed.c_str());
                //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
            } else {
                // no unicode category used, we can use std::wregex directly
                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);

                // std::wregex \s does not match non-ASCII whitespace, so use 0x0B as a fallback
                std::wstring wtext(cpts.begin(), cpts.end());
                for (size_t i = 0; i < wtext.size(); ++i) {
                    if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
                        wtext[i] = 0x0B;
                    }
                }

                //printf("text: %s\n", text.c_str());
                //printf("regex_expr: %s\n", regex_expr.c_str());
                bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
            }
        } catch (std::regex_error & e) {
            fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
            fprintf(stderr, "Regex error: %s\n", e.what());
            throw std::runtime_error("Failed to process regex");
        }
    }

    std::vector<std::string> bpe_words;
    bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size

    size_t start = 0;
    for (size_t & offset : bpe_offsets) {
        bpe_words.emplace_back();
        for (size_t i = start; i < start + offset; ++i) {
            bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
        }
        start += offset;
    }

    return unicode_byte_encoding_process(bpe_words);
}