unicode.h source code [llama.cpp/src/unicode.h]

1	#pragma once
2
3	#include <cstdint>
4	#include <string>
5	#include <vector>
6
// Classification flags for a single Unicode codepoint.
//
// The flags live in twelve 1-bit fields; the enum below names the bit each
// field occupies in the packed uint16_t form. Packing and unpacking are done
// field by field through those masks, so the mapping does not depend on host
// byte order or on how the compiler lays out bit-fields.
struct unicode_cpt_flags {
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // all eight category bits above
        WHITESPACE      = 0x0100,
        LOWERCASE       = 0x0200,
        UPPERCASE       = 0x0400,
        NFD             = 0x0800,
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // Decode from the packed uint16 form: each field is set iff its mask bit
    // is set in `flags`. Bits outside the twelve named masks are ignored.
    // Intentionally not `explicit`, so a raw uint16_t (or one of the enum
    // constants) still converts implicitly.
    inline unicode_cpt_flags(const uint16_t flags = 0)
        : is_undefined  ((flags & UNDEFINED)   != 0)
        , is_number     ((flags & NUMBER)      != 0)
        , is_letter     ((flags & LETTER)      != 0)
        , is_separator  ((flags & SEPARATOR)   != 0)
        , is_accent_mark((flags & ACCENT_MARK) != 0)
        , is_punctuation((flags & PUNCTUATION) != 0)
        , is_symbol     ((flags & SYMBOL)      != 0)
        , is_control    ((flags & CONTROL)     != 0)
        , is_whitespace ((flags & WHITESPACE)  != 0)
        , is_lowercase  ((flags & LOWERCASE)   != 0)
        , is_uppercase  ((flags & UPPERCASE)   != 0)
        , is_nfd        ((flags & NFD)         != 0)
    {}

    // Encode back to the packed uint16 form: the inverse of the constructor.
    // Every field is 0 or 1, so multiplying by its mask selects that bit.
    inline uint16_t as_uint() const {
        return static_cast<uint16_t>(
              is_undefined   * UNDEFINED
            | is_number      * NUMBER
            | is_letter      * LETTER
            | is_separator   * SEPARATOR
            | is_accent_mark * ACCENT_MARK
            | is_punctuation * PUNCTUATION
            | is_symbol      * SYMBOL
            | is_control     * CONTROL
            | is_whitespace  * WHITESPACE
            | is_lowercase   * LOWERCASE
            | is_uppercase   * UPPERCASE
            | is_nfd         * NFD
        );
    }

    // Packed value restricted to the category bits (MASK_CATEGORIES); the
    // whitespace/lowercase/uppercase/nfd helper bits are cleared.
    inline uint16_t category_flag() const {
        return static_cast<uint16_t>(this->as_uint() & MASK_CATEGORIES);
    }
};
91
// Takes a single char and returns a size_t.
// NOTE(review): presumably the byte length of the UTF-8 sequence introduced by
// lead byte `src`; the definition is not visible here, so confirm.
size_t unicode_len_utf8(char src);

// Conversions between a single codepoint and UTF-8 text.
// `offset` is a non-const reference, so unicode_cpt_from_utf8 is able to update
// the caller's position (presumably advancing past the decoded sequence; verify
// against the definition).
std::string unicode_cpt_to_utf8  (uint32_t cpt);
uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);

// Takes a UTF-8 string and returns a vector of 32-bit codepoints.
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

// Takes a codepoint sequence and returns a new one; the input is not modified.
// NOTE(review): the name indicates NFD normalization; confirm in the definition.
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
100
101	unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
102	unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
103
// Mapping between a single byte value and a UTF-8 string, in both directions.
// NOTE(review): the mapping table itself is defined elsewhere; presumably the
// two functions are inverses of each other, to be confirmed.
std::string unicode_byte_to_utf8(uint8_t byte);
uint8_t     unicode_utf8_to_byte(const std::string & utf8);

// Takes a codepoint and returns a codepoint.
// NOTE(review): the name indicates a lowercase mapping; the result for
// codepoints without one is not visible here.
uint32_t unicode_tolower(uint32_t cpt);

// Predicate on a single codepoint.
// NOTE(review): presumably tests membership in the Han script ranges; confirm.
bool unicode_cpt_is_han(uint32_t cpt);

// Takes a text and a list of regex expression strings, and returns a list of
// strings.
// NOTE(review): the order in which `regex_exprs` are applied and the supported
// regex dialect are defined by the implementation, not by this header.
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
112

Browse the source code of llama.cpp/src/unicode.h