#pragma once

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
6
// Property flags of a single Unicode codepoint.
//
// The properties exist in two forms: as bit masks (the enum below), which define the packed
// uint16_t representation, and as 1-bit fields for direct member access. Conversion between the
// two is done with explicit mask tests, so it does not depend on the target's byte order, on how
// the compiler allocates bit-fields, or on compiler-specific byte-order macros.
struct unicode_cpt_flags {
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // union of the eight category bits above
        WHITESPACE      = 0x0100,
        LOWERCASE       = 0x0200,
        UPPERCASE       = 0x0400,
        NFD             = 0x0800,
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // Decode from the packed uint16 form: each field is set iff its mask bit is set in `flags`.
    // Bits that have no mask above (0xF000) have no field to land in and are ignored.
    // Deliberately non-explicit so that a raw mask value converts implicitly.
    unicode_cpt_flags(const uint16_t flags = 0)
        : is_undefined  ((flags & UNDEFINED)   != 0)
        , is_number     ((flags & NUMBER)      != 0)
        , is_letter     ((flags & LETTER)      != 0)
        , is_separator  ((flags & SEPARATOR)   != 0)
        , is_accent_mark((flags & ACCENT_MARK) != 0)
        , is_punctuation((flags & PUNCTUATION) != 0)
        , is_symbol     ((flags & SYMBOL)      != 0)
        , is_control    ((flags & CONTROL)     != 0)
        , is_whitespace ((flags & WHITESPACE)  != 0)
        , is_lowercase  ((flags & LOWERCASE)   != 0)
        , is_uppercase  ((flags & UPPERCASE)   != 0)
        , is_nfd        ((flags & NFD)         != 0)
    {}

    // Encode back to the packed uint16 form (inverse of the constructor for the 12 defined bits).
    // Each field is 0 or 1, so multiplying by its mask yields either 0 or the mask itself.
    uint16_t as_uint() const {
        return static_cast<uint16_t>(
              is_undefined   * UNDEFINED
            | is_number      * NUMBER
            | is_letter      * LETTER
            | is_separator   * SEPARATOR
            | is_accent_mark * ACCENT_MARK
            | is_punctuation * PUNCTUATION
            | is_symbol      * SYMBOL
            | is_control     * CONTROL
            | is_whitespace  * WHITESPACE
            | is_lowercase   * LOWERCASE
            | is_uppercase   * UPPERCASE
            | is_nfd         * NFD
        );
    }

    // Returns only the category bits (UNDEFINED .. CONTROL) of the packed form.
    uint16_t category_flag() const {
        return static_cast<uint16_t>(as_uint() & MASK_CATEGORIES);
    }
};
91
// UTF-8 <-> codepoint helpers. Only the declarations are visible in this header; the behavior
// notes below are inferred from names and signatures and should be confirmed against the
// implementation.

// Returns a UTF-8 length for `src` (presumably the sequence length implied by a lead byte).
size_t unicode_len_utf8(char src);

// Converts one codepoint to a std::string (by name, its UTF-8 encoding).
std::string unicode_cpt_to_utf8  (uint32_t cpt);
// Reads one codepoint from `utf8`. `offset` is a non-const reference, so the callee is able to
// update it (presumably: read position in, position after the decoded sequence out -- TODO confirm).
uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);

// Whole-string variant: returns a vector of codepoints for `utf8`.
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

// Takes a codepoint sequence and returns a new one (by name, its NFD-normalized form).
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
100
101unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
102unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
103
// Byte <-> string mapping pair. By name these look like inverses of each other; the actual
// mapping table is not visible in this header -- confirm in the implementation.
std::string unicode_byte_to_utf8(uint8_t byte);
uint8_t     unicode_utf8_to_byte(const std::string & utf8);

// Maps a codepoint to a codepoint (by name, its lowercase form; behavior for codepoints without
// a lowercase mapping is not visible here).
uint32_t unicode_tolower(uint32_t cpt);

// Predicate on a codepoint (by name, whether it is a Han character; the exact ranges checked
// are defined in the implementation).
bool unicode_cpt_is_han(uint32_t cpt);

// Splits `text` using the expressions in `regex_exprs` and returns the pieces as strings.
// How multiple expressions are combined is not visible in this header.
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
112