| 1 | #pragma once |
| 2 | |
| 3 | #include <cstdint> |
| 4 | #include <string> |
| 5 | #include <vector> |
| 6 | |
// Per-codepoint Unicode property flags, stored as twelve one-bit fields.
//
// The enum constants define the uint16_t encoding of each flag: this is the value
// accepted by the constructor and produced by as_uint(). Conversion in both directions
// goes through the named bit-fields only, so it does not depend on the host byte order
// or on how the compiler places bit-fields inside the storage unit, and it does not
// type-pun the object through a reinterpret_cast.
struct unicode_cpt_flags {
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // union of the eight category bits above
        WHITESPACE      = 0x0100,
        LOWERCASE       = 0x0200,
        UPPERCASE       = 0x0400,
        NFD             = 0x0800,
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // Decode from the uint16 encoding described by the enum above.
    // Intentionally not `explicit`: a plain flag value converts to unicode_cpt_flags.
    // Bits of `flags` that do not correspond to one of the twelve flags are ignored.
    unicode_cpt_flags(const uint16_t flags = 0)
        : is_undefined  ((flags & UNDEFINED)   != 0)
        , is_number     ((flags & NUMBER)      != 0)
        , is_letter     ((flags & LETTER)      != 0)
        , is_separator  ((flags & SEPARATOR)   != 0)
        , is_accent_mark((flags & ACCENT_MARK) != 0)
        , is_punctuation((flags & PUNCTUATION) != 0)
        , is_symbol     ((flags & SYMBOL)      != 0)
        , is_control    ((flags & CONTROL)     != 0)
        , is_whitespace ((flags & WHITESPACE)  != 0)
        , is_lowercase  ((flags & LOWERCASE)   != 0)
        , is_uppercase  ((flags & UPPERCASE)   != 0)
        , is_nfd        ((flags & NFD)         != 0)
    {}

    // Encode the current field values back into the uint16 representation.
    // Each one-bit field is 0 or 1, so multiplying by its enum constant selects that bit.
    uint16_t as_uint() const {
        return static_cast<uint16_t>(
              is_undefined   * UNDEFINED
            | is_number      * NUMBER
            | is_letter      * LETTER
            | is_separator   * SEPARATOR
            | is_accent_mark * ACCENT_MARK
            | is_punctuation * PUNCTUATION
            | is_symbol      * SYMBOL
            | is_control     * CONTROL
            | is_whitespace  * WHITESPACE
            | is_lowercase   * LOWERCASE
            | is_uppercase   * UPPERCASE
            | is_nfd         * NFD);
    }

    // Only the category bits (those covered by MASK_CATEGORIES) of as_uint().
    uint16_t category_flag() const {
        return static_cast<uint16_t>(as_uint() & MASK_CATEGORIES);
    }
};
| 91 | |
| 92 | size_t unicode_len_utf8(char src); |
| 93 | |
| 94 | std::string unicode_cpt_to_utf8 (uint32_t cpt); |
| 95 | uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset); |
| 96 | |
| 97 | std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8); |
| 98 | |
| 99 | std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts); |
| 100 | |
| 101 | unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt); |
| 102 | unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8); |
| 103 | |
| 104 | std::string unicode_byte_to_utf8(uint8_t byte); |
| 105 | uint8_t unicode_utf8_to_byte(const std::string & utf8); |
| 106 | |
| 107 | uint32_t unicode_tolower(uint32_t cpt); |
| 108 | |
| 109 | bool unicode_cpt_is_han(uint32_t cpt); |
| 110 | |
| 111 | std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs); |
| 112 | |