| 1 | #include "utf8proc_wrapper.hpp" |
| 2 | #include "utf8proc_wrapper.h" |
| 3 | #include "utf8proc.hpp" |
| 4 | |
| 5 | using namespace duckdb; |
| 6 | using namespace std; |
| 7 | |
| 8 | // This function efficiently checks if a string is valid UTF8. |
| 9 | // It was originally written by Sjoerd Mullender. |
| 10 | |
| 11 | // Here is the table that makes it work: |
| 12 | |
| 13 | // B = Number of Bytes in UTF8 encoding |
| 14 | // C_MIN = First Unicode code point |
| 15 | // C_MAX = Last Unicode code point |
| 16 | // B1 = First Byte Prefix |
| 17 | |
| 18 | // B C_MIN C_MAX B1 |
| 19 | // 1 U+000000 U+00007F 0xxxxxxx |
| 20 | // 2 U+000080 U+0007FF 110xxxxx |
| 21 | // 3 U+000800 U+00FFFF 1110xxxx |
| 22 | // 4 U+010000 U+10FFFF 11110xxx |
| 23 | |
| 24 | UnicodeType Utf8Proc::Analyze(const char *s, size_t len) { |
| 25 | UnicodeType type = UnicodeType::ASCII; |
| 26 | char c; |
| 27 | for (size_t i = 0; i < len; i++) { |
| 28 | c = s[i]; |
| 29 | // 1 Byte / ASCII |
| 30 | if ((c & 0x80) == 0) |
| 31 | continue; |
| 32 | type = UnicodeType::UNICODE; |
| 33 | if ((s[++i] & 0xC0) != 0x80) |
| 34 | return UnicodeType::INVALID; |
| 35 | if ((c & 0xE0) == 0xC0) |
| 36 | continue; |
| 37 | if ((s[++i] & 0xC0) != 0x80) |
| 38 | return UnicodeType::INVALID; |
| 39 | if ((c & 0xF0) == 0xE0) |
| 40 | continue; |
| 41 | if ((s[++i] & 0xC0) != 0x80) |
| 42 | return UnicodeType::INVALID; |
| 43 | if ((c & 0xF8) == 0xF0) |
| 44 | continue; |
| 45 | return UnicodeType::INVALID; |
| 46 | } |
| 47 | |
| 48 | return type; |
| 49 | } |
| 50 | |
| 51 | |
| 52 | std::string Utf8Proc::Normalize(std::string s) { |
| 53 | auto normalized = Normalize(s.c_str()); |
| 54 | auto res = std::string(normalized); |
| 55 | free(normalized); |
| 56 | return res; |
| 57 | } |
| 58 | |
| 59 | char* Utf8Proc::Normalize(const char *s) { |
| 60 | assert(s); |
| 61 | assert(Utf8Proc::Analyze(s) != UnicodeType::INVALID); |
| 62 | return (char*) utf8proc_NFC((const utf8proc_uint8_t*) s); |
| 63 | } |
| 64 | |
| 65 | bool Utf8Proc::IsValid(const char *s, size_t len) { |
| 66 | return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID; |
| 67 | } |
| 68 | |
| 69 | size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) { |
| 70 | return utf8proc_next_grapheme(s, len, cpos); |
| 71 | } |
| 72 | |
| 73 | size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) { |
| 74 | if (!Utf8Proc::IsValid(s, len)) { |
| 75 | return cpos - 1; |
| 76 | } |
| 77 | size_t current_pos = 0; |
| 78 | while(true) { |
| 79 | size_t new_pos = NextGraphemeCluster(s, len, current_pos); |
| 80 | if (new_pos <= current_pos || new_pos >= cpos) { |
| 81 | return current_pos; |
| 82 | } |
| 83 | current_pos = new_pos; |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | size_t utf8proc_next_grapheme_cluster(const char *s, size_t len, size_t pos) { |
| 88 | return Utf8Proc::NextGraphemeCluster(s, len, pos); |
| 89 | } |
| 90 | |
| 91 | size_t utf8proc_prev_grapheme_cluster(const char *s, size_t len, size_t pos) { |
| 92 | return Utf8Proc::PreviousGraphemeCluster(s, len, pos); |
| 93 | } |
| 94 | |
| 95 | size_t utf8proc_render_width(const char *s, size_t len, size_t pos) { |
| 96 | int sz; |
| 97 | auto codepoint = utf8proc_codepoint(s + pos, sz); |
| 98 | auto properties = utf8proc_get_property(codepoint); |
| 99 | return properties->charwidth; |
| 100 | } |
| 101 | |
| 102 | int utf8proc_is_valid(const char *s, size_t len) { |
| 103 | return Utf8Proc::IsValid(s, len) ? 1 : 0; |
| 104 | } |
| 105 | |