#include "utf8proc_wrapper.hpp"

#include <memory>
#include <stdexcept>

#include "utf8proc_wrapper.h"
#include "utf8proc.hpp"
4 | |
5 | using namespace duckdb; |
6 | using namespace std; |
7 | |
8 | // This function efficiently checks if a string is valid UTF8. |
9 | // It was originally written by Sjoerd Mullender. |
10 | |
11 | // Here is the table that makes it work: |
12 | |
13 | // B = Number of Bytes in UTF8 encoding |
14 | // C_MIN = First Unicode code point |
15 | // C_MAX = Last Unicode code point |
16 | // B1 = First Byte Prefix |
17 | |
18 | // B C_MIN C_MAX B1 |
19 | // 1 U+000000 U+00007F 0xxxxxxx |
20 | // 2 U+000080 U+0007FF 110xxxxx |
21 | // 3 U+000800 U+00FFFF 1110xxxx |
22 | // 4 U+010000 U+10FFFF 11110xxx |
23 | |
24 | UnicodeType Utf8Proc::Analyze(const char *s, size_t len) { |
25 | UnicodeType type = UnicodeType::ASCII; |
26 | char c; |
27 | for (size_t i = 0; i < len; i++) { |
28 | c = s[i]; |
29 | // 1 Byte / ASCII |
30 | if ((c & 0x80) == 0) |
31 | continue; |
32 | type = UnicodeType::UNICODE; |
33 | if ((s[++i] & 0xC0) != 0x80) |
34 | return UnicodeType::INVALID; |
35 | if ((c & 0xE0) == 0xC0) |
36 | continue; |
37 | if ((s[++i] & 0xC0) != 0x80) |
38 | return UnicodeType::INVALID; |
39 | if ((c & 0xF0) == 0xE0) |
40 | continue; |
41 | if ((s[++i] & 0xC0) != 0x80) |
42 | return UnicodeType::INVALID; |
43 | if ((c & 0xF8) == 0xF0) |
44 | continue; |
45 | return UnicodeType::INVALID; |
46 | } |
47 | |
48 | return type; |
49 | } |
50 | |
51 | |
52 | std::string Utf8Proc::Normalize(std::string s) { |
53 | auto normalized = Normalize(s.c_str()); |
54 | auto res = std::string(normalized); |
55 | free(normalized); |
56 | return res; |
57 | } |
58 | |
59 | char* Utf8Proc::Normalize(const char *s) { |
60 | assert(s); |
61 | assert(Utf8Proc::Analyze(s) != UnicodeType::INVALID); |
62 | return (char*) utf8proc_NFC((const utf8proc_uint8_t*) s); |
63 | } |
64 | |
65 | bool Utf8Proc::IsValid(const char *s, size_t len) { |
66 | return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID; |
67 | } |
68 | |
69 | size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) { |
70 | return utf8proc_next_grapheme(s, len, cpos); |
71 | } |
72 | |
73 | size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) { |
74 | if (!Utf8Proc::IsValid(s, len)) { |
75 | return cpos - 1; |
76 | } |
77 | size_t current_pos = 0; |
78 | while(true) { |
79 | size_t new_pos = NextGraphemeCluster(s, len, current_pos); |
80 | if (new_pos <= current_pos || new_pos >= cpos) { |
81 | return current_pos; |
82 | } |
83 | current_pos = new_pos; |
84 | } |
85 | } |
86 | |
87 | size_t utf8proc_next_grapheme_cluster(const char *s, size_t len, size_t pos) { |
88 | return Utf8Proc::NextGraphemeCluster(s, len, pos); |
89 | } |
90 | |
91 | size_t utf8proc_prev_grapheme_cluster(const char *s, size_t len, size_t pos) { |
92 | return Utf8Proc::PreviousGraphemeCluster(s, len, pos); |
93 | } |
94 | |
95 | size_t utf8proc_render_width(const char *s, size_t len, size_t pos) { |
96 | int sz; |
97 | auto codepoint = utf8proc_codepoint(s + pos, sz); |
98 | auto properties = utf8proc_get_property(codepoint); |
99 | return properties->charwidth; |
100 | } |
101 | |
102 | int utf8proc_is_valid(const char *s, size_t len) { |
103 | return Utf8Proc::IsValid(s, len) ? 1 : 0; |
104 | } |
105 | |