1#include "utf8proc_wrapper.hpp"
2#include "utf8proc.hpp"
3
4using namespace std;
5
6namespace duckdb {
7
8// This function efficiently checks if a string is valid UTF8.
9// It was originally written by Sjoerd Mullender.
10
11// Here is the table that makes it work:
12
13// B = Number of Bytes in UTF8 encoding
14// C_MIN = First Unicode code point
15// C_MAX = Last Unicode code point
16// B1 = First Byte Prefix
17
18// B C_MIN C_MAX B1
19// 1 U+000000 U+00007F 0xxxxxxx
20// 2 U+000080 U+0007FF 110xxxxx
21// 3 U+000800 U+00FFFF 1110xxxx
22// 4 U+010000 U+10FFFF 11110xxx
23
24static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t *invalid_pos, size_t pos, UnicodeInvalidReason reason) {
25 if (invalid_reason) {
26 *invalid_reason = reason;
27 }
28 if (invalid_pos) {
29 *invalid_pos = pos;
30 }
31}
32
33template <const int nextra_bytes, const int mask>
34static inline UnicodeType
35UTF8ExtraByteLoop(const int first_pos_seq, int utf8char, size_t& i,
36 const char *s, const size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
37 if ((len - i) < (nextra_bytes + 1)) {
38 /* incomplete byte sequence */
39 AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::BYTE_MISMATCH);
40 return UnicodeType::INVALID;
41 }
42 for (size_t j = 0 ; j < nextra_bytes; j++) {
43 int c = (int) s[++i];
44 /* now validate the extra bytes */
45 if ((c & 0xC0) != 0x80) {
46 /* extra byte is not in the format 10xxxxxx */
47 AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: i, reason: UnicodeInvalidReason::BYTE_MISMATCH);
48 return UnicodeType::INVALID;
49 }
50 utf8char = (utf8char << 6) | (c & 0x3F);
51 }
52 if ((utf8char & mask) == 0) {
53 /* invalid UTF-8 codepoint, not shortest possible */
54 AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::INVALID_UNICODE);
55 return UnicodeType::INVALID;
56 }
57 if (utf8char > 0x10FFFF) {
58 /* value not representable by Unicode */
59 AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::INVALID_UNICODE);
60 return UnicodeType::INVALID;
61 }
62 if ((utf8char & 0x1FFF800) == 0xD800) {
63 /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 */
64 AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::INVALID_UNICODE);
65 return UnicodeType::INVALID;
66 }
67 return UnicodeType::UNICODE;
68}
69
70UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
71 UnicodeType type = UnicodeType::ASCII;
72
73 for (size_t i = 0; i < len; i++) {
74 int c = (int) s[i];
75
76 if ((c & 0x80) == 0) {
77 continue;
78 }
79 int first_pos_seq = i;
80
81 if ((c & 0xE0) == 0xC0) {
82 /* 2 byte sequence */
83 int utf8char = c & 0x1F;
84 type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
85 } else if ((c & 0xF0) == 0xE0) {
86 /* 3 byte sequence */
87 int utf8char = c & 0x0F;
88 type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
89 } else if ((c & 0xF8) == 0xF0) {
90 /* 4 byte sequence */
91 int utf8char = c & 0x07;
92 type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
93 } else {
94 /* invalid UTF-8 start byte */
95 AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: i, reason: UnicodeInvalidReason::BYTE_MISMATCH);
96 return UnicodeType::INVALID;
97 }
98 if (type == UnicodeType::INVALID) {
99 return type;
100 }
101 }
102 return type;
103}
104
105char* Utf8Proc::Normalize(const char *s, size_t len) {
106 assert(s);
107 assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
108 return (char*) utf8proc_NFC(str: (const utf8proc_uint8_t*) s, len);
109}
110
111bool Utf8Proc::IsValid(const char *s, size_t len) {
112 return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
113}
114
115size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
116 return utf8proc_next_grapheme(s, len, cpos);
117}
118
119size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
120 if (!Utf8Proc::IsValid(s, len)) {
121 return cpos - 1;
122 }
123 size_t current_pos = 0;
124 while(true) {
125 size_t new_pos = NextGraphemeCluster(s, len, cpos: current_pos);
126 if (new_pos <= current_pos || new_pos >= cpos) {
127 return current_pos;
128 }
129 current_pos = new_pos;
130 }
131}
132
133bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) {
134 return utf8proc_codepoint_to_utf8(cp, sz, c);
135}
136
137int Utf8Proc::CodepointLength(int cp) {
138 return utf8proc_codepoint_length(cp);
139}
140
141int32_t Utf8Proc::UTF8ToCodepoint(const char *c, int &sz) {
142 return utf8proc_codepoint(u_input: c, sz);
143}
144
145size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) {
146 int sz;
147 auto codepoint = duckdb::utf8proc_codepoint(u_input: s + pos, sz);
148 auto properties = duckdb::utf8proc_get_property(codepoint);
149 return properties->charwidth;
150}
151
152size_t Utf8Proc::RenderWidth(const std::string &str) {
153 size_t render_width = 0;
154 size_t pos = 0;
155 while(pos < str.size()) {
156 int sz;
157 auto codepoint = duckdb::utf8proc_codepoint(u_input: str.c_str() + pos, sz);
158 auto properties = duckdb::utf8proc_get_property(codepoint);
159 render_width += properties->charwidth;
160 pos += sz;
161 }
162 return render_width;
163}
164
165}
166