utf8proc_wrapper.cpp source code [Velox/build/_deps/duckdb-src/third_party/utf8proc/utf8proc_wrapper.cpp]

1	#include "utf8proc_wrapper.hpp"
2	#include "utf8proc.hpp"
3
4	using namespace std;
5
6	namespace duckdb {
7
8	// This function efficiently checks if a string is valid UTF8.
9	// It was originally written by Sjoerd Mullender.
10
11	// Here is the table that makes it work:
12
13	// B = Number of Bytes in UTF8 encoding
14	// C_MIN = First Unicode code point
15	// C_MAX = Last Unicode code point
16	// B1 = First Byte Prefix
17
18	// B C_MIN C_MAX B1
19	// 1 U+000000 U+00007F 0xxxxxxx
20	// 2 U+000080 U+0007FF 110xxxxx
21	// 3 U+000800 U+00FFFF 1110xxxx
22	// 4 U+010000 U+10FFFF 11110xxx
23
24	static void AssignInvalidUTF8Reason(UnicodeInvalidReason invalid_reason, size_t invalid_pos, size_t pos, UnicodeInvalidReason reason) {
25	if (invalid_reason) {
26	*invalid_reason = reason;
27	}
28	if (invalid_pos) {
29	*invalid_pos = pos;
30	}
31	}
32
33	template <const int nextra_bytes, const int mask>
34	static inline UnicodeType
35	UTF8ExtraByteLoop(const int first_pos_seq, int utf8char, size_t& i,
36	const char s, const* size_t len, UnicodeInvalidReason invalid_reason, size_t invalid_pos) {
37	if ((len - i) < (nextra_bytes + `1`)) {
38	/ incomplete byte sequence /
39	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::BYTE_MISMATCH);
40	return UnicodeType::INVALID;
41	}
42	for (size_t j = `0` ; j < nextra_bytes; j++) {
43	int c = (int) s[++i];
44	/ now validate the extra bytes /
45	if ((c & `0xC0`) != `0x80`) {
46	/ extra byte is not in the format 10xxxxxx /
47	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: i, reason: UnicodeInvalidReason::BYTE_MISMATCH);
48	return UnicodeType::INVALID;
49	}
50	utf8char = (utf8char << `6`) \| (c & `0x3F`);
51	}
52	if ((utf8char & mask) == `0`) {
53	/ invalid UTF-8 codepoint, not shortest possible /
54	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::INVALID_UNICODE);
55	return UnicodeType::INVALID;
56	}
57	if (utf8char > `0x10FFFF`) {
58	/ value not representable by Unicode /
59	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::INVALID_UNICODE);
60	return UnicodeType::INVALID;
61	}
62	if ((utf8char & `0x1FFF800`) == `0xD800`) {
63	/ Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 /
64	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: first_pos_seq, reason: UnicodeInvalidReason::INVALID_UNICODE);
65	return UnicodeType::INVALID;
66	}
67	return UnicodeType::UNICODE;
68	}
69
70	UnicodeType Utf8Proc::Analyze(const char s, size_t len, UnicodeInvalidReason invalid_reason, size_t *invalid_pos) {
71	UnicodeType type = UnicodeType::ASCII;
72
73	for (size_t i = `0`; i < len; i++) {
74	int c = (int) s[i];
75
76	if ((c & `0x80`) == `0`) {
77	continue;
78	}
79	int first_pos_seq = i;
80
81	if ((c & `0xE0`) == `0xC0`) {
82	/ 2 byte sequence /
83	int utf8char = c & `0x1F`;
84	type = UTF8ExtraByteLoop<`1`, `0x000780`>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
85	} else if ((c & `0xF0`) == `0xE0`) {
86	/ 3 byte sequence /
87	int utf8char = c & `0x0F`;
88	type = UTF8ExtraByteLoop<`2`, `0x00F800`>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
89	} else if ((c & `0xF8`) == `0xF0`) {
90	/ 4 byte sequence /
91	int utf8char = c & `0x07`;
92	type = UTF8ExtraByteLoop<`3`, `0x1F0000`>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
93	} else {
94	/ invalid UTF-8 start byte /
95	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, pos: i, reason: UnicodeInvalidReason::BYTE_MISMATCH);
96	return UnicodeType::INVALID;
97	}
98	if (type == UnicodeType::INVALID) {
99	return type;
100	}
101	}
102	return type;
103	}
104
105	char* Utf8Proc::Normalize(const char *s, size_t len) {
106	assert(s);
107	assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
108	return (char) utf8proc_NFC(str: (const* utf8proc_uint8_t*) s, len);
109	}
110
111	bool Utf8Proc::IsValid(const char *s, size_t len) {
112	return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
113	}
114
115	size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
116	return utf8proc_next_grapheme(s, len, cpos);
117	}
118
119	size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
120	if (!Utf8Proc::IsValid(s, len)) {
121	return cpos - `1`;
122	}
123	size_t current_pos = `0`;
124	while(true) {
125	size_t new_pos = NextGraphemeCluster(s, len, cpos: current_pos);
126	if (new_pos <= current_pos \|\| new_pos >= cpos) {
127	return current_pos;
128	}
129	current_pos = new_pos;
130	}
131	}
132
133	bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) {
134	return utf8proc_codepoint_to_utf8(cp, sz, c);
135	}
136
137	int Utf8Proc::CodepointLength(int cp) {
138	return utf8proc_codepoint_length(cp);
139	}
140
141	int32_t Utf8Proc::UTF8ToCodepoint(const char c, int* &sz) {
142	return utf8proc_codepoint(u_input: c, sz);
143	}
144
145	size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) {
146	int sz;
147	auto codepoint = duckdb::utf8proc_codepoint(u_input: s + pos, sz);
148	auto properties = duckdb::utf8proc_get_property(codepoint);
149	return properties->charwidth;
150	}
151
152	size_t Utf8Proc::RenderWidth(const std::string &str) {
153	size_t render_width = `0`;
154	size_t pos = `0`;
155	while(pos < str.size()) {
156	int sz;
157	auto codepoint = duckdb::utf8proc_codepoint(u_input: str.c_str() + pos, sz);
158	auto properties = duckdb::utf8proc_get_property(codepoint);
159	render_width += properties->charwidth;
160	pos += sz;
161	}
162	return render_width;
163	}
164
165	}
166

Browse the source code of Velox/build/_deps/duckdb-src/third_party/utf8proc/utf8proc_wrapper.cpp