utf8proc_wrapper.cpp source code [DuckDB/third_party/utf8proc/utf8proc_wrapper.cpp]

1	#include "utf8proc_wrapper.hpp"
2	#include "utf8proc_wrapper.h"
3	#include "utf8proc.hpp"
4
5	using namespace duckdb;
6	using namespace std;
7
8	// This function efficiently checks if a string is valid UTF8.
9	// It was originally written by Sjoerd Mullender.
10
11	// Here is the table that makes it work:
12
13	// B = Number of Bytes in UTF8 encoding
14	// C_MIN = First Unicode code point
15	// C_MAX = Last Unicode code point
16	// B1 = First Byte Prefix
17
18	// B C_MIN C_MAX B1
19	// 1 U+000000 U+00007F 0xxxxxxx
20	// 2 U+000080 U+0007FF 110xxxxx
21	// 3 U+000800 U+00FFFF 1110xxxx
22	// 4 U+010000 U+10FFFF 11110xxx
23
24	UnicodeType Utf8Proc::Analyze(const char *s, size_t len) {
25	UnicodeType type = UnicodeType::ASCII;
26	char c;
27	for (size_t i = `0`; i < len; i++) {
28	c = s[i];
29	// 1 Byte / ASCII
30	if ((c & `0x80`) == `0`)
31	continue;
32	type = UnicodeType::UNICODE;
33	if ((s[++i] & `0xC0`) != `0x80`)
34	return UnicodeType::INVALID;
35	if ((c & `0xE0`) == `0xC0`)
36	continue;
37	if ((s[++i] & `0xC0`) != `0x80`)
38	return UnicodeType::INVALID;
39	if ((c & `0xF0`) == `0xE0`)
40	continue;
41	if ((s[++i] & `0xC0`) != `0x80`)
42	return UnicodeType::INVALID;
43	if ((c & `0xF8`) == `0xF0`)
44	continue;
45	return UnicodeType::INVALID;
46	}
47
48	return type;
49	}
50
51
52	std::string Utf8Proc::Normalize(std::string s) {
53	auto normalized = Normalize(s.c_str());
54	auto res = std::string (normalized);
55	free(normalized);
56	return res;
57	}
58
59	char* Utf8Proc::Normalize(const char *s) {
60	assert(s);
61	assert(Utf8Proc::Analyze(s) != UnicodeType::INVALID);
62	return (char) utf8proc_NFC((const* utf8proc_uint8_t*) s);
63	}
64
65	bool Utf8Proc::IsValid(const char *s, size_t len) {
66	return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
67	}
68
69	size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
70	return utf8proc_next_grapheme(s, len, cpos);
71	}
72
73	size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
74	if (!Utf8Proc::IsValid(s, len)) {
75	return cpos - `1`;
76	}
77	size_t current_pos = `0`;
78	while(true) {
79	size_t new_pos = NextGraphemeCluster(s, len, current_pos);
80	if (new_pos <= current_pos \|\| new_pos >= cpos) {
81	return current_pos;
82	}
83	current_pos = new_pos;
84	}
85	}
86
87	size_t utf8proc_next_grapheme_cluster(const char *s, size_t len, size_t pos) {
88	return Utf8Proc::NextGraphemeCluster(s, len, pos);
89	}
90
91	size_t utf8proc_prev_grapheme_cluster(const char *s, size_t len, size_t pos) {
92	return Utf8Proc::PreviousGraphemeCluster(s, len, pos);
93	}
94
95	size_t utf8proc_render_width(const char *s, size_t len, size_t pos) {
96	int sz;
97	auto codepoint = utf8proc_codepoint(s + pos, sz);
98	auto properties = utf8proc_get_property(codepoint);
99	return properties->charwidth;
100	}
101
102	int utf8proc_is_valid(const char *s, size_t len) {
103	return Utf8Proc::IsValid(s, len) ? `1` : `0`;
104	}
105

Browse the source code of DuckDB/third_party/utf8proc/utf8proc_wrapper.cpp