1#include "utf8proc_wrapper.hpp"
2#include "utf8proc_wrapper.h"
3#include "utf8proc.hpp"
4
5using namespace duckdb;
6using namespace std;
7
8// This function efficiently checks if a string is valid UTF8.
9// It was originally written by Sjoerd Mullender.
10
11// Here is the table that makes it work:
12
13// B = Number of Bytes in UTF8 encoding
14// C_MIN = First Unicode code point
15// C_MAX = Last Unicode code point
16// B1 = First Byte Prefix
17
18// B C_MIN C_MAX B1
19// 1 U+000000 U+00007F 0xxxxxxx
20// 2 U+000080 U+0007FF 110xxxxx
21// 3 U+000800 U+00FFFF 1110xxxx
22// 4 U+010000 U+10FFFF 11110xxx
23
24UnicodeType Utf8Proc::Analyze(const char *s, size_t len) {
25 UnicodeType type = UnicodeType::ASCII;
26 char c;
27 for (size_t i = 0; i < len; i++) {
28 c = s[i];
29 // 1 Byte / ASCII
30 if ((c & 0x80) == 0)
31 continue;
32 type = UnicodeType::UNICODE;
33 if ((s[++i] & 0xC0) != 0x80)
34 return UnicodeType::INVALID;
35 if ((c & 0xE0) == 0xC0)
36 continue;
37 if ((s[++i] & 0xC0) != 0x80)
38 return UnicodeType::INVALID;
39 if ((c & 0xF0) == 0xE0)
40 continue;
41 if ((s[++i] & 0xC0) != 0x80)
42 return UnicodeType::INVALID;
43 if ((c & 0xF8) == 0xF0)
44 continue;
45 return UnicodeType::INVALID;
46 }
47
48 return type;
49}
50
51
52std::string Utf8Proc::Normalize(std::string s) {
53 auto normalized = Normalize(s.c_str());
54 auto res = std::string(normalized);
55 free(normalized);
56 return res;
57}
58
59char* Utf8Proc::Normalize(const char *s) {
60 assert(s);
61 assert(Utf8Proc::Analyze(s) != UnicodeType::INVALID);
62 return (char*) utf8proc_NFC((const utf8proc_uint8_t*) s);
63}
64
65bool Utf8Proc::IsValid(const char *s, size_t len) {
66 return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
67}
68
69size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
70 return utf8proc_next_grapheme(s, len, cpos);
71}
72
73size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
74 if (!Utf8Proc::IsValid(s, len)) {
75 return cpos - 1;
76 }
77 size_t current_pos = 0;
78 while(true) {
79 size_t new_pos = NextGraphemeCluster(s, len, current_pos);
80 if (new_pos <= current_pos || new_pos >= cpos) {
81 return current_pos;
82 }
83 current_pos = new_pos;
84 }
85}
86
87size_t utf8proc_next_grapheme_cluster(const char *s, size_t len, size_t pos) {
88 return Utf8Proc::NextGraphemeCluster(s, len, pos);
89}
90
91size_t utf8proc_prev_grapheme_cluster(const char *s, size_t len, size_t pos) {
92 return Utf8Proc::PreviousGraphemeCluster(s, len, pos);
93}
94
95size_t utf8proc_render_width(const char *s, size_t len, size_t pos) {
96 int sz;
97 auto codepoint = utf8proc_codepoint(s + pos, sz);
98 auto properties = utf8proc_get_property(codepoint);
99 return properties->charwidth;
100}
101
102int utf8proc_is_valid(const char *s, size_t len) {
103 return Utf8Proc::IsValid(s, len) ? 1 : 0;
104}
105