1/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2/*
3 * Copyright (c) 2014-2019 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
4 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25/*
26 * This library contains derived data from a modified version of the
27 * Unicode data files.
28 *
29 * The original data files are available at
30 * http://www.unicode.org/Public/UNIDATA/
31 *
32 * Please notice the copyright statement in the file "utf8proc_data.c".
33 */
34
35
36/*
37 * File name: utf8proc.c
38 *
39 * Description:
40 * Implementation of libutf8proc.
41 */
42
43
44#include "utf8proc.hpp"
45
46namespace duckdb {
47
/* Fallback for toolchains that do not define SSIZE_MAX: half of SIZE_MAX.
   It is used further down as the bound in the decomposition overflow check. */
#ifndef SSIZE_MAX
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
#endif
/* Fallback for UINT16_MAX. Sequence indices in the property table are compared
   against this value to detect "no mapping available". */
#ifndef UINT16_MAX
# define UINT16_MAX 65535U
#endif
54
55#include "utf8proc_data.cpp"
56
57
58// UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
59// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
60// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
61// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
62// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
63// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
64// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
65// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
66// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
67// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71// 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
72// 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
73// 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
74// 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
75
/* Constants for algorithmic Hangul syllable (de)composition.
   SBASE is subtracted from a code point to obtain its syllable index; LBASE,
   VBASE and TBASE are the bases the computed leading/vowel/trailing indices
   are added to. */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
/* Jamo counts: NCOUNT == VCOUNT * TCOUNT and SCOUNT == LCOUNT * NCOUNT. */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172
/* END is exclusive */
/* NOTE(review): the START/END/FILLER range constants below are not referenced
   anywhere in the code visible in this file - confirm whether they are still
   needed. */
#define UTF8PROC_HANGUL_L_START 0x1100
#define UTF8PROC_HANGUL_L_END 0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START 0x1160
#define UTF8PROC_HANGUL_V_END 0x11A3
#define UTF8PROC_HANGUL_T_START 0x11A8
#define UTF8PROC_HANGUL_T_END 0x11FA
#define UTF8PROC_HANGUL_S_START 0xAC00
#define UTF8PROC_HANGUL_S_END 0xD7A4
95
96/* Should follow semantic-versioning rules (semver.org) based on API
97 compatibility. (Note that the shared-library version number will
98 be different, being based on ABI compatibility.): */
99#define STRINGIZEx(x) #x
100#define STRINGIZE(x) STRINGIZEx(x)
101UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
102 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
103}
104
105UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
106 return "12.1.0";
107}
108
109UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
110 switch (errcode) {
111 case UTF8PROC_ERROR_NOMEM:
112 return "Memory for processing UTF-8 data could not be allocated.";
113 case UTF8PROC_ERROR_OVERFLOW:
114 return "UTF-8 string is too long to be processed.";
115 case UTF8PROC_ERROR_INVALIDUTF8:
116 return "Invalid UTF-8 string";
117 case UTF8PROC_ERROR_NOTASSIGNED:
118 return "Unassigned Unicode code point found in UTF-8 string.";
119 case UTF8PROC_ERROR_INVALIDOPTS:
120 return "Invalid options for UTF-8 processing chosen.";
121 default:
122 return "An unknown error occurred while processing UTF-8 data.";
123 }
124}
125
126#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
127UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
128 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
129) {
130 utf8proc_uint32_t uc;
131 const utf8proc_uint8_t *end;
132
133 *dst = -1;
134 if (!strlen) return 0;
135 end = str + ((strlen < 0) ? 4 : strlen);
136 uc = *str++;
137 if (uc < 0x80) {
138 *dst = uc;
139 return 1;
140 }
141 // Must be between 0xc2 and 0xf4 inclusive to be valid
142 if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
143 if (uc < 0xe0) { // 2-byte sequence
144 // Must have valid continuation character
145 if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
146 *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
147 return 2;
148 }
149 if (uc < 0xf0) { // 3-byte sequence
150 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
151 return UTF8PROC_ERROR_INVALIDUTF8;
152 // Check for surrogate chars
153 if (uc == 0xed && *str > 0x9f)
154 return UTF8PROC_ERROR_INVALIDUTF8;
155 uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
156 if (uc < 0x800)
157 return UTF8PROC_ERROR_INVALIDUTF8;
158 *dst = uc;
159 return 3;
160 }
161 // 4-byte sequence
162 // Must have 3 valid continuation characters
163 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
164 return UTF8PROC_ERROR_INVALIDUTF8;
165 // Make sure in correct range (0x10000 - 0x10ffff)
166 if (uc == 0xf0) {
167 if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
168 } else if (uc == 0xf4) {
169 if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
170 }
171 *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
172 return 4;
173}
174
175UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
176 return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
177}
178
179UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
180 if (uc < 0x00) {
181 return 0;
182 } else if (uc < 0x80) {
183 dst[0] = (utf8proc_uint8_t) uc;
184 return 1;
185 } else if (uc < 0x800) {
186 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
187 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
188 return 2;
189 // Note: we allow encoding 0xd800-0xdfff here, so as not to change
190 // the API, however, these are actually invalid in UTF-8
191 } else if (uc < 0x10000) {
192 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
193 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
194 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
195 return 3;
196 } else if (uc < 0x110000) {
197 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
198 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
199 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
200 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
201 return 4;
202 } else return 0;
203}
204
205/* internal version used for inserting 0xff bytes between graphemes */
206static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
207 if (uc < 0x00) {
208 if (uc == -1) { /* internal value used for grapheme breaks */
209 dst[0] = (utf8proc_uint8_t)0xFF;
210 return 1;
211 }
212 return 0;
213 } else if (uc < 0x80) {
214 dst[0] = (utf8proc_uint8_t)uc;
215 return 1;
216 } else if (uc < 0x800) {
217 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
218 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
219 return 2;
220 } else if (uc < 0x10000) {
221 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
222 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
223 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
224 return 3;
225 } else if (uc < 0x110000) {
226 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
227 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
228 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
229 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
230 return 4;
231 } else return 0;
232}
233
234/* internal "unsafe" version that does not check whether uc is in range */
235static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
236 /* ASSERT: uc >= 0 && uc < 0x110000 */
237 return utf8proc_properties + (
238 utf8proc_stage2table[
239 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
240 ]
241 );
242}
243
244UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
245 return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
246}
247
248/* return whether there is a grapheme break between boundclasses lbc and tbc
249 (according to the definition of extended grapheme clusters)
250
251 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
252 http://www.unicode.org/reports/tr29/tr29-29.html
253
254 CAVEATS:
255 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
256 and GB 12/13 (regional indicator code points) require knowledge of previous characters
257 and are thus not handled by this function. This may result in an incorrect break before
258 an E_Modifier class codepoint and an incorrectly missing break between two
259 REGIONAL_INDICATOR class code points if such support does not exist in the caller.
260
261 See the special support in grapheme_break_extended, for required bookkeeping by the caller.
262*/
263static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
264 return
265 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
266 (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
267 tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
268 (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
269 (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
270 (lbc == UTF8PROC_BOUNDCLASS_L && // GB6
271 (tbc == UTF8PROC_BOUNDCLASS_L || // ---
272 tbc == UTF8PROC_BOUNDCLASS_V || // ---
273 tbc == UTF8PROC_BOUNDCLASS_LV || // ---
274 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
275 ((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
276 lbc == UTF8PROC_BOUNDCLASS_V) && // ---
277 (tbc == UTF8PROC_BOUNDCLASS_V || // ---
278 tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
279 ((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
280 lbc == UTF8PROC_BOUNDCLASS_T) && // ---
281 tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
282 (tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
283 tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
284 tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
285 lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
286 (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
287 tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
288 (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
289 tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
290 true; // GB999
291}
292
293utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
294{
295 int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
296 ? *state : lbc);
297 utf8proc_bool break_permitted = grapheme_break_simple(lbc: lbc_override, tbc);
298 if (state) {
299 // Special support for GB 12/13 made possible by GB999. After two RI
300 // class codepoints we want to force a break. Do this by resetting the
301 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
302 // after that character according to GB999 (unless of course such a break is
303 // forbidden by a different rule such as GB9).
304 if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
305 *state = UTF8PROC_BOUNDCLASS_OTHER;
306 // Special support for GB11 (emoji extend* zwj / emoji)
307 else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
308 if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
309 *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
310 else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
311 *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
312 else
313 *state = tbc;
314 }
315 else
316 *state = tbc;
317 }
318 return break_permitted;
319}
320
321UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
322 utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
323
324 return grapheme_break_extended(lbc: utf8proc_get_property(uc: c1)->boundclass,
325 tbc: utf8proc_get_property(uc: c2)->boundclass,
326 state);
327}
328
329
330UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
331 utf8proc_int32_t c1, utf8proc_int32_t c2) {
332 return utf8proc_grapheme_break_stateful(c1, c2, NULL);
333}
334
335// from http://www.zedwood.com/article/cpp-utf8-char-to-codepoint
336UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_codepoint(const char *u_input, int &sz) {
337 auto u = (const unsigned char *) u_input;
338 unsigned char u0 = u[0];
339 if (u0<=127) {
340 sz = 1;
341 return u0;
342 }
343 unsigned char u1 = u[1];
344 if (u0>=192 && u0<=223) {
345 sz = 2;
346 return (u0-192)*64 + (u1-128);
347 }
348 if (u[0]==0xed && (u[1] & 0xa0) == 0xa0) {
349 return -1; //code points, 0xd800 to 0xdfff
350 }
351 unsigned char u2 = u[2];
352 if (u0>=224 && u0<=239) {
353 sz = 3;
354 return (u0-224)*4096 + (u1-128)*64 + (u2-128);
355 }
356 unsigned char u3 = u[3];
357 if (u0>=240 && u0<=247) {
358 sz = 4;
359 return (u0-240)*262144 + (u1-128)*4096 + (u2-128)*64 + (u3-128);
360 }
361 return -1;
362}
363
/* Encodes code point cp as UTF-8.
 *
 * cp - code point to encode
 * sz - receives the number of bytes written (1..4), or -1 on failure
 * c  - output buffer with room for up to 4 bytes; untouched on failure
 *
 * Returns true on success. Returns false for negative values, surrogates
 * (0xD800..0xDFFF) and values above 0x10FFFF. */
bool utf8proc_codepoint_to_utf8(int cp, int &sz, char *c) {
  const bool is_surrogate = (0xd800 <= cp && cp <= 0xdfff);
  if (cp < 0 || cp > 0x10FFFF || is_surrogate) {
    // negative input used to fall into the 1-byte branch and emit a garbage byte
    sz = -1;
    return false;
  }
  if (cp <= 0x7F) {
    sz = 1;
    c[0] = (char)cp;
  } else if (cp <= 0x7FF) {
    sz = 2;
    c[0] = (char)((cp >> 6) + 192);
    c[1] = (char)((cp & 63) + 128);
  } else if (cp <= 0xFFFF) {
    sz = 3;
    c[0] = (char)((cp >> 12) + 224);
    c[1] = (char)(((cp >> 6) & 63) + 128);
    c[2] = (char)((cp & 63) + 128);
  } else {
    sz = 4;
    c[0] = (char)((cp >> 18) + 240);
    c[1] = (char)(((cp >> 12) & 63) + 128);
    c[2] = (char)(((cp >> 6) & 63) + 128);
    c[3] = (char)((cp & 63) + 128);
  }
  return true;
}
393
/* Returns the number of bytes (1..4) the UTF-8 encoding of cp occupies, or -1
   when cp cannot be encoded: negative values, surrogates (0xD800..0xDFFF)
   and values above 0x10FFFF. */
int utf8proc_codepoint_length(int cp) {
  if (cp < 0) {
    /* negative input used to be reported as a 1-byte code point */
    return -1;
  }
  if (cp <= 0x7F) {
    return 1;
  }
  if (cp <= 0x7FF) {
    return 2;
  }
  if (0xd800 <= cp && cp <= 0xdfff) {
    return -1;
  }
  if (cp <= 0xFFFF) {
    return 3;
  }
  if (cp <= 0x10FFFF) {
    return 4;
  }
  return -1;
}
408
409size_t utf8proc_next_grapheme(const char *s, size_t len, size_t cpos) {
410 int sz;
411 int boundclass = UTF8PROC_BOUNDCLASS_START;
412 int initial = utf8proc_get_property(uc: utf8proc_codepoint(u_input: s + cpos, sz))->boundclass;
413 grapheme_break_extended(lbc: boundclass, tbc: initial, state: &boundclass);
414 while(true) {
415 cpos += sz;
416 if (cpos >= len) {
417 return cpos;
418 }
419 int next = utf8proc_get_property(uc: utf8proc_codepoint(u_input: s + cpos, sz))->boundclass;
420 if (grapheme_break_extended(lbc: boundclass, tbc: next, state: &boundclass)) {
421 return cpos;
422 }
423 }
424}
425
426static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
427{
428 utf8proc_int32_t entry_cp = **entry;
429 if ((entry_cp & 0xF800) == 0xD800) {
430 *entry = *entry + 1;
431 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
432 entry_cp += 0x10000;
433 }
434 return entry_cp;
435}
436
437static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
438{
439 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
440 return seqindex_decode_entry(entry: &entry);
441}
442
443static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
444 utf8proc_ssize_t written = 0;
445 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
446 int len = seqindex >> 13;
447 if (len >= 7) {
448 len = *entry;
449 entry++;
450 }
451 for (; len >= 0; entry++, len--) {
452 utf8proc_int32_t entry_cp = seqindex_decode_entry(entry: &entry);
453 utf8proc_int32_t *dst_ptr = dst ? dst + written : nullptr;
454 written += utf8proc_decompose_char(codepoint: entry_cp, dst: dst_ptr,
455 bufsize: (bufsize > written) ? (bufsize - written) : 0, options,
456 last_boundclass);
457 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
458 }
459 return written;
460}
461
462UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
463{
464 utf8proc_int32_t cl = utf8proc_get_property(uc: c)->lowercase_seqindex;
465 return cl != UINT16_MAX ? seqindex_decode_index(seqindex: cl) : c;
466}
467
468UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
469{
470 utf8proc_int32_t cu = utf8proc_get_property(uc: c)->uppercase_seqindex;
471 return cu != UINT16_MAX ? seqindex_decode_index(seqindex: cu) : c;
472}
473
474UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
475{
476 utf8proc_int32_t cu = utf8proc_get_property(uc: c)->titlecase_seqindex;
477 return cu != UINT16_MAX ? seqindex_decode_index(seqindex: cu) : c;
478}
479
480/* return a character width analogous to wcwidth (except portable and
481 hopefully less buggy than most system wcwidth functions). */
482UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
483 return utf8proc_get_property(uc: c)->charwidth;
484}
485
486UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
487 return (utf8proc_category_t)utf8proc_get_property(uc: c)->category;
488}
489
490UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
491 static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
492 return s[utf8proc_category(c)];
493}
494
495#define utf8proc_decompose_lump(replacement_uc) \
496 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
497 (utf8proc_option_t) (options & ~UTF8PROC_LUMP), last_boundclass)
498
499UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
500 const utf8proc_property_t *property;
501 utf8proc_propval_t category;
502 utf8proc_int32_t hangul_sindex;
503 if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
504 property = unsafe_get_property(uc);
505 category = property->category;
506 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
507 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
508 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
509 utf8proc_int32_t hangul_tindex;
510 if (bufsize >= 1) {
511 dst[0] = UTF8PROC_HANGUL_LBASE +
512 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
513 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
514 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
515 }
516 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
517 if (!hangul_tindex) return 2;
518 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
519 return 3;
520 }
521 }
522 if (options & UTF8PROC_REJECTNA) {
523 if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
524 }
525 if (options & UTF8PROC_IGNORE) {
526 if (property->ignorable) return 0;
527 }
528 if (options & UTF8PROC_STRIPNA) {
529 if (!category) return 0;
530 }
531 if (options & UTF8PROC_LUMP) {
532 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
533 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
534 utf8proc_decompose_lump(0x0027);
535 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
536 utf8proc_decompose_lump(0x002D);
537 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
538 if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
539 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
540 utf8proc_decompose_lump(0x003C);
541 if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
542 utf8proc_decompose_lump(0x003E);
543 if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
544 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
545 utf8proc_decompose_lump(0x005E);
546 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
547 utf8proc_decompose_lump(0x005F);
548 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
549 if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
550 if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
551 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
552 if (category == UTF8PROC_CATEGORY_ZL ||
553 category == UTF8PROC_CATEGORY_ZP)
554 utf8proc_decompose_lump(0x000A);
555 }
556 }
557 if (options & UTF8PROC_STRIPMARK) {
558 if (category == UTF8PROC_CATEGORY_MN ||
559 category == UTF8PROC_CATEGORY_MC ||
560 category == UTF8PROC_CATEGORY_ME) return 0;
561 }
562 if (options & UTF8PROC_CASEFOLD) {
563 if (property->casefold_seqindex != UINT16_MAX) {
564 return seqindex_write_char_decomposed(seqindex: property->casefold_seqindex, dst, bufsize, options, last_boundclass);
565 }
566 }
567 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
568 if (property->decomp_seqindex != UINT16_MAX &&
569 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
570 return seqindex_write_char_decomposed(seqindex: property->decomp_seqindex, dst, bufsize, options, last_boundclass);
571 }
572 }
573 if (options & UTF8PROC_CHARBOUND) {
574 utf8proc_bool boundary;
575 int tbc = property->boundclass;
576 boundary = grapheme_break_extended(lbc: *last_boundclass, tbc, state: last_boundclass);
577 if (boundary) {
578 if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
579 if (bufsize >= 2) dst[1] = uc;
580 return 2;
581 }
582 }
583 if (bufsize >= 1) *dst = uc;
584 return 1;
585}
586
587UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
588 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
589 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
590) {
591 return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
592}
593
594UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
595 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
596 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
597 utf8proc_custom_func custom_func, void *custom_data
598) {
599 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
600 utf8proc_ssize_t wpos = 0;
601 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
602 return UTF8PROC_ERROR_INVALIDOPTS;
603 if ((options & UTF8PROC_STRIPMARK) &&
604 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
605 return UTF8PROC_ERROR_INVALIDOPTS;
606 {
607 utf8proc_int32_t uc;
608 utf8proc_ssize_t rpos = 0;
609 utf8proc_ssize_t decomp_result;
610 int boundclass = UTF8PROC_BOUNDCLASS_START;
611 while (1) {
612 if (options & UTF8PROC_NULLTERM) {
613 rpos += utf8proc_iterate(str: str + rpos, strlen: -1, dst: &uc);
614 /* checking of return value is not necessary,
615 as 'uc' is < 0 in case of error */
616 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
617 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
618 if (uc == 0) break;
619 } else {
620 if (rpos >= strlen) break;
621 rpos += utf8proc_iterate(str: str + rpos, strlen: strlen - rpos, dst: &uc);
622 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
623 }
624 if (custom_func != NULL) {
625 uc = custom_func(uc, custom_data); /* user-specified custom mapping */
626 }
627 utf8proc_int32_t *target_buffer = buffer ? buffer + wpos : nullptr;
628 decomp_result = utf8proc_decompose_char(
629 uc, dst: target_buffer, bufsize: (bufsize > wpos) ? (bufsize - wpos) : 0, options,
630 last_boundclass: &boundclass
631 );
632 if (decomp_result < 0) return decomp_result;
633 wpos += decomp_result;
634 /* prohibiting integer overflows due to too long strings: */
635 if (wpos < 0 ||
636 wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
637 return UTF8PROC_ERROR_OVERFLOW;
638 }
639 }
640 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
641 utf8proc_ssize_t pos = 0;
642 while (pos < wpos-1) {
643 utf8proc_int32_t uc1, uc2;
644 const utf8proc_property_t *property1, *property2;
645 uc1 = buffer[pos];
646 uc2 = buffer[pos+1];
647 property1 = unsafe_get_property(uc: uc1);
648 property2 = unsafe_get_property(uc: uc2);
649 if (property1->combining_class > property2->combining_class &&
650 property2->combining_class > 0) {
651 buffer[pos] = uc2;
652 buffer[pos+1] = uc1;
653 if (pos > 0) pos--; else pos++;
654 } else {
655 pos++;
656 }
657 }
658 }
659 return wpos;
660}
661
662UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
663 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
664 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
665 utf8proc_ssize_t rpos;
666 utf8proc_ssize_t wpos = 0;
667 utf8proc_int32_t uc;
668 for (rpos = 0; rpos < length; rpos++) {
669 uc = buffer[rpos];
670 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
671 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
672 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
673 if (options & UTF8PROC_NLF2LS) {
674 if (options & UTF8PROC_NLF2PS) {
675 buffer[wpos++] = 0x000A;
676 } else {
677 buffer[wpos++] = 0x2028;
678 }
679 } else {
680 if (options & UTF8PROC_NLF2PS) {
681 buffer[wpos++] = 0x2029;
682 } else {
683 buffer[wpos++] = 0x0020;
684 }
685 }
686 } else if ((options & UTF8PROC_STRIPCC) &&
687 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
688 if (uc == 0x0009) buffer[wpos++] = 0x0020;
689 } else {
690 buffer[wpos++] = uc;
691 }
692 }
693 length = wpos;
694 }
695 if (options & UTF8PROC_COMPOSE) {
696 utf8proc_int32_t *starter = NULL;
697 utf8proc_int32_t current_char;
698 const utf8proc_property_t *starter_property = NULL, *current_property;
699 utf8proc_propval_t max_combining_class = -1;
700 utf8proc_ssize_t rpos;
701 utf8proc_ssize_t wpos = 0;
702 utf8proc_int32_t composition;
703 for (rpos = 0; rpos < length; rpos++) {
704 current_char = buffer[rpos];
705 current_property = unsafe_get_property(uc: current_char);
706 if (starter && current_property->combining_class > max_combining_class) {
707 /* combination perhaps possible */
708 utf8proc_int32_t hangul_lindex;
709 utf8proc_int32_t hangul_sindex;
710 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
711 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
712 utf8proc_int32_t hangul_vindex;
713 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
714 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
715 *starter = UTF8PROC_HANGUL_SBASE +
716 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
717 UTF8PROC_HANGUL_TCOUNT;
718 starter_property = NULL;
719 continue;
720 }
721 }
722 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
723 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
724 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
725 utf8proc_int32_t hangul_tindex;
726 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
727 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
728 *starter += hangul_tindex;
729 starter_property = NULL;
730 continue;
731 }
732 }
733 if (!starter_property) {
734 starter_property = unsafe_get_property(uc: *starter);
735 }
736 if (starter_property->comb_index < 0x8000 &&
737 current_property->comb_index != UINT16_MAX &&
738 current_property->comb_index >= 0x8000) {
739 int sidx = starter_property->comb_index;
740 int idx = current_property->comb_index & 0x3FFF;
741 if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) {
742 idx += sidx + 2 - utf8proc_combinations[sidx];
743 if (current_property->comb_index & 0x4000) {
744 composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
745 } else
746 composition = utf8proc_combinations[idx];
747
748 if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
749 !(unsafe_get_property(uc: composition)->comp_exclusion))) {
750 *starter = composition;
751 starter_property = NULL;
752 continue;
753 }
754 }
755 }
756 }
757 buffer[wpos] = current_char;
758 if (current_property->combining_class) {
759 if (current_property->combining_class > max_combining_class) {
760 max_combining_class = current_property->combining_class;
761 }
762 } else {
763 starter = buffer + wpos;
764 starter_property = NULL;
765 max_combining_class = -1;
766 }
767 wpos++;
768 }
769 length = wpos;
770 }
771 return length;
772}
773
774UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
775 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
776 ASSERT: 'buffer' has one spare byte of free space at the end! */
777 length = utf8proc_normalize_utf32(buffer, length, options);
778 if (length < 0) return length;
779 {
780 utf8proc_ssize_t rpos, wpos = 0;
781 utf8proc_int32_t uc;
782 if (options & UTF8PROC_CHARBOUND) {
783 for (rpos = 0; rpos < length; rpos++) {
784 uc = buffer[rpos];
785 wpos += charbound_encode_char(uc, dst: ((utf8proc_uint8_t *)buffer) + wpos);
786 }
787 } else {
788 for (rpos = 0; rpos < length; rpos++) {
789 uc = buffer[rpos];
790 wpos += utf8proc_encode_char(uc, dst: ((utf8proc_uint8_t *)buffer) + wpos);
791 }
792 }
793 ((utf8proc_uint8_t *)buffer)[wpos] = 0;
794 return wpos;
795 }
796}
797
798UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
799 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
800) {
801 return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
802}
803
804UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
805 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
806 utf8proc_custom_func custom_func, void *custom_data
807) {
808 utf8proc_int32_t *buffer;
809 utf8proc_ssize_t result;
810 *dstptr = NULL;
811 result = utf8proc_decompose_custom(str, strlen, NULL, bufsize: 0, options, custom_func, custom_data);
812 if (result < 0) return result;
813 buffer = (utf8proc_int32_t *) malloc(size: result * sizeof(utf8proc_int32_t) + 1);
814 if (!buffer) return UTF8PROC_ERROR_NOMEM;
815 result = utf8proc_decompose_custom(str, strlen, buffer, bufsize: result, options, custom_func, custom_data);
816 if (result < 0) {
817 free(ptr: buffer);
818 return result;
819 }
820 result = utf8proc_reencode(buffer, length: result, options);
821 if (result < 0) {
822 free(ptr: buffer);
823 return result;
824 }
825 {
826 utf8proc_int32_t *newptr;
827 newptr = (utf8proc_int32_t *) realloc(ptr: buffer, size: (size_t)result+1);
828 if (newptr) buffer = newptr;
829 }
830 *dstptr = (utf8proc_uint8_t *)buffer;
831 return result;
832}
833
834UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str, utf8proc_ssize_t len) {
835 utf8proc_uint8_t *retval;
836 utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE |
837 UTF8PROC_DECOMPOSE));
838 return retval;
839}
840
841UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str, utf8proc_ssize_t len) {
842 utf8proc_uint8_t *retval;
843 utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE |
844 UTF8PROC_COMPOSE));
845 return retval;
846}
847
848UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_remove_accents(const utf8proc_uint8_t *str, utf8proc_ssize_t len) {
849 utf8proc_uint8_t *retval;
850 utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE |
851 UTF8PROC_COMPOSE | UTF8PROC_STRIPMARK));
852 return retval;
853}
854
855UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str, utf8proc_ssize_t len) {
856 utf8proc_uint8_t *retval;
857 utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE |
858 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT));
859 return retval;
860}
861
862UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str, utf8proc_ssize_t len) {
863 utf8proc_uint8_t *retval;
864 utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE |
865 UTF8PROC_COMPOSE | UTF8PROC_COMPAT));
866 return retval;
867}
868
869UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str, utf8proc_ssize_t len) {
870 utf8proc_uint8_t *retval;
871 utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE |
872 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE));
873 return retval;
874}
875
876}
877