utf8proc.cpp source code [Velox/build/_deps/duckdb-src/third_party/utf8proc/utf8proc.cpp]

/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2	/*
3	* Copyright (c) 2014-2019 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
4	* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a
7	* copy of this software and associated documentation files (the "Software"),
8	* to deal in the Software without restriction, including without limitation
9	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
10	* and/or sell copies of the Software, and to permit persons to whom the
11	* Software is furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22	* DEALINGS IN THE SOFTWARE.
23	*/
24
25	/*
26	* This library contains derived data from a modified version of the
27	* Unicode data files.
28	*
29	* The original data files are available at
30	* http://www.unicode.org/Public/UNIDATA/
31	*
32	* Please notice the copyright statement in the file "utf8proc_data.c".
33	*/
34
35
36	/*
37	* File name: utf8proc.c
38	*
39	* Description:
40	* Implementation of libutf8proc.
41	*/
42
43
44	#include "utf8proc.hpp"
45
46	namespace duckdb {
47
/* Fallback definitions for toolchains whose headers do not provide these
 * limits. UINT16_MAX is compared against the *_seqindex and comb_index
 * property fields further down in this file. */
#ifndef SSIZE_MAX
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
#endif
#ifndef UINT16_MAX
#  define UINT16_MAX 65535U
#endif
54
55	#include "utf8proc_data.cpp"
56
57
58	// UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
59	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
60	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
61	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
62	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
63	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
64	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
65	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
66	// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
67	// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68	// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69	// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70	// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71	// 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
72	// 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
73	// 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
74	// 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
75
/* Constants used by the arithmetic Hangul syllable (de)composition in
 * utf8proc_decompose_char and utf8proc_normalize_utf32.
 * NCOUNT == VCOUNT * TCOUNT and SCOUNT == LCOUNT * NCOUNT. */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172
/* END is exclusive */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
95
96	/ Should follow semantic-versioning rules (semver.org) based on API*
97	compatibility. (Note that the shared-library version number will
98	be different, being based on ABI compatibility.): /*
99	#define STRINGIZEx(x) #x
100	#define STRINGIZE(x) STRINGIZEx(x)
101	UTF8PROC_DLLEXPORT const char utf8proc_version(void*) {
102	return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
103	}
104
105	UTF8PROC_DLLEXPORT const char utf8proc_unicode_version(void*) {
106	return "12.1.0";
107	}
108
109	UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
110	switch (errcode) {
111	case UTF8PROC_ERROR_NOMEM:
112	return "Memory for processing UTF-8 data could not be allocated.";
113	case UTF8PROC_ERROR_OVERFLOW:
114	return "UTF-8 string is too long to be processed.";
115	case UTF8PROC_ERROR_INVALIDUTF8:
116	return "Invalid UTF-8 string";
117	case UTF8PROC_ERROR_NOTASSIGNED:
118	return "Unassigned Unicode code point found in UTF-8 string.";
119	case UTF8PROC_ERROR_INVALIDOPTS:
120	return "Invalid options for UTF-8 processing chosen.";
121	default:
122	return "An unknown error occurred while processing UTF-8 data.";
123	}
124	}
125
126	#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
127	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
128	const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_int32_t dst
129	) {
130	utf8proc_uint32_t uc;
131	const utf8proc_uint8_t *end;
132
133	*dst = -`1`;
134	if (!strlen) return `0`;
135	end = str + ((strlen < `0`) ? `4` : strlen);
136	uc = *str++;
137	if (uc < `0x80`) {
138	*dst = uc;
139	return `1`;
140	}
141	// Must be between 0xc2 and 0xf4 inclusive to be valid
142	if ((uc - `0xc2`) > (`0xf4`-`0xc2`)) return UTF8PROC_ERROR_INVALIDUTF8;
143	if (uc < `0xe0`) { // 2-byte sequence
144	// Must have valid continuation character
145	if (str >= end \|\| !utf_cont(str)) return* UTF8PROC_ERROR_INVALIDUTF8;
146	dst = ((uc & `0x1f`)<<`6`) \| (str & `0x3f`);
147	return `2`;
148	}
149	if (uc < `0xf0`) { // 3-byte sequence
150	if ((str + `1` >= end) \|\| !utf_cont(*str) \|\| !utf_cont(str[`1`]))
151	return UTF8PROC_ERROR_INVALIDUTF8;
152	// Check for surrogate chars
153	if (uc == `0xed` && *str > `0x9f`)
154	return UTF8PROC_ERROR_INVALIDUTF8;
155	uc = ((uc & `0xf`)<<`12`) \| ((*str & `0x3f`)<<`6`) \| (str[`1`] & `0x3f`);
156	if (uc < `0x800`)
157	return UTF8PROC_ERROR_INVALIDUTF8;
158	*dst = uc;
159	return `3`;
160	}
161	// 4-byte sequence
162	// Must have 3 valid continuation characters
163	if ((str + `2` >= end) \|\| !utf_cont(*str) \|\| !utf_cont(str[`1`]) \|\| !utf_cont(str[`2`]))
164	return UTF8PROC_ERROR_INVALIDUTF8;
165	// Make sure in correct range (0x10000 - 0x10ffff)
166	if (uc == `0xf0`) {
167	if (str < `0x90`) return* UTF8PROC_ERROR_INVALIDUTF8;
168	} else if (uc == `0xf4`) {
169	if (str > `0x8f`) return* UTF8PROC_ERROR_INVALIDUTF8;
170	}
171	dst = ((uc & `7`)<<`18`) \| ((str & `0x3f`)<<`12`) \| ((str[`1`] & `0x3f`)<<`6`) \| (str[`2`] & `0x3f`);
172	return `4`;
173	}
174
175	UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
176	return (((utf8proc_uint32_t)uc)-`0xd800` > `0x07ff`) && ((utf8proc_uint32_t)uc < `0x110000`);
177	}
178
179	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
180	if (uc < `0x00`) {
181	return `0`;
182	} else if (uc < `0x80`) {
183	dst[`0`] = (utf8proc_uint8_t) uc;
184	return `1`;
185	} else if (uc < `0x800`) {
186	dst[`0`] = (utf8proc_uint8_t)(`0xC0` + (uc >> `6`));
187	dst[`1`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
188	return `2`;
189	// Note: we allow encoding 0xd800-0xdfff here, so as not to change
190	// the API, however, these are actually invalid in UTF-8
191	} else if (uc < `0x10000`) {
192	dst[`0`] = (utf8proc_uint8_t)(`0xE0` + (uc >> `12`));
193	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
194	dst[`2`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
195	return `3`;
196	} else if (uc < `0x110000`) {
197	dst[`0`] = (utf8proc_uint8_t)(`0xF0` + (uc >> `18`));
198	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `12`) & `0x3F`));
199	dst[`2`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
200	dst[`3`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
201	return `4`;
202	} else return `0`;
203	}
204
205	/ internal version used for inserting 0xff bytes between graphemes /
206	static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
207	if (uc < `0x00`) {
208	if (uc == -`1`) { / internal value used for grapheme breaks /
209	dst[`0`] = (utf8proc_uint8_t)`0xFF`;
210	return `1`;
211	}
212	return `0`;
213	} else if (uc < `0x80`) {
214	dst[`0`] = (utf8proc_uint8_t)uc;
215	return `1`;
216	} else if (uc < `0x800`) {
217	dst[`0`] = (utf8proc_uint8_t)(`0xC0` + (uc >> `6`));
218	dst[`1`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
219	return `2`;
220	} else if (uc < `0x10000`) {
221	dst[`0`] = (utf8proc_uint8_t)(`0xE0` + (uc >> `12`));
222	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
223	dst[`2`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
224	return `3`;
225	} else if (uc < `0x110000`) {
226	dst[`0`] = (utf8proc_uint8_t)(`0xF0` + (uc >> `18`));
227	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `12`) & `0x3F`));
228	dst[`2`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
229	dst[`3`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
230	return `4`;
231	} else return `0`;
232	}
233
234	/ internal "unsafe" version that does not check whether uc is in range /
235	static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
236	/ ASSERT: uc >= 0 && uc < 0x110000 /
237	return utf8proc_properties + (
238	utf8proc_stage2table[
239	utf8proc_stage1table[uc >> `8`] + (uc & `0xFF`)
240	]
241	);
242	}
243
244	UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
245	return uc < `0` \|\| uc >= `0x110000` ? utf8proc_properties : unsafe_get_property(uc);
246	}
247
248	/ return whether there is a grapheme break between boundclasses lbc and tbc*
249	(according to the definition of extended grapheme clusters)
250
251	Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
252	http://www.unicode.org/reports/tr29/tr29-29.html
253
254	CAVEATS:
255	Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
256	and GB 12/13 (regional indicator code points) require knowledge of previous characters
257	and are thus not handled by this function. This may result in an incorrect break before
258	an E_Modifier class codepoint and an incorrectly missing break between two
259	REGIONAL_INDICATOR class code points if such support does not exist in the caller.
260
261	See the special support in grapheme_break_extended, for required bookkeeping by the caller.
262	*/
263	static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
264	return
265	(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
266	(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
267	tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
268	(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
269	(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
270	(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
271	(tbc == UTF8PROC_BOUNDCLASS_L \|\| // ---
272	tbc == UTF8PROC_BOUNDCLASS_V \|\| // ---
273	tbc == UTF8PROC_BOUNDCLASS_LV \|\| // ---
274	tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
275	((lbc == UTF8PROC_BOUNDCLASS_LV \|\| // GB7
276	lbc == UTF8PROC_BOUNDCLASS_V) && // ---
277	(tbc == UTF8PROC_BOUNDCLASS_V \|\| // ---
278	tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
279	((lbc == UTF8PROC_BOUNDCLASS_LVT \|\| // GB8
280	lbc == UTF8PROC_BOUNDCLASS_T) && // ---
281	tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
282	(tbc == UTF8PROC_BOUNDCLASS_EXTEND \|\| // GB9
283	tbc == UTF8PROC_BOUNDCLASS_ZWJ \|\| // ---
284	tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK \|\| // GB9a
285	lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
286	(lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
287	tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
288	(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
289	tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
290	true; // GB999
291	}
292
293	utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
294	{
295	int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
296	? *state : lbc);
297	utf8proc_bool break_permitted = grapheme_break_simple(lbc: lbc_override, tbc);
298	if (state) {
299	// Special support for GB 12/13 made possible by GB999. After two RI
300	// class codepoints we want to force a break. Do this by resetting the
301	// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
302	// after that character according to GB999 (unless of course such a break is
303	// forbidden by a different rule such as GB9).
304	if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
305	*state = UTF8PROC_BOUNDCLASS_OTHER;
306	// Special support for GB11 (emoji extend zwj / emoji)*
307	else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
308	if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
309	*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
310	else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
311	state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo*
312	else
313	*state = tbc;
314	}
315	else
316	*state = tbc;
317	}
318	return break_permitted;
319	}
320
321	UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
322	utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
323
324	return grapheme_break_extended(lbc: utf8proc_get_property(uc: c1)->boundclass,
325	tbc: utf8proc_get_property(uc: c2)->boundclass,
326	state);
327	}
328
329
330	UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
331	utf8proc_int32_t c1, utf8proc_int32_t c2) {
332	return utf8proc_grapheme_break_stateful(c1, c2, NULL);
333	}
334
335	// from http://www.zedwood.com/article/cpp-utf8-char-to-codepoint
336	UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_codepoint(const char u_input, int* &sz) {
337	auto u = (const unsigned char *) u_input;
338	unsigned char u0 = u[`0`];
339	if (u0<=`127`) {
340	sz = `1`;
341	return u0;
342	}
343	unsigned char u1 = u[`1`];
344	if (u0>=`192` && u0<=`223`) {
345	sz = `2`;
346	return (u0-`192`)*`64` + (u1-`128`);
347	}
348	if (u[`0`]==`0xed` && (u[`1`] & `0xa0`) == `0xa0`) {
349	return -`1`; //code points, 0xd800 to 0xdfff
350	}
351	unsigned char u2 = u[`2`];
352	if (u0>=`224` && u0<=`239`) {
353	sz = `3`;
354	return (u0-`224`)`4096` + (u1-`128`)`64` + (u2-`128`);
355	}
356	unsigned char u3 = u[`3`];
357	if (u0>=`240` && u0<=`247`) {
358	sz = `4`;
359	return (u0-`240`)`262144` + (u1-`128`)`4096` + (u2-`128`)*`64` + (u3-`128`);
360	}
361	return -`1`;
362	}
363
// Encodes code point cp as UTF-8 into c (no terminator is written).
//   cp - code point to encode.
//   sz - receives the number of bytes written (1-4), or -1 on failure.
//   c  - output buffer; must have room for up to 4 bytes.
// Returns true on success. Returns false, leaving c untouched, when cp is
// negative, a surrogate (0xD800-0xDFFF) or greater than 0x10FFFF.
bool utf8proc_codepoint_to_utf8(int cp, int &sz, char *c) {
  if (cp < 0) {
    // A negative value is not a code point (utf8proc_codepoint returns -1 on
    // failure); reject it instead of emitting its low byte as one character.
    sz = -1;
    return false;
  }
  if (cp<=0x7F) {
    sz = 1;
    c[0] = (char)cp;
  } else if(cp<=0x7FF) {
    sz = 2;
    c[0] = (char)((cp>>6)+192);
    c[1] = (char)((cp&63)+128);
  } else if(0xd800<=cp && cp<=0xdfff) {
    sz = -1;
    // invalid block of utf
    return false;
  } else if(cp<=0xFFFF) {
    sz = 3;
    c[0] = (char)((cp>>12)+224);
    c[1] = (char)(((cp>>6)&63)+128);
    c[2] = (char)((cp&63)+128);
  } else if(cp<=0x10FFFF) {
    sz = 4;
    c[0] = (char)((cp>>18)+240);
    c[1] = (char)(((cp>>12)&63)+128);
    c[2] = (char)(((cp>>6)&63)+128);
    c[3] = (char)((cp&63)+128);
  } else {
    sz = -1;
    return false;
  }
  return true;
}
393
// Returns the number of bytes (1-4) needed to encode code point cp in UTF-8,
// or -1 if cp cannot be encoded: negative, a surrogate (0xD800-0xDFFF) or
// greater than 0x10FFFF.
int utf8proc_codepoint_length(int cp) {
  if (cp < 0) {
    // Negative values are not code points; previously they fell into the
    // 1-byte branch.
    return -1;
  }
  if (cp<=0x7F) {
    return 1;
  } else if(cp<=0x7FF) {
    return 2;
  } else if(0xd800<=cp && cp<=0xdfff) {
    return -1;
  } else if(cp<=0xFFFF) {
    return 3;
  } else if(cp<=0x10FFFF) {
    return 4;
  }
  return -1;
}
408
409	size_t utf8proc_next_grapheme(const char *s, size_t len, size_t cpos) {
410	int sz;
411	int boundclass = UTF8PROC_BOUNDCLASS_START;
412	int initial = utf8proc_get_property(uc: utf8proc_codepoint(u_input: s + cpos, sz))->boundclass;
413	grapheme_break_extended(lbc: boundclass, tbc: initial, state: &boundclass);
414	while(true) {
415	cpos += sz;
416	if (cpos >= len) {
417	return cpos;
418	}
419	int next = utf8proc_get_property(uc: utf8proc_codepoint(u_input: s + cpos, sz))->boundclass;
420	if (grapheme_break_extended(lbc: boundclass, tbc: next, state: &boundclass)) {
421	return cpos;
422	}
423	}
424	}
425
426	static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
427	{
428	utf8proc_int32_t entry_cp = **entry;
429	if ((entry_cp & `0xF800`) == `0xD800`) {
430	entry = entry + `1`;
431	entry_cp = ((entry_cp & `0x03FF`) << `10`) \| (**entry & `0x03FF`);
432	entry_cp += `0x10000`;
433	}
434	return entry_cp;
435	}
436
437	static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
438	{
439	const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
440	return seqindex_decode_entry(entry: &entry);
441	}
442
443	static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int* *last_boundclass) {
444	utf8proc_ssize_t written = `0`;
445	const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & `0x1FFF`];
446	int len = seqindex >> `13`;
447	if (len >= `7`) {
448	len = *entry;
449	entry++;
450	}
451	for (; len >= `0`; entry++, len--) {
452	utf8proc_int32_t entry_cp = seqindex_decode_entry(entry: &entry);
453	utf8proc_int32_t dst_ptr = dst ? dst + written : nullptr*;
454	written += utf8proc_decompose_char(codepoint: entry_cp, dst: dst_ptr,
455	bufsize: (bufsize > written) ? (bufsize - written) : `0`, options,
456	last_boundclass);
457	if (written < `0`) return UTF8PROC_ERROR_OVERFLOW;
458	}
459	return written;
460	}
461
462	UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
463	{
464	utf8proc_int32_t cl = utf8proc_get_property(uc: c)->lowercase_seqindex;
465	return cl != UINT16_MAX ? seqindex_decode_index(seqindex: cl) : c;
466	}
467
468	UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
469	{
470	utf8proc_int32_t cu = utf8proc_get_property(uc: c)->uppercase_seqindex;
471	return cu != UINT16_MAX ? seqindex_decode_index(seqindex: cu) : c;
472	}
473
474	UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
475	{
476	utf8proc_int32_t cu = utf8proc_get_property(uc: c)->titlecase_seqindex;
477	return cu != UINT16_MAX ? seqindex_decode_index(seqindex: cu) : c;
478	}
479
480	/ return a character width analogous to wcwidth (except portable and*
481	hopefully less buggy than most system wcwidth functions). /*
482	UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
483	return utf8proc_get_property(uc: c)->charwidth;
484	}
485
486	UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
487	return (utf8proc_category_t)utf8proc_get_property(uc: c)->category;
488	}
489
490	UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
491	static const char s[][`3`] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
492	return s[utf8proc_category(c)];
493	}
494
495	#define utf8proc_decompose_lump(replacement_uc) \
496	return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
497	(utf8proc_option_t) (options & ~UTF8PROC_LUMP), last_boundclass)
498
499	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int* *last_boundclass) {
500	const utf8proc_property_t *property;
501	utf8proc_propval_t category;
502	utf8proc_int32_t hangul_sindex;
503	if (uc < `0` \|\| uc >= `0x110000`) return UTF8PROC_ERROR_NOTASSIGNED;
504	property = unsafe_get_property(uc);
505	category = property->category;
506	hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
507	if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) {
508	if (hangul_sindex >= `0` && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
509	utf8proc_int32_t hangul_tindex;
510	if (bufsize >= `1`) {
511	dst[`0`] = UTF8PROC_HANGUL_LBASE +
512	hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
513	if (bufsize >= `2`) dst[`1`] = UTF8PROC_HANGUL_VBASE +
514	(hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
515	}
516	hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
517	if (!hangul_tindex) return `2`;
518	if (bufsize >= `3`) dst[`2`] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
519	return `3`;
520	}
521	}
522	if (options & UTF8PROC_REJECTNA) {
523	if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
524	}
525	if (options & UTF8PROC_IGNORE) {
526	if (property->ignorable) return `0`;
527	}
528	if (options & UTF8PROC_STRIPNA) {
529	if (!category) return `0`;
530	}
531	if (options & UTF8PROC_LUMP) {
532	if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(`0x0020`);
533	if (uc == `0x2018` \|\| uc == `0x2019` \|\| uc == `0x02BC` \|\| uc == `0x02C8`)
534	utf8proc_decompose_lump(`0x0027`);
535	if (category == UTF8PROC_CATEGORY_PD \|\| uc == `0x2212`)
536	utf8proc_decompose_lump(`0x002D`);
537	if (uc == `0x2044` \|\| uc == `0x2215`) utf8proc_decompose_lump(`0x002F`);
538	if (uc == `0x2236`) utf8proc_decompose_lump(`0x003A`);
539	if (uc == `0x2039` \|\| uc == `0x2329` \|\| uc == `0x3008`)
540	utf8proc_decompose_lump(`0x003C`);
541	if (uc == `0x203A` \|\| uc == `0x232A` \|\| uc == `0x3009`)
542	utf8proc_decompose_lump(`0x003E`);
543	if (uc == `0x2216`) utf8proc_decompose_lump(`0x005C`);
544	if (uc == `0x02C4` \|\| uc == `0x02C6` \|\| uc == `0x2038` \|\| uc == `0x2303`)
545	utf8proc_decompose_lump(`0x005E`);
546	if (category == UTF8PROC_CATEGORY_PC \|\| uc == `0x02CD`)
547	utf8proc_decompose_lump(`0x005F`);
548	if (uc == `0x02CB`) utf8proc_decompose_lump(`0x0060`);
549	if (uc == `0x2223`) utf8proc_decompose_lump(`0x007C`);
550	if (uc == `0x223C`) utf8proc_decompose_lump(`0x007E`);
551	if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
552	if (category == UTF8PROC_CATEGORY_ZL \|\|
553	category == UTF8PROC_CATEGORY_ZP)
554	utf8proc_decompose_lump(`0x000A`);
555	}
556	}
557	if (options & UTF8PROC_STRIPMARK) {
558	if (category == UTF8PROC_CATEGORY_MN \|\|
559	category == UTF8PROC_CATEGORY_MC \|\|
560	category == UTF8PROC_CATEGORY_ME) return `0`;
561	}
562	if (options & UTF8PROC_CASEFOLD) {
563	if (property->casefold_seqindex != UINT16_MAX) {
564	return seqindex_write_char_decomposed(seqindex: property->casefold_seqindex, dst, bufsize, options, last_boundclass);
565	}
566	}
567	if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) {
568	if (property->decomp_seqindex != UINT16_MAX &&
569	(!property->decomp_type \|\| (options & UTF8PROC_COMPAT))) {
570	return seqindex_write_char_decomposed(seqindex: property->decomp_seqindex, dst, bufsize, options, last_boundclass);
571	}
572	}
573	if (options & UTF8PROC_CHARBOUND) {
574	utf8proc_bool boundary;
575	int tbc = property->boundclass;
576	boundary = grapheme_break_extended(lbc: *last_boundclass, tbc, state: last_boundclass);
577	if (boundary) {
578	if (bufsize >= `1`) dst[`0`] = -`1`; / sentinel value for grapheme break /
579	if (bufsize >= `2`) dst[`1`] = uc;
580	return `2`;
581	}
582	}
583	if (bufsize >= `1`) *dst = uc;
584	return `1`;
585	}
586
587	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
588	const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
589	utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
590	) {
591	return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
592	}
593
594	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
595	const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
596	utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
597	utf8proc_custom_func custom_func, void *custom_data
598	) {
599	/ strlen will be ignored, if UTF8PROC_NULLTERM is set in options /
600	utf8proc_ssize_t wpos = `0`;
601	if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
602	return UTF8PROC_ERROR_INVALIDOPTS;
603	if ((options & UTF8PROC_STRIPMARK) &&
604	!(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
605	return UTF8PROC_ERROR_INVALIDOPTS;
606	{
607	utf8proc_int32_t uc;
608	utf8proc_ssize_t rpos = `0`;
609	utf8proc_ssize_t decomp_result;
610	int boundclass = UTF8PROC_BOUNDCLASS_START;
611	while (`1`) {
612	if (options & UTF8PROC_NULLTERM) {
613	rpos += utf8proc_iterate(str: str + rpos, strlen: -`1`, dst: &uc);
614	/ checking of return value is not necessary,*
615	as 'uc' is < 0 in case of error /*
616	if (uc < `0`) return UTF8PROC_ERROR_INVALIDUTF8;
617	if (rpos < `0`) return UTF8PROC_ERROR_OVERFLOW;
618	if (uc == `0`) break;
619	} else {
620	if (rpos >= strlen) break;
621	rpos += utf8proc_iterate(str: str + rpos, strlen: strlen - rpos, dst: &uc);
622	if (uc < `0`) return UTF8PROC_ERROR_INVALIDUTF8;
623	}
624	if (custom_func != NULL) {
625	uc = custom_func(uc, custom_data); / user-specified custom mapping /
626	}
627	utf8proc_int32_t target_buffer = buffer ? buffer + wpos : nullptr*;
628	decomp_result = utf8proc_decompose_char(
629	uc, dst: target_buffer, bufsize: (bufsize > wpos) ? (bufsize - wpos) : `0`, options,
630	last_boundclass: &boundclass
631	);
632	if (decomp_result < `0`) return decomp_result;
633	wpos += decomp_result;
634	/ prohibiting integer overflows due to too long strings: /
635	if (wpos < `0` \|\|
636	wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/`2`))
637	return UTF8PROC_ERROR_OVERFLOW;
638	}
639	}
640	if ((options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
641	utf8proc_ssize_t pos = `0`;
642	while (pos < wpos-`1`) {
643	utf8proc_int32_t uc1, uc2;
644	const utf8proc_property_t property1, property2;
645	uc1 = buffer[pos];
646	uc2 = buffer[pos+`1`];
647	property1 = unsafe_get_property(uc: uc1);
648	property2 = unsafe_get_property(uc: uc2);
649	if (property1->combining_class > property2->combining_class &&
650	property2->combining_class > `0`) {
651	buffer[pos] = uc2;
652	buffer[pos+`1`] = uc1;
653	if (pos > `0`) pos--; else pos++;
654	} else {
655	pos++;
656	}
657	}
658	}
659	return wpos;
660	}
661
662	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
663	/ UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored /
664	if (options & (UTF8PROC_NLF2LS \| UTF8PROC_NLF2PS \| UTF8PROC_STRIPCC)) {
665	utf8proc_ssize_t rpos;
666	utf8proc_ssize_t wpos = `0`;
667	utf8proc_int32_t uc;
668	for (rpos = `0`; rpos < length; rpos++) {
669	uc = buffer[rpos];
670	if (uc == `0x000D` && rpos < length-`1` && buffer[rpos+`1`] == `0x000A`) rpos++;
671	if (uc == `0x000A` \|\| uc == `0x000D` \|\| uc == `0x0085` \|\|
672	((options & UTF8PROC_STRIPCC) && (uc == `0x000B` \|\| uc == `0x000C`))) {
673	if (options & UTF8PROC_NLF2LS) {
674	if (options & UTF8PROC_NLF2PS) {
675	buffer[wpos++] = `0x000A`;
676	} else {
677	buffer[wpos++] = `0x2028`;
678	}
679	} else {
680	if (options & UTF8PROC_NLF2PS) {
681	buffer[wpos++] = `0x2029`;
682	} else {
683	buffer[wpos++] = `0x0020`;
684	}
685	}
686	} else if ((options & UTF8PROC_STRIPCC) &&
687	(uc < `0x0020` \|\| (uc >= `0x007F` && uc < `0x00A0`))) {
688	if (uc == `0x0009`) buffer[wpos++] = `0x0020`;
689	} else {
690	buffer[wpos++] = uc;
691	}
692	}
693	length = wpos;
694	}
695	if (options & UTF8PROC_COMPOSE) {
696	utf8proc_int32_t *starter = NULL;
697	utf8proc_int32_t current_char;
698	const utf8proc_property_t starter_property = NULL, current_property;
699	utf8proc_propval_t max_combining_class = -`1`;
700	utf8proc_ssize_t rpos;
701	utf8proc_ssize_t wpos = `0`;
702	utf8proc_int32_t composition;
703	for (rpos = `0`; rpos < length; rpos++) {
704	current_char = buffer[rpos];
705	current_property = unsafe_get_property(uc: current_char);
706	if (starter && current_property->combining_class > max_combining_class) {
707	/ combination perhaps possible /
708	utf8proc_int32_t hangul_lindex;
709	utf8proc_int32_t hangul_sindex;
710	hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
711	if (hangul_lindex >= `0` && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
712	utf8proc_int32_t hangul_vindex;
713	hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
714	if (hangul_vindex >= `0` && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
715	*starter = UTF8PROC_HANGUL_SBASE +
716	(hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
717	UTF8PROC_HANGUL_TCOUNT;
718	starter_property = NULL;
719	continue;
720	}
721	}
722	hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
723	if (hangul_sindex >= `0` && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
724	(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == `0`) {
725	utf8proc_int32_t hangul_tindex;
726	hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
727	if (hangul_tindex >= `0` && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
728	*starter += hangul_tindex;
729	starter_property = NULL;
730	continue;
731	}
732	}
733	if (!starter_property) {
734	starter_property = unsafe_get_property(uc: *starter);
735	}
736	if (starter_property->comb_index < `0x8000` &&
737	current_property->comb_index != UINT16_MAX &&
738	current_property->comb_index >= `0x8000`) {
739	int sidx = starter_property->comb_index;
740	int idx = current_property->comb_index & `0x3FFF`;
741	if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + `1`] ) {
742	idx += sidx + `2` - utf8proc_combinations[sidx];
743	if (current_property->comb_index & `0x4000`) {
744	composition = (utf8proc_combinations[idx] << `16`) \| utf8proc_combinations[idx+`1`];
745	} else
746	composition = utf8proc_combinations[idx];
747
748	if (composition > `0` && (!(options & UTF8PROC_STABLE) \|\|
749	!(unsafe_get_property(uc: composition)->comp_exclusion))) {
750	*starter = composition;
751	starter_property = NULL;
752	continue;
753	}
754	}
755	}
756	}
757	buffer[wpos] = current_char;
758	if (current_property->combining_class) {
759	if (current_property->combining_class > max_combining_class) {
760	max_combining_class = current_property->combining_class;
761	}
762	} else {
763	starter = buffer + wpos;
764	starter_property = NULL;
765	max_combining_class = -`1`;
766	}
767	wpos++;
768	}
769	length = wpos;
770	}
771	return length;
772	}
773
774	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
775	/ UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored*
776	ASSERT: 'buffer' has one spare byte of free space at the end! /*
777	length = utf8proc_normalize_utf32(buffer, length, options);
778	if (length < `0`) return length;
779	{
780	utf8proc_ssize_t rpos, wpos = `0`;
781	utf8proc_int32_t uc;
782	if (options & UTF8PROC_CHARBOUND) {
783	for (rpos = `0`; rpos < length; rpos++) {
784	uc = buffer[rpos];
785	wpos += charbound_encode_char(uc, dst: ((utf8proc_uint8_t *)buffer) + wpos);
786	}
787	} else {
788	for (rpos = `0`; rpos < length; rpos++) {
789	uc = buffer[rpos];
790	wpos += utf8proc_encode_char(uc, dst: ((utf8proc_uint8_t *)buffer) + wpos);
791	}
792	}
793	((utf8proc_uint8_t *)buffer)[wpos] = `0`;
794	return wpos;
795	}
796	}
797
798	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
799	const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_uint8_t *dstptr, utf8proc_option_t options
800	) {
801	return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
802	}
803
804	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
805	const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_uint8_t *dstptr, utf8proc_option_t options,
806	utf8proc_custom_func custom_func, void *custom_data
807	) {
808	utf8proc_int32_t *buffer;
809	utf8proc_ssize_t result;
810	*dstptr = NULL;
811	result = utf8proc_decompose_custom(str, strlen, NULL, bufsize: `0`, options, custom_func, custom_data);
812	if (result < `0`) return result;
813	buffer = (utf8proc_int32_t ) malloc(size: result sizeof(utf8proc_int32_t) + `1`);
814	if (!buffer) return UTF8PROC_ERROR_NOMEM;
815	result = utf8proc_decompose_custom(str, strlen, buffer, bufsize: result, options, custom_func, custom_data);
816	if (result < `0`) {
817	free(ptr: buffer);
818	return result;
819	}
820	result = utf8proc_reencode(buffer, length: result, options);
821	if (result < `0`) {
822	free(ptr: buffer);
823	return result;
824	}
825	{
826	utf8proc_int32_t *newptr;
827	newptr = (utf8proc_int32_t *) realloc(ptr: buffer, size: (size_t)result+`1`);
828	if (newptr) buffer = newptr;
829	}
830	dstptr = (utf8proc_uint8_t )buffer;
831	return result;
832	}
833
834	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFD(const* utf8proc_uint8_t *str, utf8proc_ssize_t len) {
835	utf8proc_uint8_t *retval;
836	utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE \|
837	UTF8PROC_DECOMPOSE));
838	return retval;
839	}
840
841	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFC(const* utf8proc_uint8_t *str, utf8proc_ssize_t len) {
842	utf8proc_uint8_t *retval;
843	utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE \|
844	UTF8PROC_COMPOSE));
845	return retval;
846	}
847
848	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_remove_accents(const* utf8proc_uint8_t *str, utf8proc_ssize_t len) {
849	utf8proc_uint8_t *retval;
850	utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE \|
851	UTF8PROC_COMPOSE \| UTF8PROC_STRIPMARK));
852	return retval;
853	}
854
855	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKD(const* utf8proc_uint8_t *str, utf8proc_ssize_t len) {
856	utf8proc_uint8_t *retval;
857	utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE \|
858	UTF8PROC_DECOMPOSE \| UTF8PROC_COMPAT));
859	return retval;
860	}
861
862	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKC(const* utf8proc_uint8_t *str, utf8proc_ssize_t len) {
863	utf8proc_uint8_t *retval;
864	utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE \|
865	UTF8PROC_COMPOSE \| UTF8PROC_COMPAT));
866	return retval;
867	}
868
869	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKC_Casefold(const* utf8proc_uint8_t *str, utf8proc_ssize_t len) {
870	utf8proc_uint8_t *retval;
871	utf8proc_map(str, strlen: len, dstptr: &retval, options: (utf8proc_option_t)(UTF8PROC_STABLE \|
872	UTF8PROC_COMPOSE \| UTF8PROC_COMPAT \| UTF8PROC_CASEFOLD \| UTF8PROC_IGNORE));
873	return retval;
874	}
875
876	}
877

Browse the source code of Velox/build/_deps/duckdb-src/third_party/utf8proc/utf8proc.cpp