utf8proc.cpp source code [DuckDB/third_party/utf8proc/utf8proc.cpp]

1	/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2	/*
3	* Copyright (c) 2014-2019 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
4	* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a
7	* copy of this software and associated documentation files (the "Software"),
8	* to deal in the Software without restriction, including without limitation
9	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
10	* and/or sell copies of the Software, and to permit persons to whom the
11	* Software is furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22	* DEALINGS IN THE SOFTWARE.
23	*/
24
25	/*
26	* This library contains derived data from a modified version of the
27	* Unicode data files.
28	*
29	* The original data files are available at
30	* http://www.unicode.org/Public/UNIDATA/
31	*
32	* Please notice the copyright statement in the file "utf8proc_data.c".
33	*/
34
35
36	/*
37	* File name: utf8proc.c
38	*
39	* Description:
40	* Implementation of libutf8proc.
41	*/
42
43
44	#include "utf8proc.hpp"
45
46	#ifndef SSIZE_MAX
47	#define SSIZE_MAX ((size_t)SIZE_MAX/2)
48	#endif
49	#ifndef UINT16_MAX
50	# define UINT16_MAX 65535U
51	#endif
52
53	#include "utf8proc_data.cpp"
54
55
56	UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[`256`] = {
57	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
58	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
59	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
60	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
61	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
62	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
63	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
64	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
65	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
66	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
67	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
68	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
69	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
70	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
71	`3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`,
72	`4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0` };
73
74	#define UTF8PROC_HANGUL_SBASE 0xAC00
75	#define UTF8PROC_HANGUL_LBASE 0x1100
76	#define UTF8PROC_HANGUL_VBASE 0x1161
77	#define UTF8PROC_HANGUL_TBASE 0x11A7
78	#define UTF8PROC_HANGUL_LCOUNT 19
79	#define UTF8PROC_HANGUL_VCOUNT 21
80	#define UTF8PROC_HANGUL_TCOUNT 28
81	#define UTF8PROC_HANGUL_NCOUNT 588
82	#define UTF8PROC_HANGUL_SCOUNT 11172
83	/* END is exclusive */
84	#define UTF8PROC_HANGUL_L_START 0x1100
85	#define UTF8PROC_HANGUL_L_END 0x115A
86	#define UTF8PROC_HANGUL_L_FILLER 0x115F
87	#define UTF8PROC_HANGUL_V_START 0x1160
88	#define UTF8PROC_HANGUL_V_END 0x11A3
89	#define UTF8PROC_HANGUL_T_START 0x11A8
90	#define UTF8PROC_HANGUL_T_END 0x11FA
91	#define UTF8PROC_HANGUL_S_START 0xAC00
92	#define UTF8PROC_HANGUL_S_END 0xD7A4
93
94	/ Should follow semantic-versioning rules (semver.org) based on API*
95	compatibility. (Note that the shared-library version number will
96	be different, being based on ABI compatibility.): /*
97	#define STRINGIZEx(x) #x
98	#define STRINGIZE(x) STRINGIZEx(x)
99	UTF8PROC_DLLEXPORT const char utf8proc_version(void*) {
100	return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
101	}
102
103	UTF8PROC_DLLEXPORT const char utf8proc_unicode_version(void*) {
104	return "12.1.0";
105	}
106
107	UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
108	switch (errcode) {
109	case UTF8PROC_ERROR_NOMEM:
110	return "Memory for processing UTF-8 data could not be allocated.";
111	case UTF8PROC_ERROR_OVERFLOW:
112	return "UTF-8 string is too long to be processed.";
113	case UTF8PROC_ERROR_INVALIDUTF8:
114	return "Invalid UTF-8 string";
115	case UTF8PROC_ERROR_NOTASSIGNED:
116	return "Unassigned Unicode code point found in UTF-8 string.";
117	case UTF8PROC_ERROR_INVALIDOPTS:
118	return "Invalid options for UTF-8 processing chosen.";
119	default:
120	return "An unknown error occurred while processing UTF-8 data.";
121	}
122	}
123
124	#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
125	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
126	const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_int32_t dst
127	) {
128	utf8proc_uint32_t uc;
129	const utf8proc_uint8_t *end;
130
131	*dst = -`1`;
132	if (!strlen) return `0`;
133	end = str + ((strlen < `0`) ? `4` : strlen);
134	uc = *str++;
135	if (uc < `0x80`) {
136	*dst = uc;
137	return `1`;
138	}
139	// Must be between 0xc2 and 0xf4 inclusive to be valid
140	if ((uc - `0xc2`) > (`0xf4`-`0xc2`)) return UTF8PROC_ERROR_INVALIDUTF8;
141	if (uc < `0xe0`) { // 2-byte sequence
142	// Must have valid continuation character
143	if (str >= end \|\| !utf_cont(str)) return* UTF8PROC_ERROR_INVALIDUTF8;
144	dst = ((uc & `0x1f`)<<`6`) \| (str & `0x3f`);
145	return `2`;
146	}
147	if (uc < `0xf0`) { // 3-byte sequence
148	if ((str + `1` >= end) \|\| !utf_cont(*str) \|\| !utf_cont(str[`1`]))
149	return UTF8PROC_ERROR_INVALIDUTF8;
150	// Check for surrogate chars
151	if (uc == `0xed` && *str > `0x9f`)
152	return UTF8PROC_ERROR_INVALIDUTF8;
153	uc = ((uc & `0xf`)<<`12`) \| ((*str & `0x3f`)<<`6`) \| (str[`1`] & `0x3f`);
154	if (uc < `0x800`)
155	return UTF8PROC_ERROR_INVALIDUTF8;
156	*dst = uc;
157	return `3`;
158	}
159	// 4-byte sequence
160	// Must have 3 valid continuation characters
161	if ((str + `2` >= end) \|\| !utf_cont(*str) \|\| !utf_cont(str[`1`]) \|\| !utf_cont(str[`2`]))
162	return UTF8PROC_ERROR_INVALIDUTF8;
163	// Make sure in correct range (0x10000 - 0x10ffff)
164	if (uc == `0xf0`) {
165	if (str < `0x90`) return* UTF8PROC_ERROR_INVALIDUTF8;
166	} else if (uc == `0xf4`) {
167	if (str > `0x8f`) return* UTF8PROC_ERROR_INVALIDUTF8;
168	}
169	dst = ((uc & `7`)<<`18`) \| ((str & `0x3f`)<<`12`) \| ((str[`1`] & `0x3f`)<<`6`) \| (str[`2`] & `0x3f`);
170	return `4`;
171	}
172
173	UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
174	return (((utf8proc_uint32_t)uc)-`0xd800` > `0x07ff`) && ((utf8proc_uint32_t)uc < `0x110000`);
175	}
176
177	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
178	if (uc < `0x00`) {
179	return `0`;
180	} else if (uc < `0x80`) {
181	dst[`0`] = (utf8proc_uint8_t) uc;
182	return `1`;
183	} else if (uc < `0x800`) {
184	dst[`0`] = (utf8proc_uint8_t)(`0xC0` + (uc >> `6`));
185	dst[`1`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
186	return `2`;
187	// Note: we allow encoding 0xd800-0xdfff here, so as not to change
188	// the API, however, these are actually invalid in UTF-8
189	} else if (uc < `0x10000`) {
190	dst[`0`] = (utf8proc_uint8_t)(`0xE0` + (uc >> `12`));
191	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
192	dst[`2`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
193	return `3`;
194	} else if (uc < `0x110000`) {
195	dst[`0`] = (utf8proc_uint8_t)(`0xF0` + (uc >> `18`));
196	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `12`) & `0x3F`));
197	dst[`2`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
198	dst[`3`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
199	return `4`;
200	} else return `0`;
201	}
202
203	/ internal version used for inserting 0xff bytes between graphemes /
204	static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
205	if (uc < `0x00`) {
206	if (uc == -`1`) { / internal value used for grapheme breaks /
207	dst[`0`] = (utf8proc_uint8_t)`0xFF`;
208	return `1`;
209	}
210	return `0`;
211	} else if (uc < `0x80`) {
212	dst[`0`] = (utf8proc_uint8_t)uc;
213	return `1`;
214	} else if (uc < `0x800`) {
215	dst[`0`] = (utf8proc_uint8_t)(`0xC0` + (uc >> `6`));
216	dst[`1`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
217	return `2`;
218	} else if (uc < `0x10000`) {
219	dst[`0`] = (utf8proc_uint8_t)(`0xE0` + (uc >> `12`));
220	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
221	dst[`2`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
222	return `3`;
223	} else if (uc < `0x110000`) {
224	dst[`0`] = (utf8proc_uint8_t)(`0xF0` + (uc >> `18`));
225	dst[`1`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `12`) & `0x3F`));
226	dst[`2`] = (utf8proc_uint8_t)(`0x80` + ((uc >> `6`) & `0x3F`));
227	dst[`3`] = (utf8proc_uint8_t)(`0x80` + (uc & `0x3F`));
228	return `4`;
229	} else return `0`;
230	}
231
232	/ internal "unsafe" version that does not check whether uc is in range /
233	static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
234	/ ASSERT: uc >= 0 && uc < 0x110000 /
235	return utf8proc_properties + (
236	utf8proc_stage2table[
237	utf8proc_stage1table[uc >> `8`] + (uc & `0xFF`)
238	]
239	);
240	}
241
242	UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
243	return uc < `0` \|\| uc >= `0x110000` ? utf8proc_properties : unsafe_get_property(uc);
244	}
245
246	/ return whether there is a grapheme break between boundclasses lbc and tbc*
247	(according to the definition of extended grapheme clusters)
248
249	Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
250	http://www.unicode.org/reports/tr29/tr29-29.html
251
252	CAVEATS:
253	Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
254	and GB 12/13 (regional indicator code points) require knowledge of previous characters
255	and are thus not handled by this function. This may result in an incorrect break before
256	an E_Modifier class codepoint and an incorrectly missing break between two
257	REGIONAL_INDICATOR class code points if such support does not exist in the caller.
258
259	See the special support in grapheme_break_extended, for required bookkeeping by the caller.
260	*/
261	static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
262	return
263	(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
264	(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
265	tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
266	(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
267	(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
268	(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
269	(tbc == UTF8PROC_BOUNDCLASS_L \|\| // ---
270	tbc == UTF8PROC_BOUNDCLASS_V \|\| // ---
271	tbc == UTF8PROC_BOUNDCLASS_LV \|\| // ---
272	tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
273	((lbc == UTF8PROC_BOUNDCLASS_LV \|\| // GB7
274	lbc == UTF8PROC_BOUNDCLASS_V) && // ---
275	(tbc == UTF8PROC_BOUNDCLASS_V \|\| // ---
276	tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
277	((lbc == UTF8PROC_BOUNDCLASS_LVT \|\| // GB8
278	lbc == UTF8PROC_BOUNDCLASS_T) && // ---
279	tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
280	(tbc == UTF8PROC_BOUNDCLASS_EXTEND \|\| // GB9
281	tbc == UTF8PROC_BOUNDCLASS_ZWJ \|\| // ---
282	tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK \|\| // GB9a
283	lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
284	(lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
285	tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
286	(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
287	tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
288	true; // GB999
289	}
290
291	utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
292	{
293	int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
294	? *state : lbc);
295	utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
296	if (state) {
297	// Special support for GB 12/13 made possible by GB999. After two RI
298	// class codepoints we want to force a break. Do this by resetting the
299	// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
300	// after that character according to GB999 (unless of course such a break is
301	// forbidden by a different rule such as GB9).
302	if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
303	*state = UTF8PROC_BOUNDCLASS_OTHER;
304	// Special support for GB11 (emoji extend zwj / emoji)*
305	else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
306	if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
307	*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
308	else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
309	state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo*
310	else
311	*state = tbc;
312	}
313	else
314	*state = tbc;
315	}
316	return break_permitted;
317	}
318
319	UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
320	utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
321
322	return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
323	utf8proc_get_property(c2)->boundclass,
324	state);
325	}
326
327
328	UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
329	utf8proc_int32_t c1, utf8proc_int32_t c2) {
330	return utf8proc_grapheme_break_stateful(c1, c2, NULL);
331	}
332
333	// from http://www.zedwood.com/article/cpp-utf8-char-to-codepoint
334	utf8proc_int32_t utf8proc_codepoint(const char u_input, int* &sz) {
335	auto u = (const unsigned char *) u_input;
336	unsigned char u0 = u[`0`];
337	if (u0>=`0` && u0<=`127`) {
338	sz = `1`;
339	return u0;
340	}
341	unsigned char u1 = u[`1`];
342	if (u0>=`192` && u0<=`223`) {
343	sz = `2`;
344	return (u0-`192`)*`64` + (u1-`128`);
345	}
346	if (u[`0`]==`0xed` && (u[`1`] & `0xa0`) == `0xa0`) {
347	return -`1`; //code points, 0xd800 to 0xdfff
348	}
349	unsigned char u2 = u[`2`];
350	if (u0>=`224` && u0<=`239`) {
351	sz = `3`;
352	return (u0-`224`)`4096` + (u1-`128`)`64` + (u2-`128`);
353	}
354	unsigned char u3 = u[`3`];
355	if (u0>=`240` && u0<=`247`) {
356	sz = `4`;
357	return (u0-`240`)`262144` + (u1-`128`)`4096` + (u2-`128`)*`64` + (u3-`128`);
358	}
359	return -`1`;
360	}
361
// Encodes code point cp as UTF-8 into c (which must have room for 4 bytes).
//
// On success stores the number of bytes written (1-4) in sz and returns true.
// For an invalid code point -- negative, a surrogate (0xd800-0xdfff) or above
// 0x10FFFF -- nothing is written, sz is set to -1 and false is returned.
// Negative values used to be accepted as a one-byte sequence; they are now
// rejected like every other out-of-range value.
bool utf8proc_codepoint_to_utf8(int cp, int &sz, char *c) {
  const bool is_surrogate = (0xd800 <= cp && cp <= 0xdfff);
  if (cp < 0 || cp > 0x10FFFF || is_surrogate) {
    sz = -1;
    return false;
  }
  if (cp <= 0x7F) {
    sz = 1;
    c[0] = (char)cp;
  } else if (cp <= 0x7FF) {
    sz = 2;
    c[0] = (char)((cp >> 6) + 192);
    c[1] = (char)((cp & 63) + 128);
  } else if (cp <= 0xFFFF) {
    sz = 3;
    c[0] = (char)((cp >> 12) + 224);
    c[1] = (char)(((cp >> 6) & 63) + 128);
    c[2] = (char)((cp & 63) + 128);
  } else {
    sz = 4;
    c[0] = (char)((cp >> 18) + 240);
    c[1] = (char)(((cp >> 12) & 63) + 128);
    c[2] = (char)(((cp >> 6) & 63) + 128);
    c[3] = (char)((cp & 63) + 128);
  }
  return true;
}
391
/* Returns the number of bytes (1-4) needed to encode code point cp in UTF-8,
 * or -1 when cp cannot be encoded: negative values, surrogates
 * (0xd800-0xdfff) and values above 0x10FFFF.  Negative values used to be
 * reported as length 1; they are now rejected like other invalid input. */
int utf8proc_codepoint_length(int cp) {
  if (cp < 0 || cp > 0x10FFFF) {
    return -1;
  }
  if (cp <= 0x7F) {
    return 1;
  }
  if (cp <= 0x7FF) {
    return 2;
  }
  if (0xd800 <= cp && cp <= 0xdfff) {
    return -1;
  }
  if (cp <= 0xFFFF) {
    return 3;
  }
  return 4;
}
406
407	size_t utf8proc_next_grapheme(const char *s, size_t len, size_t cpos) {
408	int sz;
409	int boundclass = UTF8PROC_BOUNDCLASS_START;
410	int initial = utf8proc_get_property(utf8proc_codepoint(s + cpos, sz))->boundclass;
411	grapheme_break_extended(boundclass, initial, &boundclass);
412	while(true) {
413	cpos += sz;
414	if (cpos >= len) {
415	return cpos;
416	}
417	int next = utf8proc_get_property(utf8proc_codepoint(s + cpos, sz))->boundclass;
418	if (grapheme_break_extended(boundclass, next, &boundclass)) {
419	return cpos;
420	}
421	}
422	}
423
424	static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
425	{
426	utf8proc_int32_t entry_cp = **entry;
427	if ((entry_cp & `0xF800`) == `0xD800`) {
428	entry = entry + `1`;
429	entry_cp = ((entry_cp & `0x03FF`) << `10`) \| (**entry & `0x03FF`);
430	entry_cp += `0x10000`;
431	}
432	return entry_cp;
433	}
434
435	static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
436	{
437	const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
438	return seqindex_decode_entry(&entry);
439	}
440
441	static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int* *last_boundclass) {
442	utf8proc_ssize_t written = `0`;
443	const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & `0x1FFF`];
444	int len = seqindex >> `13`;
445	if (len >= `7`) {
446	len = *entry;
447	entry++;
448	}
449	for (; len >= `0`; entry++, len--) {
450	utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
451
452	written += utf8proc_decompose_char(entry_cp, dst+written,
453	(bufsize > written) ? (bufsize - written) : `0`, options,
454	last_boundclass);
455	if (written < `0`) return UTF8PROC_ERROR_OVERFLOW;
456	}
457	return written;
458	}
459
460	UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
461	{
462	utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
463	return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
464	}
465
466	UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
467	{
468	utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
469	return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
470	}
471
472	UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
473	{
474	utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
475	return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
476	}
477
478	/ return a character width analogous to wcwidth (except portable and*
479	hopefully less buggy than most system wcwidth functions). /*
480	UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
481	return utf8proc_get_property(c)->charwidth;
482	}
483
484	UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
485	return (utf8proc_category_t)utf8proc_get_property(c)->category;
486	}
487
488	UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
489	static const char s[][`3`] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
490	return s[utf8proc_category(c)];
491	}
492
493	#define utf8proc_decompose_lump(replacement_uc) \
494	return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
495	(utf8proc_option_t) (options & ~UTF8PROC_LUMP), last_boundclass)
496
497	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int* *last_boundclass) {
498	const utf8proc_property_t *property;
499	utf8proc_propval_t category;
500	utf8proc_int32_t hangul_sindex;
501	if (uc < `0` \|\| uc >= `0x110000`) return UTF8PROC_ERROR_NOTASSIGNED;
502	property = unsafe_get_property(uc);
503	category = property->category;
504	hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
505	if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) {
506	if (hangul_sindex >= `0` && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
507	utf8proc_int32_t hangul_tindex;
508	if (bufsize >= `1`) {
509	dst[`0`] = UTF8PROC_HANGUL_LBASE +
510	hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
511	if (bufsize >= `2`) dst[`1`] = UTF8PROC_HANGUL_VBASE +
512	(hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
513	}
514	hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
515	if (!hangul_tindex) return `2`;
516	if (bufsize >= `3`) dst[`2`] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
517	return `3`;
518	}
519	}
520	if (options & UTF8PROC_REJECTNA) {
521	if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
522	}
523	if (options & UTF8PROC_IGNORE) {
524	if (property->ignorable) return `0`;
525	}
526	if (options & UTF8PROC_STRIPNA) {
527	if (!category) return `0`;
528	}
529	if (options & UTF8PROC_LUMP) {
530	if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(`0x0020`);
531	if (uc == `0x2018` \|\| uc == `0x2019` \|\| uc == `0x02BC` \|\| uc == `0x02C8`)
532	utf8proc_decompose_lump(`0x0027`);
533	if (category == UTF8PROC_CATEGORY_PD \|\| uc == `0x2212`)
534	utf8proc_decompose_lump(`0x002D`);
535	if (uc == `0x2044` \|\| uc == `0x2215`) utf8proc_decompose_lump(`0x002F`);
536	if (uc == `0x2236`) utf8proc_decompose_lump(`0x003A`);
537	if (uc == `0x2039` \|\| uc == `0x2329` \|\| uc == `0x3008`)
538	utf8proc_decompose_lump(`0x003C`);
539	if (uc == `0x203A` \|\| uc == `0x232A` \|\| uc == `0x3009`)
540	utf8proc_decompose_lump(`0x003E`);
541	if (uc == `0x2216`) utf8proc_decompose_lump(`0x005C`);
542	if (uc == `0x02C4` \|\| uc == `0x02C6` \|\| uc == `0x2038` \|\| uc == `0x2303`)
543	utf8proc_decompose_lump(`0x005E`);
544	if (category == UTF8PROC_CATEGORY_PC \|\| uc == `0x02CD`)
545	utf8proc_decompose_lump(`0x005F`);
546	if (uc == `0x02CB`) utf8proc_decompose_lump(`0x0060`);
547	if (uc == `0x2223`) utf8proc_decompose_lump(`0x007C`);
548	if (uc == `0x223C`) utf8proc_decompose_lump(`0x007E`);
549	if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
550	if (category == UTF8PROC_CATEGORY_ZL \|\|
551	category == UTF8PROC_CATEGORY_ZP)
552	utf8proc_decompose_lump(`0x000A`);
553	}
554	}
555	if (options & UTF8PROC_STRIPMARK) {
556	if (category == UTF8PROC_CATEGORY_MN \|\|
557	category == UTF8PROC_CATEGORY_MC \|\|
558	category == UTF8PROC_CATEGORY_ME) return `0`;
559	}
560	if (options & UTF8PROC_CASEFOLD) {
561	if (property->casefold_seqindex != UINT16_MAX) {
562	return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
563	}
564	}
565	if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) {
566	if (property->decomp_seqindex != UINT16_MAX &&
567	(!property->decomp_type \|\| (options & UTF8PROC_COMPAT))) {
568	return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
569	}
570	}
571	if (options & UTF8PROC_CHARBOUND) {
572	utf8proc_bool boundary;
573	int tbc = property->boundclass;
574	boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
575	if (boundary) {
576	if (bufsize >= `1`) dst[`0`] = -`1`; / sentinel value for grapheme break /
577	if (bufsize >= `2`) dst[`1`] = uc;
578	return `2`;
579	}
580	}
581	if (bufsize >= `1`) *dst = uc;
582	return `1`;
583	}
584
585	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
586	const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
587	utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
588	) {
589	return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
590	}
591
592	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
593	const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
594	utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
595	utf8proc_custom_func custom_func, void *custom_data
596	) {
597	/ strlen will be ignored, if UTF8PROC_NULLTERM is set in options /
598	utf8proc_ssize_t wpos = `0`;
599	if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
600	return UTF8PROC_ERROR_INVALIDOPTS;
601	if ((options & UTF8PROC_STRIPMARK) &&
602	!(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
603	return UTF8PROC_ERROR_INVALIDOPTS;
604	{
605	utf8proc_int32_t uc;
606	utf8proc_ssize_t rpos = `0`;
607	utf8proc_ssize_t decomp_result;
608	int boundclass = UTF8PROC_BOUNDCLASS_START;
609	while (`1`) {
610	if (options & UTF8PROC_NULLTERM) {
611	rpos += utf8proc_iterate(str + rpos, -`1`, &uc);
612	/ checking of return value is not necessary,*
613	as 'uc' is < 0 in case of error /*
614	if (uc < `0`) return UTF8PROC_ERROR_INVALIDUTF8;
615	if (rpos < `0`) return UTF8PROC_ERROR_OVERFLOW;
616	if (uc == `0`) break;
617	} else {
618	if (rpos >= strlen) break;
619	rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
620	if (uc < `0`) return UTF8PROC_ERROR_INVALIDUTF8;
621	}
622	if (custom_func != NULL) {
623	uc = custom_func(uc, custom_data); / user-specified custom mapping /
624	}
625	decomp_result = utf8proc_decompose_char(
626	uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : `0`, options,
627	&boundclass
628	);
629	if (decomp_result < `0`) return decomp_result;
630	wpos += decomp_result;
631	/ prohibiting integer overflows due to too long strings: /
632	if (wpos < `0` \|\|
633	wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/`2`))
634	return UTF8PROC_ERROR_OVERFLOW;
635	}
636	}
637	if ((options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
638	utf8proc_ssize_t pos = `0`;
639	while (pos < wpos-`1`) {
640	utf8proc_int32_t uc1, uc2;
641	const utf8proc_property_t property1, property2;
642	uc1 = buffer[pos];
643	uc2 = buffer[pos+`1`];
644	property1 = unsafe_get_property(uc1);
645	property2 = unsafe_get_property(uc2);
646	if (property1->combining_class > property2->combining_class &&
647	property2->combining_class > `0`) {
648	buffer[pos] = uc2;
649	buffer[pos+`1`] = uc1;
650	if (pos > `0`) pos--; else pos++;
651	} else {
652	pos++;
653	}
654	}
655	}
656	return wpos;
657	}
658
659	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
660	/ UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored /
661	if (options & (UTF8PROC_NLF2LS \| UTF8PROC_NLF2PS \| UTF8PROC_STRIPCC)) {
662	utf8proc_ssize_t rpos;
663	utf8proc_ssize_t wpos = `0`;
664	utf8proc_int32_t uc;
665	for (rpos = `0`; rpos < length; rpos++) {
666	uc = buffer[rpos];
667	if (uc == `0x000D` && rpos < length-`1` && buffer[rpos+`1`] == `0x000A`) rpos++;
668	if (uc == `0x000A` \|\| uc == `0x000D` \|\| uc == `0x0085` \|\|
669	((options & UTF8PROC_STRIPCC) && (uc == `0x000B` \|\| uc == `0x000C`))) {
670	if (options & UTF8PROC_NLF2LS) {
671	if (options & UTF8PROC_NLF2PS) {
672	buffer[wpos++] = `0x000A`;
673	} else {
674	buffer[wpos++] = `0x2028`;
675	}
676	} else {
677	if (options & UTF8PROC_NLF2PS) {
678	buffer[wpos++] = `0x2029`;
679	} else {
680	buffer[wpos++] = `0x0020`;
681	}
682	}
683	} else if ((options & UTF8PROC_STRIPCC) &&
684	(uc < `0x0020` \|\| (uc >= `0x007F` && uc < `0x00A0`))) {
685	if (uc == `0x0009`) buffer[wpos++] = `0x0020`;
686	} else {
687	buffer[wpos++] = uc;
688	}
689	}
690	length = wpos;
691	}
692	if (options & UTF8PROC_COMPOSE) {
693	utf8proc_int32_t *starter = NULL;
694	utf8proc_int32_t current_char;
695	const utf8proc_property_t starter_property = NULL, current_property;
696	utf8proc_propval_t max_combining_class = -`1`;
697	utf8proc_ssize_t rpos;
698	utf8proc_ssize_t wpos = `0`;
699	utf8proc_int32_t composition;
700	for (rpos = `0`; rpos < length; rpos++) {
701	current_char = buffer[rpos];
702	current_property = unsafe_get_property(current_char);
703	if (starter && current_property->combining_class > max_combining_class) {
704	/ combination perhaps possible /
705	utf8proc_int32_t hangul_lindex;
706	utf8proc_int32_t hangul_sindex;
707	hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
708	if (hangul_lindex >= `0` && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
709	utf8proc_int32_t hangul_vindex;
710	hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
711	if (hangul_vindex >= `0` && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
712	*starter = UTF8PROC_HANGUL_SBASE +
713	(hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
714	UTF8PROC_HANGUL_TCOUNT;
715	starter_property = NULL;
716	continue;
717	}
718	}
719	hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
720	if (hangul_sindex >= `0` && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
721	(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == `0`) {
722	utf8proc_int32_t hangul_tindex;
723	hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
724	if (hangul_tindex >= `0` && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
725	*starter += hangul_tindex;
726	starter_property = NULL;
727	continue;
728	}
729	}
730	if (!starter_property) {
731	starter_property = unsafe_get_property(*starter);
732	}
733	if (starter_property->comb_index < `0x8000` &&
734	current_property->comb_index != UINT16_MAX &&
735	current_property->comb_index >= `0x8000`) {
736	int sidx = starter_property->comb_index;
737	int idx = current_property->comb_index & `0x3FFF`;
738	if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + `1`] ) {
739	idx += sidx + `2` - utf8proc_combinations[sidx];
740	if (current_property->comb_index & `0x4000`) {
741	composition = (utf8proc_combinations[idx] << `16`) \| utf8proc_combinations[idx+`1`];
742	} else
743	composition = utf8proc_combinations[idx];
744
745	if (composition > `0` && (!(options & UTF8PROC_STABLE) \|\|
746	!(unsafe_get_property(composition)->comp_exclusion))) {
747	*starter = composition;
748	starter_property = NULL;
749	continue;
750	}
751	}
752	}
753	}
754	buffer[wpos] = current_char;
755	if (current_property->combining_class) {
756	if (current_property->combining_class > max_combining_class) {
757	max_combining_class = current_property->combining_class;
758	}
759	} else {
760	starter = buffer + wpos;
761	starter_property = NULL;
762	max_combining_class = -`1`;
763	}
764	wpos++;
765	}
766	length = wpos;
767	}
768	return length;
769	}
770
771	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
772	/ UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored*
773	ASSERT: 'buffer' has one spare byte of free space at the end! /*
774	length = utf8proc_normalize_utf32(buffer, length, options);
775	if (length < `0`) return length;
776	{
777	utf8proc_ssize_t rpos, wpos = `0`;
778	utf8proc_int32_t uc;
779	if (options & UTF8PROC_CHARBOUND) {
780	for (rpos = `0`; rpos < length; rpos++) {
781	uc = buffer[rpos];
782	wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
783	}
784	} else {
785	for (rpos = `0`; rpos < length; rpos++) {
786	uc = buffer[rpos];
787	wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
788	}
789	}
790	((utf8proc_uint8_t *)buffer)[wpos] = `0`;
791	return wpos;
792	}
793	}
794
795	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
796	const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_uint8_t *dstptr, utf8proc_option_t options
797	) {
798	return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
799	}
800
801	UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
802	const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_uint8_t *dstptr, utf8proc_option_t options,
803	utf8proc_custom_func custom_func, void *custom_data
804	) {
805	utf8proc_int32_t *buffer;
806	utf8proc_ssize_t result;
807	*dstptr = NULL;
808	result = utf8proc_decompose_custom(str, strlen, NULL, `0`, options, custom_func, custom_data);
809	if (result < `0`) return result;
810	buffer = (utf8proc_int32_t ) malloc(result sizeof(utf8proc_int32_t) + `1`);
811	if (!buffer) return UTF8PROC_ERROR_NOMEM;
812	result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
813	if (result < `0`) {
814	free(buffer);
815	return result;
816	}
817	result = utf8proc_reencode(buffer, result, options);
818	if (result < `0`) {
819	free(buffer);
820	return result;
821	}
822	{
823	utf8proc_int32_t *newptr;
824	newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+`1`);
825	if (newptr) buffer = newptr;
826	}
827	dstptr = (utf8proc_uint8_t )buffer;
828	return result;
829	}
830
831	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFD(const* utf8proc_uint8_t *str) {
832	utf8proc_uint8_t *retval;
833	utf8proc_map(str, `0`, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM \| UTF8PROC_STABLE \|
834	UTF8PROC_DECOMPOSE));
835	return retval;
836	}
837
838	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFC(const* utf8proc_uint8_t *str) {
839	utf8proc_uint8_t *retval;
840	utf8proc_map(str, `0`, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM \| UTF8PROC_STABLE \|
841	UTF8PROC_COMPOSE));
842	return retval;
843	}
844
845	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_remove_accents(const* utf8proc_uint8_t *str) {
846	utf8proc_uint8_t *retval;
847	utf8proc_map(str, `0`, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM \| UTF8PROC_STABLE \|
848	UTF8PROC_COMPOSE \| UTF8PROC_STRIPMARK));
849	return retval;
850	}
851
852	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKD(const* utf8proc_uint8_t *str) {
853	utf8proc_uint8_t *retval;
854	utf8proc_map(str, `0`, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM \| UTF8PROC_STABLE \|
855	UTF8PROC_DECOMPOSE \| UTF8PROC_COMPAT));
856	return retval;
857	}
858
859	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKC(const* utf8proc_uint8_t *str) {
860	utf8proc_uint8_t *retval;
861	utf8proc_map(str, `0`, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM \| UTF8PROC_STABLE \|
862	UTF8PROC_COMPOSE \| UTF8PROC_COMPAT));
863	return retval;
864	}
865
866	UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKC_Casefold(const* utf8proc_uint8_t *str) {
867	utf8proc_uint8_t *retval;
868	utf8proc_map(str, `0`, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM \| UTF8PROC_STABLE \|
869	UTF8PROC_COMPOSE \| UTF8PROC_COMPAT \| UTF8PROC_CASEFOLD \| UTF8PROC_IGNORE));
870	return retval;
871	}
872

Browse the source code of DuckDB/third_party/utf8proc/utf8proc.cpp