libunicode.c source code [QuickJS/libunicode.c]

1	/*
2	* Unicode utilities
3	*
4	* Copyright (c) 2017-2018 Fabrice Bellard
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a copy
7	* of this software and associated documentation files (the "Software"), to deal
8	* in the Software without restriction, including without limitation the rights
9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10	* copies of the Software, and to permit persons to whom the Software is
11	* furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22	* THE SOFTWARE.
23	*/
24	#include <stdlib.h>
25	#include <stdio.h>
26	#include <stdarg.h>
27	#include <string.h>
28	#include <assert.h>
29
30	#include "cutils.h"
31	#include "libunicode.h"
32	#include "libunicode-table.h"
33
34	enum {
35	RUN_TYPE_U,
36	RUN_TYPE_L,
37	RUN_TYPE_UF,
38	RUN_TYPE_LF,
39	RUN_TYPE_UL,
40	RUN_TYPE_LSU,
41	RUN_TYPE_U2L_399_EXT2,
42	RUN_TYPE_UF_D20,
43	RUN_TYPE_UF_D1_EXT,
44	RUN_TYPE_U_EXT,
45	RUN_TYPE_LF_EXT,
46	RUN_TYPE_UF_EXT2,
47	RUN_TYPE_LF_EXT2,
48	RUN_TYPE_UF_EXT3,
49	};
50
51	static int lre_case_conv1(uint32_t c, int conv_type)
52	{
53	uint32_t res[LRE_CC_RES_LEN_MAX];
54	lre_case_conv(res, c, conv_type);
55	return res[`0`];
56	}
57
58	/ case conversion using the table entry 'idx' with value 'v' /
59	static int lre_case_conv_entry(uint32_t res, uint32_t c, int* conv_type, uint32_t idx, uint32_t v)
60	{
61	uint32_t code, data, type, a, is_lower;
62	is_lower = (conv_type != `0`);
63	type = (v >> (`32` - `17` - `7` - `4`)) & `0xf`;
64	data = ((v & `0xf`) << `8`) \| case_conv_table2[idx];
65	code = v >> (`32` - `17`);
66	switch(type) {
67	case RUN_TYPE_U:
68	case RUN_TYPE_L:
69	case RUN_TYPE_UF:
70	case RUN_TYPE_LF:
71	if (conv_type == (type & `1`) \|\|
72	(type >= RUN_TYPE_UF && conv_type == `2`)) {
73	c = c - code + (case_conv_table1[data] >> (`32` - `17`));
74	}
75	break;
76	case RUN_TYPE_UL:
77	a = c - code;
78	if ((a & `1`) != (`1` - is_lower))
79	break;
80	c = (a ^ `1`) + code;
81	break;
82	case RUN_TYPE_LSU:
83	a = c - code;
84	if (a == `1`) {
85	c += `2` * is_lower - `1`;
86	} else if (a == (`1` - is_lower) * `2`) {
87	c += (`2` * is_lower - `1`) * `2`;
88	}
89	break;
90	case RUN_TYPE_U2L_399_EXT2:
91	if (!is_lower) {
92	res[`0`] = c - code + case_conv_ext[data >> `6`];
93	res[`1`] = `0x399`;
94	return `2`;
95	} else {
96	c = c - code + case_conv_ext[data & `0x3f`];
97	}
98	break;
99	case RUN_TYPE_UF_D20:
100	if (conv_type == `1`)
101	break;
102	c = data + (conv_type == `2`) * `0x20`;
103	break;
104	case RUN_TYPE_UF_D1_EXT:
105	if (conv_type == `1`)
106	break;
107	c = case_conv_ext[data] + (conv_type == `2`);
108	break;
109	case RUN_TYPE_U_EXT:
110	case RUN_TYPE_LF_EXT:
111	if (is_lower != (type - RUN_TYPE_U_EXT))
112	break;
113	c = case_conv_ext[data];
114	break;
115	case RUN_TYPE_LF_EXT2:
116	if (!is_lower)
117	break;
118	res[`0`] = c - code + case_conv_ext[data >> `6`];
119	res[`1`] = case_conv_ext[data & `0x3f`];
120	return `2`;
121	case RUN_TYPE_UF_EXT2:
122	if (conv_type == `1`)
123	break;
124	res[`0`] = c - code + case_conv_ext[data >> `6`];
125	res[`1`] = case_conv_ext[data & `0x3f`];
126	if (conv_type == `2`) {
127	/ convert to lower /
128	res[`0`] = lre_case_conv1(res[`0`], `1`);
129	res[`1`] = lre_case_conv1(res[`1`], `1`);
130	}
131	return `2`;
132	default:
133	case RUN_TYPE_UF_EXT3:
134	if (conv_type == `1`)
135	break;
136	res[`0`] = case_conv_ext[data >> `8`];
137	res[`1`] = case_conv_ext[(data >> `4`) & `0xf`];
138	res[`2`] = case_conv_ext[data & `0xf`];
139	if (conv_type == `2`) {
140	/ convert to lower /
141	res[`0`] = lre_case_conv1(res[`0`], `1`);
142	res[`1`] = lre_case_conv1(res[`1`], `1`);
143	res[`2`] = lre_case_conv1(res[`2`], `1`);
144	}
145	return `3`;
146	}
147	res[`0`] = c;
148	return `1`;
149	}
150
151	/ conv_type:*
152	0 = to upper
153	1 = to lower
154	2 = case folding (= to lower with modifications)
155	*/
156	int lre_case_conv(uint32_t res, uint32_t c, int* conv_type)
157	{
158	if (c < `128`) {
159	if (conv_type) {
160	if (c >= `'A'` && c <= `'Z'`) {
161	c = c - `'A'` + `'a'`;
162	}
163	} else {
164	if (c >= `'a'` && c <= `'z'`) {
165	c = c - `'a'` + `'A'`;
166	}
167	}
168	} else {
169	uint32_t v, code, len;
170	int idx, idx_min, idx_max;
171
172	idx_min = `0`;
173	idx_max = countof(case_conv_table1) - `1`;
174	while (idx_min <= idx_max) {
175	idx = (unsigned)(idx_max + idx_min) / `2`;
176	v = case_conv_table1[idx];
177	code = v >> (`32` - `17`);
178	len = (v >> (`32` - `17` - `7`)) & `0x7f`;
179	if (c < code) {
180	idx_max = idx - `1`;
181	} else if (c >= code + len) {
182	idx_min = idx + `1`;
183	} else {
184	return lre_case_conv_entry(res, c, conv_type, idx, v);
185	}
186	}
187	}
188	res[`0`] = c;
189	return `1`;
190	}
191
192	static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193	{
194	uint32_t res[LRE_CC_RES_LEN_MAX];
195	int len;
196
197	if (is_unicode) {
198	len = lre_case_conv_entry(res, c, `2`, idx, v);
199	if (len == `1`) {
200	c = res[`0`];
201	} else {
202	/ handle the few specific multi-character cases (see*
203	unicode_gen.c:dump_case_folding_special_cases()) /*
204	if (c == `0xfb06`) {
205	c = `0xfb05`;
206	} else if (c == `0x01fd3`) {
207	c = `0x390`;
208	} else if (c == `0x01fe3`) {
209	c = `0x3b0`;
210	}
211	}
212	} else {
213	if (likely(c < `128`)) {
214	if (c >= `'a'` && c <= `'z'`)
215	c = c - `'a'` + `'A'`;
216	} else {
217	/ legacy regexp: to upper case if single char >= 128 /
218	len = lre_case_conv_entry(res, c, FALSE, idx, v);
219	if (len == `1` && res[`0`] >= `128`)
220	c = res[`0`];
221	}
222	}
223	return c;
224	}
225
226	/ JS regexp specific rules for case folding /
227	int lre_canonicalize(uint32_t c, BOOL is_unicode)
228	{
229	if (c < `128`) {
230	/ fast case /
231	if (is_unicode) {
232	if (c >= `'A'` && c <= `'Z'`) {
233	c = c - `'A'` + `'a'`;
234	}
235	} else {
236	if (c >= `'a'` && c <= `'z'`) {
237	c = c - `'a'` + `'A'`;
238	}
239	}
240	} else {
241	uint32_t v, code, len;
242	int idx, idx_min, idx_max;
243
244	idx_min = `0`;
245	idx_max = countof(case_conv_table1) - `1`;
246	while (idx_min <= idx_max) {
247	idx = (unsigned)(idx_max + idx_min) / `2`;
248	v = case_conv_table1[idx];
249	code = v >> (`32` - `17`);
250	len = (v >> (`32` - `17` - `7`)) & `0x7f`;
251	if (c < code) {
252	idx_max = idx - `1`;
253	} else if (c >= code + len) {
254	idx_min = idx + `1`;
255	} else {
256	return lre_case_folding_entry(c, idx, v, is_unicode);
257	}
258	}
259	}
260	return c;
261	}
262
263	static uint32_t get_le24(const uint8_t *ptr)
264	{
265	return ptr[`0`] \| (ptr[`1`] << `8`) \| (ptr[`2`] << `16`);
266	}
267
268	#define UNICODE_INDEX_BLOCK_LEN 32
269
270	/ return -1 if not in table, otherwise the offset in the block /
271	static int get_index_pos(uint32_t *pcode, uint32_t c,
272	const uint8_t index_table, int* index_table_len)
273	{
274	uint32_t code, v;
275	int idx_min, idx_max, idx;
276
277	idx_min = `0`;
278	v = get_le24(index_table);
279	code = v & ((`1` << `21`) - `1`);
280	if (c < code) {
281	*pcode = `0`;
282	return `0`;
283	}
284	idx_max = index_table_len - `1`;
285	code = get_le24(index_table + idx_max * `3`);
286	if (c >= code)
287	return -`1`;
288	/ invariant: tab[idx_min] <= c < tab2[idx_max] /
289	while ((idx_max - idx_min) > `1`) {
290	idx = (idx_max + idx_min) / `2`;
291	v = get_le24(index_table + idx * `3`);
292	code = v & ((`1` << `21`) - `1`);
293	if (c < code) {
294	idx_max = idx;
295	} else {
296	idx_min = idx;
297	}
298	}
299	v = get_le24(index_table + idx_min * `3`);
300	*pcode = v & ((`1` << `21`) - `1`);
301	return (idx_min + `1`) * UNICODE_INDEX_BLOCK_LEN + (v >> `21`);
302	}
303
304	static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305	const uint8_t index_table, int* index_table_len)
306	{
307	uint32_t code, b, bit;
308	int pos;
309	const uint8_t *p;
310
311	pos = get_index_pos(&code, c, index_table, index_table_len);
312	if (pos < `0`)
313	return FALSE; / outside the table /
314	p = table + pos;
315	bit = `0`;
316	/ Compressed run length encoding:*
317	00..3F: 2 packed lengths: 3-bit + 3-bit
318	40..5F: 5-bits plus extra byte for length
319	60..7F: 5-bits plus 2 extra bytes for length
320	80..FF: 7-bit length
321	lengths must be incremented to get character count
322	Ranges alternate between false and true return value.
323	*/
324	for(;;) {
325	b = *p++;
326	if (b < `64`) {
327	code += (b >> `3`) + `1`;
328	if (c < code)
329	return bit;
330	bit ^= `1`;
331	code += (b & `7`) + `1`;
332	} else if (b >= `0x80`) {
333	code += b - `0x80` + `1`;
334	} else if (b < `0x60`) {
335	code += (((b - `0x40`) << `8`) \| p[`0`]) + `1`;
336	p++;
337	} else {
338	code += (((b - `0x60`) << `16`) \| (p[`0`] << `8`) \| p[`1`]) + `1`;
339	p += `2`;
340	}
341	if (c < code)
342	return bit;
343	bit ^= `1`;
344	}
345	}
346
347	BOOL lre_is_cased(uint32_t c)
348	{
349	uint32_t v, code, len;
350	int idx, idx_min, idx_max;
351
352	idx_min = `0`;
353	idx_max = countof(case_conv_table1) - `1`;
354	while (idx_min <= idx_max) {
355	idx = (unsigned)(idx_max + idx_min) / `2`;
356	v = case_conv_table1[idx];
357	code = v >> (`32` - `17`);
358	len = (v >> (`32` - `17` - `7`)) & `0x7f`;
359	if (c < code) {
360	idx_max = idx - `1`;
361	} else if (c >= code + len) {
362	idx_min = idx + `1`;
363	} else {
364	return TRUE;
365	}
366	}
367	return lre_is_in_table(c, unicode_prop_Cased1_table,
368	unicode_prop_Cased1_index,
369	sizeof(unicode_prop_Cased1_index) / `3`);
370	}
371
372	BOOL lre_is_case_ignorable(uint32_t c)
373	{
374	return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375	unicode_prop_Case_Ignorable_index,
376	sizeof(unicode_prop_Case_Ignorable_index) / `3`);
377	}
378
379	/ character range /
380
381	static __maybe_unused void cr_dump(CharRange *cr)
382	{
383	int i;
384	for(i = `0`; i < cr->len; i++)
385	printf("%d: 0x%04x\n", i, cr->points[i]);
386	}
387
388	static void cr_default_realloc(void* opaque, void* *ptr, size_t size)
389	{
390	return realloc(ptr, size);
391	}
392
393	void cr_init(CharRange cr, void* mem_opaque, DynBufReallocFunc realloc_func)
394	{
395	cr->len = cr->size = `0`;
396	cr->points = NULL;
397	cr->mem_opaque = mem_opaque;
398	cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399	}
400
401	void cr_free(CharRange *cr)
402	{
403	cr->realloc_func(cr->mem_opaque, cr->points, `0`);
404	}
405
406	int cr_realloc(CharRange cr, int* size)
407	{
408	int new_size;
409	uint32_t *new_buf;
410
411	if (size > cr->size) {
412	new_size = max_int(size, cr->size * `3` / `2`);
413	new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414	new_size * sizeof(cr->points[`0`]));
415	if (!new_buf)
416	return -`1`;
417	cr->points = new_buf;
418	cr->size = new_size;
419	}
420	return `0`;
421	}
422
423	int cr_copy(CharRange cr, const* CharRange *cr1)
424	{
425	if (cr_realloc(cr, cr1->len))
426	return -`1`;
427	memcpy(cr->points, cr1->points, sizeof(cr->points[`0`]) * cr1->len);
428	cr->len = cr1->len;
429	return `0`;
430	}
431
432	/ merge consecutive intervals and remove empty intervals /
433	static void cr_compress(CharRange *cr)
434	{
435	int i, j, k, len;
436	uint32_t *pt;
437
438	pt = cr->points;
439	len = cr->len;
440	i = `0`;
441	j = `0`;
442	k = `0`;
443	while ((i + `1`) < len) {
444	if (pt[i] == pt[i + `1`]) {
445	/ empty interval /
446	i += `2`;
447	} else {
448	j = i;
449	while ((j + `3`) < len && pt[j + `1`] == pt[j + `2`])
450	j += `2`;
451	/ just copy /
452	pt[k] = pt[i];
453	pt[k + `1`] = pt[j + `1`];
454	k += `2`;
455	i = j + `2`;
456	}
457	}
458	cr->len = k;
459	}
460
461	/ union or intersection /
462	int cr_op(CharRange cr, const* uint32_t a_pt, int* a_len,
463	const uint32_t b_pt, int* b_len, int op)
464	{
465	int a_idx, b_idx, is_in;
466	uint32_t v;
467
468	a_idx = `0`;
469	b_idx = `0`;
470	for(;;) {
471	/ get one more point from a or b in increasing order /
472	if (a_idx < a_len && b_idx < b_len) {
473	if (a_pt[a_idx] < b_pt[b_idx]) {
474	goto a_add;
475	} else if (a_pt[a_idx] == b_pt[b_idx]) {
476	v = a_pt[a_idx];
477	a_idx++;
478	b_idx++;
479	} else {
480	goto b_add;
481	}
482	} else if (a_idx < a_len) {
483	a_add:
484	v = a_pt[a_idx++];
485	} else if (b_idx < b_len) {
486	b_add:
487	v = b_pt[b_idx++];
488	} else {
489	break;
490	}
491	/ add the point if the in/out status changes /
492	switch(op) {
493	case CR_OP_UNION:
494	is_in = (a_idx & `1`) \| (b_idx & `1`);
495	break;
496	case CR_OP_INTER:
497	is_in = (a_idx & `1`) & (b_idx & `1`);
498	break;
499	case CR_OP_XOR:
500	is_in = (a_idx & `1`) ^ (b_idx & `1`);
501	break;
502	default:
503	abort();
504	}
505	if (is_in != (cr->len & `1`)) {
506	if (cr_add_point(cr, v))
507	return -`1`;
508	}
509	}
510	cr_compress(cr);
511	return `0`;
512	}
513
514	int cr_union1(CharRange cr, const* uint32_t b_pt, int* b_len)
515	{
516	CharRange a = *cr;
517	int ret;
518	cr->len = `0`;
519	cr->size = `0`;
520	cr->points = NULL;
521	ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
522	cr_free(&a);
523	return ret;
524	}
525
526	int cr_invert(CharRange *cr)
527	{
528	int len;
529	len = cr->len;
530	if (cr_realloc(cr, len + `2`))
531	return -`1`;
532	memmove(cr->points + `1`, cr->points, len * sizeof(cr->points[`0`]));
533	cr->points[`0`] = `0`;
534	cr->points[len + `1`] = UINT32_MAX;
535	cr->len = len + `2`;
536	cr_compress(cr);
537	return `0`;
538	}
539
540	#define CASE_U (1 << 0)
541	#define CASE_L (1 << 1)
542	#define CASE_F (1 << 2)
543
544	/ use the case conversion table to generate range of characters.*
545	CASE_U: set char if modified by uppercasing,
546	CASE_L: set char if modified by lowercasing,
547	CASE_F: set char if modified by case folding,
548	*/
549	static int unicode_case1(CharRange cr, int* case_mask)
550	{
551	#define MR(x) (1 << RUN_TYPE_ ## x)
552	const uint32_t tab_run_mask[`3`] = {
553	MR(U) \| MR(UF) \| MR(UL) \| MR(LSU) \| MR(U2L_399_EXT2) \| MR(UF_D20) \|
554	MR(UF_D1_EXT) \| MR(U_EXT) \| MR(UF_EXT2) \| MR(UF_EXT3),
555
556	MR(L) \| MR(LF) \| MR(UL) \| MR(LSU) \| MR(U2L_399_EXT2) \| MR(LF_EXT) \| MR(LF_EXT2),
557
558	MR(UF) \| MR(LF) \| MR(UL) \| MR(LSU) \| MR(U2L_399_EXT2) \| MR(LF_EXT) \| MR(LF_EXT2) \| MR(UF_D20) \| MR(UF_D1_EXT) \| MR(LF_EXT) \| MR(UF_EXT2) \| MR(UF_EXT3),
559	};
560	#undef MR
561	uint32_t mask, v, code, type, len, i, idx;
562
563	if (case_mask == `0`)
564	return `0`;
565	mask = `0`;
566	for(i = `0`; i < `3`; i++) {
567	if ((case_mask >> i) & `1`)
568	mask \|= tab_run_mask[i];
569	}
570	for(idx = `0`; idx < countof(case_conv_table1); idx++) {
571	v = case_conv_table1[idx];
572	type = (v >> (`32` - `17` - `7` - `4`)) & `0xf`;
573	code = v >> (`32` - `17`);
574	len = (v >> (`32` - `17` - `7`)) & `0x7f`;
575	if ((mask >> type) & `1`) {
576	// printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
577	switch(type) {
578	case RUN_TYPE_UL:
579	if ((case_mask & CASE_U) && (case_mask & (CASE_L \| CASE_F)))
580	goto def_case;
581	code += ((case_mask & CASE_U) != `0`);
582	for(i = `0`; i < len; i += `2`) {
583	if (cr_add_interval(cr, code + i, code + i + `1`))
584	return -`1`;
585	}
586	break;
587	case RUN_TYPE_LSU:
588	if ((case_mask & CASE_U) && (case_mask & (CASE_L \| CASE_F)))
589	goto def_case;
590	if (!(case_mask & CASE_U)) {
591	if (cr_add_interval(cr, code, code + `1`))
592	return -`1`;
593	}
594	if (cr_add_interval(cr, code + `1`, code + `2`))
595	return -`1`;
596	if (case_mask & CASE_U) {
597	if (cr_add_interval(cr, code + `2`, code + `3`))
598	return -`1`;
599	}
600	break;
601	default:
602	def_case:
603	if (cr_add_interval(cr, code, code + len))
604	return -`1`;
605	break;
606	}
607	}
608	}
609	return `0`;
610	}
611
612	static int point_cmp(const void p1, const* void p2, void* *arg)
613	{
614	uint32_t v1 = (uint32_t )p1;
615	uint32_t v2 = (uint32_t )p2;
616	return (v1 > v2) - (v1 < v2);
617	}
618
619	static void cr_sort_and_remove_overlap(CharRange *cr)
620	{
621	uint32_t start, end, start1, end1, i, j;
622
623	/ the resulting ranges are not necessarily sorted and may overlap /
624	rqsort(cr->points, cr->len / `2`, sizeof(cr->points[`0`]) * `2`, point_cmp, NULL);
625	j = `0`;
626	for(i = `0`; i < cr->len; ) {
627	start = cr->points[i];
628	end = cr->points[i + `1`];
629	i += `2`;
630	while (i < cr->len) {
631	start1 = cr->points[i];
632	end1 = cr->points[i + `1`];
633	if (start1 > end) {
634	/ \|------\|*
635	* \|-------\| */
636	break;
637	} else if (end1 <= end) {
638	/ \|------\|*
639	* \|--\| */
640	i += `2`;
641	} else {
642	/ \|------\|*
643	* \|-------\| */
644	end = end1;
645	i += `2`;
646	}
647	}
648	cr->points[j] = start;
649	cr->points[j + `1`] = end;
650	j += `2`;
651	}
652	cr->len = j;
653	}
654
655	/ canonicalize a character set using the JS regex case folding rules*
656	(see lre_canonicalize()) /*
657	int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
658	{
659	CharRange cr_inter, cr_mask, cr_result, cr_sub;
660	uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
661
662	cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
663	cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
664	cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
665	cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
666
667	if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
668	goto fail;
669	if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
670	goto fail;
671
672	if (cr_invert(&cr_mask))
673	goto fail;
674	if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
675	goto fail;
676
677	/ cr_inter = cr & cr_mask /
678	/ cr_sub = cr & ~cr_mask /
679
680	/ use the case conversion table to compute the result /
681	d_start = -`1`;
682	d_end = -`1`;
683	idx = `0`;
684	v = case_conv_table1[idx];
685	code = v >> (`32` - `17`);
686	len = (v >> (`32` - `17` - `7`)) & `0x7f`;
687	for(i = `0`; i < cr_inter.len; i += `2`) {
688	start = cr_inter.points[i];
689	end = cr_inter.points[i + `1`];
690
691	for(c = start; c < end; c++) {
692	for(;;) {
693	if (c >= code && c < code + len)
694	break;
695	idx++;
696	assert(idx < countof(case_conv_table1));
697	v = case_conv_table1[idx];
698	code = v >> (`32` - `17`);
699	len = (v >> (`32` - `17` - `7`)) & `0x7f`;
700	}
701	d = lre_case_folding_entry(c, idx, v, is_unicode);
702	/ try to merge with the current interval /
703	if (d_start == -`1`) {
704	d_start = d;
705	d_end = d + `1`;
706	} else if (d_end == d) {
707	d_end++;
708	} else {
709	cr_add_interval(&cr_result, d_start, d_end);
710	d_start = d;
711	d_end = d + `1`;
712	}
713	}
714	}
715	if (d_start != -`1`) {
716	if (cr_add_interval(&cr_result, d_start, d_end))
717	goto fail;
718	}
719
720	/ the resulting ranges are not necessarily sorted and may overlap /
721	cr_sort_and_remove_overlap(&cr_result);
722
723	/ or with the character not affected by the case folding /
724	cr->len = `0`;
725	if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
726	goto fail;
727
728	cr_free(&cr_inter);
729	cr_free(&cr_mask);
730	cr_free(&cr_result);
731	cr_free(&cr_sub);
732	return `0`;
733	fail:
734	cr_free(&cr_inter);
735	cr_free(&cr_mask);
736	cr_free(&cr_result);
737	cr_free(&cr_sub);
738	return -`1`;
739	}
740
741	#ifdef CONFIG_ALL_UNICODE
742
743	BOOL lre_is_id_start(uint32_t c)
744	{
745	return lre_is_in_table(c, unicode_prop_ID_Start_table,
746	unicode_prop_ID_Start_index,
747	sizeof(unicode_prop_ID_Start_index) / `3`);
748	}
749
750	BOOL lre_is_id_continue(uint32_t c)
751	{
752	return lre_is_id_start(c) \|\|
753	lre_is_in_table(c, unicode_prop_ID_Continue1_table,
754	unicode_prop_ID_Continue1_index,
755	sizeof(unicode_prop_ID_Continue1_index) / `3`);
756	}
757
758	#define UNICODE_DECOMP_LEN_MAX 18
759
760	typedef enum {
761	DECOMP_TYPE_C1, / 16 bit char /
762	DECOMP_TYPE_L1, / 16 bit char table /
763	DECOMP_TYPE_L2,
764	DECOMP_TYPE_L3,
765	DECOMP_TYPE_L4,
766	DECOMP_TYPE_L5, / XXX: not used /
767	DECOMP_TYPE_L6, / XXX: could remove /
768	DECOMP_TYPE_L7, / XXX: could remove /
769	DECOMP_TYPE_LL1, / 18 bit char table /
770	DECOMP_TYPE_LL2,
771	DECOMP_TYPE_S1, / 8 bit char table /
772	DECOMP_TYPE_S2,
773	DECOMP_TYPE_S3,
774	DECOMP_TYPE_S4,
775	DECOMP_TYPE_S5,
776	DECOMP_TYPE_I1, / increment 16 bit char value /
777	DECOMP_TYPE_I2_0,
778	DECOMP_TYPE_I2_1,
779	DECOMP_TYPE_I3_1,
780	DECOMP_TYPE_I3_2,
781	DECOMP_TYPE_I4_1,
782	DECOMP_TYPE_I4_2,
783	DECOMP_TYPE_B1, / 16 bit base + 8 bit offset /
784	DECOMP_TYPE_B2,
785	DECOMP_TYPE_B3,
786	DECOMP_TYPE_B4,
787	DECOMP_TYPE_B5,
788	DECOMP_TYPE_B6,
789	DECOMP_TYPE_B7,
790	DECOMP_TYPE_B8,
791	DECOMP_TYPE_B18,
792	DECOMP_TYPE_LS2,
793	DECOMP_TYPE_PAT3,
794	DECOMP_TYPE_S2_UL,
795	DECOMP_TYPE_LS2_UL,
796	} DecompTypeEnum;
797
798	static uint32_t unicode_get_short_code(uint32_t c)
799	{
800	static const uint16_t unicode_short_table[`2`] = { `0x2044`, `0x2215` };
801
802	if (c < `0x80`)
803	return c;
804	else if (c < `0x80` + `0x50`)
805	return c - `0x80` + `0x300`;
806	else
807	return unicode_short_table[c - `0x80` - `0x50`];
808	}
809
810	static uint32_t unicode_get_lower_simple(uint32_t c)
811	{
812	if (c < `0x100` \|\| (c >= `0x410` && c <= `0x42f`))
813	c += `0x20`;
814	else
815	c++;
816	return c;
817	}
818
819	static uint16_t unicode_get16(const uint8_t *p)
820	{
821	return p[`0`] \| (p[`1`] << `8`);
822	}
823
824	static int unicode_decomp_entry(uint32_t *res, uint32_t c,
825	int idx, uint32_t code, uint32_t len,
826	uint32_t type)
827	{
828	uint32_t c1;
829	int l, i, p;
830	const uint8_t *d;
831
832	if (type == DECOMP_TYPE_C1) {
833	res[`0`] = unicode_decomp_table2[idx];
834	return `1`;
835	} else {
836	d = unicode_decomp_data + unicode_decomp_table2[idx];
837	switch(type) {
838	case DECOMP_TYPE_L1:
839	case DECOMP_TYPE_L2:
840	case DECOMP_TYPE_L3:
841	case DECOMP_TYPE_L4:
842	case DECOMP_TYPE_L5:
843	case DECOMP_TYPE_L6:
844	case DECOMP_TYPE_L7:
845	l = type - DECOMP_TYPE_L1 + `1`;
846	d += (c - code) * l * `2`;
847	for(i = `0`; i < l; i++) {
848	if ((res[i] = unicode_get16(d + `2` * i)) == `0`)
849	return `0`;
850	}
851	return l;
852	case DECOMP_TYPE_LL1:
853	case DECOMP_TYPE_LL2:
854	{
855	uint32_t k, p;
856	l = type - DECOMP_TYPE_LL1 + `1`;
857	k = (c - code) * l;
858	p = len * l * `2`;
859	for(i = `0`; i < l; i++) {
860	c1 = unicode_get16(d + `2` * k) \|
861	(((d[p + (k / `4`)] >> ((k % `4`) * `2`)) & `3`) << `16`);
862	if (!c1)
863	return `0`;
864	res[i] = c1;
865	k++;
866	}
867	}
868	return l;
869	case DECOMP_TYPE_S1:
870	case DECOMP_TYPE_S2:
871	case DECOMP_TYPE_S3:
872	case DECOMP_TYPE_S4:
873	case DECOMP_TYPE_S5:
874	l = type - DECOMP_TYPE_S1 + `1`;
875	d += (c - code) * l;
876	for(i = `0`; i < l; i++) {
877	if ((res[i] = unicode_get_short_code(d[i])) == `0`)
878	return `0`;
879	}
880	return l;
881	case DECOMP_TYPE_I1:
882	l = `1`;
883	p = `0`;
884	goto decomp_type_i;
885	case DECOMP_TYPE_I2_0:
886	case DECOMP_TYPE_I2_1:
887	case DECOMP_TYPE_I3_1:
888	case DECOMP_TYPE_I3_2:
889	case DECOMP_TYPE_I4_1:
890	case DECOMP_TYPE_I4_2:
891	l = `2` + ((type - DECOMP_TYPE_I2_0) >> `1`);
892	p = ((type - DECOMP_TYPE_I2_0) & `1`) + (l > `2`);
893	decomp_type_i:
894	for(i = `0`; i < l; i++) {
895	c1 = unicode_get16(d + `2` * i);
896	if (i == p)
897	c1 += c - code;
898	res[i] = c1;
899	}
900	return l;
901	case DECOMP_TYPE_B18:
902	l = `18`;
903	goto decomp_type_b;
904	case DECOMP_TYPE_B1:
905	case DECOMP_TYPE_B2:
906	case DECOMP_TYPE_B3:
907	case DECOMP_TYPE_B4:
908	case DECOMP_TYPE_B5:
909	case DECOMP_TYPE_B6:
910	case DECOMP_TYPE_B7:
911	case DECOMP_TYPE_B8:
912	l = type - DECOMP_TYPE_B1 + `1`;
913	decomp_type_b:
914	{
915	uint32_t c_min;
916	c_min = unicode_get16(d);
917	d += `2` + (c - code) * l;
918	for(i = `0`; i < l; i++) {
919	c1 = d[i];
920	if (c1 == `0xff`)
921	c1 = `0x20`;
922	else
923	c1 += c_min;
924	res[i] = c1;
925	}
926	}
927	return l;
928	case DECOMP_TYPE_LS2:
929	d += (c - code) * `3`;
930	if (!(res[`0`] = unicode_get16(d)))
931	return `0`;
932	res[`1`] = unicode_get_short_code(d[`2`]);
933	return `2`;
934	case DECOMP_TYPE_PAT3:
935	res[`0`] = unicode_get16(d);
936	res[`2`] = unicode_get16(d + `2`);
937	d += `4` + (c - code) * `2`;
938	res[`1`] = unicode_get16(d);
939	return `3`;
940	case DECOMP_TYPE_S2_UL:
941	case DECOMP_TYPE_LS2_UL:
942	c1 = c - code;
943	if (type == DECOMP_TYPE_S2_UL) {
944	d += c1 & ~`1`;
945	c = unicode_get_short_code(*d);
946	d++;
947	} else {
948	d += (c1 >> `1`) * `3`;
949	c = unicode_get16(d);
950	d += `2`;
951	}
952	if (c1 & `1`)
953	c = unicode_get_lower_simple(c);
954	res[`0`] = c;
955	res[`1`] = unicode_get_short_code(*d);
956	return `2`;
957	}
958	}
959	return `0`;
960	}
961
962
963	/ return the length of the decomposition (length <=*
964	UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition /*
965	static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
966	{
967	uint32_t v, type, is_compat, code, len;
968	int idx_min, idx_max, idx;
969
970	idx_min = `0`;
971	idx_max = countof(unicode_decomp_table1) - `1`;
972	while (idx_min <= idx_max) {
973	idx = (idx_max + idx_min) / `2`;
974	v = unicode_decomp_table1[idx];
975	code = v >> (`32` - `18`);
976	len = (v >> (`32` - `18` - `7`)) & `0x7f`;
977	// printf("idx=%d code=%05x len=%d\n", idx, code, len);
978	if (c < code) {
979	idx_max = idx - `1`;
980	} else if (c >= code + len) {
981	idx_min = idx + `1`;
982	} else {
983	is_compat = v & `1`;
984	if (is_compat1 < is_compat)
985	break;
986	type = (v >> (`32` - `18` - `7` - `6`)) & `0x3f`;
987	return unicode_decomp_entry(res, c, idx, code, len, type);
988	}
989	}
990	return `0`;
991	}
992
993	/ return 0 if no pair found /
994	static int unicode_compose_pair(uint32_t c0, uint32_t c1)
995	{
996	uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
997	int idx_min, idx_max, idx, d;
998	uint32_t pair[`2`];
999
1000	idx_min = `0`;
1001	idx_max = countof(unicode_comp_table) - `1`;
1002	while (idx_min <= idx_max) {
1003	idx = (idx_max + idx_min) / `2`;
1004	idx1 = unicode_comp_table[idx];
1005
1006	/ idx1 represent an entry of the decomposition table /
1007	d_idx = idx1 >> `6`;
1008	d_offset = idx1 & `0x3f`;
1009	v = unicode_decomp_table1[d_idx];
1010	code = v >> (`32` - `18`);
1011	len = (v >> (`32` - `18` - `7`)) & `0x7f`;
1012	type = (v >> (`32` - `18` - `7` - `6`)) & `0x3f`;
1013	ch = code + d_offset;
1014	unicode_decomp_entry(pair, ch, d_idx, code, len, type);
1015	d = c0 - pair[`0`];
1016	if (d == `0`)
1017	d = c1 - pair[`1`];
1018	if (d < `0`) {
1019	idx_max = idx - `1`;
1020	} else if (d > `0`) {
1021	idx_min = idx + `1`;
1022	} else {
1023	return ch;
1024	}
1025	}
1026	return `0`;
1027	}
1028
1029	/ return the combining class of character c (between 0 and 255) /
1030	static int unicode_get_cc(uint32_t c)
1031	{
1032	uint32_t code, n, type, cc, c1, b;
1033	int pos;
1034	const uint8_t *p;
1035
1036	pos = get_index_pos(&code, c,
1037	unicode_cc_index, sizeof(unicode_cc_index) / `3`);
1038	if (pos < `0`)
1039	return `0`;
1040	p = unicode_cc_table + pos;
1041	/ Compressed run length encoding:*
1042	- 2 high order bits are combining class type
1043	- 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
1044	- 00..2F: range length (add 1)
1045	- 30..37: 3-bit range-length + 1 extra byte
1046	- 38..3F: 3-bit range-length + 2 extra byte
1047	*/
1048	for(;;) {
1049	b = *p++;
1050	type = b >> `6`;
1051	n = b & `0x3f`;
1052	if (n < `48`) {
1053	} else if (n < `56`) {
1054	n = (n - `48`) << `8`;
1055	n \|= *p++;
1056	n += `48`;
1057	} else {
1058	n = (n - `56`) << `8`;
1059	n \|= *p++ << `8`;
1060	n \|= *p++;
1061	n += `48` + (`1` << `11`);
1062	}
1063	if (type <= `1`)
1064	p++;
1065	c1 = code + n + `1`;
1066	if (c < c1) {
1067	switch(type) {
1068	case `0`:
1069	cc = p[-`1`];
1070	break;
1071	case `1`:
1072	cc = p[-`1`] + c - code;
1073	break;
1074	case `2`:
1075	cc = `0`;
1076	break;
1077	default:
1078	case `3`:
1079	cc = `230`;
1080	break;
1081	}
1082	return cc;
1083	}
1084	code = c1;
1085	}
1086	}
1087
1088	static void sort_cc(int buf, int* len)
1089	{
1090	int i, j, k, cc, cc1, start, ch1;
1091
1092	for(i = `0`; i < len; i++) {
1093	cc = unicode_get_cc(buf[i]);
1094	if (cc != `0`) {
1095	start = i;
1096	j = i + `1`;
1097	while (j < len) {
1098	ch1 = buf[j];
1099	cc1 = unicode_get_cc(ch1);
1100	if (cc1 == `0`)
1101	break;
1102	k = j - `1`;
1103	while (k >= start) {
1104	if (unicode_get_cc(buf[k]) <= cc1)
1105	break;
1106	buf[k + `1`] = buf[k];
1107	k--;
1108	}
1109	buf[k + `1`] = ch1;
1110	j++;
1111	}
1112	#if 0
1113	printf("cc:");
1114	for(k = start; k < j; k++) {
1115	printf(" %3d", unicode_get_cc(buf[k]));
1116	}
1117	printf("\n");
1118	#endif
1119	i = j;
1120	}
1121	}
1122	}
1123
1124	static void to_nfd_rec(DynBuf *dbuf,
1125	const int src, int* src_len, int is_compat)
1126	{
1127	uint32_t c, v;
1128	int i, l;
1129	uint32_t res[UNICODE_DECOMP_LEN_MAX];
1130
1131	for(i = `0`; i < src_len; i++) {
1132	c = src[i];
1133	if (c >= `0xac00` && c < `0xd7a4`) {
1134	/ Hangul decomposition /
1135	c -= `0xac00`;
1136	dbuf_put_u32(dbuf, `0x1100` + c / `588`);
1137	dbuf_put_u32(dbuf, `0x1161` + (c % `588`) / `28`);
1138	v = c % `28`;
1139	if (v != `0`)
1140	dbuf_put_u32(dbuf, `0x11a7` + v);
1141	} else {
1142	l = unicode_decomp_char(res, c, is_compat);
1143	if (l) {
1144	to_nfd_rec(dbuf, (int *)res, l, is_compat);
1145	} else {
1146	dbuf_put_u32(dbuf, c);
1147	}
1148	}
1149	}
1150	}
1151
1152	/ return 0 if not found /
1153	static int compose_pair(uint32_t c0, uint32_t c1)
1154	{
1155	/ Hangul composition /
1156	if (c0 >= `0x1100` && c0 < `0x1100` + `19` &&
1157	c1 >= `0x1161` && c1 < `0x1161` + `21`) {
1158	return `0xac00` + (c0 - `0x1100`) * `588` + (c1 - `0x1161`) * `28`;
1159	} else if (c0 >= `0xac00` && c0 < `0xac00` + `11172` &&
1160	(c0 - `0xac00`) % `28` == `0` &&
1161	c1 >= `0x11a7` && c1 < `0x11a7` + `28`) {
1162	return c0 + c1 - `0x11a7`;
1163	} else {
1164	return unicode_compose_pair(c0, c1);
1165	}
1166	}
1167
1168	int unicode_normalize(uint32_t *pdst, const* uint32_t src, int* src_len,
1169	UnicodeNormalizationEnum n_type,
1170	void opaque, DynBufReallocFunc realloc_func)
1171	{
1172	int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
1173	BOOL is_compat;
1174	DynBuf dbuf_s, *dbuf = &dbuf_s;
1175
1176	is_compat = n_type >> `1`;
1177
1178	dbuf_init2(dbuf, opaque, realloc_func);
1179	if (dbuf_realloc(dbuf, sizeof(int) * src_len))
1180	goto fail;
1181
1182	/ common case: latin1 is unaffected by NFC /
1183	if (n_type == UNICODE_NFC) {
1184	for(i = `0`; i < src_len; i++) {
1185	if (src[i] >= `0x100`)
1186	goto not_latin1;
1187	}
1188	buf = (int *)dbuf->buf;
1189	memcpy(buf, src, src_len * sizeof(int));
1190	pdst = (uint32_t )buf;
1191	return src_len;
1192	not_latin1: ;
1193	}
1194
1195	to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
1196	if (dbuf_error(dbuf)) {
1197	fail:
1198	*pdst = NULL;
1199	return -`1`;
1200	}
1201	buf = (int *)dbuf->buf;
1202	buf_len = dbuf->size / sizeof(int);
1203
1204	sort_cc(buf, buf_len);
1205
1206	if (buf_len <= `1` \|\| (n_type & `1`) != `0`) {
1207	/ NFD / NFKD /
1208	pdst = (uint32_t )buf;
1209	return buf_len;
1210	}
1211
1212	i = `1`;
1213	out_len = `1`;
1214	while (i < buf_len) {
1215	/ find the starter character and test if it is blocked from*
1216	the character at 'i' /*
1217	last_cc = unicode_get_cc(buf[i]);
1218	starter_pos = out_len - `1`;
1219	while (starter_pos >= `0`) {
1220	cc = unicode_get_cc(buf[starter_pos]);
1221	if (cc == `0`)
1222	break;
1223	if (cc >= last_cc)
1224	goto next;
1225	last_cc = `256`;
1226	starter_pos--;
1227	}
1228	if (starter_pos >= `0` &&
1229	(p = compose_pair(buf[starter_pos], buf[i])) != `0`) {
1230	buf[starter_pos] = p;
1231	i++;
1232	} else {
1233	next:
1234	buf[out_len++] = buf[i++];
1235	}
1236	}
1237	pdst = (uint32_t )buf;
1238	return out_len;
1239	}
1240
1241	/ char ranges for various unicode properties /
1242
1243	static int unicode_find_name(const char name_table, const* char *name)
1244	{
1245	const char p, r;
1246	int pos;
1247	size_t name_len, len;
1248
1249	p = name_table;
1250	pos = `0`;
1251	name_len = strlen(name);
1252	while (*p) {
1253	for(;;) {
1254	r = strchr(p, `','`);
1255	if (!r)
1256	len = strlen(p);
1257	else
1258	len = r - p;
1259	if (len == name_len && !memcmp(p, name, name_len))
1260	return pos;
1261	p += len + `1`;
1262	if (!r)
1263	break;
1264	}
1265	pos++;
1266	}
1267	return -`1`;
1268	}
1269
1270	/ 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2*
1271	if not found /*
1272	int unicode_script(CharRange *cr,
1273	const char *script_name, BOOL is_ext)
1274	{
1275	int script_idx;
1276	const uint8_t p, p_end;
1277	uint32_t c, c1, b, n, v, v_len, i, type;
1278	CharRange cr1_s, *cr1;
1279	CharRange cr2_s, *cr2 = &cr2_s;
1280	BOOL is_common;
1281
1282	script_idx = unicode_find_name(unicode_script_name_table, script_name);
1283	if (script_idx < `0`)
1284	return -`2`;
1285	/ Note: we remove the "Unknown" Script /
1286	script_idx += UNICODE_SCRIPT_Unknown + `1`;
1287
1288	is_common = (script_idx == UNICODE_SCRIPT_Common \|\|
1289	script_idx == UNICODE_SCRIPT_Inherited);
1290	if (is_ext) {
1291	cr1 = &cr1_s;
1292	cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1293	cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1294	} else {
1295	cr1 = cr;
1296	}
1297
1298	p = unicode_script_table;
1299	p_end = unicode_script_table + countof(unicode_script_table);
1300	c = `0`;
1301	while (p < p_end) {
1302	b = *p++;
1303	type = b >> `7`;
1304	n = b & `0x7f`;
1305	if (n < `96`) {
1306	} else if (n < `112`) {
1307	n = (n - `96`) << `8`;
1308	n \|= *p++;
1309	n += `96`;
1310	} else {
1311	n = (n - `112`) << `16`;
1312	n \|= *p++ << `8`;
1313	n \|= *p++;
1314	n += `96` + (`1` << `12`);
1315	}
1316	if (type == `0`)
1317	v = `0`;
1318	else
1319	v = *p++;
1320	c1 = c + n + `1`;
1321	if (v == script_idx) {
1322	if (cr_add_interval(cr1, c, c1))
1323	goto fail;
1324	}
1325	c = c1;
1326	}
1327
1328	if (is_ext) {
1329	/ add the script extensions /
1330	p = unicode_script_ext_table;
1331	p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1332	c = `0`;
1333	while (p < p_end) {
1334	b = *p++;
1335	if (b < `128`) {
1336	n = b;
1337	} else if (b < `128` + `64`) {
1338	n = (b - `128`) << `8`;
1339	n \|= *p++;
1340	n += `128`;
1341	} else {
1342	n = (b - `128` - `64`) << `16`;
1343	n \|= *p++ << `8`;
1344	n \|= *p++;
1345	n += `128` + (`1` << `14`);
1346	}
1347	c1 = c + n + `1`;
1348	v_len = *p++;
1349	if (is_common) {
1350	if (v_len != `0`) {
1351	if (cr_add_interval(cr2, c, c1))
1352	goto fail;
1353	}
1354	} else {
1355	for(i = `0`; i < v_len; i++) {
1356	if (p[i] == script_idx) {
1357	if (cr_add_interval(cr2, c, c1))
1358	goto fail;
1359	break;
1360	}
1361	}
1362	}
1363	p += v_len;
1364	c = c1;
1365	}
1366	if (is_common) {
1367	/ remove all the characters with script extensions /
1368	if (cr_invert(cr2))
1369	goto fail;
1370	if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1371	CR_OP_INTER))
1372	goto fail;
1373	} else {
1374	if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1375	CR_OP_UNION))
1376	goto fail;
1377	}
1378	cr_free(cr1);
1379	cr_free(cr2);
1380	}
1381	return `0`;
1382	fail:
1383	if (is_ext) {
1384	cr_free(cr1);
1385	cr_free(cr2);
1386	}
1387	goto fail;
1388	}
1389
1390	#define M(id) (1U << UNICODE_GC_ ## id)
1391
1392	static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1393	{
1394	const uint8_t p, p_end;
1395	uint32_t c, c0, b, n, v;
1396
1397	p = unicode_gc_table;
1398	p_end = unicode_gc_table + countof(unicode_gc_table);
1399	c = `0`;
1400	/ Compressed range encoding:*
1401	initial byte:
1402	bits 0..4: category number (special case 31)
1403	bits 5..7: range length (add 1)
1404	special case bits 5..7 == 7: read an extra byte
1405	- 00..7F: range length (add 7 + 1)
1406	- 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1407	- C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1408	*/
1409	while (p < p_end) {
1410	b = *p++;
1411	n = b >> `5`;
1412	v = b & `0x1f`;
1413	if (n == `7`) {
1414	n = *p++;
1415	if (n < `128`) {
1416	n += `7`;
1417	} else if (n < `128` + `64`) {
1418	n = (n - `128`) << `8`;
1419	n \|= *p++;
1420	n += `7` + `128`;
1421	} else {
1422	n = (n - `128` - `64`) << `16`;
1423	n \|= *p++ << `8`;
1424	n \|= *p++;
1425	n += `7` + `128` + (`1` << `14`);
1426	}
1427	}
1428	c0 = c;
1429	c += n + `1`;
1430	if (v == `31`) {
1431	/ run of Lu / Ll /
1432	b = gc_mask & (M(Lu) \| M(Ll));
1433	if (b != `0`) {
1434	if (b == (M(Lu) \| M(Ll))) {
1435	goto add_range;
1436	} else {
1437	c0 += ((gc_mask & M(Ll)) != `0`);
1438	for(; c0 < c; c0 += `2`) {
1439	if (cr_add_interval(cr, c0, c0 + `1`))
1440	return -`1`;
1441	}
1442	}
1443	}
1444	} else if ((gc_mask >> v) & `1`) {
1445	add_range:
1446	if (cr_add_interval(cr, c0, c))
1447	return -`1`;
1448	}
1449	}
1450	return `0`;
1451	}
1452
1453	static int unicode_prop1(CharRange cr, int* prop_idx)
1454	{
1455	const uint8_t p, p_end;
1456	uint32_t c, c0, b, bit;
1457
1458	p = unicode_prop_table[prop_idx];
1459	p_end = p + unicode_prop_len_table[prop_idx];
1460	c = `0`;
1461	bit = `0`;
1462	/ Compressed range encoding:*
1463	00..3F: 2 packed lengths: 3-bit + 3-bit
1464	40..5F: 5-bits plus extra byte for length
1465	60..7F: 5-bits plus 2 extra bytes for length
1466	80..FF: 7-bit length
1467	lengths must be incremented to get character count
1468	Ranges alternate between false and true return value.
1469	*/
1470	while (p < p_end) {
1471	c0 = c;
1472	b = *p++;
1473	if (b < `64`) {
1474	c += (b >> `3`) + `1`;
1475	if (bit) {
1476	if (cr_add_interval(cr, c0, c))
1477	return -`1`;
1478	}
1479	bit ^= `1`;
1480	c0 = c;
1481	c += (b & `7`) + `1`;
1482	} else if (b >= `0x80`) {
1483	c += b - `0x80` + `1`;
1484	} else if (b < `0x60`) {
1485	c += (((b - `0x40`) << `8`) \| p[`0`]) + `1`;
1486	p++;
1487	} else {
1488	c += (((b - `0x60`) << `16`) \| (p[`0`] << `8`) \| p[`1`]) + `1`;
1489	p += `2`;
1490	}
1491	if (bit) {
1492	if (cr_add_interval(cr, c0, c))
1493	return -`1`;
1494	}
1495	bit ^= `1`;
1496	}
1497	return `0`;
1498	}
1499
1500	typedef enum {
1501	POP_GC,
1502	POP_PROP,
1503	POP_CASE,
1504	POP_UNION,
1505	POP_INTER,
1506	POP_XOR,
1507	POP_INVERT,
1508	POP_END,
1509	} PropOPEnum;
1510
1511	#define POP_STACK_LEN_MAX 4
1512
1513	static int unicode_prop_ops(CharRange *cr, ...)
1514	{
1515	va_list ap;
1516	CharRange stack[POP_STACK_LEN_MAX];
1517	int stack_len, op, ret, i;
1518	uint32_t a;
1519
1520	va_start(ap, cr);
1521	stack_len = `0`;
1522	for(;;) {
1523	op = va_arg(ap, int);
1524	switch(op) {
1525	case POP_GC:
1526	assert(stack_len < POP_STACK_LEN_MAX);
1527	a = va_arg(ap, int);
1528	cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1529	if (unicode_general_category1(&stack[stack_len - `1`], a))
1530	goto fail;
1531	break;
1532	case POP_PROP:
1533	assert(stack_len < POP_STACK_LEN_MAX);
1534	a = va_arg(ap, int);
1535	cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1536	if (unicode_prop1(&stack[stack_len - `1`], a))
1537	goto fail;
1538	break;
1539	case POP_CASE:
1540	assert(stack_len < POP_STACK_LEN_MAX);
1541	a = va_arg(ap, int);
1542	cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1543	if (unicode_case1(&stack[stack_len - `1`], a))
1544	goto fail;
1545	break;
1546	case POP_UNION:
1547	case POP_INTER:
1548	case POP_XOR:
1549	{
1550	CharRange cr1, cr2, *cr3;
1551	assert(stack_len >= `2`);
1552	assert(stack_len < POP_STACK_LEN_MAX);
1553	cr1 = &stack[stack_len - `2`];
1554	cr2 = &stack[stack_len - `1`];
1555	cr3 = &stack[stack_len++];
1556	cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1557	if (cr_op(cr3, cr1->points, cr1->len,
1558	cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1559	goto fail;
1560	cr_free(cr1);
1561	cr_free(cr2);
1562	cr1 = cr3;
1563	stack_len -= `2`;
1564	}
1565	break;
1566	case POP_INVERT:
1567	assert(stack_len >= `1`);
1568	if (cr_invert(&stack[stack_len - `1`]))
1569	goto fail;
1570	break;
1571	case POP_END:
1572	goto done;
1573	default:
1574	abort();
1575	}
1576	}
1577	done:
1578	assert(stack_len == `1`);
1579	ret = cr_copy(cr, &stack[`0`]);
1580	cr_free(&stack[`0`]);
1581	return ret;
1582	fail:
1583	for(i = `0`; i < stack_len; i++)
1584	cr_free(&stack[i]);
1585	return -`1`;
1586	}
1587
1588	static const uint32_t unicode_gc_mask_table[] = {
1589	M(Lu) \| M(Ll) \| M(Lt), / LC /
1590	M(Lu) \| M(Ll) \| M(Lt) \| M(Lm) \| M(Lo), / L /
1591	M(Mn) \| M(Mc) \| M(Me), / M /
1592	M(Nd) \| M(Nl) \| M(No), / N /
1593	M(Sm) \| M(Sc) \| M(Sk) \| M(So), / S /
1594	M(Pc) \| M(Pd) \| M(Ps) \| M(Pe) \| M(Pi) \| M(Pf) \| M(Po), / P /
1595	M(Zs) \| M(Zl) \| M(Zp), / Z /
1596	M(Cc) \| M(Cf) \| M(Cs) \| M(Co) \| M(Cn), / C /
1597	};
1598
1599	/ 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2*
1600	if not found /*
1601	int unicode_general_category(CharRange cr, const* char *gc_name)
1602	{
1603	int gc_idx;
1604	uint32_t gc_mask;
1605
1606	gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1607	if (gc_idx < `0`)
1608	return -`2`;
1609	if (gc_idx <= UNICODE_GC_Co) {
1610	gc_mask = (uint64_t)`1` << gc_idx;
1611	} else {
1612	gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1613	}
1614	return unicode_general_category1(cr, gc_mask);
1615	}
1616
1617
1618	/ 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2*
1619	if not found /*
1620	int unicode_prop(CharRange cr, const* char *prop_name)
1621	{
1622	int prop_idx, ret;
1623
1624	prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1625	if (prop_idx < `0`)
1626	return -`2`;
1627	prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1628
1629	ret = `0`;
1630	switch(prop_idx) {
1631	case UNICODE_PROP_ASCII:
1632	if (cr_add_interval(cr, `0x00`, `0x7f` + `1`))
1633	return -`1`;
1634	break;
1635	case UNICODE_PROP_Any:
1636	if (cr_add_interval(cr, `0x00000`, `0x10ffff` + `1`))
1637	return -`1`;
1638	break;
1639	case UNICODE_PROP_Assigned:
1640	ret = unicode_prop_ops(cr,
1641	POP_GC, M(Cn),
1642	POP_INVERT,
1643	POP_END);
1644	break;
1645	case UNICODE_PROP_Math:
1646	ret = unicode_prop_ops(cr,
1647	POP_GC, M(Sm),
1648	POP_PROP, UNICODE_PROP_Other_Math,
1649	POP_UNION,
1650	POP_END);
1651	break;
1652	case UNICODE_PROP_Lowercase:
1653	ret = unicode_prop_ops(cr,
1654	POP_GC, M(Ll),
1655	POP_PROP, UNICODE_PROP_Other_Lowercase,
1656	POP_UNION,
1657	POP_END);
1658	break;
1659	case UNICODE_PROP_Uppercase:
1660	ret = unicode_prop_ops(cr,
1661	POP_GC, M(Lu),
1662	POP_PROP, UNICODE_PROP_Other_Uppercase,
1663	POP_UNION,
1664	POP_END);
1665	break;
1666	case UNICODE_PROP_Cased:
1667	ret = unicode_prop_ops(cr,
1668	POP_GC, M(Lu) \| M(Ll) \| M(Lt),
1669	POP_PROP, UNICODE_PROP_Other_Uppercase,
1670	POP_UNION,
1671	POP_PROP, UNICODE_PROP_Other_Lowercase,
1672	POP_UNION,
1673	POP_END);
1674	break;
1675	case UNICODE_PROP_Alphabetic:
1676	ret = unicode_prop_ops(cr,
1677	POP_GC, M(Lu) \| M(Ll) \| M(Lt) \| M(Lm) \| M(Lo) \| M(Nl),
1678	POP_PROP, UNICODE_PROP_Other_Uppercase,
1679	POP_UNION,
1680	POP_PROP, UNICODE_PROP_Other_Lowercase,
1681	POP_UNION,
1682	POP_PROP, UNICODE_PROP_Other_Alphabetic,
1683	POP_UNION,
1684	POP_END);
1685	break;
1686	case UNICODE_PROP_Grapheme_Base:
1687	ret = unicode_prop_ops(cr,
1688	POP_GC, M(Cc) \| M(Cf) \| M(Cs) \| M(Co) \| M(Cn) \| M(Zl) \| M(Zp) \| M(Me) \| M(Mn),
1689	POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1690	POP_UNION,
1691	POP_INVERT,
1692	POP_END);
1693	break;
1694	case UNICODE_PROP_Grapheme_Extend:
1695	ret = unicode_prop_ops(cr,
1696	POP_GC, M(Me) \| M(Mn),
1697	POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1698	POP_UNION,
1699	POP_END);
1700	break;
1701	case UNICODE_PROP_XID_Start:
1702	ret = unicode_prop_ops(cr,
1703	POP_GC, M(Lu) \| M(Ll) \| M(Lt) \| M(Lm) \| M(Lo) \| M(Nl),
1704	POP_PROP, UNICODE_PROP_Other_ID_Start,
1705	POP_UNION,
1706	POP_PROP, UNICODE_PROP_Pattern_Syntax,
1707	POP_PROP, UNICODE_PROP_Pattern_White_Space,
1708	POP_UNION,
1709	POP_PROP, UNICODE_PROP_XID_Start1,
1710	POP_UNION,
1711	POP_INVERT,
1712	POP_INTER,
1713	POP_END);
1714	break;
1715	case UNICODE_PROP_XID_Continue:
1716	ret = unicode_prop_ops(cr,
1717	POP_GC, M(Lu) \| M(Ll) \| M(Lt) \| M(Lm) \| M(Lo) \| M(Nl) \|
1718	M(Mn) \| M(Mc) \| M(Nd) \| M(Pc),
1719	POP_PROP, UNICODE_PROP_Other_ID_Start,
1720	POP_UNION,
1721	POP_PROP, UNICODE_PROP_Other_ID_Continue,
1722	POP_UNION,
1723	POP_PROP, UNICODE_PROP_Pattern_Syntax,
1724	POP_PROP, UNICODE_PROP_Pattern_White_Space,
1725	POP_UNION,
1726	POP_PROP, UNICODE_PROP_XID_Continue1,
1727	POP_UNION,
1728	POP_INVERT,
1729	POP_INTER,
1730	POP_END);
1731	break;
1732	case UNICODE_PROP_Changes_When_Uppercased:
1733	ret = unicode_case1(cr, CASE_U);
1734	break;
1735	case UNICODE_PROP_Changes_When_Lowercased:
1736	ret = unicode_case1(cr, CASE_L);
1737	break;
1738	case UNICODE_PROP_Changes_When_Casemapped:
1739	ret = unicode_case1(cr, CASE_U \| CASE_L \| CASE_F);
1740	break;
1741	case UNICODE_PROP_Changes_When_Titlecased:
1742	ret = unicode_prop_ops(cr,
1743	POP_CASE, CASE_U,
1744	POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1745	POP_XOR,
1746	POP_END);
1747	break;
1748	case UNICODE_PROP_Changes_When_Casefolded:
1749	ret = unicode_prop_ops(cr,
1750	POP_CASE, CASE_F,
1751	POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1752	POP_XOR,
1753	POP_END);
1754	break;
1755	case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1756	ret = unicode_prop_ops(cr,
1757	POP_CASE, CASE_F,
1758	POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1759	POP_XOR,
1760	POP_END);
1761	break;
1762	#if 0
1763	case UNICODE_PROP_ID_Start:
1764	ret = unicode_prop_ops(cr,
1765	POP_GC, M(Lu) \| M(Ll) \| M(Lt) \| M(Lm) \| M(Lo) \| M(Nl),
1766	POP_PROP, UNICODE_PROP_Other_ID_Start,
1767	POP_UNION,
1768	POP_PROP, UNICODE_PROP_Pattern_Syntax,
1769	POP_PROP, UNICODE_PROP_Pattern_White_Space,
1770	POP_UNION,
1771	POP_INVERT,
1772	POP_INTER,
1773	POP_END);
1774	break;
1775	case UNICODE_PROP_ID_Continue:
1776	ret = unicode_prop_ops(cr,
1777	POP_GC, M(Lu) \| M(Ll) \| M(Lt) \| M(Lm) \| M(Lo) \| M(Nl) \|
1778	M(Mn) \| M(Mc) \| M(Nd) \| M(Pc),
1779	POP_PROP, UNICODE_PROP_Other_ID_Start,
1780	POP_UNION,
1781	POP_PROP, UNICODE_PROP_Other_ID_Continue,
1782	POP_UNION,
1783	POP_PROP, UNICODE_PROP_Pattern_Syntax,
1784	POP_PROP, UNICODE_PROP_Pattern_White_Space,
1785	POP_UNION,
1786	POP_INVERT,
1787	POP_INTER,
1788	POP_END);
1789	break;
1790	case UNICODE_PROP_Case_Ignorable:
1791	ret = unicode_prop_ops(cr,
1792	POP_GC, M(Mn) \| M(Cf) \| M(Lm) \| M(Sk),
1793	POP_PROP, UNICODE_PROP_Case_Ignorable1,
1794	POP_XOR,
1795	POP_END);
1796	break;
1797	#else
1798	/ we use the existing tables /
1799	case UNICODE_PROP_ID_Continue:
1800	ret = unicode_prop_ops(cr,
1801	POP_PROP, UNICODE_PROP_ID_Start,
1802	POP_PROP, UNICODE_PROP_ID_Continue1,
1803	POP_XOR,
1804	POP_END);
1805	break;
1806	#endif
1807	default:
1808	if (prop_idx >= countof(unicode_prop_table))
1809	return -`2`;
1810	ret = unicode_prop1(cr, prop_idx);
1811	break;
1812	}
1813	return ret;
1814	}
1815
1816	#endif /* CONFIG_ALL_UNICODE */
1817
1818	/---- lre codepoint categorizing functions ----/
1819
1820	#define S UNICODE_C_SPACE
1821	#define D UNICODE_C_DIGIT
1822	#define X UNICODE_C_XDIGIT
1823	#define U UNICODE_C_UPPER
1824	#define L UNICODE_C_LOWER
1825	#define _ UNICODE_C_UNDER
1826	#define d UNICODE_C_DOLLAR
1827
1828	uint8_t const lre_ctype_bits[`256`] = {
1829	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1830	`0`, S, S, S, S, S, `0`, `0`,
1831	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1832	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1833
1834	S, `0`, `0`, `0`, d, `0`, `0`, `0`,
1835	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1836	X\|D, X\|D, X\|D, X\|D, X\|D, X\|D, X\|D, X\|D,
1837	X\|D, X\|D, `0`, `0`, `0`, `0`, `0`, `0`,
1838
1839	`0`, X\|U, X\|U, X\|U, X\|U, X\|U, X\|U, U,
1840	U, U, U, U, U, U, U, U,
1841	U, U, U, U, U, U, U, U,
1842	U, U, U, `0`, `0`, `0`, `0`, _,
1843
1844	`0`, X\|L, X\|L, X\|L, X\|L, X\|L, X\|L, L,
1845	L, L, L, L, L, L, L, L,
1846	L, L, L, L, L, L, L, L,
1847	L, L, L, `0`, `0`, `0`, `0`, `0`,
1848
1849	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1850	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1851	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1852	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1853
1854	S, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1855	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1856	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1857	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1858
1859	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1860	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1861	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1862	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1863
1864	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1865	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1866	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1867	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
1868	};
1869
1870	#undef S
1871	#undef D
1872	#undef X
1873	#undef U
1874	#undef L
1875	#undef _
1876	#undef d
1877
1878	/ code point ranges for Zs,Zl or Zp property /
1879	static const uint16_t char_range_s[] = {
1880	`10`,
1881	`0x0009`, `0x000D` + `1`,
1882	`0x0020`, `0x0020` + `1`,
1883	`0x00A0`, `0x00A0` + `1`,
1884	`0x1680`, `0x1680` + `1`,
1885	`0x2000`, `0x200A` + `1`,
1886	/ 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; /
1887	/ 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; /
1888	`0x2028`, `0x2029` + `1`,
1889	`0x202F`, `0x202F` + `1`,
1890	`0x205F`, `0x205F` + `1`,
1891	`0x3000`, `0x3000` + `1`,
1892	/ FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; /
1893	`0xFEFF`, `0xFEFF` + `1`,
1894	};
1895
1896	BOOL lre_is_space_non_ascii(uint32_t c)
1897	{
1898	size_t i, n;
1899
1900	n = countof(char_range_s);
1901	for(i = `5`; i < n; i += `2`) {
1902	uint32_t low = char_range_s[i];
1903	uint32_t high = char_range_s[i + `1`];
1904	if (c < low)
1905	return FALSE;
1906	if (c < high)
1907	return TRUE;
1908	}
1909	return FALSE;
1910	}
1911

Browse the source code of QuickJS/libunicode.c