1/*
2 * Unicode utilities
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24#include <stdlib.h>
25#include <stdio.h>
26#include <stdarg.h>
27#include <string.h>
28#include <assert.h>
29
30#include "cutils.h"
31#include "libunicode.h"
32#include "libunicode-table.h"
33
34enum {
35 RUN_TYPE_U,
36 RUN_TYPE_L,
37 RUN_TYPE_UF,
38 RUN_TYPE_LF,
39 RUN_TYPE_UL,
40 RUN_TYPE_LSU,
41 RUN_TYPE_U2L_399_EXT2,
42 RUN_TYPE_UF_D20,
43 RUN_TYPE_UF_D1_EXT,
44 RUN_TYPE_U_EXT,
45 RUN_TYPE_LF_EXT,
46 RUN_TYPE_UF_EXT2,
47 RUN_TYPE_LF_EXT2,
48 RUN_TYPE_UF_EXT3,
49};
50
51static int lre_case_conv1(uint32_t c, int conv_type)
52{
53 uint32_t res[LRE_CC_RES_LEN_MAX];
54 lre_case_conv(res, c, conv_type);
55 return res[0];
56}
57
58/* case conversion using the table entry 'idx' with value 'v' */
59static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60{
61 uint32_t code, data, type, a, is_lower;
62 is_lower = (conv_type != 0);
63 type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64 data = ((v & 0xf) << 8) | case_conv_table2[idx];
65 code = v >> (32 - 17);
66 switch(type) {
67 case RUN_TYPE_U:
68 case RUN_TYPE_L:
69 case RUN_TYPE_UF:
70 case RUN_TYPE_LF:
71 if (conv_type == (type & 1) ||
72 (type >= RUN_TYPE_UF && conv_type == 2)) {
73 c = c - code + (case_conv_table1[data] >> (32 - 17));
74 }
75 break;
76 case RUN_TYPE_UL:
77 a = c - code;
78 if ((a & 1) != (1 - is_lower))
79 break;
80 c = (a ^ 1) + code;
81 break;
82 case RUN_TYPE_LSU:
83 a = c - code;
84 if (a == 1) {
85 c += 2 * is_lower - 1;
86 } else if (a == (1 - is_lower) * 2) {
87 c += (2 * is_lower - 1) * 2;
88 }
89 break;
90 case RUN_TYPE_U2L_399_EXT2:
91 if (!is_lower) {
92 res[0] = c - code + case_conv_ext[data >> 6];
93 res[1] = 0x399;
94 return 2;
95 } else {
96 c = c - code + case_conv_ext[data & 0x3f];
97 }
98 break;
99 case RUN_TYPE_UF_D20:
100 if (conv_type == 1)
101 break;
102 c = data + (conv_type == 2) * 0x20;
103 break;
104 case RUN_TYPE_UF_D1_EXT:
105 if (conv_type == 1)
106 break;
107 c = case_conv_ext[data] + (conv_type == 2);
108 break;
109 case RUN_TYPE_U_EXT:
110 case RUN_TYPE_LF_EXT:
111 if (is_lower != (type - RUN_TYPE_U_EXT))
112 break;
113 c = case_conv_ext[data];
114 break;
115 case RUN_TYPE_LF_EXT2:
116 if (!is_lower)
117 break;
118 res[0] = c - code + case_conv_ext[data >> 6];
119 res[1] = case_conv_ext[data & 0x3f];
120 return 2;
121 case RUN_TYPE_UF_EXT2:
122 if (conv_type == 1)
123 break;
124 res[0] = c - code + case_conv_ext[data >> 6];
125 res[1] = case_conv_ext[data & 0x3f];
126 if (conv_type == 2) {
127 /* convert to lower */
128 res[0] = lre_case_conv1(res[0], 1);
129 res[1] = lre_case_conv1(res[1], 1);
130 }
131 return 2;
132 default:
133 case RUN_TYPE_UF_EXT3:
134 if (conv_type == 1)
135 break;
136 res[0] = case_conv_ext[data >> 8];
137 res[1] = case_conv_ext[(data >> 4) & 0xf];
138 res[2] = case_conv_ext[data & 0xf];
139 if (conv_type == 2) {
140 /* convert to lower */
141 res[0] = lre_case_conv1(res[0], 1);
142 res[1] = lre_case_conv1(res[1], 1);
143 res[2] = lre_case_conv1(res[2], 1);
144 }
145 return 3;
146 }
147 res[0] = c;
148 return 1;
149}
150
151/* conv_type:
152 0 = to upper
153 1 = to lower
154 2 = case folding (= to lower with modifications)
155*/
156int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157{
158 if (c < 128) {
159 if (conv_type) {
160 if (c >= 'A' && c <= 'Z') {
161 c = c - 'A' + 'a';
162 }
163 } else {
164 if (c >= 'a' && c <= 'z') {
165 c = c - 'a' + 'A';
166 }
167 }
168 } else {
169 uint32_t v, code, len;
170 int idx, idx_min, idx_max;
171
172 idx_min = 0;
173 idx_max = countof(case_conv_table1) - 1;
174 while (idx_min <= idx_max) {
175 idx = (unsigned)(idx_max + idx_min) / 2;
176 v = case_conv_table1[idx];
177 code = v >> (32 - 17);
178 len = (v >> (32 - 17 - 7)) & 0x7f;
179 if (c < code) {
180 idx_max = idx - 1;
181 } else if (c >= code + len) {
182 idx_min = idx + 1;
183 } else {
184 return lre_case_conv_entry(res, c, conv_type, idx, v);
185 }
186 }
187 }
188 res[0] = c;
189 return 1;
190}
191
192static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193{
194 uint32_t res[LRE_CC_RES_LEN_MAX];
195 int len;
196
197 if (is_unicode) {
198 len = lre_case_conv_entry(res, c, 2, idx, v);
199 if (len == 1) {
200 c = res[0];
201 } else {
202 /* handle the few specific multi-character cases (see
203 unicode_gen.c:dump_case_folding_special_cases()) */
204 if (c == 0xfb06) {
205 c = 0xfb05;
206 } else if (c == 0x01fd3) {
207 c = 0x390;
208 } else if (c == 0x01fe3) {
209 c = 0x3b0;
210 }
211 }
212 } else {
213 if (likely(c < 128)) {
214 if (c >= 'a' && c <= 'z')
215 c = c - 'a' + 'A';
216 } else {
217 /* legacy regexp: to upper case if single char >= 128 */
218 len = lre_case_conv_entry(res, c, FALSE, idx, v);
219 if (len == 1 && res[0] >= 128)
220 c = res[0];
221 }
222 }
223 return c;
224}
225
226/* JS regexp specific rules for case folding */
227int lre_canonicalize(uint32_t c, BOOL is_unicode)
228{
229 if (c < 128) {
230 /* fast case */
231 if (is_unicode) {
232 if (c >= 'A' && c <= 'Z') {
233 c = c - 'A' + 'a';
234 }
235 } else {
236 if (c >= 'a' && c <= 'z') {
237 c = c - 'a' + 'A';
238 }
239 }
240 } else {
241 uint32_t v, code, len;
242 int idx, idx_min, idx_max;
243
244 idx_min = 0;
245 idx_max = countof(case_conv_table1) - 1;
246 while (idx_min <= idx_max) {
247 idx = (unsigned)(idx_max + idx_min) / 2;
248 v = case_conv_table1[idx];
249 code = v >> (32 - 17);
250 len = (v >> (32 - 17 - 7)) & 0x7f;
251 if (c < code) {
252 idx_max = idx - 1;
253 } else if (c >= code + len) {
254 idx_min = idx + 1;
255 } else {
256 return lre_case_folding_entry(c, idx, v, is_unicode);
257 }
258 }
259 }
260 return c;
261}
262
263static uint32_t get_le24(const uint8_t *ptr)
264{
265 return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
266}
267
268#define UNICODE_INDEX_BLOCK_LEN 32
269
270/* return -1 if not in table, otherwise the offset in the block */
271static int get_index_pos(uint32_t *pcode, uint32_t c,
272 const uint8_t *index_table, int index_table_len)
273{
274 uint32_t code, v;
275 int idx_min, idx_max, idx;
276
277 idx_min = 0;
278 v = get_le24(index_table);
279 code = v & ((1 << 21) - 1);
280 if (c < code) {
281 *pcode = 0;
282 return 0;
283 }
284 idx_max = index_table_len - 1;
285 code = get_le24(index_table + idx_max * 3);
286 if (c >= code)
287 return -1;
288 /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289 while ((idx_max - idx_min) > 1) {
290 idx = (idx_max + idx_min) / 2;
291 v = get_le24(index_table + idx * 3);
292 code = v & ((1 << 21) - 1);
293 if (c < code) {
294 idx_max = idx;
295 } else {
296 idx_min = idx;
297 }
298 }
299 v = get_le24(index_table + idx_min * 3);
300 *pcode = v & ((1 << 21) - 1);
301 return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302}
303
304static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305 const uint8_t *index_table, int index_table_len)
306{
307 uint32_t code, b, bit;
308 int pos;
309 const uint8_t *p;
310
311 pos = get_index_pos(&code, c, index_table, index_table_len);
312 if (pos < 0)
313 return FALSE; /* outside the table */
314 p = table + pos;
315 bit = 0;
316 /* Compressed run length encoding:
317 00..3F: 2 packed lengths: 3-bit + 3-bit
318 40..5F: 5-bits plus extra byte for length
319 60..7F: 5-bits plus 2 extra bytes for length
320 80..FF: 7-bit length
321 lengths must be incremented to get character count
322 Ranges alternate between false and true return value.
323 */
324 for(;;) {
325 b = *p++;
326 if (b < 64) {
327 code += (b >> 3) + 1;
328 if (c < code)
329 return bit;
330 bit ^= 1;
331 code += (b & 7) + 1;
332 } else if (b >= 0x80) {
333 code += b - 0x80 + 1;
334 } else if (b < 0x60) {
335 code += (((b - 0x40) << 8) | p[0]) + 1;
336 p++;
337 } else {
338 code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339 p += 2;
340 }
341 if (c < code)
342 return bit;
343 bit ^= 1;
344 }
345}
346
347BOOL lre_is_cased(uint32_t c)
348{
349 uint32_t v, code, len;
350 int idx, idx_min, idx_max;
351
352 idx_min = 0;
353 idx_max = countof(case_conv_table1) - 1;
354 while (idx_min <= idx_max) {
355 idx = (unsigned)(idx_max + idx_min) / 2;
356 v = case_conv_table1[idx];
357 code = v >> (32 - 17);
358 len = (v >> (32 - 17 - 7)) & 0x7f;
359 if (c < code) {
360 idx_max = idx - 1;
361 } else if (c >= code + len) {
362 idx_min = idx + 1;
363 } else {
364 return TRUE;
365 }
366 }
367 return lre_is_in_table(c, unicode_prop_Cased1_table,
368 unicode_prop_Cased1_index,
369 sizeof(unicode_prop_Cased1_index) / 3);
370}
371
372BOOL lre_is_case_ignorable(uint32_t c)
373{
374 return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375 unicode_prop_Case_Ignorable_index,
376 sizeof(unicode_prop_Case_Ignorable_index) / 3);
377}
378
379/* character range */
380
381static __maybe_unused void cr_dump(CharRange *cr)
382{
383 int i;
384 for(i = 0; i < cr->len; i++)
385 printf("%d: 0x%04x\n", i, cr->points[i]);
386}
387
388static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
389{
390 return realloc(ptr, size);
391}
392
393void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
394{
395 cr->len = cr->size = 0;
396 cr->points = NULL;
397 cr->mem_opaque = mem_opaque;
398 cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399}
400
401void cr_free(CharRange *cr)
402{
403 cr->realloc_func(cr->mem_opaque, cr->points, 0);
404}
405
406int cr_realloc(CharRange *cr, int size)
407{
408 int new_size;
409 uint32_t *new_buf;
410
411 if (size > cr->size) {
412 new_size = max_int(size, cr->size * 3 / 2);
413 new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414 new_size * sizeof(cr->points[0]));
415 if (!new_buf)
416 return -1;
417 cr->points = new_buf;
418 cr->size = new_size;
419 }
420 return 0;
421}
422
423int cr_copy(CharRange *cr, const CharRange *cr1)
424{
425 if (cr_realloc(cr, cr1->len))
426 return -1;
427 memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
428 cr->len = cr1->len;
429 return 0;
430}
431
432/* merge consecutive intervals and remove empty intervals */
433static void cr_compress(CharRange *cr)
434{
435 int i, j, k, len;
436 uint32_t *pt;
437
438 pt = cr->points;
439 len = cr->len;
440 i = 0;
441 j = 0;
442 k = 0;
443 while ((i + 1) < len) {
444 if (pt[i] == pt[i + 1]) {
445 /* empty interval */
446 i += 2;
447 } else {
448 j = i;
449 while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450 j += 2;
451 /* just copy */
452 pt[k] = pt[i];
453 pt[k + 1] = pt[j + 1];
454 k += 2;
455 i = j + 2;
456 }
457 }
458 cr->len = k;
459}
460
461/* union or intersection */
462int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463 const uint32_t *b_pt, int b_len, int op)
464{
465 int a_idx, b_idx, is_in;
466 uint32_t v;
467
468 a_idx = 0;
469 b_idx = 0;
470 for(;;) {
471 /* get one more point from a or b in increasing order */
472 if (a_idx < a_len && b_idx < b_len) {
473 if (a_pt[a_idx] < b_pt[b_idx]) {
474 goto a_add;
475 } else if (a_pt[a_idx] == b_pt[b_idx]) {
476 v = a_pt[a_idx];
477 a_idx++;
478 b_idx++;
479 } else {
480 goto b_add;
481 }
482 } else if (a_idx < a_len) {
483 a_add:
484 v = a_pt[a_idx++];
485 } else if (b_idx < b_len) {
486 b_add:
487 v = b_pt[b_idx++];
488 } else {
489 break;
490 }
491 /* add the point if the in/out status changes */
492 switch(op) {
493 case CR_OP_UNION:
494 is_in = (a_idx & 1) | (b_idx & 1);
495 break;
496 case CR_OP_INTER:
497 is_in = (a_idx & 1) & (b_idx & 1);
498 break;
499 case CR_OP_XOR:
500 is_in = (a_idx & 1) ^ (b_idx & 1);
501 break;
502 default:
503 abort();
504 }
505 if (is_in != (cr->len & 1)) {
506 if (cr_add_point(cr, v))
507 return -1;
508 }
509 }
510 cr_compress(cr);
511 return 0;
512}
513
514int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
515{
516 CharRange a = *cr;
517 int ret;
518 cr->len = 0;
519 cr->size = 0;
520 cr->points = NULL;
521 ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
522 cr_free(&a);
523 return ret;
524}
525
526int cr_invert(CharRange *cr)
527{
528 int len;
529 len = cr->len;
530 if (cr_realloc(cr, len + 2))
531 return -1;
532 memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
533 cr->points[0] = 0;
534 cr->points[len + 1] = UINT32_MAX;
535 cr->len = len + 2;
536 cr_compress(cr);
537 return 0;
538}
539
540#define CASE_U (1 << 0)
541#define CASE_L (1 << 1)
542#define CASE_F (1 << 2)
543
544/* use the case conversion table to generate range of characters.
545 CASE_U: set char if modified by uppercasing,
546 CASE_L: set char if modified by lowercasing,
547 CASE_F: set char if modified by case folding,
548 */
549static int unicode_case1(CharRange *cr, int case_mask)
550{
551#define MR(x) (1 << RUN_TYPE_ ## x)
552 const uint32_t tab_run_mask[3] = {
553 MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
554 MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
555
556 MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
557
558 MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
559 };
560#undef MR
561 uint32_t mask, v, code, type, len, i, idx;
562
563 if (case_mask == 0)
564 return 0;
565 mask = 0;
566 for(i = 0; i < 3; i++) {
567 if ((case_mask >> i) & 1)
568 mask |= tab_run_mask[i];
569 }
570 for(idx = 0; idx < countof(case_conv_table1); idx++) {
571 v = case_conv_table1[idx];
572 type = (v >> (32 - 17 - 7 - 4)) & 0xf;
573 code = v >> (32 - 17);
574 len = (v >> (32 - 17 - 7)) & 0x7f;
575 if ((mask >> type) & 1) {
576 // printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
577 switch(type) {
578 case RUN_TYPE_UL:
579 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
580 goto def_case;
581 code += ((case_mask & CASE_U) != 0);
582 for(i = 0; i < len; i += 2) {
583 if (cr_add_interval(cr, code + i, code + i + 1))
584 return -1;
585 }
586 break;
587 case RUN_TYPE_LSU:
588 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
589 goto def_case;
590 if (!(case_mask & CASE_U)) {
591 if (cr_add_interval(cr, code, code + 1))
592 return -1;
593 }
594 if (cr_add_interval(cr, code + 1, code + 2))
595 return -1;
596 if (case_mask & CASE_U) {
597 if (cr_add_interval(cr, code + 2, code + 3))
598 return -1;
599 }
600 break;
601 default:
602 def_case:
603 if (cr_add_interval(cr, code, code + len))
604 return -1;
605 break;
606 }
607 }
608 }
609 return 0;
610}
611
612static int point_cmp(const void *p1, const void *p2, void *arg)
613{
614 uint32_t v1 = *(uint32_t *)p1;
615 uint32_t v2 = *(uint32_t *)p2;
616 return (v1 > v2) - (v1 < v2);
617}
618
619static void cr_sort_and_remove_overlap(CharRange *cr)
620{
621 uint32_t start, end, start1, end1, i, j;
622
623 /* the resulting ranges are not necessarily sorted and may overlap */
624 rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
625 j = 0;
626 for(i = 0; i < cr->len; ) {
627 start = cr->points[i];
628 end = cr->points[i + 1];
629 i += 2;
630 while (i < cr->len) {
631 start1 = cr->points[i];
632 end1 = cr->points[i + 1];
633 if (start1 > end) {
634 /* |------|
635 * |-------| */
636 break;
637 } else if (end1 <= end) {
638 /* |------|
639 * |--| */
640 i += 2;
641 } else {
642 /* |------|
643 * |-------| */
644 end = end1;
645 i += 2;
646 }
647 }
648 cr->points[j] = start;
649 cr->points[j + 1] = end;
650 j += 2;
651 }
652 cr->len = j;
653}
654
655/* canonicalize a character set using the JS regex case folding rules
656 (see lre_canonicalize()) */
657int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
658{
659 CharRange cr_inter, cr_mask, cr_result, cr_sub;
660 uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
661
662 cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
663 cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
664 cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
665 cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
666
667 if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
668 goto fail;
669 if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
670 goto fail;
671
672 if (cr_invert(&cr_mask))
673 goto fail;
674 if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
675 goto fail;
676
677 /* cr_inter = cr & cr_mask */
678 /* cr_sub = cr & ~cr_mask */
679
680 /* use the case conversion table to compute the result */
681 d_start = -1;
682 d_end = -1;
683 idx = 0;
684 v = case_conv_table1[idx];
685 code = v >> (32 - 17);
686 len = (v >> (32 - 17 - 7)) & 0x7f;
687 for(i = 0; i < cr_inter.len; i += 2) {
688 start = cr_inter.points[i];
689 end = cr_inter.points[i + 1];
690
691 for(c = start; c < end; c++) {
692 for(;;) {
693 if (c >= code && c < code + len)
694 break;
695 idx++;
696 assert(idx < countof(case_conv_table1));
697 v = case_conv_table1[idx];
698 code = v >> (32 - 17);
699 len = (v >> (32 - 17 - 7)) & 0x7f;
700 }
701 d = lre_case_folding_entry(c, idx, v, is_unicode);
702 /* try to merge with the current interval */
703 if (d_start == -1) {
704 d_start = d;
705 d_end = d + 1;
706 } else if (d_end == d) {
707 d_end++;
708 } else {
709 cr_add_interval(&cr_result, d_start, d_end);
710 d_start = d;
711 d_end = d + 1;
712 }
713 }
714 }
715 if (d_start != -1) {
716 if (cr_add_interval(&cr_result, d_start, d_end))
717 goto fail;
718 }
719
720 /* the resulting ranges are not necessarily sorted and may overlap */
721 cr_sort_and_remove_overlap(&cr_result);
722
723 /* or with the character not affected by the case folding */
724 cr->len = 0;
725 if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
726 goto fail;
727
728 cr_free(&cr_inter);
729 cr_free(&cr_mask);
730 cr_free(&cr_result);
731 cr_free(&cr_sub);
732 return 0;
733 fail:
734 cr_free(&cr_inter);
735 cr_free(&cr_mask);
736 cr_free(&cr_result);
737 cr_free(&cr_sub);
738 return -1;
739}
740
741#ifdef CONFIG_ALL_UNICODE
742
743BOOL lre_is_id_start(uint32_t c)
744{
745 return lre_is_in_table(c, unicode_prop_ID_Start_table,
746 unicode_prop_ID_Start_index,
747 sizeof(unicode_prop_ID_Start_index) / 3);
748}
749
750BOOL lre_is_id_continue(uint32_t c)
751{
752 return lre_is_id_start(c) ||
753 lre_is_in_table(c, unicode_prop_ID_Continue1_table,
754 unicode_prop_ID_Continue1_index,
755 sizeof(unicode_prop_ID_Continue1_index) / 3);
756}
757
758#define UNICODE_DECOMP_LEN_MAX 18
759
760typedef enum {
761 DECOMP_TYPE_C1, /* 16 bit char */
762 DECOMP_TYPE_L1, /* 16 bit char table */
763 DECOMP_TYPE_L2,
764 DECOMP_TYPE_L3,
765 DECOMP_TYPE_L4,
766 DECOMP_TYPE_L5, /* XXX: not used */
767 DECOMP_TYPE_L6, /* XXX: could remove */
768 DECOMP_TYPE_L7, /* XXX: could remove */
769 DECOMP_TYPE_LL1, /* 18 bit char table */
770 DECOMP_TYPE_LL2,
771 DECOMP_TYPE_S1, /* 8 bit char table */
772 DECOMP_TYPE_S2,
773 DECOMP_TYPE_S3,
774 DECOMP_TYPE_S4,
775 DECOMP_TYPE_S5,
776 DECOMP_TYPE_I1, /* increment 16 bit char value */
777 DECOMP_TYPE_I2_0,
778 DECOMP_TYPE_I2_1,
779 DECOMP_TYPE_I3_1,
780 DECOMP_TYPE_I3_2,
781 DECOMP_TYPE_I4_1,
782 DECOMP_TYPE_I4_2,
783 DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
784 DECOMP_TYPE_B2,
785 DECOMP_TYPE_B3,
786 DECOMP_TYPE_B4,
787 DECOMP_TYPE_B5,
788 DECOMP_TYPE_B6,
789 DECOMP_TYPE_B7,
790 DECOMP_TYPE_B8,
791 DECOMP_TYPE_B18,
792 DECOMP_TYPE_LS2,
793 DECOMP_TYPE_PAT3,
794 DECOMP_TYPE_S2_UL,
795 DECOMP_TYPE_LS2_UL,
796} DecompTypeEnum;
797
798static uint32_t unicode_get_short_code(uint32_t c)
799{
800 static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
801
802 if (c < 0x80)
803 return c;
804 else if (c < 0x80 + 0x50)
805 return c - 0x80 + 0x300;
806 else
807 return unicode_short_table[c - 0x80 - 0x50];
808}
809
810static uint32_t unicode_get_lower_simple(uint32_t c)
811{
812 if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
813 c += 0x20;
814 else
815 c++;
816 return c;
817}
818
819static uint16_t unicode_get16(const uint8_t *p)
820{
821 return p[0] | (p[1] << 8);
822}
823
824static int unicode_decomp_entry(uint32_t *res, uint32_t c,
825 int idx, uint32_t code, uint32_t len,
826 uint32_t type)
827{
828 uint32_t c1;
829 int l, i, p;
830 const uint8_t *d;
831
832 if (type == DECOMP_TYPE_C1) {
833 res[0] = unicode_decomp_table2[idx];
834 return 1;
835 } else {
836 d = unicode_decomp_data + unicode_decomp_table2[idx];
837 switch(type) {
838 case DECOMP_TYPE_L1:
839 case DECOMP_TYPE_L2:
840 case DECOMP_TYPE_L3:
841 case DECOMP_TYPE_L4:
842 case DECOMP_TYPE_L5:
843 case DECOMP_TYPE_L6:
844 case DECOMP_TYPE_L7:
845 l = type - DECOMP_TYPE_L1 + 1;
846 d += (c - code) * l * 2;
847 for(i = 0; i < l; i++) {
848 if ((res[i] = unicode_get16(d + 2 * i)) == 0)
849 return 0;
850 }
851 return l;
852 case DECOMP_TYPE_LL1:
853 case DECOMP_TYPE_LL2:
854 {
855 uint32_t k, p;
856 l = type - DECOMP_TYPE_LL1 + 1;
857 k = (c - code) * l;
858 p = len * l * 2;
859 for(i = 0; i < l; i++) {
860 c1 = unicode_get16(d + 2 * k) |
861 (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
862 if (!c1)
863 return 0;
864 res[i] = c1;
865 k++;
866 }
867 }
868 return l;
869 case DECOMP_TYPE_S1:
870 case DECOMP_TYPE_S2:
871 case DECOMP_TYPE_S3:
872 case DECOMP_TYPE_S4:
873 case DECOMP_TYPE_S5:
874 l = type - DECOMP_TYPE_S1 + 1;
875 d += (c - code) * l;
876 for(i = 0; i < l; i++) {
877 if ((res[i] = unicode_get_short_code(d[i])) == 0)
878 return 0;
879 }
880 return l;
881 case DECOMP_TYPE_I1:
882 l = 1;
883 p = 0;
884 goto decomp_type_i;
885 case DECOMP_TYPE_I2_0:
886 case DECOMP_TYPE_I2_1:
887 case DECOMP_TYPE_I3_1:
888 case DECOMP_TYPE_I3_2:
889 case DECOMP_TYPE_I4_1:
890 case DECOMP_TYPE_I4_2:
891 l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
892 p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
893 decomp_type_i:
894 for(i = 0; i < l; i++) {
895 c1 = unicode_get16(d + 2 * i);
896 if (i == p)
897 c1 += c - code;
898 res[i] = c1;
899 }
900 return l;
901 case DECOMP_TYPE_B18:
902 l = 18;
903 goto decomp_type_b;
904 case DECOMP_TYPE_B1:
905 case DECOMP_TYPE_B2:
906 case DECOMP_TYPE_B3:
907 case DECOMP_TYPE_B4:
908 case DECOMP_TYPE_B5:
909 case DECOMP_TYPE_B6:
910 case DECOMP_TYPE_B7:
911 case DECOMP_TYPE_B8:
912 l = type - DECOMP_TYPE_B1 + 1;
913 decomp_type_b:
914 {
915 uint32_t c_min;
916 c_min = unicode_get16(d);
917 d += 2 + (c - code) * l;
918 for(i = 0; i < l; i++) {
919 c1 = d[i];
920 if (c1 == 0xff)
921 c1 = 0x20;
922 else
923 c1 += c_min;
924 res[i] = c1;
925 }
926 }
927 return l;
928 case DECOMP_TYPE_LS2:
929 d += (c - code) * 3;
930 if (!(res[0] = unicode_get16(d)))
931 return 0;
932 res[1] = unicode_get_short_code(d[2]);
933 return 2;
934 case DECOMP_TYPE_PAT3:
935 res[0] = unicode_get16(d);
936 res[2] = unicode_get16(d + 2);
937 d += 4 + (c - code) * 2;
938 res[1] = unicode_get16(d);
939 return 3;
940 case DECOMP_TYPE_S2_UL:
941 case DECOMP_TYPE_LS2_UL:
942 c1 = c - code;
943 if (type == DECOMP_TYPE_S2_UL) {
944 d += c1 & ~1;
945 c = unicode_get_short_code(*d);
946 d++;
947 } else {
948 d += (c1 >> 1) * 3;
949 c = unicode_get16(d);
950 d += 2;
951 }
952 if (c1 & 1)
953 c = unicode_get_lower_simple(c);
954 res[0] = c;
955 res[1] = unicode_get_short_code(*d);
956 return 2;
957 }
958 }
959 return 0;
960}
961
962
963/* return the length of the decomposition (length <=
964 UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
965static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
966{
967 uint32_t v, type, is_compat, code, len;
968 int idx_min, idx_max, idx;
969
970 idx_min = 0;
971 idx_max = countof(unicode_decomp_table1) - 1;
972 while (idx_min <= idx_max) {
973 idx = (idx_max + idx_min) / 2;
974 v = unicode_decomp_table1[idx];
975 code = v >> (32 - 18);
976 len = (v >> (32 - 18 - 7)) & 0x7f;
977 // printf("idx=%d code=%05x len=%d\n", idx, code, len);
978 if (c < code) {
979 idx_max = idx - 1;
980 } else if (c >= code + len) {
981 idx_min = idx + 1;
982 } else {
983 is_compat = v & 1;
984 if (is_compat1 < is_compat)
985 break;
986 type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
987 return unicode_decomp_entry(res, c, idx, code, len, type);
988 }
989 }
990 return 0;
991}
992
993/* return 0 if no pair found */
994static int unicode_compose_pair(uint32_t c0, uint32_t c1)
995{
996 uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
997 int idx_min, idx_max, idx, d;
998 uint32_t pair[2];
999
1000 idx_min = 0;
1001 idx_max = countof(unicode_comp_table) - 1;
1002 while (idx_min <= idx_max) {
1003 idx = (idx_max + idx_min) / 2;
1004 idx1 = unicode_comp_table[idx];
1005
1006 /* idx1 represent an entry of the decomposition table */
1007 d_idx = idx1 >> 6;
1008 d_offset = idx1 & 0x3f;
1009 v = unicode_decomp_table1[d_idx];
1010 code = v >> (32 - 18);
1011 len = (v >> (32 - 18 - 7)) & 0x7f;
1012 type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
1013 ch = code + d_offset;
1014 unicode_decomp_entry(pair, ch, d_idx, code, len, type);
1015 d = c0 - pair[0];
1016 if (d == 0)
1017 d = c1 - pair[1];
1018 if (d < 0) {
1019 idx_max = idx - 1;
1020 } else if (d > 0) {
1021 idx_min = idx + 1;
1022 } else {
1023 return ch;
1024 }
1025 }
1026 return 0;
1027}
1028
1029/* return the combining class of character c (between 0 and 255) */
1030static int unicode_get_cc(uint32_t c)
1031{
1032 uint32_t code, n, type, cc, c1, b;
1033 int pos;
1034 const uint8_t *p;
1035
1036 pos = get_index_pos(&code, c,
1037 unicode_cc_index, sizeof(unicode_cc_index) / 3);
1038 if (pos < 0)
1039 return 0;
1040 p = unicode_cc_table + pos;
1041 /* Compressed run length encoding:
1042 - 2 high order bits are combining class type
1043 - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
1044 - 00..2F: range length (add 1)
1045 - 30..37: 3-bit range-length + 1 extra byte
1046 - 38..3F: 3-bit range-length + 2 extra byte
1047 */
1048 for(;;) {
1049 b = *p++;
1050 type = b >> 6;
1051 n = b & 0x3f;
1052 if (n < 48) {
1053 } else if (n < 56) {
1054 n = (n - 48) << 8;
1055 n |= *p++;
1056 n += 48;
1057 } else {
1058 n = (n - 56) << 8;
1059 n |= *p++ << 8;
1060 n |= *p++;
1061 n += 48 + (1 << 11);
1062 }
1063 if (type <= 1)
1064 p++;
1065 c1 = code + n + 1;
1066 if (c < c1) {
1067 switch(type) {
1068 case 0:
1069 cc = p[-1];
1070 break;
1071 case 1:
1072 cc = p[-1] + c - code;
1073 break;
1074 case 2:
1075 cc = 0;
1076 break;
1077 default:
1078 case 3:
1079 cc = 230;
1080 break;
1081 }
1082 return cc;
1083 }
1084 code = c1;
1085 }
1086}
1087
1088static void sort_cc(int *buf, int len)
1089{
1090 int i, j, k, cc, cc1, start, ch1;
1091
1092 for(i = 0; i < len; i++) {
1093 cc = unicode_get_cc(buf[i]);
1094 if (cc != 0) {
1095 start = i;
1096 j = i + 1;
1097 while (j < len) {
1098 ch1 = buf[j];
1099 cc1 = unicode_get_cc(ch1);
1100 if (cc1 == 0)
1101 break;
1102 k = j - 1;
1103 while (k >= start) {
1104 if (unicode_get_cc(buf[k]) <= cc1)
1105 break;
1106 buf[k + 1] = buf[k];
1107 k--;
1108 }
1109 buf[k + 1] = ch1;
1110 j++;
1111 }
1112#if 0
1113 printf("cc:");
1114 for(k = start; k < j; k++) {
1115 printf(" %3d", unicode_get_cc(buf[k]));
1116 }
1117 printf("\n");
1118#endif
1119 i = j;
1120 }
1121 }
1122}
1123
1124static void to_nfd_rec(DynBuf *dbuf,
1125 const int *src, int src_len, int is_compat)
1126{
1127 uint32_t c, v;
1128 int i, l;
1129 uint32_t res[UNICODE_DECOMP_LEN_MAX];
1130
1131 for(i = 0; i < src_len; i++) {
1132 c = src[i];
1133 if (c >= 0xac00 && c < 0xd7a4) {
1134 /* Hangul decomposition */
1135 c -= 0xac00;
1136 dbuf_put_u32(dbuf, 0x1100 + c / 588);
1137 dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
1138 v = c % 28;
1139 if (v != 0)
1140 dbuf_put_u32(dbuf, 0x11a7 + v);
1141 } else {
1142 l = unicode_decomp_char(res, c, is_compat);
1143 if (l) {
1144 to_nfd_rec(dbuf, (int *)res, l, is_compat);
1145 } else {
1146 dbuf_put_u32(dbuf, c);
1147 }
1148 }
1149 }
1150}
1151
1152/* return 0 if not found */
1153static int compose_pair(uint32_t c0, uint32_t c1)
1154{
1155 /* Hangul composition */
1156 if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
1157 c1 >= 0x1161 && c1 < 0x1161 + 21) {
1158 return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
1159 } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
1160 (c0 - 0xac00) % 28 == 0 &&
1161 c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
1162 return c0 + c1 - 0x11a7;
1163 } else {
1164 return unicode_compose_pair(c0, c1);
1165 }
1166}
1167
1168int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
1169 UnicodeNormalizationEnum n_type,
1170 void *opaque, DynBufReallocFunc *realloc_func)
1171{
1172 int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
1173 BOOL is_compat;
1174 DynBuf dbuf_s, *dbuf = &dbuf_s;
1175
1176 is_compat = n_type >> 1;
1177
1178 dbuf_init2(dbuf, opaque, realloc_func);
1179 if (dbuf_realloc(dbuf, sizeof(int) * src_len))
1180 goto fail;
1181
1182 /* common case: latin1 is unaffected by NFC */
1183 if (n_type == UNICODE_NFC) {
1184 for(i = 0; i < src_len; i++) {
1185 if (src[i] >= 0x100)
1186 goto not_latin1;
1187 }
1188 buf = (int *)dbuf->buf;
1189 memcpy(buf, src, src_len * sizeof(int));
1190 *pdst = (uint32_t *)buf;
1191 return src_len;
1192 not_latin1: ;
1193 }
1194
1195 to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
1196 if (dbuf_error(dbuf)) {
1197 fail:
1198 *pdst = NULL;
1199 return -1;
1200 }
1201 buf = (int *)dbuf->buf;
1202 buf_len = dbuf->size / sizeof(int);
1203
1204 sort_cc(buf, buf_len);
1205
1206 if (buf_len <= 1 || (n_type & 1) != 0) {
1207 /* NFD / NFKD */
1208 *pdst = (uint32_t *)buf;
1209 return buf_len;
1210 }
1211
1212 i = 1;
1213 out_len = 1;
1214 while (i < buf_len) {
1215 /* find the starter character and test if it is blocked from
1216 the character at 'i' */
1217 last_cc = unicode_get_cc(buf[i]);
1218 starter_pos = out_len - 1;
1219 while (starter_pos >= 0) {
1220 cc = unicode_get_cc(buf[starter_pos]);
1221 if (cc == 0)
1222 break;
1223 if (cc >= last_cc)
1224 goto next;
1225 last_cc = 256;
1226 starter_pos--;
1227 }
1228 if (starter_pos >= 0 &&
1229 (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1230 buf[starter_pos] = p;
1231 i++;
1232 } else {
1233 next:
1234 buf[out_len++] = buf[i++];
1235 }
1236 }
1237 *pdst = (uint32_t *)buf;
1238 return out_len;
1239}
1240
1241/* char ranges for various unicode properties */
1242
1243static int unicode_find_name(const char *name_table, const char *name)
1244{
1245 const char *p, *r;
1246 int pos;
1247 size_t name_len, len;
1248
1249 p = name_table;
1250 pos = 0;
1251 name_len = strlen(name);
1252 while (*p) {
1253 for(;;) {
1254 r = strchr(p, ',');
1255 if (!r)
1256 len = strlen(p);
1257 else
1258 len = r - p;
1259 if (len == name_len && !memcmp(p, name, name_len))
1260 return pos;
1261 p += len + 1;
1262 if (!r)
1263 break;
1264 }
1265 pos++;
1266 }
1267 return -1;
1268}
1269
1270/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1271 if not found */
1272int unicode_script(CharRange *cr,
1273 const char *script_name, BOOL is_ext)
1274{
1275 int script_idx;
1276 const uint8_t *p, *p_end;
1277 uint32_t c, c1, b, n, v, v_len, i, type;
1278 CharRange cr1_s, *cr1;
1279 CharRange cr2_s, *cr2 = &cr2_s;
1280 BOOL is_common;
1281
1282 script_idx = unicode_find_name(unicode_script_name_table, script_name);
1283 if (script_idx < 0)
1284 return -2;
1285 /* Note: we remove the "Unknown" Script */
1286 script_idx += UNICODE_SCRIPT_Unknown + 1;
1287
1288 is_common = (script_idx == UNICODE_SCRIPT_Common ||
1289 script_idx == UNICODE_SCRIPT_Inherited);
1290 if (is_ext) {
1291 cr1 = &cr1_s;
1292 cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1293 cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1294 } else {
1295 cr1 = cr;
1296 }
1297
1298 p = unicode_script_table;
1299 p_end = unicode_script_table + countof(unicode_script_table);
1300 c = 0;
1301 while (p < p_end) {
1302 b = *p++;
1303 type = b >> 7;
1304 n = b & 0x7f;
1305 if (n < 96) {
1306 } else if (n < 112) {
1307 n = (n - 96) << 8;
1308 n |= *p++;
1309 n += 96;
1310 } else {
1311 n = (n - 112) << 16;
1312 n |= *p++ << 8;
1313 n |= *p++;
1314 n += 96 + (1 << 12);
1315 }
1316 if (type == 0)
1317 v = 0;
1318 else
1319 v = *p++;
1320 c1 = c + n + 1;
1321 if (v == script_idx) {
1322 if (cr_add_interval(cr1, c, c1))
1323 goto fail;
1324 }
1325 c = c1;
1326 }
1327
1328 if (is_ext) {
1329 /* add the script extensions */
1330 p = unicode_script_ext_table;
1331 p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1332 c = 0;
1333 while (p < p_end) {
1334 b = *p++;
1335 if (b < 128) {
1336 n = b;
1337 } else if (b < 128 + 64) {
1338 n = (b - 128) << 8;
1339 n |= *p++;
1340 n += 128;
1341 } else {
1342 n = (b - 128 - 64) << 16;
1343 n |= *p++ << 8;
1344 n |= *p++;
1345 n += 128 + (1 << 14);
1346 }
1347 c1 = c + n + 1;
1348 v_len = *p++;
1349 if (is_common) {
1350 if (v_len != 0) {
1351 if (cr_add_interval(cr2, c, c1))
1352 goto fail;
1353 }
1354 } else {
1355 for(i = 0; i < v_len; i++) {
1356 if (p[i] == script_idx) {
1357 if (cr_add_interval(cr2, c, c1))
1358 goto fail;
1359 break;
1360 }
1361 }
1362 }
1363 p += v_len;
1364 c = c1;
1365 }
1366 if (is_common) {
1367 /* remove all the characters with script extensions */
1368 if (cr_invert(cr2))
1369 goto fail;
1370 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1371 CR_OP_INTER))
1372 goto fail;
1373 } else {
1374 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1375 CR_OP_UNION))
1376 goto fail;
1377 }
1378 cr_free(cr1);
1379 cr_free(cr2);
1380 }
1381 return 0;
1382 fail:
1383 if (is_ext) {
1384 cr_free(cr1);
1385 cr_free(cr2);
1386 }
1387 goto fail;
1388}
1389
1390#define M(id) (1U << UNICODE_GC_ ## id)
1391
1392static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1393{
1394 const uint8_t *p, *p_end;
1395 uint32_t c, c0, b, n, v;
1396
1397 p = unicode_gc_table;
1398 p_end = unicode_gc_table + countof(unicode_gc_table);
1399 c = 0;
1400 /* Compressed range encoding:
1401 initial byte:
1402 bits 0..4: category number (special case 31)
1403 bits 5..7: range length (add 1)
1404 special case bits 5..7 == 7: read an extra byte
1405 - 00..7F: range length (add 7 + 1)
1406 - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1407 - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1408 */
1409 while (p < p_end) {
1410 b = *p++;
1411 n = b >> 5;
1412 v = b & 0x1f;
1413 if (n == 7) {
1414 n = *p++;
1415 if (n < 128) {
1416 n += 7;
1417 } else if (n < 128 + 64) {
1418 n = (n - 128) << 8;
1419 n |= *p++;
1420 n += 7 + 128;
1421 } else {
1422 n = (n - 128 - 64) << 16;
1423 n |= *p++ << 8;
1424 n |= *p++;
1425 n += 7 + 128 + (1 << 14);
1426 }
1427 }
1428 c0 = c;
1429 c += n + 1;
1430 if (v == 31) {
1431 /* run of Lu / Ll */
1432 b = gc_mask & (M(Lu) | M(Ll));
1433 if (b != 0) {
1434 if (b == (M(Lu) | M(Ll))) {
1435 goto add_range;
1436 } else {
1437 c0 += ((gc_mask & M(Ll)) != 0);
1438 for(; c0 < c; c0 += 2) {
1439 if (cr_add_interval(cr, c0, c0 + 1))
1440 return -1;
1441 }
1442 }
1443 }
1444 } else if ((gc_mask >> v) & 1) {
1445 add_range:
1446 if (cr_add_interval(cr, c0, c))
1447 return -1;
1448 }
1449 }
1450 return 0;
1451}
1452
1453static int unicode_prop1(CharRange *cr, int prop_idx)
1454{
1455 const uint8_t *p, *p_end;
1456 uint32_t c, c0, b, bit;
1457
1458 p = unicode_prop_table[prop_idx];
1459 p_end = p + unicode_prop_len_table[prop_idx];
1460 c = 0;
1461 bit = 0;
1462 /* Compressed range encoding:
1463 00..3F: 2 packed lengths: 3-bit + 3-bit
1464 40..5F: 5-bits plus extra byte for length
1465 60..7F: 5-bits plus 2 extra bytes for length
1466 80..FF: 7-bit length
1467 lengths must be incremented to get character count
1468 Ranges alternate between false and true return value.
1469 */
1470 while (p < p_end) {
1471 c0 = c;
1472 b = *p++;
1473 if (b < 64) {
1474 c += (b >> 3) + 1;
1475 if (bit) {
1476 if (cr_add_interval(cr, c0, c))
1477 return -1;
1478 }
1479 bit ^= 1;
1480 c0 = c;
1481 c += (b & 7) + 1;
1482 } else if (b >= 0x80) {
1483 c += b - 0x80 + 1;
1484 } else if (b < 0x60) {
1485 c += (((b - 0x40) << 8) | p[0]) + 1;
1486 p++;
1487 } else {
1488 c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
1489 p += 2;
1490 }
1491 if (bit) {
1492 if (cr_add_interval(cr, c0, c))
1493 return -1;
1494 }
1495 bit ^= 1;
1496 }
1497 return 0;
1498}
1499
1500typedef enum {
1501 POP_GC,
1502 POP_PROP,
1503 POP_CASE,
1504 POP_UNION,
1505 POP_INTER,
1506 POP_XOR,
1507 POP_INVERT,
1508 POP_END,
1509} PropOPEnum;
1510
1511#define POP_STACK_LEN_MAX 4
1512
1513static int unicode_prop_ops(CharRange *cr, ...)
1514{
1515 va_list ap;
1516 CharRange stack[POP_STACK_LEN_MAX];
1517 int stack_len, op, ret, i;
1518 uint32_t a;
1519
1520 va_start(ap, cr);
1521 stack_len = 0;
1522 for(;;) {
1523 op = va_arg(ap, int);
1524 switch(op) {
1525 case POP_GC:
1526 assert(stack_len < POP_STACK_LEN_MAX);
1527 a = va_arg(ap, int);
1528 cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1529 if (unicode_general_category1(&stack[stack_len - 1], a))
1530 goto fail;
1531 break;
1532 case POP_PROP:
1533 assert(stack_len < POP_STACK_LEN_MAX);
1534 a = va_arg(ap, int);
1535 cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1536 if (unicode_prop1(&stack[stack_len - 1], a))
1537 goto fail;
1538 break;
1539 case POP_CASE:
1540 assert(stack_len < POP_STACK_LEN_MAX);
1541 a = va_arg(ap, int);
1542 cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1543 if (unicode_case1(&stack[stack_len - 1], a))
1544 goto fail;
1545 break;
1546 case POP_UNION:
1547 case POP_INTER:
1548 case POP_XOR:
1549 {
1550 CharRange *cr1, *cr2, *cr3;
1551 assert(stack_len >= 2);
1552 assert(stack_len < POP_STACK_LEN_MAX);
1553 cr1 = &stack[stack_len - 2];
1554 cr2 = &stack[stack_len - 1];
1555 cr3 = &stack[stack_len++];
1556 cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1557 if (cr_op(cr3, cr1->points, cr1->len,
1558 cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1559 goto fail;
1560 cr_free(cr1);
1561 cr_free(cr2);
1562 *cr1 = *cr3;
1563 stack_len -= 2;
1564 }
1565 break;
1566 case POP_INVERT:
1567 assert(stack_len >= 1);
1568 if (cr_invert(&stack[stack_len - 1]))
1569 goto fail;
1570 break;
1571 case POP_END:
1572 goto done;
1573 default:
1574 abort();
1575 }
1576 }
1577 done:
1578 assert(stack_len == 1);
1579 ret = cr_copy(cr, &stack[0]);
1580 cr_free(&stack[0]);
1581 return ret;
1582 fail:
1583 for(i = 0; i < stack_len; i++)
1584 cr_free(&stack[i]);
1585 return -1;
1586}
1587
1588static const uint32_t unicode_gc_mask_table[] = {
1589 M(Lu) | M(Ll) | M(Lt), /* LC */
1590 M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1591 M(Mn) | M(Mc) | M(Me), /* M */
1592 M(Nd) | M(Nl) | M(No), /* N */
1593 M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1594 M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1595 M(Zs) | M(Zl) | M(Zp), /* Z */
1596 M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1597};
1598
1599/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1600 if not found */
1601int unicode_general_category(CharRange *cr, const char *gc_name)
1602{
1603 int gc_idx;
1604 uint32_t gc_mask;
1605
1606 gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1607 if (gc_idx < 0)
1608 return -2;
1609 if (gc_idx <= UNICODE_GC_Co) {
1610 gc_mask = (uint64_t)1 << gc_idx;
1611 } else {
1612 gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1613 }
1614 return unicode_general_category1(cr, gc_mask);
1615}
1616
1617
1618/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1619 if not found */
1620int unicode_prop(CharRange *cr, const char *prop_name)
1621{
1622 int prop_idx, ret;
1623
1624 prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1625 if (prop_idx < 0)
1626 return -2;
1627 prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1628
1629 ret = 0;
1630 switch(prop_idx) {
1631 case UNICODE_PROP_ASCII:
1632 if (cr_add_interval(cr, 0x00, 0x7f + 1))
1633 return -1;
1634 break;
1635 case UNICODE_PROP_Any:
1636 if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1637 return -1;
1638 break;
1639 case UNICODE_PROP_Assigned:
1640 ret = unicode_prop_ops(cr,
1641 POP_GC, M(Cn),
1642 POP_INVERT,
1643 POP_END);
1644 break;
1645 case UNICODE_PROP_Math:
1646 ret = unicode_prop_ops(cr,
1647 POP_GC, M(Sm),
1648 POP_PROP, UNICODE_PROP_Other_Math,
1649 POP_UNION,
1650 POP_END);
1651 break;
1652 case UNICODE_PROP_Lowercase:
1653 ret = unicode_prop_ops(cr,
1654 POP_GC, M(Ll),
1655 POP_PROP, UNICODE_PROP_Other_Lowercase,
1656 POP_UNION,
1657 POP_END);
1658 break;
1659 case UNICODE_PROP_Uppercase:
1660 ret = unicode_prop_ops(cr,
1661 POP_GC, M(Lu),
1662 POP_PROP, UNICODE_PROP_Other_Uppercase,
1663 POP_UNION,
1664 POP_END);
1665 break;
1666 case UNICODE_PROP_Cased:
1667 ret = unicode_prop_ops(cr,
1668 POP_GC, M(Lu) | M(Ll) | M(Lt),
1669 POP_PROP, UNICODE_PROP_Other_Uppercase,
1670 POP_UNION,
1671 POP_PROP, UNICODE_PROP_Other_Lowercase,
1672 POP_UNION,
1673 POP_END);
1674 break;
1675 case UNICODE_PROP_Alphabetic:
1676 ret = unicode_prop_ops(cr,
1677 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1678 POP_PROP, UNICODE_PROP_Other_Uppercase,
1679 POP_UNION,
1680 POP_PROP, UNICODE_PROP_Other_Lowercase,
1681 POP_UNION,
1682 POP_PROP, UNICODE_PROP_Other_Alphabetic,
1683 POP_UNION,
1684 POP_END);
1685 break;
1686 case UNICODE_PROP_Grapheme_Base:
1687 ret = unicode_prop_ops(cr,
1688 POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1689 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1690 POP_UNION,
1691 POP_INVERT,
1692 POP_END);
1693 break;
1694 case UNICODE_PROP_Grapheme_Extend:
1695 ret = unicode_prop_ops(cr,
1696 POP_GC, M(Me) | M(Mn),
1697 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1698 POP_UNION,
1699 POP_END);
1700 break;
1701 case UNICODE_PROP_XID_Start:
1702 ret = unicode_prop_ops(cr,
1703 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1704 POP_PROP, UNICODE_PROP_Other_ID_Start,
1705 POP_UNION,
1706 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1707 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1708 POP_UNION,
1709 POP_PROP, UNICODE_PROP_XID_Start1,
1710 POP_UNION,
1711 POP_INVERT,
1712 POP_INTER,
1713 POP_END);
1714 break;
1715 case UNICODE_PROP_XID_Continue:
1716 ret = unicode_prop_ops(cr,
1717 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1718 M(Mn) | M(Mc) | M(Nd) | M(Pc),
1719 POP_PROP, UNICODE_PROP_Other_ID_Start,
1720 POP_UNION,
1721 POP_PROP, UNICODE_PROP_Other_ID_Continue,
1722 POP_UNION,
1723 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1724 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1725 POP_UNION,
1726 POP_PROP, UNICODE_PROP_XID_Continue1,
1727 POP_UNION,
1728 POP_INVERT,
1729 POP_INTER,
1730 POP_END);
1731 break;
1732 case UNICODE_PROP_Changes_When_Uppercased:
1733 ret = unicode_case1(cr, CASE_U);
1734 break;
1735 case UNICODE_PROP_Changes_When_Lowercased:
1736 ret = unicode_case1(cr, CASE_L);
1737 break;
1738 case UNICODE_PROP_Changes_When_Casemapped:
1739 ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1740 break;
1741 case UNICODE_PROP_Changes_When_Titlecased:
1742 ret = unicode_prop_ops(cr,
1743 POP_CASE, CASE_U,
1744 POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1745 POP_XOR,
1746 POP_END);
1747 break;
1748 case UNICODE_PROP_Changes_When_Casefolded:
1749 ret = unicode_prop_ops(cr,
1750 POP_CASE, CASE_F,
1751 POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1752 POP_XOR,
1753 POP_END);
1754 break;
1755 case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1756 ret = unicode_prop_ops(cr,
1757 POP_CASE, CASE_F,
1758 POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1759 POP_XOR,
1760 POP_END);
1761 break;
1762#if 0
1763 case UNICODE_PROP_ID_Start:
1764 ret = unicode_prop_ops(cr,
1765 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1766 POP_PROP, UNICODE_PROP_Other_ID_Start,
1767 POP_UNION,
1768 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1769 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1770 POP_UNION,
1771 POP_INVERT,
1772 POP_INTER,
1773 POP_END);
1774 break;
1775 case UNICODE_PROP_ID_Continue:
1776 ret = unicode_prop_ops(cr,
1777 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1778 M(Mn) | M(Mc) | M(Nd) | M(Pc),
1779 POP_PROP, UNICODE_PROP_Other_ID_Start,
1780 POP_UNION,
1781 POP_PROP, UNICODE_PROP_Other_ID_Continue,
1782 POP_UNION,
1783 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1784 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1785 POP_UNION,
1786 POP_INVERT,
1787 POP_INTER,
1788 POP_END);
1789 break;
1790 case UNICODE_PROP_Case_Ignorable:
1791 ret = unicode_prop_ops(cr,
1792 POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1793 POP_PROP, UNICODE_PROP_Case_Ignorable1,
1794 POP_XOR,
1795 POP_END);
1796 break;
1797#else
1798 /* we use the existing tables */
1799 case UNICODE_PROP_ID_Continue:
1800 ret = unicode_prop_ops(cr,
1801 POP_PROP, UNICODE_PROP_ID_Start,
1802 POP_PROP, UNICODE_PROP_ID_Continue1,
1803 POP_XOR,
1804 POP_END);
1805 break;
1806#endif
1807 default:
1808 if (prop_idx >= countof(unicode_prop_table))
1809 return -2;
1810 ret = unicode_prop1(cr, prop_idx);
1811 break;
1812 }
1813 return ret;
1814}
1815
1816#endif /* CONFIG_ALL_UNICODE */
1817
1818/*---- lre codepoint categorizing functions ----*/
1819
1820#define S UNICODE_C_SPACE
1821#define D UNICODE_C_DIGIT
1822#define X UNICODE_C_XDIGIT
1823#define U UNICODE_C_UPPER
1824#define L UNICODE_C_LOWER
1825#define _ UNICODE_C_UNDER
1826#define d UNICODE_C_DOLLAR
1827
1828uint8_t const lre_ctype_bits[256] = {
1829 0, 0, 0, 0, 0, 0, 0, 0,
1830 0, S, S, S, S, S, 0, 0,
1831 0, 0, 0, 0, 0, 0, 0, 0,
1832 0, 0, 0, 0, 0, 0, 0, 0,
1833
1834 S, 0, 0, 0, d, 0, 0, 0,
1835 0, 0, 0, 0, 0, 0, 0, 0,
1836 X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1837 X|D, X|D, 0, 0, 0, 0, 0, 0,
1838
1839 0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1840 U, U, U, U, U, U, U, U,
1841 U, U, U, U, U, U, U, U,
1842 U, U, U, 0, 0, 0, 0, _,
1843
1844 0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1845 L, L, L, L, L, L, L, L,
1846 L, L, L, L, L, L, L, L,
1847 L, L, L, 0, 0, 0, 0, 0,
1848
1849 0, 0, 0, 0, 0, 0, 0, 0,
1850 0, 0, 0, 0, 0, 0, 0, 0,
1851 0, 0, 0, 0, 0, 0, 0, 0,
1852 0, 0, 0, 0, 0, 0, 0, 0,
1853
1854 S, 0, 0, 0, 0, 0, 0, 0,
1855 0, 0, 0, 0, 0, 0, 0, 0,
1856 0, 0, 0, 0, 0, 0, 0, 0,
1857 0, 0, 0, 0, 0, 0, 0, 0,
1858
1859 0, 0, 0, 0, 0, 0, 0, 0,
1860 0, 0, 0, 0, 0, 0, 0, 0,
1861 0, 0, 0, 0, 0, 0, 0, 0,
1862 0, 0, 0, 0, 0, 0, 0, 0,
1863
1864 0, 0, 0, 0, 0, 0, 0, 0,
1865 0, 0, 0, 0, 0, 0, 0, 0,
1866 0, 0, 0, 0, 0, 0, 0, 0,
1867 0, 0, 0, 0, 0, 0, 0, 0,
1868};
1869
1870#undef S
1871#undef D
1872#undef X
1873#undef U
1874#undef L
1875#undef _
1876#undef d
1877
1878/* code point ranges for Zs,Zl or Zp property */
1879static const uint16_t char_range_s[] = {
1880 10,
1881 0x0009, 0x000D + 1,
1882 0x0020, 0x0020 + 1,
1883 0x00A0, 0x00A0 + 1,
1884 0x1680, 0x1680 + 1,
1885 0x2000, 0x200A + 1,
1886 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
1887 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
1888 0x2028, 0x2029 + 1,
1889 0x202F, 0x202F + 1,
1890 0x205F, 0x205F + 1,
1891 0x3000, 0x3000 + 1,
1892 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
1893 0xFEFF, 0xFEFF + 1,
1894};
1895
1896BOOL lre_is_space_non_ascii(uint32_t c)
1897{
1898 size_t i, n;
1899
1900 n = countof(char_range_s);
1901 for(i = 5; i < n; i += 2) {
1902 uint32_t low = char_range_s[i];
1903 uint32_t high = char_range_s[i + 1];
1904 if (c < low)
1905 return FALSE;
1906 if (c < high)
1907 return TRUE;
1908 }
1909 return FALSE;
1910}
1911