1 | /* |
2 | * Unicode utilities |
3 | * |
4 | * Copyright (c) 2017-2018 Fabrice Bellard |
5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | * of this software and associated documentation files (the "Software"), to deal |
8 | * in the Software without restriction, including without limitation the rights |
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | * copies of the Software, and to permit persons to whom the Software is |
11 | * furnished to do so, subject to the following conditions: |
12 | * |
13 | * The above copyright notice and this permission notice shall be included in |
14 | * all copies or substantial portions of the Software. |
15 | * |
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
22 | * THE SOFTWARE. |
23 | */ |
24 | #ifndef LIBUNICODE_H |
25 | #define LIBUNICODE_H |
26 | |
#include <stddef.h>
#include <stdint.h>
28 | |
/* Define CONFIG_ALL_UNICODE to include all the Unicode tables (40KB larger). */
30 | #define CONFIG_ALL_UNICODE |
31 | |
32 | #define LRE_CC_RES_LEN_MAX 3 |
33 | |
34 | /* char ranges */ |
35 | |
/*
 * Growable array of code points describing a set of character ranges.
 * NOTE(review): cr_union_interval() in this header submits an interval as
 * the pair {c1, c2 + 1}, which suggests consecutive points form half-open
 * [start, end) ranges -- confirm against the implementation.
 */
typedef struct {
    /* number of entries in use, counted in points; always even */
    int len;
    /* compared against `len` by cr_add_point()/cr_add_interval() before
       they write, i.e. the capacity of `points` in entries */
    int size;
    /* the points, sorted by increasing value */
    uint32_t *points;
    /* presumably the `opaque` value handed to realloc_func -- confirm */
    void *mem_opaque;
    /* allocator callback; cr_init() accepts one with this same signature */
    void *(*realloc_func)(void *opaque, void *ptr, size_t size);
} CharRange;
43 | |
/*
 * Set-operation selector.
 * NOTE(review): presumably the value passed as the `op` argument of cr_op();
 * that parameter is declared as a plain int, so confirm in the
 * implementation.
 */
typedef enum {
    CR_OP_UNION = 0,
    CR_OP_INTER = 1,
    CR_OP_XOR   = 2,
} CharRangeOpEnum;
49 | |
50 | void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size)); |
51 | void cr_free(CharRange *cr); |
52 | int cr_realloc(CharRange *cr, int size); |
53 | int cr_copy(CharRange *cr, const CharRange *cr1); |
54 | |
55 | static inline int cr_add_point(CharRange *cr, uint32_t v) |
56 | { |
57 | if (cr->len >= cr->size) { |
58 | if (cr_realloc(cr, cr->len + 1)) |
59 | return -1; |
60 | } |
61 | cr->points[cr->len++] = v; |
62 | return 0; |
63 | } |
64 | |
65 | static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2) |
66 | { |
67 | if ((cr->len + 2) > cr->size) { |
68 | if (cr_realloc(cr, cr->len + 2)) |
69 | return -1; |
70 | } |
71 | cr->points[cr->len++] = c1; |
72 | cr->points[cr->len++] = c2; |
73 | return 0; |
74 | } |
75 | |
76 | int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len); |
77 | |
78 | static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2) |
79 | { |
80 | uint32_t b_pt[2]; |
81 | b_pt[0] = c1; |
82 | b_pt[1] = c2 + 1; |
83 | return cr_union1(cr, b_pt, 2); |
84 | } |
85 | |
86 | int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, |
87 | const uint32_t *b_pt, int b_len, int op); |
88 | |
89 | int cr_invert(CharRange *cr); |
90 | |
91 | int cr_regexp_canonicalize(CharRange *cr, int is_unicode); |
92 | |
/* Normalization form selector; the type of unicode_normalize()'s `n_type`. */
typedef enum {
    UNICODE_NFC  = 0,
    UNICODE_NFD  = 1,
    UNICODE_NFKC = 2,
    UNICODE_NFKD = 3,
} UnicodeNormalizationEnum;
99 | |
100 | int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, |
101 | UnicodeNormalizationEnum n_type, |
102 | void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size)); |
103 | |
104 | /* Unicode character range functions */ |
105 | |
106 | int unicode_script(CharRange *cr, const char *script_name, int is_ext); |
107 | int unicode_general_category(CharRange *cr, const char *gc_name); |
108 | int unicode_prop(CharRange *cr, const char *prop_name); |
109 | |
110 | int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); |
111 | int lre_canonicalize(uint32_t c, int is_unicode); |
112 | |
/*
 * Code point type categories.
 * One bit per category; the *_byte() helpers in this header test these bits
 * against the entries of lre_ctype_bits[].
 */
enum {
    UNICODE_C_SPACE  = 0x01,
    UNICODE_C_DIGIT  = 0x02,
    UNICODE_C_UPPER  = 0x04,
    UNICODE_C_LOWER  = 0x08,
    UNICODE_C_UNDER  = 0x10,
    UNICODE_C_DOLLAR = 0x20,
    UNICODE_C_XDIGIT = 0x40,
};
/* Table of UNICODE_C_* flag bits, indexed by byte value 0..255
   (defined outside this header). */
extern const uint8_t lre_ctype_bits[256];
124 | |
/*
 * Code point predicates, implemented outside this header.
 * Each returns zero or a non-zero value (not necessarily 1).
 */
int lre_is_cased(uint32_t c);
int lre_is_case_ignorable(uint32_t c);
int lre_is_id_start(uint32_t c);
int lre_is_id_continue(uint32_t c);
130 | |
131 | static inline int lre_is_space_byte(uint8_t c) { |
132 | return lre_ctype_bits[c] & UNICODE_C_SPACE; |
133 | } |
134 | |
135 | static inline int lre_is_id_start_byte(uint8_t c) { |
136 | return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | |
137 | UNICODE_C_UNDER | UNICODE_C_DOLLAR); |
138 | } |
139 | |
140 | static inline int lre_is_id_continue_byte(uint8_t c) { |
141 | return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | |
142 | UNICODE_C_UNDER | UNICODE_C_DOLLAR | |
143 | UNICODE_C_DIGIT); |
144 | } |
145 | |
/* Space test implemented outside this header; lre_is_space() calls it for
   code points >= 256. */
int lre_is_space_non_ascii(uint32_t c);
147 | |
/*
 * Space test for an arbitrary code point.
 * Values below 256 are answered from the byte table via lre_is_space_byte();
 * everything else is delegated to lre_is_space_non_ascii().
 * Returns zero or a non-zero value.
 */
static inline int lre_is_space(uint32_t c)
{
    return (c < 256) ? lre_is_space_byte(c)
                     : lre_is_space_non_ascii(c);
}
154 | |
/*
 * Tests whether `c` may begin a JS identifier.
 *
 * - c < 128: answered by lre_is_id_start_byte().
 * - otherwise, with CONFIG_ALL_UNICODE defined: answered by
 *   lre_is_id_start().
 * - otherwise: every code point that lre_is_space_non_ascii() rejects is
 *   accepted.
 *
 * Returns zero or a non-zero value.
 */
static inline int lre_js_is_ident_first(uint32_t c)
{
    if (c < 128) {
        return lre_is_id_start_byte(c);
    }
#ifdef CONFIG_ALL_UNICODE
    return lre_is_id_start(c);
#else
    return !lre_is_space_non_ascii(c);
#endif
}
166 | |
/*
 * Tests whether `c` may appear after the first character of a JS
 * identifier.
 *
 * - c < 128: answered by lre_is_id_continue_byte().
 * - U+200C (ZWNJ) and U+200D (ZWJ): always accepted, returns 1.
 * - otherwise, with CONFIG_ALL_UNICODE defined: answered by
 *   lre_is_id_continue().
 * - otherwise: every code point that lre_is_space_non_ascii() rejects is
 *   accepted.
 *
 * Returns zero or a non-zero value.
 */
static inline int lre_js_is_ident_next(uint32_t c)
{
    if (c < 128) {
        return lre_is_id_continue_byte(c);
    }
    /* ZWNJ and ZWJ are accepted in identifiers */
    if (c >= 0x200C && c <= 0x200D) {
        /* literal 1 rather than TRUE: this header neither defines nor
           includes a definition of TRUE */
        return 1;
    }
#ifdef CONFIG_ALL_UNICODE
    return lre_is_id_continue(c);
#else
    return !lre_is_space_non_ascii(c);
#endif
}
181 | |
182 | #endif /* LIBUNICODE_H */ |
183 | |