1/*
2 * Unicode utilities
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24#ifndef LIBUNICODE_H
25#define LIBUNICODE_H
26
#include <stddef.h>
#include <stdint.h>
28
/* define it to include all the unicode tables (40KB larger).
   When defined, the lre_js_is_ident_*() helpers in this header call
   lre_is_id_start()/lre_is_id_continue() for code points >= 128;
   otherwise they fall back to !lre_is_space_non_ascii(). */
#define CONFIG_ALL_UNICODE

/* NOTE(review): presumably the maximum number of code points that
   lre_case_conv() may store in its 'res' buffer -- confirm against the
   implementation. */
#define LRE_CC_RES_LEN_MAX 3
33
34/* char ranges */
35
/* Growable array of code points used to describe a set of characters,
   bundled with the reallocation callback used to manage its storage. */
typedef struct {
    int len; /* in points, always even */
    int size; /* compared against len before appending; presumably the
                 capacity of 'points', counted in points -- confirm in
                 cr_realloc() */
    uint32_t *points; /* points sorted by increasing value */
    void *mem_opaque; /* presumably forwarded as 'opaque' to realloc_func
                         -- confirm in the implementation */
    void *(*realloc_func)(void *opaque, void *ptr, size_t size);
} CharRange;
43
/* Selector for a set operation on character ranges.
   NOTE(review): presumably the value passed as 'op' to cr_op(), whose
   parameter is declared as plain int -- confirm. */
typedef enum {
    CR_OP_UNION,
    CR_OP_INTER,
    CR_OP_XOR,
} CharRangeOpEnum;
49
/* CharRange set-up, tear-down and storage management. The definitions live
   outside this header. The inline helpers cr_add_point() and
   cr_add_interval() treat a non-zero return from cr_realloc() as failure;
   cr_copy() presumably follows the same convention -- confirm. */
void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
void cr_free(CharRange *cr);
int cr_realloc(CharRange *cr, int size);
int cr_copy(CharRange *cr, const CharRange *cr1);
54
55static inline int cr_add_point(CharRange *cr, uint32_t v)
56{
57 if (cr->len >= cr->size) {
58 if (cr_realloc(cr, cr->len + 1))
59 return -1;
60 }
61 cr->points[cr->len++] = v;
62 return 0;
63}
64
65static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
66{
67 if ((cr->len + 2) > cr->size) {
68 if (cr_realloc(cr, cr->len + 2))
69 return -1;
70 }
71 cr->points[cr->len++] = c1;
72 cr->points[cr->len++] = c2;
73 return 0;
74}
75
76int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len);
77
78static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
79{
80 uint32_t b_pt[2];
81 b_pt[0] = c1;
82 b_pt[1] = c2 + 1;
83 return cr_union1(cr, b_pt, 2);
84}
85
/* Operations on character ranges; the definitions live outside this header.
   NOTE(review): 'op' is presumably one of the CharRangeOpEnum values and
   the int results presumably follow the 0 / non-zero failure convention
   used with cr_realloc() -- confirm against the implementation. */
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
          const uint32_t *b_pt, int b_len, int op);

int cr_invert(CharRange *cr);

int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
92
/* Normalization form selector, passed as 'n_type' to unicode_normalize(). */
typedef enum {
    UNICODE_NFC,
    UNICODE_NFD,
    UNICODE_NFKC,
    UNICODE_NFKD,
} UnicodeNormalizationEnum;
99
/* Normalize the 'src_len' code points at 'src' according to 'n_type'.
   The definition lives outside this header.
   NOTE(review): presumably the result buffer is obtained through
   realloc_func(opaque, ...) and stored in *pdst, with the int return value
   reporting its length or an error -- confirm against the implementation,
   including who releases *pdst. */
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                      UnicodeNormalizationEnum n_type,
                      void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
103
104/* Unicode character range functions */
105
/* Each of these three takes a CharRange to work on and a name string; the
   definitions live outside this header.
   NOTE(review): presumably they fill 'cr' with the code points matching
   the named script / general category / property -- confirm, together
   with the meaning of the int return values. */
int unicode_script(CharRange *cr, const char *script_name, int is_ext);
int unicode_general_category(CharRange *cr, const char *gc_name);
int unicode_prop(CharRange *cr, const char *prop_name);

/* Case conversion / canonicalization of a single code point; defined
   outside this header.
   NOTE(review): 'res' presumably needs room for LRE_CC_RES_LEN_MAX code
   points -- confirm. */
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
int lre_canonicalize(uint32_t c, int is_unicode);
112
113/* Code point type categories */
/* Bit flags; the lre_is_*_byte() helpers in this header test them against
   entries of lre_ctype_bits[]. */
enum {
    UNICODE_C_SPACE = (1 << 0),
    UNICODE_C_DIGIT = (1 << 1),
    UNICODE_C_UPPER = (1 << 2),
    UNICODE_C_LOWER = (1 << 3),
    UNICODE_C_UNDER = (1 << 4),
    UNICODE_C_DOLLAR = (1 << 5),
    UNICODE_C_XDIGIT = (1 << 6),
};
/* One flag byte per 8-bit value, indexed directly by the byte; defined
   outside this header. */
extern uint8_t const lre_ctype_bits[256];
124
/* zero or non-zero return value */
/* Predicates over a single code point; the definitions live outside this
   header. lre_is_id_start() and lre_is_id_continue() are only called by
   the inline helpers in this header when CONFIG_ALL_UNICODE is defined. */
int lre_is_cased(uint32_t c);
int lre_is_case_ignorable(uint32_t c);
int lre_is_id_start(uint32_t c);
int lre_is_id_continue(uint32_t c);
130
131static inline int lre_is_space_byte(uint8_t c) {
132 return lre_ctype_bits[c] & UNICODE_C_SPACE;
133}
134
135static inline int lre_is_id_start_byte(uint8_t c) {
136 return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
137 UNICODE_C_UNDER | UNICODE_C_DOLLAR);
138}
139
140static inline int lre_is_id_continue_byte(uint8_t c) {
141 return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
142 UNICODE_C_UNDER | UNICODE_C_DOLLAR |
143 UNICODE_C_DIGIT);
144}
145
/* Space test used for code points that are not handled by the byte table;
   defined outside this header. */
int lre_is_space_non_ascii(uint32_t c);

/* Space test for any code point: values below 256 are answered from the
   byte table via lre_is_space_byte(), everything else is delegated to
   lre_is_space_non_ascii(). Zero or non-zero return value. */
static inline int lre_is_space(uint32_t c)
{
    return (c < 256) ? lre_is_space_byte(c) : lre_is_space_non_ascii(c);
}
154
/* Zero or non-zero: may 'c' start an identifier?
   Code points below 128 are answered by lre_is_id_start_byte(). Above
   that, lre_is_id_start() decides when CONFIG_ALL_UNICODE is defined;
   without it, every code point that lre_is_space_non_ascii() does not
   report as a space is accepted. */
static inline int lre_js_is_ident_first(uint32_t c)
{
    if (c < 128)
        return lre_is_id_start_byte(c);

#ifdef CONFIG_ALL_UNICODE
    return lre_is_id_start(c);
#else
    return !lre_is_space_non_ascii(c);
#endif
}
166
/* Zero or non-zero: may 'c' appear after the first character of an
   identifier?
   Code points below 128 are answered by lre_is_id_continue_byte().
   U+200C and U+200D are always accepted. For the rest,
   lre_is_id_continue() decides when CONFIG_ALL_UNICODE is defined; without
   it, every code point that lre_is_space_non_ascii() does not report as a
   space is accepted.
   The literal 1 is returned instead of TRUE so that this header does not
   depend on a macro/enumerator it neither defines nor includes. */
static inline int lre_js_is_ident_next(uint32_t c)
{
    if (c < 128)
        return lre_is_id_continue_byte(c);

    /* ZWNJ and ZWJ are accepted in identifiers */
    if (c >= 0x200C && c <= 0x200D)
        return 1;

#ifdef CONFIG_ALL_UNICODE
    return lre_is_id_continue(c);
#else
    return !lre_is_space_non_ascii(c);
#endif
}
181
182#endif /* LIBUNICODE_H */
183