libunicode.h source code [QuickJS/libunicode.h]

1	/*
2	* Unicode utilities
3	*
4	* Copyright (c) 2017-2018 Fabrice Bellard
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a copy
7	* of this software and associated documentation files (the "Software"), to deal
8	* in the Software without restriction, including without limitation the rights
9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10	* copies of the Software, and to permit persons to whom the Software is
11	* furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22	* THE SOFTWARE.
23	*/
24	#ifndef LIBUNICODE_H
25	#define LIBUNICODE_H
26
#include <stddef.h>
#include <stdint.h>
28
/* define it to include all the unicode tables (40KB larger) */
#define CONFIG_ALL_UNICODE

/* NOTE(review): presumably the maximum number of code points written by
   a single case conversion -- confirm against lre_case_conv() */
#define LRE_CC_RES_LEN_MAX 3
33
/* char ranges */

/*
 * Array of code points describing a set of character ranges, together
 * with the allocator callback used to resize it.
 */
typedef struct {
    int len; /* in points, always even */
    int size; /* compared against len before appending; presumably the
                 allocated capacity of 'points', in points -- confirm */
    uint32_t *points; /* points sorted by increasing value */
    void *mem_opaque; /* NOTE(review): presumably handed back to
                         realloc_func as 'opaque' -- confirm */
    void *(*realloc_func)(void *opaque, void *ptr, size_t size);
} CharRange;
43
/* Set operations on char ranges.
   NOTE(review): presumably the values accepted by the 'op' argument of
   cr_op() -- confirm against the implementation. */
typedef enum {
    CR_OP_UNION,
    CR_OP_INTER,
    CR_OP_XOR,
} CharRangeOpEnum;
49
50	void cr_init(CharRange cr, void* mem_opaque, void* (realloc_func)(void opaque, void* *ptr, size_t size));
51	void cr_free(CharRange *cr);
52	int cr_realloc(CharRange cr, int* size);
53	int cr_copy(CharRange cr, const* CharRange *cr1);
54
55	static inline int cr_add_point(CharRange *cr, uint32_t v)
56	{
57	if (cr->len >= cr->size) {
58	if (cr_realloc(cr, cr->len + `1`))
59	return -`1`;
60	}
61	cr->points[cr->len++] = v;
62	return `0`;
63	}
64
65	static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
66	{
67	if ((cr->len + `2`) > cr->size) {
68	if (cr_realloc(cr, cr->len + `2`))
69	return -`1`;
70	}
71	cr->points[cr->len++] = c1;
72	cr->points[cr->len++] = c2;
73	return `0`;
74	}
75
76	int cr_union1(CharRange cr, const* uint32_t b_pt, int* b_len);
77
78	static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
79	{
80	uint32_t b_pt[`2`];
81	b_pt[`0`] = c1;
82	b_pt[`1`] = c2 + `1`;
83	return cr_union1(cr, b_pt, `2`);
84	}
85
86	int cr_op(CharRange cr, const* uint32_t a_pt, int* a_len,
87	const uint32_t b_pt, int* b_len, int op);
88
89	int cr_invert(CharRange *cr);
90
91	int cr_regexp_canonicalize(CharRange cr, int* is_unicode);
92
/* Unicode normalization forms selectable through unicode_normalize() */
typedef enum {
    UNICODE_NFC,
    UNICODE_NFD,
    UNICODE_NFKC,
    UNICODE_NFKD,
} UnicodeNormalizationEnum;

/* NOTE(review): presumably normalizes the 'src_len' code points at
   'src' to the form 'n_type' and stores a buffer obtained through
   'realloc_func' in '*pdst'; the meaning of the return value and the
   ownership of '*pdst' are not visible here -- confirm against the
   implementation. */
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                      UnicodeNormalizationEnum n_type,
                      void *opaque,
                      void *(*realloc_func)(void *opaque, void *ptr, size_t size));
103
104	/ Unicode character range functions /
105
106	int unicode_script(CharRange cr, const* char script_name, int* is_ext);
107	int unicode_general_category(CharRange cr, const* char *gc_name);
108	int unicode_prop(CharRange cr, const* char *prop_name);
109
/* NOTE(review): presumably writes the case-converted form of 'c' to
   'res' (see LRE_CC_RES_LEN_MAX for the likely capacity) and returns
   the number of code points written; the meaning of 'conv_type' is not
   visible here -- confirm against the implementation. */
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
/* NOTE(review): presumably returns the regexp canonical form of 'c' --
   confirm against the implementation. */
int lre_canonicalize(uint32_t c, int is_unicode);
112
/* Code point type categories */
/* One bit per category, so that several categories can be OR-ed into a
   single mask and tested against an entry of lre_ctype_bits[]. */
enum {
    UNICODE_C_SPACE  = (1 << 0),
    UNICODE_C_DIGIT  = (1 << 1),
    UNICODE_C_UPPER  = (1 << 2),
    UNICODE_C_LOWER  = (1 << 3),
    UNICODE_C_UNDER  = (1 << 4),
    UNICODE_C_DOLLAR = (1 << 5),
    UNICODE_C_XDIGIT = (1 << 6),
};
/* category mask of each code point below 256, indexed by code point */
extern uint8_t const lre_ctype_bits[256];
124
/* zero or non-zero return value */
/* NOTE(review): the property tested by each predicate is inferred from
   its name -- confirm against the implementation. */
int lre_is_cased(uint32_t c);
int lre_is_case_ignorable(uint32_t c);
int lre_is_id_start(uint32_t c);
int lre_is_id_continue(uint32_t c);
130
131	static inline int lre_is_space_byte(uint8_t c) {
132	return lre_ctype_bits[c] & UNICODE_C_SPACE;
133	}
134
135	static inline int lre_is_id_start_byte(uint8_t c) {
136	return lre_ctype_bits[c] & (UNICODE_C_UPPER \| UNICODE_C_LOWER \|
137	UNICODE_C_UNDER \| UNICODE_C_DOLLAR);
138	}
139
140	static inline int lre_is_id_continue_byte(uint8_t c) {
141	return lre_ctype_bits[c] & (UNICODE_C_UPPER \| UNICODE_C_LOWER \|
142	UNICODE_C_UNDER \| UNICODE_C_DOLLAR \|
143	UNICODE_C_DIGIT);
144	}
145
/* Space test used by lre_is_space() for code points >= 256.
   NOTE(review): behavior for smaller values is not visible here --
   confirm against the implementation. */
int lre_is_space_non_ascii(uint32_t c);
147
/* Return non-zero when code point 'c' is a space character. Code points
   below 256 are answered from the byte table, all others by
   lre_is_space_non_ascii(). */
static inline int lre_is_space(uint32_t c) {
    if (c < 256)
        return lre_is_space_byte((uint8_t)c); /* c fits in a byte here */
    else
        return lre_is_space_non_ascii(c);
}
154
/* Return non-zero when code point 'c' may start a JS identifier.
   ASCII code points (< 128) are answered by lre_is_id_start_byte().
   For the others, lre_is_id_start() is used when CONFIG_ALL_UNICODE is
   defined; without it, every code point that lre_is_space_non_ascii()
   does not report as a space is accepted. */
static inline int lre_js_is_ident_first(uint32_t c) {
    if (c < 128) {
        return lre_is_id_start_byte((uint8_t)c);
    } else {
#ifdef CONFIG_ALL_UNICODE
        return lre_is_id_start(c);
#else
        return !lre_is_space_non_ascii(c);
#endif
    }
}
166
/* Return non-zero when code point 'c' may continue a JS identifier.
   ASCII code points (< 128) are answered by lre_is_id_continue_byte().
   U+200C and U+200D are always accepted. For the remaining code points,
   lre_is_id_continue() is used when CONFIG_ALL_UNICODE is defined;
   without it, every code point that lre_is_space_non_ascii() does not
   report as a space is accepted. */
static inline int lre_js_is_ident_next(uint32_t c) {
    if (c < 128) {
        return lre_is_id_continue_byte((uint8_t)c);
    } else {
        /* ZWNJ and ZWJ are accepted in identifiers */
        if (c >= 0x200C && c <= 0x200D)
            return 1; /* plain 1: this header defines no TRUE macro */
#ifdef CONFIG_ALL_UNICODE
        return lre_is_id_continue(c);
#else
        return !lre_is_space_non_ascii(c);
#endif
    }
}
181
182	#endif /* LIBUNICODE_H */
183

Browse the source code of QuickJS/libunicode.h