1/*
2 * Copyright (c) 2015, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Character classes and their mnemonics.
31 */
32#include "Parser.h"
33#include "ComponentClass.h"
34#include "AsciiComponentClass.h"
35#include "ucp_table.h"
36#include "Utf8ComponentClass.h"
37#include "util/charreach.h"
38#include "util/make_unique.h"
39
40#include <boost/icl/interval_set.hpp>
41
42using namespace std;
43
44namespace ue2 {
45
46static
47CharReach to_cr(const CodePointSet &cps) {
48 CharReach cr;
49 for (const auto &cp : cps) {
50 if (lower(cp) >= CharReach::npos) {
51 break;
52 }
53
54 cr.setRange(lower(cp), MIN(upper(cp), CharReach::npos - 1));
55 }
56
57 return cr;
58}
59
60CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) {
61 const CharReach lower('a', 'z');
62 const CharReach upper('A', 'Z');
63 const CharReach number('0', '9');
64 switch (c) {
65 case CLASS_ALNUM:
66 return lower | upper | number;
67 case CLASS_ALPHA:
68 return lower | upper;
69 case CLASS_ANY:
70 if (mode.dotall) {
71 return ~CharReach();
72 } else {
73 return ~CharReach('\n');
74 }
75 case CLASS_ASCII:
76 return CharReach(0, 127);
77 case CLASS_BLANK:
78 return CharReach(" \t");
79 case CLASS_CNTRL:
80 return CharReach(0, 31) | CharReach(127 /* del */);
81 case CLASS_DIGIT:
82 return number;
83 case CLASS_GRAPH:
84 return CharReach(0x21, 0x7e);
85 case CLASS_XGRAPH:
86 return to_cr(getPredefinedCodePointSet(c, mode));
87 case CLASS_HORZ:
88 return CharReach("\x09\x20\xA0");
89 case CLASS_LOWER:
90 if (mode.caseless) {
91 return lower | upper;
92 } else {
93 return lower;
94 }
95 case CLASS_PRINT:
96 return CharReach(0x20, 0x7e);
97 case CLASS_XPRINT:
98 return to_cr(getPredefinedCodePointSet(c, mode));
99 case CLASS_PUNCT:
100 return CharReach(0x21, '0' - 1)
101 | CharReach('9' + 1, 'A' - 1)
102 | CharReach('Z' + 1, 'a' - 1)
103 | CharReach('z' + 1, 126);
104 case CLASS_XPUNCT:
105 return to_cr(getPredefinedCodePointSet(c, mode));
106 case CLASS_SPACE:
107 return CharReach("\x09\x0a\x0c\x0b\x0d\x20");
108 case CLASS_UPPER:
109 if (mode.caseless) {
110 return lower | upper;
111 } else {
112 return upper;
113 }
114 case CLASS_VERT:
115 return CharReach("\x0a\x0b\x0c\x0d\x85");
116 case CLASS_WORD:
117 return lower | upper | number | CharReach('_');
118 case CLASS_XDIGIT:
119 return CharReach("0123456789abcdefABCDEF");
120 case CLASS_UCP_C:
121 return to_cr(getUcpC());
122 case CLASS_UCP_CC:
123 return to_cr(getUcpCc());
124 case CLASS_UCP_CF:
125 return to_cr(getUcpCf());
126 case CLASS_UCP_CN:
127 return to_cr(getUcpCn());
128 case CLASS_UCP_CO:
129 return to_cr(getUcpCo());
130 case CLASS_UCP_CS:
131 return to_cr(getUcpCs());
132 case CLASS_UCP_L:
133 return to_cr(getUcpL());
134 case CLASS_UCP_L_AND:
135 return to_cr(getUcpL_and());
136 case CLASS_UCP_LL:
137 return to_cr(getUcpLl());
138 case CLASS_UCP_LM:
139 return to_cr(getUcpLm());
140 case CLASS_UCP_LO:
141 return to_cr(getUcpLo());
142 case CLASS_UCP_LT:
143 return to_cr(getUcpLt());
144 case CLASS_UCP_LU:
145 return to_cr(getUcpLu());
146 case CLASS_UCP_M:
147 return to_cr(getUcpM());
148 case CLASS_UCP_MC:
149 return to_cr(getUcpMc());
150 case CLASS_UCP_ME:
151 return to_cr(getUcpMe());
152 case CLASS_UCP_MN:
153 return to_cr(getUcpMn());
154 case CLASS_UCP_N:
155 return to_cr(getUcpN());
156 case CLASS_UCP_ND:
157 return to_cr(getUcpNd());
158 case CLASS_UCP_NL:
159 return to_cr(getUcpNl());
160 case CLASS_UCP_NO:
161 return to_cr(getUcpNo());
162 case CLASS_UCP_P:
163 return to_cr(getUcpP());
164 case CLASS_UCP_PC:
165 return to_cr(getUcpPc());
166 case CLASS_UCP_PD:
167 return to_cr(getUcpPd());
168 case CLASS_UCP_PE:
169 return to_cr(getUcpPe());
170 case CLASS_UCP_PF:
171 return to_cr(getUcpPf());
172 case CLASS_UCP_PI:
173 return to_cr(getUcpPi());
174 case CLASS_UCP_PO:
175 return to_cr(getUcpPo());
176 case CLASS_UCP_PS:
177 return to_cr(getUcpPs());
178 case CLASS_UCP_S:
179 return to_cr(getUcpS());
180 case CLASS_UCP_SC:
181 return to_cr(getUcpSc());
182 case CLASS_UCP_SK:
183 return to_cr(getUcpSk());
184 case CLASS_UCP_SM:
185 return to_cr(getUcpSm());
186 case CLASS_UCP_SO:
187 return to_cr(getUcpSo());
188 case CLASS_UCP_XAN:
189 return to_cr(getUcpXan());
190 case CLASS_UCP_XPS:
191 case CLASS_UCP_XSP:
192 return getPredefinedCharReach(CLASS_VERT, mode) | getPredefinedCharReach(CLASS_HORZ, mode);
193 case CLASS_UCP_XWD:
194 return to_cr(getUcpXwd());
195 case CLASS_UCP_Z:
196 return to_cr(getUcpZ());
197 case CLASS_UCP_ZL:
198 return to_cr(getUcpZl());
199 case CLASS_UCP_ZP:
200 return to_cr(getUcpZp());
201 case CLASS_UCP_ZS:
202 return to_cr(getUcpZs());
203 case CLASS_SCRIPT_ARABIC:
204 return to_cr(getUcpArabic());
205 case CLASS_SCRIPT_ARMENIAN:
206 return to_cr(getUcpArmenian());
207 case CLASS_SCRIPT_AVESTAN:
208 return to_cr(getUcpAvestan());
209 case CLASS_SCRIPT_BALINESE:
210 return to_cr(getUcpBalinese());
211 case CLASS_SCRIPT_BAMUM:
212 return to_cr(getUcpBamum());
213 case CLASS_SCRIPT_BATAK:
214 return to_cr(getUcpBatak());
215 case CLASS_SCRIPT_BENGALI:
216 return to_cr(getUcpBengali());
217 case CLASS_SCRIPT_BOPOMOFO:
218 return to_cr(getUcpBopomofo());
219 case CLASS_SCRIPT_BRAHMI:
220 return to_cr(getUcpBrahmi());
221 case CLASS_SCRIPT_BRAILLE:
222 return to_cr(getUcpBraille());
223 case CLASS_SCRIPT_BUGINESE:
224 return to_cr(getUcpBuginese());
225 case CLASS_SCRIPT_BUHID:
226 return to_cr(getUcpBuhid());
227 case CLASS_SCRIPT_CANADIAN_ABORIGINAL:
228 return to_cr(getUcpCanadian_Aboriginal());
229 case CLASS_SCRIPT_CARIAN:
230 return to_cr(getUcpCarian());
231 case CLASS_SCRIPT_CHAM:
232 return to_cr(getUcpCham());
233 case CLASS_SCRIPT_CHEROKEE:
234 return to_cr(getUcpCherokee());
235 case CLASS_SCRIPT_COMMON:
236 return to_cr(getUcpCommon());
237 case CLASS_SCRIPT_COPTIC:
238 return to_cr(getUcpCoptic());
239 case CLASS_SCRIPT_CUNEIFORM:
240 return to_cr(getUcpCuneiform());
241 case CLASS_SCRIPT_CYPRIOT:
242 return to_cr(getUcpCypriot());
243 case CLASS_SCRIPT_CYRILLIC:
244 return to_cr(getUcpCyrillic());
245 case CLASS_SCRIPT_DESERET:
246 return to_cr(getUcpDeseret());
247 case CLASS_SCRIPT_DEVANAGARI:
248 return to_cr(getUcpDevanagari());
249 case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS:
250 return to_cr(getUcpEgyptian_Hieroglyphs());
251 case CLASS_SCRIPT_ETHIOPIC:
252 return to_cr(getUcpEthiopic());
253 case CLASS_SCRIPT_GEORGIAN:
254 return to_cr(getUcpGeorgian());
255 case CLASS_SCRIPT_GLAGOLITIC:
256 return to_cr(getUcpGlagolitic());
257 case CLASS_SCRIPT_GOTHIC:
258 return to_cr(getUcpGothic());
259 case CLASS_SCRIPT_GREEK:
260 return to_cr(getUcpGreek());
261 case CLASS_SCRIPT_GUJARATI:
262 return to_cr(getUcpGujarati());
263 case CLASS_SCRIPT_GURMUKHI:
264 return to_cr(getUcpGurmukhi());
265 case CLASS_SCRIPT_HAN:
266 return to_cr(getUcpHan());
267 case CLASS_SCRIPT_HANGUL:
268 return to_cr(getUcpHangul());
269 case CLASS_SCRIPT_HANUNOO:
270 return to_cr(getUcpHanunoo());
271 case CLASS_SCRIPT_HEBREW:
272 return to_cr(getUcpHebrew());
273 case CLASS_SCRIPT_HIRAGANA:
274 return to_cr(getUcpHiragana());
275 case CLASS_SCRIPT_IMPERIAL_ARAMAIC:
276 return to_cr(getUcpImperial_Aramaic());
277 case CLASS_SCRIPT_INHERITED:
278 return to_cr(getUcpInherited());
279 case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI:
280 return to_cr(getUcpInscriptional_Pahlavi());
281 case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN:
282 return to_cr(getUcpInscriptional_Parthian());
283 case CLASS_SCRIPT_JAVANESE:
284 return to_cr(getUcpJavanese());
285 case CLASS_SCRIPT_KAITHI:
286 return to_cr(getUcpKaithi());
287 case CLASS_SCRIPT_KANNADA:
288 return to_cr(getUcpKannada());
289 case CLASS_SCRIPT_KATAKANA:
290 return to_cr(getUcpKatakana());
291 case CLASS_SCRIPT_KAYAH_LI:
292 return to_cr(getUcpKayah_Li());
293 case CLASS_SCRIPT_KHAROSHTHI:
294 return to_cr(getUcpKharoshthi());
295 case CLASS_SCRIPT_KHMER:
296 return to_cr(getUcpKhmer());
297 case CLASS_SCRIPT_LAO:
298 return to_cr(getUcpLao());
299 case CLASS_SCRIPT_LATIN:
300 return to_cr(getUcpLatin());
301 case CLASS_SCRIPT_LEPCHA:
302 return to_cr(getUcpLepcha());
303 case CLASS_SCRIPT_LIMBU:
304 return to_cr(getUcpLimbu());
305 case CLASS_SCRIPT_LINEAR_B:
306 return to_cr(getUcpLinear_B());
307 case CLASS_SCRIPT_LISU:
308 return to_cr(getUcpLisu());
309 case CLASS_SCRIPT_LYCIAN:
310 return to_cr(getUcpLycian());
311 case CLASS_SCRIPT_LYDIAN:
312 return to_cr(getUcpLydian());
313 case CLASS_SCRIPT_MALAYALAM:
314 return to_cr(getUcpMalayalam());
315 case CLASS_SCRIPT_MANDAIC:
316 return to_cr(getUcpMandaic());
317 case CLASS_SCRIPT_MEETEI_MAYEK:
318 return to_cr(getUcpMeetei_Mayek());
319 case CLASS_SCRIPT_MONGOLIAN:
320 return to_cr(getUcpMongolian());
321 case CLASS_SCRIPT_MYANMAR:
322 return to_cr(getUcpMyanmar());
323 case CLASS_SCRIPT_NEW_TAI_LUE:
324 return to_cr(getUcpNew_Tai_Lue());
325 case CLASS_SCRIPT_NKO:
326 return to_cr(getUcpNko());
327 case CLASS_SCRIPT_OGHAM:
328 return to_cr(getUcpOgham());
329 case CLASS_SCRIPT_OL_CHIKI:
330 return to_cr(getUcpOl_Chiki());
331 case CLASS_SCRIPT_OLD_ITALIC:
332 return to_cr(getUcpOld_Italic());
333 case CLASS_SCRIPT_OLD_PERSIAN:
334 return to_cr(getUcpOld_Persian());
335 case CLASS_SCRIPT_OLD_SOUTH_ARABIAN:
336 return to_cr(getUcpOld_South_Arabian());
337 case CLASS_SCRIPT_OLD_TURKIC:
338 return to_cr(getUcpOld_Turkic());
339 case CLASS_SCRIPT_ORIYA:
340 return to_cr(getUcpOriya());
341 case CLASS_SCRIPT_OSMANYA:
342 return to_cr(getUcpOsmanya());
343 case CLASS_SCRIPT_PHAGS_PA:
344 return to_cr(getUcpPhags_Pa());
345 case CLASS_SCRIPT_PHOENICIAN:
346 return to_cr(getUcpPhoenician());
347 case CLASS_SCRIPT_REJANG:
348 return to_cr(getUcpRejang());
349 case CLASS_SCRIPT_RUNIC:
350 return to_cr(getUcpRunic());
351 case CLASS_SCRIPT_SAMARITAN:
352 return to_cr(getUcpSamaritan());
353 case CLASS_SCRIPT_SAURASHTRA:
354 return to_cr(getUcpSaurashtra());
355 case CLASS_SCRIPT_SHAVIAN:
356 return to_cr(getUcpShavian());
357 case CLASS_SCRIPT_SINHALA:
358 return to_cr(getUcpSinhala());
359 case CLASS_SCRIPT_SUNDANESE:
360 return to_cr(getUcpSundanese());
361 case CLASS_SCRIPT_SYLOTI_NAGRI:
362 return to_cr(getUcpSyloti_Nagri());
363 case CLASS_SCRIPT_SYRIAC:
364 return to_cr(getUcpSyriac());
365 case CLASS_SCRIPT_TAGALOG:
366 return to_cr(getUcpTagalog());
367 case CLASS_SCRIPT_TAGBANWA:
368 return to_cr(getUcpTagbanwa());
369 case CLASS_SCRIPT_TAI_LE:
370 return to_cr(getUcpTai_Le());
371 case CLASS_SCRIPT_TAI_THAM:
372 return to_cr(getUcpTai_Tham());
373 case CLASS_SCRIPT_TAI_VIET:
374 return to_cr(getUcpTai_Viet());
375 case CLASS_SCRIPT_TAMIL:
376 return to_cr(getUcpTamil());
377 case CLASS_SCRIPT_TELUGU:
378 return to_cr(getUcpTelugu());
379 case CLASS_SCRIPT_THAANA:
380 return to_cr(getUcpThaana());
381 case CLASS_SCRIPT_THAI:
382 return to_cr(getUcpThai());
383 case CLASS_SCRIPT_TIBETAN:
384 return to_cr(getUcpTibetan());
385 case CLASS_SCRIPT_TIFINAGH:
386 return to_cr(getUcpTifinagh());
387 case CLASS_SCRIPT_UGARITIC:
388 return to_cr(getUcpUgaritic());
389 case CLASS_SCRIPT_VAI:
390 return to_cr(getUcpVai());
391 case CLASS_SCRIPT_YI:
392 return to_cr(getUcpYi());
393 case CLASS_UCP_ANY: /* always include newline */
394 return ~CharReach();
395 }
396 assert(0);
397 return CharReach();
398}
399
400unique_ptr<ComponentClass> getComponentClass(const ParseMode &mode) {
401 if (mode.utf8) {
402 return ue2::make_unique<UTF8ComponentClass>(mode);
403 } else {
404 return ue2::make_unique<AsciiComponentClass>(mode);
405 }
406}
407
408unique_ptr<ComponentClass> generateComponent(PredefinedClass c, bool negate,
409 const ParseMode &mode) {
410 auto cc = getComponentClass(mode);
411 cc->add(c, negate);
412 cc->finalize();
413 return cc;
414}
415
416unique_ptr<ComponentClass> getLiteralComponentClass(unsigned char c,
417 bool nocase) {
418 ParseMode mode;
419 mode.caseless = nocase;
420 auto cc = getComponentClass(mode);
421 cc->add(c);
422 cc->finalize();
423 return cc;
424}
425
426ComponentClass::ComponentClass(const ParseMode &mode_in)
427 : m_negate(false), mode(mode_in), in_cand_range(false),
428 range_start(INVALID_UNICODE), finalized(false) {}
429
430ComponentClass::~ComponentClass() { }
431
432void ComponentClass::addDash(void) {
433 if (!in_cand_range) {
434 // this could be the start of a range
435 if (range_start != INVALID_UNICODE) {
436 in_cand_range = true;
437 } else {
438 /* no possible start character for range, this is just a literal */
439 add('-');
440 }
441 } else {
442 // already creating a range, so this must be literal '-'
443 in_cand_range = false;
444 createRange('-');
445 }
446}
447
448void ComponentClass::negate() {
449 m_negate = true;
450}
451
452} // namespace ue2
453