1/*
2 * Copyright (c) 2015, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Character class in UTF-8 mode.
31 */
32
33
34#include "Utf8ComponentClass.h"
35
36#include "buildstate.h"
37#include "Parser.h"
38#include "parse_error.h"
39#include "position.h"
40#include "position_info.h"
41#include "nfagraph/ng_builder.h"
42#include "util/compare.h"
43#include "util/unicode_def.h"
44
45#include <cstring>
46
47#include "ucp_table.h"
48
49using namespace std;
50
51namespace ue2 {
52
53PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) {
54 /* Note: the mapping used here for mapping posix character classes
55 * matches the observed behaviour of PCRE (lower and upper going to \p{L}
56 * is not documented by pcre).
57 *
58 * Note: this mapping is quite different from both of the mappings
59 * recommended in the unicode regex tech report (TR-18) appendix C
60 */
61 switch (in) {
62 case CLASS_ALNUM:
63 return CLASS_UCP_XAN;
64 case CLASS_ALPHA:
65 return CLASS_UCP_L;
66 case CLASS_BLANK:
67 return CLASS_HORZ;
68 case CLASS_DIGIT:
69 return CLASS_UCP_ND;
70 case CLASS_GRAPH:
71 return CLASS_XGRAPH;
72 case CLASS_LOWER:
73 if (mode.caseless) { /* we also pick up uppercase titlecase and others */
74 return CLASS_UCP_L;
75 } else {
76 return CLASS_UCP_LL;
77 }
78 case CLASS_PRINT:
79 return CLASS_XPRINT;
80 case CLASS_PUNCT:
81 return CLASS_XPUNCT;
82 case CLASS_SPACE:
83 return CLASS_UCP_XPS;
84 case CLASS_UPPER:
85 if (mode.caseless) { /* we also pick up lowercase titlecase and others */
86 return CLASS_UCP_L;
87 } else {
88 return CLASS_UCP_LU;
89 }
90 case CLASS_WORD:
91 return CLASS_UCP_XWD;
92 default:
93 return in;
94 }
95}
96
97CodePointSet getPredefinedCodePointSet(PredefinedClass c,
98 const ParseMode &mode) {
99 /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */
100 switch (c) {
101 case CLASS_ANY:
102 if (mode.dotall) {
103 return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));
104 } else {
105 CodePointSet rv;
106 rv.set('\n');
107 rv.flip();
108 return rv;
109 }
110 case CLASS_XGRAPH: {
111 CodePointSet rv;
112 rv = getUcpZ();
113 rv |= getUcpC();
114 rv.flip();
115 // most of Cf, except for ...
116 CodePointSet cf = getUcpCf();
117 cf.unset(0x061c);
118 cf.unset(0x180e);
119 cf.unsetRange(0x2066, 0x2069);
120 rv |= cf;
121 return rv;
122 }
123 case CLASS_XPRINT: {
124 // Same as graph, plus everything with the Zs property.
125 CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode);
126 rv |= getUcpZs();
127 rv.set(0x180e); // Also included in this class by PCRE 8.38.
128 return rv;
129 }
130 case CLASS_XPUNCT: {
131 // Everything with the P (punctuation) property, plus code points in S
132 // (symbols) that are < 128.
133 CodePointSet rv = getUcpP();
134 CodePointSet symbols = getUcpS();
135 symbols.unsetRange(128, MAX_UNICODE);
136 rv |= symbols;
137 return rv;
138 }
139 case CLASS_HORZ: {
140 CodePointSet rv;
141 rv.set(0x0009); /* Horizontal tab */
142 rv.set(0x0020); /* Space */
143 rv.set(0x00A0); /* Non-break space */
144 rv.set(0x1680); /* Ogham space mark */
145 rv.set(0x180E); /* Mongolian vowel separator */
146 rv.set(0x2000); /* En quad */
147 rv.set(0x2001); /* Em quad */
148 rv.set(0x2002); /* En space */
149 rv.set(0x2003); /* Em space */
150 rv.set(0x2004); /* Three-per-em space */
151 rv.set(0x2005); /* Four-per-em space */
152 rv.set(0x2006); /* Six-per-em space */
153 rv.set(0x2007); /* Figure space */
154 rv.set(0x2008); /* Punctuation space */
155 rv.set(0x2009); /* Thin space */
156 rv.set(0x200A); /* Hair space */
157 rv.set(0x202F); /* Narrow no-break space */
158 rv.set(0x205F); /* Medium mathematical space */
159 rv.set(0x3000); /* Ideographic space */
160 return rv;
161 }
162 case CLASS_VERT: {
163 CodePointSet rv;
164 rv.set(0x000A); /* Linefeed */
165 rv.set(0x000B); /* Vertical tab */
166 rv.set(0x000C); /* Formfeed */
167 rv.set(0x000D); /* Carriage return */
168 rv.set(0x0085); /* Next line */
169 rv.set(0x2028); /* Line separator */
170 rv.set(0x2029); /* Paragraph separator */
171 return rv;
172 }
173 case CLASS_UCP_XPS:
174 case CLASS_UCP_XSP: {
175 CodePointSet rv;
176 rv.set(0x0009); /* Horizontal tab */
177 rv.set(0x0020); /* Space */
178 rv.set(0x00A0); /* Non-break space */
179 rv.set(0x1680); /* Ogham space mark */
180 rv.set(0x180E); /* Mongolian vowel separator */
181 rv.set(0x2000); /* En quad */
182 rv.set(0x2001); /* Em quad */
183 rv.set(0x2002); /* En space */
184 rv.set(0x2003); /* Em space */
185 rv.set(0x2004); /* Three-per-em space */
186 rv.set(0x2005); /* Four-per-em space */
187 rv.set(0x2006); /* Six-per-em space */
188 rv.set(0x2007); /* Figure space */
189 rv.set(0x2008); /* Punctuation space */
190 rv.set(0x2009); /* Thin space */
191 rv.set(0x200A); /* Hair space */
192 rv.set(0x202F); /* Narrow no-break space */
193 rv.set(0x205F); /* Medium mathematical space */
194 rv.set(0x3000); /* Ideographic space */
195 rv.set(0x000A); /* Linefeed */
196 rv.set(0x000B); /* Vertical tab */
197 rv.set(0x000C); /* Formfeed */
198 rv.set(0x000D); /* Carriage return */
199 rv.set(0x0085); /* Next line */
200 rv.set(0x2028); /* Line separator */
201 rv.set(0x2029); /* Paragraph separator */
202 return rv;
203 }
204 case CLASS_UCP_C:
205 return getUcpC();
206 case CLASS_UCP_CC:
207 return getUcpCc();
208 case CLASS_UCP_CF:
209 return getUcpCf();
210 case CLASS_UCP_CN:
211 return getUcpCn();
212 case CLASS_UCP_CO:
213 return getUcpCo();
214 case CLASS_UCP_CS:
215 return getUcpCs();
216 case CLASS_UCP_L:
217 return getUcpL();
218 case CLASS_UCP_L_AND:
219 return getUcpL_and();
220 case CLASS_UCP_LL:
221 return getUcpLl();
222 case CLASS_UCP_LM:
223 return getUcpLm();
224 case CLASS_UCP_LO:
225 return getUcpLo();
226 case CLASS_UCP_LT:
227 return getUcpLt();
228 case CLASS_UCP_LU:
229 return getUcpLu();
230 case CLASS_UCP_M:
231 return getUcpM();
232 case CLASS_UCP_MC:
233 return getUcpMc();
234 case CLASS_UCP_ME:
235 return getUcpMe();
236 case CLASS_UCP_MN:
237 return getUcpMn();
238 case CLASS_UCP_N:
239 return getUcpN();
240 case CLASS_UCP_ND:
241 return getUcpNd();
242 case CLASS_UCP_NL:
243 return getUcpNl();
244 case CLASS_UCP_NO:
245 return getUcpNo();
246 case CLASS_UCP_P:
247 return getUcpP();
248 case CLASS_UCP_PC:
249 return getUcpPc();
250 case CLASS_UCP_PD:
251 return getUcpPd();
252 case CLASS_UCP_PE:
253 return getUcpPe();
254 case CLASS_UCP_PF:
255 return getUcpPf();
256 case CLASS_UCP_PI:
257 return getUcpPi();
258 case CLASS_UCP_PO:
259 return getUcpPo();
260 case CLASS_UCP_PS:
261 return getUcpPs();
262 case CLASS_UCP_S:
263 return getUcpS();
264 case CLASS_UCP_SC:
265 return getUcpSc();
266 case CLASS_UCP_SK:
267 return getUcpSk();
268 case CLASS_UCP_SM:
269 return getUcpSm();
270 case CLASS_UCP_SO:
271 return getUcpSo();
272 case CLASS_UCP_XAN:
273 return getUcpXan();
274 case CLASS_UCP_XWD:
275 return getUcpXwd();
276 case CLASS_UCP_Z:
277 return getUcpZ();
278 case CLASS_UCP_ZL:
279 return getUcpZl();
280 case CLASS_UCP_ZP:
281 return getUcpZp();
282 case CLASS_UCP_ZS:
283 return getUcpZs();
284 case CLASS_SCRIPT_ARABIC:
285 return getUcpArabic();
286 case CLASS_SCRIPT_ARMENIAN:
287 return getUcpArmenian();
288 case CLASS_SCRIPT_AVESTAN:
289 return getUcpAvestan();
290 case CLASS_SCRIPT_BALINESE:
291 return getUcpBalinese();
292 case CLASS_SCRIPT_BAMUM:
293 return getUcpBamum();
294 case CLASS_SCRIPT_BATAK:
295 return getUcpBatak();
296 case CLASS_SCRIPT_BENGALI:
297 return getUcpBengali();
298 case CLASS_SCRIPT_BOPOMOFO:
299 return getUcpBopomofo();
300 case CLASS_SCRIPT_BRAHMI:
301 return getUcpBrahmi();
302 case CLASS_SCRIPT_BRAILLE:
303 return getUcpBraille();
304 case CLASS_SCRIPT_BUGINESE:
305 return getUcpBuginese();
306 case CLASS_SCRIPT_BUHID:
307 return getUcpBuhid();
308 case CLASS_SCRIPT_CANADIAN_ABORIGINAL:
309 return getUcpCanadian_Aboriginal();
310 case CLASS_SCRIPT_CARIAN:
311 return getUcpCarian();
312 case CLASS_SCRIPT_CHAM:
313 return getUcpCham();
314 case CLASS_SCRIPT_CHEROKEE:
315 return getUcpCherokee();
316 case CLASS_SCRIPT_COMMON:
317 return getUcpCommon();
318 case CLASS_SCRIPT_COPTIC:
319 return getUcpCoptic();
320 case CLASS_SCRIPT_CUNEIFORM:
321 return getUcpCuneiform();
322 case CLASS_SCRIPT_CYPRIOT:
323 return getUcpCypriot();
324 case CLASS_SCRIPT_CYRILLIC:
325 return getUcpCyrillic();
326 case CLASS_SCRIPT_DESERET:
327 return getUcpDeseret();
328 case CLASS_SCRIPT_DEVANAGARI:
329 return getUcpDevanagari();
330 case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS:
331 return getUcpEgyptian_Hieroglyphs();
332 case CLASS_SCRIPT_ETHIOPIC:
333 return getUcpEthiopic();
334 case CLASS_SCRIPT_GEORGIAN:
335 return getUcpGeorgian();
336 case CLASS_SCRIPT_GLAGOLITIC:
337 return getUcpGlagolitic();
338 case CLASS_SCRIPT_GOTHIC:
339 return getUcpGothic();
340 case CLASS_SCRIPT_GREEK:
341 return getUcpGreek();
342 case CLASS_SCRIPT_GUJARATI:
343 return getUcpGujarati();
344 case CLASS_SCRIPT_GURMUKHI:
345 return getUcpGurmukhi();
346 case CLASS_SCRIPT_HAN:
347 return getUcpHan();
348 case CLASS_SCRIPT_HANGUL:
349 return getUcpHangul();
350 case CLASS_SCRIPT_HANUNOO:
351 return getUcpHanunoo();
352 case CLASS_SCRIPT_HEBREW:
353 return getUcpHebrew();
354 case CLASS_SCRIPT_HIRAGANA:
355 return getUcpHiragana();
356 case CLASS_SCRIPT_IMPERIAL_ARAMAIC:
357 return getUcpImperial_Aramaic();
358 case CLASS_SCRIPT_INHERITED:
359 return getUcpInherited();
360 case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI:
361 return getUcpInscriptional_Pahlavi();
362 case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN:
363 return getUcpInscriptional_Parthian();
364 case CLASS_SCRIPT_JAVANESE:
365 return getUcpJavanese();
366 case CLASS_SCRIPT_KAITHI:
367 return getUcpKaithi();
368 case CLASS_SCRIPT_KANNADA:
369 return getUcpKannada();
370 case CLASS_SCRIPT_KATAKANA:
371 return getUcpKatakana();
372 case CLASS_SCRIPT_KAYAH_LI:
373 return getUcpKayah_Li();
374 case CLASS_SCRIPT_KHAROSHTHI:
375 return getUcpKharoshthi();
376 case CLASS_SCRIPT_KHMER:
377 return getUcpKhmer();
378 case CLASS_SCRIPT_LAO:
379 return getUcpLao();
380 case CLASS_SCRIPT_LATIN:
381 return getUcpLatin();
382 case CLASS_SCRIPT_LEPCHA:
383 return getUcpLepcha();
384 case CLASS_SCRIPT_LIMBU:
385 return getUcpLimbu();
386 case CLASS_SCRIPT_LINEAR_B:
387 return getUcpLinear_B();
388 case CLASS_SCRIPT_LISU:
389 return getUcpLisu();
390 case CLASS_SCRIPT_LYCIAN:
391 return getUcpLycian();
392 case CLASS_SCRIPT_LYDIAN:
393 return getUcpLydian();
394 case CLASS_SCRIPT_MALAYALAM:
395 return getUcpMalayalam();
396 case CLASS_SCRIPT_MANDAIC:
397 return getUcpMandaic();
398 case CLASS_SCRIPT_MEETEI_MAYEK:
399 return getUcpMeetei_Mayek();
400 case CLASS_SCRIPT_MONGOLIAN:
401 return getUcpMongolian();
402 case CLASS_SCRIPT_MYANMAR:
403 return getUcpMyanmar();
404 case CLASS_SCRIPT_NEW_TAI_LUE:
405 return getUcpNew_Tai_Lue();
406 case CLASS_SCRIPT_NKO:
407 return getUcpNko();
408 case CLASS_SCRIPT_OGHAM:
409 return getUcpOgham();
410 case CLASS_SCRIPT_OL_CHIKI:
411 return getUcpOl_Chiki();
412 case CLASS_SCRIPT_OLD_ITALIC:
413 return getUcpOld_Italic();
414 case CLASS_SCRIPT_OLD_PERSIAN:
415 return getUcpOld_Persian();
416 case CLASS_SCRIPT_OLD_SOUTH_ARABIAN:
417 return getUcpOld_South_Arabian();
418 case CLASS_SCRIPT_OLD_TURKIC:
419 return getUcpOld_Turkic();
420 case CLASS_SCRIPT_ORIYA:
421 return getUcpOriya();
422 case CLASS_SCRIPT_OSMANYA:
423 return getUcpOsmanya();
424 case CLASS_SCRIPT_PHAGS_PA:
425 return getUcpPhags_Pa();
426 case CLASS_SCRIPT_PHOENICIAN:
427 return getUcpPhoenician();
428 case CLASS_SCRIPT_REJANG:
429 return getUcpRejang();
430 case CLASS_SCRIPT_RUNIC:
431 return getUcpRunic();
432 case CLASS_SCRIPT_SAMARITAN:
433 return getUcpSamaritan();
434 case CLASS_SCRIPT_SAURASHTRA:
435 return getUcpSaurashtra();
436 case CLASS_SCRIPT_SHAVIAN:
437 return getUcpShavian();
438 case CLASS_SCRIPT_SINHALA:
439 return getUcpSinhala();
440 case CLASS_SCRIPT_SUNDANESE:
441 return getUcpSundanese();
442 case CLASS_SCRIPT_SYLOTI_NAGRI:
443 return getUcpSyloti_Nagri();
444 case CLASS_SCRIPT_SYRIAC:
445 return getUcpSyriac();
446 case CLASS_SCRIPT_TAGALOG:
447 return getUcpTagalog();
448 case CLASS_SCRIPT_TAGBANWA:
449 return getUcpTagbanwa();
450 case CLASS_SCRIPT_TAI_LE:
451 return getUcpTai_Le();
452 case CLASS_SCRIPT_TAI_THAM:
453 return getUcpTai_Tham();
454 case CLASS_SCRIPT_TAI_VIET:
455 return getUcpTai_Viet();
456 case CLASS_SCRIPT_TAMIL:
457 return getUcpTamil();
458 case CLASS_SCRIPT_TELUGU:
459 return getUcpTelugu();
460 case CLASS_SCRIPT_THAANA:
461 return getUcpThaana();
462 case CLASS_SCRIPT_THAI:
463 return getUcpThai();
464 case CLASS_SCRIPT_TIBETAN:
465 return getUcpTibetan();
466 case CLASS_SCRIPT_TIFINAGH:
467 return getUcpTifinagh();
468 case CLASS_SCRIPT_UGARITIC:
469 return getUcpUgaritic();
470 case CLASS_SCRIPT_VAI:
471 return getUcpVai();
472 case CLASS_SCRIPT_YI:
473 return getUcpYi();
474 case CLASS_UCP_ANY:
475 return CodePointSet(CodePointSet::interval(0, MAX_UNICODE));
476
477 default: { /* currently uses ascii defns */
478 CharReach cr = getPredefinedCharReach(c, mode);
479 CodePointSet rv;
480 for (u32 i = cr.find_first(); i != CharReach::npos;
481 i = cr.find_next(i)) {
482 rv.set(i);
483 }
484 return rv;
485 }
486 }
487}
488
489UTF8ComponentClass::UTF8ComponentClass(const ParseMode &mode_in)
490 : ComponentClass(mode_in),
491 single_pos( GlushkovBuildState::POS_UNINITIALIZED),
492 one_dot_trailer( GlushkovBuildState::POS_UNINITIALIZED),
493 two_dot_trailer( GlushkovBuildState::POS_UNINITIALIZED),
494 three_dot_trailer( GlushkovBuildState::POS_UNINITIALIZED),
495 two_char_dot_head( GlushkovBuildState::POS_UNINITIALIZED),
496 three_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED),
497 four_char_dot_head( GlushkovBuildState::POS_UNINITIALIZED) {
498 assert(mode.utf8);
499}
500
501UTF8ComponentClass *UTF8ComponentClass::clone() const {
502 return new UTF8ComponentClass(*this);
503}
504
505bool UTF8ComponentClass::class_empty(void) const {
506 assert(finalized);
507 return cps.none();
508}
509
510void UTF8ComponentClass::createRange(unichar to) {
511 assert(range_start != INVALID_UNICODE);
512 unichar from = range_start;
513 if (from > to) {
514 throw LocatedParseError("Range out of order in character class");
515 }
516
517 in_cand_range = false;
518 CodePointSet ncps;
519 ncps.setRange(from, to);
520 if (mode.caseless) {
521 make_caseless(&ncps);
522 }
523 cps |= ncps;
524 range_start = INVALID_UNICODE;
525}
526
527void UTF8ComponentClass::add(PredefinedClass c, bool negative) {
528 if (in_cand_range) { // can't form a range here
529 throw LocatedParseError("Invalid range in character class");
530 }
531
532 if (mode.ucp) {
533 c = translateForUcpMode(c, mode);
534 }
535
536 // caselessness is handled inside this call - don't apply make_caseless
537 // to the result
538 CodePointSet pcps = getPredefinedCodePointSet(c, mode);
539 if (negative) {
540 pcps.flip();
541 }
542
543 cps |= pcps;
544
545 range_start = INVALID_UNICODE;
546 in_cand_range = false;
547}
548
549void UTF8ComponentClass::add(unichar c) {
550 DEBUG_PRINTF("adding \\x%08x\n", c);
551 if (c > MAX_UNICODE) { // too big!
552 throw LocatedParseError("Hexadecimal value is greater than \\x10FFFF");
553 }
554
555 if (in_cand_range) {
556 createRange(c);
557 return;
558 }
559
560 CodePointSet ncps;
561 ncps.set(c);
562 if (mode.caseless) {
563 make_caseless(&ncps);
564 }
565 cps |= ncps;
566 range_start = c;
567}
568
569void UTF8ComponentClass::finalize() {
570 if (finalized) {
571 return;
572 }
573
574 // Handle unclosed ranges, like '[a-]' and '[a-\Q\E]' -- in these cases the
575 // dash is a literal dash.
576 if (in_cand_range) {
577 cps.set('-');
578 in_cand_range = false;
579 }
580
581 if (m_negate) {
582 cps.flip();
583 }
584
585 finalized = true;
586}
587
588Position UTF8ComponentClass::getHead(NFABuilder &builder, u8 first_byte) {
589 map<u8, Position>::const_iterator it = heads.find(first_byte);
590 if (it != heads.end()) {
591 return it->second;
592 }
593
594 Position head = builder.makePositions(1);
595 assert(heads.find(first_byte) == heads.end());
596 builder.addCharReach(head, CharReach(first_byte));
597 /* no report id as head can not be directly wired to accept */
598
599 heads[first_byte] = head;
600 return head;
601}
602
603void UTF8ComponentClass::ensureDotTrailer(GlushkovBuildState &bs) {
604 NFABuilder &builder = bs.getBuilder();
605 if (one_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) {
606 return;
607 }
608
609 one_dot_trailer = builder.makePositions(1);
610 builder.setNodeReportID(one_dot_trailer, 0);
611 builder.addCharReach(one_dot_trailer, CharReach(0x80, 0xbf));
612 tails.insert(one_dot_trailer);
613}
614
615void UTF8ComponentClass::ensureTwoDotTrailer(GlushkovBuildState &bs) {
616 NFABuilder &builder = bs.getBuilder();
617 if (two_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) {
618 return;
619 }
620
621 ensureDotTrailer(bs);
622
623 two_dot_trailer = builder.makePositions(1);
624 builder.addCharReach(two_dot_trailer, CharReach(0x80, 0xbf));
625 bs.addSuccessor(two_dot_trailer, one_dot_trailer);
626}
627
628void UTF8ComponentClass::ensureThreeDotTrailer(GlushkovBuildState &bs) {
629 NFABuilder &builder = bs.getBuilder();
630 if (three_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) {
631 return;
632 }
633
634 ensureTwoDotTrailer(bs);
635
636 three_dot_trailer = builder.makePositions(1);
637 builder.addCharReach(three_dot_trailer, CharReach(0x80, 0xbf));
638 bs.addSuccessor(three_dot_trailer, two_dot_trailer);
639}
640
641void UTF8ComponentClass::buildOneByte(GlushkovBuildState &bs) {
642 NFABuilder &builder = bs.getBuilder();
643 for (CodePointSet::const_iterator it = cps.begin(); it != cps.end(); ++it) {
644 unichar b = lower(*it);
645 unichar e = upper(*it) + 1;
646 if (b >= UTF_2CHAR_MIN) {
647 continue;
648 }
649
650 DEBUG_PRINTF("building vertices for [%u, %u)\n", b, e);
651
652 if (single_pos == GlushkovBuildState::POS_UNINITIALIZED) {
653 single_pos = builder.makePositions(1);
654 builder.setNodeReportID(single_pos, 0 /* offset adj */);
655 tails.insert(single_pos);
656 }
657 CharReach cr(b, MIN(e, UTF_2CHAR_MIN) - 1);
658 builder.addCharReach(single_pos, cr);
659 }
660}
661
662void UTF8ComponentClass::addToTail(GlushkovBuildState &bs,
663 map<Position, Position> &finals,
664 Position prev, unichar b, unichar e) {
665 NFABuilder &builder = bs.getBuilder();
666 Position tail;
667 if (finals.find(prev) == finals.end()) {
668 tail = builder.makePositions(1);
669 builder.setNodeReportID(tail, 0 /* offset adj */);
670 bs.addSuccessor(prev, tail);
671 finals[prev] = tail;
672 tails.insert(tail);
673 } else {
674 tail = finals[prev];
675 }
676
677 u8 bb = makeContByte(b);
678 u8 ee = makeContByte(e - 1);
679 builder.addCharReach(tail, CharReach(bb, ee));
680}
681
682void UTF8ComponentClass::buildTwoByte(GlushkovBuildState &bs) {
683 NFABuilder &builder = bs.getBuilder();
684 map<Position, Position> finals;
685
686 for (auto it = cps.begin(); it != cps.end(); ++it) {
687 unichar b = lower(*it);
688 unichar e = upper(*it) + 1;
689
690 b = MAX(b, UTF_2CHAR_MIN);
691 e = MIN(e, UTF_3CHAR_MIN);
692
693 if (b >= e) {
694 continue; /* we're done here */
695 }
696
697 /* raise b to the start of the next tail byte boundary */
698 if (b & UTF_CONT_BYTE_VALUE_MASK) {
699 unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));
700 u8 first_byte = UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT);
701 assert(first_byte > 0xc1 && first_byte <= 0xdf);
702
703 Position head = getHead(builder, first_byte);
704 addToTail(bs, finals, head, b, bb);
705
706 b = bb;
707 }
708
709 if (b == e) {
710 continue; /* we're done here */
711 }
712 assert(b < e);
713
714 /* lower e to the end of a tail byte boundary */
715 if (e & UTF_CONT_BYTE_VALUE_MASK) {
716 unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
717 assert(ee >= b);
718
719 u8 first_byte = UTF_TWO_BYTE_HEADER | (ee >> UTF_CONT_SHIFT);
720 assert(first_byte > 0xc1 && first_byte <= 0xdf);
721
722 Position head = getHead(builder, first_byte);
723 addToTail(bs, finals, head, ee, e);
724
725 e = ee;
726 }
727
728 if (b == e) {
729 continue; /* we're done here */
730 }
731 assert(b < e);
732
733 /* middle section just goes to a common full vertex */
734 ensureDotTrailer(bs);
735
736 if (two_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
737 two_char_dot_head = builder.makePositions(1);
738 bs.addSuccessor(two_char_dot_head, one_dot_trailer);
739 }
740
741 u8 min_first_byte = UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT);
742 u8 max_first_byte = UTF_TWO_BYTE_HEADER | ((e - 1) >> UTF_CONT_SHIFT);
743
744 assert(min_first_byte > 0xc1 && min_first_byte <= 0xdf);
745 assert(max_first_byte > 0xc1 && max_first_byte <= 0xdf);
746
747 builder.addCharReach(two_char_dot_head,
748 CharReach(min_first_byte, max_first_byte));
749 }
750}
751
752static
753Position getMid(GlushkovBuildState &bs, map<Position, map<u8, Position> > &mids,
754 const Position &prev, u8 byte_val) {
755 NFABuilder &builder = bs.getBuilder();
756 map<u8, Position> &by_byte = mids[prev];
757
758 map<u8, Position>::const_iterator it = by_byte.find(byte_val);
759 if (it != by_byte.end()) {
760 return it->second;
761 }
762
763 Position mid = builder.makePositions(1);
764 builder.addCharReach(mid, CharReach(byte_val));
765 bs.addSuccessor(prev, mid);
766 /* no report id as mid can not be directly wired to accept */
767
768 by_byte[byte_val] = mid;
769 return mid;
770}
771
772void UTF8ComponentClass::buildThreeByte(GlushkovBuildState &bs) {
773 NFABuilder &builder = bs.getBuilder();
774
775 map<Position, map<u8, Position> > mids;
776 map<Position, Position> finals;
777
778 for (auto it = cps.begin(); it != cps.end(); ++it) {
779 unichar b = lower(*it);
780 unichar e = upper(*it) + 1;
781
782 b = MAX(b, UTF_3CHAR_MIN);
783 e = MIN(e, UTF_4CHAR_MIN);
784
785 if (b >= e) {
786 continue; /* we're done here */
787 }
788
789 /* raise b to the start of the next tail byte boundary */
790 if (b & UTF_CONT_BYTE_VALUE_MASK) {
791 unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));
792
793 u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
794 assert(first_byte >= 0xe0 && first_byte <= 0xef);
795 Position head = getHead(builder, first_byte);
796
797 u8 second_byte = makeContByte(b >> UTF_CONT_SHIFT);
798 Position mid = getMid(bs, mids, head, second_byte);
799
800 addToTail(bs, finals, mid, b, bb);
801
802 b = bb;
803 }
804
805 if (b == e) {
806 continue; /* we're done here */
807 }
808 assert(b < e);
809
810 /* lower e to the end of a tail byte boundary */
811 if (e & UTF_CONT_BYTE_VALUE_MASK) {
812 unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
813 assert(ee >= b);
814
815 u8 first_byte = UTF_THREE_BYTE_HEADER
816 | (ee >> (2 * UTF_CONT_SHIFT));
817 assert(first_byte >= 0xe0 && first_byte <= 0xef);
818 Position head = getHead(builder, first_byte);
819
820 u8 second_byte = makeContByte(ee >> UTF_CONT_SHIFT);
821 Position mid = getMid(bs, mids, head, second_byte);
822
823 addToTail(bs, finals, mid, ee, e);
824
825 e = ee;
826 }
827
828 if (b == e) {
829 continue; /* we're done here */
830 }
831 assert(b < e);
832
833 /* from here on in the last byte is always full */
834 ensureDotTrailer(bs);
835
836 /* raise b to the start of the next mid byte boundary */
837 if (b & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) {
838 unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));
839
840 u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT));
841 Position head = getHead(builder, first_byte);
842
843 Position mid = builder.makePositions(1);
844 bs.addSuccessor(head, mid);
845 bs.addSuccessor(mid, one_dot_trailer);
846 /* no report id as mid can not be directly wired to accept,
847 * not adding to mids as we are completely filling its downstream */
848 u8 second_min = makeContByte(b >> UTF_CONT_SHIFT);
849 u8 second_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);
850
851 builder.addCharReach(mid, CharReach(second_min, second_max));
852
853 b = bb;
854 }
855
856 if (b == e) {
857 continue; /* we're done here */
858 }
859 assert(b < e);
860
861 /* lower e to the end of a mid byte boundary */
862 if (e & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) {
863 unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
864 assert(ee >= b);
865
866 u8 first_byte = UTF_THREE_BYTE_HEADER
867 | (ee >> (2 * UTF_CONT_SHIFT));
868 Position head = getHead(builder, first_byte);
869
870 Position mid = builder.makePositions(1);
871 bs.addSuccessor(head, mid);
872 bs.addSuccessor(mid, one_dot_trailer);
873 /* no report id as mid can not be directly wired to accept,
874 * not adding to mids as we are completely filling its downstream */
875 u8 second_min = makeContByte(ee >> UTF_CONT_SHIFT);
876 u8 second_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);
877
878 builder.addCharReach(mid, CharReach(second_min, second_max));
879
880 e = ee;
881 }
882
883 if (b == e) {
884 continue; /* we're done here */
885 }
886 assert(b < e);
887
888 /* now we just have to wire head to a common dot trailer */
889 ensureTwoDotTrailer(bs);
890 if (three_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
891 three_char_dot_head = builder.makePositions(1);
892 bs.addSuccessor(three_char_dot_head, two_dot_trailer);
893 }
894
895 u8 min_first_byte = UTF_THREE_BYTE_HEADER
896 | (b >> (2 * UTF_CONT_SHIFT));
897 u8 max_first_byte = UTF_THREE_BYTE_HEADER
898 | ((e - 1) >> (2 * UTF_CONT_SHIFT));
899
900 assert(min_first_byte > 0xdf && min_first_byte <= 0xef);
901 assert(max_first_byte > 0xdf && max_first_byte <= 0xef);
902
903 builder.addCharReach(three_char_dot_head,
904 CharReach(min_first_byte, max_first_byte));
905 }
906}
907
908static
909u8 makeFirstByteOfFour(unichar raw) {
910 u8 first_byte = UTF_FOUR_BYTE_HEADER | (raw >> (3 * UTF_CONT_SHIFT));
911 assert(first_byte > 0xef && first_byte <= 0xf7);
912 return first_byte;
913}
914
915static
916bool isTwoContAligned(unichar raw) {
917 return !(raw & ((1 << (2 * UTF_CONT_SHIFT)) - 1));
918}
919
920static
921bool isThreeContAligned(unichar raw) {
922 return !(raw & ((1 << (3 * UTF_CONT_SHIFT)) - 1));
923}
924
925void UTF8ComponentClass::buildFourByte(GlushkovBuildState &bs) {
926 NFABuilder &builder = bs.getBuilder();
927 map<Position, map<u8, Position> > mids;
928 map<Position, Position> finals;
929
930 for (auto it = cps.begin(); it != cps.end(); ++it) {
931 unichar b = lower(*it);
932 unichar e = upper(*it) + 1;
933
934 b = MAX(b, UTF_4CHAR_MIN);
935 e = MIN(e, MAX_UNICODE + 1);
936
937 if (b >= e) {
938 continue;
939 }
940
941 /* raise b to the start of the next tail byte boundary */
942 if (b & UTF_CONT_BYTE_VALUE_MASK) {
943 unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE));
944
945 u8 first_byte = makeFirstByteOfFour(b);
946 Position head = getHead(builder, first_byte);
947
948 u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
949 Position mid1 = getMid(bs, mids, head, second_byte);
950
951 u8 third_byte = makeContByte(b >> UTF_CONT_SHIFT);
952 Position mid2 = getMid(bs, mids, mid1, third_byte);
953
954 addToTail(bs, finals, mid2, b, bb);
955
956 b = bb;
957 }
958
959 if (b == e) {
960 continue; /* we're done here */
961 }
962 assert(b < e);
963
964 /* lower e to the end of a tail byte boundary */
965 if (e & UTF_CONT_BYTE_VALUE_MASK) {
966 unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK;
967 assert(ee >= b);
968
969 u8 first_byte = makeFirstByteOfFour(ee);
970 Position head = getHead(builder, first_byte);
971
972 u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
973 Position mid1 = getMid(bs, mids, head, second_byte);
974
975 u8 third_byte = makeContByte(ee >> UTF_CONT_SHIFT);
976 Position mid2 = getMid(bs, mids, mid1, third_byte);
977
978 addToTail(bs, finals, mid2, ee, e);
979
980 e = ee;
981 }
982
983 if (b == e) {
984 continue; /* we're done here */
985 }
986 assert(b < e);
987
988 /* from here on in the last byte is always full */
989 ensureDotTrailer(bs);
990
991 /* raise b to the start of the next mid byte boundary */
992 if (!isTwoContAligned(b)) {
993 unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT)));
994
995 u8 first_byte = makeFirstByteOfFour(b);
996 Position head = getHead(builder, first_byte);
997
998 u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT));
999 Position mid1 = getMid(bs, mids, head, second_byte);
1000
1001 Position mid2 = builder.makePositions(1);
1002 bs.addSuccessor(mid1, mid2);
1003 bs.addSuccessor(mid2, one_dot_trailer);
1004 /* no report id as mid can not be directly wired to accept,
1005 * not adding to mids as we are completely filling its downstream */
1006 u8 byte_min = makeContByte(b >> UTF_CONT_SHIFT);
1007 u8 byte_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT);
1008
1009 builder.addCharReach(mid2, CharReach(byte_min, byte_max));
1010
1011 b = bb;
1012 }
1013
1014 if (b == e) {
1015 continue; /* we're done here */
1016 }
1017 assert(b < e);
1018
1019 /* lower e to the end of a mid byte boundary */
1020 if (!isTwoContAligned(e)) {
1021 unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1);
1022 assert(ee >= b);
1023
1024 u8 first_byte = makeFirstByteOfFour(ee);
1025 Position head = getHead(builder, first_byte);
1026
1027 u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
1028 Position mid1 = getMid(bs, mids, head, second_byte);
1029
1030 Position mid2 = builder.makePositions(1);
1031 bs.addSuccessor(mid1, mid2);
1032 bs.addSuccessor(mid2, one_dot_trailer);
1033 /* no report id as mid can not be directly wired to accept,
1034 * not adding to mids as we are completely filling its downstream */
1035 u8 byte_min = makeContByte(ee >> UTF_CONT_SHIFT);
1036 u8 byte_max = makeContByte((e - 1) >> UTF_CONT_SHIFT);
1037
1038 builder.addCharReach(mid2, CharReach(byte_min, byte_max));
1039
1040 e = ee;
1041 }
1042
1043 if (b == e) {
1044 continue; /* we're done here */
1045 }
1046 assert(b < e);
1047
1048 ensureTwoDotTrailer(bs);
1049
1050 /* raise b to the next byte boundary */
1051 if (!isThreeContAligned(b)) {
1052 unichar bb = MIN(e, ROUNDUP_N(b, 1 << (3 * UTF_CONT_SHIFT)));
1053
1054 u8 first_byte = makeFirstByteOfFour(b);
1055 Position head = getHead(builder, first_byte);
1056
1057 Position mid1 = builder.makePositions(1);
1058 bs.addSuccessor(head, mid1);
1059 bs.addSuccessor(mid1, two_dot_trailer);
1060 /* no report id as mid can not be directly wired to accept,
1061 * not adding to mids as we are completely filling its downstream */
1062 u8 byte_min = makeContByte(b >> (2 * UTF_CONT_SHIFT));
1063 u8 byte_max = makeContByte((bb - 1) >> (2 * UTF_CONT_SHIFT));
1064
1065 builder.addCharReach(mid1, CharReach(byte_min, byte_max));
1066
1067 b = bb;
1068 }
1069
1070 if (b == e) {
1071 continue; /* we're done here */
1072 }
1073 assert(b < e);
1074
1075 /* lower e to the next byte boundary */
1076 if (!isThreeContAligned(e)) {
1077 unichar ee = e & ~((1 << (3 * UTF_CONT_SHIFT)) - 1);
1078 assert(ee >= b);
1079
1080 u8 first_byte = makeFirstByteOfFour(ee);
1081 Position head = getHead(builder, first_byte);
1082 Position mid1 = builder.makePositions(1);
1083 bs.addSuccessor(head, mid1);
1084 bs.addSuccessor(mid1, two_dot_trailer);
1085 /* no report id as mid can not be directly wired to accept,
1086 * not adding to mids as we are completely filling its downstream */
1087 u8 byte_min = makeContByte(ee >> (2 * UTF_CONT_SHIFT));
1088 u8 byte_max = makeContByte((e - 1) >> (2 * UTF_CONT_SHIFT));
1089
1090 builder.addCharReach(mid1, CharReach(byte_min, byte_max));
1091
1092 e = ee;
1093 }
1094
1095 if (b == e) {
1096 continue; /* we're done here */
1097 }
1098 assert(b < e);
1099
1100 /* now we just have to wire head to a common dot trailer */
1101 ensureThreeDotTrailer(bs);
1102 if (four_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) {
1103 four_char_dot_head = builder.makePositions(1);
1104 bs.addSuccessor(four_char_dot_head, three_dot_trailer);
1105 }
1106
1107 u8 min_first_byte = makeFirstByteOfFour(b);
1108 u8 max_first_byte = makeFirstByteOfFour(e - 1);
1109
1110 builder.addCharReach(four_char_dot_head,
1111 CharReach(min_first_byte, max_first_byte));
1112 }
1113}
1114
1115void UTF8ComponentClass::notePositions(GlushkovBuildState &bs) {
1116 // We should always be finalized by now.
1117 assert(finalized);
1118
1119 // An empty class is a special case; this would be generated by something
1120 // like /[\s\S]/8, which can never match. We treat these like we do the non
1121 // UTF-8 version: add a vertex with empty reach (to ensure we create a
1122 // connected graph) and pick it up later on.
1123 if (class_empty()) {
1124 DEBUG_PRINTF("empty class!\n");
1125 assert(single_pos == GlushkovBuildState::POS_UNINITIALIZED);
1126 NFABuilder &builder = bs.getBuilder();
1127 single_pos = builder.makePositions(1);
1128 builder.setNodeReportID(single_pos, 0 /* offset adj */);
1129 builder.addCharReach(single_pos, CharReach());
1130 tails.insert(single_pos);
1131 return;
1132 }
1133
1134 buildOneByte(bs);
1135 buildTwoByte(bs);
1136 buildThreeByte(bs);
1137 buildFourByte(bs);
1138}
1139
1140void UTF8ComponentClass::buildFollowSet(GlushkovBuildState &,
1141 const vector<PositionInfo> &) {
1142 /* states are wired in notePositions as all belong to this component. */
1143}
1144
1145vector<PositionInfo> UTF8ComponentClass::first(void) const {
1146 vector<PositionInfo> rv;
1147 if (single_pos != GlushkovBuildState::POS_UNINITIALIZED) {
1148 rv.push_back(single_pos);
1149 }
1150 if (two_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) {
1151 rv.push_back(two_char_dot_head);
1152 }
1153 if (three_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) {
1154 rv.push_back(three_char_dot_head);
1155 }
1156 if (four_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) {
1157 rv.push_back(four_char_dot_head);
1158 }
1159
1160 for (auto it = heads.begin(); it != heads.end(); ++it) {
1161 rv.push_back(it->second);
1162 }
1163 return rv;
1164}
1165
1166vector<PositionInfo> UTF8ComponentClass::last(void) const {
1167 vector<PositionInfo> rv;
1168
1169 rv.insert(rv.end(), tails.begin(), tails.end());
1170 return rv;
1171}
1172
1173} // namespace ue2
1174