1 | /* |
2 | * Copyright (c) 2015, Intel Corporation |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are met: |
6 | * |
7 | * * Redistributions of source code must retain the above copyright notice, |
8 | * this list of conditions and the following disclaimer. |
9 | * * Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * * Neither the name of Intel Corporation nor the names of its contributors |
13 | * may be used to endorse or promote products derived from this software |
14 | * without specific prior written permission. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ |
28 | |
29 | /** \file |
30 | * \brief Character class in UTF-8 mode. |
31 | */ |
32 | |
33 | |
34 | #include "Utf8ComponentClass.h" |
35 | |
36 | #include "buildstate.h" |
37 | #include "Parser.h" |
38 | #include "parse_error.h" |
39 | #include "position.h" |
40 | #include "position_info.h" |
41 | #include "nfagraph/ng_builder.h" |
42 | #include "util/compare.h" |
43 | #include "util/unicode_def.h" |
44 | |
45 | #include <cstring> |
46 | |
47 | #include "ucp_table.h" |
48 | |
49 | using namespace std; |
50 | |
51 | namespace ue2 { |
52 | |
53 | PredefinedClass translateForUcpMode(PredefinedClass in, const ParseMode &mode) { |
54 | /* Note: the mapping used here for mapping posix character classes |
55 | * matches the observed behaviour of PCRE (lower and upper going to \p{L} |
56 | * is not documented by pcre). |
57 | * |
58 | * Note: this mapping is quite different from both of the mappings |
59 | * recommended in the unicode regex tech report (TR-18) appendix C |
60 | */ |
61 | switch (in) { |
62 | case CLASS_ALNUM: |
63 | return CLASS_UCP_XAN; |
64 | case CLASS_ALPHA: |
65 | return CLASS_UCP_L; |
66 | case CLASS_BLANK: |
67 | return CLASS_HORZ; |
68 | case CLASS_DIGIT: |
69 | return CLASS_UCP_ND; |
70 | case CLASS_GRAPH: |
71 | return CLASS_XGRAPH; |
72 | case CLASS_LOWER: |
73 | if (mode.caseless) { /* we also pick up uppercase titlecase and others */ |
74 | return CLASS_UCP_L; |
75 | } else { |
76 | return CLASS_UCP_LL; |
77 | } |
78 | case CLASS_PRINT: |
79 | return CLASS_XPRINT; |
80 | case CLASS_PUNCT: |
81 | return CLASS_XPUNCT; |
82 | case CLASS_SPACE: |
83 | return CLASS_UCP_XPS; |
84 | case CLASS_UPPER: |
85 | if (mode.caseless) { /* we also pick up lowercase titlecase and others */ |
86 | return CLASS_UCP_L; |
87 | } else { |
88 | return CLASS_UCP_LU; |
89 | } |
90 | case CLASS_WORD: |
91 | return CLASS_UCP_XWD; |
92 | default: |
93 | return in; |
94 | } |
95 | } |
96 | |
97 | CodePointSet getPredefinedCodePointSet(PredefinedClass c, |
98 | const ParseMode &mode) { |
99 | /* TODO: support properly PCRE_UCP mode and non PCRE_UCP mode */ |
100 | switch (c) { |
101 | case CLASS_ANY: |
102 | if (mode.dotall) { |
103 | return CodePointSet(CodePointSet::interval(0, MAX_UNICODE)); |
104 | } else { |
105 | CodePointSet rv; |
106 | rv.set('\n'); |
107 | rv.flip(); |
108 | return rv; |
109 | } |
110 | case CLASS_XGRAPH: { |
111 | CodePointSet rv; |
112 | rv = getUcpZ(); |
113 | rv |= getUcpC(); |
114 | rv.flip(); |
115 | // most of Cf, except for ... |
116 | CodePointSet cf = getUcpCf(); |
117 | cf.unset(0x061c); |
118 | cf.unset(0x180e); |
119 | cf.unsetRange(0x2066, 0x2069); |
120 | rv |= cf; |
121 | return rv; |
122 | } |
123 | case CLASS_XPRINT: { |
124 | // Same as graph, plus everything with the Zs property. |
125 | CodePointSet rv = getPredefinedCodePointSet(CLASS_XGRAPH, mode); |
126 | rv |= getUcpZs(); |
127 | rv.set(0x180e); // Also included in this class by PCRE 8.38. |
128 | return rv; |
129 | } |
130 | case CLASS_XPUNCT: { |
131 | // Everything with the P (punctuation) property, plus code points in S |
132 | // (symbols) that are < 128. |
133 | CodePointSet rv = getUcpP(); |
134 | CodePointSet symbols = getUcpS(); |
135 | symbols.unsetRange(128, MAX_UNICODE); |
136 | rv |= symbols; |
137 | return rv; |
138 | } |
139 | case CLASS_HORZ: { |
140 | CodePointSet rv; |
141 | rv.set(0x0009); /* Horizontal tab */ |
142 | rv.set(0x0020); /* Space */ |
143 | rv.set(0x00A0); /* Non-break space */ |
144 | rv.set(0x1680); /* Ogham space mark */ |
145 | rv.set(0x180E); /* Mongolian vowel separator */ |
146 | rv.set(0x2000); /* En quad */ |
147 | rv.set(0x2001); /* Em quad */ |
148 | rv.set(0x2002); /* En space */ |
149 | rv.set(0x2003); /* Em space */ |
150 | rv.set(0x2004); /* Three-per-em space */ |
151 | rv.set(0x2005); /* Four-per-em space */ |
152 | rv.set(0x2006); /* Six-per-em space */ |
153 | rv.set(0x2007); /* Figure space */ |
154 | rv.set(0x2008); /* Punctuation space */ |
155 | rv.set(0x2009); /* Thin space */ |
156 | rv.set(0x200A); /* Hair space */ |
157 | rv.set(0x202F); /* Narrow no-break space */ |
158 | rv.set(0x205F); /* Medium mathematical space */ |
159 | rv.set(0x3000); /* Ideographic space */ |
160 | return rv; |
161 | } |
162 | case CLASS_VERT: { |
163 | CodePointSet rv; |
164 | rv.set(0x000A); /* Linefeed */ |
165 | rv.set(0x000B); /* Vertical tab */ |
166 | rv.set(0x000C); /* Formfeed */ |
167 | rv.set(0x000D); /* Carriage return */ |
168 | rv.set(0x0085); /* Next line */ |
169 | rv.set(0x2028); /* Line separator */ |
170 | rv.set(0x2029); /* Paragraph separator */ |
171 | return rv; |
172 | } |
173 | case CLASS_UCP_XPS: |
174 | case CLASS_UCP_XSP: { |
175 | CodePointSet rv; |
176 | rv.set(0x0009); /* Horizontal tab */ |
177 | rv.set(0x0020); /* Space */ |
178 | rv.set(0x00A0); /* Non-break space */ |
179 | rv.set(0x1680); /* Ogham space mark */ |
180 | rv.set(0x180E); /* Mongolian vowel separator */ |
181 | rv.set(0x2000); /* En quad */ |
182 | rv.set(0x2001); /* Em quad */ |
183 | rv.set(0x2002); /* En space */ |
184 | rv.set(0x2003); /* Em space */ |
185 | rv.set(0x2004); /* Three-per-em space */ |
186 | rv.set(0x2005); /* Four-per-em space */ |
187 | rv.set(0x2006); /* Six-per-em space */ |
188 | rv.set(0x2007); /* Figure space */ |
189 | rv.set(0x2008); /* Punctuation space */ |
190 | rv.set(0x2009); /* Thin space */ |
191 | rv.set(0x200A); /* Hair space */ |
192 | rv.set(0x202F); /* Narrow no-break space */ |
193 | rv.set(0x205F); /* Medium mathematical space */ |
194 | rv.set(0x3000); /* Ideographic space */ |
195 | rv.set(0x000A); /* Linefeed */ |
196 | rv.set(0x000B); /* Vertical tab */ |
197 | rv.set(0x000C); /* Formfeed */ |
198 | rv.set(0x000D); /* Carriage return */ |
199 | rv.set(0x0085); /* Next line */ |
200 | rv.set(0x2028); /* Line separator */ |
201 | rv.set(0x2029); /* Paragraph separator */ |
202 | return rv; |
203 | } |
204 | case CLASS_UCP_C: |
205 | return getUcpC(); |
206 | case CLASS_UCP_CC: |
207 | return getUcpCc(); |
208 | case CLASS_UCP_CF: |
209 | return getUcpCf(); |
210 | case CLASS_UCP_CN: |
211 | return getUcpCn(); |
212 | case CLASS_UCP_CO: |
213 | return getUcpCo(); |
214 | case CLASS_UCP_CS: |
215 | return getUcpCs(); |
216 | case CLASS_UCP_L: |
217 | return getUcpL(); |
218 | case CLASS_UCP_L_AND: |
219 | return getUcpL_and(); |
220 | case CLASS_UCP_LL: |
221 | return getUcpLl(); |
222 | case CLASS_UCP_LM: |
223 | return getUcpLm(); |
224 | case CLASS_UCP_LO: |
225 | return getUcpLo(); |
226 | case CLASS_UCP_LT: |
227 | return getUcpLt(); |
228 | case CLASS_UCP_LU: |
229 | return getUcpLu(); |
230 | case CLASS_UCP_M: |
231 | return getUcpM(); |
232 | case CLASS_UCP_MC: |
233 | return getUcpMc(); |
234 | case CLASS_UCP_ME: |
235 | return getUcpMe(); |
236 | case CLASS_UCP_MN: |
237 | return getUcpMn(); |
238 | case CLASS_UCP_N: |
239 | return getUcpN(); |
240 | case CLASS_UCP_ND: |
241 | return getUcpNd(); |
242 | case CLASS_UCP_NL: |
243 | return getUcpNl(); |
244 | case CLASS_UCP_NO: |
245 | return getUcpNo(); |
246 | case CLASS_UCP_P: |
247 | return getUcpP(); |
248 | case CLASS_UCP_PC: |
249 | return getUcpPc(); |
250 | case CLASS_UCP_PD: |
251 | return getUcpPd(); |
252 | case CLASS_UCP_PE: |
253 | return getUcpPe(); |
254 | case CLASS_UCP_PF: |
255 | return getUcpPf(); |
256 | case CLASS_UCP_PI: |
257 | return getUcpPi(); |
258 | case CLASS_UCP_PO: |
259 | return getUcpPo(); |
260 | case CLASS_UCP_PS: |
261 | return getUcpPs(); |
262 | case CLASS_UCP_S: |
263 | return getUcpS(); |
264 | case CLASS_UCP_SC: |
265 | return getUcpSc(); |
266 | case CLASS_UCP_SK: |
267 | return getUcpSk(); |
268 | case CLASS_UCP_SM: |
269 | return getUcpSm(); |
270 | case CLASS_UCP_SO: |
271 | return getUcpSo(); |
272 | case CLASS_UCP_XAN: |
273 | return getUcpXan(); |
274 | case CLASS_UCP_XWD: |
275 | return getUcpXwd(); |
276 | case CLASS_UCP_Z: |
277 | return getUcpZ(); |
278 | case CLASS_UCP_ZL: |
279 | return getUcpZl(); |
280 | case CLASS_UCP_ZP: |
281 | return getUcpZp(); |
282 | case CLASS_UCP_ZS: |
283 | return getUcpZs(); |
284 | case CLASS_SCRIPT_ARABIC: |
285 | return getUcpArabic(); |
286 | case CLASS_SCRIPT_ARMENIAN: |
287 | return getUcpArmenian(); |
288 | case CLASS_SCRIPT_AVESTAN: |
289 | return getUcpAvestan(); |
290 | case CLASS_SCRIPT_BALINESE: |
291 | return getUcpBalinese(); |
292 | case CLASS_SCRIPT_BAMUM: |
293 | return getUcpBamum(); |
294 | case CLASS_SCRIPT_BATAK: |
295 | return getUcpBatak(); |
296 | case CLASS_SCRIPT_BENGALI: |
297 | return getUcpBengali(); |
298 | case CLASS_SCRIPT_BOPOMOFO: |
299 | return getUcpBopomofo(); |
300 | case CLASS_SCRIPT_BRAHMI: |
301 | return getUcpBrahmi(); |
302 | case CLASS_SCRIPT_BRAILLE: |
303 | return getUcpBraille(); |
304 | case CLASS_SCRIPT_BUGINESE: |
305 | return getUcpBuginese(); |
306 | case CLASS_SCRIPT_BUHID: |
307 | return getUcpBuhid(); |
308 | case CLASS_SCRIPT_CANADIAN_ABORIGINAL: |
309 | return getUcpCanadian_Aboriginal(); |
310 | case CLASS_SCRIPT_CARIAN: |
311 | return getUcpCarian(); |
312 | case CLASS_SCRIPT_CHAM: |
313 | return getUcpCham(); |
314 | case CLASS_SCRIPT_CHEROKEE: |
315 | return getUcpCherokee(); |
316 | case CLASS_SCRIPT_COMMON: |
317 | return getUcpCommon(); |
318 | case CLASS_SCRIPT_COPTIC: |
319 | return getUcpCoptic(); |
320 | case CLASS_SCRIPT_CUNEIFORM: |
321 | return getUcpCuneiform(); |
322 | case CLASS_SCRIPT_CYPRIOT: |
323 | return getUcpCypriot(); |
324 | case CLASS_SCRIPT_CYRILLIC: |
325 | return getUcpCyrillic(); |
326 | case CLASS_SCRIPT_DESERET: |
327 | return getUcpDeseret(); |
328 | case CLASS_SCRIPT_DEVANAGARI: |
329 | return getUcpDevanagari(); |
330 | case CLASS_SCRIPT_EGYPTIAN_HIEROGLYPHS: |
331 | return getUcpEgyptian_Hieroglyphs(); |
332 | case CLASS_SCRIPT_ETHIOPIC: |
333 | return getUcpEthiopic(); |
334 | case CLASS_SCRIPT_GEORGIAN: |
335 | return getUcpGeorgian(); |
336 | case CLASS_SCRIPT_GLAGOLITIC: |
337 | return getUcpGlagolitic(); |
338 | case CLASS_SCRIPT_GOTHIC: |
339 | return getUcpGothic(); |
340 | case CLASS_SCRIPT_GREEK: |
341 | return getUcpGreek(); |
342 | case CLASS_SCRIPT_GUJARATI: |
343 | return getUcpGujarati(); |
344 | case CLASS_SCRIPT_GURMUKHI: |
345 | return getUcpGurmukhi(); |
346 | case CLASS_SCRIPT_HAN: |
347 | return getUcpHan(); |
348 | case CLASS_SCRIPT_HANGUL: |
349 | return getUcpHangul(); |
350 | case CLASS_SCRIPT_HANUNOO: |
351 | return getUcpHanunoo(); |
352 | case CLASS_SCRIPT_HEBREW: |
353 | return getUcpHebrew(); |
354 | case CLASS_SCRIPT_HIRAGANA: |
355 | return getUcpHiragana(); |
356 | case CLASS_SCRIPT_IMPERIAL_ARAMAIC: |
357 | return getUcpImperial_Aramaic(); |
358 | case CLASS_SCRIPT_INHERITED: |
359 | return getUcpInherited(); |
360 | case CLASS_SCRIPT_INSCRIPTIONAL_PAHLAVI: |
361 | return getUcpInscriptional_Pahlavi(); |
362 | case CLASS_SCRIPT_INSCRIPTIONAL_PARTHIAN: |
363 | return getUcpInscriptional_Parthian(); |
364 | case CLASS_SCRIPT_JAVANESE: |
365 | return getUcpJavanese(); |
366 | case CLASS_SCRIPT_KAITHI: |
367 | return getUcpKaithi(); |
368 | case CLASS_SCRIPT_KANNADA: |
369 | return getUcpKannada(); |
370 | case CLASS_SCRIPT_KATAKANA: |
371 | return getUcpKatakana(); |
372 | case CLASS_SCRIPT_KAYAH_LI: |
373 | return getUcpKayah_Li(); |
374 | case CLASS_SCRIPT_KHAROSHTHI: |
375 | return getUcpKharoshthi(); |
376 | case CLASS_SCRIPT_KHMER: |
377 | return getUcpKhmer(); |
378 | case CLASS_SCRIPT_LAO: |
379 | return getUcpLao(); |
380 | case CLASS_SCRIPT_LATIN: |
381 | return getUcpLatin(); |
382 | case CLASS_SCRIPT_LEPCHA: |
383 | return getUcpLepcha(); |
384 | case CLASS_SCRIPT_LIMBU: |
385 | return getUcpLimbu(); |
386 | case CLASS_SCRIPT_LINEAR_B: |
387 | return getUcpLinear_B(); |
388 | case CLASS_SCRIPT_LISU: |
389 | return getUcpLisu(); |
390 | case CLASS_SCRIPT_LYCIAN: |
391 | return getUcpLycian(); |
392 | case CLASS_SCRIPT_LYDIAN: |
393 | return getUcpLydian(); |
394 | case CLASS_SCRIPT_MALAYALAM: |
395 | return getUcpMalayalam(); |
396 | case CLASS_SCRIPT_MANDAIC: |
397 | return getUcpMandaic(); |
398 | case CLASS_SCRIPT_MEETEI_MAYEK: |
399 | return getUcpMeetei_Mayek(); |
400 | case CLASS_SCRIPT_MONGOLIAN: |
401 | return getUcpMongolian(); |
402 | case CLASS_SCRIPT_MYANMAR: |
403 | return getUcpMyanmar(); |
404 | case CLASS_SCRIPT_NEW_TAI_LUE: |
405 | return getUcpNew_Tai_Lue(); |
406 | case CLASS_SCRIPT_NKO: |
407 | return getUcpNko(); |
408 | case CLASS_SCRIPT_OGHAM: |
409 | return getUcpOgham(); |
410 | case CLASS_SCRIPT_OL_CHIKI: |
411 | return getUcpOl_Chiki(); |
412 | case CLASS_SCRIPT_OLD_ITALIC: |
413 | return getUcpOld_Italic(); |
414 | case CLASS_SCRIPT_OLD_PERSIAN: |
415 | return getUcpOld_Persian(); |
416 | case CLASS_SCRIPT_OLD_SOUTH_ARABIAN: |
417 | return getUcpOld_South_Arabian(); |
418 | case CLASS_SCRIPT_OLD_TURKIC: |
419 | return getUcpOld_Turkic(); |
420 | case CLASS_SCRIPT_ORIYA: |
421 | return getUcpOriya(); |
422 | case CLASS_SCRIPT_OSMANYA: |
423 | return getUcpOsmanya(); |
424 | case CLASS_SCRIPT_PHAGS_PA: |
425 | return getUcpPhags_Pa(); |
426 | case CLASS_SCRIPT_PHOENICIAN: |
427 | return getUcpPhoenician(); |
428 | case CLASS_SCRIPT_REJANG: |
429 | return getUcpRejang(); |
430 | case CLASS_SCRIPT_RUNIC: |
431 | return getUcpRunic(); |
432 | case CLASS_SCRIPT_SAMARITAN: |
433 | return getUcpSamaritan(); |
434 | case CLASS_SCRIPT_SAURASHTRA: |
435 | return getUcpSaurashtra(); |
436 | case CLASS_SCRIPT_SHAVIAN: |
437 | return getUcpShavian(); |
438 | case CLASS_SCRIPT_SINHALA: |
439 | return getUcpSinhala(); |
440 | case CLASS_SCRIPT_SUNDANESE: |
441 | return getUcpSundanese(); |
442 | case CLASS_SCRIPT_SYLOTI_NAGRI: |
443 | return getUcpSyloti_Nagri(); |
444 | case CLASS_SCRIPT_SYRIAC: |
445 | return getUcpSyriac(); |
446 | case CLASS_SCRIPT_TAGALOG: |
447 | return getUcpTagalog(); |
448 | case CLASS_SCRIPT_TAGBANWA: |
449 | return getUcpTagbanwa(); |
450 | case CLASS_SCRIPT_TAI_LE: |
451 | return getUcpTai_Le(); |
452 | case CLASS_SCRIPT_TAI_THAM: |
453 | return getUcpTai_Tham(); |
454 | case CLASS_SCRIPT_TAI_VIET: |
455 | return getUcpTai_Viet(); |
456 | case CLASS_SCRIPT_TAMIL: |
457 | return getUcpTamil(); |
458 | case CLASS_SCRIPT_TELUGU: |
459 | return getUcpTelugu(); |
460 | case CLASS_SCRIPT_THAANA: |
461 | return getUcpThaana(); |
462 | case CLASS_SCRIPT_THAI: |
463 | return getUcpThai(); |
464 | case CLASS_SCRIPT_TIBETAN: |
465 | return getUcpTibetan(); |
466 | case CLASS_SCRIPT_TIFINAGH: |
467 | return getUcpTifinagh(); |
468 | case CLASS_SCRIPT_UGARITIC: |
469 | return getUcpUgaritic(); |
470 | case CLASS_SCRIPT_VAI: |
471 | return getUcpVai(); |
472 | case CLASS_SCRIPT_YI: |
473 | return getUcpYi(); |
474 | case CLASS_UCP_ANY: |
475 | return CodePointSet(CodePointSet::interval(0, MAX_UNICODE)); |
476 | |
477 | default: { /* currently uses ascii defns */ |
478 | CharReach cr = getPredefinedCharReach(c, mode); |
479 | CodePointSet rv; |
480 | for (u32 i = cr.find_first(); i != CharReach::npos; |
481 | i = cr.find_next(i)) { |
482 | rv.set(i); |
483 | } |
484 | return rv; |
485 | } |
486 | } |
487 | } |
488 | |
489 | UTF8ComponentClass::UTF8ComponentClass(const ParseMode &mode_in) |
490 | : ComponentClass(mode_in), |
491 | single_pos( GlushkovBuildState::POS_UNINITIALIZED), |
492 | one_dot_trailer( GlushkovBuildState::POS_UNINITIALIZED), |
493 | two_dot_trailer( GlushkovBuildState::POS_UNINITIALIZED), |
494 | three_dot_trailer( GlushkovBuildState::POS_UNINITIALIZED), |
495 | two_char_dot_head( GlushkovBuildState::POS_UNINITIALIZED), |
496 | three_char_dot_head(GlushkovBuildState::POS_UNINITIALIZED), |
497 | four_char_dot_head( GlushkovBuildState::POS_UNINITIALIZED) { |
498 | assert(mode.utf8); |
499 | } |
500 | |
501 | UTF8ComponentClass *UTF8ComponentClass::clone() const { |
502 | return new UTF8ComponentClass(*this); |
503 | } |
504 | |
505 | bool UTF8ComponentClass::class_empty(void) const { |
506 | assert(finalized); |
507 | return cps.none(); |
508 | } |
509 | |
510 | void UTF8ComponentClass::createRange(unichar to) { |
511 | assert(range_start != INVALID_UNICODE); |
512 | unichar from = range_start; |
513 | if (from > to) { |
514 | throw LocatedParseError("Range out of order in character class" ); |
515 | } |
516 | |
517 | in_cand_range = false; |
518 | CodePointSet ncps; |
519 | ncps.setRange(from, to); |
520 | if (mode.caseless) { |
521 | make_caseless(&ncps); |
522 | } |
523 | cps |= ncps; |
524 | range_start = INVALID_UNICODE; |
525 | } |
526 | |
527 | void UTF8ComponentClass::add(PredefinedClass c, bool negative) { |
528 | if (in_cand_range) { // can't form a range here |
529 | throw LocatedParseError("Invalid range in character class" ); |
530 | } |
531 | |
532 | if (mode.ucp) { |
533 | c = translateForUcpMode(c, mode); |
534 | } |
535 | |
536 | // caselessness is handled inside this call - don't apply make_caseless |
537 | // to the result |
538 | CodePointSet pcps = getPredefinedCodePointSet(c, mode); |
539 | if (negative) { |
540 | pcps.flip(); |
541 | } |
542 | |
543 | cps |= pcps; |
544 | |
545 | range_start = INVALID_UNICODE; |
546 | in_cand_range = false; |
547 | } |
548 | |
549 | void UTF8ComponentClass::add(unichar c) { |
550 | DEBUG_PRINTF("adding \\x%08x\n" , c); |
551 | if (c > MAX_UNICODE) { // too big! |
552 | throw LocatedParseError("Hexadecimal value is greater than \\x10FFFF" ); |
553 | } |
554 | |
555 | if (in_cand_range) { |
556 | createRange(c); |
557 | return; |
558 | } |
559 | |
560 | CodePointSet ncps; |
561 | ncps.set(c); |
562 | if (mode.caseless) { |
563 | make_caseless(&ncps); |
564 | } |
565 | cps |= ncps; |
566 | range_start = c; |
567 | } |
568 | |
569 | void UTF8ComponentClass::finalize() { |
570 | if (finalized) { |
571 | return; |
572 | } |
573 | |
574 | // Handle unclosed ranges, like '[a-]' and '[a-\Q\E]' -- in these cases the |
575 | // dash is a literal dash. |
576 | if (in_cand_range) { |
577 | cps.set('-'); |
578 | in_cand_range = false; |
579 | } |
580 | |
581 | if (m_negate) { |
582 | cps.flip(); |
583 | } |
584 | |
585 | finalized = true; |
586 | } |
587 | |
588 | Position UTF8ComponentClass::(NFABuilder &builder, u8 first_byte) { |
589 | map<u8, Position>::const_iterator it = heads.find(first_byte); |
590 | if (it != heads.end()) { |
591 | return it->second; |
592 | } |
593 | |
594 | Position head = builder.makePositions(1); |
595 | assert(heads.find(first_byte) == heads.end()); |
596 | builder.addCharReach(head, CharReach(first_byte)); |
597 | /* no report id as head can not be directly wired to accept */ |
598 | |
599 | heads[first_byte] = head; |
600 | return head; |
601 | } |
602 | |
603 | void UTF8ComponentClass::ensureDotTrailer(GlushkovBuildState &bs) { |
604 | NFABuilder &builder = bs.getBuilder(); |
605 | if (one_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) { |
606 | return; |
607 | } |
608 | |
609 | one_dot_trailer = builder.makePositions(1); |
610 | builder.setNodeReportID(one_dot_trailer, 0); |
611 | builder.addCharReach(one_dot_trailer, CharReach(0x80, 0xbf)); |
612 | tails.insert(one_dot_trailer); |
613 | } |
614 | |
615 | void UTF8ComponentClass::ensureTwoDotTrailer(GlushkovBuildState &bs) { |
616 | NFABuilder &builder = bs.getBuilder(); |
617 | if (two_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) { |
618 | return; |
619 | } |
620 | |
621 | ensureDotTrailer(bs); |
622 | |
623 | two_dot_trailer = builder.makePositions(1); |
624 | builder.addCharReach(two_dot_trailer, CharReach(0x80, 0xbf)); |
625 | bs.addSuccessor(two_dot_trailer, one_dot_trailer); |
626 | } |
627 | |
628 | void UTF8ComponentClass::ensureThreeDotTrailer(GlushkovBuildState &bs) { |
629 | NFABuilder &builder = bs.getBuilder(); |
630 | if (three_dot_trailer != GlushkovBuildState::POS_UNINITIALIZED) { |
631 | return; |
632 | } |
633 | |
634 | ensureTwoDotTrailer(bs); |
635 | |
636 | three_dot_trailer = builder.makePositions(1); |
637 | builder.addCharReach(three_dot_trailer, CharReach(0x80, 0xbf)); |
638 | bs.addSuccessor(three_dot_trailer, two_dot_trailer); |
639 | } |
640 | |
641 | void UTF8ComponentClass::buildOneByte(GlushkovBuildState &bs) { |
642 | NFABuilder &builder = bs.getBuilder(); |
643 | for (CodePointSet::const_iterator it = cps.begin(); it != cps.end(); ++it) { |
644 | unichar b = lower(*it); |
645 | unichar e = upper(*it) + 1; |
646 | if (b >= UTF_2CHAR_MIN) { |
647 | continue; |
648 | } |
649 | |
650 | DEBUG_PRINTF("building vertices for [%u, %u)\n" , b, e); |
651 | |
652 | if (single_pos == GlushkovBuildState::POS_UNINITIALIZED) { |
653 | single_pos = builder.makePositions(1); |
654 | builder.setNodeReportID(single_pos, 0 /* offset adj */); |
655 | tails.insert(single_pos); |
656 | } |
657 | CharReach cr(b, MIN(e, UTF_2CHAR_MIN) - 1); |
658 | builder.addCharReach(single_pos, cr); |
659 | } |
660 | } |
661 | |
662 | void UTF8ComponentClass::addToTail(GlushkovBuildState &bs, |
663 | map<Position, Position> &finals, |
664 | Position prev, unichar b, unichar e) { |
665 | NFABuilder &builder = bs.getBuilder(); |
666 | Position tail; |
667 | if (finals.find(prev) == finals.end()) { |
668 | tail = builder.makePositions(1); |
669 | builder.setNodeReportID(tail, 0 /* offset adj */); |
670 | bs.addSuccessor(prev, tail); |
671 | finals[prev] = tail; |
672 | tails.insert(tail); |
673 | } else { |
674 | tail = finals[prev]; |
675 | } |
676 | |
677 | u8 bb = makeContByte(b); |
678 | u8 ee = makeContByte(e - 1); |
679 | builder.addCharReach(tail, CharReach(bb, ee)); |
680 | } |
681 | |
682 | void UTF8ComponentClass::buildTwoByte(GlushkovBuildState &bs) { |
683 | NFABuilder &builder = bs.getBuilder(); |
684 | map<Position, Position> finals; |
685 | |
686 | for (auto it = cps.begin(); it != cps.end(); ++it) { |
687 | unichar b = lower(*it); |
688 | unichar e = upper(*it) + 1; |
689 | |
690 | b = MAX(b, UTF_2CHAR_MIN); |
691 | e = MIN(e, UTF_3CHAR_MIN); |
692 | |
693 | if (b >= e) { |
694 | continue; /* we're done here */ |
695 | } |
696 | |
697 | /* raise b to the start of the next tail byte boundary */ |
698 | if (b & UTF_CONT_BYTE_VALUE_MASK) { |
699 | unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE)); |
700 | u8 first_byte = UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT); |
701 | assert(first_byte > 0xc1 && first_byte <= 0xdf); |
702 | |
703 | Position head = getHead(builder, first_byte); |
704 | addToTail(bs, finals, head, b, bb); |
705 | |
706 | b = bb; |
707 | } |
708 | |
709 | if (b == e) { |
710 | continue; /* we're done here */ |
711 | } |
712 | assert(b < e); |
713 | |
714 | /* lower e to the end of a tail byte boundary */ |
715 | if (e & UTF_CONT_BYTE_VALUE_MASK) { |
716 | unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK; |
717 | assert(ee >= b); |
718 | |
719 | u8 first_byte = UTF_TWO_BYTE_HEADER | (ee >> UTF_CONT_SHIFT); |
720 | assert(first_byte > 0xc1 && first_byte <= 0xdf); |
721 | |
722 | Position head = getHead(builder, first_byte); |
723 | addToTail(bs, finals, head, ee, e); |
724 | |
725 | e = ee; |
726 | } |
727 | |
728 | if (b == e) { |
729 | continue; /* we're done here */ |
730 | } |
731 | assert(b < e); |
732 | |
733 | /* middle section just goes to a common full vertex */ |
734 | ensureDotTrailer(bs); |
735 | |
736 | if (two_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) { |
737 | two_char_dot_head = builder.makePositions(1); |
738 | bs.addSuccessor(two_char_dot_head, one_dot_trailer); |
739 | } |
740 | |
741 | u8 min_first_byte = UTF_TWO_BYTE_HEADER | (b >> UTF_CONT_SHIFT); |
742 | u8 max_first_byte = UTF_TWO_BYTE_HEADER | ((e - 1) >> UTF_CONT_SHIFT); |
743 | |
744 | assert(min_first_byte > 0xc1 && min_first_byte <= 0xdf); |
745 | assert(max_first_byte > 0xc1 && max_first_byte <= 0xdf); |
746 | |
747 | builder.addCharReach(two_char_dot_head, |
748 | CharReach(min_first_byte, max_first_byte)); |
749 | } |
750 | } |
751 | |
752 | static |
753 | Position getMid(GlushkovBuildState &bs, map<Position, map<u8, Position> > &mids, |
754 | const Position &prev, u8 byte_val) { |
755 | NFABuilder &builder = bs.getBuilder(); |
756 | map<u8, Position> &by_byte = mids[prev]; |
757 | |
758 | map<u8, Position>::const_iterator it = by_byte.find(byte_val); |
759 | if (it != by_byte.end()) { |
760 | return it->second; |
761 | } |
762 | |
763 | Position mid = builder.makePositions(1); |
764 | builder.addCharReach(mid, CharReach(byte_val)); |
765 | bs.addSuccessor(prev, mid); |
766 | /* no report id as mid can not be directly wired to accept */ |
767 | |
768 | by_byte[byte_val] = mid; |
769 | return mid; |
770 | } |
771 | |
772 | void UTF8ComponentClass::buildThreeByte(GlushkovBuildState &bs) { |
773 | NFABuilder &builder = bs.getBuilder(); |
774 | |
775 | map<Position, map<u8, Position> > mids; |
776 | map<Position, Position> finals; |
777 | |
778 | for (auto it = cps.begin(); it != cps.end(); ++it) { |
779 | unichar b = lower(*it); |
780 | unichar e = upper(*it) + 1; |
781 | |
782 | b = MAX(b, UTF_3CHAR_MIN); |
783 | e = MIN(e, UTF_4CHAR_MIN); |
784 | |
785 | if (b >= e) { |
786 | continue; /* we're done here */ |
787 | } |
788 | |
789 | /* raise b to the start of the next tail byte boundary */ |
790 | if (b & UTF_CONT_BYTE_VALUE_MASK) { |
791 | unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE)); |
792 | |
793 | u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT)); |
794 | assert(first_byte >= 0xe0 && first_byte <= 0xef); |
795 | Position head = getHead(builder, first_byte); |
796 | |
797 | u8 second_byte = makeContByte(b >> UTF_CONT_SHIFT); |
798 | Position mid = getMid(bs, mids, head, second_byte); |
799 | |
800 | addToTail(bs, finals, mid, b, bb); |
801 | |
802 | b = bb; |
803 | } |
804 | |
805 | if (b == e) { |
806 | continue; /* we're done here */ |
807 | } |
808 | assert(b < e); |
809 | |
810 | /* lower e to the end of a tail byte boundary */ |
811 | if (e & UTF_CONT_BYTE_VALUE_MASK) { |
812 | unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK; |
813 | assert(ee >= b); |
814 | |
815 | u8 first_byte = UTF_THREE_BYTE_HEADER |
816 | | (ee >> (2 * UTF_CONT_SHIFT)); |
817 | assert(first_byte >= 0xe0 && first_byte <= 0xef); |
818 | Position head = getHead(builder, first_byte); |
819 | |
820 | u8 second_byte = makeContByte(ee >> UTF_CONT_SHIFT); |
821 | Position mid = getMid(bs, mids, head, second_byte); |
822 | |
823 | addToTail(bs, finals, mid, ee, e); |
824 | |
825 | e = ee; |
826 | } |
827 | |
828 | if (b == e) { |
829 | continue; /* we're done here */ |
830 | } |
831 | assert(b < e); |
832 | |
833 | /* from here on in the last byte is always full */ |
834 | ensureDotTrailer(bs); |
835 | |
836 | /* raise b to the start of the next mid byte boundary */ |
837 | if (b & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) { |
838 | unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT))); |
839 | |
840 | u8 first_byte = UTF_THREE_BYTE_HEADER | (b >> (2 * UTF_CONT_SHIFT)); |
841 | Position head = getHead(builder, first_byte); |
842 | |
843 | Position mid = builder.makePositions(1); |
844 | bs.addSuccessor(head, mid); |
845 | bs.addSuccessor(mid, one_dot_trailer); |
846 | /* no report id as mid can not be directly wired to accept, |
847 | * not adding to mids as we are completely filling its downstream */ |
848 | u8 second_min = makeContByte(b >> UTF_CONT_SHIFT); |
849 | u8 second_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT); |
850 | |
851 | builder.addCharReach(mid, CharReach(second_min, second_max)); |
852 | |
853 | b = bb; |
854 | } |
855 | |
856 | if (b == e) { |
857 | continue; /* we're done here */ |
858 | } |
859 | assert(b < e); |
860 | |
861 | /* lower e to the end of a mid byte boundary */ |
862 | if (e & ((1 << (2 * UTF_CONT_SHIFT)) - 1)) { |
863 | unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1); |
864 | assert(ee >= b); |
865 | |
866 | u8 first_byte = UTF_THREE_BYTE_HEADER |
867 | | (ee >> (2 * UTF_CONT_SHIFT)); |
868 | Position head = getHead(builder, first_byte); |
869 | |
870 | Position mid = builder.makePositions(1); |
871 | bs.addSuccessor(head, mid); |
872 | bs.addSuccessor(mid, one_dot_trailer); |
873 | /* no report id as mid can not be directly wired to accept, |
874 | * not adding to mids as we are completely filling its downstream */ |
875 | u8 second_min = makeContByte(ee >> UTF_CONT_SHIFT); |
876 | u8 second_max = makeContByte((e - 1) >> UTF_CONT_SHIFT); |
877 | |
878 | builder.addCharReach(mid, CharReach(second_min, second_max)); |
879 | |
880 | e = ee; |
881 | } |
882 | |
883 | if (b == e) { |
884 | continue; /* we're done here */ |
885 | } |
886 | assert(b < e); |
887 | |
888 | /* now we just have to wire head to a common dot trailer */ |
889 | ensureTwoDotTrailer(bs); |
890 | if (three_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) { |
891 | three_char_dot_head = builder.makePositions(1); |
892 | bs.addSuccessor(three_char_dot_head, two_dot_trailer); |
893 | } |
894 | |
895 | u8 min_first_byte = UTF_THREE_BYTE_HEADER |
896 | | (b >> (2 * UTF_CONT_SHIFT)); |
897 | u8 max_first_byte = UTF_THREE_BYTE_HEADER |
898 | | ((e - 1) >> (2 * UTF_CONT_SHIFT)); |
899 | |
900 | assert(min_first_byte > 0xdf && min_first_byte <= 0xef); |
901 | assert(max_first_byte > 0xdf && max_first_byte <= 0xef); |
902 | |
903 | builder.addCharReach(three_char_dot_head, |
904 | CharReach(min_first_byte, max_first_byte)); |
905 | } |
906 | } |
907 | |
908 | static |
909 | u8 makeFirstByteOfFour(unichar raw) { |
910 | u8 first_byte = UTF_FOUR_BYTE_HEADER | (raw >> (3 * UTF_CONT_SHIFT)); |
911 | assert(first_byte > 0xef && first_byte <= 0xf7); |
912 | return first_byte; |
913 | } |
914 | |
915 | static |
916 | bool isTwoContAligned(unichar raw) { |
917 | return !(raw & ((1 << (2 * UTF_CONT_SHIFT)) - 1)); |
918 | } |
919 | |
920 | static |
921 | bool isThreeContAligned(unichar raw) { |
922 | return !(raw & ((1 << (3 * UTF_CONT_SHIFT)) - 1)); |
923 | } |
924 | |
925 | void UTF8ComponentClass::buildFourByte(GlushkovBuildState &bs) { |
926 | NFABuilder &builder = bs.getBuilder(); |
927 | map<Position, map<u8, Position> > mids; |
928 | map<Position, Position> finals; |
929 | |
930 | for (auto it = cps.begin(); it != cps.end(); ++it) { |
931 | unichar b = lower(*it); |
932 | unichar e = upper(*it) + 1; |
933 | |
934 | b = MAX(b, UTF_4CHAR_MIN); |
935 | e = MIN(e, MAX_UNICODE + 1); |
936 | |
937 | if (b >= e) { |
938 | continue; |
939 | } |
940 | |
941 | /* raise b to the start of the next tail byte boundary */ |
942 | if (b & UTF_CONT_BYTE_VALUE_MASK) { |
943 | unichar bb = MIN(e, ROUNDUP_N(b, UTF_CONT_BYTE_RANGE)); |
944 | |
945 | u8 first_byte = makeFirstByteOfFour(b); |
946 | Position head = getHead(builder, first_byte); |
947 | |
948 | u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT)); |
949 | Position mid1 = getMid(bs, mids, head, second_byte); |
950 | |
951 | u8 third_byte = makeContByte(b >> UTF_CONT_SHIFT); |
952 | Position mid2 = getMid(bs, mids, mid1, third_byte); |
953 | |
954 | addToTail(bs, finals, mid2, b, bb); |
955 | |
956 | b = bb; |
957 | } |
958 | |
959 | if (b == e) { |
960 | continue; /* we're done here */ |
961 | } |
962 | assert(b < e); |
963 | |
964 | /* lower e to the end of a tail byte boundary */ |
965 | if (e & UTF_CONT_BYTE_VALUE_MASK) { |
966 | unichar ee = e & ~UTF_CONT_BYTE_VALUE_MASK; |
967 | assert(ee >= b); |
968 | |
969 | u8 first_byte = makeFirstByteOfFour(ee); |
970 | Position head = getHead(builder, first_byte); |
971 | |
972 | u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT)); |
973 | Position mid1 = getMid(bs, mids, head, second_byte); |
974 | |
975 | u8 third_byte = makeContByte(ee >> UTF_CONT_SHIFT); |
976 | Position mid2 = getMid(bs, mids, mid1, third_byte); |
977 | |
978 | addToTail(bs, finals, mid2, ee, e); |
979 | |
980 | e = ee; |
981 | } |
982 | |
983 | if (b == e) { |
984 | continue; /* we're done here */ |
985 | } |
986 | assert(b < e); |
987 | |
988 | /* from here on in the last byte is always full */ |
989 | ensureDotTrailer(bs); |
990 | |
991 | /* raise b to the start of the next mid byte boundary */ |
992 | if (!isTwoContAligned(b)) { |
993 | unichar bb = MIN(e, ROUNDUP_N(b, 1 << (2 * UTF_CONT_SHIFT))); |
994 | |
995 | u8 first_byte = makeFirstByteOfFour(b); |
996 | Position head = getHead(builder, first_byte); |
997 | |
998 | u8 second_byte = makeContByte(b >> (2 * UTF_CONT_SHIFT)); |
999 | Position mid1 = getMid(bs, mids, head, second_byte); |
1000 | |
1001 | Position mid2 = builder.makePositions(1); |
1002 | bs.addSuccessor(mid1, mid2); |
1003 | bs.addSuccessor(mid2, one_dot_trailer); |
1004 | /* no report id as mid can not be directly wired to accept, |
1005 | * not adding to mids as we are completely filling its downstream */ |
1006 | u8 byte_min = makeContByte(b >> UTF_CONT_SHIFT); |
1007 | u8 byte_max = makeContByte((bb - 1) >> UTF_CONT_SHIFT); |
1008 | |
1009 | builder.addCharReach(mid2, CharReach(byte_min, byte_max)); |
1010 | |
1011 | b = bb; |
1012 | } |
1013 | |
1014 | if (b == e) { |
1015 | continue; /* we're done here */ |
1016 | } |
1017 | assert(b < e); |
1018 | |
1019 | /* lower e to the end of a mid byte boundary */ |
1020 | if (!isTwoContAligned(e)) { |
1021 | unichar ee = e & ~((1 << (2 * UTF_CONT_SHIFT)) - 1); |
1022 | assert(ee >= b); |
1023 | |
1024 | u8 first_byte = makeFirstByteOfFour(ee); |
1025 | Position head = getHead(builder, first_byte); |
1026 | |
1027 | u8 second_byte = makeContByte(ee >> (2 * UTF_CONT_SHIFT)); |
1028 | Position mid1 = getMid(bs, mids, head, second_byte); |
1029 | |
1030 | Position mid2 = builder.makePositions(1); |
1031 | bs.addSuccessor(mid1, mid2); |
1032 | bs.addSuccessor(mid2, one_dot_trailer); |
1033 | /* no report id as mid can not be directly wired to accept, |
1034 | * not adding to mids as we are completely filling its downstream */ |
1035 | u8 byte_min = makeContByte(ee >> UTF_CONT_SHIFT); |
1036 | u8 byte_max = makeContByte((e - 1) >> UTF_CONT_SHIFT); |
1037 | |
1038 | builder.addCharReach(mid2, CharReach(byte_min, byte_max)); |
1039 | |
1040 | e = ee; |
1041 | } |
1042 | |
1043 | if (b == e) { |
1044 | continue; /* we're done here */ |
1045 | } |
1046 | assert(b < e); |
1047 | |
1048 | ensureTwoDotTrailer(bs); |
1049 | |
1050 | /* raise b to the next byte boundary */ |
1051 | if (!isThreeContAligned(b)) { |
1052 | unichar bb = MIN(e, ROUNDUP_N(b, 1 << (3 * UTF_CONT_SHIFT))); |
1053 | |
1054 | u8 first_byte = makeFirstByteOfFour(b); |
1055 | Position head = getHead(builder, first_byte); |
1056 | |
1057 | Position mid1 = builder.makePositions(1); |
1058 | bs.addSuccessor(head, mid1); |
1059 | bs.addSuccessor(mid1, two_dot_trailer); |
1060 | /* no report id as mid can not be directly wired to accept, |
1061 | * not adding to mids as we are completely filling its downstream */ |
1062 | u8 byte_min = makeContByte(b >> (2 * UTF_CONT_SHIFT)); |
1063 | u8 byte_max = makeContByte((bb - 1) >> (2 * UTF_CONT_SHIFT)); |
1064 | |
1065 | builder.addCharReach(mid1, CharReach(byte_min, byte_max)); |
1066 | |
1067 | b = bb; |
1068 | } |
1069 | |
1070 | if (b == e) { |
1071 | continue; /* we're done here */ |
1072 | } |
1073 | assert(b < e); |
1074 | |
1075 | /* lower e to the next byte boundary */ |
1076 | if (!isThreeContAligned(e)) { |
1077 | unichar ee = e & ~((1 << (3 * UTF_CONT_SHIFT)) - 1); |
1078 | assert(ee >= b); |
1079 | |
1080 | u8 first_byte = makeFirstByteOfFour(ee); |
1081 | Position head = getHead(builder, first_byte); |
1082 | Position mid1 = builder.makePositions(1); |
1083 | bs.addSuccessor(head, mid1); |
1084 | bs.addSuccessor(mid1, two_dot_trailer); |
1085 | /* no report id as mid can not be directly wired to accept, |
1086 | * not adding to mids as we are completely filling its downstream */ |
1087 | u8 byte_min = makeContByte(ee >> (2 * UTF_CONT_SHIFT)); |
1088 | u8 byte_max = makeContByte((e - 1) >> (2 * UTF_CONT_SHIFT)); |
1089 | |
1090 | builder.addCharReach(mid1, CharReach(byte_min, byte_max)); |
1091 | |
1092 | e = ee; |
1093 | } |
1094 | |
1095 | if (b == e) { |
1096 | continue; /* we're done here */ |
1097 | } |
1098 | assert(b < e); |
1099 | |
1100 | /* now we just have to wire head to a common dot trailer */ |
1101 | ensureThreeDotTrailer(bs); |
1102 | if (four_char_dot_head == GlushkovBuildState::POS_UNINITIALIZED) { |
1103 | four_char_dot_head = builder.makePositions(1); |
1104 | bs.addSuccessor(four_char_dot_head, three_dot_trailer); |
1105 | } |
1106 | |
1107 | u8 min_first_byte = makeFirstByteOfFour(b); |
1108 | u8 max_first_byte = makeFirstByteOfFour(e - 1); |
1109 | |
1110 | builder.addCharReach(four_char_dot_head, |
1111 | CharReach(min_first_byte, max_first_byte)); |
1112 | } |
1113 | } |
1114 | |
1115 | void UTF8ComponentClass::notePositions(GlushkovBuildState &bs) { |
1116 | // We should always be finalized by now. |
1117 | assert(finalized); |
1118 | |
1119 | // An empty class is a special case; this would be generated by something |
1120 | // like /[\s\S]/8, which can never match. We treat these like we do the non |
1121 | // UTF-8 version: add a vertex with empty reach (to ensure we create a |
1122 | // connected graph) and pick it up later on. |
1123 | if (class_empty()) { |
1124 | DEBUG_PRINTF("empty class!\n" ); |
1125 | assert(single_pos == GlushkovBuildState::POS_UNINITIALIZED); |
1126 | NFABuilder &builder = bs.getBuilder(); |
1127 | single_pos = builder.makePositions(1); |
1128 | builder.setNodeReportID(single_pos, 0 /* offset adj */); |
1129 | builder.addCharReach(single_pos, CharReach()); |
1130 | tails.insert(single_pos); |
1131 | return; |
1132 | } |
1133 | |
1134 | buildOneByte(bs); |
1135 | buildTwoByte(bs); |
1136 | buildThreeByte(bs); |
1137 | buildFourByte(bs); |
1138 | } |
1139 | |
1140 | void UTF8ComponentClass::buildFollowSet(GlushkovBuildState &, |
1141 | const vector<PositionInfo> &) { |
1142 | /* states are wired in notePositions as all belong to this component. */ |
1143 | } |
1144 | |
1145 | vector<PositionInfo> UTF8ComponentClass::first(void) const { |
1146 | vector<PositionInfo> rv; |
1147 | if (single_pos != GlushkovBuildState::POS_UNINITIALIZED) { |
1148 | rv.push_back(single_pos); |
1149 | } |
1150 | if (two_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) { |
1151 | rv.push_back(two_char_dot_head); |
1152 | } |
1153 | if (three_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) { |
1154 | rv.push_back(three_char_dot_head); |
1155 | } |
1156 | if (four_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) { |
1157 | rv.push_back(four_char_dot_head); |
1158 | } |
1159 | |
1160 | for (auto it = heads.begin(); it != heads.end(); ++it) { |
1161 | rv.push_back(it->second); |
1162 | } |
1163 | return rv; |
1164 | } |
1165 | |
1166 | vector<PositionInfo> UTF8ComponentClass::last(void) const { |
1167 | vector<PositionInfo> rv; |
1168 | |
1169 | rv.insert(rv.end(), tails.begin(), tails.end()); |
1170 | return rv; |
1171 | } |
1172 | |
1173 | } // namespace ue2 |
1174 | |