1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#include "vm/regexp_parser.h"
6
7#include "unicode/uchar.h"
8#include "unicode/uniset.h"
9
10#include "platform/unicode.h"
11
12#include "vm/longjump.h"
13#include "vm/object_store.h"
14
15namespace dart {
16
17#define Z zone()
18
19// Enables possessive quantifier syntax for testing.
20static const bool FLAG_regexp_possessive_quantifier = false;
21
22RegExpBuilder::RegExpBuilder(RegExpFlags flags)
23 : zone_(Thread::Current()->zone()),
24 pending_empty_(false),
25 flags_(flags),
26 characters_(NULL),
27 pending_surrogate_(kNoPendingSurrogate),
28 terms_(),
29 text_(),
30 alternatives_()
31#ifdef DEBUG
32 ,
33 last_added_(ADD_NONE)
34#endif
35{
36}
37
38void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) {
39 ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
40 FlushPendingSurrogate();
41 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
42 pending_surrogate_ = lead_surrogate;
43}
44
45void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) {
46 ASSERT(Utf16::IsTrailSurrogate(trail_surrogate));
47 if (pending_surrogate_ != kNoPendingSurrogate) {
48 uint16_t lead_surrogate = pending_surrogate_;
49 pending_surrogate_ = kNoPendingSurrogate;
50 ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
51 uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate);
52 if (NeedsDesugaringForIgnoreCase(combined)) {
53 AddCharacterClassForDesugaring(combined);
54 } else {
55 auto surrogate_pair = new (Z) ZoneGrowableArray<uint16_t>(2);
56 surrogate_pair->Add(lead_surrogate);
57 surrogate_pair->Add(trail_surrogate);
58 RegExpAtom* atom = new (Z) RegExpAtom(surrogate_pair, flags_);
59 AddAtom(atom);
60 }
61 } else {
62 pending_surrogate_ = trail_surrogate;
63 FlushPendingSurrogate();
64 }
65}
66
67void RegExpBuilder::FlushPendingSurrogate() {
68 if (pending_surrogate_ != kNoPendingSurrogate) {
69 ASSERT(is_unicode());
70 uint32_t c = pending_surrogate_;
71 pending_surrogate_ = kNoPendingSurrogate;
72 AddCharacterClassForDesugaring(c);
73 }
74}
75
76void RegExpBuilder::FlushCharacters() {
77 FlushPendingSurrogate();
78 pending_empty_ = false;
79 if (characters_ != NULL) {
80 RegExpTree* atom = new (Z) RegExpAtom(characters_, flags_);
81 characters_ = NULL;
82 text_.Add(atom);
83 LAST(ADD_ATOM);
84 }
85}
86
87void RegExpBuilder::FlushText() {
88 FlushCharacters();
89 intptr_t num_text = text_.length();
90 if (num_text == 0) {
91 return;
92 } else if (num_text == 1) {
93 terms_.Add(text_.Last());
94 } else {
95 RegExpText* text = new (Z) RegExpText();
96 for (intptr_t i = 0; i < num_text; i++)
97 text_[i]->AppendToText(text);
98 terms_.Add(text);
99 }
100 text_.Clear();
101}
102
103void RegExpBuilder::AddCharacter(uint16_t c) {
104 FlushPendingSurrogate();
105 pending_empty_ = false;
106 if (NeedsDesugaringForIgnoreCase(c)) {
107 AddCharacterClassForDesugaring(c);
108 } else {
109 if (characters_ == NULL) {
110 characters_ = new (Z) ZoneGrowableArray<uint16_t>(4);
111 }
112 characters_->Add(c);
113 LAST(ADD_CHAR);
114 }
115}
116
117void RegExpBuilder::AddUnicodeCharacter(uint32_t c) {
118 if (c > static_cast<uint32_t>(Utf16::kMaxCodeUnit)) {
119 ASSERT(is_unicode());
120 uint16_t surrogates[2];
121 Utf16::Encode(c, surrogates);
122 AddLeadSurrogate(surrogates[0]);
123 AddTrailSurrogate(surrogates[1]);
124 } else if (is_unicode() && Utf16::IsLeadSurrogate(c)) {
125 AddLeadSurrogate(c);
126 } else if (is_unicode() && Utf16::IsTrailSurrogate(c)) {
127 AddTrailSurrogate(c);
128 } else {
129 AddCharacter(static_cast<uint16_t>(c));
130 }
131}
132
133void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) {
134 // A lead or trail surrogate parsed via escape sequence will not
135 // pair up with any preceding lead or following trail surrogate.
136 FlushPendingSurrogate();
137 AddUnicodeCharacter(character);
138 FlushPendingSurrogate();
139}
140
141void RegExpBuilder::AddEmpty() {
142 pending_empty_ = true;
143}
144
145void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
146 if (NeedsDesugaringForUnicode(cc)) {
147 // With /u, character class needs to be desugared, so it
148 // must be a standalone term instead of being part of a RegExpText.
149 AddTerm(cc);
150 } else {
151 AddAtom(cc);
152 }
153}
154
155void RegExpBuilder::AddCharacterClassForDesugaring(uint32_t c) {
156 auto ranges = CharacterRange::List(Z, CharacterRange::Singleton(c));
157 AddTerm(new (Z) RegExpCharacterClass(ranges, flags_));
158}
159
160void RegExpBuilder::AddAtom(RegExpTree* term) {
161 if (term->IsEmpty()) {
162 AddEmpty();
163 return;
164 }
165 if (term->IsTextElement()) {
166 FlushCharacters();
167 text_.Add(term);
168 } else {
169 FlushText();
170 terms_.Add(term);
171 }
172 LAST(ADD_ATOM);
173}
174
175void RegExpBuilder::AddTerm(RegExpTree* term) {
176 FlushText();
177 terms_.Add(term);
178 LAST(ADD_ATOM);
179}
180
181void RegExpBuilder::AddAssertion(RegExpTree* assert) {
182 FlushText();
183 terms_.Add(assert);
184 LAST(ADD_ASSERT);
185}
186
187void RegExpBuilder::NewAlternative() {
188 FlushTerms();
189}
190
191void RegExpBuilder::FlushTerms() {
192 FlushText();
193 intptr_t num_terms = terms_.length();
194 RegExpTree* alternative;
195 if (num_terms == 0) {
196 alternative = RegExpEmpty::GetInstance();
197 } else if (num_terms == 1) {
198 alternative = terms_.Last();
199 } else {
200 ZoneGrowableArray<RegExpTree*>* terms =
201 new (Z) ZoneGrowableArray<RegExpTree*>();
202 for (intptr_t i = 0; i < terms_.length(); i++) {
203 terms->Add(terms_[i]);
204 }
205 alternative = new (Z) RegExpAlternative(terms);
206 }
207 alternatives_.Add(alternative);
208 terms_.Clear();
209 LAST(ADD_NONE);
210}
211
212bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
213 if (!is_unicode()) return false;
214 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
215 // necessarily mean that we need to desugar. It's probably nicer to have a
216 // separate pass to figure out unicode desugarings.
217 if (ignore_case()) return true;
218 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
219 CharacterRange::Canonicalize(ranges);
220 for (int i = ranges->length() - 1; i >= 0; i--) {
221 uint32_t from = ranges->At(i).from();
222 uint32_t to = ranges->At(i).to();
223 // Check for non-BMP characters.
224 if (to >= Utf16::kMaxCodeUnit) return true;
225 // Check for lone surrogates.
226 if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) {
227 return true;
228 }
229 }
230 return false;
231}
232
233bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) {
234 if (is_unicode() && ignore_case()) {
235 icu::UnicodeSet set(c, c);
236 set.closeOver(USET_CASE_INSENSITIVE);
237 set.removeAllStrings();
238 return set.size() > 1;
239 }
240 return false;
241}
242
243RegExpTree* RegExpBuilder::ToRegExp() {
244 FlushTerms();
245 intptr_t num_alternatives = alternatives_.length();
246 if (num_alternatives == 0) {
247 return RegExpEmpty::GetInstance();
248 }
249 if (num_alternatives == 1) {
250 return alternatives_.Last();
251 }
252 ZoneGrowableArray<RegExpTree*>* alternatives =
253 new (Z) ZoneGrowableArray<RegExpTree*>();
254 for (intptr_t i = 0; i < alternatives_.length(); i++) {
255 alternatives->Add(alternatives_[i]);
256 }
257 return new (Z) RegExpDisjunction(alternatives);
258}
259
260bool RegExpBuilder::AddQuantifierToAtom(
261 intptr_t min,
262 intptr_t max,
263 RegExpQuantifier::QuantifierType quantifier_type) {
264 if (pending_empty_) {
265 pending_empty_ = false;
266 return true;
267 }
268 RegExpTree* atom;
269 if (characters_ != NULL) {
270 DEBUG_ASSERT(last_added_ == ADD_CHAR);
271 // Last atom was character.
272
273 ZoneGrowableArray<uint16_t>* char_vector =
274 new (Z) ZoneGrowableArray<uint16_t>();
275 char_vector->AddArray(*characters_);
276 intptr_t num_chars = char_vector->length();
277 if (num_chars > 1) {
278 ZoneGrowableArray<uint16_t>* prefix =
279 new (Z) ZoneGrowableArray<uint16_t>();
280 for (intptr_t i = 0; i < num_chars - 1; i++) {
281 prefix->Add(char_vector->At(i));
282 }
283 text_.Add(new (Z) RegExpAtom(prefix, flags_));
284 ZoneGrowableArray<uint16_t>* tail = new (Z) ZoneGrowableArray<uint16_t>();
285 tail->Add(char_vector->At(num_chars - 1));
286 char_vector = tail;
287 }
288 characters_ = NULL;
289 atom = new (Z) RegExpAtom(char_vector, flags_);
290 FlushText();
291 } else if (text_.length() > 0) {
292 DEBUG_ASSERT(last_added_ == ADD_ATOM);
293 atom = text_.RemoveLast();
294 FlushText();
295 } else if (terms_.length() > 0) {
296 DEBUG_ASSERT(last_added_ == ADD_ATOM);
297 atom = terms_.RemoveLast();
298 if (auto lookaround = atom->AsLookaround()) {
299 // With /u, lookarounds are not quantifiable.
300 if (is_unicode()) return false;
301 // Lookbehinds are not quantifiable.
302 if (lookaround->type() == RegExpLookaround::LOOKBEHIND) {
303 return false;
304 }
305 }
306 if (atom->max_match() == 0) {
307 // Guaranteed to only match an empty string.
308 LAST(ADD_TERM);
309 if (min == 0) {
310 return true;
311 }
312 terms_.Add(atom);
313 return true;
314 }
315 } else {
316 // Only call immediately after adding an atom or character!
317 UNREACHABLE();
318 }
319 terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom));
320 LAST(ADD_TERM);
321 return true;
322}
323
324// ----------------------------------------------------------------------------
325// Implementation of Parser
326
327RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags)
328 : zone_(Thread::Current()->zone()),
329 captures_(nullptr),
330 named_captures_(nullptr),
331 named_back_references_(nullptr),
332 in_(in),
333 current_(kEndMarker),
334 next_pos_(0),
335 captures_started_(0),
336 capture_count_(0),
337 has_more_(true),
338 top_level_flags_(flags),
339 simple_(false),
340 contains_anchor_(false),
341 is_scanned_for_captures_(false),
342 has_named_captures_(false) {
343 Advance();
344}
345
346inline uint32_t RegExpParser::ReadNext(bool update_position) {
347 intptr_t position = next_pos_;
348 const uint16_t c0 = in().CharAt(position);
349 uint32_t c = c0;
350 position++;
351 if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) {
352 const uint16_t c1 = in().CharAt(position);
353 if (Utf16::IsTrailSurrogate(c1)) {
354 c = Utf16::Decode(c0, c1);
355 position++;
356 }
357 }
358 if (update_position) next_pos_ = position;
359 return c;
360}
361
362uint32_t RegExpParser::Next() {
363 if (has_next()) {
364 return ReadNext(false);
365 } else {
366 return kEndMarker;
367 }
368}
369
370void RegExpParser::Advance() {
371 if (has_next()) {
372 current_ = ReadNext(true);
373 } else {
374 current_ = kEndMarker;
375 // Advance so that position() points to 1 after the last character. This is
376 // important so that Reset() to this position works correctly.
377 next_pos_ = in().Length() + 1;
378 has_more_ = false;
379 }
380}
381
382void RegExpParser::Reset(intptr_t pos) {
383 next_pos_ = pos;
384 has_more_ = (pos < in().Length());
385 Advance();
386}
387
388void RegExpParser::Advance(intptr_t dist) {
389 next_pos_ += dist - 1;
390 Advance();
391}
392
393bool RegExpParser::simple() {
394 return simple_;
395}
396
397bool RegExpParser::IsSyntaxCharacterOrSlash(uint32_t c) {
398 switch (c) {
399 case '^':
400 case '$':
401 case '\\':
402 case '.':
403 case '*':
404 case '+':
405 case '?':
406 case '(':
407 case ')':
408 case '[':
409 case ']':
410 case '{':
411 case '}':
412 case '|':
413 case '/':
414 return true;
415 default:
416 break;
417 }
418 return false;
419}
420
421void RegExpParser::ReportError(const char* message) {
422 // Zip to the end to make sure the no more input is read.
423 current_ = kEndMarker;
424 next_pos_ = in().Length();
425
426 // Throw a FormatException on parsing failures.
427 const String& msg = String::Handle(
428 String::Concat(String::Handle(String::New(message)), in()));
429 const Array& args = Array::Handle(Array::New(1));
430 args.SetAt(0, msg);
431 Exceptions::ThrowByType(Exceptions::kFormat, args);
432 UNREACHABLE();
433}
434
435// Pattern ::
436// Disjunction
437RegExpTree* RegExpParser::ParsePattern() {
438 RegExpTree* result = ParseDisjunction();
439 PatchNamedBackReferences();
440 ASSERT(!has_more());
441 // If the result of parsing is a literal string atom, and it has the
442 // same length as the input, then the atom is identical to the input.
443 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
444 simple_ = true;
445 }
446 return result;
447}
448
449// Used for error messages where we would have fallen back on treating an
450// escape as the identity escape, but we are in Unicode mode.
451static const char* kUnicodeIdentity =
452 "Invalid identity escape in Unicode pattern";
453
454// Disjunction ::
455// Alternative
456// Alternative | Disjunction
457// Alternative ::
458// [empty]
459// Term Alternative
460// Term ::
461// Assertion
462// Atom
463// Atom Quantifier
464RegExpTree* RegExpParser::ParseDisjunction() {
465 // Used to store current state while parsing subexpressions.
466 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
467 0, nullptr, top_level_flags_, Z);
468 RegExpParserState* stored_state = &initial_state;
469 // Cache the builder in a local variable for quick access.
470 RegExpBuilder* builder = initial_state.builder();
471 while (true) {
472 switch (current()) {
473 case kEndMarker:
474 if (stored_state->IsSubexpression()) {
475 // Inside a parenthesized group when hitting end of input.
476 ReportError("Unterminated group");
477 UNREACHABLE();
478 }
479 ASSERT(INITIAL == stored_state->group_type());
480 // Parsing completed successfully.
481 return builder->ToRegExp();
482 case ')': {
483 if (!stored_state->IsSubexpression()) {
484 ReportError("Unmatched ')'");
485 UNREACHABLE();
486 }
487 ASSERT(INITIAL != stored_state->group_type());
488
489 Advance();
490 // End disjunction parsing and convert builder content to new single
491 // regexp atom.
492 RegExpTree* body = builder->ToRegExp();
493
494 intptr_t end_capture_index = captures_started();
495
496 intptr_t capture_index = stored_state->capture_index();
497 SubexpressionType group_type = stored_state->group_type();
498
499 // Build result of subexpression.
500 if (group_type == CAPTURE) {
501 if (stored_state->IsNamedCapture()) {
502 CreateNamedCaptureAtIndex(stored_state->capture_name(),
503 capture_index);
504 }
505 RegExpCapture* capture = GetCapture(capture_index);
506 capture->set_body(body);
507 body = capture;
508 } else if (group_type != GROUPING) {
509 ASSERT(group_type == POSITIVE_LOOKAROUND ||
510 group_type == NEGATIVE_LOOKAROUND);
511 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
512 body = new (Z) RegExpLookaround(
513 body, is_positive, end_capture_index - capture_index,
514 capture_index, stored_state->lookaround_type());
515 }
516
517 // Restore previous state.
518 stored_state = stored_state->previous_state();
519 builder = stored_state->builder();
520
521 builder->AddAtom(body);
522 // For compatibility with JSC and ES3, we allow quantifiers after
523 // lookaheads, and break in all cases.
524 break;
525 }
526 case '|': {
527 Advance();
528 builder->NewAlternative();
529 continue;
530 }
531 case '*':
532 case '+':
533 case '?':
534 ReportError("Nothing to repeat");
535 UNREACHABLE();
536 case '^': {
537 Advance();
538 if (builder->is_multi_line()) {
539 builder->AddAssertion(new (Z) RegExpAssertion(
540 RegExpAssertion::START_OF_LINE, builder->flags()));
541 } else {
542 builder->AddAssertion(new (Z) RegExpAssertion(
543 RegExpAssertion::START_OF_INPUT, builder->flags()));
544 set_contains_anchor();
545 }
546 continue;
547 }
548 case '$': {
549 Advance();
550 RegExpAssertion::AssertionType assertion_type =
551 builder->is_multi_line() ? RegExpAssertion::END_OF_LINE
552 : RegExpAssertion::END_OF_INPUT;
553 builder->AddAssertion(
554 new (Z) RegExpAssertion(assertion_type, builder->flags()));
555 continue;
556 }
557 case '.': {
558 Advance();
559 auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
560 if (builder->is_dot_all()) {
561 // Everything.
562 CharacterRange::AddClassEscape(
563 '*', ranges,
564 /*add_unicode_case_equivalents=*/false);
565 } else {
566 // everything except \x0a, \x0d, \u2028 and \u2029
567 CharacterRange::AddClassEscape(
568 '.', ranges,
569 /*add_unicode_case_equivalents=*/false);
570 }
571 RegExpCharacterClass* cc =
572 new (Z) RegExpCharacterClass(ranges, builder->flags());
573 builder->AddCharacterClass(cc);
574 break;
575 }
576 case '(': {
577 stored_state = ParseOpenParenthesis(stored_state);
578 builder = stored_state->builder();
579 continue;
580 }
581 case '[': {
582 RegExpTree* atom = ParseCharacterClass(builder);
583 builder->AddCharacterClass(atom->AsCharacterClass());
584 break;
585 }
586 // Atom ::
587 // \ AtomEscape
588 case '\\':
589 switch (Next()) {
590 case kEndMarker:
591 ReportError("\\ at end of pattern");
592 UNREACHABLE();
593 case 'b':
594 Advance(2);
595 builder->AddAssertion(new (Z) RegExpAssertion(
596 RegExpAssertion::BOUNDARY, builder->flags()));
597 continue;
598 case 'B':
599 Advance(2);
600 builder->AddAssertion(new (Z) RegExpAssertion(
601 RegExpAssertion::NON_BOUNDARY, builder->flags()));
602 continue;
603 // AtomEscape ::
604 // CharacterClassEscape
605 //
606 // CharacterClassEscape :: one of
607 // d D s S w W
608 case 'd':
609 case 'D':
610 case 's':
611 case 'S':
612 case 'w':
613 case 'W': {
614 uint32_t c = Next();
615 Advance(2);
616 auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
617 CharacterRange::AddClassEscape(
618 c, ranges, is_unicode() && builder->ignore_case());
619 RegExpCharacterClass* cc =
620 new (Z) RegExpCharacterClass(ranges, builder->flags());
621 builder->AddCharacterClass(cc);
622 break;
623 }
624 case 'p':
625 case 'P': {
626 uint32_t p = Next();
627 Advance(2);
628
629 if (is_unicode()) {
630 auto name_1 = new (Z) ZoneGrowableArray<char>();
631 auto name_2 = new (Z) ZoneGrowableArray<char>();
632 auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
633 if (ParsePropertyClassName(name_1, name_2)) {
634 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
635 RegExpCharacterClass* cc =
636 new (Z) RegExpCharacterClass(ranges, builder->flags());
637 builder->AddCharacterClass(cc);
638 break;
639 }
640 }
641 ReportError("Invalid property name");
642 UNREACHABLE();
643 } else {
644 builder->AddCharacter(p);
645 }
646 break;
647 }
648 case '1':
649 case '2':
650 case '3':
651 case '4':
652 case '5':
653 case '6':
654 case '7':
655 case '8':
656 case '9': {
657 intptr_t index = 0;
658 if (ParseBackReferenceIndex(&index)) {
659 if (stored_state->IsInsideCaptureGroup(index)) {
660 // The back reference is inside the capture group it refers to.
661 // Nothing can possibly have been captured yet, so we use empty
662 // instead. This ensures that, when checking a back reference,
663 // the capture registers of the referenced capture are either
664 // both set or both cleared.
665 builder->AddEmpty();
666 } else {
667 RegExpCapture* capture = GetCapture(index);
668 RegExpTree* atom =
669 new (Z) RegExpBackReference(capture, builder->flags());
670 builder->AddAtom(atom);
671 }
672 break;
673 }
674 // With /u, no identity escapes except for syntax characters are
675 // allowed. Otherwise, all identity escapes are allowed.
676 if (is_unicode()) {
677 ReportError(kUnicodeIdentity);
678 UNREACHABLE();
679 }
680 uint32_t first_digit = Next();
681 if (first_digit == '8' || first_digit == '9') {
682 builder->AddCharacter(first_digit);
683 Advance(2);
684 break;
685 }
686 }
687 FALL_THROUGH;
688 case '0': {
689 Advance();
690 if (is_unicode() && Next() >= '0' && Next() <= '9') {
691 // With /u, decimal escape with leading 0 are not parsed as octal.
692 ReportError("Invalid decimal escape");
693 UNREACHABLE();
694 }
695 uint32_t octal = ParseOctalLiteral();
696 builder->AddCharacter(octal);
697 break;
698 }
699 // ControlEscape :: one of
700 // f n r t v
701 case 'f':
702 Advance(2);
703 builder->AddCharacter('\f');
704 break;
705 case 'n':
706 Advance(2);
707 builder->AddCharacter('\n');
708 break;
709 case 'r':
710 Advance(2);
711 builder->AddCharacter('\r');
712 break;
713 case 't':
714 Advance(2);
715 builder->AddCharacter('\t');
716 break;
717 case 'v':
718 Advance(2);
719 builder->AddCharacter('\v');
720 break;
721 case 'c': {
722 Advance();
723 uint32_t controlLetter = Next();
724 // Special case if it is an ASCII letter.
725 // Convert lower case letters to uppercase.
726 uint32_t letter = controlLetter & ~('a' ^ 'A');
727 if (letter < 'A' || 'Z' < letter) {
728 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
729 // This is outside the specification. We match JSC in
730 // reading the backslash as a literal character instead
731 // of as starting an escape.
732 if (is_unicode()) {
733 // With /u, invalid escapes are not treated as identity escapes.
734 ReportError(kUnicodeIdentity);
735 UNREACHABLE();
736 }
737 builder->AddCharacter('\\');
738 } else {
739 Advance(2);
740 builder->AddCharacter(controlLetter & 0x1f);
741 }
742 break;
743 }
744 case 'x': {
745 Advance(2);
746 uint32_t value;
747 if (ParseHexEscape(2, &value)) {
748 builder->AddCharacter(value);
749 } else if (!is_unicode()) {
750 builder->AddCharacter('x');
751 } else {
752 // With /u, invalid escapes are not treated as identity escapes.
753 ReportError(kUnicodeIdentity);
754 UNREACHABLE();
755 }
756 break;
757 }
758 case 'u': {
759 Advance(2);
760 uint32_t value;
761 if (ParseUnicodeEscape(&value)) {
762 builder->AddEscapedUnicodeCharacter(value);
763 } else if (!is_unicode()) {
764 builder->AddCharacter('u');
765 } else {
766 // With /u, invalid escapes are not treated as identity escapes.
767 ReportError(kUnicodeIdentity);
768 UNREACHABLE();
769 }
770 break;
771 }
772 case 'k':
773 // Either an identity escape or a named back-reference. The two
774 // interpretations are mutually exclusive: '\k' is interpreted as
775 // an identity escape for non-Unicode patterns without named
776 // capture groups, and as the beginning of a named back-reference
777 // in all other cases.
778 if (is_unicode() || HasNamedCaptures()) {
779 Advance(2);
780 ParseNamedBackReference(builder, stored_state);
781 break;
782 }
783 FALL_THROUGH;
784 default:
785 Advance();
786 // With the unicode flag, no identity escapes except for syntax
787 // characters are allowed. Otherwise, all identity escapes are
788 // allowed.
789 if (!is_unicode() || IsSyntaxCharacterOrSlash(current())) {
790 builder->AddCharacter(current());
791 Advance();
792 } else {
793 ReportError(kUnicodeIdentity);
794 UNREACHABLE();
795 }
796 break;
797 }
798 break;
799 case '{': {
800 intptr_t dummy;
801 if (ParseIntervalQuantifier(&dummy, &dummy)) {
802 ReportError("Nothing to repeat");
803 UNREACHABLE();
804 }
805 }
806 FALL_THROUGH;
807 case '}':
808 case ']':
809 if (is_unicode()) {
810 ReportError("Lone quantifier brackets");
811 UNREACHABLE();
812 }
813 FALL_THROUGH;
814 default:
815 builder->AddUnicodeCharacter(current());
816 Advance();
817 break;
818 } // end switch(current())
819
820 intptr_t min;
821 intptr_t max;
822 switch (current()) {
823 // QuantifierPrefix ::
824 // *
825 // +
826 // ?
827 // {
828 case '*':
829 min = 0;
830 max = RegExpTree::kInfinity;
831 Advance();
832 break;
833 case '+':
834 min = 1;
835 max = RegExpTree::kInfinity;
836 Advance();
837 break;
838 case '?':
839 min = 0;
840 max = 1;
841 Advance();
842 break;
843 case '{':
844 if (ParseIntervalQuantifier(&min, &max)) {
845 if (max < min) {
846 ReportError("numbers out of order in {} quantifier.");
847 UNREACHABLE();
848 }
849 break;
850 } else {
851 continue;
852 }
853 default:
854 continue;
855 }
856 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
857 if (current() == '?') {
858 quantifier_type = RegExpQuantifier::NON_GREEDY;
859 Advance();
860 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
861 // FLAG_regexp_possessive_quantifier is a debug-only flag.
862 quantifier_type = RegExpQuantifier::POSSESSIVE;
863 Advance();
864 }
865 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
866 ReportError("invalid quantifier.");
867 UNREACHABLE();
868 }
869 }
870}
871
872#ifdef DEBUG
873// Currently only used in an ASSERT.
874static bool IsSpecialClassEscape(uint32_t c) {
875 switch (c) {
876 case 'd':
877 case 'D':
878 case 's':
879 case 'S':
880 case 'w':
881 case 'W':
882 return true;
883 default:
884 return false;
885 }
886}
887#endif
888
889RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
890 RegExpParserState* state) {
891 RegExpLookaround::Type lookaround_type = state->lookaround_type();
892 bool is_named_capture = false;
893 const RegExpCaptureName* capture_name = nullptr;
894 SubexpressionType subexpr_type = CAPTURE;
895 Advance();
896 if (current() == '?') {
897 switch (Next()) {
898 case ':':
899 Advance(2);
900 subexpr_type = GROUPING;
901 break;
902 case '=':
903 Advance(2);
904 lookaround_type = RegExpLookaround::LOOKAHEAD;
905 subexpr_type = POSITIVE_LOOKAROUND;
906 break;
907 case '!':
908 Advance(2);
909 lookaround_type = RegExpLookaround::LOOKAHEAD;
910 subexpr_type = NEGATIVE_LOOKAROUND;
911 break;
912 case '<':
913 Advance();
914 if (Next() == '=') {
915 Advance(2);
916 lookaround_type = RegExpLookaround::LOOKBEHIND;
917 subexpr_type = POSITIVE_LOOKAROUND;
918 break;
919 } else if (Next() == '!') {
920 Advance(2);
921 lookaround_type = RegExpLookaround::LOOKBEHIND;
922 subexpr_type = NEGATIVE_LOOKAROUND;
923 break;
924 }
925 is_named_capture = true;
926 has_named_captures_ = true;
927 Advance();
928 break;
929 default:
930 ReportError("Invalid group");
931 UNREACHABLE();
932 }
933 }
934
935 if (subexpr_type == CAPTURE) {
936 if (captures_started_ >= kMaxCaptures) {
937 ReportError("Too many captures");
938 UNREACHABLE();
939 }
940 captures_started_++;
941
942 if (is_named_capture) {
943 capture_name = ParseCaptureGroupName();
944 }
945 }
946 // Store current state and begin new disjunction parsing.
947 return new (Z)
948 RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
949 capture_name, state->builder()->flags(), Z);
950}
951
952// In order to know whether an escape is a backreference or not we have to scan
953// the entire regexp and find the number of capturing parentheses. However we
954// don't want to scan the regexp twice unless it is necessary. This mini-parser
955// is called when needed. It can see the difference between capturing and
956// noncapturing parentheses and can skip character classes and backslash-escaped
957// characters.
958void RegExpParser::ScanForCaptures() {
959 ASSERT(!is_scanned_for_captures_);
960 const intptr_t saved_position = position();
961 // Start with captures started previous to current position
962 intptr_t capture_count = captures_started();
963 // Add count of captures after this position.
964 uintptr_t n;
965 while ((n = current()) != kEndMarker) {
966 Advance();
967 switch (n) {
968 case '\\':
969 Advance();
970 break;
971 case '[': {
972 uintptr_t c;
973 while ((c = current()) != kEndMarker) {
974 Advance();
975 if (c == '\\') {
976 Advance();
977 } else {
978 if (c == ']') break;
979 }
980 }
981 break;
982 }
983 case '(':
984 // At this point we could be in
985 // * a non-capturing group '(:',
986 // * a lookbehind assertion '(?<=' '(?<!'
987 // * or a named capture '(?<'.
988 //
989 // Of these, only named captures are capturing groups.
990 if (current() == '?') {
991 Advance();
992 if (current() != '<') break;
993
994 Advance();
995 if (current() == '=' || current() == '!') break;
996
997 // Found a possible named capture. It could turn out to be a syntax
998 // error (e.g. an unterminated or invalid name), but that distinction
999 // does not matter for our purposes.
1000 has_named_captures_ = true;
1001 }
1002 capture_count++;
1003 break;
1004 }
1005 }
1006 capture_count_ = capture_count;
1007 is_scanned_for_captures_ = true;
1008 Reset(saved_position);
1009}
1010
1011bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
1012 ASSERT('\\' == current());
1013 ASSERT('1' <= Next() && Next() <= '9');
1014 // Try to parse a decimal literal that is no greater than the total number
1015 // of left capturing parentheses in the input.
1016 intptr_t start = position();
1017 intptr_t value = Next() - '0';
1018 Advance(2);
1019 while (true) {
1020 uint32_t c = current();
1021 if (Utils::IsDecimalDigit(c)) {
1022 value = 10 * value + (c - '0');
1023 if (value > kMaxCaptures) {
1024 Reset(start);
1025 return false;
1026 }
1027 Advance();
1028 } else {
1029 break;
1030 }
1031 }
1032 if (value > captures_started()) {
1033 if (!is_scanned_for_captures_) ScanForCaptures();
1034 if (value > capture_count_) {
1035 Reset(start);
1036 return false;
1037 }
1038 }
1039 *index_out = value;
1040 return true;
1041}
1042
1043namespace {
1044
1045static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) {
1046 return Utils::IsAlphaNumeric(ch) || ch == '_' || ch == '$';
1047}
1048
1049// ES#sec-names-and-keywords Names and Keywords
1050// UnicodeIDStart, '$', '_' and '\'
1051static bool IsIdentifierStartSlow(uint32_t c) {
1052 // cannot use u_isIDStart because it does not work for
1053 // Other_ID_Start characters.
1054 return u_hasBinaryProperty(c, UCHAR_ID_START) ||
1055 (c < 0x60 && (c == '$' || c == '\\' || c == '_'));
1056}
1057
1058// ES#sec-names-and-keywords Names and Keywords
1059// UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ
1060static bool IsIdentifierPartSlow(uint32_t c) {
1061 const uint32_t kZeroWidthNonJoiner = 0x200C;
1062 const uint32_t kZeroWidthJoiner = 0x200D;
1063 // Can't use u_isIDPart because it does not work for
1064 // Other_ID_Continue characters.
1065 return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) ||
1066 (c < 0x60 && (c == '$' || c == '\\' || c == '_')) ||
1067 c == kZeroWidthNonJoiner || c == kZeroWidthJoiner;
1068}
1069
1070static inline bool IsIdentifierStart(uint32_t c) {
1071 if (c > 127) return IsIdentifierStartSlow(c);
1072 return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c);
1073}
1074
1075static inline bool IsIdentifierPart(uint32_t c) {
1076 if (c > 127) return IsIdentifierPartSlow(c);
1077 return IsAsciiIdentifierPart(c);
1078}
1079
1080static bool IsSameName(const RegExpCaptureName* name1,
1081 const RegExpCaptureName* name2) {
1082 if (name1->length() != name2->length()) return false;
1083 for (intptr_t i = 0; i < name1->length(); i++) {
1084 if (name1->At(i) != name2->At(i)) return false;
1085 }
1086 return true;
1087}
1088
1089} // end namespace
1090
1091static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) {
1092 if (code_unit <= Utf16::kMaxCodeUnit) {
1093 v->Add(code_unit);
1094 } else {
1095 uint16_t units[2];
1096 Utf16::Encode(code_unit, units);
1097 v->Add(units[0]);
1098 v->Add(units[1]);
1099 }
1100}
1101
1102const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() {
1103 auto name = new (Z) RegExpCaptureName();
1104
1105 bool at_start = true;
1106 while (true) {
1107 uint32_t c = current();
1108 Advance();
1109
1110 // Convert unicode escapes.
1111 if (c == '\\' && current() == 'u') {
1112 Advance();
1113 if (!ParseUnicodeEscape(&c)) {
1114 ReportError("Invalid Unicode escape sequence");
1115 UNREACHABLE();
1116 }
1117 }
1118
1119 // The backslash char is misclassified as both ID_Start and ID_Continue.
1120 if (c == '\\') {
1121 ReportError("Invalid capture group name");
1122 UNREACHABLE();
1123 }
1124
1125 if (at_start) {
1126 if (!IsIdentifierStart(c)) {
1127 ReportError("Invalid capture group name");
1128 UNREACHABLE();
1129 }
1130 PushCodeUnit(name, c);
1131 at_start = false;
1132 } else {
1133 if (c == '>') {
1134 break;
1135 } else if (IsIdentifierPart(c)) {
1136 PushCodeUnit(name, c);
1137 } else {
1138 ReportError("Invalid capture group name");
1139 UNREACHABLE();
1140 }
1141 }
1142 }
1143
1144 return name;
1145}
1146
1147intptr_t RegExpParser::GetNamedCaptureIndex(const RegExpCaptureName* name) {
1148 for (const auto& capture : *named_captures_) {
1149 if (IsSameName(name, capture->name())) return capture->index();
1150 }
1151 return -1;
1152}
1153
1154void RegExpParser::CreateNamedCaptureAtIndex(const RegExpCaptureName* name,
1155 intptr_t index) {
1156 ASSERT(0 < index && index <= captures_started_);
1157 ASSERT(name != nullptr);
1158
1159 if (named_captures_ == nullptr) {
1160 named_captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(1);
1161 } else {
1162 // Check for duplicates and bail if we find any. Currently O(n^2).
1163 if (GetNamedCaptureIndex(name) >= 0) {
1164 ReportError("Duplicate capture group name");
1165 UNREACHABLE();
1166 }
1167 }
1168
1169 RegExpCapture* capture = GetCapture(index);
1170 ASSERT(capture->name() == nullptr);
1171
1172 capture->set_name(name);
1173 named_captures_->Add(capture);
1174}
1175
1176bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
1177 RegExpParserState* state) {
1178 // The parser is assumed to be on the '<' in \k<name>.
1179 if (current() != '<') {
1180 ReportError("Invalid named reference");
1181 UNREACHABLE();
1182 }
1183
1184 Advance();
1185 const RegExpCaptureName* name = ParseCaptureGroupName();
1186 if (name == nullptr) {
1187 return false;
1188 }
1189
1190 if (state->IsInsideCaptureGroup(name)) {
1191 builder->AddEmpty();
1192 } else {
1193 RegExpBackReference* atom = new (Z) RegExpBackReference(builder->flags());
1194 atom->set_name(name);
1195
1196 builder->AddAtom(atom);
1197
1198 if (named_back_references_ == nullptr) {
1199 named_back_references_ =
1200 new (Z) ZoneGrowableArray<RegExpBackReference*>(1);
1201 }
1202 named_back_references_->Add(atom);
1203 }
1204
1205 return true;
1206}
1207
1208void RegExpParser::PatchNamedBackReferences() {
1209 if (named_back_references_ == nullptr) return;
1210
1211 if (named_captures_ == nullptr) {
1212 ReportError("Invalid named capture referenced");
1213 return;
1214 }
1215
1216 // Look up and patch the actual capture for each named back reference.
1217 // Currently O(n^2), optimize if necessary.
1218 for (intptr_t i = 0; i < named_back_references_->length(); i++) {
1219 RegExpBackReference* ref = named_back_references_->At(i);
1220 intptr_t index = GetNamedCaptureIndex(ref->name());
1221
1222 if (index < 0) {
1223 ReportError("Invalid named capture referenced");
1224 UNREACHABLE();
1225 }
1226 ref->set_capture(GetCapture(index));
1227 }
1228}
1229
1230RegExpCapture* RegExpParser::GetCapture(intptr_t index) {
1231 // The index for the capture groups are one-based. Its index in the list is
1232 // zero-based.
1233 const intptr_t know_captures =
1234 is_scanned_for_captures_ ? capture_count_ : captures_started_;
1235 ASSERT(index <= know_captures);
1236 if (captures_ == nullptr) {
1237 captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(know_captures);
1238 }
1239 while (captures_->length() < know_captures) {
1240 captures_->Add(new (Z) RegExpCapture(captures_->length() + 1));
1241 }
1242 return captures_->At(index - 1);
1243}
1244
1245ArrayPtr RegExpParser::CreateCaptureNameMap() {
1246 if (named_captures_ == nullptr || named_captures_->is_empty()) {
1247 return Array::null();
1248 }
1249
1250 const intptr_t len = named_captures_->length() * 2;
1251
1252 const Array& array = Array::Handle(Array::New(len));
1253
1254 auto& name = String::Handle();
1255 auto& smi = Smi::Handle();
1256 for (intptr_t i = 0; i < named_captures_->length(); i++) {
1257 RegExpCapture* capture = named_captures_->At(i);
1258 name =
1259 String::FromUTF16(capture->name()->data(), capture->name()->length());
1260 smi = Smi::New(capture->index());
1261 array.SetAt(i * 2, name);
1262 array.SetAt(i * 2 + 1, smi);
1263 }
1264
1265 return array.raw();
1266}
1267
1268bool RegExpParser::HasNamedCaptures() {
1269 if (has_named_captures_ || is_scanned_for_captures_) {
1270 return has_named_captures_;
1271 }
1272
1273 ScanForCaptures();
1274 ASSERT(is_scanned_for_captures_);
1275 return has_named_captures_;
1276}
1277
1278bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) {
1279 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1280 if (s->group_type() != CAPTURE) continue;
1281 // Return true if we found the matching capture index.
1282 if (index == s->capture_index()) return true;
1283 // Abort if index is larger than what has been parsed up till this state.
1284 if (index > s->capture_index()) return false;
1285 }
1286 return false;
1287}
1288
1289bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1290 const RegExpCaptureName* name) {
1291 ASSERT(name != nullptr);
1292 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1293 if (s->capture_name() == nullptr) continue;
1294 if (IsSameName(s->capture_name(), name)) return true;
1295 }
1296 return false;
1297}
1298
1299// QuantifierPrefix ::
1300// { DecimalDigits }
1301// { DecimalDigits , }
1302// { DecimalDigits , DecimalDigits }
1303//
1304// Returns true if parsing succeeds, and set the min_out and max_out
1305// values. Values are truncated to RegExpTree::kInfinity if they overflow.
1306bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,
1307 intptr_t* max_out) {
1308 ASSERT(current() == '{');
1309 intptr_t start = position();
1310 Advance();
1311 intptr_t min = 0;
1312 if (!Utils::IsDecimalDigit(current())) {
1313 Reset(start);
1314 return false;
1315 }
1316 while (Utils::IsDecimalDigit(current())) {
1317 intptr_t next = current() - '0';
1318 if (min > (RegExpTree::kInfinity - next) / 10) {
1319 // Overflow. Skip past remaining decimal digits and return -1.
1320 do {
1321 Advance();
1322 } while (Utils::IsDecimalDigit(current()));
1323 min = RegExpTree::kInfinity;
1324 break;
1325 }
1326 min = 10 * min + next;
1327 Advance();
1328 }
1329 intptr_t max = 0;
1330 if (current() == '}') {
1331 max = min;
1332 Advance();
1333 } else if (current() == ',') {
1334 Advance();
1335 if (current() == '}') {
1336 max = RegExpTree::kInfinity;
1337 Advance();
1338 } else {
1339 while (Utils::IsDecimalDigit(current())) {
1340 intptr_t next = current() - '0';
1341 if (max > (RegExpTree::kInfinity - next) / 10) {
1342 do {
1343 Advance();
1344 } while (Utils::IsDecimalDigit(current()));
1345 max = RegExpTree::kInfinity;
1346 break;
1347 }
1348 max = 10 * max + next;
1349 Advance();
1350 }
1351 if (current() != '}') {
1352 Reset(start);
1353 return false;
1354 }
1355 Advance();
1356 }
1357 } else {
1358 Reset(start);
1359 return false;
1360 }
1361 *min_out = min;
1362 *max_out = max;
1363 return true;
1364}
1365
1366uint32_t RegExpParser::ParseOctalLiteral() {
1367 ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker);
1368 // For compatibility with some other browsers (not all), we parse
1369 // up to three octal digits with a value below 256.
1370 uint32_t value = current() - '0';
1371 Advance();
1372 if ('0' <= current() && current() <= '7') {
1373 value = value * 8 + current() - '0';
1374 Advance();
1375 if (value < 32 && '0' <= current() && current() <= '7') {
1376 value = value * 8 + current() - '0';
1377 Advance();
1378 }
1379 }
1380 return value;
1381}
1382
1383// Returns the value (0 .. 15) of a hexadecimal character c.
1384// If c is not a legal hexadecimal character, returns a value < 0.
1385static inline intptr_t HexValue(uint32_t c) {
1386 c -= '0';
1387 if (static_cast<unsigned>(c) <= 9) return c;
1388 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
1389 if (static_cast<unsigned>(c) <= 5) return c + 10;
1390 return -1;
1391}
1392
1393bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) {
1394 intptr_t start = position();
1395 uint32_t val = 0;
1396 bool done = false;
1397 for (intptr_t i = 0; !done; i++) {
1398 uint32_t c = current();
1399 intptr_t d = HexValue(c);
1400 if (d < 0) {
1401 Reset(start);
1402 return false;
1403 }
1404 val = val * 16 + d;
1405 Advance();
1406 if (i == length - 1) {
1407 done = true;
1408 }
1409 }
1410 *value = val;
1411 return true;
1412}
1413
1414// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1415bool RegExpParser::ParseUnicodeEscape(uint32_t* value) {
1416 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1417 // allowed). In the latter case, the number of hex digits between { } is
1418 // arbitrary. \ and u have already been read.
1419 if (current() == '{' && is_unicode()) {
1420 int start = position();
1421 Advance();
1422 if (ParseUnlimitedLengthHexNumber(Utf::kMaxCodePoint, value)) {
1423 if (current() == '}') {
1424 Advance();
1425 return true;
1426 }
1427 }
1428 Reset(start);
1429 return false;
1430 }
1431 // \u but no {, or \u{...} escapes not allowed.
1432 bool result = ParseHexEscape(4, value);
1433 if (result && is_unicode() && Utf16::IsLeadSurrogate(*value) &&
1434 current() == '\\') {
1435 // Attempt to read trail surrogate.
1436 int start = position();
1437 if (Next() == 'u') {
1438 Advance(2);
1439 uint32_t trail;
1440 if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) {
1441 *value = Utf16::Decode(static_cast<uint16_t>(*value),
1442 static_cast<uint16_t>(trail));
1443 return true;
1444 }
1445 }
1446 Reset(start);
1447 }
1448 return result;
1449}
1450
1451namespace {
1452
1453bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1454 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1455 if (short_name != nullptr && strcmp(property_name, short_name) == 0) {
1456 return true;
1457 }
1458 for (int i = 0;; i++) {
1459 const char* long_name = u_getPropertyName(
1460 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1461 if (long_name == nullptr) break;
1462 if (strcmp(property_name, long_name) == 0) return true;
1463 }
1464 return false;
1465}
1466
1467bool IsExactPropertyValueAlias(const char* property_value_name,
1468 UProperty property,
1469 int32_t property_value) {
1470 const char* short_name =
1471 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1472 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1473 return true;
1474 }
1475 for (int i = 0;; i++) {
1476 const char* long_name = u_getPropertyValueName(
1477 property, property_value,
1478 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1479 if (long_name == nullptr) break;
1480 if (strcmp(property_value_name, long_name) == 0) return true;
1481 }
1482 return false;
1483}
1484
1485bool LookupPropertyValueName(UProperty property,
1486 const char* property_value_name,
1487 bool negate,
1488 ZoneGrowableArray<CharacterRange>* result) {
1489 UProperty property_for_lookup = property;
1490 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1491 // For the property Script_Extensions, we have to do the property value
1492 // name lookup as if the property is Script.
1493 property_for_lookup = UCHAR_SCRIPT;
1494 }
1495 int32_t property_value =
1496 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1497 if (property_value == UCHAR_INVALID_CODE) return false;
1498
1499 // We require the property name to match exactly to one of the property value
1500 // aliases. However, u_getPropertyValueEnum uses loose matching.
1501 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1502 property_value)) {
1503 return false;
1504 }
1505
1506 UErrorCode ec = U_ZERO_ERROR;
1507 icu::UnicodeSet set;
1508 set.applyIntPropertyValue(property, property_value, ec);
1509 bool success = ec == U_ZERO_ERROR && (set.isEmpty() == 0);
1510
1511 if (success) {
1512 set.removeAllStrings();
1513 if (negate) set.complement();
1514 for (int i = 0; i < set.getRangeCount(); i++) {
1515 result->Add(
1516 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
1517 }
1518 }
1519 return success;
1520}
1521
1522template <size_t N>
1523inline bool NameEquals(const char* name, const char (&literal)[N]) {
1524 return strncmp(name, literal, N + 1) == 0;
1525}
1526
1527bool LookupSpecialPropertyValueName(const char* name,
1528 ZoneGrowableArray<CharacterRange>* result,
1529 bool negate) {
1530 if (NameEquals(name, "Any")) {
1531 if (negate) {
1532 // Leave the list of character ranges empty, since the negation of 'Any'
1533 // is the empty set.
1534 } else {
1535 result->Add(CharacterRange::Everything());
1536 }
1537 } else if (NameEquals(name, "ASCII")) {
1538 result->Add(negate ? CharacterRange::Range(0x80, Utf::kMaxCodePoint)
1539 : CharacterRange::Range(0x0, 0x7F));
1540 } else if (NameEquals(name, "Assigned")) {
1541 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1542 !negate, result);
1543 } else {
1544 return false;
1545 }
1546 return true;
1547}
1548
1549// Explicitly list supported binary properties. The spec forbids supporting
1550// properties outside of this set to ensure interoperability.
1551bool IsSupportedBinaryProperty(UProperty property) {
1552 switch (property) {
1553 case UCHAR_ALPHABETIC:
1554 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1555 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1556 case UCHAR_ASCII_HEX_DIGIT:
1557 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1558 case UCHAR_BIDI_CONTROL:
1559 case UCHAR_BIDI_MIRRORED:
1560 case UCHAR_CASE_IGNORABLE:
1561 case UCHAR_CASED:
1562 case UCHAR_CHANGES_WHEN_CASEFOLDED:
1563 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1564 case UCHAR_CHANGES_WHEN_LOWERCASED:
1565 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1566 case UCHAR_CHANGES_WHEN_TITLECASED:
1567 case UCHAR_CHANGES_WHEN_UPPERCASED:
1568 case UCHAR_DASH:
1569 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1570 case UCHAR_DEPRECATED:
1571 case UCHAR_DIACRITIC:
1572 case UCHAR_EMOJI:
1573 case UCHAR_EMOJI_COMPONENT:
1574 case UCHAR_EMOJI_MODIFIER_BASE:
1575 case UCHAR_EMOJI_MODIFIER:
1576 case UCHAR_EMOJI_PRESENTATION:
1577 case UCHAR_EXTENDED_PICTOGRAPHIC:
1578 case UCHAR_EXTENDER:
1579 case UCHAR_GRAPHEME_BASE:
1580 case UCHAR_GRAPHEME_EXTEND:
1581 case UCHAR_HEX_DIGIT:
1582 case UCHAR_ID_CONTINUE:
1583 case UCHAR_ID_START:
1584 case UCHAR_IDEOGRAPHIC:
1585 case UCHAR_IDS_BINARY_OPERATOR:
1586 case UCHAR_IDS_TRINARY_OPERATOR:
1587 case UCHAR_JOIN_CONTROL:
1588 case UCHAR_LOGICAL_ORDER_EXCEPTION:
1589 case UCHAR_LOWERCASE:
1590 case UCHAR_MATH:
1591 case UCHAR_NONCHARACTER_CODE_POINT:
1592 case UCHAR_PATTERN_SYNTAX:
1593 case UCHAR_PATTERN_WHITE_SPACE:
1594 case UCHAR_QUOTATION_MARK:
1595 case UCHAR_RADICAL:
1596 case UCHAR_REGIONAL_INDICATOR:
1597 case UCHAR_S_TERM:
1598 case UCHAR_SOFT_DOTTED:
1599 case UCHAR_TERMINAL_PUNCTUATION:
1600 case UCHAR_UNIFIED_IDEOGRAPH:
1601 case UCHAR_UPPERCASE:
1602 case UCHAR_VARIATION_SELECTOR:
1603 case UCHAR_WHITE_SPACE:
1604 case UCHAR_XID_CONTINUE:
1605 case UCHAR_XID_START:
1606 return true;
1607 default:
1608 break;
1609 }
1610 return false;
1611}
1612
1613bool IsUnicodePropertyValueCharacter(char c) {
1614 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1615 //
1616 // Note that using this to validate each parsed char is quite conservative.
1617 // A possible alternative solution would be to only ensure the parsed
1618 // property name/value candidate string does not contain '\0' characters and
1619 // let ICU lookups trigger the final failure.
1620 if (Utils::IsAlphaNumeric(c)) return true;
1621 return (c == '_');
1622}
1623
1624} // anonymous namespace
1625
1626bool RegExpParser::ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
1627 ZoneGrowableArray<char>* name_2) {
1628 ASSERT(name_1->is_empty());
1629 ASSERT(name_2->is_empty());
1630 // Parse the property class as follows:
1631 // - In \p{name}, 'name' is interpreted
1632 // - either as a general category property value name.
1633 // - or as a binary property name.
1634 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1635 // and 'value' is interpreted as one of the available property value names.
1636 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1637 // - Loose matching is not applied.
1638 if (current() == '{') {
1639 // Parse \p{[PropertyName=]PropertyNameValue}
1640 for (Advance(); current() != '}' && current() != '='; Advance()) {
1641 if (!IsUnicodePropertyValueCharacter(current())) return false;
1642 if (!has_next()) return false;
1643 name_1->Add(static_cast<char>(current()));
1644 }
1645 if (current() == '=') {
1646 for (Advance(); current() != '}'; Advance()) {
1647 if (!IsUnicodePropertyValueCharacter(current())) return false;
1648 if (!has_next()) return false;
1649 name_2->Add(static_cast<char>(current()));
1650 }
1651 name_2->Add(0); // null-terminate string.
1652 }
1653 } else {
1654 return false;
1655 }
1656 Advance();
1657 name_1->Add(0); // null-terminate string.
1658
1659 ASSERT(static_cast<size_t>(name_1->length() - 1) == strlen(name_1->data()));
1660 ASSERT(name_2->is_empty() ||
1661 static_cast<size_t>(name_2->length() - 1) == strlen(name_2->data()));
1662 return true;
1663}
1664
1665bool RegExpParser::AddPropertyClassRange(
1666 ZoneGrowableArray<CharacterRange>* add_to,
1667 bool negate,
1668 ZoneGrowableArray<char>* name_1,
1669 ZoneGrowableArray<char>* name_2) {
1670 ASSERT(name_1->At(name_1->length() - 1) == '\0');
1671 ASSERT(name_2->is_empty() || name_2->At(name_2->length() - 1) == '\0');
1672 if (name_2->is_empty()) {
1673 // First attempt to interpret as general category property value name.
1674 const char* name = name_1->data();
1675 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1676 add_to)) {
1677 return true;
1678 }
1679 // Interpret "Any", "ASCII", and "Assigned".
1680 if (LookupSpecialPropertyValueName(name, add_to, negate)) {
1681 return true;
1682 }
1683 // Then attempt to interpret as binary property name with value name 'Y'.
1684 UProperty property = u_getPropertyEnum(name);
1685 if (!IsSupportedBinaryProperty(property)) return false;
1686 if (!IsExactPropertyAlias(name, property)) return false;
1687 return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to);
1688 } else {
1689 // Both property name and value name are specified. Attempt to interpret
1690 // the property name as enumerated property.
1691 const char* property_name = name_1->data();
1692 const char* value_name = name_2->data();
1693 UProperty property = u_getPropertyEnum(property_name);
1694 if (!IsExactPropertyAlias(property_name, property)) return false;
1695 if (property == UCHAR_GENERAL_CATEGORY) {
1696 // We want to allow aggregate value names such as "Letter".
1697 property = UCHAR_GENERAL_CATEGORY_MASK;
1698 } else if (property != UCHAR_SCRIPT &&
1699 property != UCHAR_SCRIPT_EXTENSIONS) {
1700 return false;
1701 }
1702 return LookupPropertyValueName(property, value_name, negate, add_to);
1703 }
1704}
1705
1706bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t max_value,
1707 uint32_t* value) {
1708 uint32_t x = 0;
1709 int d = HexValue(current());
1710 if (d < 0) {
1711 return false;
1712 }
1713 while (d >= 0) {
1714 x = x * 16 + d;
1715 if (x > max_value) {
1716 return false;
1717 }
1718 Advance();
1719 d = HexValue(current());
1720 }
1721 *value = x;
1722 return true;
1723}
1724
1725uint32_t RegExpParser::ParseClassCharacterEscape() {
1726 ASSERT(current() == '\\');
1727 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
1728 Advance();
1729 switch (current()) {
1730 case 'b':
1731 Advance();
1732 return '\b';
1733 // ControlEscape :: one of
1734 // f n r t v
1735 case 'f':
1736 Advance();
1737 return '\f';
1738 case 'n':
1739 Advance();
1740 return '\n';
1741 case 'r':
1742 Advance();
1743 return '\r';
1744 case 't':
1745 Advance();
1746 return '\t';
1747 case 'v':
1748 Advance();
1749 return '\v';
1750 case 'c': {
1751 uint32_t controlLetter = Next();
1752 uint32_t letter = controlLetter & ~('A' ^ 'a');
1753 // For compatibility with JSC, inside a character class
1754 // we also accept digits and underscore as control characters.
1755 if (letter >= 'A' && letter <= 'Z') {
1756 Advance(2);
1757 // Control letters mapped to ASCII control characters in the range
1758 // 0x00-0x1f.
1759 return controlLetter & 0x1f;
1760 }
1761 if (is_unicode()) {
1762 // With /u, \c# or \c_ are invalid.
1763 ReportError("Invalid class escape");
1764 UNREACHABLE();
1765 }
1766 if (Utils::IsDecimalDigit(controlLetter) || controlLetter == '_') {
1767 Advance(2);
1768 return controlLetter & 0x1f;
1769 }
1770 // We match JSC in reading the backslash as a literal
1771 // character instead of as starting an escape.
1772 return '\\';
1773 }
1774 case '0':
1775 // With /u, \0 is interpreted as NUL if not followed by another digit.
1776 if (is_unicode() && !(Next() >= '0' && Next() <= '9')) {
1777 Advance();
1778 return 0;
1779 }
1780 FALL_THROUGH;
1781 case '1':
1782 case '2':
1783 case '3':
1784 case '4':
1785 case '5':
1786 case '6':
1787 case '7':
1788 // For compatibility, we interpret a decimal escape that isn't
1789 // a back reference (and therefore either \0 or not valid according
1790 // to the specification) as a 1..3 digit octal character code.
1791 if (is_unicode()) {
1792 // With \u, decimal escape is not interpreted as octal character code.
1793 ReportError("Invalid class escape");
1794 UNREACHABLE();
1795 }
1796 return ParseOctalLiteral();
1797 case 'x': {
1798 Advance();
1799 uint32_t value;
1800 if (ParseHexEscape(2, &value)) {
1801 return value;
1802 }
1803 if (is_unicode()) {
1804 // With \u, invalid escapes are not treated as identity escapes.
1805 ReportError("Invalid escape");
1806 UNREACHABLE();
1807 }
1808 // If \x is not followed by a two-digit hexadecimal, treat it
1809 // as an identity escape.
1810 return 'x';
1811 }
1812 case 'u': {
1813 Advance();
1814 uint32_t value;
1815 if (ParseUnicodeEscape(&value)) {
1816 return value;
1817 }
1818 if (is_unicode()) {
1819 // With \u, invalid escapes are not treated as identity escapes.
1820 ReportError(kUnicodeIdentity);
1821 UNREACHABLE();
1822 }
1823 // If \u is not followed by a four-digit hexadecimal, treat it
1824 // as an identity escape.
1825 return 'u';
1826 }
1827 default: {
1828 // Extended identity escape. We accept any character that hasn't
1829 // been matched by a more specific case, not just the subset required
1830 // by the ECMAScript specification.
1831 uint32_t result = current();
1832 if (!is_unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1833 Advance();
1834 return result;
1835 }
1836 ReportError(kUnicodeIdentity);
1837 UNREACHABLE();
1838 }
1839 }
1840 return 0;
1841}
1842
1843bool RegExpParser::ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
1844 bool add_unicode_case_equivalents,
1845 uint32_t* char_out) {
1846 uint32_t first = current();
1847 if (first == '\\') {
1848 switch (Next()) {
1849 case 'w':
1850 case 'W':
1851 case 'd':
1852 case 'D':
1853 case 's':
1854 case 'S': {
1855 CharacterRange::AddClassEscape(static_cast<uint16_t>(Next()), ranges,
1856 add_unicode_case_equivalents);
1857 Advance(2);
1858 return true;
1859 }
1860 case 'p':
1861 case 'P': {
1862 if (!is_unicode()) break;
1863 bool negate = Next() == 'P';
1864 Advance(2);
1865 auto name_1 = new (Z) ZoneGrowableArray<char>();
1866 auto name_2 = new (Z) ZoneGrowableArray<char>();
1867 if (!ParsePropertyClassName(name_1, name_2) ||
1868 !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1869 ReportError("Invalid property name in character class");
1870 UNREACHABLE();
1871 }
1872 return true;
1873 }
1874 case kEndMarker:
1875 ReportError("\\ at end of pattern");
1876 UNREACHABLE();
1877 default:
1878 break;
1879 }
1880 *char_out = ParseClassCharacterEscape();
1881 return false;
1882 }
1883 Advance();
1884 *char_out = first;
1885 return false;
1886}
1887
1888RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1889 static const char* kUnterminated = "Unterminated character class";
1890 static const char* kRangeInvalid = "Invalid character class";
1891 static const char* kRangeOutOfOrder = "Range out of order in character class";
1892
1893 ASSERT(current() == '[');
1894 Advance();
1895 bool is_negated = false;
1896 if (current() == '^') {
1897 is_negated = true;
1898 Advance();
1899 }
1900 ZoneGrowableArray<CharacterRange>* ranges =
1901 new (Z) ZoneGrowableArray<CharacterRange>(2);
1902 bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case();
1903 while (has_more() && current() != ']') {
1904 uint32_t char_1 = 0;
1905 bool is_class_1 =
1906 ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1);
1907 if (current() == '-') {
1908 Advance();
1909 if (current() == kEndMarker) {
1910 // If we reach the end we break out of the loop and let the
1911 // following code report an error.
1912 break;
1913 } else if (current() == ']') {
1914 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1915 ranges->Add(CharacterRange::Singleton('-'));
1916 break;
1917 }
1918 uint32_t char_2 = 0;
1919 bool is_class_2 =
1920 ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2);
1921 if (is_class_1 || is_class_2) {
1922 // Either end is an escaped character class. Treat the '-' verbatim.
1923 if (is_unicode()) {
1924 // ES2015 21.2.2.15.1 step 1.
1925 ReportError(kRangeInvalid);
1926 UNREACHABLE();
1927 }
1928 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1929 ranges->Add(CharacterRange::Singleton('-'));
1930 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2));
1931 continue;
1932 }
1933 if (char_1 > char_2) {
1934 ReportError(kRangeOutOfOrder);
1935 UNREACHABLE();
1936 }
1937 ranges->Add(CharacterRange::Range(char_1, char_2));
1938 } else {
1939 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1940 }
1941 }
1942 if (!has_more()) {
1943 ReportError(kUnterminated);
1944 UNREACHABLE();
1945 }
1946 Advance();
1947 RegExpCharacterClass::CharacterClassFlags character_class_flags =
1948 RegExpCharacterClass::DefaultFlags();
1949 if (is_negated) character_class_flags |= RegExpCharacterClass::NEGATED;
1950 return new (Z)
1951 RegExpCharacterClass(ranges, builder->flags(), character_class_flags);
1952}
1953
1954// ----------------------------------------------------------------------------
1955// The Parser interface.
1956
1957void RegExpParser::ParseRegExp(const String& input,
1958 RegExpFlags flags,
1959 RegExpCompileData* result) {
1960 ASSERT(result != NULL);
1961 RegExpParser parser(input, &result->error, flags);
1962 // Throws an exception if 'input' is not valid.
1963 RegExpTree* tree = parser.ParsePattern();
1964 ASSERT(tree != NULL);
1965 ASSERT(result->error.IsNull());
1966 result->tree = tree;
1967 intptr_t capture_count = parser.captures_started();
1968 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1969 result->contains_anchor = parser.contains_anchor();
1970 result->capture_name_map = parser.CreateCaptureNameMap();
1971 result->capture_count = capture_count;
1972}
1973
1974} // namespace dart
1975