1 | // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
2 | // for details. All rights reserved. Use of this source code is governed by a |
3 | // BSD-style license that can be found in the LICENSE file. |
4 | |
5 | #include "vm/regexp_parser.h" |
6 | |
7 | #include "unicode/uchar.h" |
8 | #include "unicode/uniset.h" |
9 | |
10 | #include "platform/unicode.h" |
11 | |
12 | #include "vm/longjump.h" |
13 | #include "vm/object_store.h" |
14 | |
15 | namespace dart { |
16 | |
17 | #define Z zone() |
18 | |
19 | // Enables possessive quantifier syntax for testing. |
20 | static const bool FLAG_regexp_possessive_quantifier = false; |
21 | |
22 | RegExpBuilder::RegExpBuilder(RegExpFlags flags) |
23 | : zone_(Thread::Current()->zone()), |
24 | pending_empty_(false), |
25 | flags_(flags), |
26 | characters_(NULL), |
27 | pending_surrogate_(kNoPendingSurrogate), |
28 | terms_(), |
29 | text_(), |
30 | alternatives_() |
31 | #ifdef DEBUG |
32 | , |
33 | last_added_(ADD_NONE) |
34 | #endif |
35 | { |
36 | } |
37 | |
38 | void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) { |
39 | ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); |
40 | FlushPendingSurrogate(); |
41 | // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
42 | pending_surrogate_ = lead_surrogate; |
43 | } |
44 | |
45 | void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) { |
46 | ASSERT(Utf16::IsTrailSurrogate(trail_surrogate)); |
47 | if (pending_surrogate_ != kNoPendingSurrogate) { |
48 | uint16_t lead_surrogate = pending_surrogate_; |
49 | pending_surrogate_ = kNoPendingSurrogate; |
50 | ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); |
51 | uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate); |
52 | if (NeedsDesugaringForIgnoreCase(combined)) { |
53 | AddCharacterClassForDesugaring(combined); |
54 | } else { |
55 | auto surrogate_pair = new (Z) ZoneGrowableArray<uint16_t>(2); |
56 | surrogate_pair->Add(lead_surrogate); |
57 | surrogate_pair->Add(trail_surrogate); |
58 | RegExpAtom* atom = new (Z) RegExpAtom(surrogate_pair, flags_); |
59 | AddAtom(atom); |
60 | } |
61 | } else { |
62 | pending_surrogate_ = trail_surrogate; |
63 | FlushPendingSurrogate(); |
64 | } |
65 | } |
66 | |
67 | void RegExpBuilder::FlushPendingSurrogate() { |
68 | if (pending_surrogate_ != kNoPendingSurrogate) { |
69 | ASSERT(is_unicode()); |
70 | uint32_t c = pending_surrogate_; |
71 | pending_surrogate_ = kNoPendingSurrogate; |
72 | AddCharacterClassForDesugaring(c); |
73 | } |
74 | } |
75 | |
76 | void RegExpBuilder::FlushCharacters() { |
77 | FlushPendingSurrogate(); |
78 | pending_empty_ = false; |
79 | if (characters_ != NULL) { |
80 | RegExpTree* atom = new (Z) RegExpAtom(characters_, flags_); |
81 | characters_ = NULL; |
82 | text_.Add(atom); |
83 | LAST(ADD_ATOM); |
84 | } |
85 | } |
86 | |
87 | void RegExpBuilder::FlushText() { |
88 | FlushCharacters(); |
89 | intptr_t num_text = text_.length(); |
90 | if (num_text == 0) { |
91 | return; |
92 | } else if (num_text == 1) { |
93 | terms_.Add(text_.Last()); |
94 | } else { |
95 | RegExpText* text = new (Z) RegExpText(); |
96 | for (intptr_t i = 0; i < num_text; i++) |
97 | text_[i]->AppendToText(text); |
98 | terms_.Add(text); |
99 | } |
100 | text_.Clear(); |
101 | } |
102 | |
103 | void RegExpBuilder::AddCharacter(uint16_t c) { |
104 | FlushPendingSurrogate(); |
105 | pending_empty_ = false; |
106 | if (NeedsDesugaringForIgnoreCase(c)) { |
107 | AddCharacterClassForDesugaring(c); |
108 | } else { |
109 | if (characters_ == NULL) { |
110 | characters_ = new (Z) ZoneGrowableArray<uint16_t>(4); |
111 | } |
112 | characters_->Add(c); |
113 | LAST(ADD_CHAR); |
114 | } |
115 | } |
116 | |
117 | void RegExpBuilder::AddUnicodeCharacter(uint32_t c) { |
118 | if (c > static_cast<uint32_t>(Utf16::kMaxCodeUnit)) { |
119 | ASSERT(is_unicode()); |
120 | uint16_t surrogates[2]; |
121 | Utf16::Encode(c, surrogates); |
122 | AddLeadSurrogate(surrogates[0]); |
123 | AddTrailSurrogate(surrogates[1]); |
124 | } else if (is_unicode() && Utf16::IsLeadSurrogate(c)) { |
125 | AddLeadSurrogate(c); |
126 | } else if (is_unicode() && Utf16::IsTrailSurrogate(c)) { |
127 | AddTrailSurrogate(c); |
128 | } else { |
129 | AddCharacter(static_cast<uint16_t>(c)); |
130 | } |
131 | } |
132 | |
133 | void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) { |
134 | // A lead or trail surrogate parsed via escape sequence will not |
135 | // pair up with any preceding lead or following trail surrogate. |
136 | FlushPendingSurrogate(); |
137 | AddUnicodeCharacter(character); |
138 | FlushPendingSurrogate(); |
139 | } |
140 | |
141 | void RegExpBuilder::AddEmpty() { |
142 | pending_empty_ = true; |
143 | } |
144 | |
145 | void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
146 | if (NeedsDesugaringForUnicode(cc)) { |
147 | // With /u, character class needs to be desugared, so it |
148 | // must be a standalone term instead of being part of a RegExpText. |
149 | AddTerm(cc); |
150 | } else { |
151 | AddAtom(cc); |
152 | } |
153 | } |
154 | |
155 | void RegExpBuilder::AddCharacterClassForDesugaring(uint32_t c) { |
156 | auto ranges = CharacterRange::List(Z, CharacterRange::Singleton(c)); |
157 | AddTerm(new (Z) RegExpCharacterClass(ranges, flags_)); |
158 | } |
159 | |
160 | void RegExpBuilder::AddAtom(RegExpTree* term) { |
161 | if (term->IsEmpty()) { |
162 | AddEmpty(); |
163 | return; |
164 | } |
165 | if (term->IsTextElement()) { |
166 | FlushCharacters(); |
167 | text_.Add(term); |
168 | } else { |
169 | FlushText(); |
170 | terms_.Add(term); |
171 | } |
172 | LAST(ADD_ATOM); |
173 | } |
174 | |
175 | void RegExpBuilder::AddTerm(RegExpTree* term) { |
176 | FlushText(); |
177 | terms_.Add(term); |
178 | LAST(ADD_ATOM); |
179 | } |
180 | |
181 | void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
182 | FlushText(); |
183 | terms_.Add(assert); |
184 | LAST(ADD_ASSERT); |
185 | } |
186 | |
187 | void RegExpBuilder::NewAlternative() { |
188 | FlushTerms(); |
189 | } |
190 | |
191 | void RegExpBuilder::FlushTerms() { |
192 | FlushText(); |
193 | intptr_t num_terms = terms_.length(); |
194 | RegExpTree* alternative; |
195 | if (num_terms == 0) { |
196 | alternative = RegExpEmpty::GetInstance(); |
197 | } else if (num_terms == 1) { |
198 | alternative = terms_.Last(); |
199 | } else { |
200 | ZoneGrowableArray<RegExpTree*>* terms = |
201 | new (Z) ZoneGrowableArray<RegExpTree*>(); |
202 | for (intptr_t i = 0; i < terms_.length(); i++) { |
203 | terms->Add(terms_[i]); |
204 | } |
205 | alternative = new (Z) RegExpAlternative(terms); |
206 | } |
207 | alternatives_.Add(alternative); |
208 | terms_.Clear(); |
209 | LAST(ADD_NONE); |
210 | } |
211 | |
212 | bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { |
213 | if (!is_unicode()) return false; |
214 | // TODO(yangguo): we could be smarter than this. Case-insensitivity does not |
215 | // necessarily mean that we need to desugar. It's probably nicer to have a |
216 | // separate pass to figure out unicode desugarings. |
217 | if (ignore_case()) return true; |
218 | ZoneGrowableArray<CharacterRange>* ranges = cc->ranges(); |
219 | CharacterRange::Canonicalize(ranges); |
220 | for (int i = ranges->length() - 1; i >= 0; i--) { |
221 | uint32_t from = ranges->At(i).from(); |
222 | uint32_t to = ranges->At(i).to(); |
223 | // Check for non-BMP characters. |
224 | if (to >= Utf16::kMaxCodeUnit) return true; |
225 | // Check for lone surrogates. |
226 | if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) { |
227 | return true; |
228 | } |
229 | } |
230 | return false; |
231 | } |
232 | |
233 | bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) { |
234 | if (is_unicode() && ignore_case()) { |
235 | icu::UnicodeSet set(c, c); |
236 | set.closeOver(USET_CASE_INSENSITIVE); |
237 | set.removeAllStrings(); |
238 | return set.size() > 1; |
239 | } |
240 | return false; |
241 | } |
242 | |
243 | RegExpTree* RegExpBuilder::ToRegExp() { |
244 | FlushTerms(); |
245 | intptr_t num_alternatives = alternatives_.length(); |
246 | if (num_alternatives == 0) { |
247 | return RegExpEmpty::GetInstance(); |
248 | } |
249 | if (num_alternatives == 1) { |
250 | return alternatives_.Last(); |
251 | } |
252 | ZoneGrowableArray<RegExpTree*>* alternatives = |
253 | new (Z) ZoneGrowableArray<RegExpTree*>(); |
254 | for (intptr_t i = 0; i < alternatives_.length(); i++) { |
255 | alternatives->Add(alternatives_[i]); |
256 | } |
257 | return new (Z) RegExpDisjunction(alternatives); |
258 | } |
259 | |
260 | bool RegExpBuilder::AddQuantifierToAtom( |
261 | intptr_t min, |
262 | intptr_t max, |
263 | RegExpQuantifier::QuantifierType quantifier_type) { |
264 | if (pending_empty_) { |
265 | pending_empty_ = false; |
266 | return true; |
267 | } |
268 | RegExpTree* atom; |
269 | if (characters_ != NULL) { |
270 | DEBUG_ASSERT(last_added_ == ADD_CHAR); |
271 | // Last atom was character. |
272 | |
273 | ZoneGrowableArray<uint16_t>* char_vector = |
274 | new (Z) ZoneGrowableArray<uint16_t>(); |
275 | char_vector->AddArray(*characters_); |
276 | intptr_t num_chars = char_vector->length(); |
277 | if (num_chars > 1) { |
278 | ZoneGrowableArray<uint16_t>* prefix = |
279 | new (Z) ZoneGrowableArray<uint16_t>(); |
280 | for (intptr_t i = 0; i < num_chars - 1; i++) { |
281 | prefix->Add(char_vector->At(i)); |
282 | } |
283 | text_.Add(new (Z) RegExpAtom(prefix, flags_)); |
284 | ZoneGrowableArray<uint16_t>* tail = new (Z) ZoneGrowableArray<uint16_t>(); |
285 | tail->Add(char_vector->At(num_chars - 1)); |
286 | char_vector = tail; |
287 | } |
288 | characters_ = NULL; |
289 | atom = new (Z) RegExpAtom(char_vector, flags_); |
290 | FlushText(); |
291 | } else if (text_.length() > 0) { |
292 | DEBUG_ASSERT(last_added_ == ADD_ATOM); |
293 | atom = text_.RemoveLast(); |
294 | FlushText(); |
295 | } else if (terms_.length() > 0) { |
296 | DEBUG_ASSERT(last_added_ == ADD_ATOM); |
297 | atom = terms_.RemoveLast(); |
298 | if (auto lookaround = atom->AsLookaround()) { |
299 | // With /u, lookarounds are not quantifiable. |
300 | if (is_unicode()) return false; |
301 | // Lookbehinds are not quantifiable. |
302 | if (lookaround->type() == RegExpLookaround::LOOKBEHIND) { |
303 | return false; |
304 | } |
305 | } |
306 | if (atom->max_match() == 0) { |
307 | // Guaranteed to only match an empty string. |
308 | LAST(ADD_TERM); |
309 | if (min == 0) { |
310 | return true; |
311 | } |
312 | terms_.Add(atom); |
313 | return true; |
314 | } |
315 | } else { |
316 | // Only call immediately after adding an atom or character! |
317 | UNREACHABLE(); |
318 | } |
319 | terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom)); |
320 | LAST(ADD_TERM); |
321 | return true; |
322 | } |
323 | |
324 | // ---------------------------------------------------------------------------- |
325 | // Implementation of Parser |
326 | |
327 | RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags) |
328 | : zone_(Thread::Current()->zone()), |
329 | captures_(nullptr), |
330 | named_captures_(nullptr), |
331 | named_back_references_(nullptr), |
332 | in_(in), |
333 | current_(kEndMarker), |
334 | next_pos_(0), |
335 | captures_started_(0), |
336 | capture_count_(0), |
337 | has_more_(true), |
338 | top_level_flags_(flags), |
339 | simple_(false), |
340 | contains_anchor_(false), |
341 | is_scanned_for_captures_(false), |
342 | has_named_captures_(false) { |
343 | Advance(); |
344 | } |
345 | |
346 | inline uint32_t RegExpParser::ReadNext(bool update_position) { |
347 | intptr_t position = next_pos_; |
348 | const uint16_t c0 = in().CharAt(position); |
349 | uint32_t c = c0; |
350 | position++; |
351 | if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) { |
352 | const uint16_t c1 = in().CharAt(position); |
353 | if (Utf16::IsTrailSurrogate(c1)) { |
354 | c = Utf16::Decode(c0, c1); |
355 | position++; |
356 | } |
357 | } |
358 | if (update_position) next_pos_ = position; |
359 | return c; |
360 | } |
361 | |
362 | uint32_t RegExpParser::Next() { |
363 | if (has_next()) { |
364 | return ReadNext(false); |
365 | } else { |
366 | return kEndMarker; |
367 | } |
368 | } |
369 | |
370 | void RegExpParser::Advance() { |
371 | if (has_next()) { |
372 | current_ = ReadNext(true); |
373 | } else { |
374 | current_ = kEndMarker; |
375 | // Advance so that position() points to 1 after the last character. This is |
376 | // important so that Reset() to this position works correctly. |
377 | next_pos_ = in().Length() + 1; |
378 | has_more_ = false; |
379 | } |
380 | } |
381 | |
382 | void RegExpParser::Reset(intptr_t pos) { |
383 | next_pos_ = pos; |
384 | has_more_ = (pos < in().Length()); |
385 | Advance(); |
386 | } |
387 | |
388 | void RegExpParser::Advance(intptr_t dist) { |
389 | next_pos_ += dist - 1; |
390 | Advance(); |
391 | } |
392 | |
393 | bool RegExpParser::simple() { |
394 | return simple_; |
395 | } |
396 | |
397 | bool RegExpParser::IsSyntaxCharacterOrSlash(uint32_t c) { |
398 | switch (c) { |
399 | case '^': |
400 | case '$': |
401 | case '\\': |
402 | case '.': |
403 | case '*': |
404 | case '+': |
405 | case '?': |
406 | case '(': |
407 | case ')': |
408 | case '[': |
409 | case ']': |
410 | case '{': |
411 | case '}': |
412 | case '|': |
413 | case '/': |
414 | return true; |
415 | default: |
416 | break; |
417 | } |
418 | return false; |
419 | } |
420 | |
421 | void RegExpParser::ReportError(const char* message) { |
422 | // Zip to the end to make sure the no more input is read. |
423 | current_ = kEndMarker; |
424 | next_pos_ = in().Length(); |
425 | |
426 | // Throw a FormatException on parsing failures. |
427 | const String& msg = String::Handle( |
428 | String::Concat(String::Handle(String::New(message)), in())); |
429 | const Array& args = Array::Handle(Array::New(1)); |
430 | args.SetAt(0, msg); |
431 | Exceptions::ThrowByType(Exceptions::kFormat, args); |
432 | UNREACHABLE(); |
433 | } |
434 | |
435 | // Pattern :: |
436 | // Disjunction |
437 | RegExpTree* RegExpParser::ParsePattern() { |
438 | RegExpTree* result = ParseDisjunction(); |
439 | PatchNamedBackReferences(); |
440 | ASSERT(!has_more()); |
441 | // If the result of parsing is a literal string atom, and it has the |
442 | // same length as the input, then the atom is identical to the input. |
443 | if (result->IsAtom() && result->AsAtom()->length() == in().Length()) { |
444 | simple_ = true; |
445 | } |
446 | return result; |
447 | } |
448 | |
449 | // Used for error messages where we would have fallen back on treating an |
450 | // escape as the identity escape, but we are in Unicode mode. |
451 | static const char* kUnicodeIdentity = |
452 | "Invalid identity escape in Unicode pattern" ; |
453 | |
454 | // Disjunction :: |
455 | // Alternative |
456 | // Alternative | Disjunction |
457 | // Alternative :: |
458 | // [empty] |
459 | // Term Alternative |
460 | // Term :: |
461 | // Assertion |
462 | // Atom |
463 | // Atom Quantifier |
464 | RegExpTree* RegExpParser::ParseDisjunction() { |
465 | // Used to store current state while parsing subexpressions. |
466 | RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, |
467 | 0, nullptr, top_level_flags_, Z); |
468 | RegExpParserState* stored_state = &initial_state; |
469 | // Cache the builder in a local variable for quick access. |
470 | RegExpBuilder* builder = initial_state.builder(); |
471 | while (true) { |
472 | switch (current()) { |
473 | case kEndMarker: |
474 | if (stored_state->IsSubexpression()) { |
475 | // Inside a parenthesized group when hitting end of input. |
476 | ReportError("Unterminated group" ); |
477 | UNREACHABLE(); |
478 | } |
479 | ASSERT(INITIAL == stored_state->group_type()); |
480 | // Parsing completed successfully. |
481 | return builder->ToRegExp(); |
482 | case ')': { |
483 | if (!stored_state->IsSubexpression()) { |
484 | ReportError("Unmatched ')'" ); |
485 | UNREACHABLE(); |
486 | } |
487 | ASSERT(INITIAL != stored_state->group_type()); |
488 | |
489 | Advance(); |
490 | // End disjunction parsing and convert builder content to new single |
491 | // regexp atom. |
492 | RegExpTree* body = builder->ToRegExp(); |
493 | |
494 | intptr_t end_capture_index = captures_started(); |
495 | |
496 | intptr_t capture_index = stored_state->capture_index(); |
497 | SubexpressionType group_type = stored_state->group_type(); |
498 | |
499 | // Build result of subexpression. |
500 | if (group_type == CAPTURE) { |
501 | if (stored_state->IsNamedCapture()) { |
502 | CreateNamedCaptureAtIndex(stored_state->capture_name(), |
503 | capture_index); |
504 | } |
505 | RegExpCapture* capture = GetCapture(capture_index); |
506 | capture->set_body(body); |
507 | body = capture; |
508 | } else if (group_type != GROUPING) { |
509 | ASSERT(group_type == POSITIVE_LOOKAROUND || |
510 | group_type == NEGATIVE_LOOKAROUND); |
511 | bool is_positive = (group_type == POSITIVE_LOOKAROUND); |
512 | body = new (Z) RegExpLookaround( |
513 | body, is_positive, end_capture_index - capture_index, |
514 | capture_index, stored_state->lookaround_type()); |
515 | } |
516 | |
517 | // Restore previous state. |
518 | stored_state = stored_state->previous_state(); |
519 | builder = stored_state->builder(); |
520 | |
521 | builder->AddAtom(body); |
522 | // For compatibility with JSC and ES3, we allow quantifiers after |
523 | // lookaheads, and break in all cases. |
524 | break; |
525 | } |
526 | case '|': { |
527 | Advance(); |
528 | builder->NewAlternative(); |
529 | continue; |
530 | } |
531 | case '*': |
532 | case '+': |
533 | case '?': |
534 | ReportError("Nothing to repeat" ); |
535 | UNREACHABLE(); |
536 | case '^': { |
537 | Advance(); |
538 | if (builder->is_multi_line()) { |
539 | builder->AddAssertion(new (Z) RegExpAssertion( |
540 | RegExpAssertion::START_OF_LINE, builder->flags())); |
541 | } else { |
542 | builder->AddAssertion(new (Z) RegExpAssertion( |
543 | RegExpAssertion::START_OF_INPUT, builder->flags())); |
544 | set_contains_anchor(); |
545 | } |
546 | continue; |
547 | } |
548 | case '$': { |
549 | Advance(); |
550 | RegExpAssertion::AssertionType assertion_type = |
551 | builder->is_multi_line() ? RegExpAssertion::END_OF_LINE |
552 | : RegExpAssertion::END_OF_INPUT; |
553 | builder->AddAssertion( |
554 | new (Z) RegExpAssertion(assertion_type, builder->flags())); |
555 | continue; |
556 | } |
557 | case '.': { |
558 | Advance(); |
559 | auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2); |
560 | if (builder->is_dot_all()) { |
561 | // Everything. |
562 | CharacterRange::AddClassEscape( |
563 | '*', ranges, |
564 | /*add_unicode_case_equivalents=*/false); |
565 | } else { |
566 | // everything except \x0a, \x0d, \u2028 and \u2029 |
567 | CharacterRange::AddClassEscape( |
568 | '.', ranges, |
569 | /*add_unicode_case_equivalents=*/false); |
570 | } |
571 | RegExpCharacterClass* cc = |
572 | new (Z) RegExpCharacterClass(ranges, builder->flags()); |
573 | builder->AddCharacterClass(cc); |
574 | break; |
575 | } |
576 | case '(': { |
577 | stored_state = ParseOpenParenthesis(stored_state); |
578 | builder = stored_state->builder(); |
579 | continue; |
580 | } |
581 | case '[': { |
582 | RegExpTree* atom = ParseCharacterClass(builder); |
583 | builder->AddCharacterClass(atom->AsCharacterClass()); |
584 | break; |
585 | } |
586 | // Atom :: |
587 | // \ AtomEscape |
588 | case '\\': |
589 | switch (Next()) { |
590 | case kEndMarker: |
591 | ReportError("\\ at end of pattern" ); |
592 | UNREACHABLE(); |
593 | case 'b': |
594 | Advance(2); |
595 | builder->AddAssertion(new (Z) RegExpAssertion( |
596 | RegExpAssertion::BOUNDARY, builder->flags())); |
597 | continue; |
598 | case 'B': |
599 | Advance(2); |
600 | builder->AddAssertion(new (Z) RegExpAssertion( |
601 | RegExpAssertion::NON_BOUNDARY, builder->flags())); |
602 | continue; |
603 | // AtomEscape :: |
604 | // CharacterClassEscape |
605 | // |
606 | // CharacterClassEscape :: one of |
607 | // d D s S w W |
608 | case 'd': |
609 | case 'D': |
610 | case 's': |
611 | case 'S': |
612 | case 'w': |
613 | case 'W': { |
614 | uint32_t c = Next(); |
615 | Advance(2); |
616 | auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2); |
617 | CharacterRange::AddClassEscape( |
618 | c, ranges, is_unicode() && builder->ignore_case()); |
619 | RegExpCharacterClass* cc = |
620 | new (Z) RegExpCharacterClass(ranges, builder->flags()); |
621 | builder->AddCharacterClass(cc); |
622 | break; |
623 | } |
624 | case 'p': |
625 | case 'P': { |
626 | uint32_t p = Next(); |
627 | Advance(2); |
628 | |
629 | if (is_unicode()) { |
630 | auto name_1 = new (Z) ZoneGrowableArray<char>(); |
631 | auto name_2 = new (Z) ZoneGrowableArray<char>(); |
632 | auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2); |
633 | if (ParsePropertyClassName(name_1, name_2)) { |
634 | if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { |
635 | RegExpCharacterClass* cc = |
636 | new (Z) RegExpCharacterClass(ranges, builder->flags()); |
637 | builder->AddCharacterClass(cc); |
638 | break; |
639 | } |
640 | } |
641 | ReportError("Invalid property name" ); |
642 | UNREACHABLE(); |
643 | } else { |
644 | builder->AddCharacter(p); |
645 | } |
646 | break; |
647 | } |
648 | case '1': |
649 | case '2': |
650 | case '3': |
651 | case '4': |
652 | case '5': |
653 | case '6': |
654 | case '7': |
655 | case '8': |
656 | case '9': { |
657 | intptr_t index = 0; |
658 | if (ParseBackReferenceIndex(&index)) { |
659 | if (stored_state->IsInsideCaptureGroup(index)) { |
660 | // The back reference is inside the capture group it refers to. |
661 | // Nothing can possibly have been captured yet, so we use empty |
662 | // instead. This ensures that, when checking a back reference, |
663 | // the capture registers of the referenced capture are either |
664 | // both set or both cleared. |
665 | builder->AddEmpty(); |
666 | } else { |
667 | RegExpCapture* capture = GetCapture(index); |
668 | RegExpTree* atom = |
669 | new (Z) RegExpBackReference(capture, builder->flags()); |
670 | builder->AddAtom(atom); |
671 | } |
672 | break; |
673 | } |
674 | // With /u, no identity escapes except for syntax characters are |
675 | // allowed. Otherwise, all identity escapes are allowed. |
676 | if (is_unicode()) { |
677 | ReportError(kUnicodeIdentity); |
678 | UNREACHABLE(); |
679 | } |
680 | uint32_t first_digit = Next(); |
681 | if (first_digit == '8' || first_digit == '9') { |
682 | builder->AddCharacter(first_digit); |
683 | Advance(2); |
684 | break; |
685 | } |
686 | } |
687 | FALL_THROUGH; |
688 | case '0': { |
689 | Advance(); |
690 | if (is_unicode() && Next() >= '0' && Next() <= '9') { |
691 | // With /u, decimal escape with leading 0 are not parsed as octal. |
692 | ReportError("Invalid decimal escape" ); |
693 | UNREACHABLE(); |
694 | } |
695 | uint32_t octal = ParseOctalLiteral(); |
696 | builder->AddCharacter(octal); |
697 | break; |
698 | } |
699 | // ControlEscape :: one of |
700 | // f n r t v |
701 | case 'f': |
702 | Advance(2); |
703 | builder->AddCharacter('\f'); |
704 | break; |
705 | case 'n': |
706 | Advance(2); |
707 | builder->AddCharacter('\n'); |
708 | break; |
709 | case 'r': |
710 | Advance(2); |
711 | builder->AddCharacter('\r'); |
712 | break; |
713 | case 't': |
714 | Advance(2); |
715 | builder->AddCharacter('\t'); |
716 | break; |
717 | case 'v': |
718 | Advance(2); |
719 | builder->AddCharacter('\v'); |
720 | break; |
721 | case 'c': { |
722 | Advance(); |
723 | uint32_t controlLetter = Next(); |
724 | // Special case if it is an ASCII letter. |
725 | // Convert lower case letters to uppercase. |
726 | uint32_t letter = controlLetter & ~('a' ^ 'A'); |
727 | if (letter < 'A' || 'Z' < letter) { |
728 | // controlLetter is not in range 'A'-'Z' or 'a'-'z'. |
729 | // This is outside the specification. We match JSC in |
730 | // reading the backslash as a literal character instead |
731 | // of as starting an escape. |
732 | if (is_unicode()) { |
733 | // With /u, invalid escapes are not treated as identity escapes. |
734 | ReportError(kUnicodeIdentity); |
735 | UNREACHABLE(); |
736 | } |
737 | builder->AddCharacter('\\'); |
738 | } else { |
739 | Advance(2); |
740 | builder->AddCharacter(controlLetter & 0x1f); |
741 | } |
742 | break; |
743 | } |
744 | case 'x': { |
745 | Advance(2); |
746 | uint32_t value; |
747 | if (ParseHexEscape(2, &value)) { |
748 | builder->AddCharacter(value); |
749 | } else if (!is_unicode()) { |
750 | builder->AddCharacter('x'); |
751 | } else { |
752 | // With /u, invalid escapes are not treated as identity escapes. |
753 | ReportError(kUnicodeIdentity); |
754 | UNREACHABLE(); |
755 | } |
756 | break; |
757 | } |
758 | case 'u': { |
759 | Advance(2); |
760 | uint32_t value; |
761 | if (ParseUnicodeEscape(&value)) { |
762 | builder->AddEscapedUnicodeCharacter(value); |
763 | } else if (!is_unicode()) { |
764 | builder->AddCharacter('u'); |
765 | } else { |
766 | // With /u, invalid escapes are not treated as identity escapes. |
767 | ReportError(kUnicodeIdentity); |
768 | UNREACHABLE(); |
769 | } |
770 | break; |
771 | } |
772 | case 'k': |
773 | // Either an identity escape or a named back-reference. The two |
774 | // interpretations are mutually exclusive: '\k' is interpreted as |
775 | // an identity escape for non-Unicode patterns without named |
776 | // capture groups, and as the beginning of a named back-reference |
777 | // in all other cases. |
778 | if (is_unicode() || HasNamedCaptures()) { |
779 | Advance(2); |
780 | ParseNamedBackReference(builder, stored_state); |
781 | break; |
782 | } |
783 | FALL_THROUGH; |
784 | default: |
785 | Advance(); |
786 | // With the unicode flag, no identity escapes except for syntax |
787 | // characters are allowed. Otherwise, all identity escapes are |
788 | // allowed. |
789 | if (!is_unicode() || IsSyntaxCharacterOrSlash(current())) { |
790 | builder->AddCharacter(current()); |
791 | Advance(); |
792 | } else { |
793 | ReportError(kUnicodeIdentity); |
794 | UNREACHABLE(); |
795 | } |
796 | break; |
797 | } |
798 | break; |
799 | case '{': { |
800 | intptr_t dummy; |
801 | if (ParseIntervalQuantifier(&dummy, &dummy)) { |
802 | ReportError("Nothing to repeat" ); |
803 | UNREACHABLE(); |
804 | } |
805 | } |
806 | FALL_THROUGH; |
807 | case '}': |
808 | case ']': |
809 | if (is_unicode()) { |
810 | ReportError("Lone quantifier brackets" ); |
811 | UNREACHABLE(); |
812 | } |
813 | FALL_THROUGH; |
814 | default: |
815 | builder->AddUnicodeCharacter(current()); |
816 | Advance(); |
817 | break; |
818 | } // end switch(current()) |
819 | |
820 | intptr_t min; |
821 | intptr_t max; |
822 | switch (current()) { |
823 | // QuantifierPrefix :: |
824 | // * |
825 | // + |
826 | // ? |
827 | // { |
828 | case '*': |
829 | min = 0; |
830 | max = RegExpTree::kInfinity; |
831 | Advance(); |
832 | break; |
833 | case '+': |
834 | min = 1; |
835 | max = RegExpTree::kInfinity; |
836 | Advance(); |
837 | break; |
838 | case '?': |
839 | min = 0; |
840 | max = 1; |
841 | Advance(); |
842 | break; |
843 | case '{': |
844 | if (ParseIntervalQuantifier(&min, &max)) { |
845 | if (max < min) { |
846 | ReportError("numbers out of order in {} quantifier." ); |
847 | UNREACHABLE(); |
848 | } |
849 | break; |
850 | } else { |
851 | continue; |
852 | } |
853 | default: |
854 | continue; |
855 | } |
856 | RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; |
857 | if (current() == '?') { |
858 | quantifier_type = RegExpQuantifier::NON_GREEDY; |
859 | Advance(); |
860 | } else if (FLAG_regexp_possessive_quantifier && current() == '+') { |
861 | // FLAG_regexp_possessive_quantifier is a debug-only flag. |
862 | quantifier_type = RegExpQuantifier::POSSESSIVE; |
863 | Advance(); |
864 | } |
865 | if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
866 | ReportError("invalid quantifier." ); |
867 | UNREACHABLE(); |
868 | } |
869 | } |
870 | } |
871 | |
872 | #ifdef DEBUG |
873 | // Currently only used in an ASSERT. |
874 | static bool IsSpecialClassEscape(uint32_t c) { |
875 | switch (c) { |
876 | case 'd': |
877 | case 'D': |
878 | case 's': |
879 | case 'S': |
880 | case 'w': |
881 | case 'W': |
882 | return true; |
883 | default: |
884 | return false; |
885 | } |
886 | } |
887 | #endif |
888 | |
889 | RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( |
890 | RegExpParserState* state) { |
891 | RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
892 | bool is_named_capture = false; |
893 | const RegExpCaptureName* capture_name = nullptr; |
894 | SubexpressionType subexpr_type = CAPTURE; |
895 | Advance(); |
896 | if (current() == '?') { |
897 | switch (Next()) { |
898 | case ':': |
899 | Advance(2); |
900 | subexpr_type = GROUPING; |
901 | break; |
902 | case '=': |
903 | Advance(2); |
904 | lookaround_type = RegExpLookaround::LOOKAHEAD; |
905 | subexpr_type = POSITIVE_LOOKAROUND; |
906 | break; |
907 | case '!': |
908 | Advance(2); |
909 | lookaround_type = RegExpLookaround::LOOKAHEAD; |
910 | subexpr_type = NEGATIVE_LOOKAROUND; |
911 | break; |
912 | case '<': |
913 | Advance(); |
914 | if (Next() == '=') { |
915 | Advance(2); |
916 | lookaround_type = RegExpLookaround::LOOKBEHIND; |
917 | subexpr_type = POSITIVE_LOOKAROUND; |
918 | break; |
919 | } else if (Next() == '!') { |
920 | Advance(2); |
921 | lookaround_type = RegExpLookaround::LOOKBEHIND; |
922 | subexpr_type = NEGATIVE_LOOKAROUND; |
923 | break; |
924 | } |
925 | is_named_capture = true; |
926 | has_named_captures_ = true; |
927 | Advance(); |
928 | break; |
929 | default: |
930 | ReportError("Invalid group" ); |
931 | UNREACHABLE(); |
932 | } |
933 | } |
934 | |
935 | if (subexpr_type == CAPTURE) { |
936 | if (captures_started_ >= kMaxCaptures) { |
937 | ReportError("Too many captures" ); |
938 | UNREACHABLE(); |
939 | } |
940 | captures_started_++; |
941 | |
942 | if (is_named_capture) { |
943 | capture_name = ParseCaptureGroupName(); |
944 | } |
945 | } |
946 | // Store current state and begin new disjunction parsing. |
947 | return new (Z) |
948 | RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, |
949 | capture_name, state->builder()->flags(), Z); |
950 | } |
951 | |
952 | // In order to know whether an escape is a backreference or not we have to scan |
953 | // the entire regexp and find the number of capturing parentheses. However we |
954 | // don't want to scan the regexp twice unless it is necessary. This mini-parser |
955 | // is called when needed. It can see the difference between capturing and |
956 | // noncapturing parentheses and can skip character classes and backslash-escaped |
957 | // characters. |
958 | void RegExpParser::ScanForCaptures() { |
959 | ASSERT(!is_scanned_for_captures_); |
960 | const intptr_t saved_position = position(); |
961 | // Start with captures started previous to current position |
962 | intptr_t capture_count = captures_started(); |
963 | // Add count of captures after this position. |
964 | uintptr_t n; |
965 | while ((n = current()) != kEndMarker) { |
966 | Advance(); |
967 | switch (n) { |
968 | case '\\': |
969 | Advance(); |
970 | break; |
971 | case '[': { |
972 | uintptr_t c; |
973 | while ((c = current()) != kEndMarker) { |
974 | Advance(); |
975 | if (c == '\\') { |
976 | Advance(); |
977 | } else { |
978 | if (c == ']') break; |
979 | } |
980 | } |
981 | break; |
982 | } |
983 | case '(': |
984 | // At this point we could be in |
985 | // * a non-capturing group '(:', |
986 | // * a lookbehind assertion '(?<=' '(?<!' |
987 | // * or a named capture '(?<'. |
988 | // |
989 | // Of these, only named captures are capturing groups. |
990 | if (current() == '?') { |
991 | Advance(); |
992 | if (current() != '<') break; |
993 | |
994 | Advance(); |
995 | if (current() == '=' || current() == '!') break; |
996 | |
997 | // Found a possible named capture. It could turn out to be a syntax |
998 | // error (e.g. an unterminated or invalid name), but that distinction |
999 | // does not matter for our purposes. |
1000 | has_named_captures_ = true; |
1001 | } |
1002 | capture_count++; |
1003 | break; |
1004 | } |
1005 | } |
1006 | capture_count_ = capture_count; |
1007 | is_scanned_for_captures_ = true; |
1008 | Reset(saved_position); |
1009 | } |
1010 | |
1011 | bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) { |
1012 | ASSERT('\\' == current()); |
1013 | ASSERT('1' <= Next() && Next() <= '9'); |
1014 | // Try to parse a decimal literal that is no greater than the total number |
1015 | // of left capturing parentheses in the input. |
1016 | intptr_t start = position(); |
1017 | intptr_t value = Next() - '0'; |
1018 | Advance(2); |
1019 | while (true) { |
1020 | uint32_t c = current(); |
1021 | if (Utils::IsDecimalDigit(c)) { |
1022 | value = 10 * value + (c - '0'); |
1023 | if (value > kMaxCaptures) { |
1024 | Reset(start); |
1025 | return false; |
1026 | } |
1027 | Advance(); |
1028 | } else { |
1029 | break; |
1030 | } |
1031 | } |
1032 | if (value > captures_started()) { |
1033 | if (!is_scanned_for_captures_) ScanForCaptures(); |
1034 | if (value > capture_count_) { |
1035 | Reset(start); |
1036 | return false; |
1037 | } |
1038 | } |
1039 | *index_out = value; |
1040 | return true; |
1041 | } |
1042 | |
1043 | namespace { |
1044 | |
1045 | static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) { |
1046 | return Utils::IsAlphaNumeric(ch) || ch == '_' || ch == '$'; |
1047 | } |
1048 | |
1049 | // ES#sec-names-and-keywords Names and Keywords |
1050 | // UnicodeIDStart, '$', '_' and '\' |
1051 | static bool IsIdentifierStartSlow(uint32_t c) { |
1052 | // cannot use u_isIDStart because it does not work for |
1053 | // Other_ID_Start characters. |
1054 | return u_hasBinaryProperty(c, UCHAR_ID_START) || |
1055 | (c < 0x60 && (c == '$' || c == '\\' || c == '_')); |
1056 | } |
1057 | |
1058 | // ES#sec-names-and-keywords Names and Keywords |
1059 | // UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ |
1060 | static bool IsIdentifierPartSlow(uint32_t c) { |
1061 | const uint32_t kZeroWidthNonJoiner = 0x200C; |
1062 | const uint32_t kZeroWidthJoiner = 0x200D; |
1063 | // Can't use u_isIDPart because it does not work for |
1064 | // Other_ID_Continue characters. |
1065 | return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || |
1066 | (c < 0x60 && (c == '$' || c == '\\' || c == '_')) || |
1067 | c == kZeroWidthNonJoiner || c == kZeroWidthJoiner; |
1068 | } |
1069 | |
1070 | static inline bool IsIdentifierStart(uint32_t c) { |
1071 | if (c > 127) return IsIdentifierStartSlow(c); |
1072 | return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c); |
1073 | } |
1074 | |
1075 | static inline bool IsIdentifierPart(uint32_t c) { |
1076 | if (c > 127) return IsIdentifierPartSlow(c); |
1077 | return IsAsciiIdentifierPart(c); |
1078 | } |
1079 | |
1080 | static bool IsSameName(const RegExpCaptureName* name1, |
1081 | const RegExpCaptureName* name2) { |
1082 | if (name1->length() != name2->length()) return false; |
1083 | for (intptr_t i = 0; i < name1->length(); i++) { |
1084 | if (name1->At(i) != name2->At(i)) return false; |
1085 | } |
1086 | return true; |
1087 | } |
1088 | |
1089 | } // end namespace |
1090 | |
1091 | static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) { |
1092 | if (code_unit <= Utf16::kMaxCodeUnit) { |
1093 | v->Add(code_unit); |
1094 | } else { |
1095 | uint16_t units[2]; |
1096 | Utf16::Encode(code_unit, units); |
1097 | v->Add(units[0]); |
1098 | v->Add(units[1]); |
1099 | } |
1100 | } |
1101 | |
1102 | const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() { |
1103 | auto name = new (Z) RegExpCaptureName(); |
1104 | |
1105 | bool at_start = true; |
1106 | while (true) { |
1107 | uint32_t c = current(); |
1108 | Advance(); |
1109 | |
1110 | // Convert unicode escapes. |
1111 | if (c == '\\' && current() == 'u') { |
1112 | Advance(); |
1113 | if (!ParseUnicodeEscape(&c)) { |
1114 | ReportError("Invalid Unicode escape sequence" ); |
1115 | UNREACHABLE(); |
1116 | } |
1117 | } |
1118 | |
1119 | // The backslash char is misclassified as both ID_Start and ID_Continue. |
1120 | if (c == '\\') { |
1121 | ReportError("Invalid capture group name" ); |
1122 | UNREACHABLE(); |
1123 | } |
1124 | |
1125 | if (at_start) { |
1126 | if (!IsIdentifierStart(c)) { |
1127 | ReportError("Invalid capture group name" ); |
1128 | UNREACHABLE(); |
1129 | } |
1130 | PushCodeUnit(name, c); |
1131 | at_start = false; |
1132 | } else { |
1133 | if (c == '>') { |
1134 | break; |
1135 | } else if (IsIdentifierPart(c)) { |
1136 | PushCodeUnit(name, c); |
1137 | } else { |
1138 | ReportError("Invalid capture group name" ); |
1139 | UNREACHABLE(); |
1140 | } |
1141 | } |
1142 | } |
1143 | |
1144 | return name; |
1145 | } |
1146 | |
1147 | intptr_t RegExpParser::GetNamedCaptureIndex(const RegExpCaptureName* name) { |
1148 | for (const auto& capture : *named_captures_) { |
1149 | if (IsSameName(name, capture->name())) return capture->index(); |
1150 | } |
1151 | return -1; |
1152 | } |
1153 | |
1154 | void RegExpParser::CreateNamedCaptureAtIndex(const RegExpCaptureName* name, |
1155 | intptr_t index) { |
1156 | ASSERT(0 < index && index <= captures_started_); |
1157 | ASSERT(name != nullptr); |
1158 | |
1159 | if (named_captures_ == nullptr) { |
1160 | named_captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(1); |
1161 | } else { |
1162 | // Check for duplicates and bail if we find any. Currently O(n^2). |
1163 | if (GetNamedCaptureIndex(name) >= 0) { |
1164 | ReportError("Duplicate capture group name" ); |
1165 | UNREACHABLE(); |
1166 | } |
1167 | } |
1168 | |
1169 | RegExpCapture* capture = GetCapture(index); |
1170 | ASSERT(capture->name() == nullptr); |
1171 | |
1172 | capture->set_name(name); |
1173 | named_captures_->Add(capture); |
1174 | } |
1175 | |
1176 | bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, |
1177 | RegExpParserState* state) { |
1178 | // The parser is assumed to be on the '<' in \k<name>. |
1179 | if (current() != '<') { |
1180 | ReportError("Invalid named reference" ); |
1181 | UNREACHABLE(); |
1182 | } |
1183 | |
1184 | Advance(); |
1185 | const RegExpCaptureName* name = ParseCaptureGroupName(); |
1186 | if (name == nullptr) { |
1187 | return false; |
1188 | } |
1189 | |
1190 | if (state->IsInsideCaptureGroup(name)) { |
1191 | builder->AddEmpty(); |
1192 | } else { |
1193 | RegExpBackReference* atom = new (Z) RegExpBackReference(builder->flags()); |
1194 | atom->set_name(name); |
1195 | |
1196 | builder->AddAtom(atom); |
1197 | |
1198 | if (named_back_references_ == nullptr) { |
1199 | named_back_references_ = |
1200 | new (Z) ZoneGrowableArray<RegExpBackReference*>(1); |
1201 | } |
1202 | named_back_references_->Add(atom); |
1203 | } |
1204 | |
1205 | return true; |
1206 | } |
1207 | |
1208 | void RegExpParser::PatchNamedBackReferences() { |
1209 | if (named_back_references_ == nullptr) return; |
1210 | |
1211 | if (named_captures_ == nullptr) { |
1212 | ReportError("Invalid named capture referenced" ); |
1213 | return; |
1214 | } |
1215 | |
1216 | // Look up and patch the actual capture for each named back reference. |
1217 | // Currently O(n^2), optimize if necessary. |
1218 | for (intptr_t i = 0; i < named_back_references_->length(); i++) { |
1219 | RegExpBackReference* ref = named_back_references_->At(i); |
1220 | intptr_t index = GetNamedCaptureIndex(ref->name()); |
1221 | |
1222 | if (index < 0) { |
1223 | ReportError("Invalid named capture referenced" ); |
1224 | UNREACHABLE(); |
1225 | } |
1226 | ref->set_capture(GetCapture(index)); |
1227 | } |
1228 | } |
1229 | |
1230 | RegExpCapture* RegExpParser::GetCapture(intptr_t index) { |
1231 | // The index for the capture groups are one-based. Its index in the list is |
1232 | // zero-based. |
1233 | const intptr_t know_captures = |
1234 | is_scanned_for_captures_ ? capture_count_ : captures_started_; |
1235 | ASSERT(index <= know_captures); |
1236 | if (captures_ == nullptr) { |
1237 | captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(know_captures); |
1238 | } |
1239 | while (captures_->length() < know_captures) { |
1240 | captures_->Add(new (Z) RegExpCapture(captures_->length() + 1)); |
1241 | } |
1242 | return captures_->At(index - 1); |
1243 | } |
1244 | |
1245 | ArrayPtr RegExpParser::CreateCaptureNameMap() { |
1246 | if (named_captures_ == nullptr || named_captures_->is_empty()) { |
1247 | return Array::null(); |
1248 | } |
1249 | |
1250 | const intptr_t len = named_captures_->length() * 2; |
1251 | |
1252 | const Array& array = Array::Handle(Array::New(len)); |
1253 | |
1254 | auto& name = String::Handle(); |
1255 | auto& smi = Smi::Handle(); |
1256 | for (intptr_t i = 0; i < named_captures_->length(); i++) { |
1257 | RegExpCapture* capture = named_captures_->At(i); |
1258 | name = |
1259 | String::FromUTF16(capture->name()->data(), capture->name()->length()); |
1260 | smi = Smi::New(capture->index()); |
1261 | array.SetAt(i * 2, name); |
1262 | array.SetAt(i * 2 + 1, smi); |
1263 | } |
1264 | |
1265 | return array.raw(); |
1266 | } |
1267 | |
1268 | bool RegExpParser::HasNamedCaptures() { |
1269 | if (has_named_captures_ || is_scanned_for_captures_) { |
1270 | return has_named_captures_; |
1271 | } |
1272 | |
1273 | ScanForCaptures(); |
1274 | ASSERT(is_scanned_for_captures_); |
1275 | return has_named_captures_; |
1276 | } |
1277 | |
1278 | bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) { |
1279 | for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { |
1280 | if (s->group_type() != CAPTURE) continue; |
1281 | // Return true if we found the matching capture index. |
1282 | if (index == s->capture_index()) return true; |
1283 | // Abort if index is larger than what has been parsed up till this state. |
1284 | if (index > s->capture_index()) return false; |
1285 | } |
1286 | return false; |
1287 | } |
1288 | |
1289 | bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( |
1290 | const RegExpCaptureName* name) { |
1291 | ASSERT(name != nullptr); |
1292 | for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { |
1293 | if (s->capture_name() == nullptr) continue; |
1294 | if (IsSameName(s->capture_name(), name)) return true; |
1295 | } |
1296 | return false; |
1297 | } |
1298 | |
1299 | // QuantifierPrefix :: |
1300 | // { DecimalDigits } |
1301 | // { DecimalDigits , } |
1302 | // { DecimalDigits , DecimalDigits } |
1303 | // |
1304 | // Returns true if parsing succeeds, and set the min_out and max_out |
1305 | // values. Values are truncated to RegExpTree::kInfinity if they overflow. |
1306 | bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out, |
1307 | intptr_t* max_out) { |
1308 | ASSERT(current() == '{'); |
1309 | intptr_t start = position(); |
1310 | Advance(); |
1311 | intptr_t min = 0; |
1312 | if (!Utils::IsDecimalDigit(current())) { |
1313 | Reset(start); |
1314 | return false; |
1315 | } |
1316 | while (Utils::IsDecimalDigit(current())) { |
1317 | intptr_t next = current() - '0'; |
1318 | if (min > (RegExpTree::kInfinity - next) / 10) { |
1319 | // Overflow. Skip past remaining decimal digits and return -1. |
1320 | do { |
1321 | Advance(); |
1322 | } while (Utils::IsDecimalDigit(current())); |
1323 | min = RegExpTree::kInfinity; |
1324 | break; |
1325 | } |
1326 | min = 10 * min + next; |
1327 | Advance(); |
1328 | } |
1329 | intptr_t max = 0; |
1330 | if (current() == '}') { |
1331 | max = min; |
1332 | Advance(); |
1333 | } else if (current() == ',') { |
1334 | Advance(); |
1335 | if (current() == '}') { |
1336 | max = RegExpTree::kInfinity; |
1337 | Advance(); |
1338 | } else { |
1339 | while (Utils::IsDecimalDigit(current())) { |
1340 | intptr_t next = current() - '0'; |
1341 | if (max > (RegExpTree::kInfinity - next) / 10) { |
1342 | do { |
1343 | Advance(); |
1344 | } while (Utils::IsDecimalDigit(current())); |
1345 | max = RegExpTree::kInfinity; |
1346 | break; |
1347 | } |
1348 | max = 10 * max + next; |
1349 | Advance(); |
1350 | } |
1351 | if (current() != '}') { |
1352 | Reset(start); |
1353 | return false; |
1354 | } |
1355 | Advance(); |
1356 | } |
1357 | } else { |
1358 | Reset(start); |
1359 | return false; |
1360 | } |
1361 | *min_out = min; |
1362 | *max_out = max; |
1363 | return true; |
1364 | } |
1365 | |
1366 | uint32_t RegExpParser::ParseOctalLiteral() { |
1367 | ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker); |
1368 | // For compatibility with some other browsers (not all), we parse |
1369 | // up to three octal digits with a value below 256. |
1370 | uint32_t value = current() - '0'; |
1371 | Advance(); |
1372 | if ('0' <= current() && current() <= '7') { |
1373 | value = value * 8 + current() - '0'; |
1374 | Advance(); |
1375 | if (value < 32 && '0' <= current() && current() <= '7') { |
1376 | value = value * 8 + current() - '0'; |
1377 | Advance(); |
1378 | } |
1379 | } |
1380 | return value; |
1381 | } |
1382 | |
1383 | // Returns the value (0 .. 15) of a hexadecimal character c. |
1384 | // If c is not a legal hexadecimal character, returns a value < 0. |
1385 | static inline intptr_t HexValue(uint32_t c) { |
1386 | c -= '0'; |
1387 | if (static_cast<unsigned>(c) <= 9) return c; |
1388 | c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. |
1389 | if (static_cast<unsigned>(c) <= 5) return c + 10; |
1390 | return -1; |
1391 | } |
1392 | |
1393 | bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) { |
1394 | intptr_t start = position(); |
1395 | uint32_t val = 0; |
1396 | bool done = false; |
1397 | for (intptr_t i = 0; !done; i++) { |
1398 | uint32_t c = current(); |
1399 | intptr_t d = HexValue(c); |
1400 | if (d < 0) { |
1401 | Reset(start); |
1402 | return false; |
1403 | } |
1404 | val = val * 16 + d; |
1405 | Advance(); |
1406 | if (i == length - 1) { |
1407 | done = true; |
1408 | } |
1409 | } |
1410 | *value = val; |
1411 | return true; |
1412 | } |
1413 | |
1414 | // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
1415 | bool RegExpParser::ParseUnicodeEscape(uint32_t* value) { |
1416 | // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
1417 | // allowed). In the latter case, the number of hex digits between { } is |
1418 | // arbitrary. \ and u have already been read. |
1419 | if (current() == '{' && is_unicode()) { |
1420 | int start = position(); |
1421 | Advance(); |
1422 | if (ParseUnlimitedLengthHexNumber(Utf::kMaxCodePoint, value)) { |
1423 | if (current() == '}') { |
1424 | Advance(); |
1425 | return true; |
1426 | } |
1427 | } |
1428 | Reset(start); |
1429 | return false; |
1430 | } |
1431 | // \u but no {, or \u{...} escapes not allowed. |
1432 | bool result = ParseHexEscape(4, value); |
1433 | if (result && is_unicode() && Utf16::IsLeadSurrogate(*value) && |
1434 | current() == '\\') { |
1435 | // Attempt to read trail surrogate. |
1436 | int start = position(); |
1437 | if (Next() == 'u') { |
1438 | Advance(2); |
1439 | uint32_t trail; |
1440 | if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) { |
1441 | *value = Utf16::Decode(static_cast<uint16_t>(*value), |
1442 | static_cast<uint16_t>(trail)); |
1443 | return true; |
1444 | } |
1445 | } |
1446 | Reset(start); |
1447 | } |
1448 | return result; |
1449 | } |
1450 | |
1451 | namespace { |
1452 | |
1453 | bool IsExactPropertyAlias(const char* property_name, UProperty property) { |
1454 | const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); |
1455 | if (short_name != nullptr && strcmp(property_name, short_name) == 0) { |
1456 | return true; |
1457 | } |
1458 | for (int i = 0;; i++) { |
1459 | const char* long_name = u_getPropertyName( |
1460 | property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
1461 | if (long_name == nullptr) break; |
1462 | if (strcmp(property_name, long_name) == 0) return true; |
1463 | } |
1464 | return false; |
1465 | } |
1466 | |
1467 | bool IsExactPropertyValueAlias(const char* property_value_name, |
1468 | UProperty property, |
1469 | int32_t property_value) { |
1470 | const char* short_name = |
1471 | u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); |
1472 | if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { |
1473 | return true; |
1474 | } |
1475 | for (int i = 0;; i++) { |
1476 | const char* long_name = u_getPropertyValueName( |
1477 | property, property_value, |
1478 | static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
1479 | if (long_name == nullptr) break; |
1480 | if (strcmp(property_value_name, long_name) == 0) return true; |
1481 | } |
1482 | return false; |
1483 | } |
1484 | |
1485 | bool LookupPropertyValueName(UProperty property, |
1486 | const char* property_value_name, |
1487 | bool negate, |
1488 | ZoneGrowableArray<CharacterRange>* result) { |
1489 | UProperty property_for_lookup = property; |
1490 | if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { |
1491 | // For the property Script_Extensions, we have to do the property value |
1492 | // name lookup as if the property is Script. |
1493 | property_for_lookup = UCHAR_SCRIPT; |
1494 | } |
1495 | int32_t property_value = |
1496 | u_getPropertyValueEnum(property_for_lookup, property_value_name); |
1497 | if (property_value == UCHAR_INVALID_CODE) return false; |
1498 | |
1499 | // We require the property name to match exactly to one of the property value |
1500 | // aliases. However, u_getPropertyValueEnum uses loose matching. |
1501 | if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, |
1502 | property_value)) { |
1503 | return false; |
1504 | } |
1505 | |
1506 | UErrorCode ec = U_ZERO_ERROR; |
1507 | icu::UnicodeSet set; |
1508 | set.applyIntPropertyValue(property, property_value, ec); |
1509 | bool success = ec == U_ZERO_ERROR && (set.isEmpty() == 0); |
1510 | |
1511 | if (success) { |
1512 | set.removeAllStrings(); |
1513 | if (negate) set.complement(); |
1514 | for (int i = 0; i < set.getRangeCount(); i++) { |
1515 | result->Add( |
1516 | CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i))); |
1517 | } |
1518 | } |
1519 | return success; |
1520 | } |
1521 | |
1522 | template <size_t N> |
1523 | inline bool NameEquals(const char* name, const char (&literal)[N]) { |
1524 | return strncmp(name, literal, N + 1) == 0; |
1525 | } |
1526 | |
1527 | bool LookupSpecialPropertyValueName(const char* name, |
1528 | ZoneGrowableArray<CharacterRange>* result, |
1529 | bool negate) { |
1530 | if (NameEquals(name, "Any" )) { |
1531 | if (negate) { |
1532 | // Leave the list of character ranges empty, since the negation of 'Any' |
1533 | // is the empty set. |
1534 | } else { |
1535 | result->Add(CharacterRange::Everything()); |
1536 | } |
1537 | } else if (NameEquals(name, "ASCII" )) { |
1538 | result->Add(negate ? CharacterRange::Range(0x80, Utf::kMaxCodePoint) |
1539 | : CharacterRange::Range(0x0, 0x7F)); |
1540 | } else if (NameEquals(name, "Assigned" )) { |
1541 | return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned" , |
1542 | !negate, result); |
1543 | } else { |
1544 | return false; |
1545 | } |
1546 | return true; |
1547 | } |
1548 | |
1549 | // Explicitly list supported binary properties. The spec forbids supporting |
1550 | // properties outside of this set to ensure interoperability. |
1551 | bool IsSupportedBinaryProperty(UProperty property) { |
1552 | switch (property) { |
1553 | case UCHAR_ALPHABETIC: |
1554 | // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. |
1555 | // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. |
1556 | case UCHAR_ASCII_HEX_DIGIT: |
1557 | // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. |
1558 | case UCHAR_BIDI_CONTROL: |
1559 | case UCHAR_BIDI_MIRRORED: |
1560 | case UCHAR_CASE_IGNORABLE: |
1561 | case UCHAR_CASED: |
1562 | case UCHAR_CHANGES_WHEN_CASEFOLDED: |
1563 | case UCHAR_CHANGES_WHEN_CASEMAPPED: |
1564 | case UCHAR_CHANGES_WHEN_LOWERCASED: |
1565 | case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: |
1566 | case UCHAR_CHANGES_WHEN_TITLECASED: |
1567 | case UCHAR_CHANGES_WHEN_UPPERCASED: |
1568 | case UCHAR_DASH: |
1569 | case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: |
1570 | case UCHAR_DEPRECATED: |
1571 | case UCHAR_DIACRITIC: |
1572 | case UCHAR_EMOJI: |
1573 | case UCHAR_EMOJI_COMPONENT: |
1574 | case UCHAR_EMOJI_MODIFIER_BASE: |
1575 | case UCHAR_EMOJI_MODIFIER: |
1576 | case UCHAR_EMOJI_PRESENTATION: |
1577 | case UCHAR_EXTENDED_PICTOGRAPHIC: |
1578 | case UCHAR_EXTENDER: |
1579 | case UCHAR_GRAPHEME_BASE: |
1580 | case UCHAR_GRAPHEME_EXTEND: |
1581 | case UCHAR_HEX_DIGIT: |
1582 | case UCHAR_ID_CONTINUE: |
1583 | case UCHAR_ID_START: |
1584 | case UCHAR_IDEOGRAPHIC: |
1585 | case UCHAR_IDS_BINARY_OPERATOR: |
1586 | case UCHAR_IDS_TRINARY_OPERATOR: |
1587 | case UCHAR_JOIN_CONTROL: |
1588 | case UCHAR_LOGICAL_ORDER_EXCEPTION: |
1589 | case UCHAR_LOWERCASE: |
1590 | case UCHAR_MATH: |
1591 | case UCHAR_NONCHARACTER_CODE_POINT: |
1592 | case UCHAR_PATTERN_SYNTAX: |
1593 | case UCHAR_PATTERN_WHITE_SPACE: |
1594 | case UCHAR_QUOTATION_MARK: |
1595 | case UCHAR_RADICAL: |
1596 | case UCHAR_REGIONAL_INDICATOR: |
1597 | case UCHAR_S_TERM: |
1598 | case UCHAR_SOFT_DOTTED: |
1599 | case UCHAR_TERMINAL_PUNCTUATION: |
1600 | case UCHAR_UNIFIED_IDEOGRAPH: |
1601 | case UCHAR_UPPERCASE: |
1602 | case UCHAR_VARIATION_SELECTOR: |
1603 | case UCHAR_WHITE_SPACE: |
1604 | case UCHAR_XID_CONTINUE: |
1605 | case UCHAR_XID_START: |
1606 | return true; |
1607 | default: |
1608 | break; |
1609 | } |
1610 | return false; |
1611 | } |
1612 | |
1613 | bool IsUnicodePropertyValueCharacter(char c) { |
1614 | // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ |
1615 | // |
1616 | // Note that using this to validate each parsed char is quite conservative. |
1617 | // A possible alternative solution would be to only ensure the parsed |
1618 | // property name/value candidate string does not contain '\0' characters and |
1619 | // let ICU lookups trigger the final failure. |
1620 | if (Utils::IsAlphaNumeric(c)) return true; |
1621 | return (c == '_'); |
1622 | } |
1623 | |
1624 | } // anonymous namespace |
1625 | |
1626 | bool RegExpParser::ParsePropertyClassName(ZoneGrowableArray<char>* name_1, |
1627 | ZoneGrowableArray<char>* name_2) { |
1628 | ASSERT(name_1->is_empty()); |
1629 | ASSERT(name_2->is_empty()); |
1630 | // Parse the property class as follows: |
1631 | // - In \p{name}, 'name' is interpreted |
1632 | // - either as a general category property value name. |
1633 | // - or as a binary property name. |
1634 | // - In \p{name=value}, 'name' is interpreted as an enumerated property name, |
1635 | // and 'value' is interpreted as one of the available property value names. |
1636 | // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. |
1637 | // - Loose matching is not applied. |
1638 | if (current() == '{') { |
1639 | // Parse \p{[PropertyName=]PropertyNameValue} |
1640 | for (Advance(); current() != '}' && current() != '='; Advance()) { |
1641 | if (!IsUnicodePropertyValueCharacter(current())) return false; |
1642 | if (!has_next()) return false; |
1643 | name_1->Add(static_cast<char>(current())); |
1644 | } |
1645 | if (current() == '=') { |
1646 | for (Advance(); current() != '}'; Advance()) { |
1647 | if (!IsUnicodePropertyValueCharacter(current())) return false; |
1648 | if (!has_next()) return false; |
1649 | name_2->Add(static_cast<char>(current())); |
1650 | } |
1651 | name_2->Add(0); // null-terminate string. |
1652 | } |
1653 | } else { |
1654 | return false; |
1655 | } |
1656 | Advance(); |
1657 | name_1->Add(0); // null-terminate string. |
1658 | |
1659 | ASSERT(static_cast<size_t>(name_1->length() - 1) == strlen(name_1->data())); |
1660 | ASSERT(name_2->is_empty() || |
1661 | static_cast<size_t>(name_2->length() - 1) == strlen(name_2->data())); |
1662 | return true; |
1663 | } |
1664 | |
1665 | bool RegExpParser::AddPropertyClassRange( |
1666 | ZoneGrowableArray<CharacterRange>* add_to, |
1667 | bool negate, |
1668 | ZoneGrowableArray<char>* name_1, |
1669 | ZoneGrowableArray<char>* name_2) { |
1670 | ASSERT(name_1->At(name_1->length() - 1) == '\0'); |
1671 | ASSERT(name_2->is_empty() || name_2->At(name_2->length() - 1) == '\0'); |
1672 | if (name_2->is_empty()) { |
1673 | // First attempt to interpret as general category property value name. |
1674 | const char* name = name_1->data(); |
1675 | if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, |
1676 | add_to)) { |
1677 | return true; |
1678 | } |
1679 | // Interpret "Any", "ASCII", and "Assigned". |
1680 | if (LookupSpecialPropertyValueName(name, add_to, negate)) { |
1681 | return true; |
1682 | } |
1683 | // Then attempt to interpret as binary property name with value name 'Y'. |
1684 | UProperty property = u_getPropertyEnum(name); |
1685 | if (!IsSupportedBinaryProperty(property)) return false; |
1686 | if (!IsExactPropertyAlias(name, property)) return false; |
1687 | return LookupPropertyValueName(property, negate ? "N" : "Y" , false, add_to); |
1688 | } else { |
1689 | // Both property name and value name are specified. Attempt to interpret |
1690 | // the property name as enumerated property. |
1691 | const char* property_name = name_1->data(); |
1692 | const char* value_name = name_2->data(); |
1693 | UProperty property = u_getPropertyEnum(property_name); |
1694 | if (!IsExactPropertyAlias(property_name, property)) return false; |
1695 | if (property == UCHAR_GENERAL_CATEGORY) { |
1696 | // We want to allow aggregate value names such as "Letter". |
1697 | property = UCHAR_GENERAL_CATEGORY_MASK; |
1698 | } else if (property != UCHAR_SCRIPT && |
1699 | property != UCHAR_SCRIPT_EXTENSIONS) { |
1700 | return false; |
1701 | } |
1702 | return LookupPropertyValueName(property, value_name, negate, add_to); |
1703 | } |
1704 | } |
1705 | |
1706 | bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t max_value, |
1707 | uint32_t* value) { |
1708 | uint32_t x = 0; |
1709 | int d = HexValue(current()); |
1710 | if (d < 0) { |
1711 | return false; |
1712 | } |
1713 | while (d >= 0) { |
1714 | x = x * 16 + d; |
1715 | if (x > max_value) { |
1716 | return false; |
1717 | } |
1718 | Advance(); |
1719 | d = HexValue(current()); |
1720 | } |
1721 | *value = x; |
1722 | return true; |
1723 | } |
1724 | |
1725 | uint32_t RegExpParser::ParseClassCharacterEscape() { |
1726 | ASSERT(current() == '\\'); |
1727 | DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next())); |
1728 | Advance(); |
1729 | switch (current()) { |
1730 | case 'b': |
1731 | Advance(); |
1732 | return '\b'; |
1733 | // ControlEscape :: one of |
1734 | // f n r t v |
1735 | case 'f': |
1736 | Advance(); |
1737 | return '\f'; |
1738 | case 'n': |
1739 | Advance(); |
1740 | return '\n'; |
1741 | case 'r': |
1742 | Advance(); |
1743 | return '\r'; |
1744 | case 't': |
1745 | Advance(); |
1746 | return '\t'; |
1747 | case 'v': |
1748 | Advance(); |
1749 | return '\v'; |
1750 | case 'c': { |
1751 | uint32_t controlLetter = Next(); |
1752 | uint32_t letter = controlLetter & ~('A' ^ 'a'); |
1753 | // For compatibility with JSC, inside a character class |
1754 | // we also accept digits and underscore as control characters. |
1755 | if (letter >= 'A' && letter <= 'Z') { |
1756 | Advance(2); |
1757 | // Control letters mapped to ASCII control characters in the range |
1758 | // 0x00-0x1f. |
1759 | return controlLetter & 0x1f; |
1760 | } |
1761 | if (is_unicode()) { |
1762 | // With /u, \c# or \c_ are invalid. |
1763 | ReportError("Invalid class escape" ); |
1764 | UNREACHABLE(); |
1765 | } |
1766 | if (Utils::IsDecimalDigit(controlLetter) || controlLetter == '_') { |
1767 | Advance(2); |
1768 | return controlLetter & 0x1f; |
1769 | } |
1770 | // We match JSC in reading the backslash as a literal |
1771 | // character instead of as starting an escape. |
1772 | return '\\'; |
1773 | } |
1774 | case '0': |
1775 | // With /u, \0 is interpreted as NUL if not followed by another digit. |
1776 | if (is_unicode() && !(Next() >= '0' && Next() <= '9')) { |
1777 | Advance(); |
1778 | return 0; |
1779 | } |
1780 | FALL_THROUGH; |
1781 | case '1': |
1782 | case '2': |
1783 | case '3': |
1784 | case '4': |
1785 | case '5': |
1786 | case '6': |
1787 | case '7': |
1788 | // For compatibility, we interpret a decimal escape that isn't |
1789 | // a back reference (and therefore either \0 or not valid according |
1790 | // to the specification) as a 1..3 digit octal character code. |
1791 | if (is_unicode()) { |
1792 | // With \u, decimal escape is not interpreted as octal character code. |
1793 | ReportError("Invalid class escape" ); |
1794 | UNREACHABLE(); |
1795 | } |
1796 | return ParseOctalLiteral(); |
1797 | case 'x': { |
1798 | Advance(); |
1799 | uint32_t value; |
1800 | if (ParseHexEscape(2, &value)) { |
1801 | return value; |
1802 | } |
1803 | if (is_unicode()) { |
1804 | // With \u, invalid escapes are not treated as identity escapes. |
1805 | ReportError("Invalid escape" ); |
1806 | UNREACHABLE(); |
1807 | } |
1808 | // If \x is not followed by a two-digit hexadecimal, treat it |
1809 | // as an identity escape. |
1810 | return 'x'; |
1811 | } |
1812 | case 'u': { |
1813 | Advance(); |
1814 | uint32_t value; |
1815 | if (ParseUnicodeEscape(&value)) { |
1816 | return value; |
1817 | } |
1818 | if (is_unicode()) { |
1819 | // With \u, invalid escapes are not treated as identity escapes. |
1820 | ReportError(kUnicodeIdentity); |
1821 | UNREACHABLE(); |
1822 | } |
1823 | // If \u is not followed by a four-digit hexadecimal, treat it |
1824 | // as an identity escape. |
1825 | return 'u'; |
1826 | } |
1827 | default: { |
1828 | // Extended identity escape. We accept any character that hasn't |
1829 | // been matched by a more specific case, not just the subset required |
1830 | // by the ECMAScript specification. |
1831 | uint32_t result = current(); |
1832 | if (!is_unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
1833 | Advance(); |
1834 | return result; |
1835 | } |
1836 | ReportError(kUnicodeIdentity); |
1837 | UNREACHABLE(); |
1838 | } |
1839 | } |
1840 | return 0; |
1841 | } |
1842 | |
1843 | bool RegExpParser::ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges, |
1844 | bool add_unicode_case_equivalents, |
1845 | uint32_t* char_out) { |
1846 | uint32_t first = current(); |
1847 | if (first == '\\') { |
1848 | switch (Next()) { |
1849 | case 'w': |
1850 | case 'W': |
1851 | case 'd': |
1852 | case 'D': |
1853 | case 's': |
1854 | case 'S': { |
1855 | CharacterRange::AddClassEscape(static_cast<uint16_t>(Next()), ranges, |
1856 | add_unicode_case_equivalents); |
1857 | Advance(2); |
1858 | return true; |
1859 | } |
1860 | case 'p': |
1861 | case 'P': { |
1862 | if (!is_unicode()) break; |
1863 | bool negate = Next() == 'P'; |
1864 | Advance(2); |
1865 | auto name_1 = new (Z) ZoneGrowableArray<char>(); |
1866 | auto name_2 = new (Z) ZoneGrowableArray<char>(); |
1867 | if (!ParsePropertyClassName(name_1, name_2) || |
1868 | !AddPropertyClassRange(ranges, negate, name_1, name_2)) { |
1869 | ReportError("Invalid property name in character class" ); |
1870 | UNREACHABLE(); |
1871 | } |
1872 | return true; |
1873 | } |
1874 | case kEndMarker: |
1875 | ReportError("\\ at end of pattern" ); |
1876 | UNREACHABLE(); |
1877 | default: |
1878 | break; |
1879 | } |
1880 | *char_out = ParseClassCharacterEscape(); |
1881 | return false; |
1882 | } |
1883 | Advance(); |
1884 | *char_out = first; |
1885 | return false; |
1886 | } |
1887 | |
1888 | RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { |
1889 | static const char* kUnterminated = "Unterminated character class" ; |
1890 | static const char* kRangeInvalid = "Invalid character class" ; |
1891 | static const char* kRangeOutOfOrder = "Range out of order in character class" ; |
1892 | |
1893 | ASSERT(current() == '['); |
1894 | Advance(); |
1895 | bool is_negated = false; |
1896 | if (current() == '^') { |
1897 | is_negated = true; |
1898 | Advance(); |
1899 | } |
1900 | ZoneGrowableArray<CharacterRange>* ranges = |
1901 | new (Z) ZoneGrowableArray<CharacterRange>(2); |
1902 | bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case(); |
1903 | while (has_more() && current() != ']') { |
1904 | uint32_t char_1 = 0; |
1905 | bool is_class_1 = |
1906 | ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1); |
1907 | if (current() == '-') { |
1908 | Advance(); |
1909 | if (current() == kEndMarker) { |
1910 | // If we reach the end we break out of the loop and let the |
1911 | // following code report an error. |
1912 | break; |
1913 | } else if (current() == ']') { |
1914 | if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); |
1915 | ranges->Add(CharacterRange::Singleton('-')); |
1916 | break; |
1917 | } |
1918 | uint32_t char_2 = 0; |
1919 | bool is_class_2 = |
1920 | ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2); |
1921 | if (is_class_1 || is_class_2) { |
1922 | // Either end is an escaped character class. Treat the '-' verbatim. |
1923 | if (is_unicode()) { |
1924 | // ES2015 21.2.2.15.1 step 1. |
1925 | ReportError(kRangeInvalid); |
1926 | UNREACHABLE(); |
1927 | } |
1928 | if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); |
1929 | ranges->Add(CharacterRange::Singleton('-')); |
1930 | if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2)); |
1931 | continue; |
1932 | } |
1933 | if (char_1 > char_2) { |
1934 | ReportError(kRangeOutOfOrder); |
1935 | UNREACHABLE(); |
1936 | } |
1937 | ranges->Add(CharacterRange::Range(char_1, char_2)); |
1938 | } else { |
1939 | if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); |
1940 | } |
1941 | } |
1942 | if (!has_more()) { |
1943 | ReportError(kUnterminated); |
1944 | UNREACHABLE(); |
1945 | } |
1946 | Advance(); |
1947 | RegExpCharacterClass::CharacterClassFlags character_class_flags = |
1948 | RegExpCharacterClass::DefaultFlags(); |
1949 | if (is_negated) character_class_flags |= RegExpCharacterClass::NEGATED; |
1950 | return new (Z) |
1951 | RegExpCharacterClass(ranges, builder->flags(), character_class_flags); |
1952 | } |
1953 | |
1954 | // ---------------------------------------------------------------------------- |
1955 | // The Parser interface. |
1956 | |
1957 | void RegExpParser::ParseRegExp(const String& input, |
1958 | RegExpFlags flags, |
1959 | RegExpCompileData* result) { |
1960 | ASSERT(result != NULL); |
1961 | RegExpParser parser(input, &result->error, flags); |
1962 | // Throws an exception if 'input' is not valid. |
1963 | RegExpTree* tree = parser.ParsePattern(); |
1964 | ASSERT(tree != NULL); |
1965 | ASSERT(result->error.IsNull()); |
1966 | result->tree = tree; |
1967 | intptr_t capture_count = parser.captures_started(); |
1968 | result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
1969 | result->contains_anchor = parser.contains_anchor(); |
1970 | result->capture_name_map = parser.CreateCaptureNameMap(); |
1971 | result->capture_count = capture_count; |
1972 | } |
1973 | |
1974 | } // namespace dart |
1975 | |