regexp_parser.cc source code [engine/third_party/dart/runtime/vm/regexp_parser.cc]

1	// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2	// for details. All rights reserved. Use of this source code is governed by a
3	// BSD-style license that can be found in the LICENSE file.
4
5	#include "vm/regexp_parser.h"
6
7	#include "unicode/uchar.h"
8	#include "unicode/uniset.h"
9
10	#include "platform/unicode.h"
11
12	#include "vm/longjump.h"
13	#include "vm/object_store.h"
14
15	namespace dart {
16
17	#define Z zone()
18
19	// Enables possessive quantifier syntax for testing.
20	static const bool FLAG_regexp_possessive_quantifier = false;
21
22	RegExpBuilder::RegExpBuilder(RegExpFlags flags)
23	: zone_(Thread::Current()->zone()),
24	pending_empty_(false),
25	flags_(flags),
26	characters_(NULL),
27	pending_surrogate_(kNoPendingSurrogate),
28	terms_(),
29	text_(),
30	alternatives_()
31	#ifdef DEBUG
32	,
33	last_added_(ADD_NONE)
34	#endif
35	{
36	}
37
38	void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) {
39	ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
40	FlushPendingSurrogate();
41	// Hold onto the lead surrogate, waiting for a trail surrogate to follow.
42	pending_surrogate_ = lead_surrogate;
43	}
44
45	void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) {
46	ASSERT(Utf16::IsTrailSurrogate(trail_surrogate));
47	if (pending_surrogate_ != kNoPendingSurrogate) {
48	uint16_t lead_surrogate = pending_surrogate_;
49	pending_surrogate_ = kNoPendingSurrogate;
50	ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
51	uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate);
52	if (NeedsDesugaringForIgnoreCase(combined)) {
53	AddCharacterClassForDesugaring(combined);
54	} else {
55	auto surrogate_pair = new (Z) ZoneGrowableArray<uint16_t>(`2`);
56	surrogate_pair->Add(lead_surrogate);
57	surrogate_pair->Add(trail_surrogate);
58	RegExpAtom* atom = new (Z) RegExpAtom (surrogate_pair, flags_);
59	AddAtom(atom);
60	}
61	} else {
62	pending_surrogate_ = trail_surrogate;
63	FlushPendingSurrogate();
64	}
65	}
66
67	void RegExpBuilder::FlushPendingSurrogate() {
68	if (pending_surrogate_ != kNoPendingSurrogate) {
69	ASSERT(is_unicode());
70	uint32_t c = pending_surrogate_;
71	pending_surrogate_ = kNoPendingSurrogate;
72	AddCharacterClassForDesugaring(c);
73	}
74	}
75
76	void RegExpBuilder::FlushCharacters() {
77	FlushPendingSurrogate();
78	pending_empty_ = false;
79	if (characters_ != NULL) {
80	RegExpTree* atom = new (Z) RegExpAtom (characters_, flags_);
81	characters_ = NULL;
82	text_.Add(atom);
83	LAST(ADD_ATOM);
84	}
85	}
86
87	void RegExpBuilder::FlushText() {
88	FlushCharacters();
89	intptr_t num_text = text_.length();
90	if (num_text == `0`) {
91	return;
92	} else if (num_text == `1`) {
93	terms_.Add(text_.Last());
94	} else {
95	RegExpText* text = new (Z) RegExpText ();
96	for (intptr_t i = `0`; i < num_text; i++)
97	text_[i]->AppendToText(text);
98	terms_.Add(text);
99	}
100	text_.Clear();
101	}
102
103	void RegExpBuilder::AddCharacter(uint16_t c) {
104	FlushPendingSurrogate();
105	pending_empty_ = false;
106	if (NeedsDesugaringForIgnoreCase(c)) {
107	AddCharacterClassForDesugaring(c);
108	} else {
109	if (characters_ == NULL) {
110	characters_ = new (Z) ZoneGrowableArray<uint16_t>(`4`);
111	}
112	characters_->Add(c);
113	LAST(ADD_CHAR);
114	}
115	}
116
117	void RegExpBuilder::AddUnicodeCharacter(uint32_t c) {
118	if (c > static_cast<uint32_t>(Utf16::kMaxCodeUnit)) {
119	ASSERT(is_unicode());
120	uint16_t surrogates[`2`];
121	Utf16::Encode(c, surrogates);
122	AddLeadSurrogate(surrogates[`0`]);
123	AddTrailSurrogate(surrogates[`1`]);
124	} else if (is_unicode() && Utf16::IsLeadSurrogate(c)) {
125	AddLeadSurrogate(c);
126	} else if (is_unicode() && Utf16::IsTrailSurrogate(c)) {
127	AddTrailSurrogate(c);
128	} else {
129	AddCharacter(static_cast<uint16_t>(c));
130	}
131	}
132
133	void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) {
134	// A lead or trail surrogate parsed via escape sequence will not
135	// pair up with any preceding lead or following trail surrogate.
136	FlushPendingSurrogate();
137	AddUnicodeCharacter(character);
138	FlushPendingSurrogate();
139	}
140
141	void RegExpBuilder::AddEmpty() {
142	pending_empty_ = true;
143	}
144
145	void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
146	if (NeedsDesugaringForUnicode(cc)) {
147	// With /u, character class needs to be desugared, so it
148	// must be a standalone term instead of being part of a RegExpText.
149	AddTerm(cc);
150	} else {
151	AddAtom(cc);
152	}
153	}
154
155	void RegExpBuilder::AddCharacterClassForDesugaring(uint32_t c) {
156	auto ranges = CharacterRange::List(Z, CharacterRange::Singleton(c));
157	AddTerm(new (Z) RegExpCharacterClass (ranges, flags_));
158	}
159
160	void RegExpBuilder::AddAtom(RegExpTree* term) {
161	if (term->IsEmpty()) {
162	AddEmpty();
163	return;
164	}
165	if (term->IsTextElement()) {
166	FlushCharacters();
167	text_.Add(term);
168	} else {
169	FlushText();
170	terms_.Add(term);
171	}
172	LAST(ADD_ATOM);
173	}
174
175	void RegExpBuilder::AddTerm(RegExpTree* term) {
176	FlushText();
177	terms_.Add(term);
178	LAST(ADD_ATOM);
179	}
180
181	void RegExpBuilder::AddAssertion(RegExpTree* assert) {
182	FlushText();
183	terms_.Add(assert);
184	LAST(ADD_ASSERT);
185	}
186
187	void RegExpBuilder::NewAlternative() {
188	FlushTerms();
189	}
190
191	void RegExpBuilder::FlushTerms() {
192	FlushText();
193	intptr_t num_terms = terms_.length();
194	RegExpTree* alternative;
195	if (num_terms == `0`) {
196	alternative = RegExpEmpty::GetInstance();
197	} else if (num_terms == `1`) {
198	alternative = terms_.Last();
199	} else {
200	ZoneGrowableArray<RegExpTree> terms =
201	new (Z) ZoneGrowableArray<RegExpTree*>();
202	for (intptr_t i = `0`; i < terms_.length(); i++) {
203	terms->Add(terms_[i]);
204	}
205	alternative = new (Z) RegExpAlternative (terms);
206	}
207	alternatives_.Add(alternative);
208	terms_.Clear();
209	LAST(ADD_NONE);
210	}
211
212	bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
213	if (!is_unicode()) return false;
214	// TODO(yangguo): we could be smarter than this. Case-insensitivity does not
215	// necessarily mean that we need to desugar. It's probably nicer to have a
216	// separate pass to figure out unicode desugarings.
217	if (ignore_case()) return true;
218	ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
219	CharacterRange::Canonicalize(ranges);
220	for (int i = ranges->length() - `1`; i >= `0`; i--) {
221	uint32_t from = ranges->At(i).from();
222	uint32_t to = ranges->At(i).to();
223	// Check for non-BMP characters.
224	if (to >= Utf16::kMaxCodeUnit) return true;
225	// Check for lone surrogates.
226	if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) {
227	return true;
228	}
229	}
230	return false;
231	}
232
233	bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) {
234	if (is_unicode() && ignore_case()) {
235	icu::UnicodeSet set(c, c);
236	set.closeOver(USET_CASE_INSENSITIVE);
237	set.removeAllStrings();
238	return set.size() > `1`;
239	}
240	return false;
241	}
242
243	RegExpTree* RegExpBuilder::ToRegExp() {
244	FlushTerms();
245	intptr_t num_alternatives = alternatives_.length();
246	if (num_alternatives == `0`) {
247	return RegExpEmpty::GetInstance();
248	}
249	if (num_alternatives == `1`) {
250	return alternatives_.Last();
251	}
252	ZoneGrowableArray<RegExpTree> alternatives =
253	new (Z) ZoneGrowableArray<RegExpTree*>();
254	for (intptr_t i = `0`; i < alternatives_.length(); i++) {
255	alternatives->Add(alternatives_[i]);
256	}
257	return new (Z) RegExpDisjunction (alternatives);
258	}
259
260	bool RegExpBuilder::AddQuantifierToAtom(
261	intptr_t min,
262	intptr_t max,
263	RegExpQuantifier::QuantifierType quantifier_type) {
264	if (pending_empty_) {
265	pending_empty_ = false;
266	return true;
267	}
268	RegExpTree* atom;
269	if (characters_ != NULL) {
270	DEBUG_ASSERT(last_added_ == ADD_CHAR);
271	// Last atom was character.
272
273	ZoneGrowableArray<uint16_t>* char_vector =
274	new (Z) ZoneGrowableArray<uint16_t>();
275	char_vector->AddArray(*characters_);
276	intptr_t num_chars = char_vector->length();
277	if (num_chars > `1`) {
278	ZoneGrowableArray<uint16_t>* prefix =
279	new (Z) ZoneGrowableArray<uint16_t>();
280	for (intptr_t i = `0`; i < num_chars - `1`; i++) {
281	prefix->Add(char_vector->At(i));
282	}
283	text_.Add(new (Z) RegExpAtom (prefix, flags_));
284	ZoneGrowableArray<uint16_t>* tail = new (Z) ZoneGrowableArray<uint16_t>();
285	tail->Add(char_vector->At(num_chars - `1`));
286	char_vector = tail;
287	}
288	characters_ = NULL;
289	atom = new (Z) RegExpAtom (char_vector, flags_);
290	FlushText();
291	} else if (text_.length() > `0`) {
292	DEBUG_ASSERT(last_added_ == ADD_ATOM);
293	atom = text_.RemoveLast();
294	FlushText();
295	} else if (terms_.length() > `0`) {
296	DEBUG_ASSERT(last_added_ == ADD_ATOM);
297	atom = terms_.RemoveLast();
298	if (auto lookaround = atom->AsLookaround()) {
299	// With /u, lookarounds are not quantifiable.
300	if (is_unicode()) return false;
301	// Lookbehinds are not quantifiable.
302	if (lookaround->type() == RegExpLookaround::LOOKBEHIND) {
303	return false;
304	}
305	}
306	if (atom->max_match() == `0`) {
307	// Guaranteed to only match an empty string.
308	LAST(ADD_TERM);
309	if (min == `0`) {
310	return true;
311	}
312	terms_.Add(atom);
313	return true;
314	}
315	} else {
316	// Only call immediately after adding an atom or character!
317	UNREACHABLE();
318	}
319	terms_.Add(new (Z) RegExpQuantifier (min, max, quantifier_type, atom));
320	LAST(ADD_TERM);
321	return true;
322	}
323
324	// ----------------------------------------------------------------------------
325	// Implementation of Parser
326
327	RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags)
328	: zone_(Thread::Current()->zone()),
329	captures_(nullptr),
330	named_captures_(nullptr),
331	named_back_references_(nullptr),
332	in_(in),
333	current_(kEndMarker),
334	next_pos_(`0`),
335	captures_started_(`0`),
336	capture_count_(`0`),
337	has_more_(true),
338	top_level_flags_(flags),
339	simple_(false),
340	contains_anchor_(false),
341	is_scanned_for_captures_(false),
342	has_named_captures_(false) {
343	Advance();
344	}
345
346	inline uint32_t RegExpParser::ReadNext(bool update_position) {
347	intptr_t position = next_pos_;
348	const uint16_t c0 = in().CharAt(position);
349	uint32_t c = c0;
350	position++;
351	if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) {
352	const uint16_t c1 = in().CharAt(position);
353	if (Utf16::IsTrailSurrogate(c1)) {
354	c = Utf16::Decode(c0, c1);
355	position++;
356	}
357	}
358	if (update_position) next_pos_ = position;
359	return c;
360	}
361
362	uint32_t RegExpParser::Next() {
363	if (has_next()) {
364	return ReadNext(false);
365	} else {
366	return kEndMarker;
367	}
368	}
369
370	void RegExpParser::Advance() {
371	if (has_next()) {
372	current_ = ReadNext(true);
373	} else {
374	current_ = kEndMarker;
375	// Advance so that position() points to 1 after the last character. This is
376	// important so that Reset() to this position works correctly.
377	next_pos_ = in().Length() + `1`;
378	has_more_ = false;
379	}
380	}
381
382	void RegExpParser::Reset(intptr_t pos) {
383	next_pos_ = pos;
384	has_more_ = (pos < in().Length());
385	Advance();
386	}
387
388	void RegExpParser::Advance(intptr_t dist) {
389	next_pos_ += dist - `1`;
390	Advance();
391	}
392
393	bool RegExpParser::simple() {
394	return simple_;
395	}
396
397	bool RegExpParser::IsSyntaxCharacterOrSlash(uint32_t c) {
398	switch (c) {
399	case `'^'`:
400	case `'$'`:
401	case `'\\'`:
402	case `'.'`:
403	case `'*'`:
404	case `'+'`:
405	case `'?'`:
406	case `'('`:
407	case `')'`:
408	case `'['`:
409	case `']'`:
410	case `'{'`:
411	case `'}'`:
412	case `'\|'`:
413	case `'/'`:
414	return true;
415	default:
416	break;
417	}
418	return false;
419	}
420
421	void RegExpParser::ReportError(const char* message) {
422	// Zip to the end to make sure the no more input is read.
423	current_ = kEndMarker;
424	next_pos_ = in().Length();
425
426	// Throw a FormatException on parsing failures.
427	const String& msg = String::Handle(
428	String::Concat(String::Handle(String::New(message)), in()));
429	const Array& args = Array::Handle(Array::New(`1`));
430	args.SetAt(`0`, msg);
431	Exceptions::ThrowByType(Exceptions::kFormat, args);
432	UNREACHABLE();
433	}
434
435	// Pattern ::
436	// Disjunction
437	RegExpTree* RegExpParser::ParsePattern() {
438	RegExpTree* result = ParseDisjunction();
439	PatchNamedBackReferences();
440	ASSERT(!has_more());
441	// If the result of parsing is a literal string atom, and it has the
442	// same length as the input, then the atom is identical to the input.
443	if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
444	simple_ = true;
445	}
446	return result;
447	}
448
449	// Used for error messages where we would have fallen back on treating an
450	// escape as the identity escape, but we are in Unicode mode.
451	static const char* kUnicodeIdentity =
452	"Invalid identity escape in Unicode pattern";
453
454	// Disjunction ::
455	// Alternative
456	// Alternative \| Disjunction
457	// Alternative ::
458	// [empty]
459	// Term Alternative
460	// Term ::
461	// Assertion
462	// Atom
463	// Atom Quantifier
464	RegExpTree* RegExpParser::ParseDisjunction() {
465	// Used to store current state while parsing subexpressions.
466	RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
467	`0`, nullptr, top_level_flags_, Z);
468	RegExpParserState* stored_state = &initial_state;
469	// Cache the builder in a local variable for quick access.
470	RegExpBuilder* builder = initial_state.builder();
471	while (true) {
472	switch (current()) {
473	case kEndMarker:
474	if (stored_state->IsSubexpression()) {
475	// Inside a parenthesized group when hitting end of input.
476	ReportError("Unterminated group");
477	UNREACHABLE();
478	}
479	ASSERT(INITIAL == stored_state->group_type());
480	// Parsing completed successfully.
481	return builder->ToRegExp();
482	case `')'`: {
483	if (!stored_state->IsSubexpression()) {
484	ReportError("Unmatched ')'");
485	UNREACHABLE();
486	}
487	ASSERT(INITIAL != stored_state->group_type());
488
489	Advance();
490	// End disjunction parsing and convert builder content to new single
491	// regexp atom.
492	RegExpTree* body = builder->ToRegExp();
493
494	intptr_t end_capture_index = captures_started();
495
496	intptr_t capture_index = stored_state->capture_index();
497	SubexpressionType group_type = stored_state->group_type();
498
499	// Build result of subexpression.
500	if (group_type == CAPTURE) {
501	if (stored_state->IsNamedCapture()) {
502	CreateNamedCaptureAtIndex(stored_state->capture_name(),
503	capture_index);
504	}
505	RegExpCapture* capture = GetCapture(capture_index);
506	capture->set_body(body);
507	body = capture;
508	} else if (group_type != GROUPING) {
509	ASSERT(group_type == POSITIVE_LOOKAROUND \|\|
510	group_type == NEGATIVE_LOOKAROUND);
511	bool is_positive = (group_type == POSITIVE_LOOKAROUND);
512	body = new (Z) RegExpLookaround (
513	body, is_positive, end_capture_index - capture_index,
514	capture_index, stored_state->lookaround_type());
515	}
516
517	// Restore previous state.
518	stored_state = stored_state->previous_state();
519	builder = stored_state->builder();
520
521	builder->AddAtom(body);
522	// For compatibility with JSC and ES3, we allow quantifiers after
523	// lookaheads, and break in all cases.
524	break;
525	}
526	case `'\|'`: {
527	Advance();
528	builder->NewAlternative();
529	continue;
530	}
531	case `'*'`:
532	case `'+'`:
533	case `'?'`:
534	ReportError("Nothing to repeat");
535	UNREACHABLE();
536	case `'^'`: {
537	Advance();
538	if (builder->is_multi_line()) {
539	builder->AddAssertion(new (Z) RegExpAssertion (
540	RegExpAssertion::START_OF_LINE, builder->flags()));
541	} else {
542	builder->AddAssertion(new (Z) RegExpAssertion (
543	RegExpAssertion::START_OF_INPUT, builder->flags()));
544	set_contains_anchor();
545	}
546	continue;
547	}
548	case `'$'`: {
549	Advance();
550	RegExpAssertion::AssertionType assertion_type =
551	builder->is_multi_line() ? RegExpAssertion::END_OF_LINE
552	: RegExpAssertion::END_OF_INPUT;
553	builder->AddAssertion(
554	new (Z) RegExpAssertion (assertion_type, builder->flags()));
555	continue;
556	}
557	case `'.'`: {
558	Advance();
559	auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(`2`);
560	if (builder->is_dot_all()) {
561	// Everything.
562	CharacterRange::AddClassEscape(
563	`'*'`, ranges,
564	/add_unicode_case_equivalents=/false);
565	} else {
566	// everything except \x0a, \x0d, \u2028 and \u2029
567	CharacterRange::AddClassEscape(
568	`'.'`, ranges,
569	/add_unicode_case_equivalents=/false);
570	}
571	RegExpCharacterClass* cc =
572	new (Z) RegExpCharacterClass (ranges, builder->flags());
573	builder->AddCharacterClass(cc);
574	break;
575	}
576	case `'('`: {
577	stored_state = ParseOpenParenthesis(stored_state);
578	builder = stored_state->builder();
579	continue;
580	}
581	case `'['`: {
582	RegExpTree* atom = ParseCharacterClass(builder);
583	builder->AddCharacterClass(atom->AsCharacterClass());
584	break;
585	}
586	// Atom ::
587	// \ AtomEscape
588	case `'\\'`:
589	switch (Next()) {
590	case kEndMarker:
591	ReportError("\\ at end of pattern");
592	UNREACHABLE();
593	case `'b'`:
594	Advance(`2`);
595	builder->AddAssertion(new (Z) RegExpAssertion (
596	RegExpAssertion::BOUNDARY, builder->flags()));
597	continue;
598	case `'B'`:
599	Advance(`2`);
600	builder->AddAssertion(new (Z) RegExpAssertion (
601	RegExpAssertion::NON_BOUNDARY, builder->flags()));
602	continue;
603	// AtomEscape ::
604	// CharacterClassEscape
605	//
606	// CharacterClassEscape :: one of
607	// d D s S w W
608	case `'d'`:
609	case `'D'`:
610	case `'s'`:
611	case `'S'`:
612	case `'w'`:
613	case `'W'`: {
614	uint32_t c = Next();
615	Advance(`2`);
616	auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(`2`);
617	CharacterRange::AddClassEscape(
618	c, ranges, is_unicode() && builder->ignore_case());
619	RegExpCharacterClass* cc =
620	new (Z) RegExpCharacterClass (ranges, builder->flags());
621	builder->AddCharacterClass(cc);
622	break;
623	}
624	case `'p'`:
625	case `'P'`: {
626	uint32_t p = Next();
627	Advance(`2`);
628
629	if (is_unicode()) {
630	auto name_1 = new (Z) ZoneGrowableArray<char>();
631	auto name_2 = new (Z) ZoneGrowableArray<char>();
632	auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(`2`);
633	if (ParsePropertyClassName(name_1, name_2)) {
634	if (AddPropertyClassRange(ranges, p == `'P'`, name_1, name_2)) {
635	RegExpCharacterClass* cc =
636	new (Z) RegExpCharacterClass (ranges, builder->flags());
637	builder->AddCharacterClass(cc);
638	break;
639	}
640	}
641	ReportError("Invalid property name");
642	UNREACHABLE();
643	} else {
644	builder->AddCharacter(p);
645	}
646	break;
647	}
648	case `'1'`:
649	case `'2'`:
650	case `'3'`:
651	case `'4'`:
652	case `'5'`:
653	case `'6'`:
654	case `'7'`:
655	case `'8'`:
656	case `'9'`: {
657	intptr_t index = `0`;
658	if (ParseBackReferenceIndex(&index)) {
659	if (stored_state->IsInsideCaptureGroup(index)) {
660	// The back reference is inside the capture group it refers to.
661	// Nothing can possibly have been captured yet, so we use empty
662	// instead. This ensures that, when checking a back reference,
663	// the capture registers of the referenced capture are either
664	// both set or both cleared.
665	builder->AddEmpty();
666	} else {
667	RegExpCapture* capture = GetCapture(index);
668	RegExpTree* atom =
669	new (Z) RegExpBackReference (capture, builder->flags());
670	builder->AddAtom(atom);
671	}
672	break;
673	}
674	// With /u, no identity escapes except for syntax characters are
675	// allowed. Otherwise, all identity escapes are allowed.
676	if (is_unicode()) {
677	ReportError(kUnicodeIdentity);
678	UNREACHABLE();
679	}
680	uint32_t first_digit = Next();
681	if (first_digit == `'8'` \|\| first_digit == `'9'`) {
682	builder->AddCharacter(first_digit);
683	Advance(`2`);
684	break;
685	}
686	}
687	FALL_THROUGH;
688	case `'0'`: {
689	Advance();
690	if (is_unicode() && Next() >= `'0'` && Next() <= `'9'`) {
691	// With /u, decimal escape with leading 0 are not parsed as octal.
692	ReportError("Invalid decimal escape");
693	UNREACHABLE();
694	}
695	uint32_t octal = ParseOctalLiteral();
696	builder->AddCharacter(octal);
697	break;
698	}
699	// ControlEscape :: one of
700	// f n r t v
701	case `'f'`:
702	Advance(`2`);
703	builder->AddCharacter(`'\f'`);
704	break;
705	case `'n'`:
706	Advance(`2`);
707	builder->AddCharacter(`'\n'`);
708	break;
709	case `'r'`:
710	Advance(`2`);
711	builder->AddCharacter(`'\r'`);
712	break;
713	case `'t'`:
714	Advance(`2`);
715	builder->AddCharacter(`'\t'`);
716	break;
717	case `'v'`:
718	Advance(`2`);
719	builder->AddCharacter(`'\v'`);
720	break;
721	case `'c'`: {
722	Advance();
723	uint32_t controlLetter = Next();
724	// Special case if it is an ASCII letter.
725	// Convert lower case letters to uppercase.
726	uint32_t letter = controlLetter & ~(`'a'` ^ `'A'`);
727	if (letter < `'A'` \|\| `'Z'` < letter) {
728	// controlLetter is not in range 'A'-'Z' or 'a'-'z'.
729	// This is outside the specification. We match JSC in
730	// reading the backslash as a literal character instead
731	// of as starting an escape.
732	if (is_unicode()) {
733	// With /u, invalid escapes are not treated as identity escapes.
734	ReportError(kUnicodeIdentity);
735	UNREACHABLE();
736	}
737	builder->AddCharacter(`'\\'`);
738	} else {
739	Advance(`2`);
740	builder->AddCharacter(controlLetter & `0x1f`);
741	}
742	break;
743	}
744	case `'x'`: {
745	Advance(`2`);
746	uint32_t value;
747	if (ParseHexEscape(`2`, &value)) {
748	builder->AddCharacter(value);
749	} else if (!is_unicode()) {
750	builder->AddCharacter(`'x'`);
751	} else {
752	// With /u, invalid escapes are not treated as identity escapes.
753	ReportError(kUnicodeIdentity);
754	UNREACHABLE();
755	}
756	break;
757	}
758	case `'u'`: {
759	Advance(`2`);
760	uint32_t value;
761	if (ParseUnicodeEscape(&value)) {
762	builder->AddEscapedUnicodeCharacter(value);
763	} else if (!is_unicode()) {
764	builder->AddCharacter(`'u'`);
765	} else {
766	// With /u, invalid escapes are not treated as identity escapes.
767	ReportError(kUnicodeIdentity);
768	UNREACHABLE();
769	}
770	break;
771	}
772	case `'k'`:
773	// Either an identity escape or a named back-reference. The two
774	// interpretations are mutually exclusive: '\k' is interpreted as
775	// an identity escape for non-Unicode patterns without named
776	// capture groups, and as the beginning of a named back-reference
777	// in all other cases.
778	if (is_unicode() \|\| HasNamedCaptures()) {
779	Advance(`2`);
780	ParseNamedBackReference(builder, stored_state);
781	break;
782	}
783	FALL_THROUGH;
784	default:
785	Advance();
786	// With the unicode flag, no identity escapes except for syntax
787	// characters are allowed. Otherwise, all identity escapes are
788	// allowed.
789	if (!is_unicode() \|\| IsSyntaxCharacterOrSlash(current())) {
790	builder->AddCharacter(current());
791	Advance();
792	} else {
793	ReportError(kUnicodeIdentity);
794	UNREACHABLE();
795	}
796	break;
797	}
798	break;
799	case `'{'`: {
800	intptr_t dummy;
801	if (ParseIntervalQuantifier(&dummy, &dummy)) {
802	ReportError("Nothing to repeat");
803	UNREACHABLE();
804	}
805	}
806	FALL_THROUGH;
807	case `'}'`:
808	case `']'`:
809	if (is_unicode()) {
810	ReportError("Lone quantifier brackets");
811	UNREACHABLE();
812	}
813	FALL_THROUGH;
814	default:
815	builder->AddUnicodeCharacter(current());
816	Advance();
817	break;
818	} // end switch(current())
819
820	intptr_t min;
821	intptr_t max;
822	switch (current()) {
823	// QuantifierPrefix ::
824	// *
825	// +
826	// ?
827	// {
828	case `'*'`:
829	min = `0`;
830	max = RegExpTree::kInfinity;
831	Advance();
832	break;
833	case `'+'`:
834	min = `1`;
835	max = RegExpTree::kInfinity;
836	Advance();
837	break;
838	case `'?'`:
839	min = `0`;
840	max = `1`;
841	Advance();
842	break;
843	case `'{'`:
844	if (ParseIntervalQuantifier(&min, &max)) {
845	if (max < min) {
846	ReportError("numbers out of order in {} quantifier.");
847	UNREACHABLE();
848	}
849	break;
850	} else {
851	continue;
852	}
853	default:
854	continue;
855	}
856	RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
857	if (current() == `'?'`) {
858	quantifier_type = RegExpQuantifier::NON_GREEDY;
859	Advance();
860	} else if (FLAG_regexp_possessive_quantifier && current() == `'+'`) {
861	// FLAG_regexp_possessive_quantifier is a debug-only flag.
862	quantifier_type = RegExpQuantifier::POSSESSIVE;
863	Advance();
864	}
865	if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
866	ReportError("invalid quantifier.");
867	UNREACHABLE();
868	}
869	}
870	}
871
872	#ifdef DEBUG
873	// Currently only used in an ASSERT.
874	static bool IsSpecialClassEscape(uint32_t c) {
875	switch (c) {
876	case `'d'`:
877	case `'D'`:
878	case `'s'`:
879	case `'S'`:
880	case `'w'`:
881	case `'W'`:
882	return true;
883	default:
884	return false;
885	}
886	}
887	#endif
888
889	RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
890	RegExpParserState* state) {
891	RegExpLookaround::Type lookaround_type = state->lookaround_type();
892	bool is_named_capture = false;
893	const RegExpCaptureName* capture_name = nullptr;
894	SubexpressionType subexpr_type = CAPTURE;
895	Advance();
896	if (current() == `'?'`) {
897	switch (Next()) {
898	case `':'`:
899	Advance(`2`);
900	subexpr_type = GROUPING;
901	break;
902	case `'='`:
903	Advance(`2`);
904	lookaround_type = RegExpLookaround::LOOKAHEAD;
905	subexpr_type = POSITIVE_LOOKAROUND;
906	break;
907	case `'!'`:
908	Advance(`2`);
909	lookaround_type = RegExpLookaround::LOOKAHEAD;
910	subexpr_type = NEGATIVE_LOOKAROUND;
911	break;
912	case `'<'`:
913	Advance();
914	if (Next() == `'='`) {
915	Advance(`2`);
916	lookaround_type = RegExpLookaround::LOOKBEHIND;
917	subexpr_type = POSITIVE_LOOKAROUND;
918	break;
919	} else if (Next() == `'!'`) {
920	Advance(`2`);
921	lookaround_type = RegExpLookaround::LOOKBEHIND;
922	subexpr_type = NEGATIVE_LOOKAROUND;
923	break;
924	}
925	is_named_capture = true;
926	has_named_captures_ = true;
927	Advance();
928	break;
929	default:
930	ReportError("Invalid group");
931	UNREACHABLE();
932	}
933	}
934
935	if (subexpr_type == CAPTURE) {
936	if (captures_started_ >= kMaxCaptures) {
937	ReportError("Too many captures");
938	UNREACHABLE();
939	}
940	captures_started_++;
941
942	if (is_named_capture) {
943	capture_name = ParseCaptureGroupName();
944	}
945	}
946	// Store current state and begin new disjunction parsing.
947	return new (Z)
948	RegExpParserState (state, subexpr_type, lookaround_type, captures_started_,
949	capture_name, state->builder()->flags(), Z);
950	}
951
952	// In order to know whether an escape is a backreference or not we have to scan
953	// the entire regexp and find the number of capturing parentheses. However we
954	// don't want to scan the regexp twice unless it is necessary. This mini-parser
955	// is called when needed. It can see the difference between capturing and
956	// noncapturing parentheses and can skip character classes and backslash-escaped
957	// characters.
958	void RegExpParser::ScanForCaptures() {
959	ASSERT(!is_scanned_for_captures_);
960	const intptr_t saved_position = position();
961	// Start with captures started previous to current position
962	intptr_t capture_count = captures_started();
963	// Add count of captures after this position.
964	uintptr_t n;
965	while ((n = current()) != kEndMarker) {
966	Advance();
967	switch (n) {
968	case `'\\'`:
969	Advance();
970	break;
971	case `'['`: {
972	uintptr_t c;
973	while ((c = current()) != kEndMarker) {
974	Advance();
975	if (c == `'\\'`) {
976	Advance();
977	} else {
978	if (c == `']'`) break;
979	}
980	}
981	break;
982	}
983	case `'('`:
984	// At this point we could be in
985	// a non-capturing group '(:',*
986	// a lookbehind assertion '(?<=' '(?<!'*
987	// or a named capture '(?<'.*
988	//
989	// Of these, only named captures are capturing groups.
990	if (current() == `'?'`) {
991	Advance();
992	if (current() != `'<'`) break;
993
994	Advance();
995	if (current() == `'='` \|\| current() == `'!'`) break;
996
997	// Found a possible named capture. It could turn out to be a syntax
998	// error (e.g. an unterminated or invalid name), but that distinction
999	// does not matter for our purposes.
1000	has_named_captures_ = true;
1001	}
1002	capture_count++;
1003	break;
1004	}
1005	}
1006	capture_count_ = capture_count;
1007	is_scanned_for_captures_ = true;
1008	Reset(saved_position);
1009	}
1010
1011	bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
1012	ASSERT(`'\\'` == current());
1013	ASSERT(`'1'` <= Next() && Next() <= `'9'`);
1014	// Try to parse a decimal literal that is no greater than the total number
1015	// of left capturing parentheses in the input.
1016	intptr_t start = position();
1017	intptr_t value = Next() - `'0'`;
1018	Advance(`2`);
1019	while (true) {
1020	uint32_t c = current();
1021	if (Utils::IsDecimalDigit(c)) {
1022	value = `10` * value + (c - `'0'`);
1023	if (value > kMaxCaptures) {
1024	Reset(start);
1025	return false;
1026	}
1027	Advance();
1028	} else {
1029	break;
1030	}
1031	}
1032	if (value > captures_started()) {
1033	if (!is_scanned_for_captures_) ScanForCaptures();
1034	if (value > capture_count_) {
1035	Reset(start);
1036	return false;
1037	}
1038	}
1039	*index_out = value;
1040	return true;
1041	}
1042
1043	namespace {
1044
1045	static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) {
1046	return Utils::IsAlphaNumeric(ch) \|\| ch == `'_'` \|\| ch == `'$'`;
1047	}
1048
1049	// ES#sec-names-and-keywords Names and Keywords
1050	// UnicodeIDStart, '$', '_' and '\'
1051	static bool IsIdentifierStartSlow(uint32_t c) {
1052	// cannot use u_isIDStart because it does not work for
1053	// Other_ID_Start characters.
1054	return u_hasBinaryProperty(c, UCHAR_ID_START) \|\|
1055	(c < `0x60` && (c == `'$'` \|\| c == `'\\'` \|\| c == `'_'`));
1056	}
1057
1058	// ES#sec-names-and-keywords Names and Keywords
1059	// UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ
1060	static bool IsIdentifierPartSlow(uint32_t c) {
1061	const uint32_t kZeroWidthNonJoiner = `0x200C`;
1062	const uint32_t kZeroWidthJoiner = `0x200D`;
1063	// Can't use u_isIDPart because it does not work for
1064	// Other_ID_Continue characters.
1065	return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) \|\|
1066	(c < `0x60` && (c == `'$'` \|\| c == `'\\'` \|\| c == `'_'`)) \|\|
1067	c == kZeroWidthNonJoiner \|\| c == kZeroWidthJoiner;
1068	}
1069
1070	static inline bool IsIdentifierStart(uint32_t c) {
1071	if (c > `127`) return IsIdentifierStartSlow(c);
1072	return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c);
1073	}
1074
1075	static inline bool IsIdentifierPart(uint32_t c) {
1076	if (c > `127`) return IsIdentifierPartSlow(c);
1077	return IsAsciiIdentifierPart(c);
1078	}
1079
1080	static bool IsSameName(const RegExpCaptureName* name1,
1081	const RegExpCaptureName* name2) {
1082	if (name1->length() != name2->length()) return false;
1083	for (intptr_t i = `0`; i < name1->length(); i++) {
1084	if (name1->At(i) != name2->At(i)) return false;
1085	}
1086	return true;
1087	}
1088
1089	} // end namespace
1090
1091	static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) {
1092	if (code_unit <= Utf16::kMaxCodeUnit) {
1093	v->Add(code_unit);
1094	} else {
1095	uint16_t units[`2`];
1096	Utf16::Encode(code_unit, units);
1097	v->Add(units[`0`]);
1098	v->Add(units[`1`]);
1099	}
1100	}
1101
1102	const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() {
1103	auto name = new (Z) RegExpCaptureName ();
1104
1105	bool at_start = true;
1106	while (true) {
1107	uint32_t c = current();
1108	Advance();
1109
1110	// Convert unicode escapes.
1111	if (c == `'\\'` && current() == `'u'`) {
1112	Advance();
1113	if (!ParseUnicodeEscape(&c)) {
1114	ReportError("Invalid Unicode escape sequence");
1115	UNREACHABLE();
1116	}
1117	}
1118
1119	// The backslash char is misclassified as both ID_Start and ID_Continue.
1120	if (c == `'\\'`) {
1121	ReportError("Invalid capture group name");
1122	UNREACHABLE();
1123	}
1124
1125	if (at_start) {
1126	if (!IsIdentifierStart(c)) {
1127	ReportError("Invalid capture group name");
1128	UNREACHABLE();
1129	}
1130	PushCodeUnit(name, c);
1131	at_start = false;
1132	} else {
1133	if (c == `'>'`) {
1134	break;
1135	} else if (IsIdentifierPart(c)) {
1136	PushCodeUnit(name, c);
1137	} else {
1138	ReportError("Invalid capture group name");
1139	UNREACHABLE();
1140	}
1141	}
1142	}
1143
1144	return name;
1145	}
1146
1147	intptr_t RegExpParser::GetNamedCaptureIndex(const RegExpCaptureName* name) {
1148	for (const auto& capture : *named_captures_) {
1149	if (IsSameName(name, capture->name())) return capture->index();
1150	}
1151	return -`1`;
1152	}
1153
1154	void RegExpParser::CreateNamedCaptureAtIndex(const RegExpCaptureName* name,
1155	intptr_t index) {
1156	ASSERT(`0` < index && index <= captures_started_);
1157	ASSERT(name != nullptr);
1158
1159	if (named_captures_ == nullptr) {
1160	named_captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(`1`);
1161	} else {
1162	// Check for duplicates and bail if we find any. Currently O(n^2).
1163	if (GetNamedCaptureIndex(name) >= `0`) {
1164	ReportError("Duplicate capture group name");
1165	UNREACHABLE();
1166	}
1167	}
1168
1169	RegExpCapture* capture = GetCapture(index);
1170	ASSERT(capture->name() == nullptr);
1171
1172	capture->set_name(name);
1173	named_captures_->Add(capture);
1174	}
1175
1176	bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
1177	RegExpParserState* state) {
1178	// The parser is assumed to be on the '<' in \k<name>.
1179	if (current() != `'<'`) {
1180	ReportError("Invalid named reference");
1181	UNREACHABLE();
1182	}
1183
1184	Advance();
1185	const RegExpCaptureName* name = ParseCaptureGroupName();
1186	if (name == nullptr) {
1187	return false;
1188	}
1189
1190	if (state->IsInsideCaptureGroup(name)) {
1191	builder->AddEmpty();
1192	} else {
1193	RegExpBackReference* atom = new (Z) RegExpBackReference (builder->flags());
1194	atom->set_name(name);
1195
1196	builder->AddAtom(atom);
1197
1198	if (named_back_references_ == nullptr) {
1199	named_back_references_ =
1200	new (Z) ZoneGrowableArray<RegExpBackReference*>(`1`);
1201	}
1202	named_back_references_->Add(atom);
1203	}
1204
1205	return true;
1206	}
1207
1208	void RegExpParser::PatchNamedBackReferences() {
1209	if (named_back_references_ == nullptr) return;
1210
1211	if (named_captures_ == nullptr) {
1212	ReportError("Invalid named capture referenced");
1213	return;
1214	}
1215
1216	// Look up and patch the actual capture for each named back reference.
1217	// Currently O(n^2), optimize if necessary.
1218	for (intptr_t i = `0`; i < named_back_references_->length(); i++) {
1219	RegExpBackReference* ref = named_back_references_->At(i);
1220	intptr_t index = GetNamedCaptureIndex(ref->name());
1221
1222	if (index < `0`) {
1223	ReportError("Invalid named capture referenced");
1224	UNREACHABLE();
1225	}
1226	ref->set_capture(GetCapture(index));
1227	}
1228	}
1229
1230	RegExpCapture* RegExpParser::GetCapture(intptr_t index) {
1231	// The index for the capture groups are one-based. Its index in the list is
1232	// zero-based.
1233	const intptr_t know_captures =
1234	is_scanned_for_captures_ ? capture_count_ : captures_started_;
1235	ASSERT(index <= know_captures);
1236	if (captures_ == nullptr) {
1237	captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(know_captures);
1238	}
1239	while (captures_->length() < know_captures) {
1240	captures_->Add(new (Z) RegExpCapture (captures_->length() + `1`));
1241	}
1242	return captures_->At(index - `1`);
1243	}
1244
1245	ArrayPtr RegExpParser::CreateCaptureNameMap() {
1246	if (named_captures_ == nullptr \|\| named_captures_->is_empty()) {
1247	return Array::null();
1248	}
1249
1250	const intptr_t len = named_captures_->length() * `2`;
1251
1252	const Array& array = Array::Handle(Array::New(len));
1253
1254	auto& name = String::Handle();
1255	auto& smi = Smi::Handle();
1256	for (intptr_t i = `0`; i < named_captures_->length(); i++) {
1257	RegExpCapture* capture = named_captures_->At(i);
1258	name =
1259	String::FromUTF16(capture->name()->data(), capture->name()->length());
1260	smi = Smi::New(capture->index());
1261	array.SetAt(i * `2`, name);
1262	array.SetAt(i * `2` + `1`, smi);
1263	}
1264
1265	return array.raw();
1266	}
1267
1268	bool RegExpParser::HasNamedCaptures() {
1269	if (has_named_captures_ \|\| is_scanned_for_captures_) {
1270	return has_named_captures_;
1271	}
1272
1273	ScanForCaptures();
1274	ASSERT(is_scanned_for_captures_);
1275	return has_named_captures_;
1276	}
1277
1278	bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) {
1279	for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1280	if (s->group_type() != CAPTURE) continue;
1281	// Return true if we found the matching capture index.
1282	if (index == s->capture_index()) return true;
1283	// Abort if index is larger than what has been parsed up till this state.
1284	if (index > s->capture_index()) return false;
1285	}
1286	return false;
1287	}
1288
1289	bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1290	const RegExpCaptureName* name) {
1291	ASSERT(name != nullptr);
1292	for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1293	if (s->capture_name() == nullptr) continue;
1294	if (IsSameName(s->capture_name(), name)) return true;
1295	}
1296	return false;
1297	}
1298
1299	// QuantifierPrefix ::
1300	// { DecimalDigits }
1301	// { DecimalDigits , }
1302	// { DecimalDigits , DecimalDigits }
1303	//
1304	// Returns true if parsing succeeds, and set the min_out and max_out
1305	// values. Values are truncated to RegExpTree::kInfinity if they overflow.
1306	bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,
1307	intptr_t* max_out) {
1308	ASSERT(current() == `'{'`);
1309	intptr_t start = position();
1310	Advance();
1311	intptr_t min = `0`;
1312	if (!Utils::IsDecimalDigit(current())) {
1313	Reset(start);
1314	return false;
1315	}
1316	while (Utils::IsDecimalDigit(current())) {
1317	intptr_t next = current() - `'0'`;
1318	if (min > (RegExpTree::kInfinity - next) / `10`) {
1319	// Overflow. Skip past remaining decimal digits and return -1.
1320	do {
1321	Advance();
1322	} while (Utils::IsDecimalDigit(current()));
1323	min = RegExpTree::kInfinity;
1324	break;
1325	}
1326	min = `10` * min + next;
1327	Advance();
1328	}
1329	intptr_t max = `0`;
1330	if (current() == `'}'`) {
1331	max = min;
1332	Advance();
1333	} else if (current() == `','`) {
1334	Advance();
1335	if (current() == `'}'`) {
1336	max = RegExpTree::kInfinity;
1337	Advance();
1338	} else {
1339	while (Utils::IsDecimalDigit(current())) {
1340	intptr_t next = current() - `'0'`;
1341	if (max > (RegExpTree::kInfinity - next) / `10`) {
1342	do {
1343	Advance();
1344	} while (Utils::IsDecimalDigit(current()));
1345	max = RegExpTree::kInfinity;
1346	break;
1347	}
1348	max = `10` * max + next;
1349	Advance();
1350	}
1351	if (current() != `'}'`) {
1352	Reset(start);
1353	return false;
1354	}
1355	Advance();
1356	}
1357	} else {
1358	Reset(start);
1359	return false;
1360	}
1361	*min_out = min;
1362	*max_out = max;
1363	return true;
1364	}
1365
1366	uint32_t RegExpParser::ParseOctalLiteral() {
1367	ASSERT((`'0'` <= current() && current() <= `'7'`) \|\| current() == kEndMarker);
1368	// For compatibility with some other browsers (not all), we parse
1369	// up to three octal digits with a value below 256.
1370	uint32_t value = current() - `'0'`;
1371	Advance();
1372	if (`'0'` <= current() && current() <= `'7'`) {
1373	value = value * `8` + current() - `'0'`;
1374	Advance();
1375	if (value < `32` && `'0'` <= current() && current() <= `'7'`) {
1376	value = value * `8` + current() - `'0'`;
1377	Advance();
1378	}
1379	}
1380	return value;
1381	}
1382
1383	// Returns the value (0 .. 15) of a hexadecimal character c.
1384	// If c is not a legal hexadecimal character, returns a value < 0.
1385	static inline intptr_t HexValue(uint32_t c) {
1386	c -= `'0'`;
1387	if (static_cast<unsigned>(c) <= `9`) return c;
1388	c = (c \| `0x20`) - (`'a'` - `'0'`); // detect 0x11..0x16 and 0x31..0x36.
1389	if (static_cast<unsigned>(c) <= `5`) return c + `10`;
1390	return -`1`;
1391	}
1392
1393	bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) {
1394	intptr_t start = position();
1395	uint32_t val = `0`;
1396	bool done = false;
1397	for (intptr_t i = `0`; !done; i++) {
1398	uint32_t c = current();
1399	intptr_t d = HexValue(c);
1400	if (d < `0`) {
1401	Reset(start);
1402	return false;
1403	}
1404	val = val * `16` + d;
1405	Advance();
1406	if (i == length - `1`) {
1407	done = true;
1408	}
1409	}
1410	*value = val;
1411	return true;
1412	}
1413
1414	// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1415	bool RegExpParser::ParseUnicodeEscape(uint32_t* value) {
1416	// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1417	// allowed). In the latter case, the number of hex digits between { } is
1418	// arbitrary. \ and u have already been read.
1419	if (current() == `'{'` && is_unicode()) {
1420	int start = position();
1421	Advance();
1422	if (ParseUnlimitedLengthHexNumber(Utf::kMaxCodePoint, value)) {
1423	if (current() == `'}'`) {
1424	Advance();
1425	return true;
1426	}
1427	}
1428	Reset(start);
1429	return false;
1430	}
1431	// \u but no {, or \u{...} escapes not allowed.
1432	bool result = ParseHexEscape(`4`, value);
1433	if (result && is_unicode() && Utf16::IsLeadSurrogate(*value) &&
1434	current() == `'\\'`) {
1435	// Attempt to read trail surrogate.
1436	int start = position();
1437	if (Next() == `'u'`) {
1438	Advance(`2`);
1439	uint32_t trail;
1440	if (ParseHexEscape(`4`, &trail) && Utf16::IsTrailSurrogate(trail)) {
1441	value = Utf16::Decode(static_cast<uint16_t>(value),
1442	static_cast<uint16_t>(trail));
1443	return true;
1444	}
1445	}
1446	Reset(start);
1447	}
1448	return result;
1449	}
1450
1451	namespace {
1452
1453	bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1454	const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1455	if (short_name != nullptr && strcmp(property_name, short_name) == `0`) {
1456	return true;
1457	}
1458	for (int i = `0`;; i++) {
1459	const char* long_name = u_getPropertyName(
1460	property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1461	if (long_name == nullptr) break;
1462	if (strcmp(property_name, long_name) == `0`) return true;
1463	}
1464	return false;
1465	}
1466
1467	bool IsExactPropertyValueAlias(const char* property_value_name,
1468	UProperty property,
1469	int32_t property_value) {
1470	const char* short_name =
1471	u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1472	if (short_name != nullptr && strcmp(property_value_name, short_name) == `0`) {
1473	return true;
1474	}
1475	for (int i = `0`;; i++) {
1476	const char* long_name = u_getPropertyValueName(
1477	property, property_value,
1478	static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1479	if (long_name == nullptr) break;
1480	if (strcmp(property_value_name, long_name) == `0`) return true;
1481	}
1482	return false;
1483	}
1484
1485	bool LookupPropertyValueName(UProperty property,
1486	const char* property_value_name,
1487	bool negate,
1488	ZoneGrowableArray<CharacterRange>* result) {
1489	UProperty property_for_lookup = property;
1490	if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1491	// For the property Script_Extensions, we have to do the property value
1492	// name lookup as if the property is Script.
1493	property_for_lookup = UCHAR_SCRIPT;
1494	}
1495	int32_t property_value =
1496	u_getPropertyValueEnum(property_for_lookup, property_value_name);
1497	if (property_value == UCHAR_INVALID_CODE) return false;
1498
1499	// We require the property name to match exactly to one of the property value
1500	// aliases. However, u_getPropertyValueEnum uses loose matching.
1501	if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1502	property_value)) {
1503	return false;
1504	}
1505
1506	UErrorCode ec = U_ZERO_ERROR;
1507	icu::UnicodeSet set;
1508	set.applyIntPropertyValue(property, property_value, ec);
1509	bool success = ec == U_ZERO_ERROR && (set.isEmpty() == `0`);
1510
1511	if (success) {
1512	set.removeAllStrings();
1513	if (negate) set.complement();
1514	for (int i = `0`; i < set.getRangeCount(); i++) {
1515	result->Add(
1516	CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
1517	}
1518	}
1519	return success;
1520	}
1521
1522	template <size_t N>
1523	inline bool NameEquals(const char* name, const char (&literal)[N]) {
1524	return strncmp(name, literal, N + `1`) == `0`;
1525	}
1526
1527	bool LookupSpecialPropertyValueName(const char* name,
1528	ZoneGrowableArray<CharacterRange>* result,
1529	bool negate) {
1530	if (NameEquals(name, "Any")) {
1531	if (negate) {
1532	// Leave the list of character ranges empty, since the negation of 'Any'
1533	// is the empty set.
1534	} else {
1535	result->Add(CharacterRange::Everything());
1536	}
1537	} else if (NameEquals(name, "ASCII")) {
1538	result->Add(negate ? CharacterRange::Range(`0x80`, Utf::kMaxCodePoint)
1539	: CharacterRange::Range(`0x0`, `0x7F`));
1540	} else if (NameEquals(name, "Assigned")) {
1541	return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1542	!negate, result);
1543	} else {
1544	return false;
1545	}
1546	return true;
1547	}
1548
1549	// Explicitly list supported binary properties. The spec forbids supporting
1550	// properties outside of this set to ensure interoperability.
1551	bool IsSupportedBinaryProperty(UProperty property) {
1552	switch (property) {
1553	case UCHAR_ALPHABETIC:
1554	// 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1555	// 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1556	case UCHAR_ASCII_HEX_DIGIT:
1557	// 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1558	case UCHAR_BIDI_CONTROL:
1559	case UCHAR_BIDI_MIRRORED:
1560	case UCHAR_CASE_IGNORABLE:
1561	case UCHAR_CASED:
1562	case UCHAR_CHANGES_WHEN_CASEFOLDED:
1563	case UCHAR_CHANGES_WHEN_CASEMAPPED:
1564	case UCHAR_CHANGES_WHEN_LOWERCASED:
1565	case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1566	case UCHAR_CHANGES_WHEN_TITLECASED:
1567	case UCHAR_CHANGES_WHEN_UPPERCASED:
1568	case UCHAR_DASH:
1569	case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1570	case UCHAR_DEPRECATED:
1571	case UCHAR_DIACRITIC:
1572	case UCHAR_EMOJI:
1573	case UCHAR_EMOJI_COMPONENT:
1574	case UCHAR_EMOJI_MODIFIER_BASE:
1575	case UCHAR_EMOJI_MODIFIER:
1576	case UCHAR_EMOJI_PRESENTATION:
1577	case UCHAR_EXTENDED_PICTOGRAPHIC:
1578	case UCHAR_EXTENDER:
1579	case UCHAR_GRAPHEME_BASE:
1580	case UCHAR_GRAPHEME_EXTEND:
1581	case UCHAR_HEX_DIGIT:
1582	case UCHAR_ID_CONTINUE:
1583	case UCHAR_ID_START:
1584	case UCHAR_IDEOGRAPHIC:
1585	case UCHAR_IDS_BINARY_OPERATOR:
1586	case UCHAR_IDS_TRINARY_OPERATOR:
1587	case UCHAR_JOIN_CONTROL:
1588	case UCHAR_LOGICAL_ORDER_EXCEPTION:
1589	case UCHAR_LOWERCASE:
1590	case UCHAR_MATH:
1591	case UCHAR_NONCHARACTER_CODE_POINT:
1592	case UCHAR_PATTERN_SYNTAX:
1593	case UCHAR_PATTERN_WHITE_SPACE:
1594	case UCHAR_QUOTATION_MARK:
1595	case UCHAR_RADICAL:
1596	case UCHAR_REGIONAL_INDICATOR:
1597	case UCHAR_S_TERM:
1598	case UCHAR_SOFT_DOTTED:
1599	case UCHAR_TERMINAL_PUNCTUATION:
1600	case UCHAR_UNIFIED_IDEOGRAPH:
1601	case UCHAR_UPPERCASE:
1602	case UCHAR_VARIATION_SELECTOR:
1603	case UCHAR_WHITE_SPACE:
1604	case UCHAR_XID_CONTINUE:
1605	case UCHAR_XID_START:
1606	return true;
1607	default:
1608	break;
1609	}
1610	return false;
1611	}
1612
1613	bool IsUnicodePropertyValueCharacter(char c) {
1614	// https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1615	//
1616	// Note that using this to validate each parsed char is quite conservative.
1617	// A possible alternative solution would be to only ensure the parsed
1618	// property name/value candidate string does not contain '\0' characters and
1619	// let ICU lookups trigger the final failure.
1620	if (Utils::IsAlphaNumeric(c)) return true;
1621	return (c == `'_'`);
1622	}
1623
1624	} // anonymous namespace
1625
1626	bool RegExpParser::ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
1627	ZoneGrowableArray<char>* name_2) {
1628	ASSERT(name_1->is_empty());
1629	ASSERT(name_2->is_empty());
1630	// Parse the property class as follows:
1631	// - In \p{name}, 'name' is interpreted
1632	// - either as a general category property value name.
1633	// - or as a binary property name.
1634	// - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1635	// and 'value' is interpreted as one of the available property value names.
1636	// - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1637	// - Loose matching is not applied.
1638	if (current() == `'{'`) {
1639	// Parse \p{[PropertyName=]PropertyNameValue}
1640	for (Advance(); current() != `'}'` && current() != `'='`; Advance()) {
1641	if (!IsUnicodePropertyValueCharacter(current())) return false;
1642	if (!has_next()) return false;
1643	name_1->Add(static_cast<char>(current()));
1644	}
1645	if (current() == `'='`) {
1646	for (Advance(); current() != `'}'`; Advance()) {
1647	if (!IsUnicodePropertyValueCharacter(current())) return false;
1648	if (!has_next()) return false;
1649	name_2->Add(static_cast<char>(current()));
1650	}
1651	name_2->Add(`0`); // null-terminate string.
1652	}
1653	} else {
1654	return false;
1655	}
1656	Advance();
1657	name_1->Add(`0`); // null-terminate string.
1658
1659	ASSERT(static_cast<size_t>(name_1->length() - `1`) == strlen(name_1->data()));
1660	ASSERT(name_2->is_empty() \|\|
1661	static_cast<size_t>(name_2->length() - `1`) == strlen(name_2->data()));
1662	return true;
1663	}
1664
1665	bool RegExpParser::AddPropertyClassRange(
1666	ZoneGrowableArray<CharacterRange>* add_to,
1667	bool negate,
1668	ZoneGrowableArray<char>* name_1,
1669	ZoneGrowableArray<char>* name_2) {
1670	ASSERT(name_1->At(name_1->length() - `1`) == `'\0'`);
1671	ASSERT(name_2->is_empty() \|\| name_2->At(name_2->length() - `1`) == `'\0'`);
1672	if (name_2->is_empty()) {
1673	// First attempt to interpret as general category property value name.
1674	const char* name = name_1->data();
1675	if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1676	add_to)) {
1677	return true;
1678	}
1679	// Interpret "Any", "ASCII", and "Assigned".
1680	if (LookupSpecialPropertyValueName(name, add_to, negate)) {
1681	return true;
1682	}
1683	// Then attempt to interpret as binary property name with value name 'Y'.
1684	UProperty property = u_getPropertyEnum(name);
1685	if (!IsSupportedBinaryProperty(property)) return false;
1686	if (!IsExactPropertyAlias(name, property)) return false;
1687	return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to);
1688	} else {
1689	// Both property name and value name are specified. Attempt to interpret
1690	// the property name as enumerated property.
1691	const char* property_name = name_1->data();
1692	const char* value_name = name_2->data();
1693	UProperty property = u_getPropertyEnum(property_name);
1694	if (!IsExactPropertyAlias(property_name, property)) return false;
1695	if (property == UCHAR_GENERAL_CATEGORY) {
1696	// We want to allow aggregate value names such as "Letter".
1697	property = UCHAR_GENERAL_CATEGORY_MASK;
1698	} else if (property != UCHAR_SCRIPT &&
1699	property != UCHAR_SCRIPT_EXTENSIONS) {
1700	return false;
1701	}
1702	return LookupPropertyValueName(property, value_name, negate, add_to);
1703	}
1704	}
1705
1706	bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t max_value,
1707	uint32_t* value) {
1708	uint32_t x = `0`;
1709	int d = HexValue(current());
1710	if (d < `0`) {
1711	return false;
1712	}
1713	while (d >= `0`) {
1714	x = x * `16` + d;
1715	if (x > max_value) {
1716	return false;
1717	}
1718	Advance();
1719	d = HexValue(current());
1720	}
1721	*value = x;
1722	return true;
1723	}
1724
1725	uint32_t RegExpParser::ParseClassCharacterEscape() {
1726	ASSERT(current() == `'\\'`);
1727	DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
1728	Advance();
1729	switch (current()) {
1730	case `'b'`:
1731	Advance();
1732	return `'\b'`;
1733	// ControlEscape :: one of
1734	// f n r t v
1735	case `'f'`:
1736	Advance();
1737	return `'\f'`;
1738	case `'n'`:
1739	Advance();
1740	return `'\n'`;
1741	case `'r'`:
1742	Advance();
1743	return `'\r'`;
1744	case `'t'`:
1745	Advance();
1746	return `'\t'`;
1747	case `'v'`:
1748	Advance();
1749	return `'\v'`;
1750	case `'c'`: {
1751	uint32_t controlLetter = Next();
1752	uint32_t letter = controlLetter & ~(`'A'` ^ `'a'`);
1753	// For compatibility with JSC, inside a character class
1754	// we also accept digits and underscore as control characters.
1755	if (letter >= `'A'` && letter <= `'Z'`) {
1756	Advance(`2`);
1757	// Control letters mapped to ASCII control characters in the range
1758	// 0x00-0x1f.
1759	return controlLetter & `0x1f`;
1760	}
1761	if (is_unicode()) {
1762	// With /u, \c# or \c_ are invalid.
1763	ReportError("Invalid class escape");
1764	UNREACHABLE();
1765	}
1766	if (Utils::IsDecimalDigit(controlLetter) \|\| controlLetter == `'_'`) {
1767	Advance(`2`);
1768	return controlLetter & `0x1f`;
1769	}
1770	// We match JSC in reading the backslash as a literal
1771	// character instead of as starting an escape.
1772	return `'\\'`;
1773	}
1774	case `'0'`:
1775	// With /u, \0 is interpreted as NUL if not followed by another digit.
1776	if (is_unicode() && !(Next() >= `'0'` && Next() <= `'9'`)) {
1777	Advance();
1778	return `0`;
1779	}
1780	FALL_THROUGH;
1781	case `'1'`:
1782	case `'2'`:
1783	case `'3'`:
1784	case `'4'`:
1785	case `'5'`:
1786	case `'6'`:
1787	case `'7'`:
1788	// For compatibility, we interpret a decimal escape that isn't
1789	// a back reference (and therefore either \0 or not valid according
1790	// to the specification) as a 1..3 digit octal character code.
1791	if (is_unicode()) {
1792	// With \u, decimal escape is not interpreted as octal character code.
1793	ReportError("Invalid class escape");
1794	UNREACHABLE();
1795	}
1796	return ParseOctalLiteral();
1797	case `'x'`: {
1798	Advance();
1799	uint32_t value;
1800	if (ParseHexEscape(`2`, &value)) {
1801	return value;
1802	}
1803	if (is_unicode()) {
1804	// With \u, invalid escapes are not treated as identity escapes.
1805	ReportError("Invalid escape");
1806	UNREACHABLE();
1807	}
1808	// If \x is not followed by a two-digit hexadecimal, treat it
1809	// as an identity escape.
1810	return `'x'`;
1811	}
1812	case `'u'`: {
1813	Advance();
1814	uint32_t value;
1815	if (ParseUnicodeEscape(&value)) {
1816	return value;
1817	}
1818	if (is_unicode()) {
1819	// With \u, invalid escapes are not treated as identity escapes.
1820	ReportError(kUnicodeIdentity);
1821	UNREACHABLE();
1822	}
1823	// If \u is not followed by a four-digit hexadecimal, treat it
1824	// as an identity escape.
1825	return `'u'`;
1826	}
1827	default: {
1828	// Extended identity escape. We accept any character that hasn't
1829	// been matched by a more specific case, not just the subset required
1830	// by the ECMAScript specification.
1831	uint32_t result = current();
1832	if (!is_unicode() \|\| IsSyntaxCharacterOrSlash(result) \|\| result == `'-'`) {
1833	Advance();
1834	return result;
1835	}
1836	ReportError(kUnicodeIdentity);
1837	UNREACHABLE();
1838	}
1839	}
1840	return `0`;
1841	}
1842
1843	bool RegExpParser::ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
1844	bool add_unicode_case_equivalents,
1845	uint32_t* char_out) {
1846	uint32_t first = current();
1847	if (first == `'\\'`) {
1848	switch (Next()) {
1849	case `'w'`:
1850	case `'W'`:
1851	case `'d'`:
1852	case `'D'`:
1853	case `'s'`:
1854	case `'S'`: {
1855	CharacterRange::AddClassEscape(static_cast<uint16_t>(Next()), ranges,
1856	add_unicode_case_equivalents);
1857	Advance(`2`);
1858	return true;
1859	}
1860	case `'p'`:
1861	case `'P'`: {
1862	if (!is_unicode()) break;
1863	bool negate = Next() == `'P'`;
1864	Advance(`2`);
1865	auto name_1 = new (Z) ZoneGrowableArray<char>();
1866	auto name_2 = new (Z) ZoneGrowableArray<char>();
1867	if (!ParsePropertyClassName(name_1, name_2) \|\|
1868	!AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1869	ReportError("Invalid property name in character class");
1870	UNREACHABLE();
1871	}
1872	return true;
1873	}
1874	case kEndMarker:
1875	ReportError("\\ at end of pattern");
1876	UNREACHABLE();
1877	default:
1878	break;
1879	}
1880	*char_out = ParseClassCharacterEscape();
1881	return false;
1882	}
1883	Advance();
1884	*char_out = first;
1885	return false;
1886	}
1887
1888	RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1889	static const char* kUnterminated = "Unterminated character class";
1890	static const char* kRangeInvalid = "Invalid character class";
1891	static const char* kRangeOutOfOrder = "Range out of order in character class";
1892
1893	ASSERT(current() == `'['`);
1894	Advance();
1895	bool is_negated = false;
1896	if (current() == `'^'`) {
1897	is_negated = true;
1898	Advance();
1899	}
1900	ZoneGrowableArray<CharacterRange>* ranges =
1901	new (Z) ZoneGrowableArray<CharacterRange>(`2`);
1902	bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case();
1903	while (has_more() && current() != `']'`) {
1904	uint32_t char_1 = `0`;
1905	bool is_class_1 =
1906	ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1);
1907	if (current() == `'-'`) {
1908	Advance();
1909	if (current() == kEndMarker) {
1910	// If we reach the end we break out of the loop and let the
1911	// following code report an error.
1912	break;
1913	} else if (current() == `']'`) {
1914	if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1915	ranges->Add(CharacterRange::Singleton(`'-'`));
1916	break;
1917	}
1918	uint32_t char_2 = `0`;
1919	bool is_class_2 =
1920	ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2);
1921	if (is_class_1 \|\| is_class_2) {
1922	// Either end is an escaped character class. Treat the '-' verbatim.
1923	if (is_unicode()) {
1924	// ES2015 21.2.2.15.1 step 1.
1925	ReportError(kRangeInvalid);
1926	UNREACHABLE();
1927	}
1928	if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1929	ranges->Add(CharacterRange::Singleton(`'-'`));
1930	if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2));
1931	continue;
1932	}
1933	if (char_1 > char_2) {
1934	ReportError(kRangeOutOfOrder);
1935	UNREACHABLE();
1936	}
1937	ranges->Add(CharacterRange::Range(char_1, char_2));
1938	} else {
1939	if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1940	}
1941	}
1942	if (!has_more()) {
1943	ReportError(kUnterminated);
1944	UNREACHABLE();
1945	}
1946	Advance();
1947	RegExpCharacterClass::CharacterClassFlags character_class_flags =
1948	RegExpCharacterClass::DefaultFlags();
1949	if (is_negated) character_class_flags \|= RegExpCharacterClass::NEGATED;
1950	return new (Z)
1951	RegExpCharacterClass (ranges, builder->flags(), character_class_flags);
1952	}
1953
1954	// ----------------------------------------------------------------------------
1955	// The Parser interface.
1956
1957	void RegExpParser::ParseRegExp(const String& input,
1958	RegExpFlags flags,
1959	RegExpCompileData* result) {
1960	ASSERT(result != NULL);
1961	RegExpParser parser(input, &result->error, flags);
1962	// Throws an exception if 'input' is not valid.
1963	RegExpTree* tree = parser.ParsePattern();
1964	ASSERT(tree != NULL);
1965	ASSERT(result->error.IsNull());
1966	result->tree = tree;
1967	intptr_t capture_count = parser.captures_started();
1968	result->simple = tree->IsAtom() && parser.simple() && capture_count == `0`;
1969	result->contains_anchor = parser.contains_anchor();
1970	result->capture_name_map = parser.CreateCaptureNameMap();
1971	result->capture_count = capture_count;
1972	}
1973
1974	} // namespace dart
1975

Browse the source code of engine/third_party/dart/runtime/vm/regexp_parser.cc