regexp_parser.h source code [engine/third_party/dart/runtime/vm/regexp_parser.h]

1	// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2	// for details. All rights reserved. Use of this source code is governed by a
3	// BSD-style license that can be found in the LICENSE file.
4
5	#ifndef RUNTIME_VM_REGEXP_PARSER_H_
6	#define RUNTIME_VM_REGEXP_PARSER_H_
7
8	#include "vm/allocation.h"
9	#include "vm/growable_array.h"
10	#include "vm/regexp_ast.h"
11
12	namespace dart {
13
14	// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
15	class RegExpBuilder : public ZoneAllocated {
16	public:
17	explicit RegExpBuilder(RegExpFlags flags);
18
19	void AddCharacter(uint16_t character);
20	void AddUnicodeCharacter(uint32_t character);
21	void AddEscapedUnicodeCharacter(uint32_t character);
22	// "Adds" an empty expression. Does nothing except consume a
23	// following quantifier
24	void AddEmpty();
25	void AddCharacterClass(RegExpCharacterClass* cc);
26	void AddCharacterClassForDesugaring(uint32_t c);
27	void AddAtom(RegExpTree* tree);
28	void AddTerm(RegExpTree* tree);
29	void AddAssertion(RegExpTree* tree);
30	void NewAlternative(); // '\|'
31	// Attempt to add a quantifier to the last atom added. The return value
32	// denotes whether the attempt succeeded, since some atoms like lookbehind
33	// cannot be quantified.
34	bool AddQuantifierToAtom(intptr_t min,
35	intptr_t max,
36	RegExpQuantifier::QuantifierType type);
37	RegExpTree* ToRegExp();
38	RegExpFlags flags() const { return flags_; }
39	bool ignore_case() const { return flags_.IgnoreCase(); }
40	bool is_multi_line() const { return flags_.IsMultiLine(); }
41	bool is_dot_all() const { return flags_.IsDotAll(); }
42
43	private:
44	static const uint16_t kNoPendingSurrogate = `0`;
45	void AddLeadSurrogate(uint16_t lead_surrogate);
46	void AddTrailSurrogate(uint16_t trail_surrogate);
47	void FlushPendingSurrogate();
48	void FlushCharacters();
49	void FlushText();
50	void FlushTerms();
51	bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
52	bool NeedsDesugaringForIgnoreCase(uint32_t c);
53
54	Zone* zone() const { return zone_; }
55	bool is_unicode() const { return flags_.IsUnicode(); }
56
57	Zone* zone_;
58	bool pending_empty_;
59	RegExpFlags flags_;
60	ZoneGrowableArray<uint16_t>* characters_;
61	uint16_t pending_surrogate_;
62	GrowableArray<RegExpTree*> terms_;
63	GrowableArray<RegExpTree*> text_;
64	GrowableArray<RegExpTree*> alternatives_;
65	#ifdef DEBUG
66	enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
67	#define LAST(x) last_added_ = x;
68	#else
69	#define LAST(x)
70	#endif
71	};
72
73	using RegExpCaptureName = ZoneGrowableArray<uint16_t>;
74
75	class RegExpParser : public ValueObject {
76	public:
77	RegExpParser(const String& in, String* error, RegExpFlags regexp_flags);
78
79	static void ParseRegExp(const String& input,
80	RegExpFlags regexp_flags,
81	RegExpCompileData* result);
82
83	RegExpTree* ParsePattern();
84	RegExpTree* ParseDisjunction();
85	RegExpTree* ParseGroup();
86
87	// Parses a {...,...} quantifier and stores the range in the given
88	// out parameters.
89	bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out);
90
91	// Parses and returns a single escaped character. The character
92	// must not be 'b' or 'B' since they are usually handle specially.
93	uint32_t ParseClassCharacterEscape();
94
95	// Checks whether the following is a length-digit hexadecimal number,
96	// and sets the value if it is.
97	bool ParseHexEscape(intptr_t length, uint32_t* value);
98	bool ParseUnicodeEscape(uint32_t* value);
99	bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value);
100
101	// Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or
102	// the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the
103	// result in the given out parameters. If the shorthand is used,
104	// nothing will be added to name_2.
105	bool ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
106	ZoneGrowableArray<char>* name_2);
107	// Adds the specified unicode property to the provided character range.
108	bool AddPropertyClassRange(ZoneGrowableArray<CharacterRange>* add_to,
109	bool negate,
110	ZoneGrowableArray<char>* name_1,
111	ZoneGrowableArray<char>* name_2);
112	// Returns a regexp node that corresponds to one of these unicode
113	// property sequences: "Any", "ASCII", "Assigned".
114	RegExpTree* GetPropertySequence(ZoneGrowableArray<char>* name_1);
115	RegExpTree* ParseCharacterClass(const RegExpBuilder* builder);
116
117	uint32_t ParseOctalLiteral();
118
119	// Tries to parse the input as a back reference. If successful it
120	// stores the result in the output parameter and returns true. If
121	// it fails it will push back the characters read so the same characters
122	// can be reparsed.
123	bool ParseBackReferenceIndex(intptr_t* index_out);
124
125	// Attempts to parse a possible escape within a character class.
126	bool ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
127	bool add_unicode_case_equivalents,
128	uint32_t* char_out);
129	void ReportError(const char* message);
130	void Advance();
131	void Advance(intptr_t dist);
132	void Reset(intptr_t pos);
133
134	// Reports whether the pattern might be used as a literal search string.
135	// Only use if the result of the parse is a single atom node.
136	bool simple();
137	bool contains_anchor() { return contains_anchor_; }
138	void set_contains_anchor() { contains_anchor_ = true; }
139	intptr_t captures_started() { return captures_started_; }
140	intptr_t position() { return next_pos_ - `1`; }
141	bool is_unicode() const { return top_level_flags_.IsUnicode(); }
142
143	static bool IsSyntaxCharacterOrSlash(uint32_t c);
144
145	static const intptr_t kMaxCaptures = `1` << `16`;
146	static const uint32_t kEndMarker = (`1` << `21`);
147
148	private:
149	enum SubexpressionType {
150	INITIAL,
151	CAPTURE, // All positive values represent captures.
152	POSITIVE_LOOKAROUND,
153	NEGATIVE_LOOKAROUND,
154	GROUPING
155	};
156
157	class RegExpParserState : public ZoneAllocated {
158	public:
159	RegExpParserState(RegExpParserState* previous_state,
160	SubexpressionType group_type,
161	RegExpLookaround::Type lookaround_type,
162	intptr_t disjunction_capture_index,
163	const RegExpCaptureName* capture_name,
164	RegExpFlags flags,
165	Zone* zone)
166	: previous_state_(previous_state),
167	builder_(new (zone) RegExpBuilder (flags)),
168	group_type_(group_type),
169	lookaround_type_(lookaround_type),
170	disjunction_capture_index_(disjunction_capture_index),
171	capture_name_(capture_name) {}
172	// Parser state of containing expression, if any.
173	RegExpParserState* previous_state() { return previous_state_; }
174	bool IsSubexpression() { return previous_state_ != NULL; }
175	// RegExpBuilder building this regexp's AST.
176	RegExpBuilder* builder() { return builder_; }
177	// Type of regexp being parsed (parenthesized group or entire regexp).
178	SubexpressionType group_type() { return group_type_; }
179	// Lookahead or lookbehind.
180	RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
181	// Index in captures array of first capture in this sub-expression, if any.
182	// Also the capture index of this sub-expression itself, if group_type
183	// is CAPTURE.
184	intptr_t capture_index() { return disjunction_capture_index_; }
185	const RegExpCaptureName* capture_name() const { return capture_name_; }
186
187	bool IsNamedCapture() const { return capture_name_ != nullptr; }
188
189	// Check whether the parser is inside a capture group with the given index.
190	bool IsInsideCaptureGroup(intptr_t index);
191	// Check whether the parser is inside a capture group with the given name.
192	bool IsInsideCaptureGroup(const RegExpCaptureName* name);
193
194	private:
195	// Linked list implementation of stack of states.
196	RegExpParserState* previous_state_;
197	// Builder for the stored disjunction.
198	RegExpBuilder* builder_;
199	// Stored disjunction type (capture, look-ahead or grouping), if any.
200	SubexpressionType group_type_;
201	// Stored read direction.
202	const RegExpLookaround::Type lookaround_type_;
203	// Stored disjunction's capture index (if any).
204	intptr_t disjunction_capture_index_;
205	// Stored capture name (if any).
206	const RegExpCaptureName* const capture_name_;
207	};
208
209	// Return the 1-indexed RegExpCapture object, allocate if necessary.
210	RegExpCapture* GetCapture(intptr_t index);
211
212	// Creates a new named capture at the specified index. Must be called exactly
213	// once for each named capture. Fails if a capture with the same name is
214	// encountered.
215	void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index);
216
217	// Parses the name of a capture group (?<name>pattern). The name must adhere
218	// to IdentifierName in the ECMAScript standard.
219	const RegExpCaptureName* ParseCaptureGroupName();
220
221	bool ParseNamedBackReference(RegExpBuilder* builder,
222	RegExpParserState* state);
223	RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
224	intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name);
225
226	// After the initial parsing pass, patch corresponding RegExpCapture objects
227	// into all RegExpBackReferences. This is done after initial parsing in order
228	// to avoid complicating cases in which references come before the capture.
229	void PatchNamedBackReferences();
230
231	ArrayPtr CreateCaptureNameMap();
232
233	// Returns true iff the pattern contains named captures. May call
234	// ScanForCaptures to look ahead at the remaining pattern.
235	bool HasNamedCaptures();
236
237	Zone* zone() { return zone_; }
238
239	uint32_t current() { return current_; }
240	bool has_more() { return has_more_; }
241	bool has_next() { return next_pos_ < in().Length(); }
242	uint32_t Next();
243	uint32_t ReadNext(bool update_position);
244	const String& in() { return in_; }
245	void ScanForCaptures();
246
247	Zone* zone_;
248	ZoneGrowableArray<RegExpCapture> captures_;
249	ZoneGrowableArray<RegExpCapture> named_captures_;
250	ZoneGrowableArray<RegExpBackReference> named_back_references_;
251	const String& in_;
252	uint32_t current_;
253	intptr_t next_pos_;
254	intptr_t captures_started_;
255	// The capture count is only valid after we have scanned for captures.
256	intptr_t capture_count_;
257	bool has_more_;
258	RegExpFlags top_level_flags_;
259	bool simple_;
260	bool contains_anchor_;
261	bool is_scanned_for_captures_;
262	bool has_named_captures_;
263	};
264
265	} // namespace dart
266
267	#endif // RUNTIME_VM_REGEXP_PARSER_H_
268

Browse the source code of engine/third_party/dart/runtime/vm/regexp_parser.h