1 | // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
2 | // for details. All rights reserved. Use of this source code is governed by a |
3 | // BSD-style license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RUNTIME_VM_REGEXP_PARSER_H_ |
6 | #define RUNTIME_VM_REGEXP_PARSER_H_ |
7 | |
8 | #include "vm/allocation.h" |
9 | #include "vm/growable_array.h" |
10 | #include "vm/regexp_ast.h" |
11 | |
12 | namespace dart { |
13 | |
14 | // Accumulates RegExp atoms and assertions into lists of terms and alternatives. |
15 | class RegExpBuilder : public ZoneAllocated { |
16 | public: |
17 | explicit RegExpBuilder(RegExpFlags flags); |
18 | |
19 | void AddCharacter(uint16_t character); |
20 | void AddUnicodeCharacter(uint32_t character); |
21 | void AddEscapedUnicodeCharacter(uint32_t character); |
22 | // "Adds" an empty expression. Does nothing except consume a |
23 | // following quantifier |
24 | void AddEmpty(); |
25 | void AddCharacterClass(RegExpCharacterClass* cc); |
26 | void AddCharacterClassForDesugaring(uint32_t c); |
27 | void AddAtom(RegExpTree* tree); |
28 | void AddTerm(RegExpTree* tree); |
29 | void AddAssertion(RegExpTree* tree); |
30 | void NewAlternative(); // '|' |
31 | // Attempt to add a quantifier to the last atom added. The return value |
32 | // denotes whether the attempt succeeded, since some atoms like lookbehind |
33 | // cannot be quantified. |
34 | bool AddQuantifierToAtom(intptr_t min, |
35 | intptr_t max, |
36 | RegExpQuantifier::QuantifierType type); |
37 | RegExpTree* ToRegExp(); |
38 | RegExpFlags flags() const { return flags_; } |
39 | bool ignore_case() const { return flags_.IgnoreCase(); } |
40 | bool is_multi_line() const { return flags_.IsMultiLine(); } |
41 | bool is_dot_all() const { return flags_.IsDotAll(); } |
42 | |
43 | private: |
44 | static const uint16_t kNoPendingSurrogate = 0; |
45 | void AddLeadSurrogate(uint16_t lead_surrogate); |
46 | void AddTrailSurrogate(uint16_t trail_surrogate); |
47 | void FlushPendingSurrogate(); |
48 | void FlushCharacters(); |
49 | void FlushText(); |
50 | void FlushTerms(); |
51 | bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); |
52 | bool NeedsDesugaringForIgnoreCase(uint32_t c); |
53 | |
54 | Zone* zone() const { return zone_; } |
55 | bool is_unicode() const { return flags_.IsUnicode(); } |
56 | |
57 | Zone* zone_; |
58 | bool pending_empty_; |
59 | RegExpFlags flags_; |
60 | ZoneGrowableArray<uint16_t>* characters_; |
61 | uint16_t pending_surrogate_; |
62 | GrowableArray<RegExpTree*> terms_; |
63 | GrowableArray<RegExpTree*> text_; |
64 | GrowableArray<RegExpTree*> alternatives_; |
65 | #ifdef DEBUG |
66 | enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_; |
67 | #define LAST(x) last_added_ = x; |
68 | #else |
69 | #define LAST(x) |
70 | #endif |
71 | }; |
72 | |
73 | using RegExpCaptureName = ZoneGrowableArray<uint16_t>; |
74 | |
75 | class RegExpParser : public ValueObject { |
76 | public: |
77 | RegExpParser(const String& in, String* error, RegExpFlags regexp_flags); |
78 | |
79 | static void ParseRegExp(const String& input, |
80 | RegExpFlags regexp_flags, |
81 | RegExpCompileData* result); |
82 | |
83 | RegExpTree* ParsePattern(); |
84 | RegExpTree* ParseDisjunction(); |
85 | RegExpTree* ParseGroup(); |
86 | |
87 | // Parses a {...,...} quantifier and stores the range in the given |
88 | // out parameters. |
89 | bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out); |
90 | |
91 | // Parses and returns a single escaped character. The character |
92 | // must not be 'b' or 'B' since they are usually handle specially. |
93 | uint32_t ParseClassCharacterEscape(); |
94 | |
95 | // Checks whether the following is a length-digit hexadecimal number, |
96 | // and sets the value if it is. |
97 | bool ParseHexEscape(intptr_t length, uint32_t* value); |
98 | bool ParseUnicodeEscape(uint32_t* value); |
99 | bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value); |
100 | |
101 | // Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or |
102 | // the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the |
103 | // result in the given out parameters. If the shorthand is used, |
104 | // nothing will be added to name_2. |
105 | bool ParsePropertyClassName(ZoneGrowableArray<char>* name_1, |
106 | ZoneGrowableArray<char>* name_2); |
107 | // Adds the specified unicode property to the provided character range. |
108 | bool AddPropertyClassRange(ZoneGrowableArray<CharacterRange>* add_to, |
109 | bool negate, |
110 | ZoneGrowableArray<char>* name_1, |
111 | ZoneGrowableArray<char>* name_2); |
112 | // Returns a regexp node that corresponds to one of these unicode |
113 | // property sequences: "Any", "ASCII", "Assigned". |
114 | RegExpTree* GetPropertySequence(ZoneGrowableArray<char>* name_1); |
115 | RegExpTree* ParseCharacterClass(const RegExpBuilder* builder); |
116 | |
117 | uint32_t ParseOctalLiteral(); |
118 | |
119 | // Tries to parse the input as a back reference. If successful it |
120 | // stores the result in the output parameter and returns true. If |
121 | // it fails it will push back the characters read so the same characters |
122 | // can be reparsed. |
123 | bool ParseBackReferenceIndex(intptr_t* index_out); |
124 | |
125 | // Attempts to parse a possible escape within a character class. |
126 | bool ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges, |
127 | bool add_unicode_case_equivalents, |
128 | uint32_t* char_out); |
129 | void ReportError(const char* message); |
130 | void Advance(); |
131 | void Advance(intptr_t dist); |
132 | void Reset(intptr_t pos); |
133 | |
134 | // Reports whether the pattern might be used as a literal search string. |
135 | // Only use if the result of the parse is a single atom node. |
136 | bool simple(); |
137 | bool contains_anchor() { return contains_anchor_; } |
138 | void set_contains_anchor() { contains_anchor_ = true; } |
139 | intptr_t captures_started() { return captures_started_; } |
140 | intptr_t position() { return next_pos_ - 1; } |
141 | bool is_unicode() const { return top_level_flags_.IsUnicode(); } |
142 | |
143 | static bool IsSyntaxCharacterOrSlash(uint32_t c); |
144 | |
145 | static const intptr_t kMaxCaptures = 1 << 16; |
146 | static const uint32_t kEndMarker = (1 << 21); |
147 | |
148 | private: |
149 | enum SubexpressionType { |
150 | INITIAL, |
151 | CAPTURE, // All positive values represent captures. |
152 | POSITIVE_LOOKAROUND, |
153 | NEGATIVE_LOOKAROUND, |
154 | GROUPING |
155 | }; |
156 | |
157 | class RegExpParserState : public ZoneAllocated { |
158 | public: |
159 | RegExpParserState(RegExpParserState* previous_state, |
160 | SubexpressionType group_type, |
161 | RegExpLookaround::Type lookaround_type, |
162 | intptr_t disjunction_capture_index, |
163 | const RegExpCaptureName* capture_name, |
164 | RegExpFlags flags, |
165 | Zone* zone) |
166 | : previous_state_(previous_state), |
167 | builder_(new (zone) RegExpBuilder(flags)), |
168 | group_type_(group_type), |
169 | lookaround_type_(lookaround_type), |
170 | disjunction_capture_index_(disjunction_capture_index), |
171 | capture_name_(capture_name) {} |
172 | // Parser state of containing expression, if any. |
173 | RegExpParserState* previous_state() { return previous_state_; } |
174 | bool IsSubexpression() { return previous_state_ != NULL; } |
175 | // RegExpBuilder building this regexp's AST. |
176 | RegExpBuilder* builder() { return builder_; } |
177 | // Type of regexp being parsed (parenthesized group or entire regexp). |
178 | SubexpressionType group_type() { return group_type_; } |
179 | // Lookahead or lookbehind. |
180 | RegExpLookaround::Type lookaround_type() { return lookaround_type_; } |
181 | // Index in captures array of first capture in this sub-expression, if any. |
182 | // Also the capture index of this sub-expression itself, if group_type |
183 | // is CAPTURE. |
184 | intptr_t capture_index() { return disjunction_capture_index_; } |
185 | const RegExpCaptureName* capture_name() const { return capture_name_; } |
186 | |
187 | bool IsNamedCapture() const { return capture_name_ != nullptr; } |
188 | |
189 | // Check whether the parser is inside a capture group with the given index. |
190 | bool IsInsideCaptureGroup(intptr_t index); |
191 | // Check whether the parser is inside a capture group with the given name. |
192 | bool IsInsideCaptureGroup(const RegExpCaptureName* name); |
193 | |
194 | private: |
195 | // Linked list implementation of stack of states. |
196 | RegExpParserState* previous_state_; |
197 | // Builder for the stored disjunction. |
198 | RegExpBuilder* builder_; |
199 | // Stored disjunction type (capture, look-ahead or grouping), if any. |
200 | SubexpressionType group_type_; |
201 | // Stored read direction. |
202 | const RegExpLookaround::Type lookaround_type_; |
203 | // Stored disjunction's capture index (if any). |
204 | intptr_t disjunction_capture_index_; |
205 | // Stored capture name (if any). |
206 | const RegExpCaptureName* const capture_name_; |
207 | }; |
208 | |
209 | // Return the 1-indexed RegExpCapture object, allocate if necessary. |
210 | RegExpCapture* GetCapture(intptr_t index); |
211 | |
212 | // Creates a new named capture at the specified index. Must be called exactly |
213 | // once for each named capture. Fails if a capture with the same name is |
214 | // encountered. |
215 | void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index); |
216 | |
217 | // Parses the name of a capture group (?<name>pattern). The name must adhere |
218 | // to IdentifierName in the ECMAScript standard. |
219 | const RegExpCaptureName* ParseCaptureGroupName(); |
220 | |
221 | bool ParseNamedBackReference(RegExpBuilder* builder, |
222 | RegExpParserState* state); |
223 | RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); |
224 | intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name); |
225 | |
226 | // After the initial parsing pass, patch corresponding RegExpCapture objects |
227 | // into all RegExpBackReferences. This is done after initial parsing in order |
228 | // to avoid complicating cases in which references come before the capture. |
229 | void PatchNamedBackReferences(); |
230 | |
231 | ArrayPtr CreateCaptureNameMap(); |
232 | |
233 | // Returns true iff the pattern contains named captures. May call |
234 | // ScanForCaptures to look ahead at the remaining pattern. |
235 | bool HasNamedCaptures(); |
236 | |
237 | Zone* zone() { return zone_; } |
238 | |
239 | uint32_t current() { return current_; } |
240 | bool has_more() { return has_more_; } |
241 | bool has_next() { return next_pos_ < in().Length(); } |
242 | uint32_t Next(); |
243 | uint32_t ReadNext(bool update_position); |
244 | const String& in() { return in_; } |
245 | void ScanForCaptures(); |
246 | |
247 | Zone* zone_; |
248 | ZoneGrowableArray<RegExpCapture*>* captures_; |
249 | ZoneGrowableArray<RegExpCapture*>* named_captures_; |
250 | ZoneGrowableArray<RegExpBackReference*>* named_back_references_; |
251 | const String& in_; |
252 | uint32_t current_; |
253 | intptr_t next_pos_; |
254 | intptr_t captures_started_; |
255 | // The capture count is only valid after we have scanned for captures. |
256 | intptr_t capture_count_; |
257 | bool has_more_; |
258 | RegExpFlags top_level_flags_; |
259 | bool simple_; |
260 | bool contains_anchor_; |
261 | bool is_scanned_for_captures_; |
262 | bool has_named_captures_; |
263 | }; |
264 | |
265 | } // namespace dart |
266 | |
267 | #endif // RUNTIME_VM_REGEXP_PARSER_H_ |
268 | |