1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#ifndef RUNTIME_VM_REGEXP_PARSER_H_
6#define RUNTIME_VM_REGEXP_PARSER_H_
7
8#include "vm/allocation.h"
9#include "vm/growable_array.h"
10#include "vm/regexp_ast.h"
11
12namespace dart {
13
14// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
15class RegExpBuilder : public ZoneAllocated {
16 public:
17 explicit RegExpBuilder(RegExpFlags flags);
18
19 void AddCharacter(uint16_t character);
20 void AddUnicodeCharacter(uint32_t character);
21 void AddEscapedUnicodeCharacter(uint32_t character);
22 // "Adds" an empty expression. Does nothing except consume a
23 // following quantifier
24 void AddEmpty();
25 void AddCharacterClass(RegExpCharacterClass* cc);
26 void AddCharacterClassForDesugaring(uint32_t c);
27 void AddAtom(RegExpTree* tree);
28 void AddTerm(RegExpTree* tree);
29 void AddAssertion(RegExpTree* tree);
30 void NewAlternative(); // '|'
31 // Attempt to add a quantifier to the last atom added. The return value
32 // denotes whether the attempt succeeded, since some atoms like lookbehind
33 // cannot be quantified.
34 bool AddQuantifierToAtom(intptr_t min,
35 intptr_t max,
36 RegExpQuantifier::QuantifierType type);
37 RegExpTree* ToRegExp();
38 RegExpFlags flags() const { return flags_; }
39 bool ignore_case() const { return flags_.IgnoreCase(); }
40 bool is_multi_line() const { return flags_.IsMultiLine(); }
41 bool is_dot_all() const { return flags_.IsDotAll(); }
42
43 private:
44 static const uint16_t kNoPendingSurrogate = 0;
45 void AddLeadSurrogate(uint16_t lead_surrogate);
46 void AddTrailSurrogate(uint16_t trail_surrogate);
47 void FlushPendingSurrogate();
48 void FlushCharacters();
49 void FlushText();
50 void FlushTerms();
51 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
52 bool NeedsDesugaringForIgnoreCase(uint32_t c);
53
54 Zone* zone() const { return zone_; }
55 bool is_unicode() const { return flags_.IsUnicode(); }
56
57 Zone* zone_;
58 bool pending_empty_;
59 RegExpFlags flags_;
60 ZoneGrowableArray<uint16_t>* characters_;
61 uint16_t pending_surrogate_;
62 GrowableArray<RegExpTree*> terms_;
63 GrowableArray<RegExpTree*> text_;
64 GrowableArray<RegExpTree*> alternatives_;
65#ifdef DEBUG
66 enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
67#define LAST(x) last_added_ = x;
68#else
69#define LAST(x)
70#endif
71};
72
73using RegExpCaptureName = ZoneGrowableArray<uint16_t>;
74
75class RegExpParser : public ValueObject {
76 public:
77 RegExpParser(const String& in, String* error, RegExpFlags regexp_flags);
78
79 static void ParseRegExp(const String& input,
80 RegExpFlags regexp_flags,
81 RegExpCompileData* result);
82
83 RegExpTree* ParsePattern();
84 RegExpTree* ParseDisjunction();
85 RegExpTree* ParseGroup();
86
87 // Parses a {...,...} quantifier and stores the range in the given
88 // out parameters.
89 bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out);
90
91 // Parses and returns a single escaped character. The character
92 // must not be 'b' or 'B' since they are usually handle specially.
93 uint32_t ParseClassCharacterEscape();
94
95 // Checks whether the following is a length-digit hexadecimal number,
96 // and sets the value if it is.
97 bool ParseHexEscape(intptr_t length, uint32_t* value);
98 bool ParseUnicodeEscape(uint32_t* value);
99 bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value);
100
101 // Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or
102 // the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the
103 // result in the given out parameters. If the shorthand is used,
104 // nothing will be added to name_2.
105 bool ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
106 ZoneGrowableArray<char>* name_2);
107 // Adds the specified unicode property to the provided character range.
108 bool AddPropertyClassRange(ZoneGrowableArray<CharacterRange>* add_to,
109 bool negate,
110 ZoneGrowableArray<char>* name_1,
111 ZoneGrowableArray<char>* name_2);
112 // Returns a regexp node that corresponds to one of these unicode
113 // property sequences: "Any", "ASCII", "Assigned".
114 RegExpTree* GetPropertySequence(ZoneGrowableArray<char>* name_1);
115 RegExpTree* ParseCharacterClass(const RegExpBuilder* builder);
116
117 uint32_t ParseOctalLiteral();
118
119 // Tries to parse the input as a back reference. If successful it
120 // stores the result in the output parameter and returns true. If
121 // it fails it will push back the characters read so the same characters
122 // can be reparsed.
123 bool ParseBackReferenceIndex(intptr_t* index_out);
124
125 // Attempts to parse a possible escape within a character class.
126 bool ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
127 bool add_unicode_case_equivalents,
128 uint32_t* char_out);
129 void ReportError(const char* message);
130 void Advance();
131 void Advance(intptr_t dist);
132 void Reset(intptr_t pos);
133
134 // Reports whether the pattern might be used as a literal search string.
135 // Only use if the result of the parse is a single atom node.
136 bool simple();
137 bool contains_anchor() { return contains_anchor_; }
138 void set_contains_anchor() { contains_anchor_ = true; }
139 intptr_t captures_started() { return captures_started_; }
140 intptr_t position() { return next_pos_ - 1; }
141 bool is_unicode() const { return top_level_flags_.IsUnicode(); }
142
143 static bool IsSyntaxCharacterOrSlash(uint32_t c);
144
145 static const intptr_t kMaxCaptures = 1 << 16;
146 static const uint32_t kEndMarker = (1 << 21);
147
148 private:
149 enum SubexpressionType {
150 INITIAL,
151 CAPTURE, // All positive values represent captures.
152 POSITIVE_LOOKAROUND,
153 NEGATIVE_LOOKAROUND,
154 GROUPING
155 };
156
157 class RegExpParserState : public ZoneAllocated {
158 public:
159 RegExpParserState(RegExpParserState* previous_state,
160 SubexpressionType group_type,
161 RegExpLookaround::Type lookaround_type,
162 intptr_t disjunction_capture_index,
163 const RegExpCaptureName* capture_name,
164 RegExpFlags flags,
165 Zone* zone)
166 : previous_state_(previous_state),
167 builder_(new (zone) RegExpBuilder(flags)),
168 group_type_(group_type),
169 lookaround_type_(lookaround_type),
170 disjunction_capture_index_(disjunction_capture_index),
171 capture_name_(capture_name) {}
172 // Parser state of containing expression, if any.
173 RegExpParserState* previous_state() { return previous_state_; }
174 bool IsSubexpression() { return previous_state_ != NULL; }
175 // RegExpBuilder building this regexp's AST.
176 RegExpBuilder* builder() { return builder_; }
177 // Type of regexp being parsed (parenthesized group or entire regexp).
178 SubexpressionType group_type() { return group_type_; }
179 // Lookahead or lookbehind.
180 RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
181 // Index in captures array of first capture in this sub-expression, if any.
182 // Also the capture index of this sub-expression itself, if group_type
183 // is CAPTURE.
184 intptr_t capture_index() { return disjunction_capture_index_; }
185 const RegExpCaptureName* capture_name() const { return capture_name_; }
186
187 bool IsNamedCapture() const { return capture_name_ != nullptr; }
188
189 // Check whether the parser is inside a capture group with the given index.
190 bool IsInsideCaptureGroup(intptr_t index);
191 // Check whether the parser is inside a capture group with the given name.
192 bool IsInsideCaptureGroup(const RegExpCaptureName* name);
193
194 private:
195 // Linked list implementation of stack of states.
196 RegExpParserState* previous_state_;
197 // Builder for the stored disjunction.
198 RegExpBuilder* builder_;
199 // Stored disjunction type (capture, look-ahead or grouping), if any.
200 SubexpressionType group_type_;
201 // Stored read direction.
202 const RegExpLookaround::Type lookaround_type_;
203 // Stored disjunction's capture index (if any).
204 intptr_t disjunction_capture_index_;
205 // Stored capture name (if any).
206 const RegExpCaptureName* const capture_name_;
207 };
208
209 // Return the 1-indexed RegExpCapture object, allocate if necessary.
210 RegExpCapture* GetCapture(intptr_t index);
211
212 // Creates a new named capture at the specified index. Must be called exactly
213 // once for each named capture. Fails if a capture with the same name is
214 // encountered.
215 void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index);
216
217 // Parses the name of a capture group (?<name>pattern). The name must adhere
218 // to IdentifierName in the ECMAScript standard.
219 const RegExpCaptureName* ParseCaptureGroupName();
220
221 bool ParseNamedBackReference(RegExpBuilder* builder,
222 RegExpParserState* state);
223 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
224 intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name);
225
226 // After the initial parsing pass, patch corresponding RegExpCapture objects
227 // into all RegExpBackReferences. This is done after initial parsing in order
228 // to avoid complicating cases in which references come before the capture.
229 void PatchNamedBackReferences();
230
231 ArrayPtr CreateCaptureNameMap();
232
233 // Returns true iff the pattern contains named captures. May call
234 // ScanForCaptures to look ahead at the remaining pattern.
235 bool HasNamedCaptures();
236
237 Zone* zone() { return zone_; }
238
239 uint32_t current() { return current_; }
240 bool has_more() { return has_more_; }
241 bool has_next() { return next_pos_ < in().Length(); }
242 uint32_t Next();
243 uint32_t ReadNext(bool update_position);
244 const String& in() { return in_; }
245 void ScanForCaptures();
246
247 Zone* zone_;
248 ZoneGrowableArray<RegExpCapture*>* captures_;
249 ZoneGrowableArray<RegExpCapture*>* named_captures_;
250 ZoneGrowableArray<RegExpBackReference*>* named_back_references_;
251 const String& in_;
252 uint32_t current_;
253 intptr_t next_pos_;
254 intptr_t captures_started_;
255 // The capture count is only valid after we have scanned for captures.
256 intptr_t capture_count_;
257 bool has_more_;
258 RegExpFlags top_level_flags_;
259 bool simple_;
260 bool contains_anchor_;
261 bool is_scanned_for_captures_;
262 bool has_named_captures_;
263};
264
265} // namespace dart
266
267#endif // RUNTIME_VM_REGEXP_PARSER_H_
268