1 | // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
2 | // for details. All rights reserved. Use of this source code is governed by a |
3 | // BSD-style license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RUNTIME_VM_REGEXP_AST_H_ |
6 | #define RUNTIME_VM_REGEXP_AST_H_ |
7 | |
8 | #include "platform/globals.h" |
9 | #include "platform/utils.h" |
10 | #include "vm/allocation.h" |
11 | #include "vm/regexp.h" |
12 | |
13 | namespace dart { |
14 | |
15 | class RegExpAlternative; |
16 | class RegExpAssertion; |
17 | class RegExpAtom; |
18 | class RegExpBackReference; |
19 | class RegExpCapture; |
20 | class RegExpCharacterClass; |
21 | class RegExpCompiler; |
22 | class RegExpDisjunction; |
23 | class RegExpEmpty; |
24 | class RegExpLookaround; |
25 | class RegExpQuantifier; |
26 | class RegExpText; |
27 | |
28 | class RegExpVisitor : public ValueObject { |
29 | public: |
30 | virtual ~RegExpVisitor() {} |
31 | #define MAKE_CASE(Name) \ |
32 | virtual void* Visit##Name(RegExp##Name*, void* data) = 0; |
33 | FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) |
34 | #undef MAKE_CASE |
35 | }; |
36 | |
37 | class RegExpTree : public ZoneAllocated { |
38 | public: |
39 | static const intptr_t kInfinity = kMaxInt32; |
40 | virtual ~RegExpTree() {} |
41 | virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; |
42 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, |
43 | RegExpNode* on_success) = 0; |
44 | virtual bool IsTextElement() const { return false; } |
45 | virtual bool IsAnchoredAtStart() const { return false; } |
46 | virtual bool IsAnchoredAtEnd() const { return false; } |
47 | virtual intptr_t min_match() const = 0; |
48 | virtual intptr_t max_match() const = 0; |
49 | // Returns the interval of registers used for captures within this |
50 | // expression. |
51 | virtual Interval CaptureRegisters() const { return Interval::Empty(); } |
52 | virtual void AppendToText(RegExpText* text); |
53 | void Print(); |
54 | #define MAKE_ASTYPE(Name) \ |
55 | virtual RegExp##Name* As##Name(); \ |
56 | virtual bool Is##Name() const; |
57 | FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) |
58 | #undef MAKE_ASTYPE |
59 | }; |
60 | |
61 | class RegExpDisjunction : public RegExpTree { |
62 | public: |
63 | explicit RegExpDisjunction(ZoneGrowableArray<RegExpTree*>* alternatives); |
64 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
65 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
66 | virtual RegExpDisjunction* AsDisjunction(); |
67 | virtual Interval CaptureRegisters() const; |
68 | virtual bool IsDisjunction() const; |
69 | virtual bool IsAnchoredAtStart() const; |
70 | virtual bool IsAnchoredAtEnd() const; |
71 | virtual intptr_t min_match() const { return min_match_; } |
72 | virtual intptr_t max_match() const { return max_match_; } |
73 | ZoneGrowableArray<RegExpTree*>* alternatives() const { return alternatives_; } |
74 | |
75 | private: |
76 | ZoneGrowableArray<RegExpTree*>* alternatives_; |
77 | intptr_t min_match_; |
78 | intptr_t max_match_; |
79 | }; |
80 | |
81 | class RegExpAlternative : public RegExpTree { |
82 | public: |
83 | explicit RegExpAlternative(ZoneGrowableArray<RegExpTree*>* nodes); |
84 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
85 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
86 | virtual RegExpAlternative* AsAlternative(); |
87 | virtual Interval CaptureRegisters() const; |
88 | virtual bool IsAlternative() const; |
89 | virtual bool IsAnchoredAtStart() const; |
90 | virtual bool IsAnchoredAtEnd() const; |
91 | virtual intptr_t min_match() const { return min_match_; } |
92 | virtual intptr_t max_match() const { return max_match_; } |
93 | ZoneGrowableArray<RegExpTree*>* nodes() const { return nodes_; } |
94 | |
95 | private: |
96 | ZoneGrowableArray<RegExpTree*>* nodes_; |
97 | intptr_t min_match_; |
98 | intptr_t max_match_; |
99 | }; |
100 | |
101 | class RegExpAssertion : public RegExpTree { |
102 | public: |
103 | enum AssertionType { |
104 | START_OF_LINE, |
105 | START_OF_INPUT, |
106 | END_OF_LINE, |
107 | END_OF_INPUT, |
108 | BOUNDARY, |
109 | NON_BOUNDARY |
110 | }; |
111 | RegExpAssertion(AssertionType type, RegExpFlags flags) |
112 | : assertion_type_(type), flags_(flags) {} |
113 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
114 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
115 | virtual RegExpAssertion* AsAssertion(); |
116 | virtual bool IsAssertion() const; |
117 | virtual bool IsAnchoredAtStart() const; |
118 | virtual bool IsAnchoredAtEnd() const; |
119 | virtual intptr_t min_match() const { return 0; } |
120 | virtual intptr_t max_match() const { return 0; } |
121 | AssertionType assertion_type() const { return assertion_type_; } |
122 | |
123 | private: |
124 | AssertionType assertion_type_; |
125 | RegExpFlags flags_; |
126 | }; |
127 | |
128 | class CharacterSet : public ValueObject { |
129 | public: |
130 | explicit CharacterSet(uint16_t standard_set_type) |
131 | : ranges_(NULL), standard_set_type_(standard_set_type) {} |
132 | explicit CharacterSet(ZoneGrowableArray<CharacterRange>* ranges) |
133 | : ranges_(ranges), standard_set_type_(0) {} |
134 | CharacterSet(const CharacterSet& that) |
135 | : ValueObject(), |
136 | ranges_(that.ranges_), |
137 | standard_set_type_(that.standard_set_type_) {} |
138 | ZoneGrowableArray<CharacterRange>* ranges(); |
139 | uint16_t standard_set_type() const { return standard_set_type_; } |
140 | void set_standard_set_type(uint16_t special_set_type) { |
141 | standard_set_type_ = special_set_type; |
142 | } |
143 | bool is_standard() { return standard_set_type_ != 0; } |
144 | void Canonicalize(); |
145 | |
146 | private: |
147 | ZoneGrowableArray<CharacterRange>* ranges_; |
148 | // If non-zero, the value represents a standard set (e.g., all whitespace |
149 | // characters) without having to expand the ranges. |
150 | uint16_t standard_set_type_; |
151 | }; |
152 | |
153 | class RegExpCharacterClass : public RegExpTree { |
154 | public: |
155 | enum Flag { |
156 | // The character class is negated and should match everything but the |
157 | // specified ranges. |
158 | NEGATED = 1 << 0, |
159 | // The character class contains part of a split surrogate and should not |
160 | // be unicode-desugared. |
161 | CONTAINS_SPLIT_SURROGATE = 1 << 1, |
162 | }; |
163 | using CharacterClassFlags = intptr_t; |
164 | static inline CharacterClassFlags DefaultFlags() { return 0; } |
165 | |
166 | RegExpCharacterClass( |
167 | ZoneGrowableArray<CharacterRange>* ranges, |
168 | RegExpFlags flags, |
169 | CharacterClassFlags character_class_flags = DefaultFlags()) |
170 | : set_(ranges), |
171 | flags_(flags), |
172 | character_class_flags_(character_class_flags) { |
173 | // Convert the empty set of ranges to the negated Everything() range. |
174 | if (ranges->is_empty()) { |
175 | ranges->Add(CharacterRange::Everything()); |
176 | character_class_flags_ ^= NEGATED; |
177 | } |
178 | } |
179 | RegExpCharacterClass(uint16_t type, RegExpFlags flags) |
180 | : set_(type), flags_(flags), character_class_flags_(0) {} |
181 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
182 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
183 | virtual RegExpCharacterClass* AsCharacterClass(); |
184 | virtual bool IsCharacterClass() const; |
185 | virtual bool IsTextElement() const { return true; } |
186 | virtual intptr_t min_match() const { return 1; } |
187 | // The character class may match two code units for unicode regexps. |
188 | virtual intptr_t max_match() const { return 2; } |
189 | virtual void AppendToText(RegExpText* text); |
190 | CharacterSet character_set() const { return set_; } |
191 | // TODO(lrn): Remove need for complex version if is_standard that |
192 | // recognizes a mangled standard set and just do { return set_.is_special(); } |
193 | bool is_standard(); |
194 | // Returns a value representing the standard character set if is_standard() |
195 | // returns true. |
196 | // Currently used values are: |
197 | // s : unicode whitespace |
198 | // S : unicode non-whitespace |
199 | // w : ASCII word character (digit, letter, underscore) |
200 | // W : non-ASCII word character |
201 | // d : ASCII digit |
202 | // D : non-ASCII digit |
203 | // . : non-unicode non-newline |
204 | // * : All characters |
205 | uint16_t standard_type() const { return set_.standard_set_type(); } |
206 | ZoneGrowableArray<CharacterRange>* ranges() { return set_.ranges(); } |
207 | bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } |
208 | RegExpFlags flags() const { return flags_; } |
209 | bool contains_split_surrogate() const { |
210 | return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; |
211 | } |
212 | |
213 | private: |
214 | CharacterSet set_; |
215 | RegExpFlags flags_; |
216 | CharacterClassFlags character_class_flags_; |
217 | }; |
218 | |
219 | class RegExpAtom : public RegExpTree { |
220 | public: |
221 | RegExpAtom(ZoneGrowableArray<uint16_t>* data, RegExpFlags flags) |
222 | : data_(data), flags_(flags) {} |
223 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
224 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
225 | virtual RegExpAtom* AsAtom(); |
226 | virtual bool IsAtom() const; |
227 | virtual bool IsTextElement() const { return true; } |
228 | virtual intptr_t min_match() const { return data_->length(); } |
229 | virtual intptr_t max_match() const { return data_->length(); } |
230 | virtual void AppendToText(RegExpText* text); |
231 | ZoneGrowableArray<uint16_t>* data() const { return data_; } |
232 | intptr_t length() const { return data_->length(); } |
233 | RegExpFlags flags() const { return flags_; } |
234 | bool ignore_case() const { return flags_.IgnoreCase(); } |
235 | |
236 | private: |
237 | ZoneGrowableArray<uint16_t>* data_; |
238 | const RegExpFlags flags_; |
239 | }; |
240 | |
241 | class RegExpText : public RegExpTree { |
242 | public: |
243 | RegExpText() : elements_(2), length_(0) {} |
244 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
245 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
246 | virtual RegExpText* AsText(); |
247 | virtual bool IsText() const; |
248 | virtual bool IsTextElement() const { return true; } |
249 | virtual intptr_t min_match() const { return length_; } |
250 | virtual intptr_t max_match() const { return length_; } |
251 | virtual void AppendToText(RegExpText* text); |
252 | void AddElement(TextElement elm) { |
253 | elements_.Add(elm); |
254 | length_ += elm.length(); |
255 | } |
256 | GrowableArray<TextElement>* elements() { return &elements_; } |
257 | |
258 | private: |
259 | GrowableArray<TextElement> elements_; |
260 | intptr_t length_; |
261 | }; |
262 | |
263 | class RegExpQuantifier : public RegExpTree { |
264 | public: |
265 | enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; |
266 | RegExpQuantifier(intptr_t min, |
267 | intptr_t max, |
268 | QuantifierType type, |
269 | RegExpTree* body) |
270 | : body_(body), |
271 | min_(min), |
272 | max_(max), |
273 | min_match_(min * body->min_match()), |
274 | quantifier_type_(type) { |
275 | if (max > 0 && body->max_match() > kInfinity / max) { |
276 | max_match_ = kInfinity; |
277 | } else { |
278 | max_match_ = max * body->max_match(); |
279 | } |
280 | } |
281 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
282 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
283 | static RegExpNode* ToNode(intptr_t min, |
284 | intptr_t max, |
285 | bool is_greedy, |
286 | RegExpTree* body, |
287 | RegExpCompiler* compiler, |
288 | RegExpNode* on_success, |
289 | bool not_at_start = false); |
290 | virtual RegExpQuantifier* AsQuantifier(); |
291 | virtual Interval CaptureRegisters() const; |
292 | virtual bool IsQuantifier() const; |
293 | virtual intptr_t min_match() const { return min_match_; } |
294 | virtual intptr_t max_match() const { return max_match_; } |
295 | intptr_t min() const { return min_; } |
296 | intptr_t max() const { return max_; } |
297 | bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } |
298 | bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } |
299 | bool is_greedy() const { return quantifier_type_ == GREEDY; } |
300 | RegExpTree* body() const { return body_; } |
301 | |
302 | private: |
303 | RegExpTree* body_; |
304 | intptr_t min_; |
305 | intptr_t max_; |
306 | intptr_t min_match_; |
307 | intptr_t max_match_; |
308 | QuantifierType quantifier_type_; |
309 | }; |
310 | |
311 | class RegExpCapture : public RegExpTree { |
312 | public: |
313 | explicit RegExpCapture(intptr_t index) |
314 | : body_(nullptr), index_(index), name_(nullptr) {} |
315 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
316 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
317 | static RegExpNode* ToNode(RegExpTree* body, |
318 | intptr_t index, |
319 | RegExpCompiler* compiler, |
320 | RegExpNode* on_success); |
321 | virtual RegExpCapture* AsCapture(); |
322 | virtual bool IsAnchoredAtStart() const; |
323 | virtual bool IsAnchoredAtEnd() const; |
324 | virtual Interval CaptureRegisters() const; |
325 | virtual bool IsCapture() const; |
326 | virtual intptr_t min_match() const { return body_->min_match(); } |
327 | virtual intptr_t max_match() const { return body_->max_match(); } |
328 | RegExpTree* body() const { return body_; } |
329 | // When a backreference is parsed before the corresponding capture group, |
330 | // which can happen because of lookbehind, we create the capture object when |
331 | // we create the backreference, and fill in the body later when the actual |
332 | // capture group is parsed. |
333 | void set_body(RegExpTree* body) { body_ = body; } |
334 | intptr_t index() const { return index_; } |
335 | const ZoneGrowableArray<uint16_t>* name() { return name_; } |
336 | void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; } |
337 | static intptr_t StartRegister(intptr_t index) { return index * 2; } |
338 | static intptr_t EndRegister(intptr_t index) { return index * 2 + 1; } |
339 | |
340 | private: |
341 | RegExpTree* body_; |
342 | intptr_t index_; |
343 | const ZoneGrowableArray<uint16_t>* name_; |
344 | }; |
345 | |
346 | class RegExpLookaround : public RegExpTree { |
347 | public: |
348 | enum Type { LOOKAHEAD, LOOKBEHIND }; |
349 | RegExpLookaround(RegExpTree* body, |
350 | bool is_positive, |
351 | intptr_t capture_count, |
352 | intptr_t capture_from, |
353 | Type type) |
354 | : body_(body), |
355 | is_positive_(is_positive), |
356 | capture_count_(capture_count), |
357 | capture_from_(capture_from), |
358 | type_(type) {} |
359 | |
360 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
361 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
362 | virtual RegExpLookaround* AsLookaround(); |
363 | virtual Interval CaptureRegisters() const; |
364 | virtual bool IsLookaround() const; |
365 | virtual bool IsAnchoredAtStart() const; |
366 | virtual intptr_t min_match() const { return 0; } |
367 | virtual intptr_t max_match() const { return 0; } |
368 | RegExpTree* body() const { return body_; } |
369 | bool is_positive() const { return is_positive_; } |
370 | intptr_t capture_count() const { return capture_count_; } |
371 | intptr_t capture_from() const { return capture_from_; } |
372 | Type type() const { return type_; } |
373 | |
374 | // The RegExpLookaround::Builder class abstracts out the process of building |
375 | // the compiling a RegExpLookaround object by splitting it into two phases, |
376 | // represented by the provided methods. |
377 | class Builder : public ValueObject { |
378 | public: |
379 | Builder(bool is_positive, |
380 | RegExpNode* on_success, |
381 | intptr_t stack_pointer_register, |
382 | intptr_t position_register, |
383 | intptr_t capture_register_count = 0, |
384 | intptr_t capture_register_start = 0); |
385 | RegExpNode* on_match_success() { return on_match_success_; } |
386 | RegExpNode* ForMatch(RegExpNode* match); |
387 | |
388 | private: |
389 | bool is_positive_; |
390 | RegExpNode* on_match_success_; |
391 | RegExpNode* on_success_; |
392 | intptr_t stack_pointer_register_; |
393 | intptr_t position_register_; |
394 | }; |
395 | |
396 | private: |
397 | RegExpTree* body_; |
398 | bool is_positive_; |
399 | intptr_t capture_count_; |
400 | intptr_t capture_from_; |
401 | Type type_; |
402 | }; |
403 | |
404 | class RegExpBackReference : public RegExpTree { |
405 | public: |
406 | explicit RegExpBackReference(RegExpFlags flags) |
407 | : capture_(nullptr), name_(nullptr), flags_(flags) {} |
408 | RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) |
409 | : capture_(capture), name_(nullptr), flags_(flags) {} |
410 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
411 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
412 | virtual RegExpBackReference* AsBackReference(); |
413 | virtual bool IsBackReference() const; |
414 | virtual intptr_t min_match() const { return 0; } |
415 | // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite |
416 | // recursion, we give up and just assume arbitrary length, which matches v8's |
417 | // behavior. |
418 | virtual intptr_t max_match() const { return kInfinity; } |
419 | intptr_t index() const { return capture_->index(); } |
420 | RegExpCapture* capture() const { return capture_; } |
421 | void set_capture(RegExpCapture* capture) { capture_ = capture; } |
422 | const ZoneGrowableArray<uint16_t>* name() { return name_; } |
423 | void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; } |
424 | |
425 | private: |
426 | RegExpCapture* capture_; |
427 | const ZoneGrowableArray<uint16_t>* name_; |
428 | RegExpFlags flags_; |
429 | }; |
430 | |
431 | class RegExpEmpty : public RegExpTree { |
432 | public: |
433 | RegExpEmpty() {} |
434 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
435 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
436 | virtual RegExpEmpty* AsEmpty(); |
437 | virtual bool IsEmpty() const; |
438 | virtual intptr_t min_match() const { return 0; } |
439 | virtual intptr_t max_match() const { return 0; } |
440 | static RegExpEmpty* GetInstance() { |
441 | static RegExpEmpty* instance = ::new RegExpEmpty(); |
442 | return instance; |
443 | } |
444 | }; |
445 | |
446 | } // namespace dart |
447 | |
448 | #endif // RUNTIME_VM_REGEXP_AST_H_ |
449 | |