1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#ifndef RUNTIME_VM_REGEXP_AST_H_
6#define RUNTIME_VM_REGEXP_AST_H_
7
8#include "platform/globals.h"
9#include "platform/utils.h"
10#include "vm/allocation.h"
11#include "vm/regexp.h"
12
13namespace dart {
14
15class RegExpAlternative;
16class RegExpAssertion;
17class RegExpAtom;
18class RegExpBackReference;
19class RegExpCapture;
20class RegExpCharacterClass;
21class RegExpCompiler;
22class RegExpDisjunction;
23class RegExpEmpty;
24class RegExpLookaround;
25class RegExpQuantifier;
26class RegExpText;
27
28class RegExpVisitor : public ValueObject {
29 public:
30 virtual ~RegExpVisitor() {}
31#define MAKE_CASE(Name) \
32 virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
33 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
34#undef MAKE_CASE
35};
36
37class RegExpTree : public ZoneAllocated {
38 public:
39 static const intptr_t kInfinity = kMaxInt32;
40 virtual ~RegExpTree() {}
41 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
42 virtual RegExpNode* ToNode(RegExpCompiler* compiler,
43 RegExpNode* on_success) = 0;
44 virtual bool IsTextElement() const { return false; }
45 virtual bool IsAnchoredAtStart() const { return false; }
46 virtual bool IsAnchoredAtEnd() const { return false; }
47 virtual intptr_t min_match() const = 0;
48 virtual intptr_t max_match() const = 0;
49 // Returns the interval of registers used for captures within this
50 // expression.
51 virtual Interval CaptureRegisters() const { return Interval::Empty(); }
52 virtual void AppendToText(RegExpText* text);
53 void Print();
54#define MAKE_ASTYPE(Name) \
55 virtual RegExp##Name* As##Name(); \
56 virtual bool Is##Name() const;
57 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
58#undef MAKE_ASTYPE
59};
60
61class RegExpDisjunction : public RegExpTree {
62 public:
63 explicit RegExpDisjunction(ZoneGrowableArray<RegExpTree*>* alternatives);
64 virtual void* Accept(RegExpVisitor* visitor, void* data);
65 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
66 virtual RegExpDisjunction* AsDisjunction();
67 virtual Interval CaptureRegisters() const;
68 virtual bool IsDisjunction() const;
69 virtual bool IsAnchoredAtStart() const;
70 virtual bool IsAnchoredAtEnd() const;
71 virtual intptr_t min_match() const { return min_match_; }
72 virtual intptr_t max_match() const { return max_match_; }
73 ZoneGrowableArray<RegExpTree*>* alternatives() const { return alternatives_; }
74
75 private:
76 ZoneGrowableArray<RegExpTree*>* alternatives_;
77 intptr_t min_match_;
78 intptr_t max_match_;
79};
80
81class RegExpAlternative : public RegExpTree {
82 public:
83 explicit RegExpAlternative(ZoneGrowableArray<RegExpTree*>* nodes);
84 virtual void* Accept(RegExpVisitor* visitor, void* data);
85 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
86 virtual RegExpAlternative* AsAlternative();
87 virtual Interval CaptureRegisters() const;
88 virtual bool IsAlternative() const;
89 virtual bool IsAnchoredAtStart() const;
90 virtual bool IsAnchoredAtEnd() const;
91 virtual intptr_t min_match() const { return min_match_; }
92 virtual intptr_t max_match() const { return max_match_; }
93 ZoneGrowableArray<RegExpTree*>* nodes() const { return nodes_; }
94
95 private:
96 ZoneGrowableArray<RegExpTree*>* nodes_;
97 intptr_t min_match_;
98 intptr_t max_match_;
99};
100
101class RegExpAssertion : public RegExpTree {
102 public:
103 enum AssertionType {
104 START_OF_LINE,
105 START_OF_INPUT,
106 END_OF_LINE,
107 END_OF_INPUT,
108 BOUNDARY,
109 NON_BOUNDARY
110 };
111 RegExpAssertion(AssertionType type, RegExpFlags flags)
112 : assertion_type_(type), flags_(flags) {}
113 virtual void* Accept(RegExpVisitor* visitor, void* data);
114 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
115 virtual RegExpAssertion* AsAssertion();
116 virtual bool IsAssertion() const;
117 virtual bool IsAnchoredAtStart() const;
118 virtual bool IsAnchoredAtEnd() const;
119 virtual intptr_t min_match() const { return 0; }
120 virtual intptr_t max_match() const { return 0; }
121 AssertionType assertion_type() const { return assertion_type_; }
122
123 private:
124 AssertionType assertion_type_;
125 RegExpFlags flags_;
126};
127
128class CharacterSet : public ValueObject {
129 public:
130 explicit CharacterSet(uint16_t standard_set_type)
131 : ranges_(NULL), standard_set_type_(standard_set_type) {}
132 explicit CharacterSet(ZoneGrowableArray<CharacterRange>* ranges)
133 : ranges_(ranges), standard_set_type_(0) {}
134 CharacterSet(const CharacterSet& that)
135 : ValueObject(),
136 ranges_(that.ranges_),
137 standard_set_type_(that.standard_set_type_) {}
138 ZoneGrowableArray<CharacterRange>* ranges();
139 uint16_t standard_set_type() const { return standard_set_type_; }
140 void set_standard_set_type(uint16_t special_set_type) {
141 standard_set_type_ = special_set_type;
142 }
143 bool is_standard() { return standard_set_type_ != 0; }
144 void Canonicalize();
145
146 private:
147 ZoneGrowableArray<CharacterRange>* ranges_;
148 // If non-zero, the value represents a standard set (e.g., all whitespace
149 // characters) without having to expand the ranges.
150 uint16_t standard_set_type_;
151};
152
153class RegExpCharacterClass : public RegExpTree {
154 public:
155 enum Flag {
156 // The character class is negated and should match everything but the
157 // specified ranges.
158 NEGATED = 1 << 0,
159 // The character class contains part of a split surrogate and should not
160 // be unicode-desugared.
161 CONTAINS_SPLIT_SURROGATE = 1 << 1,
162 };
163 using CharacterClassFlags = intptr_t;
164 static inline CharacterClassFlags DefaultFlags() { return 0; }
165
166 RegExpCharacterClass(
167 ZoneGrowableArray<CharacterRange>* ranges,
168 RegExpFlags flags,
169 CharacterClassFlags character_class_flags = DefaultFlags())
170 : set_(ranges),
171 flags_(flags),
172 character_class_flags_(character_class_flags) {
173 // Convert the empty set of ranges to the negated Everything() range.
174 if (ranges->is_empty()) {
175 ranges->Add(CharacterRange::Everything());
176 character_class_flags_ ^= NEGATED;
177 }
178 }
179 RegExpCharacterClass(uint16_t type, RegExpFlags flags)
180 : set_(type), flags_(flags), character_class_flags_(0) {}
181 virtual void* Accept(RegExpVisitor* visitor, void* data);
182 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
183 virtual RegExpCharacterClass* AsCharacterClass();
184 virtual bool IsCharacterClass() const;
185 virtual bool IsTextElement() const { return true; }
186 virtual intptr_t min_match() const { return 1; }
187 // The character class may match two code units for unicode regexps.
188 virtual intptr_t max_match() const { return 2; }
189 virtual void AppendToText(RegExpText* text);
190 CharacterSet character_set() const { return set_; }
191 // TODO(lrn): Remove need for complex version if is_standard that
192 // recognizes a mangled standard set and just do { return set_.is_special(); }
193 bool is_standard();
194 // Returns a value representing the standard character set if is_standard()
195 // returns true.
196 // Currently used values are:
197 // s : unicode whitespace
198 // S : unicode non-whitespace
199 // w : ASCII word character (digit, letter, underscore)
200 // W : non-ASCII word character
201 // d : ASCII digit
202 // D : non-ASCII digit
203 // . : non-unicode non-newline
204 // * : All characters
205 uint16_t standard_type() const { return set_.standard_set_type(); }
206 ZoneGrowableArray<CharacterRange>* ranges() { return set_.ranges(); }
207 bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
208 RegExpFlags flags() const { return flags_; }
209 bool contains_split_surrogate() const {
210 return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
211 }
212
213 private:
214 CharacterSet set_;
215 RegExpFlags flags_;
216 CharacterClassFlags character_class_flags_;
217};
218
219class RegExpAtom : public RegExpTree {
220 public:
221 RegExpAtom(ZoneGrowableArray<uint16_t>* data, RegExpFlags flags)
222 : data_(data), flags_(flags) {}
223 virtual void* Accept(RegExpVisitor* visitor, void* data);
224 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
225 virtual RegExpAtom* AsAtom();
226 virtual bool IsAtom() const;
227 virtual bool IsTextElement() const { return true; }
228 virtual intptr_t min_match() const { return data_->length(); }
229 virtual intptr_t max_match() const { return data_->length(); }
230 virtual void AppendToText(RegExpText* text);
231 ZoneGrowableArray<uint16_t>* data() const { return data_; }
232 intptr_t length() const { return data_->length(); }
233 RegExpFlags flags() const { return flags_; }
234 bool ignore_case() const { return flags_.IgnoreCase(); }
235
236 private:
237 ZoneGrowableArray<uint16_t>* data_;
238 const RegExpFlags flags_;
239};
240
241class RegExpText : public RegExpTree {
242 public:
243 RegExpText() : elements_(2), length_(0) {}
244 virtual void* Accept(RegExpVisitor* visitor, void* data);
245 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
246 virtual RegExpText* AsText();
247 virtual bool IsText() const;
248 virtual bool IsTextElement() const { return true; }
249 virtual intptr_t min_match() const { return length_; }
250 virtual intptr_t max_match() const { return length_; }
251 virtual void AppendToText(RegExpText* text);
252 void AddElement(TextElement elm) {
253 elements_.Add(elm);
254 length_ += elm.length();
255 }
256 GrowableArray<TextElement>* elements() { return &elements_; }
257
258 private:
259 GrowableArray<TextElement> elements_;
260 intptr_t length_;
261};
262
263class RegExpQuantifier : public RegExpTree {
264 public:
265 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
266 RegExpQuantifier(intptr_t min,
267 intptr_t max,
268 QuantifierType type,
269 RegExpTree* body)
270 : body_(body),
271 min_(min),
272 max_(max),
273 min_match_(min * body->min_match()),
274 quantifier_type_(type) {
275 if (max > 0 && body->max_match() > kInfinity / max) {
276 max_match_ = kInfinity;
277 } else {
278 max_match_ = max * body->max_match();
279 }
280 }
281 virtual void* Accept(RegExpVisitor* visitor, void* data);
282 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
283 static RegExpNode* ToNode(intptr_t min,
284 intptr_t max,
285 bool is_greedy,
286 RegExpTree* body,
287 RegExpCompiler* compiler,
288 RegExpNode* on_success,
289 bool not_at_start = false);
290 virtual RegExpQuantifier* AsQuantifier();
291 virtual Interval CaptureRegisters() const;
292 virtual bool IsQuantifier() const;
293 virtual intptr_t min_match() const { return min_match_; }
294 virtual intptr_t max_match() const { return max_match_; }
295 intptr_t min() const { return min_; }
296 intptr_t max() const { return max_; }
297 bool is_possessive() const { return quantifier_type_ == POSSESSIVE; }
298 bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; }
299 bool is_greedy() const { return quantifier_type_ == GREEDY; }
300 RegExpTree* body() const { return body_; }
301
302 private:
303 RegExpTree* body_;
304 intptr_t min_;
305 intptr_t max_;
306 intptr_t min_match_;
307 intptr_t max_match_;
308 QuantifierType quantifier_type_;
309};
310
311class RegExpCapture : public RegExpTree {
312 public:
313 explicit RegExpCapture(intptr_t index)
314 : body_(nullptr), index_(index), name_(nullptr) {}
315 virtual void* Accept(RegExpVisitor* visitor, void* data);
316 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
317 static RegExpNode* ToNode(RegExpTree* body,
318 intptr_t index,
319 RegExpCompiler* compiler,
320 RegExpNode* on_success);
321 virtual RegExpCapture* AsCapture();
322 virtual bool IsAnchoredAtStart() const;
323 virtual bool IsAnchoredAtEnd() const;
324 virtual Interval CaptureRegisters() const;
325 virtual bool IsCapture() const;
326 virtual intptr_t min_match() const { return body_->min_match(); }
327 virtual intptr_t max_match() const { return body_->max_match(); }
328 RegExpTree* body() const { return body_; }
329 // When a backreference is parsed before the corresponding capture group,
330 // which can happen because of lookbehind, we create the capture object when
331 // we create the backreference, and fill in the body later when the actual
332 // capture group is parsed.
333 void set_body(RegExpTree* body) { body_ = body; }
334 intptr_t index() const { return index_; }
335 const ZoneGrowableArray<uint16_t>* name() { return name_; }
336 void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; }
337 static intptr_t StartRegister(intptr_t index) { return index * 2; }
338 static intptr_t EndRegister(intptr_t index) { return index * 2 + 1; }
339
340 private:
341 RegExpTree* body_;
342 intptr_t index_;
343 const ZoneGrowableArray<uint16_t>* name_;
344};
345
346class RegExpLookaround : public RegExpTree {
347 public:
348 enum Type { LOOKAHEAD, LOOKBEHIND };
349 RegExpLookaround(RegExpTree* body,
350 bool is_positive,
351 intptr_t capture_count,
352 intptr_t capture_from,
353 Type type)
354 : body_(body),
355 is_positive_(is_positive),
356 capture_count_(capture_count),
357 capture_from_(capture_from),
358 type_(type) {}
359
360 virtual void* Accept(RegExpVisitor* visitor, void* data);
361 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
362 virtual RegExpLookaround* AsLookaround();
363 virtual Interval CaptureRegisters() const;
364 virtual bool IsLookaround() const;
365 virtual bool IsAnchoredAtStart() const;
366 virtual intptr_t min_match() const { return 0; }
367 virtual intptr_t max_match() const { return 0; }
368 RegExpTree* body() const { return body_; }
369 bool is_positive() const { return is_positive_; }
370 intptr_t capture_count() const { return capture_count_; }
371 intptr_t capture_from() const { return capture_from_; }
372 Type type() const { return type_; }
373
374 // The RegExpLookaround::Builder class abstracts out the process of building
375 // the compiling a RegExpLookaround object by splitting it into two phases,
376 // represented by the provided methods.
377 class Builder : public ValueObject {
378 public:
379 Builder(bool is_positive,
380 RegExpNode* on_success,
381 intptr_t stack_pointer_register,
382 intptr_t position_register,
383 intptr_t capture_register_count = 0,
384 intptr_t capture_register_start = 0);
385 RegExpNode* on_match_success() { return on_match_success_; }
386 RegExpNode* ForMatch(RegExpNode* match);
387
388 private:
389 bool is_positive_;
390 RegExpNode* on_match_success_;
391 RegExpNode* on_success_;
392 intptr_t stack_pointer_register_;
393 intptr_t position_register_;
394 };
395
396 private:
397 RegExpTree* body_;
398 bool is_positive_;
399 intptr_t capture_count_;
400 intptr_t capture_from_;
401 Type type_;
402};
403
404class RegExpBackReference : public RegExpTree {
405 public:
406 explicit RegExpBackReference(RegExpFlags flags)
407 : capture_(nullptr), name_(nullptr), flags_(flags) {}
408 RegExpBackReference(RegExpCapture* capture, RegExpFlags flags)
409 : capture_(capture), name_(nullptr), flags_(flags) {}
410 virtual void* Accept(RegExpVisitor* visitor, void* data);
411 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
412 virtual RegExpBackReference* AsBackReference();
413 virtual bool IsBackReference() const;
414 virtual intptr_t min_match() const { return 0; }
415 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
416 // recursion, we give up and just assume arbitrary length, which matches v8's
417 // behavior.
418 virtual intptr_t max_match() const { return kInfinity; }
419 intptr_t index() const { return capture_->index(); }
420 RegExpCapture* capture() const { return capture_; }
421 void set_capture(RegExpCapture* capture) { capture_ = capture; }
422 const ZoneGrowableArray<uint16_t>* name() { return name_; }
423 void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; }
424
425 private:
426 RegExpCapture* capture_;
427 const ZoneGrowableArray<uint16_t>* name_;
428 RegExpFlags flags_;
429};
430
431class RegExpEmpty : public RegExpTree {
432 public:
433 RegExpEmpty() {}
434 virtual void* Accept(RegExpVisitor* visitor, void* data);
435 virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
436 virtual RegExpEmpty* AsEmpty();
437 virtual bool IsEmpty() const;
438 virtual intptr_t min_match() const { return 0; }
439 virtual intptr_t max_match() const { return 0; }
440 static RegExpEmpty* GetInstance() {
441 static RegExpEmpty* instance = ::new RegExpEmpty();
442 return instance;
443 }
444};
445
446} // namespace dart
447
448#endif // RUNTIME_VM_REGEXP_AST_H_
449