| 1 | // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
| 2 | // for details. All rights reserved. Use of this source code is governed by a |
| 3 | // BSD-style license that can be found in the LICENSE file. |
| 4 | |
| 5 | #ifndef RUNTIME_VM_REGEXP_AST_H_ |
| 6 | #define RUNTIME_VM_REGEXP_AST_H_ |
| 7 | |
| 8 | #include "platform/globals.h" |
| 9 | #include "platform/utils.h" |
| 10 | #include "vm/allocation.h" |
| 11 | #include "vm/regexp.h" |
| 12 | |
| 13 | namespace dart { |
| 14 | |
| 15 | class RegExpAlternative; |
| 16 | class RegExpAssertion; |
| 17 | class RegExpAtom; |
| 18 | class RegExpBackReference; |
| 19 | class RegExpCapture; |
| 20 | class RegExpCharacterClass; |
| 21 | class RegExpCompiler; |
| 22 | class RegExpDisjunction; |
| 23 | class RegExpEmpty; |
| 24 | class RegExpLookaround; |
| 25 | class RegExpQuantifier; |
| 26 | class RegExpText; |
| 27 | |
| 28 | class RegExpVisitor : public ValueObject { |
| 29 | public: |
| 30 | virtual ~RegExpVisitor() {} |
| 31 | #define MAKE_CASE(Name) \ |
| 32 | virtual void* Visit##Name(RegExp##Name*, void* data) = 0; |
| 33 | FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) |
| 34 | #undef MAKE_CASE |
| 35 | }; |
| 36 | |
| 37 | class RegExpTree : public ZoneAllocated { |
| 38 | public: |
| 39 | static const intptr_t kInfinity = kMaxInt32; |
| 40 | virtual ~RegExpTree() {} |
| 41 | virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; |
| 42 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, |
| 43 | RegExpNode* on_success) = 0; |
| 44 | virtual bool IsTextElement() const { return false; } |
| 45 | virtual bool IsAnchoredAtStart() const { return false; } |
| 46 | virtual bool IsAnchoredAtEnd() const { return false; } |
| 47 | virtual intptr_t min_match() const = 0; |
| 48 | virtual intptr_t max_match() const = 0; |
| 49 | // Returns the interval of registers used for captures within this |
| 50 | // expression. |
| 51 | virtual Interval CaptureRegisters() const { return Interval::Empty(); } |
| 52 | virtual void AppendToText(RegExpText* text); |
| 53 | void Print(); |
| 54 | #define MAKE_ASTYPE(Name) \ |
| 55 | virtual RegExp##Name* As##Name(); \ |
| 56 | virtual bool Is##Name() const; |
| 57 | FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) |
| 58 | #undef MAKE_ASTYPE |
| 59 | }; |
| 60 | |
| 61 | class RegExpDisjunction : public RegExpTree { |
| 62 | public: |
| 63 | explicit RegExpDisjunction(ZoneGrowableArray<RegExpTree*>* alternatives); |
| 64 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 65 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 66 | virtual RegExpDisjunction* AsDisjunction(); |
| 67 | virtual Interval CaptureRegisters() const; |
| 68 | virtual bool IsDisjunction() const; |
| 69 | virtual bool IsAnchoredAtStart() const; |
| 70 | virtual bool IsAnchoredAtEnd() const; |
| 71 | virtual intptr_t min_match() const { return min_match_; } |
| 72 | virtual intptr_t max_match() const { return max_match_; } |
| 73 | ZoneGrowableArray<RegExpTree*>* alternatives() const { return alternatives_; } |
| 74 | |
| 75 | private: |
| 76 | ZoneGrowableArray<RegExpTree*>* alternatives_; |
| 77 | intptr_t min_match_; |
| 78 | intptr_t max_match_; |
| 79 | }; |
| 80 | |
| 81 | class RegExpAlternative : public RegExpTree { |
| 82 | public: |
| 83 | explicit RegExpAlternative(ZoneGrowableArray<RegExpTree*>* nodes); |
| 84 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 85 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 86 | virtual RegExpAlternative* AsAlternative(); |
| 87 | virtual Interval CaptureRegisters() const; |
| 88 | virtual bool IsAlternative() const; |
| 89 | virtual bool IsAnchoredAtStart() const; |
| 90 | virtual bool IsAnchoredAtEnd() const; |
| 91 | virtual intptr_t min_match() const { return min_match_; } |
| 92 | virtual intptr_t max_match() const { return max_match_; } |
| 93 | ZoneGrowableArray<RegExpTree*>* nodes() const { return nodes_; } |
| 94 | |
| 95 | private: |
| 96 | ZoneGrowableArray<RegExpTree*>* nodes_; |
| 97 | intptr_t min_match_; |
| 98 | intptr_t max_match_; |
| 99 | }; |
| 100 | |
| 101 | class RegExpAssertion : public RegExpTree { |
| 102 | public: |
| 103 | enum AssertionType { |
| 104 | START_OF_LINE, |
| 105 | START_OF_INPUT, |
| 106 | END_OF_LINE, |
| 107 | END_OF_INPUT, |
| 108 | BOUNDARY, |
| 109 | NON_BOUNDARY |
| 110 | }; |
| 111 | RegExpAssertion(AssertionType type, RegExpFlags flags) |
| 112 | : assertion_type_(type), flags_(flags) {} |
| 113 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 114 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 115 | virtual RegExpAssertion* AsAssertion(); |
| 116 | virtual bool IsAssertion() const; |
| 117 | virtual bool IsAnchoredAtStart() const; |
| 118 | virtual bool IsAnchoredAtEnd() const; |
| 119 | virtual intptr_t min_match() const { return 0; } |
| 120 | virtual intptr_t max_match() const { return 0; } |
| 121 | AssertionType assertion_type() const { return assertion_type_; } |
| 122 | |
| 123 | private: |
| 124 | AssertionType assertion_type_; |
| 125 | RegExpFlags flags_; |
| 126 | }; |
| 127 | |
| 128 | class CharacterSet : public ValueObject { |
| 129 | public: |
| 130 | explicit CharacterSet(uint16_t standard_set_type) |
| 131 | : ranges_(NULL), standard_set_type_(standard_set_type) {} |
| 132 | explicit CharacterSet(ZoneGrowableArray<CharacterRange>* ranges) |
| 133 | : ranges_(ranges), standard_set_type_(0) {} |
| 134 | CharacterSet(const CharacterSet& that) |
| 135 | : ValueObject(), |
| 136 | ranges_(that.ranges_), |
| 137 | standard_set_type_(that.standard_set_type_) {} |
| 138 | ZoneGrowableArray<CharacterRange>* ranges(); |
| 139 | uint16_t standard_set_type() const { return standard_set_type_; } |
| 140 | void set_standard_set_type(uint16_t special_set_type) { |
| 141 | standard_set_type_ = special_set_type; |
| 142 | } |
| 143 | bool is_standard() { return standard_set_type_ != 0; } |
| 144 | void Canonicalize(); |
| 145 | |
| 146 | private: |
| 147 | ZoneGrowableArray<CharacterRange>* ranges_; |
| 148 | // If non-zero, the value represents a standard set (e.g., all whitespace |
| 149 | // characters) without having to expand the ranges. |
| 150 | uint16_t standard_set_type_; |
| 151 | }; |
| 152 | |
| 153 | class RegExpCharacterClass : public RegExpTree { |
| 154 | public: |
| 155 | enum Flag { |
| 156 | // The character class is negated and should match everything but the |
| 157 | // specified ranges. |
| 158 | NEGATED = 1 << 0, |
| 159 | // The character class contains part of a split surrogate and should not |
| 160 | // be unicode-desugared. |
| 161 | CONTAINS_SPLIT_SURROGATE = 1 << 1, |
| 162 | }; |
| 163 | using CharacterClassFlags = intptr_t; |
| 164 | static inline CharacterClassFlags DefaultFlags() { return 0; } |
| 165 | |
| 166 | RegExpCharacterClass( |
| 167 | ZoneGrowableArray<CharacterRange>* ranges, |
| 168 | RegExpFlags flags, |
| 169 | CharacterClassFlags character_class_flags = DefaultFlags()) |
| 170 | : set_(ranges), |
| 171 | flags_(flags), |
| 172 | character_class_flags_(character_class_flags) { |
| 173 | // Convert the empty set of ranges to the negated Everything() range. |
| 174 | if (ranges->is_empty()) { |
| 175 | ranges->Add(CharacterRange::Everything()); |
| 176 | character_class_flags_ ^= NEGATED; |
| 177 | } |
| 178 | } |
| 179 | RegExpCharacterClass(uint16_t type, RegExpFlags flags) |
| 180 | : set_(type), flags_(flags), character_class_flags_(0) {} |
| 181 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 182 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 183 | virtual RegExpCharacterClass* AsCharacterClass(); |
| 184 | virtual bool IsCharacterClass() const; |
| 185 | virtual bool IsTextElement() const { return true; } |
| 186 | virtual intptr_t min_match() const { return 1; } |
| 187 | // The character class may match two code units for unicode regexps. |
| 188 | virtual intptr_t max_match() const { return 2; } |
| 189 | virtual void AppendToText(RegExpText* text); |
| 190 | CharacterSet character_set() const { return set_; } |
| 191 | // TODO(lrn): Remove need for complex version if is_standard that |
| 192 | // recognizes a mangled standard set and just do { return set_.is_special(); } |
| 193 | bool is_standard(); |
| 194 | // Returns a value representing the standard character set if is_standard() |
| 195 | // returns true. |
| 196 | // Currently used values are: |
| 197 | // s : unicode whitespace |
| 198 | // S : unicode non-whitespace |
| 199 | // w : ASCII word character (digit, letter, underscore) |
| 200 | // W : non-ASCII word character |
| 201 | // d : ASCII digit |
| 202 | // D : non-ASCII digit |
| 203 | // . : non-unicode non-newline |
| 204 | // * : All characters |
| 205 | uint16_t standard_type() const { return set_.standard_set_type(); } |
| 206 | ZoneGrowableArray<CharacterRange>* ranges() { return set_.ranges(); } |
| 207 | bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } |
| 208 | RegExpFlags flags() const { return flags_; } |
| 209 | bool contains_split_surrogate() const { |
| 210 | return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; |
| 211 | } |
| 212 | |
| 213 | private: |
| 214 | CharacterSet set_; |
| 215 | RegExpFlags flags_; |
| 216 | CharacterClassFlags character_class_flags_; |
| 217 | }; |
| 218 | |
| 219 | class RegExpAtom : public RegExpTree { |
| 220 | public: |
| 221 | RegExpAtom(ZoneGrowableArray<uint16_t>* data, RegExpFlags flags) |
| 222 | : data_(data), flags_(flags) {} |
| 223 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 224 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 225 | virtual RegExpAtom* AsAtom(); |
| 226 | virtual bool IsAtom() const; |
| 227 | virtual bool IsTextElement() const { return true; } |
| 228 | virtual intptr_t min_match() const { return data_->length(); } |
| 229 | virtual intptr_t max_match() const { return data_->length(); } |
| 230 | virtual void AppendToText(RegExpText* text); |
| 231 | ZoneGrowableArray<uint16_t>* data() const { return data_; } |
| 232 | intptr_t length() const { return data_->length(); } |
| 233 | RegExpFlags flags() const { return flags_; } |
| 234 | bool ignore_case() const { return flags_.IgnoreCase(); } |
| 235 | |
| 236 | private: |
| 237 | ZoneGrowableArray<uint16_t>* data_; |
| 238 | const RegExpFlags flags_; |
| 239 | }; |
| 240 | |
| 241 | class RegExpText : public RegExpTree { |
| 242 | public: |
| 243 | RegExpText() : elements_(2), length_(0) {} |
| 244 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 245 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 246 | virtual RegExpText* AsText(); |
| 247 | virtual bool IsText() const; |
| 248 | virtual bool IsTextElement() const { return true; } |
| 249 | virtual intptr_t min_match() const { return length_; } |
| 250 | virtual intptr_t max_match() const { return length_; } |
| 251 | virtual void AppendToText(RegExpText* text); |
| 252 | void AddElement(TextElement elm) { |
| 253 | elements_.Add(elm); |
| 254 | length_ += elm.length(); |
| 255 | } |
| 256 | GrowableArray<TextElement>* elements() { return &elements_; } |
| 257 | |
| 258 | private: |
| 259 | GrowableArray<TextElement> elements_; |
| 260 | intptr_t length_; |
| 261 | }; |
| 262 | |
| 263 | class RegExpQuantifier : public RegExpTree { |
| 264 | public: |
| 265 | enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; |
| 266 | RegExpQuantifier(intptr_t min, |
| 267 | intptr_t max, |
| 268 | QuantifierType type, |
| 269 | RegExpTree* body) |
| 270 | : body_(body), |
| 271 | min_(min), |
| 272 | max_(max), |
| 273 | min_match_(min * body->min_match()), |
| 274 | quantifier_type_(type) { |
| 275 | if (max > 0 && body->max_match() > kInfinity / max) { |
| 276 | max_match_ = kInfinity; |
| 277 | } else { |
| 278 | max_match_ = max * body->max_match(); |
| 279 | } |
| 280 | } |
| 281 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 282 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 283 | static RegExpNode* ToNode(intptr_t min, |
| 284 | intptr_t max, |
| 285 | bool is_greedy, |
| 286 | RegExpTree* body, |
| 287 | RegExpCompiler* compiler, |
| 288 | RegExpNode* on_success, |
| 289 | bool not_at_start = false); |
| 290 | virtual RegExpQuantifier* AsQuantifier(); |
| 291 | virtual Interval CaptureRegisters() const; |
| 292 | virtual bool IsQuantifier() const; |
| 293 | virtual intptr_t min_match() const { return min_match_; } |
| 294 | virtual intptr_t max_match() const { return max_match_; } |
| 295 | intptr_t min() const { return min_; } |
| 296 | intptr_t max() const { return max_; } |
| 297 | bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } |
| 298 | bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } |
| 299 | bool is_greedy() const { return quantifier_type_ == GREEDY; } |
| 300 | RegExpTree* body() const { return body_; } |
| 301 | |
| 302 | private: |
| 303 | RegExpTree* body_; |
| 304 | intptr_t min_; |
| 305 | intptr_t max_; |
| 306 | intptr_t min_match_; |
| 307 | intptr_t max_match_; |
| 308 | QuantifierType quantifier_type_; |
| 309 | }; |
| 310 | |
| 311 | class RegExpCapture : public RegExpTree { |
| 312 | public: |
| 313 | explicit RegExpCapture(intptr_t index) |
| 314 | : body_(nullptr), index_(index), name_(nullptr) {} |
| 315 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 316 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 317 | static RegExpNode* ToNode(RegExpTree* body, |
| 318 | intptr_t index, |
| 319 | RegExpCompiler* compiler, |
| 320 | RegExpNode* on_success); |
| 321 | virtual RegExpCapture* AsCapture(); |
| 322 | virtual bool IsAnchoredAtStart() const; |
| 323 | virtual bool IsAnchoredAtEnd() const; |
| 324 | virtual Interval CaptureRegisters() const; |
| 325 | virtual bool IsCapture() const; |
| 326 | virtual intptr_t min_match() const { return body_->min_match(); } |
| 327 | virtual intptr_t max_match() const { return body_->max_match(); } |
| 328 | RegExpTree* body() const { return body_; } |
| 329 | // When a backreference is parsed before the corresponding capture group, |
| 330 | // which can happen because of lookbehind, we create the capture object when |
| 331 | // we create the backreference, and fill in the body later when the actual |
| 332 | // capture group is parsed. |
| 333 | void set_body(RegExpTree* body) { body_ = body; } |
| 334 | intptr_t index() const { return index_; } |
| 335 | const ZoneGrowableArray<uint16_t>* name() { return name_; } |
| 336 | void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; } |
| 337 | static intptr_t StartRegister(intptr_t index) { return index * 2; } |
| 338 | static intptr_t EndRegister(intptr_t index) { return index * 2 + 1; } |
| 339 | |
| 340 | private: |
| 341 | RegExpTree* body_; |
| 342 | intptr_t index_; |
| 343 | const ZoneGrowableArray<uint16_t>* name_; |
| 344 | }; |
| 345 | |
| 346 | class RegExpLookaround : public RegExpTree { |
| 347 | public: |
| 348 | enum Type { LOOKAHEAD, LOOKBEHIND }; |
| 349 | RegExpLookaround(RegExpTree* body, |
| 350 | bool is_positive, |
| 351 | intptr_t capture_count, |
| 352 | intptr_t capture_from, |
| 353 | Type type) |
| 354 | : body_(body), |
| 355 | is_positive_(is_positive), |
| 356 | capture_count_(capture_count), |
| 357 | capture_from_(capture_from), |
| 358 | type_(type) {} |
| 359 | |
| 360 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 361 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 362 | virtual RegExpLookaround* AsLookaround(); |
| 363 | virtual Interval CaptureRegisters() const; |
| 364 | virtual bool IsLookaround() const; |
| 365 | virtual bool IsAnchoredAtStart() const; |
| 366 | virtual intptr_t min_match() const { return 0; } |
| 367 | virtual intptr_t max_match() const { return 0; } |
| 368 | RegExpTree* body() const { return body_; } |
| 369 | bool is_positive() const { return is_positive_; } |
| 370 | intptr_t capture_count() const { return capture_count_; } |
| 371 | intptr_t capture_from() const { return capture_from_; } |
| 372 | Type type() const { return type_; } |
| 373 | |
| 374 | // The RegExpLookaround::Builder class abstracts out the process of building |
| 375 | // the compiling a RegExpLookaround object by splitting it into two phases, |
| 376 | // represented by the provided methods. |
| 377 | class Builder : public ValueObject { |
| 378 | public: |
| 379 | Builder(bool is_positive, |
| 380 | RegExpNode* on_success, |
| 381 | intptr_t stack_pointer_register, |
| 382 | intptr_t position_register, |
| 383 | intptr_t capture_register_count = 0, |
| 384 | intptr_t capture_register_start = 0); |
| 385 | RegExpNode* on_match_success() { return on_match_success_; } |
| 386 | RegExpNode* ForMatch(RegExpNode* match); |
| 387 | |
| 388 | private: |
| 389 | bool is_positive_; |
| 390 | RegExpNode* on_match_success_; |
| 391 | RegExpNode* on_success_; |
| 392 | intptr_t stack_pointer_register_; |
| 393 | intptr_t position_register_; |
| 394 | }; |
| 395 | |
| 396 | private: |
| 397 | RegExpTree* body_; |
| 398 | bool is_positive_; |
| 399 | intptr_t capture_count_; |
| 400 | intptr_t capture_from_; |
| 401 | Type type_; |
| 402 | }; |
| 403 | |
| 404 | class RegExpBackReference : public RegExpTree { |
| 405 | public: |
| 406 | explicit RegExpBackReference(RegExpFlags flags) |
| 407 | : capture_(nullptr), name_(nullptr), flags_(flags) {} |
| 408 | RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) |
| 409 | : capture_(capture), name_(nullptr), flags_(flags) {} |
| 410 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 411 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 412 | virtual RegExpBackReference* AsBackReference(); |
| 413 | virtual bool IsBackReference() const; |
| 414 | virtual intptr_t min_match() const { return 0; } |
| 415 | // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite |
| 416 | // recursion, we give up and just assume arbitrary length, which matches v8's |
| 417 | // behavior. |
| 418 | virtual intptr_t max_match() const { return kInfinity; } |
| 419 | intptr_t index() const { return capture_->index(); } |
| 420 | RegExpCapture* capture() const { return capture_; } |
| 421 | void set_capture(RegExpCapture* capture) { capture_ = capture; } |
| 422 | const ZoneGrowableArray<uint16_t>* name() { return name_; } |
| 423 | void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; } |
| 424 | |
| 425 | private: |
| 426 | RegExpCapture* capture_; |
| 427 | const ZoneGrowableArray<uint16_t>* name_; |
| 428 | RegExpFlags flags_; |
| 429 | }; |
| 430 | |
| 431 | class RegExpEmpty : public RegExpTree { |
| 432 | public: |
| 433 | RegExpEmpty() {} |
| 434 | virtual void* Accept(RegExpVisitor* visitor, void* data); |
| 435 | virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); |
| 436 | virtual RegExpEmpty* AsEmpty(); |
| 437 | virtual bool IsEmpty() const; |
| 438 | virtual intptr_t min_match() const { return 0; } |
| 439 | virtual intptr_t max_match() const { return 0; } |
| 440 | static RegExpEmpty* GetInstance() { |
| 441 | static RegExpEmpty* instance = ::new RegExpEmpty(); |
| 442 | return instance; |
| 443 | } |
| 444 | }; |
| 445 | |
| 446 | } // namespace dart |
| 447 | |
| 448 | #endif // RUNTIME_VM_REGEXP_AST_H_ |
| 449 | |