| 1 | // Copyright (c) 2014, the Dart project authors.  Please see the AUTHORS file | 
|---|
| 2 | // for details. All rights reserved. Use of this source code is governed by a | 
|---|
| 3 | // BSD-style license that can be found in the LICENSE file. | 
|---|
| 4 |  | 
|---|
| 5 | #ifndef RUNTIME_VM_REGEXP_PARSER_H_ | 
|---|
| 6 | #define RUNTIME_VM_REGEXP_PARSER_H_ | 
|---|
| 7 |  | 
|---|
| 8 | #include "vm/allocation.h" | 
|---|
| 9 | #include "vm/growable_array.h" | 
|---|
| 10 | #include "vm/regexp_ast.h" | 
|---|
| 11 |  | 
|---|
| 12 | namespace dart { | 
|---|
| 13 |  | 
|---|
| 14 | // Accumulates RegExp atoms and assertions into lists of terms and alternatives. | 
|---|
| 15 | class RegExpBuilder : public ZoneAllocated { | 
|---|
| 16 | public: | 
|---|
| 17 | explicit RegExpBuilder(RegExpFlags flags); | 
|---|
| 18 |  | 
|---|
| 19 | void AddCharacter(uint16_t character); | 
|---|
| 20 | void AddUnicodeCharacter(uint32_t character); | 
|---|
| 21 | void AddEscapedUnicodeCharacter(uint32_t character); | 
|---|
| 22 | // "Adds" an empty expression. Does nothing except consume a | 
|---|
| 23 | // following quantifier | 
|---|
| 24 | void AddEmpty(); | 
|---|
| 25 | void AddCharacterClass(RegExpCharacterClass* cc); | 
|---|
| 26 | void AddCharacterClassForDesugaring(uint32_t c); | 
|---|
| 27 | void AddAtom(RegExpTree* tree); | 
|---|
| 28 | void AddTerm(RegExpTree* tree); | 
|---|
| 29 | void AddAssertion(RegExpTree* tree); | 
|---|
| 30 | void NewAlternative();  // '|' | 
|---|
| 31 | // Attempt to add a quantifier to the last atom added. The return value | 
|---|
| 32 | // denotes whether the attempt succeeded, since some atoms like lookbehind | 
|---|
| 33 | // cannot be quantified. | 
|---|
| 34 | bool AddQuantifierToAtom(intptr_t min, | 
|---|
| 35 | intptr_t max, | 
|---|
| 36 | RegExpQuantifier::QuantifierType type); | 
|---|
| 37 | RegExpTree* ToRegExp(); | 
|---|
| 38 | RegExpFlags flags() const { return flags_; } | 
|---|
| 39 | bool ignore_case() const { return flags_.IgnoreCase(); } | 
|---|
| 40 | bool is_multi_line() const { return flags_.IsMultiLine(); } | 
|---|
| 41 | bool is_dot_all() const { return flags_.IsDotAll(); } | 
|---|
| 42 |  | 
|---|
| 43 | private: | 
|---|
| 44 | static const uint16_t kNoPendingSurrogate = 0; | 
|---|
| 45 | void AddLeadSurrogate(uint16_t lead_surrogate); | 
|---|
| 46 | void AddTrailSurrogate(uint16_t trail_surrogate); | 
|---|
| 47 | void FlushPendingSurrogate(); | 
|---|
| 48 | void FlushCharacters(); | 
|---|
| 49 | void FlushText(); | 
|---|
| 50 | void FlushTerms(); | 
|---|
| 51 | bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); | 
|---|
| 52 | bool NeedsDesugaringForIgnoreCase(uint32_t c); | 
|---|
| 53 |  | 
|---|
| 54 | Zone* zone() const { return zone_; } | 
|---|
| 55 | bool is_unicode() const { return flags_.IsUnicode(); } | 
|---|
| 56 |  | 
|---|
| 57 | Zone* zone_; | 
|---|
| 58 | bool pending_empty_; | 
|---|
| 59 | RegExpFlags flags_; | 
|---|
| 60 | ZoneGrowableArray<uint16_t>* characters_; | 
|---|
| 61 | uint16_t pending_surrogate_; | 
|---|
| 62 | GrowableArray<RegExpTree*> terms_; | 
|---|
| 63 | GrowableArray<RegExpTree*> text_; | 
|---|
| 64 | GrowableArray<RegExpTree*> alternatives_; | 
|---|
| 65 | #ifdef DEBUG | 
|---|
| 66 | enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_; | 
|---|
| 67 | #define LAST(x) last_added_ = x; | 
|---|
| 68 | #else | 
|---|
| 69 | #define LAST(x) | 
|---|
| 70 | #endif | 
|---|
| 71 | }; | 
|---|
| 72 |  | 
|---|
| 73 | using RegExpCaptureName = ZoneGrowableArray<uint16_t>; | 
|---|
| 74 |  | 
|---|
| 75 | class RegExpParser : public ValueObject { | 
|---|
| 76 | public: | 
|---|
| 77 | RegExpParser(const String& in, String* error, RegExpFlags regexp_flags); | 
|---|
| 78 |  | 
|---|
| 79 | static void ParseRegExp(const String& input, | 
|---|
| 80 | RegExpFlags regexp_flags, | 
|---|
| 81 | RegExpCompileData* result); | 
|---|
| 82 |  | 
|---|
| 83 | RegExpTree* ParsePattern(); | 
|---|
| 84 | RegExpTree* ParseDisjunction(); | 
|---|
| 85 | RegExpTree* ParseGroup(); | 
|---|
| 86 |  | 
|---|
| 87 | // Parses a {...,...} quantifier and stores the range in the given | 
|---|
| 88 | // out parameters. | 
|---|
| 89 | bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out); | 
|---|
| 90 |  | 
|---|
| 91 | // Parses and returns a single escaped character.  The character | 
|---|
| 92 | // must not be 'b' or 'B' since they are usually handle specially. | 
|---|
| 93 | uint32_t ParseClassCharacterEscape(); | 
|---|
| 94 |  | 
|---|
| 95 | // Checks whether the following is a length-digit hexadecimal number, | 
|---|
| 96 | // and sets the value if it is. | 
|---|
| 97 | bool ParseHexEscape(intptr_t length, uint32_t* value); | 
|---|
| 98 | bool ParseUnicodeEscape(uint32_t* value); | 
|---|
| 99 | bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value); | 
|---|
| 100 |  | 
|---|
| 101 | // Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or | 
|---|
| 102 | // the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the | 
|---|
| 103 | // result in the given out parameters. If the shorthand is used, | 
|---|
| 104 | // nothing will be added to name_2. | 
|---|
| 105 | bool ParsePropertyClassName(ZoneGrowableArray<char>* name_1, | 
|---|
| 106 | ZoneGrowableArray<char>* name_2); | 
|---|
| 107 | // Adds the specified unicode property to the provided character range. | 
|---|
| 108 | bool AddPropertyClassRange(ZoneGrowableArray<CharacterRange>* add_to, | 
|---|
| 109 | bool negate, | 
|---|
| 110 | ZoneGrowableArray<char>* name_1, | 
|---|
| 111 | ZoneGrowableArray<char>* name_2); | 
|---|
| 112 | // Returns a regexp node that corresponds to one of these unicode | 
|---|
| 113 | // property sequences: "Any", "ASCII", "Assigned". | 
|---|
| 114 | RegExpTree* GetPropertySequence(ZoneGrowableArray<char>* name_1); | 
|---|
| 115 | RegExpTree* ParseCharacterClass(const RegExpBuilder* builder); | 
|---|
| 116 |  | 
|---|
| 117 | uint32_t ParseOctalLiteral(); | 
|---|
| 118 |  | 
|---|
| 119 | // Tries to parse the input as a back reference.  If successful it | 
|---|
| 120 | // stores the result in the output parameter and returns true.  If | 
|---|
| 121 | // it fails it will push back the characters read so the same characters | 
|---|
| 122 | // can be reparsed. | 
|---|
| 123 | bool ParseBackReferenceIndex(intptr_t* index_out); | 
|---|
| 124 |  | 
|---|
| 125 | // Attempts to parse a possible escape within a character class. | 
|---|
| 126 | bool ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges, | 
|---|
| 127 | bool add_unicode_case_equivalents, | 
|---|
| 128 | uint32_t* char_out); | 
|---|
| 129 | void ReportError(const char* message); | 
|---|
| 130 | void Advance(); | 
|---|
| 131 | void Advance(intptr_t dist); | 
|---|
| 132 | void Reset(intptr_t pos); | 
|---|
| 133 |  | 
|---|
| 134 | // Reports whether the pattern might be used as a literal search string. | 
|---|
| 135 | // Only use if the result of the parse is a single atom node. | 
|---|
| 136 | bool simple(); | 
|---|
| 137 | bool contains_anchor() { return contains_anchor_; } | 
|---|
| 138 | void set_contains_anchor() { contains_anchor_ = true; } | 
|---|
| 139 | intptr_t captures_started() { return captures_started_; } | 
|---|
| 140 | intptr_t position() { return next_pos_ - 1; } | 
|---|
| 141 | bool is_unicode() const { return top_level_flags_.IsUnicode(); } | 
|---|
| 142 |  | 
|---|
| 143 | static bool IsSyntaxCharacterOrSlash(uint32_t c); | 
|---|
| 144 |  | 
|---|
| 145 | static const intptr_t kMaxCaptures = 1 << 16; | 
|---|
| 146 | static const uint32_t kEndMarker = (1 << 21); | 
|---|
| 147 |  | 
|---|
| 148 | private: | 
|---|
| 149 | enum SubexpressionType { | 
|---|
| 150 | INITIAL, | 
|---|
| 151 | CAPTURE,  // All positive values represent captures. | 
|---|
| 152 | POSITIVE_LOOKAROUND, | 
|---|
| 153 | NEGATIVE_LOOKAROUND, | 
|---|
| 154 | GROUPING | 
|---|
| 155 | }; | 
|---|
| 156 |  | 
|---|
| 157 | class RegExpParserState : public ZoneAllocated { | 
|---|
| 158 | public: | 
|---|
| 159 | RegExpParserState(RegExpParserState* previous_state, | 
|---|
| 160 | SubexpressionType group_type, | 
|---|
| 161 | RegExpLookaround::Type lookaround_type, | 
|---|
| 162 | intptr_t disjunction_capture_index, | 
|---|
| 163 | const RegExpCaptureName* capture_name, | 
|---|
| 164 | RegExpFlags flags, | 
|---|
| 165 | Zone* zone) | 
|---|
| 166 | : previous_state_(previous_state), | 
|---|
| 167 | builder_(new (zone) RegExpBuilder(flags)), | 
|---|
| 168 | group_type_(group_type), | 
|---|
| 169 | lookaround_type_(lookaround_type), | 
|---|
| 170 | disjunction_capture_index_(disjunction_capture_index), | 
|---|
| 171 | capture_name_(capture_name) {} | 
|---|
| 172 | // Parser state of containing expression, if any. | 
|---|
| 173 | RegExpParserState* previous_state() { return previous_state_; } | 
|---|
| 174 | bool IsSubexpression() { return previous_state_ != NULL; } | 
|---|
| 175 | // RegExpBuilder building this regexp's AST. | 
|---|
| 176 | RegExpBuilder* builder() { return builder_; } | 
|---|
| 177 | // Type of regexp being parsed (parenthesized group or entire regexp). | 
|---|
| 178 | SubexpressionType group_type() { return group_type_; } | 
|---|
| 179 | // Lookahead or lookbehind. | 
|---|
| 180 | RegExpLookaround::Type lookaround_type() { return lookaround_type_; } | 
|---|
| 181 | // Index in captures array of first capture in this sub-expression, if any. | 
|---|
| 182 | // Also the capture index of this sub-expression itself, if group_type | 
|---|
| 183 | // is CAPTURE. | 
|---|
| 184 | intptr_t capture_index() { return disjunction_capture_index_; } | 
|---|
| 185 | const RegExpCaptureName* capture_name() const { return capture_name_; } | 
|---|
| 186 |  | 
|---|
| 187 | bool IsNamedCapture() const { return capture_name_ != nullptr; } | 
|---|
| 188 |  | 
|---|
| 189 | // Check whether the parser is inside a capture group with the given index. | 
|---|
| 190 | bool IsInsideCaptureGroup(intptr_t index); | 
|---|
| 191 | // Check whether the parser is inside a capture group with the given name. | 
|---|
| 192 | bool IsInsideCaptureGroup(const RegExpCaptureName* name); | 
|---|
| 193 |  | 
|---|
| 194 | private: | 
|---|
| 195 | // Linked list implementation of stack of states. | 
|---|
| 196 | RegExpParserState* previous_state_; | 
|---|
| 197 | // Builder for the stored disjunction. | 
|---|
| 198 | RegExpBuilder* builder_; | 
|---|
| 199 | // Stored disjunction type (capture, look-ahead or grouping), if any. | 
|---|
| 200 | SubexpressionType group_type_; | 
|---|
| 201 | // Stored read direction. | 
|---|
| 202 | const RegExpLookaround::Type lookaround_type_; | 
|---|
| 203 | // Stored disjunction's capture index (if any). | 
|---|
| 204 | intptr_t disjunction_capture_index_; | 
|---|
| 205 | // Stored capture name (if any). | 
|---|
| 206 | const RegExpCaptureName* const capture_name_; | 
|---|
| 207 | }; | 
|---|
| 208 |  | 
|---|
| 209 | // Return the 1-indexed RegExpCapture object, allocate if necessary. | 
|---|
| 210 | RegExpCapture* GetCapture(intptr_t index); | 
|---|
| 211 |  | 
|---|
| 212 | // Creates a new named capture at the specified index. Must be called exactly | 
|---|
| 213 | // once for each named capture. Fails if a capture with the same name is | 
|---|
| 214 | // encountered. | 
|---|
| 215 | void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index); | 
|---|
| 216 |  | 
|---|
| 217 | // Parses the name of a capture group (?<name>pattern). The name must adhere | 
|---|
| 218 | // to IdentifierName in the ECMAScript standard. | 
|---|
| 219 | const RegExpCaptureName* ParseCaptureGroupName(); | 
|---|
| 220 |  | 
|---|
| 221 | bool ParseNamedBackReference(RegExpBuilder* builder, | 
|---|
| 222 | RegExpParserState* state); | 
|---|
| 223 | RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); | 
|---|
| 224 | intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name); | 
|---|
| 225 |  | 
|---|
| 226 | // After the initial parsing pass, patch corresponding RegExpCapture objects | 
|---|
| 227 | // into all RegExpBackReferences. This is done after initial parsing in order | 
|---|
| 228 | // to avoid complicating cases in which references come before the capture. | 
|---|
| 229 | void PatchNamedBackReferences(); | 
|---|
| 230 |  | 
|---|
| 231 | ArrayPtr CreateCaptureNameMap(); | 
|---|
| 232 |  | 
|---|
| 233 | // Returns true iff the pattern contains named captures. May call | 
|---|
| 234 | // ScanForCaptures to look ahead at the remaining pattern. | 
|---|
| 235 | bool HasNamedCaptures(); | 
|---|
| 236 |  | 
|---|
| 237 | Zone* zone() { return zone_; } | 
|---|
| 238 |  | 
|---|
| 239 | uint32_t current() { return current_; } | 
|---|
| 240 | bool has_more() { return has_more_; } | 
|---|
| 241 | bool has_next() { return next_pos_ < in().Length(); } | 
|---|
| 242 | uint32_t Next(); | 
|---|
| 243 | uint32_t ReadNext(bool update_position); | 
|---|
| 244 | const String& in() { return in_; } | 
|---|
| 245 | void ScanForCaptures(); | 
|---|
| 246 |  | 
|---|
| 247 | Zone* zone_; | 
|---|
| 248 | ZoneGrowableArray<RegExpCapture*>* captures_; | 
|---|
| 249 | ZoneGrowableArray<RegExpCapture*>* named_captures_; | 
|---|
| 250 | ZoneGrowableArray<RegExpBackReference*>* named_back_references_; | 
|---|
| 251 | const String& in_; | 
|---|
| 252 | uint32_t current_; | 
|---|
| 253 | intptr_t next_pos_; | 
|---|
| 254 | intptr_t captures_started_; | 
|---|
| 255 | // The capture count is only valid after we have scanned for captures. | 
|---|
| 256 | intptr_t capture_count_; | 
|---|
| 257 | bool has_more_; | 
|---|
| 258 | RegExpFlags top_level_flags_; | 
|---|
| 259 | bool simple_; | 
|---|
| 260 | bool contains_anchor_; | 
|---|
| 261 | bool is_scanned_for_captures_; | 
|---|
| 262 | bool has_named_captures_; | 
|---|
| 263 | }; | 
|---|
| 264 |  | 
|---|
| 265 | }  // namespace dart | 
|---|
| 266 |  | 
|---|
| 267 | #endif  // RUNTIME_VM_REGEXP_PARSER_H_ | 
|---|
| 268 |  | 
|---|