| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | // | 
|---|
| 4 | //  regexcmp.h | 
|---|
| 5 | // | 
|---|
| 6 | //  Copyright (C) 2002-2016, International Business Machines Corporation and others. | 
|---|
| 7 | //  All Rights Reserved. | 
|---|
| 8 | // | 
|---|
| 9 | //  This file contains declarations for the class RegexCompile | 
|---|
| 10 | // | 
|---|
| 11 | //  This class is internal to the regular expression implementation. | 
|---|
| 12 | //  For the public Regular Expression API, see the file "unicode/regex.h" | 
|---|
| 13 | // | 
|---|
| 14 |  | 
|---|
| 15 |  | 
|---|
| 16 | #ifndef RBBISCAN_H | 
|---|
| 17 | #define RBBISCAN_H | 
|---|
| 18 |  | 
|---|
| 19 | #include "unicode/utypes.h" | 
|---|
| 20 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 
|---|
| 21 |  | 
|---|
| 22 | #include "unicode/parseerr.h" | 
|---|
| 23 | #include "unicode/uniset.h" | 
|---|
| 24 | #include "unicode/uobject.h" | 
|---|
| 25 | #include "unicode/utext.h" | 
|---|
| 26 | #include "uhash.h" | 
|---|
| 27 | #include "uvector.h" | 
|---|
| 28 | #include "uvectr32.h" | 
|---|
| 29 |  | 
|---|
| 30 |  | 
|---|
| 31 |  | 
|---|
| 32 | U_NAMESPACE_BEGIN | 
|---|
| 33 |  | 
|---|
| 34 |  | 
|---|
| 35 | //-------------------------------------------------------------------------------- | 
|---|
| 36 | // | 
|---|
| 37 | //  class RegexCompile    Contains the regular expression compiler. | 
|---|
| 38 | // | 
|---|
| 39 | //-------------------------------------------------------------------------------- | 
|---|
| 40 | struct  RegexTableEl; | 
|---|
| 41 | class   RegexPattern; | 
|---|
| 42 |  | 
|---|
| 43 |  | 
|---|
| 44 | class U_I18N_API RegexCompile : public UMemory { | 
|---|
| 45 | public: | 
|---|
| 46 |  | 
|---|
| 47 | enum { | 
|---|
| 48 | kStackSize = 100            // The size of the state stack for | 
|---|
| 49 | };                              //   pattern parsing.  Corresponds roughly | 
|---|
| 50 | //   to the depth of parentheses nesting | 
|---|
| 51 | //   that is allowed in the rules. | 
|---|
| 52 |  | 
|---|
| 53 | struct RegexPatternChar { | 
|---|
| 54 | UChar32             fChar; | 
|---|
| 55 | UBool               fQuoted; | 
|---|
| 56 | }; | 
|---|
| 57 |  | 
|---|
| 58 | RegexCompile(RegexPattern *rp, UErrorCode &e); | 
|---|
| 59 |  | 
|---|
| 60 | void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); | 
|---|
| 61 | void       compile(UText *pat, UParseError &pp, UErrorCode &e); | 
|---|
| 62 |  | 
|---|
| 63 |  | 
|---|
| 64 | virtual    ~RegexCompile(); | 
|---|
| 65 |  | 
|---|
| 66 | void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream. | 
|---|
| 67 |  | 
|---|
| 68 | static void cleanup();                       // Memory cleanup | 
|---|
| 69 |  | 
|---|
| 70 |  | 
|---|
| 71 |  | 
|---|
| 72 | // Categories of parentheses in pattern. | 
|---|
| 73 | //   The category is saved in the compile-time parentheses stack frame, and | 
|---|
| 74 | //   determines the code to be generated when the matching close ) is encountered. | 
|---|
| 75 | enum EParenClass { | 
|---|
| 76 | plain        = -1,               // No special handling | 
|---|
| 77 | capturing    = -2, | 
|---|
| 78 | atomic       = -3, | 
|---|
| 79 | lookAhead    = -4, | 
|---|
| 80 | negLookAhead = -5, | 
|---|
| 81 | flags        = -6, | 
|---|
| 82 | lookBehind   = -7, | 
|---|
| 83 | lookBehindN  = -8 | 
|---|
| 84 | }; | 
|---|
| 85 |  | 
|---|
| 86 | private: | 
|---|
| 87 |  | 
|---|
| 88 |  | 
|---|
| 89 | UBool       doParseActions(int32_t a); | 
|---|
| 90 | void        error(UErrorCode e);                   // error reporting convenience function. | 
|---|
| 91 |  | 
|---|
| 92 | UChar32     nextCharLL(); | 
|---|
| 93 | UChar32     peekCharLL(); | 
|---|
| 94 | UnicodeSet  *scanProp(); | 
|---|
| 95 | UnicodeSet  *scanPosixProp(); | 
|---|
| 96 | void        handleCloseParen(); | 
|---|
| 97 | int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern | 
|---|
| 98 | //  at the top of the just completed block | 
|---|
| 99 | //  or operation, and optionally ensure that | 
|---|
| 100 | //  there is space to add an opcode there. | 
|---|
| 101 | void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for | 
|---|
| 102 | //   a reference to a UnicodeSet. | 
|---|
| 103 | void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier. | 
|---|
| 104 | int32_t LoopOp); | 
|---|
| 105 | UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier | 
|---|
| 106 | void        literalChar(UChar32 c);              // Compile a literal char | 
|---|
| 107 | void        fixLiterals(UBool split=FALSE);      // Generate code for pending literal characters. | 
|---|
| 108 | void        insertOp(int32_t where);             // Open up a slot for a new op in the | 
|---|
| 109 | //   generated code at the specified location. | 
|---|
| 110 | void        appendOp(int32_t op);                // Append a new op to the compiled pattern. | 
|---|
| 111 | void        appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern. | 
|---|
| 112 | int32_t     buildOp(int32_t type, int32_t val);  // Construct a new pcode instruction. | 
|---|
| 113 | int32_t     allocateData(int32_t size);          // Allocate space in the matcher data area. | 
|---|
| 114 | //   Return index of the newly allocated data. | 
|---|
| 115 | int32_t     allocateStackData(int32_t size);     // Allocate space in the match back-track stack frame. | 
|---|
| 116 | //   Return offset index in the frame. | 
|---|
| 117 | int32_t     minMatchLength(int32_t start, | 
|---|
| 118 | int32_t end); | 
|---|
| 119 | int32_t     maxMatchLength(int32_t start, | 
|---|
| 120 | int32_t end); | 
|---|
| 121 | void        matchStartType(); | 
|---|
| 122 | void        stripNOPs(); | 
|---|
| 123 |  | 
|---|
| 124 | void        setEval(int32_t op); | 
|---|
| 125 | void        setPushOp(int32_t op); | 
|---|
| 126 | UChar32     scanNamedChar(); | 
|---|
| 127 | UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); | 
|---|
| 128 |  | 
|---|
| 129 | public:   // Public for testing only. | 
|---|
| 130 | static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars); | 
|---|
| 131 | private: | 
|---|
| 132 |  | 
|---|
| 133 |  | 
|---|
| 134 | UErrorCode                    *fStatus; | 
|---|
| 135 | RegexPattern                  *fRXPat; | 
|---|
| 136 | UParseError                   *fParseErr; | 
|---|
| 137 |  | 
|---|
| 138 | // | 
|---|
| 139 | //  Data associated with low level character scanning | 
|---|
| 140 | // | 
|---|
| 141 | int64_t                       fScanIndex;        // Index of current character being processed | 
|---|
| 142 | //   in the rule input string. | 
|---|
| 143 | UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region | 
|---|
| 144 | UBool                         fInBackslashQuote; // Scan is between a '\' and the following char. | 
|---|
| 145 | UBool                         ;      // When scan is just after '(?',  inhibit #... to | 
|---|
| 146 | //   end of line comments, in favor of (?#...) comments. | 
|---|
| 147 | int64_t                       fLineNum;          // Line number in input file. | 
|---|
| 148 | int64_t                       fCharNum;          // Char position within the line. | 
|---|
| 149 | UChar32                       fLastChar;         // Previous char, needed to count CR-LF | 
|---|
| 150 | //   as a single line, not two. | 
|---|
| 151 | UChar32                       fPeekChar;         // Saved char, if we've scanned ahead. | 
|---|
| 152 |  | 
|---|
| 153 |  | 
|---|
| 154 | RegexPatternChar              fC;                // Current char for parse state machine | 
|---|
| 155 | //   processing. | 
|---|
| 156 |  | 
|---|
| 157 | // | 
|---|
| 158 | //   Data for the state machine that parses the regular expression. | 
|---|
| 159 | // | 
|---|
| 160 | RegexTableEl                  **fStateTable;     // State Transition Table for regex Rule | 
|---|
| 161 | //   parsing.  index by p[state][char-class] | 
|---|
| 162 |  | 
|---|
| 163 | uint16_t                      fStack[kStackSize];  // State stack, holds state pushes | 
|---|
| 164 | int32_t                       fStackPtr;           //  and pops as specified in the state | 
|---|
| 165 | //  transition rules. | 
|---|
| 166 |  | 
|---|
| 167 | // | 
|---|
| 168 | //  Data associated with the generation of the pcode for the match engine | 
|---|
| 169 | // | 
|---|
| 170 | int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.) | 
|---|
| 171 | //   Always has high bit (31) set so that flag values | 
|---|
| 172 | //   on the paren stack are distinguished from relocatable | 
|---|
| 173 | //   pcode addresses. | 
|---|
| 174 | int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state | 
|---|
| 175 | //   until last flag is scanned. | 
|---|
| 176 | UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx | 
|---|
| 177 |  | 
|---|
| 178 | UnicodeString                 fLiteralChars;     // Literal chars or strings from the pattern are accumulated here. | 
|---|
| 179 | //   Once completed, meaning that some non-literal pattern | 
|---|
| 180 | //   construct is encountered, the appropriate opcodes | 
|---|
| 181 | //   to match the literal will be generated, and this | 
|---|
| 182 | //   string will be cleared. | 
|---|
| 183 |  | 
|---|
| 184 | int64_t                       fPatternLength;    // Length of the input pattern string. | 
|---|
| 185 |  | 
|---|
| 186 | UVector32                     fParenStack;       // parentheses stack.  Each frame consists of | 
|---|
| 187 | //   the positions of compiled pattern operations | 
|---|
| 188 | //   needing fixup, followed by negative value.  The | 
|---|
| 189 | //   first entry in each frame is the position of the | 
|---|
| 190 | //   spot reserved for use when a quantifier | 
|---|
| 191 | //   needs to add a SAVE at the start of a (block) | 
|---|
| 192 | //   The negative value (-1, -2,...) indicates | 
|---|
| 193 | //   the kind of paren that opened the frame.  Some | 
|---|
| 194 | //   need special handling on close. | 
|---|
| 195 |  | 
|---|
| 196 |  | 
|---|
| 197 | int32_t                       fMatchOpenParen;   // The position in the compiled pattern | 
|---|
| 198 | //   of the slot reserved for a state save | 
|---|
| 199 | //   at the start of the most recently processed | 
|---|
| 200 | //   parenthesized block. Updated when processing | 
|---|
| 201 | //   a close to the location for the corresponding open. | 
|---|
| 202 |  | 
|---|
| 203 | int32_t                       fMatchCloseParen;  // The position in the pattern of the first | 
|---|
| 204 | //   location after the most recently processed | 
|---|
| 205 | //   parenthesized block. | 
|---|
| 206 |  | 
|---|
| 207 | int32_t                       fIntervalLow;      // {lower, upper} interval quantifier values. | 
|---|
| 208 | int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is | 
|---|
| 209 | //   initially scanned.  Each new interval | 
|---|
| 210 | //   encountered overwrites these values. | 
|---|
| 211 | //   -1 for the upper interval value means none | 
|---|
| 212 | //   was specified (unlimited occurences.) | 
|---|
| 213 |  | 
|---|
| 214 | int64_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a | 
|---|
| 215 | //   pattern, valid while remainder of name is | 
|---|
| 216 | //   scanned. | 
|---|
| 217 |  | 
|---|
| 218 | UStack                        fSetStack;         // Stack of UnicodeSets, used while evaluating | 
|---|
| 219 | //   (at compile time) set expressions within | 
|---|
| 220 | //   the pattern. | 
|---|
| 221 | UStack                        fSetOpStack;       // Stack of pending set operators (&&, --, union) | 
|---|
| 222 |  | 
|---|
| 223 | UChar32                       fLastSetLiteral;   // The last single code point added to a set. | 
|---|
| 224 | //   needed when "-y" is scanned, and we need | 
|---|
| 225 | //   to turn "x-y" into a range. | 
|---|
| 226 |  | 
|---|
| 227 | UnicodeString                *fCaptureName;      // Named Capture, the group name is built up | 
|---|
| 228 | //   in this string while being scanned. | 
|---|
| 229 | }; | 
|---|
| 230 |  | 
|---|
| 231 | // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] | 
|---|
| 232 | //   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. | 
|---|
| 233 |  | 
|---|
| 234 | enum SetOperations { | 
|---|
| 235 | setStart         = 0 << 16 | 1, | 
|---|
| 236 | setEnd           = 1 << 16 | 2, | 
|---|
| 237 | setNegation      = 2 << 16 | 3, | 
|---|
| 238 | setCaseClose     = 2 << 16 | 9, | 
|---|
| 239 | setDifference2   = 3 << 16 | 4,    // '--' set difference operator | 
|---|
| 240 | setIntersection2 = 3 << 16 | 5,    // '&&' set intersection operator | 
|---|
| 241 | setUnion         = 4 << 16 | 6,    // implicit union of adjacent items | 
|---|
| 242 | setDifference1   = 4 << 16 | 7,    // '-', single dash difference op, for compatibility with old UnicodeSet. | 
|---|
| 243 | setIntersection1 = 4 << 16 | 8     // '&', single amp intersection op, for compatibility with old UnicodeSet. | 
|---|
| 244 | }; | 
|---|
| 245 |  | 
|---|
| 246 | U_NAMESPACE_END | 
|---|
| 247 | #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS | 
|---|
| 248 | #endif   // RBBISCAN_H | 
|---|
| 249 |  | 
|---|