| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | // |
| 4 | // rbbiscan.h |
| 5 | // |
| 6 | // Copyright (C) 2002-2016, International Business Machines Corporation and others. |
| 7 | // All Rights Reserved. |
| 8 | // |
| 9 | // This file contains declarations for class RBBIRuleScanner |
| 10 | // |
| 11 | |
| 12 | |
| 13 | #ifndef RBBISCAN_H |
| 14 | #define RBBISCAN_H |
| 15 | |
| 16 | #include "unicode/utypes.h" |
| 17 | #include "unicode/uobject.h" |
| 18 | #include "unicode/rbbi.h" |
| 19 | #include "unicode/uniset.h" |
| 20 | #include "unicode/parseerr.h" |
| 21 | #include "uhash.h" |
| 22 | #include "uvector.h" |
| 23 | #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
| 24 | // looks up references to $variables within a set. |
| 25 | #include "rbbinode.h" |
| 26 | #include "rbbirpt.h" |
| 27 | |
| 28 | U_NAMESPACE_BEGIN |
| 29 | |
| 30 | class RBBIRuleBuilder; |
| 31 | class RBBISymbolTable; |
| 32 | |
| 33 | |
| 34 | //-------------------------------------------------------------------------------- |
| 35 | // |
| 36 | // class RBBIRuleScanner does the lowest level, character-at-a-time |
| 37 | // scanning of break iterator rules. |
| 38 | // |
| 39 | // The output of the scanner is parse trees for |
| 40 | // the rule expressions and a list of all Unicode Sets |
| 41 | // encountered. |
| 42 | // |
| 43 | //-------------------------------------------------------------------------------- |
| 44 | |
| 45 | class RBBIRuleScanner : public UMemory { |
| 46 | public: |
| 47 | |
| 48 | enum { |
| 49 | kStackSize = 100 // Capacity of the state stack used for |
| 50 | }; // rules parsing; it also sizes the fStack and |
| 51 | // fNodeStack arrays below. Corresponds roughly |
| 52 | // to the allowed depth of parentheses nesting. |
| 53 | |
| 54 | struct RBBIRuleChar { |
| 55 | UChar32 fChar; |
| 56 | UBool fEscaped; |
| 57 | RBBIRuleChar() : fChar(0), fEscaped(FALSE) {} |
| 58 | }; |
| 59 | |
| 60 | RBBIRuleScanner(RBBIRuleBuilder *rb); |
| 61 | |
| 62 | |
| 63 | virtual ~RBBIRuleScanner(); |
| 64 | |
| 65 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. |
| 66 | // Declared void: nothing is returned. The result is presumably stored in c - confirm in the implementation. |
| 67 | |
| 68 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. |
| 69 | // Only a single character may be pushed. |
| 70 | |
| 71 | void parse(); // Parse the rules, generating two parse |
| 72 | // trees, one each for the forward and |
| 73 | // reverse rules, |
| 74 | // and a list of the UnicodeSets encountered. |
| 75 | |
| 76 | int32_t numRules(); // Return the number of rules that have been seen. |
| 77 | |
| 78 | /** |
| 79 | * Return a copy of the rules string without unnecessary |
| 80 | * characters. Static; the rules argument is const and is not modified. |
| 81 | */ |
| 82 | static UnicodeString stripRules(const UnicodeString &rules); |
| 83 | private: |
| 84 | |
| 85 | UBool doParseActions(int32_t a); |
| 86 | void error(UErrorCode e); // Error-reporting convenience function. |
| 87 | void fixOpStack(RBBINode::OpPrecedence p); |
| 88 | // For findSetFor() below: setToAdopt is optional and defaults to NULL. |
| 89 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); |
| 90 | |
| 91 | UChar32 nextCharLL(); |
| 92 | #ifdef RBBI_DEBUG |
| 93 | void printNodeStack(const char *title); |
| 94 | #endif |
| 95 | RBBINode *pushNewNode(RBBINode::NodeType t); |
| 96 | void scanSet(); |
| 97 | |
| 98 | |
| 99 | RBBIRuleBuilder *fRB; // The rule builder that we are part of. |
| 100 | |
| 101 | int32_t fScanIndex; // Index of the current character being processed |
| 102 | // in the rule input string. |
| 103 | int32_t fNextIndex; // Index of the next character, which |
| 104 | // is the first character not yet scanned. |
| 105 | UBool fQuoteMode; // True while the scan is in a 'quoted region'. |
| 106 | int32_t fLineNum; // Line number in the input file. |
| 107 | int32_t fCharNum; // Char position within the line. |
| 108 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
| 109 | // as a single line, not two. |
| 110 | |
| 111 | RBBIRuleChar fC; // Current char for parse state machine |
| 112 | // processing. |
| 113 | UnicodeString fVarName; // $variableName, valid when we've just |
| 114 | // scanned one. |
| 115 | |
| 116 | RBBIRuleTableEl **fStateTable; // State transition table for RBBI rule |
| 117 | // parsing, indexed as p[state][char-class]. |
| 118 | |
| 119 | uint16_t fStack[kStackSize]; // State stack, holds state pushes |
| 120 | int32_t fStackPtr; // and pops as specified in the state |
| 121 | // transition rules. |
| 122 | |
| 123 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created |
| 124 | // during the parse of a rule. |
| 125 | int32_t fNodeStackPtr; |
| 126 | |
| 127 | |
| 128 | UBool fReverseRule; // True if the rule currently being scanned |
| 129 | // is a reverse direction rule (if it |
| 130 | // starts with a '!'). |
| 131 | |
| 132 | UBool fLookAheadRule; // True if the rule includes a '/' |
| 133 | // somewhere within it. |
| 134 | |
| 135 | UBool fNoChainInRule; // True if the current rule starts with a '^'. |
| 136 | |
| 137 | RBBISymbolTable *fSymbolTable; // Symbol table, holds definitions of |
| 138 | // $variable symbols. |
| 139 | |
| 140 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to |
| 141 | // the sets created while parsing rules. |
| 142 | // The key is the string used for creating |
| 143 | // the set. |
| 144 | |
| 145 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during |
| 146 | // the scanning of RBBI rules. The |
| 147 | // indices for these are assigned by the |
| 148 | // Perl script that builds the state tables. |
| 149 | // See rbbirpt.h. |
| 150 | |
| 151 | int32_t fRuleNum; // Counts each rule as it is scanned. |
| 152 | |
| 153 | int32_t fOptionStart; // Input index of the start of a !!option |
| 154 | // keyword, while it is being scanned. |
| 155 | |
| 156 | UnicodeSet *gRuleSet_rule_char; |
| 157 | UnicodeSet *gRuleSet_white_space; |
| 158 | UnicodeSet *gRuleSet_name_char; |
| 159 | UnicodeSet *gRuleSet_name_start_char; |
| 160 | |
| 161 | RBBIRuleScanner(const RBBIRuleScanner &other); // Private declaration: forbid copying of this class |
| 162 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // Private declaration: forbid copying of this class |
| 163 | }; |
| 164 | |
| 165 | U_NAMESPACE_END |
| 166 | |
| 167 | #endif |
| 168 | |