| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | // | 
|---|
| 4 | //  rbbiscan.h | 
|---|
| 5 | // | 
|---|
| 6 | //  Copyright (C) 2002-2016, International Business Machines Corporation and others. | 
|---|
| 7 | //  All Rights Reserved. | 
|---|
| 8 | // | 
|---|
| 9 | //  This file contains declarations for class RBBIRuleScanner | 
|---|
| 10 | // | 
|---|
| 11 |  | 
|---|
| 12 |  | 
|---|
| 13 | #ifndef RBBISCAN_H | 
|---|
| 14 | #define RBBISCAN_H | 
|---|
| 15 |  | 
|---|
| 16 | #include "unicode/utypes.h" | 
|---|
| 17 | #include "unicode/uobject.h" | 
|---|
| 18 | #include "unicode/rbbi.h" | 
|---|
| 19 | #include "unicode/uniset.h" | 
|---|
| 20 | #include "unicode/parseerr.h" | 
|---|
| 21 | #include "uhash.h" | 
|---|
| 22 | #include "uvector.h" | 
|---|
| 23 | #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that | 
|---|
| 24 | //    looks up references to $variables within a set. | 
|---|
| 25 | #include "rbbinode.h" | 
|---|
| 26 | #include "rbbirpt.h" | 
|---|
| 27 |  | 
|---|
| 28 | U_NAMESPACE_BEGIN | 
|---|
| 29 |  | 
|---|
| 30 | class   RBBIRuleBuilder; | 
|---|
| 31 | class   RBBISymbolTable; | 
|---|
| 32 |  | 
|---|
| 33 |  | 
|---|
| 34 | //-------------------------------------------------------------------------------- | 
|---|
| 35 | // | 
|---|
| 36 | //  class RBBIRuleScanner does the lowest level, character-at-a-time | 
|---|
| 37 | //                        scanning of break iterator rules. | 
|---|
| 38 | // | 
|---|
| 39 | //                        The output of the scanner is parse trees for | 
|---|
| 40 | //                        the rule expressions and a list of all Unicode Sets | 
|---|
| 41 | //                        encountered. | 
|---|
| 42 | // | 
|---|
| 43 | //-------------------------------------------------------------------------------- | 
|---|
| 44 |  | 
|---|
| 45 | class RBBIRuleScanner : public UMemory { | 
|---|
| 46 | public: | 
|---|
| 47 | /* Public interface: the stack-size constant, the scanned-character type, construction, character input, parse() and rule counting. */ | 
|---|
| 48 | enum { | 
|---|
| 49 | kStackSize = 100            // The size of the state stack for | 
|---|
| 50 | };                              //   rules parsing.  Corresponds roughly | 
|---|
| 51 | //   to the depth of parentheses nesting | 
|---|
| 52 | //   that is allowed in the rules.  The same constant also sizes fNodeStack. | 
|---|
| 53 | /* RBBIRuleChar: one scanned character, held as a code point (fChar) plus a flag (fEscaped).  The default constructor sets them to 0 and false.  fEscaped presumably marks characters that were quoted or escaped in the rule text - TODO confirm in rbbiscan.cpp. */ | 
|---|
| 54 | struct RBBIRuleChar { | 
|---|
| 55 | UChar32             fChar; | 
|---|
| 56 | UBool               fEscaped; | 
|---|
| 57 | RBBIRuleChar() : fChar(0), fEscaped(false) {} | 
|---|
| 58 | }; | 
|---|
| 59 | /* Construct a scanner for the rule builder rb (presumably the pointer kept in fRB - TODO confirm). */ | 
|---|
| 60 | RBBIRuleScanner(RBBIRuleBuilder  *rb); | 
|---|
| 61 |  | 
|---|
| 62 |  | 
|---|
| 63 | virtual    ~RBBIRuleScanner(); | 
|---|
| 64 |  | 
|---|
| 65 | void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream; the result is written to c. | 
|---|
| 66 | //   NOTE(review): the function returns void, so the former "return false if at end" remark was stale.  How end of input is reported through c is not visible in this header - TODO confirm in rbbiscan.cpp. | 
|---|
| 67 |  | 
|---|
| 68 | UBool       push(const RBBIRuleChar &c);        // Push (unget) one character. | 
|---|
| 69 | //   Only a single character may be pushed.  The meaning of the UBool result is not visible here - TODO confirm. | 
|---|
| 70 |  | 
|---|
| 71 | void        parse();                            // Parse the rules, generating two parse | 
|---|
| 72 | //   trees, one each for the forward and | 
|---|
| 73 | //   reverse rules, | 
|---|
| 74 | //   and a list of UnicodeSets encountered. | 
|---|
| 75 |  | 
|---|
| 76 | int32_t     numRules();                         // Return the number of rules that have been seen. | 
|---|
| 77 |  | 
|---|
| 78 | /** | 
|---|
| 79 | * Return a rules string without unnecessary | 
|---|
| 80 | * characters.  Static member: it takes the rule text by const reference and returns the result by value.  Which characters count as unnecessary is decided by the implementation, which is not visible in this header. | 
|---|
| 81 | */ | 
|---|
| 82 | static UnicodeString stripRules(const UnicodeString &rules); | 
|---|
| 83 | private: | 
|---|
| 84 | /* doParseActions(): carry out parse action number a.  The action numbering and the meaning of the UBool result are defined outside this header (presumably rbbirpt.h and rbbiscan.cpp) - TODO confirm. */ | 
|---|
| 85 | UBool       doParseActions(int32_t a); | 
|---|
| 86 | void        error(UErrorCode e);                   // error reporting convenience function. | 
|---|
| 87 | void        fixOpStack(RBBINode::OpPrecedence p); | 
|---|
| 88 | //   findSetFor() (next declaration): takes a string s, an RBBINode and an optional UnicodeSet that defaults to nullptr.  Presumably s is the text the set was created from (compare the fSetTable key), and the name setToAdopt suggests ownership passes to the callee - TODO confirm both in rbbiscan.cpp. | 
|---|
| 89 | void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr); | 
|---|
| 90 | /* nextCharLL(): returns a single UChar32; presumably the low-level fetch underneath nextChar() - TODO confirm. */ | 
|---|
| 91 | UChar32     nextCharLL(); | 
|---|
| 92 | #ifdef RBBI_DEBUG | 
|---|
| 93 | void        printNodeStack(const char *title); | 
|---|
| 94 | #endif | 
|---|
| 95 | RBBINode    *pushNewNode(RBBINode::NodeType  t); | 
|---|
| 96 | void        scanSet(); | 
|---|
| 97 | /* pushNewNode() returns an RBBINode pointer for the requested node type; presumably the node is also placed on fNodeStack - TODO confirm.  scanSet() takes no arguments and returns nothing, so any result is delivered through side effects. */ | 
|---|
| 98 |  | 
|---|
| 99 | RBBIRuleBuilder               *fRB;              // The rule builder that we are part of. | 
|---|
| 100 |  | 
|---|
| 101 | int32_t                       fScanIndex;        // Index of current character being processed | 
|---|
| 102 | //   in the rule input string. | 
|---|
| 103 | int32_t                       fNextIndex;        // Index of the next character, which | 
|---|
| 104 | //   is the first character not yet scanned. | 
|---|
| 105 | UBool                         fQuoteMode;        // Scan is in a 'quoted region' | 
|---|
| 106 | int32_t                       fLineNum;          // Line number in input file. | 
|---|
| 107 | int32_t                       fCharNum;          // Char position within the line. | 
|---|
| 108 | UChar32                       fLastChar;         // Previous char, needed to count CR-LF | 
|---|
| 109 | //   as a single line, not two. | 
|---|
| 110 |  | 
|---|
| 111 | RBBIRuleChar                  fC;                // Current char for parse state machine | 
|---|
| 112 | //   processing. | 
|---|
| 113 | UnicodeString                 fVarName;          // $variableName, valid when we've just | 
|---|
| 114 | //   scanned one. | 
|---|
| 115 |  | 
|---|
| 116 | RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule | 
|---|
| 117 | //   parsing.  index by p[state][char-class] | 
|---|
| 118 |  | 
|---|
| 119 | uint16_t                      fStack[kStackSize];  // State stack, holds state pushes | 
|---|
| 120 | int32_t                       fStackPtr;           //  and pops as specified in the state | 
|---|
| 121 | //  transition rules. | 
|---|
| 122 |  | 
|---|
| 123 | RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created | 
|---|
| 124 | //  during the parse of a rule.  fNodeStackPtr (next member) is presumably the index of its top entry - TODO confirm. | 
|---|
| 125 | int32_t                        fNodeStackPtr; | 
|---|
| 126 |  | 
|---|
| 127 |  | 
|---|
| 128 | UBool                          fReverseRule;     // True if the rule currently being scanned | 
|---|
| 129 | //  is a reverse direction rule (if it | 
|---|
| 130 | //  starts with a '!') | 
|---|
| 131 |  | 
|---|
| 132 | UBool                          fLookAheadRule;   // True if the rule includes a '/' | 
|---|
| 133 | //   somewhere within it. | 
|---|
| 134 |  | 
|---|
| 135 | UBool                          fNoChainInRule;   // True if the current rule starts with a '^'. | 
|---|
| 136 |  | 
|---|
| 137 | RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of | 
|---|
| 138 | //   $variable symbols. | 
|---|
| 139 |  | 
|---|
| 140 | UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to | 
|---|
| 141 | //   the sets created while parsing rules. | 
|---|
| 142 | //   The key is the string used for creating | 
|---|
| 143 | //   the set. | 
|---|
| 144 |  | 
|---|
| 145 | UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during | 
|---|
| 146 | //  the scanning of RBBI rules.  The | 
|---|
| 147 | //  indices for these are assigned by the | 
|---|
| 148 | //  perl script that builds the state tables. | 
|---|
| 149 | //  See rbbirpt.h. | 
|---|
| 150 |  | 
|---|
| 151 | int32_t                        fRuleNum;         // Counts each rule as it is scanned. | 
|---|
| 152 |  | 
|---|
| 153 | int32_t                        fOptionStart;     // Input index of start of a !!option | 
|---|
| 154 | //   keyword, while being scanned. | 
|---|
| 155 | /* Four UnicodeSet pointers, one per character class named in the identifiers below.  Despite the g prefix they are ordinary per-object members.  Ownership and lifetime are not visible in this header - TODO confirm in rbbiscan.cpp. */ | 
|---|
| 156 | UnicodeSet *gRuleSet_rule_char; | 
|---|
| 157 | UnicodeSet *gRuleSet_white_space; | 
|---|
| 158 | UnicodeSet *gRuleSet_name_char; | 
|---|
| 159 | UnicodeSet *gRuleSet_name_start_char; | 
|---|
| 160 |  | 
|---|
| 161 | RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class | 
|---|
| 162 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class | 
|---|
| 163 | }; | 
|---|
| 164 |  | 
|---|
| 165 | U_NAMESPACE_END | 
|---|
| 166 |  | 
|---|
| 167 | #endif | 
|---|
| 168 |  | 
|---|