| 1 | // © 2016 and later: Unicode, Inc. and others. | 
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
| 3 | // | 
| 4 | //  rbbiscan.h | 
| 5 | // | 
| 6 | //  Copyright (C) 2002-2016, International Business Machines Corporation and others. | 
| 7 | //  All Rights Reserved. | 
| 8 | // | 
| 9 | //  This file contains declarations for class RBBIRuleScanner | 
| 10 | // | 
| 11 |  | 
| 12 |  | 
| 13 | #ifndef RBBISCAN_H | 
| 14 | #define RBBISCAN_H | 
| 15 |  | 
| 16 | #include "unicode/utypes.h" | 
| 17 | #include "unicode/uobject.h" | 
| 18 | #include "unicode/rbbi.h" | 
| 19 | #include "unicode/uniset.h" | 
| 20 | #include "unicode/parseerr.h" | 
| 21 | #include "uhash.h" | 
| 22 | #include "uvector.h" | 
| 23 | #include "unicode/symtable.h"// For UnicodeSet parsing: declares the interface that | 
| 24 |                           //    looks up references to $variables within a set. | 
| 25 | #include "rbbinode.h" | 
| 26 | #include "rbbirpt.h" | 
| 27 |  | 
| 28 | U_NAMESPACE_BEGIN | 
| 29 |  | 
| 30 | class   RBBIRuleBuilder; | 
| 31 | class   RBBISymbolTable; | 
| 32 |  | 
| 33 |  | 
| 34 | //-------------------------------------------------------------------------------- | 
| 35 | // | 
| 36 | //  class RBBIRuleScanner does the lowest level, character-at-a-time | 
| 37 | //                        scanning of break iterator rules. | 
| 38 | // | 
| 39 | //                        The output of the scanner is parse trees for | 
| 40 | //                        the rule expressions and a list of all Unicode Sets | 
| 41 | //                        encountered. | 
| 42 | //  The class is not copyable: its copy constructor and assignment operator are declared private. | 
| 43 | //-------------------------------------------------------------------------------- | 
| 44 |  | 
| 45 | class RBBIRuleScanner : public UMemory { | 
| 46 | public: | 
| 47 |  | 
| 48 |     enum { | 
| 49 |         kStackSize = 100            // The size of the state stack for | 
| 50 |     };                              //   rules parsing.  Corresponds roughly | 
| 51 |                                     //   to the depth of parentheses nesting | 
| 52 |                                     //   that is allowed in the rules.  Also used as the size of the node stack (fNodeStack). | 
| 53 |  | 
| 54 |     struct RBBIRuleChar { | 
| 55 |         UChar32             fChar; | 
| 56 |         UBool               fEscaped; | 
| 57 |         RBBIRuleChar() : fChar(0), fEscaped(FALSE) {} | 
| 58 |     }; | 
| 59 |  | 
| 60 |     RBBIRuleScanner(RBBIRuleBuilder  *rb); | 
| 61 |  | 
| 62 |  | 
| 63 |     virtual    ~RBBIRuleScanner(); | 
| 64 |  | 
| 65 |     void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream. | 
| 66 |                                                     //   The result is delivered through c; there is no return value. | 
| 67 |  | 
| 68 |     UBool       push(const RBBIRuleChar &c);        // Push (unget) one character. | 
| 69 |                                                     //   Only a single character may be pushed. | 
| 70 |  | 
| 71 |     void        parse();                            // Parse the rules, generating two parse | 
| 72 |                                                     //   trees, one each for the forward and | 
| 73 |                                                     //   reverse rules, | 
| 74 |                                                     //   and a list of UnicodeSets encountered. | 
| 75 |  | 
| 76 |     int32_t     numRules();                         // Return the number of rules that have been seen. | 
| 77 |  | 
| 78 |     /** | 
| 79 |      * Return a rules string without unnecessary | 
| 80 |      * characters. | 
| 81 |      */ | 
| 82 |     static UnicodeString stripRules(const UnicodeString &rules); | 
| 83 | private: | 
| 84 |  | 
| 85 |     UBool       doParseActions(int32_t a); | 
| 86 |     void        error(UErrorCode e);                   // error reporting convenience function. | 
| 87 |     void        fixOpStack(RBBINode::OpPrecedence p); | 
| 88 |                                                        //   NOTE(review): presumably applies operator precedence p to the node stack - confirm against the definition. | 
| 89 |     void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); | 
| 90 |  | 
| 91 |     UChar32     nextCharLL(); | 
| 92 | #ifdef RBBI_DEBUG | 
| 93 |     void        printNodeStack(const char *title); | 
| 94 | #endif | 
| 95 |     RBBINode    *pushNewNode(RBBINode::NodeType  t); | 
| 96 |     void        scanSet(); | 
| 97 |  | 
| 98 |  | 
| 99 |     RBBIRuleBuilder               *fRB;              // The rule builder that we are part of. | 
| 100 |  | 
| 101 |     int32_t                       fScanIndex;        // Index of current character being processed | 
| 102 |                                                      //   in the rule input string. | 
| 103 |     int32_t                       fNextIndex;        // Index of the next character, which | 
| 104 |                                                      //   is the first character not yet scanned. | 
| 105 |     UBool                         fQuoteMode;        // Scan is in a 'quoted region' | 
| 106 |     int32_t                       fLineNum;          // Line number in input file. | 
| 107 |     int32_t                       fCharNum;          // Char position within the line. | 
| 108 |     UChar32                       fLastChar;         // Previous char, needed to count CR-LF | 
| 109 |                                                      //   as a single line, not two. | 
| 110 |  | 
| 111 |     RBBIRuleChar                  fC;                // Current char for parse state machine | 
| 112 |                                                      //   processing. | 
| 113 |     UnicodeString                 fVarName;          // $variableName, valid when we've just | 
| 114 |                                                      //   scanned one. | 
| 115 |  | 
| 116 |     RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule | 
| 117 |                                                      //   parsing.  index by p[state][char-class] | 
| 118 |  | 
| 119 |     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes | 
| 120 |     int32_t                       fStackPtr;           //  and pops as specified in the state | 
| 121 |                                                        //  transition rules. | 
| 122 |  | 
| 123 |     RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created | 
| 124 |                                                            //  during the parse of a rule | 
| 125 |     int32_t                        fNodeStackPtr; | 
| 126 |  | 
| 127 |  | 
| 128 |     UBool                          fReverseRule;     // True if the rule currently being scanned | 
| 129 |                                                      //  is a reverse direction rule (if it | 
| 130 |                                                      //  starts with a '!') | 
| 131 |  | 
| 132 |     UBool                          fLookAheadRule;   // True if the rule includes a '/' | 
| 133 |                                                      //   somewhere within it. | 
| 134 |  | 
| 135 |     UBool                          fNoChainInRule;   // True if the current rule starts with a '^'. | 
| 136 |  | 
| 137 |     RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of | 
| 138 |                                                      //   $variable symbols. | 
| 139 |  | 
| 140 |     UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to | 
| 141 |                                                      //   the sets created while parsing rules. | 
| 142 |                                                      //   The key is the string used for creating | 
| 143 |                                                      //   the set. | 
| 144 |  | 
| 145 |     UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during | 
| 146 |                                                      //  the scanning of RBBI rules.  The | 
| 147 |                                                      //  indices for these are assigned by the | 
| 148 |                                                      //  perl script that builds the state tables. | 
| 149 |                                                      //  See rbbirpt.h. | 
| 150 |  | 
| 151 |     int32_t                        fRuleNum;         // Counts each rule as it is scanned. | 
| 152 |  | 
| 153 |     int32_t                        fOptionStart;     // Input index of start of a !!option | 
| 154 |                                                      //   keyword, while being scanned. | 
| 155 |  | 
| 156 |     UnicodeSet *gRuleSet_rule_char; | 
| 157 |     UnicodeSet *gRuleSet_white_space; | 
| 158 |     UnicodeSet *gRuleSet_name_char; | 
| 159 |     UnicodeSet *gRuleSet_name_start_char; | 
| 160 |  | 
| 161 |     RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class | 
| 162 |     RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class | 
| 163 | }; | 
| 164 |  | 
| 165 | U_NAMESPACE_END | 
| 166 |  | 
| 167 | #endif | 
| 168 |  |