1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbiscan.h |
5 | // |
6 | // Copyright (C) 2002-2016, International Business Machines Corporation and others. |
7 | // All Rights Reserved. |
8 | // |
9 | // This file contains declarations for class RBBIRuleScanner |
10 | // |
11 | |
12 | |
13 | #ifndef RBBISCAN_H |
14 | #define RBBISCAN_H |
15 | |
16 | #include "unicode/utypes.h" |
17 | #include "unicode/uobject.h" |
18 | #include "unicode/rbbi.h" |
19 | #include "unicode/uniset.h" |
20 | #include "unicode/parseerr.h" |
21 | #include "uhash.h" |
22 | #include "uvector.h" |
23 | #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
24 | // looks up references to $variables within a set. |
25 | #include "rbbinode.h" |
26 | #include "rbbirpt.h" |
27 | |
28 | U_NAMESPACE_BEGIN |
29 | |
30 | class RBBIRuleBuilder; |
31 | class RBBISymbolTable; |
32 | |
33 | |
34 | //-------------------------------------------------------------------------------- |
35 | // |
36 | // class RBBIRuleScanner does the lowest level, character-at-a-time |
37 | // scanning of break iterator rules. |
38 | // |
39 | // The output of the scanner is parse trees for |
40 | // the rule expressions and a list of all Unicode Sets |
41 | // encountered. |
42 | // Instances cannot be copied: the copy constructor and assignment operator are declared private. |
43 | //-------------------------------------------------------------------------------- |
44 | |
45 | class RBBIRuleScanner : public UMemory { |
46 | public: |
47 | |
48 | enum { |
49 | kStackSize = 100 // The size of the state stack for |
50 | }; // rules parsing. Corresponds roughly |
51 | // to the depth of parentheses nesting |
52 | // that is allowed in the rules. Also the capacity of both fStack and fNodeStack. |
53 | |
54 | struct RBBIRuleChar { |
55 | UChar32 fChar; |
56 | UBool fEscaped; |
57 | RBBIRuleChar() : fChar(0), fEscaped(FALSE) {} |
58 | }; |
59 | |
60 | RBBIRuleScanner(RBBIRuleBuilder *rb); |
61 | |
62 | |
63 | virtual ~RBBIRuleScanner(); |
64 | |
65 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. |
66 | // Delivered through c; the function itself returns nothing. |
67 | |
68 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. |
69 | // Only a single character may be pushed. NOTE(review): meaning of the UBool result is not stated here. |
70 | |
71 | void parse(); // Parse the rules, generating two parse |
72 | // trees, one each for the forward and |
73 | // reverse rules, |
74 | // and a list of UnicodeSets encountered. |
75 | |
76 | /** |
77 | * Return a rules string without unnecessary |
78 | * characters. |
79 | */ |
80 | static UnicodeString stripRules(const UnicodeString &rules); |
81 | private: |
82 | |
83 | UBool doParseActions(int32_t a); |
84 | void error(UErrorCode e); // error reporting convenience function. |
85 | void fixOpStack(RBBINode::OpPrecedence p); |
86 | // findSetFor(): setToAdopt is optional (defaults to NULL); presumably ownership passes to the scanner - TODO confirm. |
87 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); |
88 | |
89 | UChar32 nextCharLL(); |
90 | #ifdef RBBI_DEBUG |
91 | void printNodeStack(const char *title); |
92 | #endif |
93 | RBBINode *pushNewNode(RBBINode::NodeType t); |
94 | void scanSet(); |
95 | |
96 | |
97 | RBBIRuleBuilder *fRB; // The rule builder that we are part of. |
98 | |
99 | int32_t fScanIndex; // Index of current character being processed |
100 | // in the rule input string. |
101 | int32_t fNextIndex; // Index of the next character, which |
102 | // is the first character not yet scanned. |
103 | UBool fQuoteMode; // Scan is in a 'quoted region' |
104 | int32_t fLineNum; // Line number in input file. |
105 | int32_t fCharNum; // Char position within the line. |
106 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
107 | // as a single line, not two. |
108 | |
109 | RBBIRuleChar fC; // Current char for parse state machine |
110 | // processing. |
111 | UnicodeString fVarName; // $variableName, valid when we've just |
112 | // scanned one. |
113 | |
114 | RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule |
115 | // parsing. Indexed as p[state][char-class] |
116 | |
117 | uint16_t fStack[kStackSize]; // State stack, holds state pushes |
118 | int32_t fStackPtr; // and pops as specified in the state |
119 | // transition rules. |
120 | |
121 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created |
122 | // during the parse of a rule. fNodeStackPtr (below) is presumably the index of its top entry - TODO confirm. |
123 | int32_t fNodeStackPtr; |
124 | |
125 | |
126 | UBool fReverseRule; // True if the rule currently being scanned |
127 | // is a reverse direction rule (if it |
128 | // starts with a '!') |
129 | |
130 | UBool fLookAheadRule; // True if the rule includes a '/' |
131 | // somewhere within it. |
132 | |
133 | UBool fNoChainInRule; // True if the current rule starts with a '^'. |
134 | |
135 | RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of |
136 | // $variable symbols. |
137 | |
138 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to |
139 | // the sets created while parsing rules. |
140 | // The key is the string used for creating |
141 | // the set. |
142 | |
143 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during |
144 | // the scanning of RBBI rules. The |
145 | // indices for these are assigned by the |
146 | // perl script that builds the state tables. |
147 | // See rbbirpt.h. |
148 | |
149 | int32_t fRuleNum; // Counts each rule as it is scanned. |
150 | |
151 | int32_t fOptionStart; // Input index of start of a !!option |
152 | // keyword, while being scanned. |
153 | |
154 | UnicodeSet *gRuleSet_rule_char; /* NOTE: despite the g prefix, the four gRuleSet_* pointers are per-instance data members, not globals. */ |
155 | UnicodeSet *gRuleSet_white_space; |
156 | UnicodeSet *gRuleSet_name_char; |
157 | UnicodeSet *gRuleSet_name_start_char; |
158 | |
159 | RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class |
160 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class |
161 | }; |
162 | |
163 | U_NAMESPACE_END |
164 | |
165 | #endif |
166 | |