1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbiscan.h |
5 | // |
6 | // Copyright (C) 2002-2016, International Business Machines Corporation and others. |
7 | // All Rights Reserved. |
8 | // |
9 | // This file contains declarations for class RBBIRuleScanner |
10 | // |
11 | |
12 | |
13 | #ifndef RBBISCAN_H |
14 | #define RBBISCAN_H |
15 | |
16 | #include "unicode/utypes.h" |
17 | #include "unicode/uobject.h" |
18 | #include "unicode/rbbi.h" |
19 | #include "unicode/uniset.h" |
20 | #include "unicode/parseerr.h" |
21 | #include "uhash.h" |
22 | #include "uvector.h" |
23 | #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
24 | // looks up references to $variables within a set. |
25 | #include "rbbinode.h" |
26 | #include "rbbirpt.h" |
27 | |
28 | U_NAMESPACE_BEGIN |
29 | |
30 | class RBBIRuleBuilder; |
31 | class RBBISymbolTable; |
32 | |
33 | |
34 | //-------------------------------------------------------------------------------- |
35 | // |
36 | // class RBBIRuleScanner does the lowest level, character-at-a-time |
37 | // scanning of break iterator rules. |
38 | // |
39 | // The output of the scanner is parse trees for |
40 | // the rule expressions and a list of all Unicode Sets |
41 | // encountered. |
42 | // |
43 | //-------------------------------------------------------------------------------- |
44 | |
45 | class RBBIRuleScanner : public UMemory { |
46 | public: |
47 | |
48 | enum { |
49 | kStackSize = 100 // The size of the state stack for |
50 | }; // rules parsing. Corresponds roughly |
51 | // to the depth of parentheses nesting |
52 | // that is allowed in the rules. Also sizes fNodeStack. |
53 | |
54 | struct RBBIRuleChar { |
55 | UChar32 fChar; |
56 | UBool fEscaped; |
57 | RBBIRuleChar() : fChar(0), fEscaped(false) {} |
58 | }; |
59 | |
60 | RBBIRuleScanner(RBBIRuleBuilder *rb); |
61 | |
62 | |
63 | virtual ~RBBIRuleScanner(); |
64 | |
65 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. |
66 | // It is delivered through c; nothing is returned. |
67 | |
68 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. |
69 | // Only a single character may be pushed. |
70 | |
71 | void parse(); // Parse the rules, generating two parse |
72 | // trees, one each for the forward and |
73 | // reverse rules, |
74 | // and a list of UnicodeSets encountered. |
75 | |
76 | int32_t numRules(); // Return the number of rules that have been seen. |
77 | |
78 | /** |
79 | * Return a rules string without unnecessary |
80 | * characters. |
81 | */ |
82 | static UnicodeString stripRules(const UnicodeString &rules); |
83 | private: |
84 | |
85 | UBool doParseActions(int32_t a); |
86 | void error(UErrorCode e); // error reporting convenience function. |
87 | void fixOpStack(RBBINode::OpPrecedence p); |
88 | // For findSetFor: setToAdopt is optional and defaults to nullptr. |
89 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr); |
90 | |
91 | UChar32 nextCharLL(); |
92 | #ifdef RBBI_DEBUG |
93 | void printNodeStack(const char *title); |
94 | #endif |
95 | RBBINode *pushNewNode(RBBINode::NodeType t); |
96 | void scanSet(); |
97 | |
98 | |
99 | RBBIRuleBuilder *fRB; // The rule builder that we are part of. |
100 | |
101 | int32_t fScanIndex; // Index of current character being processed |
102 | // in the rule input string. |
103 | int32_t fNextIndex; // Index of the next character, which |
104 | // is the first character not yet scanned. |
105 | UBool fQuoteMode; // Scan is in a 'quoted region' |
106 | int32_t fLineNum; // Line number in input file. |
107 | int32_t fCharNum; // Char position within the line. |
108 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
109 | // as a single line, not two. |
110 | |
111 | RBBIRuleChar fC; // Current char for parse state machine |
112 | // processing. |
113 | UnicodeString fVarName; // $variableName, valid when we've just |
114 | // scanned one. |
115 | |
116 | RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule |
117 | // parsing. index by p[state][char-class] |
118 | |
119 | uint16_t fStack[kStackSize]; // State stack, holds state pushes |
120 | int32_t fStackPtr; // and pops as specified in the state |
121 | // transition rules. |
122 | |
123 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created |
124 | // during the parse of a rule |
125 | int32_t fNodeStackPtr; |
126 | |
127 | |
128 | UBool fReverseRule; // True if the rule currently being scanned |
129 | // is a reverse direction rule (if it |
130 | // starts with a '!') |
131 | |
132 | UBool fLookAheadRule; // True if the rule includes a '/' |
133 | // somewhere within it. |
134 | |
135 | UBool fNoChainInRule; // True if the current rule starts with a '^'. |
136 | |
137 | RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of |
138 | // $variable symbols. |
139 | |
140 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to |
141 | // the sets created while parsing rules. |
142 | // The key is the string used for creating |
143 | // the set. |
144 | |
145 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during |
146 | // the scanning of RBBI rules. The |
147 | // indices for these are assigned by the |
148 | // perl script that builds the state tables. |
149 | // See rbbirpt.h. |
150 | |
151 | int32_t fRuleNum; // Counts each rule as it is scanned. |
152 | |
153 | int32_t fOptionStart; // Input index of start of a !!option |
154 | // keyword, while being scanned. |
155 | |
156 | UnicodeSet *gRuleSet_rule_char; |
157 | UnicodeSet *gRuleSet_white_space; |
158 | UnicodeSet *gRuleSet_name_char; |
159 | UnicodeSet *gRuleSet_name_start_char; |
160 | |
161 | RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class |
162 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class |
163 | }; |
164 | |
165 | U_NAMESPACE_END |
166 | |
167 | #endif |
168 | |