1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbirb.h |
5 | // |
6 | // Copyright (C) 2002-2008, International Business Machines Corporation and others. |
7 | // All Rights Reserved. |
8 | // |
9 | // This file contains declarations for several classes from the |
10 | // Rule Based Break Iterator rule builder. |
11 | // |
12 | |
13 | |
14 | #ifndef RBBIRB_H |
15 | #define RBBIRB_H |
16 | |
17 | #include "unicode/utypes.h" |
18 | |
19 | #if !UCONFIG_NO_BREAK_ITERATION |
20 | |
21 | #include <utility> |
22 | |
23 | #include "unicode/uobject.h" |
24 | #include "unicode/rbbi.h" |
25 | #include "unicode/uniset.h" |
26 | #include "unicode/parseerr.h" |
27 | #include "uhash.h" |
28 | #include "uvector.h" |
29 | #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
30 | // looks up references to $variables within a set. |
31 | |
32 | |
33 | U_NAMESPACE_BEGIN |
34 | |
// Forward declarations of rule-builder types whose definitions live elsewhere.
// Where this header uses them, it does so through pointers only.
class  RBBINode;
class  RBBIRuleScanner;
class  RBBISetBuilder;
class  RBBITableBuilder;
struct RBBIRuleTableEl;
40 | |
41 | |
42 | |
43 | //-------------------------------------------------------------------------------- |
44 | // |
45 | // RBBISymbolTable. Implements SymbolTable interface that is used by the |
46 | // UnicodeSet parser to resolve references to $variables. |
47 | // |
48 | //-------------------------------------------------------------------------------- |
49 | class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one |
50 | public: // of these structs for each entry. |
51 | RBBISymbolTableEntry(); |
52 | UnicodeString key; |
53 | RBBINode *val; |
54 | ~RBBISymbolTableEntry(); |
55 | |
56 | private: |
57 | RBBISymbolTableEntry(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class |
58 | RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class |
59 | }; |
60 | |
61 | |
62 | class RBBISymbolTable : public UMemory, public SymbolTable { |
63 | private: |
64 | const UnicodeString &fRules; |
65 | UHashtable *fHashTable; |
66 | RBBIRuleScanner *fRuleScanner; |
67 | |
68 | // These next two fields are part of the mechanism for passing references to |
69 | // already-constructed UnicodeSets back to the UnicodeSet constructor |
70 | // when the pattern includes $variable references. |
71 | const UnicodeString ffffString; // = "/uffff" |
72 | UnicodeSet *fCachedSetLookup; |
73 | |
74 | public: |
75 | // API inherited from class SymbolTable |
76 | virtual const UnicodeString* lookup(const UnicodeString& s) const override; |
77 | virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override; |
78 | virtual UnicodeString parseReference(const UnicodeString& text, |
79 | ParsePosition& pos, int32_t limit) const override; |
80 | |
81 | // Additional Functions |
82 | RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); |
83 | virtual ~RBBISymbolTable(); |
84 | |
85 | virtual RBBINode *lookupNode(const UnicodeString &key) const; |
86 | virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); |
87 | |
88 | #ifdef RBBI_DEBUG |
89 | virtual void rbbiSymtablePrint() const; |
90 | #else |
91 | // A do-nothing inline function for non-debug builds. Member funcs can't be empty |
92 | // or the call sites won't compile. |
93 | int32_t fFakeField; |
94 | #define rbbiSymtablePrint() fFakeField=0; |
95 | #endif |
96 | |
97 | private: |
98 | RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class |
99 | RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class |
100 | }; |
101 | |
102 | |
103 | //-------------------------------------------------------------------------------- |
104 | // |
105 | // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. |
106 | // |
107 | //-------------------------------------------------------------------------------- |
108 | class RBBIRuleBuilder : public UMemory { |
109 | public: |
110 | |
111 | // Create a rule based break iterator from a set of rules. |
112 | // This function is the main entry point into the rule builder. The |
113 | // public ICU API for creating RBBIs uses this function to do the actual work. |
114 | // |
115 | static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, |
116 | UParseError *parseError, |
117 | UErrorCode &status); |
118 | |
119 | public: |
120 | // The "public" functions and data members that appear below are accessed |
121 | // (and shared) by the various parts that make up the rule builder. They |
122 | // are NOT intended to be accessed by anything outside of the |
123 | // rule builder implementation. |
124 | RBBIRuleBuilder(const UnicodeString &rules, |
125 | UParseError *parseErr, |
126 | UErrorCode &status |
127 | ); |
128 | |
129 | virtual ~RBBIRuleBuilder(); |
130 | |
131 | /** |
132 | * Build the state tables and char class Trie from the source rules. |
133 | */ |
134 | RBBIDataHeader *build(UErrorCode &status); |
135 | |
136 | |
137 | /** |
138 | * Fold together redundant character classes (table columns) and |
139 | * redundant states (table rows). Done after initial table generation, |
140 | * before serializing the result. |
141 | */ |
142 | void optimizeTables(); |
143 | |
144 | char *fDebugEnv; // controls debug trace output |
145 | UErrorCode *fStatus; // Error reporting. Keeping status |
146 | UParseError *fParseError; // here avoids passing it everywhere. |
147 | const UnicodeString &fRules; // The rule string that we are compiling |
148 | UnicodeString fStrippedRules; // The rule string, with comments stripped. |
149 | |
150 | RBBIRuleScanner *fScanner; // The scanner. |
151 | RBBINode *fForwardTree; // The parse trees, generated by the scanner, |
152 | RBBINode *fReverseTree; // then manipulated by subsequent steps. |
153 | RBBINode *fSafeFwdTree; |
154 | RBBINode *fSafeRevTree; |
155 | |
156 | RBBINode **fDefaultTree; // For rules not qualified with a ! |
157 | // the tree to which they belong to. |
158 | |
159 | UBool fChainRules; // True for chained Unicode TR style rules. |
160 | // False for traditional regexp rules. |
161 | |
162 | UBool fLBCMNoChain; // True: suppress chaining of rules on |
163 | // chars with LineBreak property == CM. |
164 | |
165 | UBool fLookAheadHardBreak; // True: Look ahead matches cause an |
166 | // immediate break, no continuing for the |
167 | // longest match. |
168 | |
169 | RBBISetBuilder *fSetBuilder; // Set and Character Category builder. |
170 | UVector *fUSetNodes; // Vector of all uset nodes. |
171 | |
172 | RBBITableBuilder *fForwardTable; // State transition table, build time form. |
173 | |
174 | UVector *fRuleStatusVals; // The values that can be returned |
175 | // from getRuleStatus(). |
176 | |
177 | RBBIDataHeader *flattenData(); // Create the flattened (runtime format) |
178 | // data tables.. |
179 | private: |
180 | RBBIRuleBuilder(const RBBIRuleBuilder &other) = delete; // forbid copying of this class |
181 | RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other) = delete; // forbid copying of this class |
182 | }; |
183 | |
184 | |
185 | |
186 | |
187 | //---------------------------------------------------------------------------- |
188 | // |
189 | // RBBISetTableEl is an entry in the hash table of UnicodeSets that have |
190 | // been encountered. The val Node will be of nodetype uset |
191 | // and contain pointers to the actual UnicodeSets. |
192 | // The Key is the source string for initializing the set. |
193 | // |
194 | // The hash table is used to avoid creating duplicate |
195 | // unnamed (not $var references) UnicodeSets. |
196 | // |
197 | // Memory Management: |
198 | // The Hash Table owns these RBBISetTableEl structs and |
199 | // the key strings. It does NOT own the val nodes. |
200 | // |
201 | //---------------------------------------------------------------------------- |
202 | struct RBBISetTableEl { |
203 | UnicodeString *key; |
204 | RBBINode *val; |
205 | }; |
206 | |
/**
 *  Two int32_t values bundled together; used for pairs of states
 *  or pairs of character classes.
 */
using IntPair = std::pair<int32_t, int32_t>;
211 | |
212 | |
213 | //---------------------------------------------------------------------------- |
214 | // |
215 | // RBBIDebugPrintf Printf equivalent, for debugging output. |
216 | // Conditional compilation of the implementation lets us |
217 | // get rid of the stdio dependency in environments where it |
218 | // is unavailable. |
219 | // |
220 | //---------------------------------------------------------------------------- |
#if defined(RBBI_DEBUG)
// Debug builds: route the debug output macros straight to stdio.
// NOTE(review): this #include sits between U_NAMESPACE_BEGIN and U_NAMESPACE_END.
#include <stdio.h>
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
// Non-debug builds: RBBIDebugPrintf is left undefined, and
// RBBIDebugPuts(arg) expands to nothing.
#undef RBBIDebugPrintf
#define RBBIDebugPuts(arg)
#endif
229 | |
230 | U_NAMESPACE_END |
231 | |
232 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
233 | |
234 | #endif |
235 | |
236 | |
237 | |
238 | |