1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbitblb.h |
5 | // |
6 | |
7 | /* |
8 | ********************************************************************** |
9 | * Copyright (c) 2002-2016, International Business Machines |
10 | * Corporation and others. All Rights Reserved. |
11 | ********************************************************************** |
12 | */ |
13 | |
14 | #ifndef RBBITBLB_H |
15 | #define RBBITBLB_H |
16 | |
17 | #include "unicode/utypes.h" |
18 | |
19 | #if !UCONFIG_NO_BREAK_ITERATION |
20 | |
21 | #include "unicode/uobject.h" |
22 | #include "unicode/rbbi.h" |
23 | #include "rbbidata.h" |
24 | #include "rbbirb.h" |
25 | #include "rbbinode.h" |
26 | |
27 | |
28 | U_NAMESPACE_BEGIN |
29 | |
30 | class RBBIRuleScanner; |
31 | class RBBIRuleBuilder; |
32 | class UVector32; |
33 | |
34 | // |
35 | // class RBBITableBuilder is part of the RBBI rule compiler. |
36 | // It builds the state transition table used by the RBBI runtime |
37 | // from the expression syntax tree generated by the rule scanner. |
38 | // |
39 | // This class is part of the RBBI implementation only. |
40 | // There is no user-visible public API here. |
41 | // |
42 | |
43 | class RBBITableBuilder : public UMemory { |
44 | public: |
45 | RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status); |
46 | ~RBBITableBuilder(); |
47 | |
48 | void buildForwardTable(); |
49 | |
50 | /** Return the runtime size in bytes of the built state table. */ |
51 | int32_t getTableSize() const; |
52 | |
53 | /** Fill in the runtime state table. Sufficient memory must exist at the specified location. |
54 | */ |
55 | void exportTable(void *where); |
56 | |
57 | /** Use 8 bits to encode the forward table */ |
58 | bool use8BitsForTable() const; |
59 | |
60 | /** |
61 | * Find duplicate (redundant) character classes. Begin looking with categories.first. |
62 | * Duplicate, if found are returned in the categories parameter. |
63 | * This is an iterator-like function, used to identify character classes |
64 | * (state table columns) that can be eliminated. |
65 | * @param categories in/out parameter, specifies where to start looking for duplicates, |
66 | * and returns the first pair of duplicates found, if any. |
67 | * @return true if duplicate char classes were found, false otherwise. |
68 | */ |
69 | bool findDuplCharClassFrom(IntPair *categories); |
70 | |
71 | /** Remove a column from the state table. Used when two character categories |
72 | * have been found equivalent, and merged together, to eliminate the unneeded table column. |
73 | */ |
74 | void removeColumn(int32_t column); |
75 | |
76 | /** |
77 | * Check for, and remove duplicate states (table rows). |
78 | * @return the number of states removed. |
79 | */ |
80 | int32_t removeDuplicateStates(); |
81 | |
82 | /** Build the safe reverse table from the already-constructed forward table. */ |
83 | void buildSafeReverseTable(UErrorCode &status); |
84 | |
85 | /** Return the runtime size in bytes of the built safe reverse state table. */ |
86 | int32_t getSafeTableSize() const; |
87 | |
88 | /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location. |
89 | */ |
90 | void exportSafeTable(void *where); |
91 | |
92 | /** Use 8 bits to encode the safe reverse table */ |
93 | bool use8BitsForSafeTable() const; |
94 | |
95 | private: |
96 | void calcNullable(RBBINode *n); |
97 | void calcFirstPos(RBBINode *n); |
98 | void calcLastPos(RBBINode *n); |
99 | void calcFollowPos(RBBINode *n); |
100 | void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode); |
101 | void bofFixup(); |
102 | void buildStateTable(); |
103 | void mapLookAheadRules(); |
104 | void flagAcceptingStates(); |
105 | void flagLookAheadStates(); |
106 | void flagTaggedStates(); |
107 | void mergeRuleStatusVals(); |
108 | |
109 | /** |
110 | * Merge redundant state table columns, eliminating character classes with identical behavior. |
111 | * Done after the state tables are generated, just before converting to their run-time format. |
112 | */ |
113 | int32_t mergeColumns(); |
114 | |
115 | void addRuleRootNodes(UVector *dest, RBBINode *node); |
116 | |
117 | /** |
118 | * Find duplicate (redundant) states, beginning at the specified pair, |
119 | * within this state table. This is an iterator-like function, used to |
120 | * identify states (state table rows) that can be eliminated. |
121 | * @param states in/out parameter, specifies where to start looking for duplicates, |
122 | * and returns the first pair of duplicates found, if any. |
123 | * @return true if duplicate states were found, false otherwise. |
124 | */ |
125 | bool findDuplicateState(IntPair *states); |
126 | |
127 | /** Remove a duplicate state. |
128 | * @param duplStates The duplicate states. The first is kept, the second is removed. |
129 | * All references to the second in the state table are retargeted |
130 | * to the first. |
131 | */ |
132 | void removeState(IntPair duplStates); |
133 | |
134 | /** Find the next duplicate state in the safe reverse table. An iterator function. |
135 | * @param states in/out parameter, specifies where to start looking for duplicates, |
136 | * and returns the first pair of duplicates found, if any. |
137 | * @return true if a duplicate pair of states was found. |
138 | */ |
139 | bool findDuplicateSafeState(IntPair *states); |
140 | |
141 | /** Remove a duplicate state from the safe table. |
142 | * @param duplStates The duplicate states. The first is kept, the second is removed. |
143 | * All references to the second in the state table are retargeted |
144 | * to the first. |
145 | */ |
146 | void removeSafeState(IntPair duplStates); |
147 | |
148 | // Set functions for UVector. |
149 | // TODO: make a USet subclass of UVector |
150 | |
151 | void setAdd(UVector *dest, UVector *source); |
152 | UBool setEquals(UVector *a, UVector *b); |
153 | |
154 | void sortedAdd(UVector **dest, int32_t val); |
155 | |
156 | public: |
157 | #ifdef RBBI_DEBUG |
158 | void printSet(UVector *s); |
159 | void printPosSets(RBBINode *n /* = nullptr */); |
160 | void printStates(); |
161 | void printRuleStatusTable(); |
162 | void printReverseTable(); |
163 | #else |
164 | #define printSet(s) |
165 | #define printPosSets(n) |
166 | #define printStates() |
167 | #define printRuleStatusTable() |
168 | #define printReverseTable() |
169 | #endif |
170 | |
171 | private: |
172 | RBBIRuleBuilder *fRB; |
173 | RBBINode *&fTree; // The root node of the parse tree to build a |
174 | // table for. |
175 | UErrorCode *fStatus; |
176 | |
177 | /** State Descriptors, UVector<RBBIStateDescriptor> */ |
178 | UVector *fDStates; // D states (Aho's terminology) |
179 | // Index is state number |
180 | // Contents are RBBIStateDescriptor pointers. |
181 | |
182 | /** Synthesized safe table, UVector of UnicodeString, one string per table row. */ |
183 | UVector *fSafeTable; |
184 | |
185 | /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ |
186 | UVector32 *fLookAheadRuleMap = nullptr; |
187 | |
188 | /* Counter used when assigning lookahead rule numbers. |
189 | * Contains the last look-ahead number already in use. |
190 | * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved |
191 | * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */ |
192 | int32_t fLASlotsInUse = ACCEPTING_UNCONDITIONAL; |
193 | |
194 | |
195 | RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class |
196 | RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class |
197 | }; |
198 | |
199 | // |
200 | // RBBIStateDescriptor - The DFA is constructed as a set of these descriptors, |
201 | // one for each state. |
202 | class RBBIStateDescriptor : public UMemory { |
203 | public: |
204 | UBool fMarked; |
205 | uint32_t fAccepting; |
206 | uint32_t fLookAhead; |
207 | UVector *fTagVals; |
208 | int32_t fTagsIdx; |
209 | UVector *fPositions; // Set of parse tree positions associated |
210 | // with this state. Unordered (it's a set). |
211 | // UVector contents are RBBINode * |
212 | |
213 | UVector32 *fDtran; // Transitions out of this state. |
214 | // indexed by input character |
215 | // contents is int index of dest state |
216 | // in RBBITableBuilder.fDStates |
217 | |
218 | RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus); |
219 | ~RBBIStateDescriptor(); |
220 | |
221 | private: |
222 | RBBIStateDescriptor(const RBBIStateDescriptor &other) = delete; // forbid copying of this class |
223 | RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other) = delete; // forbid copying of this class |
224 | }; |
225 | |
226 | |
227 | |
228 | U_NAMESPACE_END |
229 | |
230 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
231 | |
232 | #endif |
233 | |