1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbitblb.h |
5 | // |
6 | |
7 | /* |
8 | ********************************************************************** |
9 | * Copyright (c) 2002-2016, International Business Machines |
10 | * Corporation and others. All Rights Reserved. |
11 | ********************************************************************** |
12 | */ |
13 | |
14 | #ifndef RBBITBLB_H |
15 | #define RBBITBLB_H |
16 | |
17 | #include "unicode/utypes.h" |
18 | |
19 | #if !UCONFIG_NO_BREAK_ITERATION |
20 | |
21 | #include "unicode/uobject.h" |
22 | #include "unicode/rbbi.h" |
23 | #include "rbbirb.h" |
24 | #include "rbbinode.h" |
25 | |
26 | |
27 | U_NAMESPACE_BEGIN |
28 | |
29 | class RBBIRuleScanner; |
30 | class RBBIRuleBuilder; |
31 | class UVector32; |
32 | |
33 | // |
34 | // class RBBITableBuilder is part of the RBBI rule compiler. |
35 | // It builds the state transition table used by the RBBI runtime |
36 | // from the expression syntax tree generated by the rule scanner. |
37 | // |
38 | // This class is part of the RBBI implementation only. |
39 | // There is no user-visible public API here. |
40 | // |
41 | |
42 | class RBBITableBuilder : public UMemory { |
43 | public: |
44 | RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status); |
45 | ~RBBITableBuilder(); |
46 | |
47 | void buildForwardTable(); |
48 | |
49 | /** Return the runtime size in bytes of the built state table. */ |
50 | int32_t getTableSize() const; |
51 | |
52 | /** Fill in the runtime state table. Sufficient memory must exist at the specified location. |
53 | */ |
54 | void exportTable(void *where); |
55 | |
56 | /** |
57 | * Find duplicate (redundant) character classes. Begin looking with categories.first. |
58 | * Duplicate, if found are returned in the categories parameter. |
59 | * This is an iterator-like function, used to identify character classes |
60 | * (state table columns) that can be eliminated. |
61 | * @param categories in/out parameter, specifies where to start looking for duplicates, |
62 | * and returns the first pair of duplicates found, if any. |
63 | * @return true if duplicate char classes were found, false otherwise. |
64 | */ |
65 | bool findDuplCharClassFrom(IntPair *categories); |
66 | |
67 | /** Remove a column from the state table. Used when two character categories |
68 | * have been found equivalent, and merged together, to eliminate the uneeded table column. |
69 | */ |
70 | void removeColumn(int32_t column); |
71 | |
72 | /** |
73 | * Check for, and remove dupicate states (table rows). |
74 | * @return the number of states removed. |
75 | */ |
76 | int32_t removeDuplicateStates(); |
77 | |
78 | /** Build the safe reverse table from the already-constructed forward table. */ |
79 | void buildSafeReverseTable(UErrorCode &status); |
80 | |
81 | /** Return the runtime size in bytes of the built safe reverse state table. */ |
82 | int32_t getSafeTableSize() const; |
83 | |
84 | /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location. |
85 | */ |
86 | void exportSafeTable(void *where); |
87 | |
88 | |
89 | private: |
90 | void calcNullable(RBBINode *n); |
91 | void calcFirstPos(RBBINode *n); |
92 | void calcLastPos(RBBINode *n); |
93 | void calcFollowPos(RBBINode *n); |
94 | void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode); |
95 | void bofFixup(); |
96 | void buildStateTable(); |
97 | void mapLookAheadRules(); |
98 | void flagAcceptingStates(); |
99 | void flagLookAheadStates(); |
100 | void flagTaggedStates(); |
101 | void mergeRuleStatusVals(); |
102 | |
103 | /** |
104 | * Merge redundant state table columns, eliminating character classes with identical behavior. |
105 | * Done after the state tables are generated, just before converting to their run-time format. |
106 | */ |
107 | int32_t mergeColumns(); |
108 | |
109 | void addRuleRootNodes(UVector *dest, RBBINode *node); |
110 | |
111 | /** |
112 | * Find duplicate (redundant) states, beginning at the specified pair, |
113 | * within this state table. This is an iterator-like function, used to |
114 | * identify states (state table rows) that can be eliminated. |
115 | * @param states in/out parameter, specifies where to start looking for duplicates, |
116 | * and returns the first pair of duplicates found, if any. |
117 | * @return true if duplicate states were found, false otherwise. |
118 | */ |
119 | bool findDuplicateState(IntPair *states); |
120 | |
121 | /** Remove a duplicate state. |
122 | * @param duplStates The duplicate states. The first is kept, the second is removed. |
123 | * All references to the second in the state table are retargeted |
124 | * to the first. |
125 | */ |
126 | void removeState(IntPair duplStates); |
127 | |
128 | /** Find the next duplicate state in the safe reverse table. An iterator function. |
129 | * @param states in/out parameter, specifies where to start looking for duplicates, |
130 | * and returns the first pair of duplicates found, if any. |
131 | * @return true if a duplicate pair of states was found. |
132 | */ |
133 | bool findDuplicateSafeState(IntPair *states); |
134 | |
135 | /** Remove a duplicate state from the safe table. |
136 | * @param duplStates The duplicate states. The first is kept, the second is removed. |
137 | * All references to the second in the state table are retargeted |
138 | * to the first. |
139 | */ |
140 | void removeSafeState(IntPair duplStates); |
141 | |
142 | // Set functions for UVector. |
143 | // TODO: make a USet subclass of UVector |
144 | |
145 | void setAdd(UVector *dest, UVector *source); |
146 | UBool setEquals(UVector *a, UVector *b); |
147 | |
148 | void sortedAdd(UVector **dest, int32_t val); |
149 | |
150 | public: |
151 | #ifdef RBBI_DEBUG |
152 | void printSet(UVector *s); |
153 | void printPosSets(RBBINode *n /* = NULL*/); |
154 | void printStates(); |
155 | void printRuleStatusTable(); |
156 | void printReverseTable(); |
157 | #else |
158 | #define printSet(s) |
159 | #define printPosSets(n) |
160 | #define printStates() |
161 | #define printRuleStatusTable() |
162 | #define printReverseTable() |
163 | #endif |
164 | |
165 | private: |
166 | RBBIRuleBuilder *fRB; |
167 | RBBINode *&fTree; // The root node of the parse tree to build a |
168 | // table for. |
169 | UErrorCode *fStatus; |
170 | |
171 | /** State Descriptors, UVector<RBBIStateDescriptor> */ |
172 | UVector *fDStates; // D states (Aho's terminology) |
173 | // Index is state number |
174 | // Contents are RBBIStateDescriptor pointers. |
175 | |
176 | /** Synthesized safe table, UVector of UnicodeString, one string per table row. */ |
177 | UVector *fSafeTable; |
178 | |
179 | /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ |
180 | UVector32 *fLookAheadRuleMap = nullptr; |
181 | |
182 | |
183 | RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class |
184 | RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class |
185 | }; |
186 | |
187 | // |
188 | // RBBIStateDescriptor - The DFA is constructed as a set of these descriptors, |
189 | // one for each state. |
190 | class RBBIStateDescriptor : public UMemory { |
191 | public: |
192 | UBool fMarked; |
193 | int32_t fAccepting; |
194 | int32_t fLookAhead; |
195 | UVector *fTagVals; |
196 | int32_t fTagsIdx; |
197 | UVector *fPositions; // Set of parse tree positions associated |
198 | // with this state. Unordered (it's a set). |
199 | // UVector contents are RBBINode * |
200 | |
201 | UVector32 *fDtran; // Transitions out of this state. |
202 | // indexed by input character |
203 | // contents is int index of dest state |
204 | // in RBBITableBuilder.fDStates |
205 | |
206 | RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus); |
207 | ~RBBIStateDescriptor(); |
208 | |
209 | private: |
210 | RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class |
211 | RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class |
212 | }; |
213 | |
214 | |
215 | |
216 | U_NAMESPACE_END |
217 | |
218 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
219 | |
220 | #endif |
221 | |