1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// rbbitblb.h
5//
6
7/*
8**********************************************************************
9* Copyright (c) 2002-2016, International Business Machines
10* Corporation and others. All Rights Reserved.
11**********************************************************************
12*/
13
14#ifndef RBBITBLB_H
15#define RBBITBLB_H
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include "unicode/uobject.h"
22#include "unicode/rbbi.h"
23#include "rbbidata.h"
24#include "rbbirb.h"
25#include "rbbinode.h"
26
27
28U_NAMESPACE_BEGIN
29
30class RBBIRuleScanner;
31class RBBIRuleBuilder;
32class UVector32;
33
34//
35// class RBBITableBuilder is part of the RBBI rule compiler.
36// It builds the state transition table used by the RBBI runtime
37// from the expression syntax tree generated by the rule scanner.
38//
39// This class is part of the RBBI implementation only.
40// There is no user-visible public API here.
41//
42
43class RBBITableBuilder : public UMemory {
44public:
45 RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
46 ~RBBITableBuilder();
47
48 void buildForwardTable();
49
50 /** Return the runtime size in bytes of the built state table. */
51 int32_t getTableSize() const;
52
53 /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
54 */
55 void exportTable(void *where);
56
57 /** Use 8 bits to encode the forward table */
58 bool use8BitsForTable() const;
59
60 /**
61 * Find duplicate (redundant) character classes. Begin looking with categories.first.
62 * Duplicate, if found are returned in the categories parameter.
63 * This is an iterator-like function, used to identify character classes
64 * (state table columns) that can be eliminated.
65 * @param categories in/out parameter, specifies where to start looking for duplicates,
66 * and returns the first pair of duplicates found, if any.
67 * @return true if duplicate char classes were found, false otherwise.
68 */
69 bool findDuplCharClassFrom(IntPair *categories);
70
71 /** Remove a column from the state table. Used when two character categories
72 * have been found equivalent, and merged together, to eliminate the unneeded table column.
73 */
74 void removeColumn(int32_t column);
75
76 /**
77 * Check for, and remove duplicate states (table rows).
78 * @return the number of states removed.
79 */
80 int32_t removeDuplicateStates();
81
82 /** Build the safe reverse table from the already-constructed forward table. */
83 void buildSafeReverseTable(UErrorCode &status);
84
85 /** Return the runtime size in bytes of the built safe reverse state table. */
86 int32_t getSafeTableSize() const;
87
88 /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
89 */
90 void exportSafeTable(void *where);
91
92 /** Use 8 bits to encode the safe reverse table */
93 bool use8BitsForSafeTable() const;
94
95private:
96 void calcNullable(RBBINode *n);
97 void calcFirstPos(RBBINode *n);
98 void calcLastPos(RBBINode *n);
99 void calcFollowPos(RBBINode *n);
100 void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
101 void bofFixup();
102 void buildStateTable();
103 void mapLookAheadRules();
104 void flagAcceptingStates();
105 void flagLookAheadStates();
106 void flagTaggedStates();
107 void mergeRuleStatusVals();
108
109 /**
110 * Merge redundant state table columns, eliminating character classes with identical behavior.
111 * Done after the state tables are generated, just before converting to their run-time format.
112 */
113 int32_t mergeColumns();
114
115 void addRuleRootNodes(UVector *dest, RBBINode *node);
116
117 /**
118 * Find duplicate (redundant) states, beginning at the specified pair,
119 * within this state table. This is an iterator-like function, used to
120 * identify states (state table rows) that can be eliminated.
121 * @param states in/out parameter, specifies where to start looking for duplicates,
122 * and returns the first pair of duplicates found, if any.
123 * @return true if duplicate states were found, false otherwise.
124 */
125 bool findDuplicateState(IntPair *states);
126
127 /** Remove a duplicate state.
128 * @param duplStates The duplicate states. The first is kept, the second is removed.
129 * All references to the second in the state table are retargeted
130 * to the first.
131 */
132 void removeState(IntPair duplStates);
133
134 /** Find the next duplicate state in the safe reverse table. An iterator function.
135 * @param states in/out parameter, specifies where to start looking for duplicates,
136 * and returns the first pair of duplicates found, if any.
137 * @return true if a duplicate pair of states was found.
138 */
139 bool findDuplicateSafeState(IntPair *states);
140
141 /** Remove a duplicate state from the safe table.
142 * @param duplStates The duplicate states. The first is kept, the second is removed.
143 * All references to the second in the state table are retargeted
144 * to the first.
145 */
146 void removeSafeState(IntPair duplStates);
147
148 // Set functions for UVector.
149 // TODO: make a USet subclass of UVector
150
151 void setAdd(UVector *dest, UVector *source);
152 UBool setEquals(UVector *a, UVector *b);
153
154 void sortedAdd(UVector **dest, int32_t val);
155
156public:
157#ifdef RBBI_DEBUG
158 void printSet(UVector *s);
159 void printPosSets(RBBINode *n /* = nullptr */);
160 void printStates();
161 void printRuleStatusTable();
162 void printReverseTable();
163#else
164 #define printSet(s)
165 #define printPosSets(n)
166 #define printStates()
167 #define printRuleStatusTable()
168 #define printReverseTable()
169#endif
170
171private:
172 RBBIRuleBuilder *fRB;
173 RBBINode *&fTree; // The root node of the parse tree to build a
174 // table for.
175 UErrorCode *fStatus;
176
177 /** State Descriptors, UVector<RBBIStateDescriptor> */
178 UVector *fDStates; // D states (Aho's terminology)
179 // Index is state number
180 // Contents are RBBIStateDescriptor pointers.
181
182 /** Synthesized safe table, UVector of UnicodeString, one string per table row. */
183 UVector *fSafeTable;
184
185 /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
186 UVector32 *fLookAheadRuleMap = nullptr;
187
188 /* Counter used when assigning lookahead rule numbers.
189 * Contains the last look-ahead number already in use.
190 * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
191 * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */
192 int32_t fLASlotsInUse = ACCEPTING_UNCONDITIONAL;
193
194
195 RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class
196 RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class
197};
198
199//
200// RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
201// one for each state.
202class RBBIStateDescriptor : public UMemory {
203public:
204 UBool fMarked;
205 uint32_t fAccepting;
206 uint32_t fLookAhead;
207 UVector *fTagVals;
208 int32_t fTagsIdx;
209 UVector *fPositions; // Set of parse tree positions associated
210 // with this state. Unordered (it's a set).
211 // UVector contents are RBBINode *
212
213 UVector32 *fDtran; // Transitions out of this state.
214 // indexed by input character
215 // contents is int index of dest state
216 // in RBBITableBuilder.fDStates
217
218 RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
219 ~RBBIStateDescriptor();
220
221private:
222 RBBIStateDescriptor(const RBBIStateDescriptor &other) = delete; // forbid copying of this class
223 RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other) = delete; // forbid copying of this class
224};
225
226
227
228U_NAMESPACE_END
229
230#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
231
232#endif
233