1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// rbbitblb.h
5//
6
7/*
8**********************************************************************
9* Copyright (c) 2002-2016, International Business Machines
10* Corporation and others. All Rights Reserved.
11**********************************************************************
12*/
13
14#ifndef RBBITBLB_H
15#define RBBITBLB_H
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include "unicode/uobject.h"
22#include "unicode/rbbi.h"
23#include "rbbirb.h"
24#include "rbbinode.h"
25
26
27U_NAMESPACE_BEGIN
28
29class RBBIRuleScanner;
30class RBBIRuleBuilder;
31class UVector32;
32
33//
34// class RBBITableBuilder is part of the RBBI rule compiler.
35// It builds the state transition table used by the RBBI runtime
36// from the expression syntax tree generated by the rule scanner.
37//
38// This class is part of the RBBI implementation only.
39// There is no user-visible public API here.
40//
41
42class RBBITableBuilder : public UMemory {
43public:
44 RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
45 ~RBBITableBuilder();
46
47 void buildForwardTable();
48
49 /** Return the runtime size in bytes of the built state table. */
50 int32_t getTableSize() const;
51
52 /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
53 */
54 void exportTable(void *where);
55
56 /**
57 * Find duplicate (redundant) character classes. Begin looking with categories.first.
58 * Duplicate, if found are returned in the categories parameter.
59 * This is an iterator-like function, used to identify character classes
60 * (state table columns) that can be eliminated.
61 * @param categories in/out parameter, specifies where to start looking for duplicates,
62 * and returns the first pair of duplicates found, if any.
63 * @return true if duplicate char classes were found, false otherwise.
64 */
65 bool findDuplCharClassFrom(IntPair *categories);
66
67 /** Remove a column from the state table. Used when two character categories
68 * have been found equivalent, and merged together, to eliminate the uneeded table column.
69 */
70 void removeColumn(int32_t column);
71
72 /**
73 * Check for, and remove dupicate states (table rows).
74 * @return the number of states removed.
75 */
76 int32_t removeDuplicateStates();
77
78 /** Build the safe reverse table from the already-constructed forward table. */
79 void buildSafeReverseTable(UErrorCode &status);
80
81 /** Return the runtime size in bytes of the built safe reverse state table. */
82 int32_t getSafeTableSize() const;
83
84 /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
85 */
86 void exportSafeTable(void *where);
87
88
89private:
90 void calcNullable(RBBINode *n);
91 void calcFirstPos(RBBINode *n);
92 void calcLastPos(RBBINode *n);
93 void calcFollowPos(RBBINode *n);
94 void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
95 void bofFixup();
96 void buildStateTable();
97 void mapLookAheadRules();
98 void flagAcceptingStates();
99 void flagLookAheadStates();
100 void flagTaggedStates();
101 void mergeRuleStatusVals();
102
103 /**
104 * Merge redundant state table columns, eliminating character classes with identical behavior.
105 * Done after the state tables are generated, just before converting to their run-time format.
106 */
107 int32_t mergeColumns();
108
109 void addRuleRootNodes(UVector *dest, RBBINode *node);
110
111 /**
112 * Find duplicate (redundant) states, beginning at the specified pair,
113 * within this state table. This is an iterator-like function, used to
114 * identify states (state table rows) that can be eliminated.
115 * @param states in/out parameter, specifies where to start looking for duplicates,
116 * and returns the first pair of duplicates found, if any.
117 * @return true if duplicate states were found, false otherwise.
118 */
119 bool findDuplicateState(IntPair *states);
120
121 /** Remove a duplicate state.
122 * @param duplStates The duplicate states. The first is kept, the second is removed.
123 * All references to the second in the state table are retargeted
124 * to the first.
125 */
126 void removeState(IntPair duplStates);
127
128 /** Find the next duplicate state in the safe reverse table. An iterator function.
129 * @param states in/out parameter, specifies where to start looking for duplicates,
130 * and returns the first pair of duplicates found, if any.
131 * @return true if a duplicate pair of states was found.
132 */
133 bool findDuplicateSafeState(IntPair *states);
134
135 /** Remove a duplicate state from the safe table.
136 * @param duplStates The duplicate states. The first is kept, the second is removed.
137 * All references to the second in the state table are retargeted
138 * to the first.
139 */
140 void removeSafeState(IntPair duplStates);
141
142 // Set functions for UVector.
143 // TODO: make a USet subclass of UVector
144
145 void setAdd(UVector *dest, UVector *source);
146 UBool setEquals(UVector *a, UVector *b);
147
148 void sortedAdd(UVector **dest, int32_t val);
149
150public:
151#ifdef RBBI_DEBUG
152 void printSet(UVector *s);
153 void printPosSets(RBBINode *n /* = NULL*/);
154 void printStates();
155 void printRuleStatusTable();
156 void printReverseTable();
157#else
158 #define printSet(s)
159 #define printPosSets(n)
160 #define printStates()
161 #define printRuleStatusTable()
162 #define printReverseTable()
163#endif
164
165private:
166 RBBIRuleBuilder *fRB;
167 RBBINode *&fTree; // The root node of the parse tree to build a
168 // table for.
169 UErrorCode *fStatus;
170
171 /** State Descriptors, UVector<RBBIStateDescriptor> */
172 UVector *fDStates; // D states (Aho's terminology)
173 // Index is state number
174 // Contents are RBBIStateDescriptor pointers.
175
176 /** Synthesized safe table, UVector of UnicodeString, one string per table row. */
177 UVector *fSafeTable;
178
179 /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
180 UVector32 *fLookAheadRuleMap = nullptr;
181
182
183 RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
184 RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
185};
186
187//
188// RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
189// one for each state.
190class RBBIStateDescriptor : public UMemory {
191public:
192 UBool fMarked;
193 int32_t fAccepting;
194 int32_t fLookAhead;
195 UVector *fTagVals;
196 int32_t fTagsIdx;
197 UVector *fPositions; // Set of parse tree positions associated
198 // with this state. Unordered (it's a set).
199 // UVector contents are RBBINode *
200
201 UVector32 *fDtran; // Transitions out of this state.
202 // indexed by input character
203 // contents is int index of dest state
204 // in RBBITableBuilder.fDStates
205
206 RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
207 ~RBBIStateDescriptor();
208
209private:
210 RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class
211 RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class
212};
213
214
215
216U_NAMESPACE_END
217
218#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
219
220#endif
221