| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | // | 
|---|
| 4 | //  rbbitblb.h | 
|---|
| 5 | // | 
|---|
| 6 |  | 
|---|
| 7 | /* | 
|---|
| 8 | ********************************************************************** | 
|---|
| 9 | *   Copyright (c) 2002-2016, International Business Machines | 
|---|
| 10 | *   Corporation and others.  All Rights Reserved. | 
|---|
| 11 | ********************************************************************** | 
|---|
| 12 | */ | 
|---|
| 13 |  | 
|---|
| 14 | #ifndef RBBITBLB_H | 
|---|
| 15 | #define RBBITBLB_H | 
|---|
| 16 |  | 
|---|
| 17 | #include "unicode/utypes.h" | 
|---|
| 18 |  | 
|---|
| 19 | #if !UCONFIG_NO_BREAK_ITERATION | 
|---|
| 20 |  | 
|---|
| 21 | #include "unicode/uobject.h" | 
|---|
| 22 | #include "unicode/rbbi.h" | 
|---|
| 23 | #include "rbbirb.h" | 
|---|
| 24 | #include "rbbinode.h" | 
|---|
| 25 |  | 
|---|
| 26 |  | 
|---|
| 27 | U_NAMESPACE_BEGIN | 
|---|
| 28 |  | 
|---|
| 29 | class RBBIRuleScanner; | 
|---|
| 30 | class RBBIRuleBuilder; | 
|---|
| 31 | class UVector32; | 
|---|
| 32 |  | 
|---|
| 33 | // | 
|---|
| 34 | //  class RBBITableBuilder is part of the RBBI rule compiler. | 
|---|
| 35 | //                         It builds the state transition table used by the RBBI runtime | 
|---|
| 36 | //                         from the expression syntax tree generated by the rule scanner. | 
|---|
| 37 | // | 
|---|
| 38 | //                         This class is part of the RBBI implementation only. | 
|---|
| 39 | //                         There is no user-visible public API here. | 
|---|
| 40 | // | 
|---|
| 41 |  | 
|---|
| 42 | class RBBITableBuilder : public UMemory { | 
|---|
| 43 | public: | 
|---|
| 44 | RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status); | 
|---|
| 45 | ~RBBITableBuilder(); | 
|---|
| 46 |  | 
|---|
| 47 | void     buildForwardTable(); | 
|---|
| 48 |  | 
|---|
| 49 | /** Return the runtime size in bytes of the built state table.  */ | 
|---|
| 50 | int32_t  getTableSize() const; | 
|---|
| 51 |  | 
|---|
| 52 | /** Fill in the runtime state table. Sufficient memory must exist at the specified location. | 
|---|
| 53 | */ | 
|---|
| 54 | void     exportTable(void *where); | 
|---|
| 55 |  | 
|---|
| 56 | /** | 
|---|
| 57 | *  Find duplicate (redundant) character classes. Begin looking with categories.first. | 
|---|
| 58 | *  Duplicate, if found are returned in the categories parameter. | 
|---|
| 59 | *  This is an iterator-like function, used to identify character classes | 
|---|
| 60 | *  (state table columns) that can be eliminated. | 
|---|
| 61 | *  @param categories in/out parameter, specifies where to start looking for duplicates, | 
|---|
| 62 | *                and returns the first pair of duplicates found, if any. | 
|---|
| 63 | *  @return true if duplicate char classes were found, false otherwise. | 
|---|
| 64 | */ | 
|---|
| 65 | bool     findDuplCharClassFrom(IntPair *categories); | 
|---|
| 66 |  | 
|---|
| 67 | /** Remove a column from the state table. Used when two character categories | 
|---|
| 68 | *  have been found equivalent, and merged together, to eliminate the uneeded table column. | 
|---|
| 69 | */ | 
|---|
| 70 | void     removeColumn(int32_t column); | 
|---|
| 71 |  | 
|---|
| 72 | /** | 
|---|
| 73 | * Check for, and remove dupicate states (table rows). | 
|---|
| 74 | * @return the number of states removed. | 
|---|
| 75 | */ | 
|---|
| 76 | int32_t  removeDuplicateStates(); | 
|---|
| 77 |  | 
|---|
| 78 | /** Build the safe reverse table from the already-constructed forward table. */ | 
|---|
| 79 | void     buildSafeReverseTable(UErrorCode &status); | 
|---|
| 80 |  | 
|---|
| 81 | /** Return the runtime size in bytes of the built safe reverse state table. */ | 
|---|
| 82 | int32_t  getSafeTableSize() const; | 
|---|
| 83 |  | 
|---|
| 84 | /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location. | 
|---|
| 85 | */ | 
|---|
| 86 | void     exportSafeTable(void *where); | 
|---|
| 87 |  | 
|---|
| 88 |  | 
|---|
| 89 | private: | 
|---|
| 90 | void     calcNullable(RBBINode *n); | 
|---|
| 91 | void     calcFirstPos(RBBINode *n); | 
|---|
| 92 | void     calcLastPos(RBBINode  *n); | 
|---|
| 93 | void     calcFollowPos(RBBINode *n); | 
|---|
| 94 | void     calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode); | 
|---|
| 95 | void     bofFixup(); | 
|---|
| 96 | void     buildStateTable(); | 
|---|
| 97 | void     mapLookAheadRules(); | 
|---|
| 98 | void     flagAcceptingStates(); | 
|---|
| 99 | void     flagLookAheadStates(); | 
|---|
| 100 | void     flagTaggedStates(); | 
|---|
| 101 | void     mergeRuleStatusVals(); | 
|---|
| 102 |  | 
|---|
| 103 | /** | 
|---|
| 104 | * Merge redundant state table columns, eliminating character classes with identical behavior. | 
|---|
| 105 | * Done after the state tables are generated, just before converting to their run-time format. | 
|---|
| 106 | */ | 
|---|
| 107 | int32_t  mergeColumns(); | 
|---|
| 108 |  | 
|---|
| 109 | void     addRuleRootNodes(UVector *dest, RBBINode *node); | 
|---|
| 110 |  | 
|---|
| 111 | /** | 
|---|
| 112 | *  Find duplicate (redundant) states, beginning at the specified pair, | 
|---|
| 113 | *  within this state table. This is an iterator-like function, used to | 
|---|
| 114 | *  identify states (state table rows) that can be eliminated. | 
|---|
| 115 | *  @param states in/out parameter, specifies where to start looking for duplicates, | 
|---|
| 116 | *                and returns the first pair of duplicates found, if any. | 
|---|
| 117 | *  @return true if duplicate states were found, false otherwise. | 
|---|
| 118 | */ | 
|---|
| 119 | bool findDuplicateState(IntPair *states); | 
|---|
| 120 |  | 
|---|
| 121 | /** Remove a duplicate state. | 
|---|
| 122 | * @param duplStates The duplicate states. The first is kept, the second is removed. | 
|---|
| 123 | *                   All references to the second in the state table are retargeted | 
|---|
| 124 | *                   to the first. | 
|---|
| 125 | */ | 
|---|
| 126 | void removeState(IntPair duplStates); | 
|---|
| 127 |  | 
|---|
| 128 | /** Find the next duplicate state in the safe reverse table. An iterator function. | 
|---|
| 129 | *  @param states in/out parameter, specifies where to start looking for duplicates, | 
|---|
| 130 | *                and returns the first pair of duplicates found, if any. | 
|---|
| 131 | *  @return true if a duplicate pair of states was found. | 
|---|
| 132 | */ | 
|---|
| 133 | bool findDuplicateSafeState(IntPair *states); | 
|---|
| 134 |  | 
|---|
| 135 | /** Remove a duplicate state from the safe table. | 
|---|
| 136 | * @param duplStates The duplicate states. The first is kept, the second is removed. | 
|---|
| 137 | *                   All references to the second in the state table are retargeted | 
|---|
| 138 | *                   to the first. | 
|---|
| 139 | */ | 
|---|
| 140 | void removeSafeState(IntPair duplStates); | 
|---|
| 141 |  | 
|---|
| 142 | // Set functions for UVector. | 
|---|
| 143 | //   TODO:  make a USet subclass of UVector | 
|---|
| 144 |  | 
|---|
| 145 | void     setAdd(UVector *dest, UVector *source); | 
|---|
| 146 | UBool    setEquals(UVector *a, UVector *b); | 
|---|
| 147 |  | 
|---|
| 148 | void     sortedAdd(UVector **dest, int32_t val); | 
|---|
| 149 |  | 
|---|
| 150 | public: | 
|---|
| 151 | #ifdef RBBI_DEBUG | 
|---|
| 152 | void     printSet(UVector *s); | 
|---|
| 153 | void     printPosSets(RBBINode *n /* = NULL*/); | 
|---|
| 154 | void     printStates(); | 
|---|
| 155 | void     printRuleStatusTable(); | 
|---|
| 156 | void     printReverseTable(); | 
|---|
| 157 | #else | 
|---|
| 158 | #define  printSet(s) | 
|---|
| 159 | #define  printPosSets(n) | 
|---|
| 160 | #define  printStates() | 
|---|
| 161 | #define  printRuleStatusTable() | 
|---|
| 162 | #define  printReverseTable() | 
|---|
| 163 | #endif | 
|---|
| 164 |  | 
|---|
| 165 | private: | 
|---|
| 166 | RBBIRuleBuilder  *fRB; | 
|---|
| 167 | RBBINode         *&fTree;              // The root node of the parse tree to build a | 
|---|
| 168 | //   table for. | 
|---|
| 169 | UErrorCode       *fStatus; | 
|---|
| 170 |  | 
|---|
| 171 | /** State Descriptors, UVector<RBBIStateDescriptor> */ | 
|---|
| 172 | UVector          *fDStates;            //  D states (Aho's terminology) | 
|---|
| 173 | //  Index is state number | 
|---|
| 174 | //  Contents are RBBIStateDescriptor pointers. | 
|---|
| 175 |  | 
|---|
| 176 | /** Synthesized safe table, UVector of UnicodeString, one string per table row.   */ | 
|---|
| 177 | UVector          *fSafeTable; | 
|---|
| 178 |  | 
|---|
| 179 | /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ | 
|---|
| 180 | UVector32        *fLookAheadRuleMap = nullptr; | 
|---|
| 181 |  | 
|---|
| 182 |  | 
|---|
| 183 | RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class | 
|---|
| 184 | RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class | 
|---|
| 185 | }; | 
|---|
| 186 |  | 
|---|
| 187 | // | 
|---|
| 188 | //  RBBIStateDescriptor - The DFA is constructed as a set of these descriptors, | 
|---|
| 189 | //                        one for each state. | 
|---|
| 190 | class RBBIStateDescriptor : public UMemory { | 
|---|
| 191 | public: | 
|---|
| 192 | UBool            fMarked; | 
|---|
| 193 | int32_t          fAccepting; | 
|---|
| 194 | int32_t          fLookAhead; | 
|---|
| 195 | UVector          *fTagVals; | 
|---|
| 196 | int32_t          fTagsIdx; | 
|---|
| 197 | UVector          *fPositions;          // Set of parse tree positions associated | 
|---|
| 198 | //   with this state.  Unordered (it's a set). | 
|---|
| 199 | //   UVector contents are RBBINode * | 
|---|
| 200 |  | 
|---|
| 201 | UVector32        *fDtran;              // Transitions out of this state. | 
|---|
| 202 | //   indexed by input character | 
|---|
| 203 | //   contents is int index of dest state | 
|---|
| 204 | //   in RBBITableBuilder.fDStates | 
|---|
| 205 |  | 
|---|
| 206 | RBBIStateDescriptor(int maxInputSymbol,  UErrorCode *fStatus); | 
|---|
| 207 | ~RBBIStateDescriptor(); | 
|---|
| 208 |  | 
|---|
| 209 | private: | 
|---|
| 210 | RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class | 
|---|
| 211 | RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class | 
|---|
| 212 | }; | 
|---|
| 213 |  | 
|---|
| 214 |  | 
|---|
| 215 |  | 
|---|
| 216 | U_NAMESPACE_END | 
|---|
| 217 |  | 
|---|
| 218 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 
|---|
| 219 |  | 
|---|
| 220 | #endif | 
|---|
| 221 |  | 
|---|