| 1 | // © 2016 and later: Unicode, Inc. and others. | 
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
| 3 | // | 
| 4 | //  rbbitblb.h | 
| 5 | // | 
| 6 |  | 
| 7 | /* | 
| 8 | ********************************************************************** | 
| 9 | *   Copyright (c) 2002-2016, International Business Machines | 
| 10 | *   Corporation and others.  All Rights Reserved. | 
| 11 | ********************************************************************** | 
| 12 | */ | 
| 13 |  | 
| 14 | #ifndef RBBITBLB_H | 
| 15 | #define RBBITBLB_H | 
| 16 |  | 
| 17 | #include "unicode/utypes.h" | 
| 18 |  | 
| 19 | #if !UCONFIG_NO_BREAK_ITERATION | 
| 20 |  | 
| 21 | #include "unicode/uobject.h" | 
| 22 | #include "unicode/rbbi.h" | 
| 23 | #include "rbbirb.h" | 
| 24 | #include "rbbinode.h" | 
| 25 |  | 
| 26 |  | 
| 27 | U_NAMESPACE_BEGIN | 
| 28 |  | 
| 29 | class RBBIRuleScanner; | 
| 30 | class RBBIRuleBuilder; | 
| 31 | class UVector32; | 
| 32 |  | 
| 33 | // | 
| 34 | //  class RBBITableBuilder is part of the RBBI rule compiler. | 
| 35 | //                         It builds the state transition table used by the RBBI runtime | 
| 36 | //                         from the expression syntax tree generated by the rule scanner. | 
| 37 | // | 
| 38 | //                         This class is part of the RBBI implementation only. | 
| 39 | //                         There is no user-visible public API here. | 
| 40 | // | 
| 41 |  | 
| 42 | class RBBITableBuilder : public UMemory { | 
| 43 | public: | 
| 44 |     RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status); | 
| 45 |     ~RBBITableBuilder(); | 
| 46 |  | 
| 47 |     void     buildForwardTable(); | 
| 48 |  | 
| 49 |     /** Return the runtime size in bytes of the built state table.  */ | 
| 50 |     int32_t  getTableSize() const; | 
| 51 |  | 
| 52 |     /** Fill in the runtime state table. Sufficient memory must exist at the specified location. | 
| 53 |      */ | 
| 54 |     void     exportTable(void *where); | 
| 55 |  | 
| 56 |     /** | 
| 57 |      *  Find duplicate (redundant) character classes. Begin looking with categories.first. | 
| 58 |      *  Duplicate, if found are returned in the categories parameter. | 
| 59 |      *  This is an iterator-like function, used to identify character classes | 
| 60 |      *  (state table columns) that can be eliminated. | 
| 61 |      *  @param categories in/out parameter, specifies where to start looking for duplicates, | 
| 62 |      *                and returns the first pair of duplicates found, if any. | 
| 63 |      *  @return true if duplicate char classes were found, false otherwise. | 
| 64 |      */ | 
| 65 |     bool     findDuplCharClassFrom(IntPair *categories); | 
| 66 |  | 
| 67 |     /** Remove a column from the state table. Used when two character categories | 
| 68 |      *  have been found equivalent, and merged together, to eliminate the uneeded table column. | 
| 69 |      */ | 
| 70 |     void     removeColumn(int32_t column); | 
| 71 |  | 
| 72 |     /** | 
| 73 |      * Check for, and remove dupicate states (table rows). | 
| 74 |      * @return the number of states removed. | 
| 75 |      */ | 
| 76 |     int32_t  removeDuplicateStates(); | 
| 77 |  | 
| 78 |     /** Build the safe reverse table from the already-constructed forward table. */ | 
| 79 |     void     buildSafeReverseTable(UErrorCode &status); | 
| 80 |  | 
| 81 |     /** Return the runtime size in bytes of the built safe reverse state table. */ | 
| 82 |     int32_t  getSafeTableSize() const; | 
| 83 |  | 
| 84 |     /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location. | 
| 85 |      */ | 
| 86 |     void     exportSafeTable(void *where); | 
| 87 |  | 
| 88 |  | 
| 89 | private: | 
| 90 |     void     calcNullable(RBBINode *n); | 
| 91 |     void     calcFirstPos(RBBINode *n); | 
| 92 |     void     calcLastPos(RBBINode  *n); | 
| 93 |     void     calcFollowPos(RBBINode *n); | 
| 94 |     void     calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode); | 
| 95 |     void     bofFixup(); | 
| 96 |     void     buildStateTable(); | 
| 97 |     void     mapLookAheadRules(); | 
| 98 |     void     flagAcceptingStates(); | 
| 99 |     void     flagLookAheadStates(); | 
| 100 |     void     flagTaggedStates(); | 
| 101 |     void     mergeRuleStatusVals(); | 
| 102 |  | 
| 103 |     /** | 
| 104 |      * Merge redundant state table columns, eliminating character classes with identical behavior. | 
| 105 |      * Done after the state tables are generated, just before converting to their run-time format. | 
| 106 |      */ | 
| 107 |     int32_t  mergeColumns(); | 
| 108 |  | 
| 109 |     void     addRuleRootNodes(UVector *dest, RBBINode *node); | 
| 110 |  | 
| 111 |     /** | 
| 112 |      *  Find duplicate (redundant) states, beginning at the specified pair, | 
| 113 |      *  within this state table. This is an iterator-like function, used to | 
| 114 |      *  identify states (state table rows) that can be eliminated. | 
| 115 |      *  @param states in/out parameter, specifies where to start looking for duplicates, | 
| 116 |      *                and returns the first pair of duplicates found, if any. | 
| 117 |      *  @return true if duplicate states were found, false otherwise. | 
| 118 |      */ | 
| 119 |     bool findDuplicateState(IntPair *states); | 
| 120 |  | 
| 121 |     /** Remove a duplicate state. | 
| 122 |      * @param duplStates The duplicate states. The first is kept, the second is removed. | 
| 123 |      *                   All references to the second in the state table are retargeted | 
| 124 |      *                   to the first. | 
| 125 |      */ | 
| 126 |     void removeState(IntPair duplStates); | 
| 127 |  | 
| 128 |     /** Find the next duplicate state in the safe reverse table. An iterator function. | 
| 129 |      *  @param states in/out parameter, specifies where to start looking for duplicates, | 
| 130 |      *                and returns the first pair of duplicates found, if any. | 
| 131 |      *  @return true if a duplicate pair of states was found. | 
| 132 |      */ | 
| 133 |     bool findDuplicateSafeState(IntPair *states); | 
| 134 |  | 
| 135 |     /** Remove a duplicate state from the safe table. | 
| 136 |      * @param duplStates The duplicate states. The first is kept, the second is removed. | 
| 137 |      *                   All references to the second in the state table are retargeted | 
| 138 |      *                   to the first. | 
| 139 |      */ | 
| 140 |     void removeSafeState(IntPair duplStates); | 
| 141 |  | 
| 142 |     // Set functions for UVector. | 
| 143 |     //   TODO:  make a USet subclass of UVector | 
| 144 |  | 
| 145 |     void     setAdd(UVector *dest, UVector *source); | 
| 146 |     UBool    setEquals(UVector *a, UVector *b); | 
| 147 |  | 
| 148 |     void     sortedAdd(UVector **dest, int32_t val); | 
| 149 |  | 
| 150 | public: | 
| 151 | #ifdef RBBI_DEBUG | 
| 152 |     void     printSet(UVector *s); | 
| 153 |     void     printPosSets(RBBINode *n /* = NULL*/); | 
| 154 |     void     printStates(); | 
| 155 |     void     printRuleStatusTable(); | 
| 156 |     void     printReverseTable(); | 
| 157 | #else | 
| 158 |     #define  printSet(s) | 
| 159 |     #define  printPosSets(n) | 
| 160 |     #define  printStates() | 
| 161 |     #define  printRuleStatusTable() | 
| 162 |     #define  printReverseTable() | 
| 163 | #endif | 
| 164 |  | 
| 165 | private: | 
| 166 |     RBBIRuleBuilder  *fRB; | 
| 167 |     RBBINode         *&fTree;              // The root node of the parse tree to build a | 
| 168 |                                            //   table for. | 
| 169 |     UErrorCode       *fStatus; | 
| 170 |  | 
| 171 |     /** State Descriptors, UVector<RBBIStateDescriptor> */ | 
| 172 |     UVector          *fDStates;            //  D states (Aho's terminology) | 
| 173 |                                            //  Index is state number | 
| 174 |                                            //  Contents are RBBIStateDescriptor pointers. | 
| 175 |  | 
| 176 |     /** Synthesized safe table, UVector of UnicodeString, one string per table row.   */ | 
| 177 |     UVector          *fSafeTable; | 
| 178 |  | 
| 179 |     /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ | 
| 180 |     UVector32        *fLookAheadRuleMap = nullptr; | 
| 181 |  | 
| 182 |  | 
| 183 |     RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class | 
| 184 |     RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class | 
| 185 | }; | 
| 186 |  | 
| 187 | // | 
| 188 | //  RBBIStateDescriptor - The DFA is constructed as a set of these descriptors, | 
| 189 | //                        one for each state. | 
| 190 | class RBBIStateDescriptor : public UMemory { | 
| 191 | public: | 
| 192 |     UBool            fMarked; | 
| 193 |     int32_t          fAccepting; | 
| 194 |     int32_t          fLookAhead; | 
| 195 |     UVector          *fTagVals; | 
| 196 |     int32_t          fTagsIdx; | 
| 197 |     UVector          *fPositions;          // Set of parse tree positions associated | 
| 198 |                                            //   with this state.  Unordered (it's a set). | 
| 199 |                                            //   UVector contents are RBBINode * | 
| 200 |  | 
| 201 |     UVector32        *fDtran;              // Transitions out of this state. | 
| 202 |                                            //   indexed by input character | 
| 203 |                                            //   contents is int index of dest state | 
| 204 |                                            //   in RBBITableBuilder.fDStates | 
| 205 |  | 
| 206 |     RBBIStateDescriptor(int maxInputSymbol,  UErrorCode *fStatus); | 
| 207 |     ~RBBIStateDescriptor(); | 
| 208 |  | 
| 209 | private: | 
| 210 |     RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class | 
| 211 |     RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class | 
| 212 | }; | 
| 213 |  | 
| 214 |  | 
| 215 |  | 
| 216 | U_NAMESPACE_END | 
| 217 |  | 
| 218 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 
| 219 |  | 
| 220 | #endif | 
| 221 |  |