| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | // |
| 4 | // rbbisetb.h |
| 5 | /* |
| 6 | ********************************************************************** |
| 7 | * Copyright (c) 2001-2005, International Business Machines |
| 8 | * Corporation and others. All Rights Reserved. |
| 9 | ********************************************************************** |
| 10 | */ |
| 11 | |
| 12 | #ifndef RBBISETB_H |
| 13 | #define RBBISETB_H |
| 14 | |
| 15 | #include "unicode/utypes.h" |
| 16 | |
| 17 | #if !UCONFIG_NO_BREAK_ITERATION |
| 18 | |
| 19 | #include "unicode/uobject.h" |
| 20 | #include "rbbirb.h" |
| 21 | #include "utrie2.h" |
| 22 | #include "uvector.h" |
| 23 | |
| 24 | U_NAMESPACE_BEGIN |
| 25 | |
| 26 | // |
| 27 | // RBBISetBuilder Derives the character categories used by the runtime RBBI engine |
| 28 | // from the Unicode Sets appearing in the source RBBI rules, and |
| 29 | // creates the TRIE table used to map from Unicode to the |
| 30 | // character categories. |
| 31 | // |
| 32 | |
| 33 | |
| 34 | // |
| 35 | // RangeDescriptor |
| 36 | // |
| 37 | // Each of the non-overlapping character ranges gets one of these descriptors. |
| 38 | // All of them are strung together in a linked list, which is kept in order |
| 39 | // (by character) |
| 40 | // |
| 41 | class RangeDescriptor : public UMemory { |
| 42 | public: |
| 43 | UChar32 fStartChar; // Start of range, unicode 32 bit value. |
| 44 | UChar32 fEndChar; // End of range, unicode 32 bit value. |
| 45 | int32_t fNum; // runtime-mapped input value for this range. |
| 46 | UVector *fIncludesSets; // vector of the the original |
| 47 | // Unicode sets that include this range. |
| 48 | // (Contains ptrs to uset nodes) |
| 49 | RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. |
| 50 | |
| 51 | RangeDescriptor(UErrorCode &status); |
| 52 | RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); |
| 53 | ~RangeDescriptor(); |
| 54 | void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with |
| 55 | // where appearing in the second (higher) part. |
| 56 | void setDictionaryFlag(); // Check whether this range appears as part of |
| 57 | // the Unicode set named "dictionary" |
| 58 | |
| 59 | private: |
| 60 | RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class |
| 61 | RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class |
| 62 | }; |
| 63 | |
| 64 | |
| 65 | // |
| 66 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. |
| 67 | // |
| 68 | // Starting with the rules parse tree from the scanner, |
| 69 | // |
| 70 | // - Enumerate the set of UnicodeSets that are referenced |
| 71 | // by the RBBI rules. |
| 72 | // - compute a derived set of non-overlapping UnicodeSets |
| 73 | // that will correspond to columns in the state table for |
| 74 | // the RBBI execution engine. |
| 75 | // - construct the trie table that maps input characters |
| 76 | // to set numbers in the non-overlapping set of sets. |
| 77 | // |
| 78 | |
| 79 | |
| 80 | class RBBISetBuilder : public UMemory { |
| 81 | public: |
| 82 | RBBISetBuilder(RBBIRuleBuilder *rb); |
| 83 | ~RBBISetBuilder(); |
| 84 | |
| 85 | void buildRanges(); |
| 86 | void buildTrie(); |
| 87 | void addValToSets(UVector *sets, uint32_t val); |
| 88 | void addValToSet (RBBINode *usetNode, uint32_t val); |
| 89 | int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the |
| 90 | // runtime state machine, which are the same as |
| 91 | // columns in the DFA state table |
| 92 | int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. |
| 93 | void serializeTrie(uint8_t *where); // write out the serialized Trie. |
| 94 | UChar32 getFirstChar(int32_t val) const; |
| 95 | UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo |
| 96 | // character were encountered. |
| 97 | /** |
| 98 | * Merge two character categories that have been identified as having equivalent behavior. |
| 99 | * The ranges belonging to the second category (table column) will be added to the first. |
| 100 | * @param categories the pair of categories to be merged. |
| 101 | */ |
| 102 | void mergeCategories(IntPair categories); |
| 103 | |
| 104 | static constexpr int32_t DICT_BIT = 0x4000; |
| 105 | |
| 106 | #ifdef RBBI_DEBUG |
| 107 | void printSets(); |
| 108 | void printRanges(); |
| 109 | void printRangeGroups(); |
| 110 | #else |
| 111 | #define printSets() |
| 112 | #define printRanges() |
| 113 | #define printRangeGroups() |
| 114 | #endif |
| 115 | |
| 116 | private: |
| 117 | void numberSets(); |
| 118 | |
| 119 | RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. |
| 120 | UErrorCode *fStatus; |
| 121 | |
| 122 | RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors |
| 123 | |
| 124 | UTrie2 *fTrie; // The mapping TRIE that is the end result of processing |
| 125 | uint32_t fTrieSize; // the Unicode Sets. |
| 126 | |
| 127 | // Groups correspond to character categories - |
| 128 | // groups of ranges that are in the same original UnicodeSets. |
| 129 | // fGroupCount is the index of the last used group. |
| 130 | // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. |
| 131 | // State table column 0 is not used. Column 1 is for end-of-input. |
| 132 | // column 2 is for group 0. Funny counting. |
| 133 | int32_t fGroupCount; |
| 134 | |
| 135 | UBool fSawBOF; |
| 136 | |
| 137 | RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class |
| 138 | RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class |
| 139 | }; |
| 140 | |
| 141 | |
| 142 | |
| 143 | U_NAMESPACE_END |
| 144 | |
| 145 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| 146 | |
| 147 | #endif |
| 148 | |