| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | // |
| 4 | // rbbisetb.h |
| 5 | /* |
| 6 | ********************************************************************** |
| 7 | * Copyright (c) 2001-2005, International Business Machines |
| 8 | * Corporation and others. All Rights Reserved. |
| 9 | ********************************************************************** |
| 10 | */ |
| 11 | |
| 12 | #ifndef RBBISETB_H |
| 13 | #define RBBISETB_H |
| 14 | |
| 15 | #include "unicode/utypes.h" |
| 16 | |
| 17 | #if !UCONFIG_NO_BREAK_ITERATION |
| 18 | |
| 19 | #include "unicode/ucptrie.h" |
| 20 | #include "unicode/umutablecptrie.h" |
| 21 | #include "unicode/uobject.h" |
| 22 | #include "rbbirb.h" |
| 23 | #include "uvector.h" |
| 24 | |
| 25 | U_NAMESPACE_BEGIN |
| 26 | |
| 27 | // |
| 28 | // RBBISetBuilder Derives the character categories used by the runtime RBBI engine |
| 29 | // from the Unicode Sets appearing in the source RBBI rules, and |
| 30 | // creates the TRIE table used to map from Unicode to the |
| 31 | // character categories. |
| 32 | // |
| 33 | |
| 34 | |
| 35 | // |
| 36 | // RangeDescriptor |
| 37 | // |
| 38 | // Each of the non-overlapping character ranges gets one of these descriptors. |
| 39 | // All of them are strung together in a linked list, which is kept in order |
| 40 | // (by character) |
| 41 | // |
| 42 | class RangeDescriptor : public UMemory { |
| 43 | public: |
| 44 | UChar32 fStartChar {}; // Start of range, unicode 32 bit value. |
| 45 | UChar32 fEndChar {}; // End of range, unicode 32 bit value. |
| 46 | int32_t fNum {0}; // runtime-mapped input value for this range. |
| 47 | bool fIncludesDict {false}; // True if the range includes $dictionary. |
| 48 | bool fFirstInGroup {false}; // True if first range in a group with the same fNum. |
| 49 | UVector *fIncludesSets {nullptr}; // vector of the the original |
| 50 | // Unicode sets that include this range. |
| 51 | // (Contains ptrs to uset nodes) |
| 52 | RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list. |
| 53 | |
| 54 | RangeDescriptor(UErrorCode &status); |
| 55 | RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); |
| 56 | ~RangeDescriptor(); |
| 57 | void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with |
| 58 | // where appearing in the second (higher) part. |
| 59 | bool isDictionaryRange(); // Check whether this range appears as part of |
| 60 | // the Unicode set named "dictionary" |
| 61 | |
| 62 | RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class |
| 63 | RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class |
| 64 | }; |
| 65 | |
| 66 | |
| 67 | // |
| 68 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. |
| 69 | // |
| 70 | // Starting with the rules parse tree from the scanner, |
| 71 | // |
| 72 | // - Enumerate the set of UnicodeSets that are referenced |
| 73 | // by the RBBI rules. |
| 74 | // - compute a derived set of non-overlapping UnicodeSets |
| 75 | // that will correspond to columns in the state table for |
| 76 | // the RBBI execution engine. |
| 77 | // - construct the trie table that maps input characters |
| 78 | // to set numbers in the non-overlapping set of sets. |
| 79 | // |
| 80 | |
| 81 | |
| 82 | class RBBISetBuilder : public UMemory { |
| 83 | public: |
| 84 | RBBISetBuilder(RBBIRuleBuilder *rb); |
| 85 | ~RBBISetBuilder(); |
| 86 | |
| 87 | void buildRanges(); |
| 88 | void buildTrie(); |
| 89 | void addValToSets(UVector *sets, uint32_t val); |
| 90 | void addValToSet (RBBINode *usetNode, uint32_t val); |
| 91 | int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the |
| 92 | // runtime state machine, which are the same as |
| 93 | // columns in the DFA state table |
| 94 | int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or |
| 95 | // last category + 1 if there are no dictionary categories. |
| 96 | int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. |
| 97 | void serializeTrie(uint8_t *where); // write out the serialized Trie. |
| 98 | UChar32 getFirstChar(int32_t val) const; |
| 99 | UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo |
| 100 | // character were encountered. |
| 101 | /** |
| 102 | * Merge two character categories that have been identified as having equivalent behavior. |
| 103 | * The ranges belonging to the second category (table column) will be added to the first. |
| 104 | * @param categories the pair of categories to be merged. |
| 105 | */ |
| 106 | void mergeCategories(IntPair categories); |
| 107 | |
| 108 | #ifdef RBBI_DEBUG |
| 109 | void printSets(); |
| 110 | void printRanges(); |
| 111 | void printRangeGroups(); |
| 112 | #else |
| 113 | #define printSets() |
| 114 | #define printRanges() |
| 115 | #define printRangeGroups() |
| 116 | #endif |
| 117 | |
| 118 | private: |
| 119 | RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. |
| 120 | UErrorCode *fStatus; |
| 121 | |
| 122 | RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors |
| 123 | |
| 124 | UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing |
| 125 | UCPTrie *fTrie; // the Unicode Sets. |
| 126 | uint32_t fTrieSize; |
| 127 | |
| 128 | // Number of range groups, which are groups of ranges that are in the same original UnicodeSets. |
| 129 | int32_t fGroupCount; |
| 130 | |
| 131 | // The number of the first dictionary char category. |
| 132 | // If there are no Dictionary categories, set to the last category + 1. |
| 133 | int32_t fDictCategoriesStart; |
| 134 | |
| 135 | UBool fSawBOF; |
| 136 | |
| 137 | RBBISetBuilder(const RBBISetBuilder &other) = delete; // forbid copying of this class |
| 138 | RBBISetBuilder &operator=(const RBBISetBuilder &other) = delete; // forbid copying of this class |
| 139 | }; |
| 140 | |
| 141 | |
| 142 | |
| 143 | U_NAMESPACE_END |
| 144 | |
| 145 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| 146 | |
| 147 | #endif |
| 148 | |