| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | // | 
|---|
| 4 | //  rbbisetb.h | 
|---|
| 5 | /* | 
|---|
| 6 | ********************************************************************** | 
|---|
| 7 | *   Copyright (c) 2001-2005, International Business Machines | 
|---|
| 8 | *   Corporation and others.  All Rights Reserved. | 
|---|
| 9 | ********************************************************************** | 
|---|
| 10 | */ | 
|---|
| 11 |  | 
|---|
| 12 | #ifndef RBBISETB_H | 
|---|
| 13 | #define RBBISETB_H | 
|---|
| 14 |  | 
|---|
| 15 | #include "unicode/utypes.h" | 
|---|
| 16 |  | 
|---|
| 17 | #if !UCONFIG_NO_BREAK_ITERATION | 
|---|
| 18 |  | 
|---|
| 19 | #include "unicode/uobject.h" | 
|---|
| 20 | #include "rbbirb.h" | 
|---|
| 21 | #include "utrie2.h" | 
|---|
| 22 | #include "uvector.h" | 
|---|
| 23 |  | 
|---|
| 24 | U_NAMESPACE_BEGIN | 
|---|
| 25 |  | 
|---|
| 26 | // | 
|---|
| 27 | //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine | 
|---|
| 28 | //                   from the Unicode Sets appearing in the source  RBBI rules, and | 
|---|
| 29 | //                   creates the TRIE table used to map from Unicode to the | 
|---|
| 30 | //                   character categories. | 
|---|
| 31 | // | 
|---|
| 32 |  | 
|---|
| 33 |  | 
|---|
| 34 | // | 
|---|
| 35 | //  RangeDescriptor | 
|---|
| 36 | // | 
|---|
| 37 | //     Each of the non-overlapping character ranges gets one of these descriptors. | 
|---|
| 38 | //     All of them are strung together in a linked list, which is kept in order | 
|---|
| 39 | //     (by character) | 
|---|
| 40 | // | 
|---|
| 41 | class RangeDescriptor : public UMemory { | 
|---|
| 42 | public: | 
|---|
| 43 | UChar32            fStartChar;      // Start of range, unicode 32 bit value. | 
|---|
| 44 | UChar32            fEndChar;        // End of range, unicode 32 bit value. | 
|---|
| 45 | int32_t            fNum;            // runtime-mapped input value for this range. | 
|---|
| 46 | UVector           *fIncludesSets;   // vector of the the original | 
|---|
| 47 | //   Unicode sets that include this range. | 
|---|
| 48 | //    (Contains ptrs to uset nodes) | 
|---|
| 49 | RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list. | 
|---|
| 50 |  | 
|---|
| 51 | RangeDescriptor(UErrorCode &status); | 
|---|
| 52 | RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); | 
|---|
| 53 | ~RangeDescriptor(); | 
|---|
| 54 | void split(UChar32 where, UErrorCode &status);   // Spit this range in two at "where", with | 
|---|
| 55 | //   where appearing in the second (higher) part. | 
|---|
| 56 | void setDictionaryFlag();           // Check whether this range appears as part of | 
|---|
| 57 | //   the Unicode set named "dictionary" | 
|---|
| 58 |  | 
|---|
| 59 | private: | 
|---|
| 60 | RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class | 
|---|
| 61 | RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class | 
|---|
| 62 | }; | 
|---|
| 63 |  | 
|---|
| 64 |  | 
|---|
| 65 | // | 
|---|
| 66 | //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules. | 
|---|
| 67 | // | 
|---|
| 68 | //      Starting with the rules parse tree from the scanner, | 
|---|
| 69 | // | 
|---|
| 70 | //                   -  Enumerate the set of UnicodeSets that are referenced | 
|---|
| 71 | //                      by the RBBI rules. | 
|---|
| 72 | //                   -  compute a derived set of non-overlapping UnicodeSets | 
|---|
| 73 | //                      that will correspond to columns in the state table for | 
|---|
| 74 | //                      the RBBI execution engine. | 
|---|
| 75 | //                   -  construct the trie table that maps input characters | 
|---|
| 76 | //                      to set numbers in the non-overlapping set of sets. | 
|---|
| 77 | // | 
|---|
| 78 |  | 
|---|
| 79 |  | 
|---|
| 80 | class RBBISetBuilder : public UMemory { | 
|---|
| 81 | public: | 
|---|
| 82 | RBBISetBuilder(RBBIRuleBuilder *rb); | 
|---|
| 83 | ~RBBISetBuilder(); | 
|---|
| 84 |  | 
|---|
| 85 | void     buildRanges(); | 
|---|
| 86 | void     buildTrie(); | 
|---|
| 87 | void     addValToSets(UVector *sets,      uint32_t val); | 
|---|
| 88 | void     addValToSet (RBBINode *usetNode, uint32_t val); | 
|---|
| 89 | int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the | 
|---|
| 90 | //    runtime state machine, which are the same as | 
|---|
| 91 | //    columns in the DFA state table | 
|---|
| 92 | int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie. | 
|---|
| 93 | void     serializeTrie(uint8_t *where);  // write out the serialized Trie. | 
|---|
| 94 | UChar32  getFirstChar(int32_t  val) const; | 
|---|
| 95 | UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo | 
|---|
| 96 | //   character were encountered. | 
|---|
| 97 | /** | 
|---|
| 98 | * Merge two character categories that have been identified as having equivalent behavior. | 
|---|
| 99 | * The ranges belonging to the second category (table column) will be added to the first. | 
|---|
| 100 | * @param categories the pair of categories to be merged. | 
|---|
| 101 | */ | 
|---|
| 102 | void     mergeCategories(IntPair categories); | 
|---|
| 103 |  | 
|---|
| 104 | static constexpr int32_t DICT_BIT = 0x4000; | 
|---|
| 105 |  | 
|---|
| 106 | #ifdef RBBI_DEBUG | 
|---|
| 107 | void     printSets(); | 
|---|
| 108 | void     printRanges(); | 
|---|
| 109 | void     printRangeGroups(); | 
|---|
| 110 | #else | 
|---|
| 111 | #define printSets() | 
|---|
| 112 | #define printRanges() | 
|---|
| 113 | #define printRangeGroups() | 
|---|
| 114 | #endif | 
|---|
| 115 |  | 
|---|
| 116 | private: | 
|---|
| 117 | void           numberSets(); | 
|---|
| 118 |  | 
|---|
| 119 | RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us. | 
|---|
| 120 | UErrorCode            *fStatus; | 
|---|
| 121 |  | 
|---|
| 122 | RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors | 
|---|
| 123 |  | 
|---|
| 124 | UTrie2                *fTrie;           // The mapping TRIE that is the end result of processing | 
|---|
| 125 | uint32_t               fTrieSize;       //  the Unicode Sets. | 
|---|
| 126 |  | 
|---|
| 127 | // Groups correspond to character categories - | 
|---|
| 128 | //       groups of ranges that are in the same original UnicodeSets. | 
|---|
| 129 | //       fGroupCount is the index of the last used group. | 
|---|
| 130 | //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled. | 
|---|
| 131 | //       State table column 0 is not used.  Column 1 is for end-of-input. | 
|---|
| 132 | //       column 2 is for group 0.  Funny counting. | 
|---|
| 133 | int32_t               fGroupCount; | 
|---|
| 134 |  | 
|---|
| 135 | UBool                 fSawBOF; | 
|---|
| 136 |  | 
|---|
| 137 | RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class | 
|---|
| 138 | RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class | 
|---|
| 139 | }; | 
|---|
| 140 |  | 
|---|
| 141 |  | 
|---|
| 142 |  | 
|---|
| 143 | U_NAMESPACE_END | 
|---|
| 144 |  | 
|---|
| 145 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 
|---|
| 146 |  | 
|---|
| 147 | #endif | 
|---|
| 148 |  | 
|---|