1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbisetb.h |
5 | /* |
6 | ********************************************************************** |
7 | * Copyright (c) 2001-2005, International Business Machines |
8 | * Corporation and others. All Rights Reserved. |
9 | ********************************************************************** |
10 | */ |
11 | |
12 | #ifndef RBBISETB_H |
13 | #define RBBISETB_H |
14 | |
15 | #include "unicode/utypes.h" |
16 | |
17 | #if !UCONFIG_NO_BREAK_ITERATION |
18 | |
19 | #include "unicode/ucptrie.h" |
20 | #include "unicode/umutablecptrie.h" |
21 | #include "unicode/uobject.h" |
22 | #include "rbbirb.h" |
23 | #include "uvector.h" |
24 | |
25 | U_NAMESPACE_BEGIN |
26 | |
27 | // |
28 | // RBBISetBuilder Derives the character categories used by the runtime RBBI engine |
29 | // from the Unicode Sets appearing in the source RBBI rules, and |
30 | // creates the TRIE table used to map from Unicode to the |
31 | // character categories. |
32 | // |
33 | |
34 | |
35 | // |
36 | // RangeDescriptor |
37 | // |
38 | // Each of the non-overlapping character ranges gets one of these descriptors. |
39 | // All of them are strung together in a linked list, which is kept in order |
40 | // (by character) |
41 | // |
42 | class RangeDescriptor : public UMemory { |
43 | public: |
44 | UChar32 fStartChar {}; // Start of range, unicode 32 bit value. |
45 | UChar32 fEndChar {}; // End of range, unicode 32 bit value. |
46 | int32_t fNum {0}; // runtime-mapped input value for this range. |
47 | bool fIncludesDict {false}; // True if the range includes $dictionary. |
48 | bool fFirstInGroup {false}; // True if first range in a group with the same fNum. |
49 | UVector *fIncludesSets {nullptr}; // vector of the the original |
50 | // Unicode sets that include this range. |
51 | // (Contains ptrs to uset nodes) |
52 | RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list. |
53 | |
54 | RangeDescriptor(UErrorCode &status); |
55 | RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); |
56 | ~RangeDescriptor(); |
57 | void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with |
58 | // where appearing in the second (higher) part. |
59 | bool isDictionaryRange(); // Check whether this range appears as part of |
60 | // the Unicode set named "dictionary" |
61 | |
62 | RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class |
63 | RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class |
64 | }; |
65 | |
66 | |
67 | // |
68 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. |
69 | // |
70 | // Starting with the rules parse tree from the scanner, |
71 | // |
72 | // - Enumerate the set of UnicodeSets that are referenced |
73 | // by the RBBI rules. |
74 | // - compute a derived set of non-overlapping UnicodeSets |
75 | // that will correspond to columns in the state table for |
76 | // the RBBI execution engine. |
77 | // - construct the trie table that maps input characters |
78 | // to set numbers in the non-overlapping set of sets. |
79 | // |
80 | |
81 | |
82 | class RBBISetBuilder : public UMemory { |
83 | public: |
84 | RBBISetBuilder(RBBIRuleBuilder *rb); |
85 | ~RBBISetBuilder(); |
86 | |
87 | void buildRanges(); |
88 | void buildTrie(); |
89 | void addValToSets(UVector *sets, uint32_t val); |
90 | void addValToSet (RBBINode *usetNode, uint32_t val); |
91 | int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the |
92 | // runtime state machine, which are the same as |
93 | // columns in the DFA state table |
94 | int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or |
95 | // last category + 1 if there are no dictionary categories. |
96 | int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. |
97 | void serializeTrie(uint8_t *where); // write out the serialized Trie. |
98 | UChar32 getFirstChar(int32_t val) const; |
99 | UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo |
100 | // character were encountered. |
101 | /** |
102 | * Merge two character categories that have been identified as having equivalent behavior. |
103 | * The ranges belonging to the second category (table column) will be added to the first. |
104 | * @param categories the pair of categories to be merged. |
105 | */ |
106 | void mergeCategories(IntPair categories); |
107 | |
108 | #ifdef RBBI_DEBUG |
109 | void printSets(); |
110 | void printRanges(); |
111 | void printRangeGroups(); |
112 | #else |
113 | #define printSets() |
114 | #define printRanges() |
115 | #define printRangeGroups() |
116 | #endif |
117 | |
118 | private: |
119 | RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. |
120 | UErrorCode *fStatus; |
121 | |
122 | RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors |
123 | |
124 | UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing |
125 | UCPTrie *fTrie; // the Unicode Sets. |
126 | uint32_t fTrieSize; |
127 | |
128 | // Number of range groups, which are groups of ranges that are in the same original UnicodeSets. |
129 | int32_t fGroupCount; |
130 | |
131 | // The number of the first dictionary char category. |
132 | // If there are no Dictionary categories, set to the last category + 1. |
133 | int32_t fDictCategoriesStart; |
134 | |
135 | UBool fSawBOF; |
136 | |
137 | RBBISetBuilder(const RBBISetBuilder &other) = delete; // forbid copying of this class |
138 | RBBISetBuilder &operator=(const RBBISetBuilder &other) = delete; // forbid copying of this class |
139 | }; |
140 | |
141 | |
142 | |
143 | U_NAMESPACE_END |
144 | |
145 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
146 | |
147 | #endif |
148 | |