1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | // |
4 | // rbbisetb.h |
5 | /* |
6 | ********************************************************************** |
7 | * Copyright (c) 2001-2005, International Business Machines |
8 | * Corporation and others. All Rights Reserved. |
9 | ********************************************************************** |
10 | */ |
11 | |
12 | #ifndef RBBISETB_H |
13 | #define RBBISETB_H |
14 | |
15 | #include "unicode/utypes.h" |
16 | |
17 | #if !UCONFIG_NO_BREAK_ITERATION |
18 | |
19 | #include "unicode/uobject.h" |
20 | #include "rbbirb.h" |
21 | #include "utrie2.h" |
22 | #include "uvector.h" |
23 | |
24 | U_NAMESPACE_BEGIN |
25 | |
26 | // |
27 | // RBBISetBuilder Derives the character categories used by the runtime RBBI engine |
28 | // from the Unicode Sets appearing in the source RBBI rules, and |
29 | // creates the TRIE table used to map from Unicode to the |
30 | // character categories. |
31 | // |
32 | |
33 | |
34 | // |
35 | // RangeDescriptor |
36 | // |
37 | // Each of the non-overlapping character ranges gets one of these descriptors. |
38 | // All of them are strung together in a linked list, which is kept in order |
39 | // (by character) |
40 | // |
41 | class RangeDescriptor : public UMemory { |
42 | public: |
43 | UChar32 fStartChar; // Start of range, unicode 32 bit value. |
44 | UChar32 fEndChar; // End of range, unicode 32 bit value. |
45 | int32_t fNum; // runtime-mapped input value for this range. |
46 | UVector *fIncludesSets; // vector of the the original |
47 | // Unicode sets that include this range. |
48 | // (Contains ptrs to uset nodes) |
49 | RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. |
50 | |
51 | RangeDescriptor(UErrorCode &status); |
52 | RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); |
53 | ~RangeDescriptor(); |
54 | void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with |
55 | // where appearing in the second (higher) part. |
56 | void setDictionaryFlag(); // Check whether this range appears as part of |
57 | // the Unicode set named "dictionary" |
58 | |
59 | private: |
60 | RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class |
61 | RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class |
62 | }; |
63 | |
64 | |
65 | // |
66 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. |
67 | // |
68 | // Starting with the rules parse tree from the scanner, |
69 | // |
70 | // - Enumerate the set of UnicodeSets that are referenced |
71 | // by the RBBI rules. |
72 | // - compute a derived set of non-overlapping UnicodeSets |
73 | // that will correspond to columns in the state table for |
74 | // the RBBI execution engine. |
75 | // - construct the trie table that maps input characters |
76 | // to set numbers in the non-overlapping set of sets. |
77 | // |
78 | |
79 | |
80 | class RBBISetBuilder : public UMemory { |
81 | public: |
82 | RBBISetBuilder(RBBIRuleBuilder *rb); |
83 | ~RBBISetBuilder(); |
84 | |
85 | void buildRanges(); |
86 | void buildTrie(); |
87 | void addValToSets(UVector *sets, uint32_t val); |
88 | void addValToSet (RBBINode *usetNode, uint32_t val); |
89 | int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the |
90 | // runtime state machine, which are the same as |
91 | // columns in the DFA state table |
92 | int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. |
93 | void serializeTrie(uint8_t *where); // write out the serialized Trie. |
94 | UChar32 getFirstChar(int32_t val) const; |
95 | UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo |
96 | // character were encountered. |
97 | /** |
98 | * Merge two character categories that have been identified as having equivalent behavior. |
99 | * The ranges belonging to the second category (table column) will be added to the first. |
100 | * @param categories the pair of categories to be merged. |
101 | */ |
102 | void mergeCategories(IntPair categories); |
103 | |
104 | static constexpr int32_t DICT_BIT = 0x4000; |
105 | |
106 | #ifdef RBBI_DEBUG |
107 | void printSets(); |
108 | void printRanges(); |
109 | void printRangeGroups(); |
110 | #else |
111 | #define printSets() |
112 | #define printRanges() |
113 | #define printRangeGroups() |
114 | #endif |
115 | |
116 | private: |
117 | void numberSets(); |
118 | |
119 | RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. |
120 | UErrorCode *fStatus; |
121 | |
122 | RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors |
123 | |
124 | UTrie2 *fTrie; // The mapping TRIE that is the end result of processing |
125 | uint32_t fTrieSize; // the Unicode Sets. |
126 | |
127 | // Groups correspond to character categories - |
128 | // groups of ranges that are in the same original UnicodeSets. |
129 | // fGroupCount is the index of the last used group. |
130 | // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. |
131 | // State table column 0 is not used. Column 1 is for end-of-input. |
132 | // column 2 is for group 0. Funny counting. |
133 | int32_t fGroupCount; |
134 | |
135 | UBool fSawBOF; |
136 | |
137 | RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class |
138 | RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class |
139 | }; |
140 | |
141 | |
142 | |
143 | U_NAMESPACE_END |
144 | |
145 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
146 | |
147 | #endif |
148 | |