1 | // © 2017 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | // ucptrie_impl.h (modified from utrie2_impl.h) |
5 | // created: 2017dec29 Markus W. Scherer |
6 | |
7 | #ifndef __UCPTRIE_IMPL_H__ |
8 | #define __UCPTRIE_IMPL_H__ |
9 | |
10 | #include "unicode/ucptrie.h" |
11 | #ifdef UCPTRIE_DEBUG |
12 | #include "unicode/umutablecptrie.h" |
13 | #endif |
14 | |
15 | // UCPTrie signature values, in platform endianness and opposite endianness. |
16 | // The UCPTrie signature ASCII byte values spell "Tri3". |
17 | #define UCPTRIE_SIG 0x54726933 |
18 | #define UCPTRIE_OE_SIG 0x33697254 |
19 | |
20 | /** |
21 | * Header data for the binary, memory-mappable representation of a UCPTrie/CodePointTrie. |
22 | * @internal |
23 | */ |
24 | struct { |
25 | /** "Tri3" in big-endian US-ASCII (0x54726933) */ |
26 | uint32_t ; |
27 | |
28 | /** |
29 | * Options bit field: |
30 | * Bits 15..12: Data length bits 19..16. |
31 | * Bits 11..8: Data null block offset bits 19..16. |
32 | * Bits 7..6: UCPTrieType |
33 | * Bits 5..3: Reserved (0). |
34 | * Bits 2..0: UCPTrieValueWidth |
35 | */ |
36 | uint16_t ; |
37 | |
38 | /** Total length of the index tables. */ |
39 | uint16_t ; |
40 | |
41 | /** Data length bits 15..0. */ |
42 | uint16_t ; |
43 | |
44 | /** Index-3 null block offset, 0x7fff or 0xffff if none. */ |
45 | uint16_t ; |
46 | |
47 | /** Data null block offset bits 15..0, 0xfffff if none. */ |
48 | uint16_t ; |
49 | |
50 | /** |
51 | * First code point of the single-value range ending with U+10ffff, |
52 | * rounded up and then shifted right by UCPTRIE_SHIFT_2. |
53 | */ |
54 | uint16_t ; |
55 | }; |
56 | |
57 | // Constants for use with UCPTrieHeader.options. |
58 | constexpr uint16_t UCPTRIE_OPTIONS_DATA_LENGTH_MASK = 0xf000; |
59 | constexpr uint16_t UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00; |
60 | constexpr uint16_t UCPTRIE_OPTIONS_RESERVED_MASK = 0x38; |
61 | constexpr uint16_t UCPTRIE_OPTIONS_VALUE_BITS_MASK = 7; |
62 | |
63 | /** |
64 | * Value for index3NullOffset which indicates that there is no index-3 null block. |
65 | * Bit 15 is unused for this value because this bit is used if the index-3 contains |
66 | * 18-bit indexes. |
67 | */ |
68 | constexpr int32_t UCPTRIE_NO_INDEX3_NULL_OFFSET = 0x7fff; |
69 | constexpr int32_t UCPTRIE_NO_DATA_NULL_OFFSET = 0xfffff; |
70 | |
71 | // Internal constants. |
72 | |
73 | /** The length of the BMP index table. 1024=0x400 */ |
74 | constexpr int32_t UCPTRIE_BMP_INDEX_LENGTH = 0x10000 >> UCPTRIE_FAST_SHIFT; |
75 | |
76 | constexpr int32_t UCPTRIE_SMALL_LIMIT = 0x1000; |
77 | constexpr int32_t UCPTRIE_SMALL_INDEX_LENGTH = UCPTRIE_SMALL_LIMIT >> UCPTRIE_FAST_SHIFT; |
78 | |
79 | /** Shift size for getting the index-3 table offset. */ |
80 | constexpr int32_t UCPTRIE_SHIFT_3 = 4; |
81 | |
82 | /** Shift size for getting the index-2 table offset. */ |
83 | constexpr int32_t UCPTRIE_SHIFT_2 = 5 + UCPTRIE_SHIFT_3; |
84 | |
85 | /** Shift size for getting the index-1 table offset. */ |
86 | constexpr int32_t UCPTRIE_SHIFT_1 = 5 + UCPTRIE_SHIFT_2; |
87 | |
88 | /** |
89 | * Difference between two shift sizes, |
90 | * for getting an index-2 offset from an index-3 offset. 5=9-4 |
91 | */ |
92 | constexpr int32_t UCPTRIE_SHIFT_2_3 = UCPTRIE_SHIFT_2 - UCPTRIE_SHIFT_3; |
93 | |
94 | /** |
95 | * Difference between two shift sizes, |
96 | * for getting an index-1 offset from an index-2 offset. 5=14-9 |
97 | */ |
98 | constexpr int32_t UCPTRIE_SHIFT_1_2 = UCPTRIE_SHIFT_1 - UCPTRIE_SHIFT_2; |
99 | |
100 | /** |
101 | * Number of index-1 entries for the BMP. (4) |
102 | * This part of the index-1 table is omitted from the serialized form. |
103 | */ |
104 | constexpr int32_t UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UCPTRIE_SHIFT_1; |
105 | |
106 | /** Number of entries in an index-2 block. 32=0x20 */ |
107 | constexpr int32_t UCPTRIE_INDEX_2_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_1_2; |
108 | |
109 | /** Mask for getting the lower bits for the in-index-2-block offset. */ |
110 | constexpr int32_t UCPTRIE_INDEX_2_MASK = UCPTRIE_INDEX_2_BLOCK_LENGTH - 1; |
111 | |
112 | /** Number of code points per index-2 table entry. 512=0x200 */ |
113 | constexpr int32_t UCPTRIE_CP_PER_INDEX_2_ENTRY = 1 << UCPTRIE_SHIFT_2; |
114 | |
115 | /** Number of entries in an index-3 block. 32=0x20 */ |
116 | constexpr int32_t UCPTRIE_INDEX_3_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_2_3; |
117 | |
118 | /** Mask for getting the lower bits for the in-index-3-block offset. */ |
119 | constexpr int32_t UCPTRIE_INDEX_3_MASK = UCPTRIE_INDEX_3_BLOCK_LENGTH - 1; |
120 | |
121 | /** Number of entries in a small data block. 16=0x10 */ |
122 | constexpr int32_t UCPTRIE_SMALL_DATA_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_3; |
123 | |
124 | /** Mask for getting the lower bits for the in-small-data-block offset. */ |
125 | constexpr int32_t UCPTRIE_SMALL_DATA_MASK = UCPTRIE_SMALL_DATA_BLOCK_LENGTH - 1; |
126 | |
127 | |
128 | typedef UChar32 |
129 | UCPTrieGetRange(const void *trie, UChar32 start, |
130 | UCPMapValueFilter *filter, const void *context, uint32_t *pValue); |
131 | |
132 | U_CFUNC UChar32 |
133 | ucptrie_internalGetRange(UCPTrieGetRange *getRange, |
134 | const void *trie, UChar32 start, |
135 | UCPMapRangeOption option, uint32_t surrogateValue, |
136 | UCPMapValueFilter *filter, const void *context, uint32_t *pValue); |
137 | |
138 | #ifdef UCPTRIE_DEBUG |
139 | U_CFUNC void |
140 | ucptrie_printLengths(const UCPTrie *trie, const char *which); |
141 | |
142 | U_CFUNC void umutablecptrie_setName(UMutableCPTrie *builder, const char *name); |
143 | #endif |
144 | |
145 | /* |
146 | * Format of the binary, memory-mappable representation of a UCPTrie/CodePointTrie. |
147 | * For overview information see https://icu.unicode.org/design/struct/utrie |
148 | * |
149 | * The binary trie data should be 32-bit-aligned. |
150 | * The overall layout is: |
151 | * |
152 | * UCPTrieHeader header; -- 16 bytes, see struct definition above |
153 | * uint16_t index[header.indexLength]; |
154 | * uintXY_t data[header.dataLength]; |
155 | * |
156 | * The trie data array is an array of uint16_t, uint32_t, or uint8_t, |
157 | * specified via the UCPTrieValueWidth when building the trie. |
158 | * The data array is 32-bit-aligned for uint32_t, otherwise 16-bit-aligned. |
159 | * The overall length of the trie data is a multiple of 4 bytes. |
160 | * (Padding is added at the end of the index array and/or near the end of the data array as needed.) |
161 | * |
162 | * The length of the data array (dataLength) is stored as an integer split across two fields |
163 | * of the header struct (high bits in header.options). |
164 | * |
165 | * The trie type can be "fast" or "small" which determines the index structure, |
166 | * specified via the UCPTrieType when building the trie. |
167 | * |
168 | * The type and valueWidth are stored in the header.options. |
169 | * There are reserved type and valueWidth values, and reserved header.options bits. |
170 | * They could be used in future format extensions. |
171 | * Code reading the trie structure must fail with an error when unknown values or options are set. |
172 | * |
173 | * Values for ASCII character (U+0000..U+007F) can always be found at the start of the data array. |
174 | * |
175 | * Values for code points below a type-specific fast-indexing limit are found via two-stage lookup. |
176 | * For a "fast" trie, the limit is the BMP/supplementary boundary at U+10000. |
177 | * For a "small" trie, the limit is UCPTRIE_SMALL_MAX+1=U+1000. |
178 | * |
179 | * All code points in the range highStart..U+10FFFF map to a single highValue |
180 | * which is stored at the second-to-last position of the data array. |
181 | * (See UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET.) |
182 | * The highStart value is header.shiftedHighStart<<UCPTRIE_SHIFT_2. |
183 | * (UCPTRIE_SHIFT_2=9) |
184 | * |
185 | * Values for code points fast_limit..highStart-1 are found via four-stage lookup. |
186 | * The data block size is smaller for this range than for the fast range. |
187 | * This together with more index stages with small blocks makes this range |
188 | * more easily compactable. |
189 | * |
190 | * There is also a trie error value stored at the last position of the data array. |
191 | * (See UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET.) |
192 | * It is intended to be returned for inputs that are not Unicode code points |
193 | * (outside U+0000..U+10FFFF), or in string processing for ill-formed input |
194 | * (unpaired surrogate in UTF-16, ill-formed UTF-8 subsequence). |
195 | * |
196 | * For a "fast" trie: |
197 | * |
198 | * The index array starts with the BMP index table for BMP code point lookup. |
199 | * Its length is 1024=0x400. |
200 | * |
201 | * The supplementary index-1 table follows the BMP index table. |
202 | * Variable length, for code points up to highStart-1. |
203 | * Maximum length 64=0x40=0x100000>>UCPTRIE_SHIFT_1. |
204 | * (For 0x100000 supplementary code points U+10000..U+10ffff.) |
205 | * |
206 | * After this index-1 table follow the variable-length index-3 and index-2 tables. |
207 | * |
208 | * The supplementary index tables are omitted completely |
209 | * if there is only BMP data (highStart<=U+10000). |
210 | * |
211 | * For a "small" trie: |
212 | * |
213 | * The index array starts with a fast-index table for lookup of code points U+0000..U+0FFF. |
214 | * |
215 | * The "supplementary" index tables are always stored. |
216 | * The index-1 table starts from U+0000, its maximum length is 68=0x44=0x110000>>UCPTRIE_SHIFT_1. |
217 | * |
218 | * For both trie types: |
219 | * |
220 | * The last index-2 block may be a partial block, storing indexes only for code points |
221 | * below highStart. |
222 | * |
223 | * Lookup for ASCII code point c: |
224 | * |
225 | * Linear access from the start of the data array. |
226 | * |
227 | * value = data[c]; |
228 | * |
229 | * Lookup for fast-range code point c: |
230 | * |
231 | * Shift the code point right by UCPTRIE_FAST_SHIFT=6 bits, |
232 | * fetch the index array value at that offset, |
233 | * add the lower code point bits, index into the data array. |
234 | * |
235 | * value = data[index[c>>6] + (c&0x3f)]; |
236 | * |
237 | * (This works for ASCII as well.) |
238 | * |
239 | * Lookup for small-range code point c below highStart: |
240 | * |
241 | * Split the code point into four bit fields using several sets of shifts & masks |
242 | * to read consecutive values from the index-1, index-2, index-3 and data tables. |
243 | * |
244 | * If all of the data block offsets in an index-3 block fit within 16 bits (up to 0xffff), |
245 | * then the data block offsets are stored directly as uint16_t. |
246 | * |
247 | * Otherwise (this is very unusual but possible), the index-2 entry for the index-3 block |
248 | * has bit 15 (0x8000) set, and each set of 8 index-3 entries is preceded by |
249 | * an additional uint16_t word. Data block offsets are 18 bits wide, with the top 2 bits stored |
250 | * in the additional word. |
251 | * |
252 | * See ucptrie_internalSmallIndex() for details. |
253 | * |
254 | * (In a "small" trie, this works for ASCII and below-fast_limit code points as well.) |
255 | * |
256 | * Compaction: |
257 | * |
258 | * Multiple code point ranges ("blocks") that are aligned on certain boundaries |
259 | * (determined by the shifting/bit fields of code points) and |
260 | * map to the same data values normally share a single subsequence of the data array. |
261 | * Data blocks can also overlap partially. |
262 | * (Depending on the builder code finding duplicate and overlapping blocks.) |
263 | * |
264 | * Iteration over same-value ranges: |
265 | * |
266 | * Range iteration (ucptrie_getRange()) walks the structure from a start code point |
267 | * until some code point is found that maps to a different value; |
268 | * the end of the returned range is just before that. |
269 | * |
270 | * The header.dataNullOffset (split across two header fields, high bits in header.options) |
271 | * is the offset of a widely shared data block filled with one single value. |
272 | * It helps quickly skip over large ranges of data with that value. |
273 | * The builder must ensure that if the start of any data block (fast or small) |
274 | * matches the dataNullOffset, then the whole block must be filled with the null value. |
275 | * Special care must be taken if there is no fast null data block |
276 | * but a small one, which is shorter, and it matches the *start* of some fast data block. |
277 | * |
278 | * Similarly, the header.index3NullOffset is the index-array offset of an index-3 block |
279 | * where all index entries point to the dataNullOffset. |
280 | * If there is no such data or index-3 block, then these offsets are set to |
281 | * values that cannot be reached (data offset out of range/reserved index offset), |
282 | * normally UCPTRIE_NO_DATA_NULL_OFFSET or UCPTRIE_NO_INDEX3_NULL_OFFSET respectively. |
283 | */ |
284 | |
285 | #endif |
286 | |