| 1 | // © 2017 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 |  | 
|---|
| 4 | // ucptrie_impl.h (modified from utrie2_impl.h) | 
|---|
| 5 | // created: 2017dec29 Markus W. Scherer | 
|---|
| 6 |  | 
|---|
| 7 | #ifndef __UCPTRIE_IMPL_H__ | 
|---|
| 8 | #define __UCPTRIE_IMPL_H__ | 
|---|
| 9 |  | 
|---|
| 10 | #include "unicode/ucptrie.h" | 
|---|
| 11 | #ifdef UCPTRIE_DEBUG | 
|---|
| 12 | #include "unicode/umutablecptrie.h" | 
|---|
| 13 | #endif | 
|---|
| 14 |  | 
|---|
| 15 | // UCPTrie signature values, in platform endianness and opposite endianness. | 
|---|
| 16 | // The UCPTrie signature ASCII byte values spell "Tri3". | 
|---|
| 17 | #define UCPTRIE_SIG     0x54726933 | 
|---|
| 18 | #define UCPTRIE_OE_SIG  0x33697254 | 
|---|
| 19 |  | 
|---|
| 20 | /** | 
|---|
| 21 | * Header data for the binary, memory-mappable representation of a UCPTrie/CodePointTrie. | 
|---|
| 22 | * @internal | 
|---|
| 23 | */ | 
|---|
| 24 | struct  { | 
|---|
| 25 | /** "Tri3" in big-endian US-ASCII (0x54726933) */ | 
|---|
| 26 | uint32_t ; | 
|---|
| 27 |  | 
|---|
| 28 | /** | 
|---|
| 29 | * Options bit field: | 
|---|
| 30 | * Bits 15..12: Data length bits 19..16. | 
|---|
| 31 | * Bits 11..8: Data null block offset bits 19..16. | 
|---|
| 32 | * Bits 7..6: UCPTrieType | 
|---|
| 33 | * Bits 5..3: Reserved (0). | 
|---|
| 34 | * Bits 2..0: UCPTrieValueWidth | 
|---|
| 35 | */ | 
|---|
| 36 | uint16_t ; | 
|---|
| 37 |  | 
|---|
| 38 | /** Total length of the index tables. */ | 
|---|
| 39 | uint16_t ; | 
|---|
| 40 |  | 
|---|
| 41 | /** Data length bits 15..0. */ | 
|---|
| 42 | uint16_t ; | 
|---|
| 43 |  | 
|---|
| 44 | /** Index-3 null block offset, 0x7fff or 0xffff if none. */ | 
|---|
| 45 | uint16_t ; | 
|---|
| 46 |  | 
|---|
| 47 | /** Data null block offset bits 15..0, 0xfffff if none. */ | 
|---|
| 48 | uint16_t ; | 
|---|
| 49 |  | 
|---|
| 50 | /** | 
|---|
| 51 | * First code point of the single-value range ending with U+10ffff, | 
|---|
| 52 | * rounded up and then shifted right by UCPTRIE_SHIFT_2. | 
|---|
| 53 | */ | 
|---|
| 54 | uint16_t ; | 
|---|
| 55 | }; | 
|---|
| 56 |  | 
|---|
| 57 | /** | 
|---|
| 58 | * Constants for use with UCPTrieHeader.options. | 
|---|
| 59 | * @internal | 
|---|
| 60 | */ | 
|---|
| 61 | enum { | 
|---|
| 62 | UCPTRIE_OPTIONS_DATA_LENGTH_MASK = 0xf000, | 
|---|
| 63 | UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00, | 
|---|
| 64 | UCPTRIE_OPTIONS_RESERVED_MASK = 0x38, | 
|---|
| 65 | UCPTRIE_OPTIONS_VALUE_BITS_MASK = 7, | 
|---|
| 66 | /** | 
|---|
| 67 | * Value for index3NullOffset which indicates that there is no index-3 null block. | 
|---|
| 68 | * Bit 15 is unused for this value because this bit is used if the index-3 contains | 
|---|
| 69 | * 18-bit indexes. | 
|---|
| 70 | */ | 
|---|
| 71 | UCPTRIE_NO_INDEX3_NULL_OFFSET = 0x7fff, | 
|---|
| 72 | UCPTRIE_NO_DATA_NULL_OFFSET = 0xfffff | 
|---|
| 73 | }; | 
|---|
| 74 |  | 
|---|
| 75 | // Internal constants. | 
|---|
| 76 | enum { | 
|---|
| 77 | /** The length of the BMP index table. 1024=0x400 */ | 
|---|
| 78 | UCPTRIE_BMP_INDEX_LENGTH = 0x10000 >> UCPTRIE_FAST_SHIFT, | 
|---|
| 79 |  | 
|---|
| 80 | UCPTRIE_SMALL_LIMIT = 0x1000, | 
|---|
| 81 | UCPTRIE_SMALL_INDEX_LENGTH = UCPTRIE_SMALL_LIMIT >> UCPTRIE_FAST_SHIFT, | 
|---|
| 82 |  | 
|---|
| 83 | /** Shift size for getting the index-3 table offset. */ | 
|---|
| 84 | UCPTRIE_SHIFT_3 = 4, | 
|---|
| 85 |  | 
|---|
| 86 | /** Shift size for getting the index-2 table offset. */ | 
|---|
| 87 | UCPTRIE_SHIFT_2 = 5 + UCPTRIE_SHIFT_3, | 
|---|
| 88 |  | 
|---|
| 89 | /** Shift size for getting the index-1 table offset. */ | 
|---|
| 90 | UCPTRIE_SHIFT_1 = 5 + UCPTRIE_SHIFT_2, | 
|---|
| 91 |  | 
|---|
| 92 | /** | 
|---|
| 93 | * Difference between two shift sizes, | 
|---|
| 94 | * for getting an index-2 offset from an index-3 offset. 5=9-4 | 
|---|
| 95 | */ | 
|---|
| 96 | UCPTRIE_SHIFT_2_3 = UCPTRIE_SHIFT_2 - UCPTRIE_SHIFT_3, | 
|---|
| 97 |  | 
|---|
| 98 | /** | 
|---|
| 99 | * Difference between two shift sizes, | 
|---|
| 100 | * for getting an index-1 offset from an index-2 offset. 5=14-9 | 
|---|
| 101 | */ | 
|---|
| 102 | UCPTRIE_SHIFT_1_2 = UCPTRIE_SHIFT_1 - UCPTRIE_SHIFT_2, | 
|---|
| 103 |  | 
|---|
| 104 | /** | 
|---|
| 105 | * Number of index-1 entries for the BMP. (4) | 
|---|
| 106 | * This part of the index-1 table is omitted from the serialized form. | 
|---|
| 107 | */ | 
|---|
| 108 | UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UCPTRIE_SHIFT_1, | 
|---|
| 109 |  | 
|---|
| 110 | /** Number of entries in an index-2 block. 32=0x20 */ | 
|---|
| 111 | UCPTRIE_INDEX_2_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_1_2, | 
|---|
| 112 |  | 
|---|
| 113 | /** Mask for getting the lower bits for the in-index-2-block offset. */ | 
|---|
| 114 | UCPTRIE_INDEX_2_MASK = UCPTRIE_INDEX_2_BLOCK_LENGTH - 1, | 
|---|
| 115 |  | 
|---|
| 116 | /** Number of code points per index-2 table entry. 512=0x200 */ | 
|---|
| 117 | UCPTRIE_CP_PER_INDEX_2_ENTRY = 1 << UCPTRIE_SHIFT_2, | 
|---|
| 118 |  | 
|---|
| 119 | /** Number of entries in an index-3 block. 32=0x20 */ | 
|---|
| 120 | UCPTRIE_INDEX_3_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_2_3, | 
|---|
| 121 |  | 
|---|
| 122 | /** Mask for getting the lower bits for the in-index-3-block offset. */ | 
|---|
| 123 | UCPTRIE_INDEX_3_MASK = UCPTRIE_INDEX_3_BLOCK_LENGTH - 1, | 
|---|
| 124 |  | 
|---|
| 125 | /** Number of entries in a small data block. 16=0x10 */ | 
|---|
| 126 | UCPTRIE_SMALL_DATA_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_3, | 
|---|
| 127 |  | 
|---|
| 128 | /** Mask for getting the lower bits for the in-small-data-block offset. */ | 
|---|
| 129 | UCPTRIE_SMALL_DATA_MASK = UCPTRIE_SMALL_DATA_BLOCK_LENGTH - 1 | 
|---|
| 130 | }; | 
|---|
| 131 |  | 
|---|
| 132 | typedef UChar32 | 
|---|
| 133 | UCPTrieGetRange(const void *trie, UChar32 start, | 
|---|
| 134 | UCPMapValueFilter *filter, const void *context, uint32_t *pValue); | 
|---|
| 135 |  | 
|---|
| 136 | U_CFUNC UChar32 | 
|---|
| 137 | ucptrie_internalGetRange(UCPTrieGetRange *getRange, | 
|---|
| 138 | const void *trie, UChar32 start, | 
|---|
| 139 | UCPMapRangeOption option, uint32_t surrogateValue, | 
|---|
| 140 | UCPMapValueFilter *filter, const void *context, uint32_t *pValue); | 
|---|
| 141 |  | 
|---|
| 142 | #ifdef UCPTRIE_DEBUG | 
|---|
| 143 | U_CFUNC void | 
|---|
| 144 | ucptrie_printLengths(const UCPTrie *trie, const char *which); | 
|---|
| 145 |  | 
|---|
| 146 | U_CFUNC void umutablecptrie_setName(UMutableCPTrie *builder, const char *name); | 
|---|
| 147 | #endif | 
|---|
| 148 |  | 
|---|
| 149 | /* | 
|---|
| 150 | * Format of the binary, memory-mappable representation of a UCPTrie/CodePointTrie. | 
|---|
| 151 | * For overview information see http://site.icu-project.org/design/struct/utrie | 
|---|
| 152 | * | 
|---|
| 153 | * The binary trie data should be 32-bit-aligned. | 
|---|
| 154 | * The overall layout is: | 
|---|
| 155 | * | 
|---|
| 156 | * UCPTrieHeader header; -- 16 bytes, see struct definition above | 
|---|
| 157 | * uint16_t index[header.indexLength]; | 
|---|
| 158 | * uintXY_t data[header.dataLength]; | 
|---|
| 159 | * | 
|---|
| 160 | * The trie data array is an array of uint16_t, uint32_t, or uint8_t, | 
|---|
| 161 | * specified via the UCPTrieValueWidth when building the trie. | 
|---|
| 162 | * The data array is 32-bit-aligned for uint32_t, otherwise 16-bit-aligned. | 
|---|
| 163 | * The overall length of the trie data is a multiple of 4 bytes. | 
|---|
| 164 | * (Padding is added at the end of the index array and/or near the end of the data array as needed.) | 
|---|
| 165 | * | 
|---|
| 166 | * The length of the data array (dataLength) is stored as an integer split across two fields | 
|---|
| 167 | * of the header struct (high bits in header.options). | 
|---|
| 168 | * | 
|---|
| 169 | * The trie type can be "fast" or "small" which determines the index structure, | 
|---|
| 170 | * specified via the UCPTrieType when building the trie. | 
|---|
| 171 | * | 
|---|
| 172 | * The type and valueWidth are stored in the header.options. | 
|---|
| 173 | * There are reserved type and valueWidth values, and reserved header.options bits. | 
|---|
| 174 | * They could be used in future format extensions. | 
|---|
| 175 | * Code reading the trie structure must fail with an error when unknown values or options are set. | 
|---|
| 176 | * | 
|---|
| 177 | * Values for ASCII character (U+0000..U+007F) can always be found at the start of the data array. | 
|---|
| 178 | * | 
|---|
| 179 | * Values for code points below a type-specific fast-indexing limit are found via two-stage lookup. | 
|---|
| 180 | * For a "fast" trie, the limit is the BMP/supplementary boundary at U+10000. | 
|---|
| 181 | * For a "small" trie, the limit is UCPTRIE_SMALL_MAX+1=U+1000. | 
|---|
| 182 | * | 
|---|
| 183 | * All code points in the range highStart..U+10FFFF map to a single highValue | 
|---|
| 184 | * which is stored at the second-to-last position of the data array. | 
|---|
| 185 | * (See UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET.) | 
|---|
| 186 | * The highStart value is header.shiftedHighStart<<UCPTRIE_SHIFT_2. | 
|---|
| 187 | * (UCPTRIE_SHIFT_2=9) | 
|---|
| 188 | * | 
|---|
| 189 | * Values for code points fast_limit..highStart-1 are found via four-stage lookup. | 
|---|
| 190 | * The data block size is smaller for this range than for the fast range. | 
|---|
| 191 | * This together with more index stages with small blocks makes this range | 
|---|
| 192 | * more easily compactable. | 
|---|
| 193 | * | 
|---|
| 194 | * There is also a trie error value stored at the last position of the data array. | 
|---|
| 195 | * (See UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET.) | 
|---|
| 196 | * It is intended to be returned for inputs that are not Unicode code points | 
|---|
| 197 | * (outside U+0000..U+10FFFF), or in string processing for ill-formed input | 
|---|
| 198 | * (unpaired surrogate in UTF-16, ill-formed UTF-8 subsequence). | 
|---|
| 199 | * | 
|---|
| 200 | * For a "fast" trie: | 
|---|
| 201 | * | 
|---|
| 202 | * The index array starts with the BMP index table for BMP code point lookup. | 
|---|
| 203 | * Its length is 1024=0x400. | 
|---|
| 204 | * | 
|---|
| 205 | * The supplementary index-1 table follows the BMP index table. | 
|---|
| 206 | * Variable length, for code points up to highStart-1. | 
|---|
| 207 | * Maximum length 64=0x40=0x100000>>UCPTRIE_SHIFT_1. | 
|---|
| 208 | * (For 0x100000 supplementary code points U+10000..U+10ffff.) | 
|---|
| 209 | * | 
|---|
| 210 | * After this index-1 table follow the variable-length index-3 and index-2 tables. | 
|---|
| 211 | * | 
|---|
| 212 | * The supplementary index tables are omitted completely | 
|---|
| 213 | * if there is only BMP data (highStart<=U+10000). | 
|---|
| 214 | * | 
|---|
| 215 | * For a "small" trie: | 
|---|
| 216 | * | 
|---|
| 217 | * The index array starts with a fast-index table for lookup of code points U+0000..U+0FFF. | 
|---|
| 218 | * | 
|---|
| 219 | * The "supplementary" index tables are always stored. | 
|---|
| 220 | * The index-1 table starts from U+0000, its maximum length is 68=0x44=0x110000>>UCPTRIE_SHIFT_1. | 
|---|
| 221 | * | 
|---|
| 222 | * For both trie types: | 
|---|
| 223 | * | 
|---|
| 224 | * The last index-2 block may be a partial block, storing indexes only for code points | 
|---|
| 225 | * below highStart. | 
|---|
| 226 | * | 
|---|
| 227 | * Lookup for ASCII code point c: | 
|---|
| 228 | * | 
|---|
| 229 | * Linear access from the start of the data array. | 
|---|
| 230 | * | 
|---|
| 231 | * value = data[c]; | 
|---|
| 232 | * | 
|---|
| 233 | * Lookup for fast-range code point c: | 
|---|
| 234 | * | 
|---|
| 235 | * Shift the code point right by UCPTRIE_FAST_SHIFT=6 bits, | 
|---|
| 236 | * fetch the index array value at that offset, | 
|---|
| 237 | * add the lower code point bits, index into the data array. | 
|---|
| 238 | * | 
|---|
| 239 | * value = data[index[c>>6] + (c&0x3f)]; | 
|---|
| 240 | * | 
|---|
| 241 | * (This works for ASCII as well.) | 
|---|
| 242 | * | 
|---|
| 243 | * Lookup for small-range code point c below highStart: | 
|---|
| 244 | * | 
|---|
| 245 | * Split the code point into four bit fields using several sets of shifts & masks | 
|---|
| 246 | * to read consecutive values from the index-1, index-2, index-3 and data tables. | 
|---|
| 247 | * | 
|---|
| 248 | * If all of the data block offsets in an index-3 block fit within 16 bits (up to 0xffff), | 
|---|
| 249 | * then the data block offsets are stored directly as uint16_t. | 
|---|
| 250 | * | 
|---|
| 251 | * Otherwise (this is very unusual but possible), the index-2 entry for the index-3 block | 
|---|
| 252 | * has bit 15 (0x8000) set, and each set of 8 index-3 entries is preceded by | 
|---|
| 253 | * an additional uint16_t word. Data block offsets are 18 bits wide, with the top 2 bits stored | 
|---|
| 254 | * in the additional word. | 
|---|
| 255 | * | 
|---|
| 256 | * See ucptrie_internalSmallIndex() for details. | 
|---|
| 257 | * | 
|---|
| 258 | * (In a "small" trie, this works for ASCII and below-fast_limit code points as well.) | 
|---|
| 259 | * | 
|---|
| 260 | * Compaction: | 
|---|
| 261 | * | 
|---|
| 262 | * Multiple code point ranges ("blocks") that are aligned on certain boundaries | 
|---|
| 263 | * (determined by the shifting/bit fields of code points) and | 
|---|
| 264 | * map to the same data values normally share a single subsequence of the data array. | 
|---|
| 265 | * Data blocks can also overlap partially. | 
|---|
| 266 | * (Depending on the builder code finding duplicate and overlapping blocks.) | 
|---|
| 267 | * | 
|---|
| 268 | * Iteration over same-value ranges: | 
|---|
| 269 | * | 
|---|
| 270 | * Range iteration (ucptrie_getRange()) walks the structure from a start code point | 
|---|
| 271 | * until some code point is found that maps to a different value; | 
|---|
| 272 | * the end of the returned range is just before that. | 
|---|
| 273 | * | 
|---|
| 274 | * The header.dataNullOffset (split across two header fields, high bits in header.options) | 
|---|
| 275 | * is the offset of a widely shared data block filled with one single value. | 
|---|
| 276 | * It helps quickly skip over large ranges of data with that value. | 
|---|
| 277 | * The builder must ensure that if the start of any data block (fast or small) | 
|---|
| 278 | * matches the dataNullOffset, then the whole block must be filled with the null value. | 
|---|
| 279 | * Special care must be taken if there is no fast null data block | 
|---|
| 280 | * but a small one, which is shorter, and it matches the *start* of some fast data block. | 
|---|
| 281 | * | 
|---|
| 282 | * Similarly, the header.index3NullOffset is the index-array offset of an index-3 block | 
|---|
| 283 | * where all index entries point to the dataNullOffset. | 
|---|
| 284 | * If there is no such data or index-3 block, then these offsets are set to | 
|---|
| 285 | * values that cannot be reached (data offset out of range/reserved index offset), | 
|---|
| 286 | * normally UCPTRIE_NO_DATA_NULL_OFFSET or UCPTRIE_NO_INDEX3_NULL_OFFSET respectively. | 
|---|
| 287 | */ | 
|---|
| 288 |  | 
|---|
| 289 | #endif | 
|---|
| 290 |  | 
|---|