1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ****************************************************************************** |
5 | * |
6 | * Copyright (C) 2001-2008, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ****************************************************************************** |
10 | * file name: utrie2_impl.h |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2008sep26 (split off from utrie2.c) |
16 | * created by: Markus W. Scherer |
17 | * |
18 | * Definitions needed for both runtime and builder code for UTrie2, |
19 | * used by utrie2.c and utrie2_builder.c. |
20 | */ |
21 | |
22 | #ifndef __UTRIE2_IMPL_H__ |
23 | #define __UTRIE2_IMPL_H__ |
24 | |
25 | #ifdef UCPTRIE_DEBUG |
26 | #include "unicode/umutablecptrie.h" |
27 | #endif |
28 | #include "utrie2.h" |
29 | |
30 | /* Public UTrie2 API implementation ----------------------------------------- */ |
31 | |
32 | /* |
33 | * These definitions are mostly needed by utrie2.cpp, |
34 | * but also by utrie2_serialize() and utrie2_swap(). |
35 | */ |
36 | |
37 | // UTrie2 signature values, in platform endianness and opposite endianness. |
38 | // The UTrie2 signature ASCII byte values spell "Tri2". |
39 | #define UTRIE2_SIG 0x54726932 |
40 | #define UTRIE2_OE_SIG 0x32697254 |
41 | |
42 | /** |
43 | * Trie data structure in serialized form: |
44 | * |
45 | * UTrie2Header header; |
46 | * uint16_t index[header.index2Length]; |
47 | * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] |
48 | * @internal |
49 | */ |
50 | typedef struct { |
51 | /** "Tri2" in big-endian US-ASCII (0x54726932) */ |
52 | uint32_t ; |
53 | |
54 | /** |
55 | * options bit field: |
56 | * 15.. 4 reserved (0) |
57 | * 3.. 0 UTrie2ValueBits valueBits |
58 | */ |
59 | uint16_t ; |
60 | |
61 | /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ |
62 | uint16_t ; |
63 | |
64 | /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ |
65 | uint16_t ; |
66 | |
67 | /** Null index and data blocks, not shifted. */ |
68 | uint16_t , ; |
69 | |
70 | /** |
71 | * First code point of the single-value range ending with U+10ffff, |
72 | * rounded up and then shifted right by UTRIE2_SHIFT_1. |
73 | */ |
74 | uint16_t ; |
75 | } ; |
76 | |
77 | /** |
78 | * Constants for use with UTrie2Header.options. |
79 | * @internal |
80 | */ |
81 | enum { |
82 | /** Mask to get the UTrie2ValueBits valueBits from options. */ |
83 | UTRIE2_OPTIONS_VALUE_BITS_MASK=0xf |
84 | }; |
85 | |
86 | /* Building a trie ---------------------------------------------------------- */ |
87 | |
88 | /* |
89 | * These definitions are mostly needed by utrie2_builder.c, but also by |
90 | * utrie2_get32() and utrie2_enum(). |
91 | */ |
92 | |
93 | enum { |
94 | /** |
95 | * At build time, leave a gap in the index-2 table, |
96 | * at least as long as the maximum lengths of the 2-byte UTF-8 index-2 table |
97 | * and the supplementary index-1 table. |
98 | * Round up to UTRIE2_INDEX_2_BLOCK_LENGTH for proper compacting. |
99 | */ |
100 | UNEWTRIE2_INDEX_GAP_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH, |
101 | UNEWTRIE2_INDEX_GAP_LENGTH= |
102 | ((UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH)+UTRIE2_INDEX_2_MASK)& |
103 | ~UTRIE2_INDEX_2_MASK, |
104 | |
105 | /** |
106 | * Maximum length of the build-time index-2 array. |
107 | * Maximum number of Unicode code points (0x110000) shifted right by UTRIE2_SHIFT_2, |
108 | * plus the part of the index-2 table for lead surrogate code points, |
109 | * plus the build-time index gap, |
110 | * plus the null index-2 block. |
111 | */ |
112 | UNEWTRIE2_MAX_INDEX_2_LENGTH= |
113 | (0x110000>>UTRIE2_SHIFT_2)+ |
114 | UTRIE2_LSCP_INDEX_2_LENGTH+ |
115 | UNEWTRIE2_INDEX_GAP_LENGTH+ |
116 | UTRIE2_INDEX_2_BLOCK_LENGTH, |
117 | |
118 | UNEWTRIE2_INDEX_1_LENGTH=0x110000>>UTRIE2_SHIFT_1 |
119 | }; |
120 | |
121 | /** |
122 | * Maximum length of the build-time data array. |
123 | * One entry per 0x110000 code points, plus the illegal-UTF-8 block and the null block, |
124 | * plus values for the 0x400 surrogate code units. |
125 | */ |
126 | #define UNEWTRIE2_MAX_DATA_LENGTH (0x110000+0x40+0x40+0x400) |
127 | |
128 | /* |
129 | * Build-time trie structure. |
130 | * |
131 | * Just using a boolean flag for "repeat use" could lead to data array overflow |
132 | * because we would not be able to detect when a data block becomes unused. |
133 | * It also leads to orphan data blocks that are kept through serialization. |
134 | * |
135 | * Need to use reference counting for data blocks, |
136 | * and allocDataBlock() needs to look for a free block before increasing dataLength. |
137 | * |
138 | * This scheme seems like overkill for index-2 blocks since the whole index array is |
139 | * preallocated anyway (unlike the growable data array). |
140 | * Just allocating multiple index-2 blocks as needed. |
141 | */ |
142 | struct UNewTrie2 { |
143 | int32_t index1[UNEWTRIE2_INDEX_1_LENGTH]; |
144 | int32_t index2[UNEWTRIE2_MAX_INDEX_2_LENGTH]; |
145 | uint32_t *data; |
146 | #ifdef UCPTRIE_DEBUG |
147 | UMutableCPTrie *t3; |
148 | #endif |
149 | |
150 | uint32_t initialValue, errorValue; |
151 | int32_t index2Length, dataCapacity, dataLength; |
152 | int32_t firstFreeBlock; |
153 | int32_t index2NullOffset, dataNullOffset; |
154 | UChar32 highStart; |
155 | UBool isCompacted; |
156 | |
157 | /** |
158 | * Multi-purpose per-data-block table. |
159 | * |
160 | * Before compacting: |
161 | * |
162 | * Per-data-block reference counters/free-block list. |
163 | * 0: unused |
164 | * >0: reference counter (number of index-2 entries pointing here) |
165 | * <0: next free data block in free-block list |
166 | * |
167 | * While compacting: |
168 | * |
169 | * Map of adjusted indexes, used in compactData() and compactIndex2(). |
170 | * Maps from original indexes to new ones. |
171 | */ |
172 | int32_t map[UNEWTRIE2_MAX_DATA_LENGTH>>UTRIE2_SHIFT_2]; |
173 | }; |
174 | |
175 | #endif |
176 | |