| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * Copyright (C) 2013-2015, International Business Machines |
| 6 | * Corporation and others. All Rights Reserved. |
| 7 | ******************************************************************************* |
| 8 | * collationdatareader.h |
| 9 | * |
| 10 | * created on: 2013feb07 |
| 11 | * created by: Markus W. Scherer |
| 12 | */ |
| 13 | |
| 14 | #ifndef __COLLATIONDATAREADER_H__ |
| 15 | #define __COLLATIONDATAREADER_H__ |
| 16 | |
| 17 | #include "unicode/utypes.h" |
| 18 | |
| 19 | #if !UCONFIG_NO_COLLATION |
| 20 | |
| 21 | #include "unicode/udata.h" |
| 22 | |
| 23 | struct UDataMemory; |
| 24 | |
| 25 | U_NAMESPACE_BEGIN |
| 26 | |
| 27 | struct CollationTailoring; |
| 28 | |
| 29 | /** |
| 30 | * Collation binary data reader. |
| 31 | */ |
| 32 | struct U_I18N_API CollationDataReader /* all static */ { |
| 33 | // The following constants are also copied into source/common/ucol_swp.cpp. |
| 34 | // Keep them in sync! |
| 35 | enum { |
| 36 | /** |
| 37 | * Number of int32_t indexes. |
| 38 | * |
| 39 | * Can be 2 if there are only options. |
| 40 | * Can be 7 or 8 if there are only options and a script reordering. |
| 41 | * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. |
| 42 | */ |
| 43 | IX_INDEXES_LENGTH, // 0 |
| 44 | /** |
| 45 | * Bits 31..24: numericPrimary, for numeric collation |
| 46 | * 23..16: fast Latin format version (0 = no fast Latin table) |
| 47 | * 15.. 0: options bit set |
| 48 | */ |
| 49 | IX_OPTIONS, |
| 50 | IX_RESERVED2, |
| 51 | IX_RESERVED3, |
| 52 | |
| 53 | /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ |
| 54 | IX_JAMO_CE32S_START, // 4 |
| 55 | |
| 56 | // Byte offsets from the start of the data, after the generic header. |
| 57 | // The indexes[] are at byte offset 0, other data follows. |
| 58 | // Each data item is aligned properly. |
| 59 | // The data items should be in descending order of unit size, |
| 60 | // to minimize the need for padding. |
| 61 | // Each item's byte length is given by the difference between its offset and |
| 62 | // the next index/offset value. |
| 63 | /** Byte offset to int32_t reorderCodes[]. */ |
| 64 | IX_REORDER_CODES_OFFSET, |
| 65 | /** |
| 66 | * Byte offset to uint8_t reorderTable[]. |
| 67 | * Empty table if <256 bytes (padding only). |
| 68 | * Otherwise 256 bytes or more (with padding). |
| 69 | */ |
| 70 | IX_REORDER_TABLE_OFFSET, |
| 71 | /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ |
| 72 | IX_TRIE_OFFSET, |
| 73 | |
| 74 | IX_RESERVED8_OFFSET, // 8 |
| 75 | /** Byte offset to int64_t ces[]. */ |
| 76 | IX_CES_OFFSET, |
| 77 | IX_RESERVED10_OFFSET, |
| 78 | /** Byte offset to uint32_t ce32s[]. */ |
| 79 | IX_CE32S_OFFSET, |
| 80 | |
| 81 | /** Byte offset to uint32_t rootElements[]. */ |
| 82 | IX_ROOT_ELEMENTS_OFFSET, // 12 |
| 83 | /** Byte offset to UChar *contexts[]. */ |
| 84 | IX_CONTEXTS_OFFSET, |
| 85 | /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ |
| 86 | IX_UNSAFE_BWD_OFFSET, |
| 87 | /** Byte offset to uint16_t fastLatinTable[]. */ |
| 88 | IX_FAST_LATIN_TABLE_OFFSET, |
| 89 | |
| 90 | /** Byte offset to uint16_t scripts[]. */ |
| 91 | IX_SCRIPTS_OFFSET, // 16 |
| 92 | /** |
| 93 | * Byte offset to UBool compressibleBytes[]. |
| 94 | * Empty table if <256 bytes (padding only). |
| 95 | * Otherwise 256 bytes or more (with padding). |
| 96 | */ |
| 97 | IX_COMPRESSIBLE_BYTES_OFFSET, |
| 98 | IX_RESERVED18_OFFSET, |
| 99 | IX_TOTAL_SIZE |
| 100 | }; |
| 101 | |
| 102 | static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, |
| 103 | CollationTailoring &tailoring, UErrorCode &errorCode); |
| 104 | |
| 105 | static UBool U_CALLCONV |
| 106 | isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); |
| 107 | |
| 108 | private: |
| 109 | CollationDataReader(); // no constructor |
| 110 | }; |
| 111 | |
| 112 | /* |
| 113 | * Format of collation data (ucadata.icu, binary data in coll/ *.res files). |
| 114 | * Format version 5. |
| 115 | * |
| 116 | * The root collation data is stored in the ucadata.icu file. |
| 117 | * Tailorings are stored inside .res resource bundle files, with a complete file header. |
| 118 | * |
| 119 | * Collation data begins with a standard ICU data file header |
| 120 | * (DataHeader, see ucmndata.h and unicode/udata.h). |
| 121 | * The UDataInfo.dataVersion field contains the UCA and other version numbers, |
| 122 | * see the comments for CollationTailoring.version. |
| 123 | * |
| 124 | * After the header, the file contains the following parts. |
| 125 | * Constants are defined as enum values of the CollationDataReader class. |
| 126 | * See also the Collation class. |
| 127 | * |
| 128 | * int32_t indexes[indexesLength]; |
| 129 | * The indexes array has variable length. |
| 130 | * Some tailorings only need the length and the options, |
| 131 | * others only add reorderCodes and the reorderTable, |
| 132 | * some need to store mappings. |
| 133 | * Only as many indexes are stored as needed to read all of the data. |
| 134 | * |
| 135 | * Index 0: indexesLength |
| 136 | * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS |
| 137 | * Index 2..3: Unused/reserved/0. |
| 138 | * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo |
| 139 | * are stored in a short, contiguous part of the ce32s array. |
| 140 | * |
| 141 | * Indexes 5..19 are byte offsets in ascending order. |
| 142 | * Each byte offset marks the start of the next part in the data file, |
| 143 | * and the end of the previous one. |
| 144 | * When two consecutive byte offsets are the same (or too short), |
| 145 | * then the corresponding part is empty. |
| 146 | * Byte offsets are offsets from after the header, |
| 147 | * that is, from the beginning of the indexes[]. |
| 148 | * Each part starts at an offset with proper alignment for its data. |
| 149 | * If necessary, the previous part may include padding bytes to achieve this alignment. |
| 150 | * The last byte offset that is stored in the indexes indicates the total size of the data |
| 151 | * (starting with the indexes). |
| 152 | * |
| 153 | * int32_t reorderCodes[]; -- empty in root |
| 154 | * The list of script and reordering codes. |
| 155 | * |
| 156 | * Beginning with format version 5, this array may optionally |
| 157 | * have trailing entries with a full list of reorder ranges |
| 158 | * as described for CollationSettings::reorderRanges. |
| 159 | * |
| 160 | * Script or reorder codes are first and do not exceed 16-bit values. |
| 161 | * Range limits are stored in the upper 16 bits, and are never 0. |
| 162 | * Split this array into reorder codes and ranges at the first entry |
| 163 | * with non-zero upper 16 bits. |
| 164 | * |
| 165 | * If the ranges are missing but needed for split-reordered primary lead bytes, |
| 166 | * then they are regenerated at load time. |
| 167 | * |
| 168 | * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes |
| 169 | * Primary-weight lead byte permutation table. |
| 170 | * Normally present when the reorderCodes are, but can be built at load time. |
| 171 | * |
| 172 | * Beginning with format version 5, a 0 entry at a non-zero index |
| 173 | * (which is otherwise an illegal value) |
| 174 | * means that the primary lead byte is "split" |
| 175 | * (there are different offsets for primaries that share that lead byte) |
| 176 | * and the reordering offset must be determined via the reorder ranges |
| 177 | * that are either stored as part of the reorderCodes array |
| 178 | * or regenerated at load time. |
| 179 | * |
| 180 | * UTrie2 trie; -- see utrie2_impl.h and utrie2.h |
| 181 | * The trie holds the main collation data. Each code point is mapped to a 32-bit value. |
| 182 | * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, |
| 183 | * in which case it is a special CE32 and contains a 4-bit tag and further data. |
| 184 | * See the Collation class for details. |
| 185 | * |
| 186 | * The trie has a value for each lead surrogate code unit with some bits encoding |
| 187 | * collective properties of the 1024 supplementary characters whose UTF-16 form starts with |
| 188 | * the lead surrogate. See Collation::LEAD_SURROGATE_TAG. |
| 189 | * |
| 190 | * int64_t ces[]; |
| 191 | * 64-bit CEs and expansions that cannot be stored in a more compact form. |
| 192 | * |
| 193 | * uint32_t ce32s[]; |
| 194 | * CE32s for expansions in compact form, and for characters whose trie values |
| 195 | * contain special data. |
| 196 | * |
| 197 | * uint32_t rootElements[]; -- empty in all tailorings |
| 198 | * Compact storage for all of the CEs that occur in the root collation. |
| 199 | * See the CollationRootElements class. |
| 200 | * |
| 201 | * UChar *contexts[]; |
| 202 | * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. |
| 203 | * |
| 204 | * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() |
| 205 | * Serialized form of characters that are unsafe when iterating backwards, |
| 206 | * and at the end of an identical string prefix. |
| 207 | * Back up to a safe character. |
| 208 | * Lead surrogates are "unsafe" when any of their corresponding supplementary |
| 209 | * code points are unsafe. |
| 210 | * Does not include [:^lccc=0:][:^tccc=0:]. |
| 211 | * For each tailoring, the root unsafeBackwardSet is subtracted. |
| 212 | * (As a result, in many tailorings no set needs to be stored.) |
| 213 | * |
| 214 | * uint16_t fastLatinTable[]; |
| 215 | * Optional optimization for Latin text. |
| 216 | * See the CollationFastLatin class. |
| 217 | * |
| 218 | * uint16_t scripts[]; -- empty in all tailorings |
| 219 | * Format version 5: |
| 220 | * uint16_t numScripts; |
| 221 | * uint16_t scriptsIndex[numScripts+16]; |
| 222 | * uint16_t scriptStarts[]; |
| 223 | * See CollationData::numScripts etc. |
| 224 | * |
| 225 | * Format version 4: |
| 226 | * Table of the reordering groups with their first and last lead bytes, |
| 227 | * and their script and reordering codes. |
| 228 | * See CollationData::scripts. |
| 229 | * |
| 230 | * UBool compressibleBytes[]; -- empty in all tailorings |
| 231 | * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. |
| 232 | * |
| 233 | * ----------------- |
| 234 | * Changes for formatVersion 5 (ICU 55) |
| 235 | * |
| 236 | * Reordering moves single scripts, not groups of scripts. |
| 237 | * Reorder ranges are optionally appended to the reorderCodes, |
| 238 | * and a 0 entry in the reorderTable indicates a split lead byte. |
| 239 | * The scripts data has a new format. |
| 240 | * |
| 241 | * The rootElements may contain secondary and tertiary weights below common=05. |
| 242 | * (Used for small Hiragana letters.) |
| 243 | * Where it occurs, there is also an explicit unit with common secondary & tertiary weights. |
| 244 | * There are no other data structure changes, but builder code needs to be able to handle such data. |
| 245 | * |
| 246 | * The collation element for the merge separator code point U+FFFE |
| 247 | * does not necessarily have special, unique secondary/tertiary weights any more. |
| 248 | */ |
| 249 | |
| 250 | U_NAMESPACE_END |
| 251 | |
| 252 | #endif // !UCONFIG_NO_COLLATION |
| 253 | #endif // __COLLATIONDATAREADER_H__ |
| 254 | |