collationdatareader.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/collationdatareader.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2013-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* collationdatareader.h
9	*
10	* created on: 2013feb07
11	* created by: Markus W. Scherer
12	*/
13
14	#ifndef __COLLATIONDATAREADER_H__
15	#define __COLLATIONDATAREADER_H__
16
17	#include "unicode/utypes.h"
18
19	#if !UCONFIG_NO_COLLATION
20
21	#include "unicode/udata.h"
22
23	struct UDataMemory;
24
25	U_NAMESPACE_BEGIN
26
27	struct CollationTailoring;
28
29	/**
30	* Collation binary data reader.
31	*/
32	struct U_I18N_API CollationDataReader / all static / {
33	// The following constants are also copied into source/common/ucol_swp.cpp.
34	// Keep them in sync!
35	enum {
36	/**
37	* Number of int32_t indexes.
38	*
39	* Can be 2 if there are only options.
40	* Can be 7 or 8 if there are only options and a script reordering.
41	* The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
42	*/
43	IX_INDEXES_LENGTH, // 0
44	/**
45	* Bits 31..24: numericPrimary, for numeric collation
46	* 23..16: fast Latin format version (0 = no fast Latin table)
47	* 15.. 0: options bit set
48	*/
49	IX_OPTIONS,
50	IX_RESERVED2,
51	IX_RESERVED3,
52
53	/* Array offset to Jamo CE32s in ce32s[], or <0 if none. /
54	IX_JAMO_CE32S_START, // 4
55
56	// Byte offsets from the start of the data, after the generic header.
57	// The indexes[] are at byte offset 0, other data follows.
58	// Each data item is aligned properly.
59	// The data items should be in descending order of unit size,
60	// to minimize the need for padding.
61	// Each item's byte length is given by the difference between its offset and
62	// the next index/offset value.
63	/* Byte offset to int32_t reorderCodes[]. /
64	IX_REORDER_CODES_OFFSET,
65	/**
66	* Byte offset to uint8_t reorderTable[].
67	* Empty table if <256 bytes (padding only).
68	* Otherwise 256 bytes or more (with padding).
69	*/
70	IX_REORDER_TABLE_OFFSET,
71	/* Byte offset to the collation trie. Its length is a multiple of 8 bytes. /
72	IX_TRIE_OFFSET,
73
74	IX_RESERVED8_OFFSET, // 8
75	/* Byte offset to int64_t ces[]. /
76	IX_CES_OFFSET,
77	IX_RESERVED10_OFFSET,
78	/* Byte offset to uint32_t ce32s[]. /
79	IX_CE32S_OFFSET,
80
81	/* Byte offset to uint32_t rootElements[]. /
82	IX_ROOT_ELEMENTS_OFFSET, // 12
83	/* Byte offset to UChar contexts[]. /*
84	IX_CONTEXTS_OFFSET,
85	/* Byte offset to uint16_t [] with serialized unsafeBackwardSet. /
86	IX_UNSAFE_BWD_OFFSET,
87	/* Byte offset to uint16_t fastLatinTable[]. /
88	IX_FAST_LATIN_TABLE_OFFSET,
89
90	/* Byte offset to uint16_t scripts[]. /
91	IX_SCRIPTS_OFFSET, // 16
92	/**
93	* Byte offset to UBool compressibleBytes[].
94	* Empty table if <256 bytes (padding only).
95	* Otherwise 256 bytes or more (with padding).
96	*/
97	IX_COMPRESSIBLE_BYTES_OFFSET,
98	IX_RESERVED18_OFFSET,
99	IX_TOTAL_SIZE
100	};
101
102	static void read(const CollationTailoring base, const* uint8_t *inBytes, int32_t inLength,
103	CollationTailoring &tailoring, UErrorCode &errorCode);
104
105	static UBool U_CALLCONV
106	isAcceptable(void context, const* char type, const* char name, const* UDataInfo *pInfo);
107
108	private:
109	CollationDataReader(); // no constructor
110	};
111
112	/*
113	* Format of collation data (ucadata.icu, binary data in coll/ *.res files).
114	* Format version 5.
115	*
116	* The root collation data is stored in the ucadata.icu file.
117	* Tailorings are stored inside .res resource bundle files, with a complete file header.
118	*
119	* Collation data begins with a standard ICU data file header
120	* (DataHeader, see ucmndata.h and unicode/udata.h).
121	* The UDataInfo.dataVersion field contains the UCA and other version numbers,
122	* see the comments for CollationTailoring.version.
123	*
124	* After the header, the file contains the following parts.
125	* Constants are defined as enum values of the CollationDataReader class.
126	* See also the Collation class.
127	*
128	* int32_t indexes[indexesLength];
129	* The indexes array has variable length.
130	* Some tailorings only need the length and the options,
131	* others only add reorderCodes and the reorderTable,
132	* some need to store mappings.
133	* Only as many indexes are stored as needed to read all of the data.
134	*
135	* Index 0: indexesLength
136	* Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
137	* Index 2..3: Unused/reserved/0.
138	* Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
139	* are stored in a short, contiguous part of the ce32s array.
140	*
141	* Indexes 5..19 are byte offsets in ascending order.
142	* Each byte offset marks the start of the next part in the data file,
143	* and the end of the previous one.
144	* When two consecutive byte offsets are the same (or too short),
145	* then the corresponding part is empty.
146	* Byte offsets are offsets from after the header,
147	* that is, from the beginning of the indexes[].
148	* Each part starts at an offset with proper alignment for its data.
149	* If necessary, the previous part may include padding bytes to achieve this alignment.
150	* The last byte offset that is stored in the indexes indicates the total size of the data
151	* (starting with the indexes).
152	*
153	* int32_t reorderCodes[]; -- empty in root
154	* The list of script and reordering codes.
155	*
156	* Beginning with format version 5, this array may optionally
157	* have trailing entries with a full list of reorder ranges
158	* as described for CollationSettings::reorderRanges.
159	*
160	* Script or reorder codes are first and do not exceed 16-bit values.
161	* Range limits are stored in the upper 16 bits, and are never 0.
162	* Split this array into reorder codes and ranges at the first entry
163	* with non-zero upper 16 bits.
164	*
165	* If the ranges are missing but needed for split-reordered primary lead bytes,
166	* then they are regenerated at load time.
167	*
168	* uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
169	* Primary-weight lead byte permutation table.
170	* Normally present when the reorderCodes are, but can be built at load time.
171	*
172	* Beginning with format version 5, a 0 entry at a non-zero index
173	* (which is otherwise an illegal value)
174	* means that the primary lead byte is "split"
175	* (there are different offsets for primaries that share that lead byte)
176	* and the reordering offset must be determined via the reorder ranges
177	* that are either stored as part of the reorderCodes array
178	* or regenerated at load time.
179	*
180	* UTrie2 trie; -- see utrie2_impl.h and utrie2.h
181	* The trie holds the main collation data. Each code point is mapped to a 32-bit value.
182	* It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
183	* in which case it is a special CE32 and contains a 4-bit tag and further data.
184	* See the Collation class for details.
185	*
186	* The trie has a value for each lead surrogate code unit with some bits encoding
187	* collective properties of the 1024 supplementary characters whose UTF-16 form starts with
188	* the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
189	*
190	* int64_t ces[];
191	* 64-bit CEs and expansions that cannot be stored in a more compact form.
192	*
193	* uint32_t ce32s[];
194	* CE32s for expansions in compact form, and for characters whose trie values
195	* contain special data.
196	*
197	* uint32_t rootElements[]; -- empty in all tailorings
198	* Compact storage for all of the CEs that occur in the root collation.
199	* See the CollationRootElements class.
200	*
201	* UChar *contexts[];
202	* Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
203	*
204	* uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
205	* Serialized form of characters that are unsafe when iterating backwards,
206	* and at the end of an identical string prefix.
207	* Back up to a safe character.
208	* Lead surrogates are "unsafe" when any of their corresponding supplementary
209	* code points are unsafe.
210	* Does not include [:^lccc=0:][:^tccc=0:].
211	* For each tailoring, the root unsafeBackwardSet is subtracted.
212	* (As a result, in many tailorings no set needs to be stored.)
213	*
214	* uint16_t fastLatinTable[];
215	* Optional optimization for Latin text.
216	* See the CollationFastLatin class.
217	*
218	* uint16_t scripts[]; -- empty in all tailorings
219	* Format version 5:
220	* uint16_t numScripts;
221	* uint16_t scriptsIndex[numScripts+16];
222	* uint16_t scriptStarts[];
223	* See CollationData::numScripts etc.
224	*
225	* Format version 4:
226	* Table of the reordering groups with their first and last lead bytes,
227	* and their script and reordering codes.
228	* See CollationData::scripts.
229	*
230	* UBool compressibleBytes[]; -- empty in all tailorings
231	* Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
232	*
233	* -----------------
234	* Changes for formatVersion 5 (ICU 55)
235	*
236	* Reordering moves single scripts, not groups of scripts.
237	* Reorder ranges are optionally appended to the reorderCodes,
238	* and a 0 entry in the reorderTable indicates a split lead byte.
239	* The scripts data has a new format.
240	*
241	* The rootElements may contain secondary and tertiary weights below common=05.
242	* (Used for small Hiragana letters.)
243	* Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
244	* There are no other data structure changes, but builder code needs to be able to handle such data.
245	*
246	* The collation element for the merge separator code point U+FFFE
247	* does not necessarily have special, unique secondary/tertiary weights any more.
248	*/
249
250	U_NAMESPACE_END
251
252	#endif // !UCONFIG_NO_COLLATION
253	#endif // __COLLATIONDATAREADER_H__
254

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/collationdatareader.h