collationdatareader.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/collationdatareader.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2013-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* collationdatareader.cpp
9	*
10	* created on: 2013feb07
11	* created by: Markus W. Scherer
12	*/
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_COLLATION
17
18	#include "unicode/ucol.h"
19	#include "unicode/udata.h"
20	#include "unicode/uscript.h"
21	#include "cmemory.h"
22	#include "collation.h"
23	#include "collationdata.h"
24	#include "collationdatareader.h"
25	#include "collationfastlatin.h"
26	#include "collationkeys.h"
27	#include "collationrootelements.h"
28	#include "collationsettings.h"
29	#include "collationtailoring.h"
30	#include "collunsafe.h"
31	#include "normalizer2impl.h"
32	#include "uassert.h"
33	#include "ucmndata.h"
34	#include "utrie2.h"
35
36	U_NAMESPACE_BEGIN
37
38	namespace {
39
/**
 * Bounds-checked read of one slot of the indexes[] array.
 *
 * @param indexes the int32_t index array
 * @param length  number of slots actually present in indexes[]
 * @param i       slot to read
 * @return indexes[i] if i < length, otherwise -1
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    return (i < length) ? indexes[i] : -1;
}
43
44	} // namespace
45
46	void
47	CollationDataReader::read(const CollationTailoring base, const* uint8_t *inBytes, int32_t inLength,
48	CollationTailoring &tailoring, UErrorCode &errorCode) {
49	if(U_FAILURE(errorCode)) { return; }
50	if(base != NULL) {
51	if(inBytes == NULL \|\| (`0` <= inLength && inLength < `24`)) {
52	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
53	return;
54	}
55	const DataHeader header = reinterpret_cast<const* DataHeader *>(inBytes);
56	if(!(header->dataHeader.magic1 == `0xda` && header->dataHeader.magic2 == `0x27` &&
57	isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
58	errorCode = U_INVALID_FORMAT_ERROR;
59	return;
60	}
61	if(base->getUCAVersion() != tailoring.getUCAVersion()) {
62	errorCode = U_COLLATOR_VERSION_MISMATCH;
63	return;
64	}
65	int32_t headerLength = header->dataHeader.headerSize;
66	inBytes += headerLength;
67	if(inLength >= `0`) {
68	inLength -= headerLength;
69	}
70	}
71
72	if(inBytes == NULL \|\| (`0` <= inLength && inLength < `8`)) {
73	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
74	return;
75	}
76	const int32_t inIndexes = reinterpret_cast<const* int32_t *>(inBytes);
77	int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
78	if(indexesLength < `2` \|\| (`0` <= inLength && inLength < indexesLength * `4`)) {
79	errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
80	return;
81	}
82
83	// Assume that the tailoring data is in initial state,
84	// with NULL pointers and 0 lengths.
85
86	// Set pointers to non-empty data parts.
87	// Do this in order of their byte offsets. (Should help porting to Java.)
88
89	int32_t index; // one of the indexes[] slots
90	int32_t offset; // byte offset for the index part
91	int32_t length; // number of bytes in the index part
92
93	if(indexesLength > IX_TOTAL_SIZE) {
94	length = inIndexes[IX_TOTAL_SIZE];
95	} else if(indexesLength > IX_REORDER_CODES_OFFSET) {
96	length = inIndexes[indexesLength - `1`];
97	} else {
98	length = `0`; // only indexes, and inLength was already checked for them
99	}
100	if(`0` <= inLength && inLength < length) {
101	errorCode = U_INVALID_FORMAT_ERROR;
102	return;
103	}
104
105	const CollationData *baseData = base == NULL ? NULL : base->data;
106	const int32_t *reorderCodes = NULL;
107	int32_t reorderCodesLength = `0`;
108	const uint32_t *reorderRanges = NULL;
109	int32_t reorderRangesLength = `0`;
110	index = IX_REORDER_CODES_OFFSET;
111	offset = getIndex(inIndexes, indexesLength, index);
112	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
113	if(length >= `4`) {
114	if(baseData == NULL) {
115	// We assume for collation settings that
116	// the base data does not have a reordering.
117	errorCode = U_INVALID_FORMAT_ERROR;
118	return;
119	}
120	reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
121	reorderCodesLength = length / `4`;
122
123	// The reorderRanges (if any) are the trailing reorderCodes entries.
124	// Split the array at the boundary.
125	// Script or reorder codes do not exceed 16-bit values.
126	// Range limits are stored in the upper 16 bits, and are never 0.
127	while(reorderRangesLength < reorderCodesLength &&
128	(reorderCodes[reorderCodesLength - reorderRangesLength - `1`] & `0xffff0000`) != `0`) {
129	++reorderRangesLength;
130	}
131	U_ASSERT(reorderRangesLength < reorderCodesLength);
132	if(reorderRangesLength != `0`) {
133	reorderCodesLength -= reorderRangesLength;
134	reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
135	}
136	}
137
138	// There should be a reorder table only if there are reorder codes.
139	// However, when there are reorder codes the reorder table may be omitted to reduce
140	// the data size.
141	const uint8_t *reorderTable = NULL;
142	index = IX_REORDER_TABLE_OFFSET;
143	offset = getIndex(inIndexes, indexesLength, index);
144	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
145	if(length >= `256`) {
146	if(reorderCodesLength == `0`) {
147	errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.
148	return;
149	}
150	reorderTable = inBytes + offset;
151	} else {
152	// If we have reorder codes, then build the reorderTable at the end,
153	// when the CollationData is otherwise complete.
154	}
155
156	if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & `0xff000000`)) {
157	errorCode = U_INVALID_FORMAT_ERROR;
158	return;
159	}
160	CollationData data = NULL; // Remains NULL if there are no mappings.*
161
162	index = IX_TRIE_OFFSET;
163	offset = getIndex(inIndexes, indexesLength, index);
164	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
165	if(length >= `8`) {
166	if(!tailoring.ensureOwnedData(errorCode)) { return; }
167	data = tailoring.ownedData;
168	data->base = baseData;
169	data->numericPrimary = inIndexes[IX_OPTIONS] & `0xff000000`;
170	data->trie = tailoring.trie = utrie2_openFromSerialized(
171	UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
172	&errorCode);
173	if(U_FAILURE(errorCode)) { return; }
174	} else if(baseData != NULL) {
175	// Use the base data. Only the settings are tailored.
176	tailoring.data = baseData;
177	} else {
178	errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
179	return;
180	}
181
182	index = IX_CES_OFFSET;
183	offset = getIndex(inIndexes, indexesLength, index);
184	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
185	if(length >= `8`) {
186	if(data == NULL) {
187	errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.
188	return;
189	}
190	data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
191	data->cesLength = length / `8`;
192	}
193
194	index = IX_CE32S_OFFSET;
195	offset = getIndex(inIndexes, indexesLength, index);
196	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
197	if(length >= `4`) {
198	if(data == NULL) {
199	errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.
200	return;
201	}
202	data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
203	data->ce32sLength = length / `4`;
204	}
205
206	int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
207	if(jamoCE32sStart >= `0`) {
208	if(data == NULL \|\| data->ce32s == NULL) {
209	errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].
210	return;
211	}
212	data->jamoCE32s = data->ce32s + jamoCE32sStart;
213	} else if(data == NULL) {
214	// Nothing to do.
215	} else if(baseData != NULL) {
216	data->jamoCE32s = baseData->jamoCE32s;
217	} else {
218	errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.
219	return;
220	}
221
222	index = IX_ROOT_ELEMENTS_OFFSET;
223	offset = getIndex(inIndexes, indexesLength, index);
224	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
225	if(length >= `4`) {
226	length /= `4`;
227	if(data == NULL \|\| length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
228	errorCode = U_INVALID_FORMAT_ERROR;
229	return;
230	}
231	data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
232	data->rootElementsLength = length;
233	uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
234	if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
235	errorCode = U_INVALID_FORMAT_ERROR;
236	return;
237	}
238	uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
239	if((secTerBoundaries >> `24`) < CollationKeys::SEC_COMMON_HIGH) {
240	// [fixed last secondary common byte] is too low,
241	// and secondary weights would collide with compressed common secondaries.
242	errorCode = U_INVALID_FORMAT_ERROR;
243	return;
244	}
245	}
246
247	index = IX_CONTEXTS_OFFSET;
248	offset = getIndex(inIndexes, indexesLength, index);
249	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
250	if(length >= `2`) {
251	if(data == NULL) {
252	errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.
253	return;
254	}
255	data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
256	data->contextsLength = length / `2`;
257	}
258
259	index = IX_UNSAFE_BWD_OFFSET;
260	offset = getIndex(inIndexes, indexesLength, index);
261	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
262	if(length >= `2`) {
263	if(data == NULL) {
264	errorCode = U_INVALID_FORMAT_ERROR;
265	return;
266	}
267	if(baseData == NULL) {
268	#if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
269	tailoring.unsafeBackwardSet = new UnicodeSet (unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
270	if(tailoring.unsafeBackwardSet == NULL) {
271	errorCode = U_MEMORY_ALLOCATION_ERROR;
272	return;
273	} else if (U_FAILURE(errorCode)) {
274	return;
275	}
276	#else
277	// Create the unsafe-backward set for the root collator.
278	// Include all non-zero combining marks and trail surrogates.
279	// We do this at load time, rather than at build time,
280	// to simplify Unicode version bootstrapping:
281	// The root data builder only needs the new FractionalUCA.txt data,
282	// but it need not be built with a version of ICU already updated to
283	// the corresponding new Unicode Character Database.
284	//
285	// The following is an optimized version of
286	// new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
287	// It is faster and requires fewer code dependencies.
288	tailoring.unsafeBackwardSet = new UnicodeSet(`0xdc00`, `0xdfff`); // trail surrogates
289	if(tailoring.unsafeBackwardSet == NULL) {
290	errorCode = U_MEMORY_ALLOCATION_ERROR;
291	return;
292	}
293	data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
294	#endif // !COLLUNSAFE_SERIALIZE \|\| !COLLUNSAFE_COLL_VERSION
295	} else {
296	// Clone the root collator's set contents.
297	tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
298	baseData->unsafeBackwardSet->cloneAsThawed());
299	if(tailoring.unsafeBackwardSet == NULL) {
300	errorCode = U_MEMORY_ALLOCATION_ERROR;
301	return;
302	}
303	}
304	// Add the ranges from the data file to the unsafe-backward set.
305	USerializedSet sset;
306	const uint16_t unsafeData = reinterpret_cast<const* uint16_t *>(inBytes + offset);
307	if(!uset_getSerializedSet(&sset, unsafeData, length / `2`)) {
308	errorCode = U_INVALID_FORMAT_ERROR;
309	return;
310	}
311	int32_t count = uset_getSerializedRangeCount(&sset);
312	for(int32_t i = `0`; i < count; ++i) {
313	UChar32 start, end;
314	uset_getSerializedRange(&sset, i, &start, &end);
315	tailoring.unsafeBackwardSet->add(start, end);
316	}
317	// Mark each lead surrogate as "unsafe"
318	// if any of its 1024 associated supplementary code points is "unsafe".
319	UChar32 c = `0x10000`;
320	for(UChar lead = `0xd800`; lead < `0xdc00`; ++lead, c += `0x400`) {
321	if(!tailoring.unsafeBackwardSet->containsNone(c, c + `0x3ff`)) {
322	tailoring.unsafeBackwardSet->add(lead);
323	}
324	}
325	tailoring.unsafeBackwardSet->freeze();
326	data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
327	} else if(data == NULL) {
328	// Nothing to do.
329	} else if(baseData != NULL) {
330	// No tailoring-specific data: Alias the root collator's set.
331	data->unsafeBackwardSet = baseData->unsafeBackwardSet;
332	} else {
333	errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
334	return;
335	}
336
337	// If the fast Latin format version is different,
338	// or the version is set to 0 for "no fast Latin table",
339	// then just always use the normal string comparison path.
340	if(data != NULL) {
341	data->fastLatinTable = NULL;
342	data->fastLatinTableLength = `0`;
343	if(((inIndexes[IX_OPTIONS] >> `16`) & `0xff`) == CollationFastLatin::VERSION) {
344	index = IX_FAST_LATIN_TABLE_OFFSET;
345	offset = getIndex(inIndexes, indexesLength, index);
346	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
347	if(length >= `2`) {
348	data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
349	data->fastLatinTableLength = length / `2`;
350	if((*data->fastLatinTable >> `8`) != CollationFastLatin::VERSION) {
351	errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch
352	return;
353	}
354	} else if(baseData != NULL) {
355	data->fastLatinTable = baseData->fastLatinTable;
356	data->fastLatinTableLength = baseData->fastLatinTableLength;
357	}
358	}
359	}
360
361	index = IX_SCRIPTS_OFFSET;
362	offset = getIndex(inIndexes, indexesLength, index);
363	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
364	if(length >= `2`) {
365	if(data == NULL) {
366	errorCode = U_INVALID_FORMAT_ERROR;
367	return;
368	}
369	const uint16_t scripts = reinterpret_cast<const* uint16_t *>(inBytes + offset);
370	int32_t scriptsLength = length / `2`;
371	data->numScripts = scripts[`0`];
372	// There must be enough entries for both arrays, including more than two range starts.
373	data->scriptStartsLength = scriptsLength - (`1` + data->numScripts + `16`);
374	if(data->scriptStartsLength <= `2` \|\|
375	CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
376	errorCode = U_INVALID_FORMAT_ERROR;
377	return;
378	}
379	data->scriptsIndex = scripts + `1`;
380	data->scriptStarts = scripts + `1` + data->numScripts + `16`;
381	if(!(data->scriptStarts[`0`] == `0` &&
382	data->scriptStarts[`1`] == ((Collation::MERGE_SEPARATOR_BYTE + `1`) << `8`) &&
383	data->scriptStarts[data->scriptStartsLength - `1`] ==
384	(Collation::TRAIL_WEIGHT_BYTE << `8`))) {
385	errorCode = U_INVALID_FORMAT_ERROR;
386	return;
387	}
388	} else if(data == NULL) {
389	// Nothing to do.
390	} else if(baseData != NULL) {
391	data->numScripts = baseData->numScripts;
392	data->scriptsIndex = baseData->scriptsIndex;
393	data->scriptStarts = baseData->scriptStarts;
394	data->scriptStartsLength = baseData->scriptStartsLength;
395	}
396
397	index = IX_COMPRESSIBLE_BYTES_OFFSET;
398	offset = getIndex(inIndexes, indexesLength, index);
399	length = getIndex(inIndexes, indexesLength, index + `1`) - offset;
400	if(length >= `256`) {
401	if(data == NULL) {
402	errorCode = U_INVALID_FORMAT_ERROR;
403	return;
404	}
405	data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
406	} else if(data == NULL) {
407	// Nothing to do.
408	} else if(baseData != NULL) {
409	data->compressibleBytes = baseData->compressibleBytes;
410	} else {
411	errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
412	return;
413	}
414
415	const CollationSettings &ts = *tailoring.settings;
416	int32_t options = inIndexes[IX_OPTIONS] & `0xffff`;
417	uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
418	int32_t fastLatinOptions = CollationFastLatin::getOptions(
419	tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
420	if(options == ts.options && ts.variableTop != `0` &&
421	reorderCodesLength == ts.reorderCodesLength &&
422	(reorderCodesLength == `0` \|\|
423	uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * `4`) == `0`) &&
424	fastLatinOptions == ts.fastLatinOptions &&
425	(fastLatinOptions < `0` \|\|
426	uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
427	sizeof(fastLatinPrimaries)) == `0`)) {
428	return;
429	}
430
431	CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
432	if(settings == NULL) {
433	errorCode = U_MEMORY_ALLOCATION_ERROR;
434	return;
435	}
436	settings->options = options;
437	// Set variableTop from options and scripts data.
438	settings->variableTop = tailoring.data->getLastPrimaryForGroup(
439	UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
440	if(settings->variableTop == `0`) {
441	errorCode = U_INVALID_FORMAT_ERROR;
442	return;
443	}
444
445	if(reorderCodesLength != `0`) {
446	settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
447	reorderRanges, reorderRangesLength,
448	reorderTable, errorCode);
449	}
450
451	settings->fastLatinOptions = CollationFastLatin::getOptions(
452	tailoring.data, *settings,
453	settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
454	}
455
456	UBool U_CALLCONV
457	CollationDataReader::isAcceptable(void *context,
458	const char * / type /, const char * /name/,
459	const UDataInfo *pInfo) {
460	if(
461	pInfo->size >= `20` &&
462	pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
463	pInfo->charsetFamily == U_CHARSET_FAMILY &&
464	pInfo->dataFormat[`0`] == `0x55` && // dataFormat="UCol"
465	pInfo->dataFormat[`1`] == `0x43` &&
466	pInfo->dataFormat[`2`] == `0x6f` &&
467	pInfo->dataFormat[`3`] == `0x6c` &&
468	pInfo->formatVersion[`0`] == `5`
469	) {
470	UVersionInfo version = static_cast<UVersionInfo >(context);
471	if(version != NULL) {
472	uprv_memcpy(version, pInfo->dataVersion, `4`);
473	}
474	return TRUE;
475	} else {
476	return FALSE;
477	}
478	}
479
480	U_NAMESPACE_END
481
482	#endif // !UCONFIG_NO_COLLATION
483

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/collationdatareader.cpp