collationdatawriter.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/collationdatawriter.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2013-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* collationdatawriter.cpp
9	*
10	* created on: 2013aug06
11	* created by: Markus W. Scherer
12	*/
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_COLLATION
17
18	#include "unicode/tblcoll.h"
19	#include "unicode/udata.h"
20	#include "unicode/uniset.h"
21	#include "cmemory.h"
22	#include "collationdata.h"
23	#include "collationdatabuilder.h"
24	#include "collationdatareader.h"
25	#include "collationdatawriter.h"
26	#include "collationfastlatin.h"
27	#include "collationsettings.h"
28	#include "collationtailoring.h"
29	#include "uassert.h"
30	#include "ucmndata.h"
31
32	U_NAMESPACE_BEGIN
33
34	uint8_t *
35	RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
36	if(U_FAILURE(errorCode)) { return NULL; }
37	LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(`20000`));
38	if(buffer.isNull()) {
39	errorCode = U_MEMORY_ALLOCATION_ERROR;
40	return NULL;
41	}
42	length = cloneBinary(buffer.getAlias(), `20000`, errorCode);
43	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
44	if(buffer.allocateInsteadAndCopy(length, `0`) == NULL) {
45	errorCode = U_MEMORY_ALLOCATION_ERROR;
46	return NULL;
47	}
48	errorCode = U_ZERO_ERROR;
49	length = cloneBinary(buffer.getAlias(), length, errorCode);
50	}
51	if(U_FAILURE(errorCode)) { return NULL; }
52	return buffer.orphan();
53	}
54
55	int32_t
56	RuleBasedCollator::cloneBinary(uint8_t dest, int32_t capacity, UErrorCode &errorCode) const* {
57	int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + `1`];
58	return CollationDataWriter::writeTailoring(
59	tailoring, settings, indexes, dest, capacity,
60	errorCode);
61	}
62
63	static const UDataInfo dataInfo = {
64	sizeof(UDataInfo),
65	`0`,
66
67	U_IS_BIG_ENDIAN,
68	U_CHARSET_FAMILY,
69	U_SIZEOF_UCHAR,
70	`0`,
71
72	{ `0x55`, `0x43`, `0x6f`, `0x6c` }, // dataFormat="UCol"
73	{ `5`, `0`, `0`, `0` }, // formatVersion
74	{ `6`, `3`, `0`, `0` } // dataVersion
75	};
76
77	int32_t
78	CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
79	const void *rootElements, int32_t rootElementsLength,
80	int32_t indexes[], uint8_t *dest, int32_t capacity,
81	UErrorCode &errorCode) {
82	return write(TRUE, NULL,
83	data, settings,
84	rootElements, rootElementsLength,
85	indexes, dest, capacity, errorCode);
86	}
87
88	int32_t
89	CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
90	int32_t indexes[], uint8_t *dest, int32_t capacity,
91	UErrorCode &errorCode) {
92	return write(FALSE, t.version,
93	*t.data, settings,
94	NULL, `0`,
95	indexes, dest, capacity, errorCode);
96	}
97
98	int32_t
99	CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
100	const CollationData &data, const CollationSettings &settings,
101	const void *rootElements, int32_t rootElementsLength,
102	int32_t indexes[], uint8_t *dest, int32_t capacity,
103	UErrorCode &errorCode) {
104	if(U_FAILURE(errorCode)) { return `0`; }
105	if(capacity < `0` \|\| (capacity > `0` && dest == NULL)) {
106	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
107	return `0`;
108	}
109
110	// Figure out which data items to write before settling on
111	// the indexes length and writing offsets.
112	// For any data item, we need to write the start and limit offsets,
113	// so the indexes length must be at least index-of-start-offset + 2.
114	int32_t indexesLength;
115	UBool hasMappings;
116	UnicodeSet unsafeBackwardSet;
117	const CollationData *baseData = data.base;
118
119	int32_t fastLatinVersion;
120	if(data.fastLatinTable != NULL) {
121	fastLatinVersion = (int32_t)CollationFastLatin::VERSION << `16`;
122	} else {
123	fastLatinVersion = `0`;
124	}
125	int32_t fastLatinTableLength = `0`;
126
127	if(isBase) {
128	// For the root collator, we write an even number of indexes
129	// so that we start with an 8-aligned offset.
130	indexesLength = CollationDataReader::IX_TOTAL_SIZE + `1`;
131	U_ASSERT(settings.reorderCodesLength == `0`);
132	hasMappings = TRUE;
133	unsafeBackwardSet = *data.unsafeBackwardSet;
134	fastLatinTableLength = data.fastLatinTableLength;
135	} else if(baseData == NULL) {
136	hasMappings = FALSE;
137	if(settings.reorderCodesLength == `0`) {
138	// only options
139	indexesLength = CollationDataReader::IX_OPTIONS + `1`; // no limit offset here
140	} else {
141	// only options, reorder codes, and the reorder table
142	indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + `2`;
143	}
144	} else {
145	hasMappings = TRUE;
146	// Tailored mappings, and what else?
147	// Check in ascending order of optional tailoring data items.
148	indexesLength = CollationDataReader::IX_CE32S_OFFSET + `2`;
149	if(data.contextsLength != `0`) {
150	indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + `2`;
151	}
152	unsafeBackwardSet.addAll(data.unsafeBackwardSet).removeAll(baseData->unsafeBackwardSet);
153	if(!unsafeBackwardSet.isEmpty()) {
154	indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + `2`;
155	}
156	if(data.fastLatinTable != baseData->fastLatinTable) {
157	fastLatinTableLength = data.fastLatinTableLength;
158	indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + `2`;
159	}
160	}
161
162	UVector32 codesAndRanges(errorCode);
163	const int32_t *reorderCodes = settings.reorderCodes;
164	int32_t reorderCodesLength = settings.reorderCodesLength;
165	if(settings.hasReordering() &&
166	CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
167	// Rebuild the full list of reorder ranges.
168	// The list in the settings is truncated for efficiency.
169	data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
170	// Write the codes, then the ranges.
171	for(int32_t i = `0`; i < reorderCodesLength; ++i) {
172	codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
173	}
174	if(U_FAILURE(errorCode)) { return `0`; }
175	reorderCodes = codesAndRanges.getBuffer();
176	reorderCodesLength = codesAndRanges.size();
177	}
178
179	int32_t headerSize;
180	if(isBase) {
181	headerSize = `0`; // udata_create() writes the header
182	} else {
183	DataHeader header;
184	header.dataHeader.magic1 = `0xda`;
185	header.dataHeader.magic2 = `0x27`;
186	uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
187	uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
188	headerSize = (int32_t)sizeof(header);
189	U_ASSERT((headerSize & `3`) == `0`); // multiple of 4 bytes
190	if(hasMappings && data.cesLength != `0`) {
191	// Sum of the sizes of the data items which are
192	// not automatically multiples of 8 bytes and which are placed before the CEs.
193	int32_t sum = headerSize + (indexesLength + reorderCodesLength) * `4`;
194	if((sum & `7`) != `0`) {
195	// We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
196	// We add to the header size here.
197	// Alternatively, we could increment the indexesLength
198	// or add a few bytes to the reorderTable.
199	headerSize += `4`;
200	}
201	}
202	header.dataHeader.headerSize = (uint16_t)headerSize;
203	if(headerSize <= capacity) {
204	uprv_memcpy(dest, &header, sizeof(header));
205	// Write 00 bytes so that the padding is not mistaken for a copyright string.
206	uprv_memset(dest + sizeof(header), `0`, headerSize - (int32_t)sizeof(header));
207	dest += headerSize;
208	capacity -= headerSize;
209	} else {
210	dest = NULL;
211	capacity = `0`;
212	}
213	}
214
215	indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
216	U_ASSERT((settings.options & ~`0xffff`) == `0`);
217	indexes[CollationDataReader::IX_OPTIONS] =
218	data.numericPrimary \| fastLatinVersion \| settings.options;
219	indexes[CollationDataReader::IX_RESERVED2] = `0`;
220	indexes[CollationDataReader::IX_RESERVED3] = `0`;
221
222	// Byte offsets of data items all start from the start of the indexes.
223	// We add the headerSize at the very end.
224	int32_t totalSize = indexesLength * `4`;
225
226	if(hasMappings && (isBase \|\| data.jamoCE32s != baseData->jamoCE32s)) {
227	indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
228	} else {
229	indexes[CollationDataReader::IX_JAMO_CE32S_START] = -`1`;
230	}
231
232	indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
233	totalSize += reorderCodesLength * `4`;
234
235	indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
236	if(settings.reorderTable != NULL) {
237	totalSize += `256`;
238	}
239
240	indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
241	if(hasMappings) {
242	UErrorCode errorCode2 = U_ZERO_ERROR;
243	int32_t length;
244	if(totalSize < capacity) {
245	length = utrie2_serialize(data.trie, dest + totalSize,
246	capacity - totalSize, &errorCode2);
247	} else {
248	length = utrie2_serialize(data.trie, NULL, `0`, &errorCode2);
249	}
250	if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
251	errorCode = errorCode2;
252	return `0`;
253	}
254	// The trie size should be a multiple of 8 bytes due to the way
255	// compactIndex2(UNewTrie2 trie) currently works.*
256	U_ASSERT((length & `7`) == `0`);
257	totalSize += length;
258	}
259
260	indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
261	indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
262	if(hasMappings && data.cesLength != `0`) {
263	U_ASSERT(((headerSize + totalSize) & `7`) == `0`);
264	totalSize += data.cesLength * `8`;
265	}
266
267	indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
268	indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
269	if(hasMappings) {
270	totalSize += data.ce32sLength * `4`;
271	}
272
273	indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
274	totalSize += rootElementsLength * `4`;
275
276	indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
277	if(hasMappings) {
278	totalSize += data.contextsLength * `2`;
279	}
280
281	indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
282	if(hasMappings && !unsafeBackwardSet.isEmpty()) {
283	UErrorCode errorCode2 = U_ZERO_ERROR;
284	int32_t length;
285	if(totalSize < capacity) {
286	uint16_t p = reinterpret_cast<uint16_t >(dest + totalSize);
287	length = unsafeBackwardSet.serialize(
288	p, (capacity - totalSize) / `2`, errorCode2);
289	} else {
290	length = unsafeBackwardSet.serialize(NULL, `0`, errorCode2);
291	}
292	if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
293	errorCode = errorCode2;
294	return `0`;
295	}
296	totalSize += length * `2`;
297	}
298
299	indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
300	totalSize += fastLatinTableLength * `2`;
301
302	UnicodeString scripts;
303	indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
304	if(isBase) {
305	scripts.append((UChar)data.numScripts);
306	scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + `16`);
307	scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
308	totalSize += scripts.length() * `2`;
309	}
310
311	indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
312	if(isBase) {
313	totalSize += `256`;
314	}
315
316	indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
317	indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
318
319	if(totalSize > capacity) {
320	errorCode = U_BUFFER_OVERFLOW_ERROR;
321	return headerSize + totalSize;
322	}
323
324	uprv_memcpy(dest, indexes, indexesLength * `4`);
325	copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
326	copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
327	// The trie has already been serialized into the dest buffer.
328	copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
329	copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
330	copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
331	copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
332	// The unsafeBackwardSet has already been serialized into the dest buffer.
333	copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
334	copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
335	copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
336
337	return headerSize + totalSize;
338	}
339
340	void
341	CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
342	const void src, uint8_t dest) {
343	int32_t start = indexes[startIndex];
344	int32_t limit = indexes[startIndex + `1`];
345	if(start < limit) {
346	uprv_memcpy(dest + start, src, limit - start);
347	}
348	}
349
350	U_NAMESPACE_END
351
352	#endif // !UCONFIG_NO_COLLATION
353

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/collationdatawriter.cpp