collationdata.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/collationdata.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2012-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* collationdata.cpp
9	*
10	* created on: 2012jul28
11	* created by: Markus W. Scherer
12	*/
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_COLLATION
17
18	#include "unicode/ucol.h"
19	#include "unicode/udata.h"
20	#include "unicode/uscript.h"
21	#include "cmemory.h"
22	#include "collation.h"
23	#include "collationdata.h"
24	#include "uassert.h"
25	#include "utrie2.h"
26	#include "uvectr32.h"
27
28	U_NAMESPACE_BEGIN
29
30	uint32_t
31	CollationData::getIndirectCE32(uint32_t ce32) const {
32	U_ASSERT(Collation::isSpecialCE32(ce32));
33	int32_t tag = Collation::tagFromCE32(ce32);
34	if(tag == Collation::DIGIT_TAG) {
35	// Fetch the non-numeric-collation CE32.
36	ce32 = ce32s[Collation::indexFromCE32(ce32)];
37	} else if(tag == Collation::LEAD_SURROGATE_TAG) {
38	ce32 = Collation::UNASSIGNED_CE32;
39	} else if(tag == Collation::U0000_TAG) {
40	// Fetch the normal ce32 for U+0000.
41	ce32 = ce32s[`0`];
42	}
43	return ce32;
44	}
45
46	uint32_t
47	CollationData::getFinalCE32(uint32_t ce32) const {
48	if(Collation::isSpecialCE32(ce32)) {
49	ce32 = getIndirectCE32(ce32);
50	}
51	return ce32;
52	}
53
54	int64_t
55	CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
56	if(U_FAILURE(errorCode)) { return `0`; }
57	// Keep parallel with CollationDataBuilder::getSingleCE().
58	const CollationData *d;
59	uint32_t ce32 = getCE32(c);
60	if(ce32 == Collation::FALLBACK_CE32) {
61	d = base;
62	ce32 = base->getCE32(c);
63	} else {
64	d = this;
65	}
66	while(Collation::isSpecialCE32(ce32)) {
67	switch(Collation::tagFromCE32(ce32)) {
68	case Collation::LATIN_EXPANSION_TAG:
69	case Collation::BUILDER_DATA_TAG:
70	case Collation::PREFIX_TAG:
71	case Collation::CONTRACTION_TAG:
72	case Collation::HANGUL_TAG:
73	case Collation::LEAD_SURROGATE_TAG:
74	errorCode = U_UNSUPPORTED_ERROR;
75	return `0`;
76	case Collation::FALLBACK_TAG:
77	case Collation::RESERVED_TAG_3:
78	errorCode = U_INTERNAL_PROGRAM_ERROR;
79	return `0`;
80	case Collation::LONG_PRIMARY_TAG:
81	return Collation::ceFromLongPrimaryCE32(ce32);
82	case Collation::LONG_SECONDARY_TAG:
83	return Collation::ceFromLongSecondaryCE32(ce32);
84	case Collation::EXPANSION32_TAG:
85	if(Collation::lengthFromCE32(ce32) == `1`) {
86	ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
87	break;
88	} else {
89	errorCode = U_UNSUPPORTED_ERROR;
90	return `0`;
91	}
92	case Collation::EXPANSION_TAG: {
93	if(Collation::lengthFromCE32(ce32) == `1`) {
94	return d->ces[Collation::indexFromCE32(ce32)];
95	} else {
96	errorCode = U_UNSUPPORTED_ERROR;
97	return `0`;
98	}
99	}
100	case Collation::DIGIT_TAG:
101	// Fetch the non-numeric-collation CE32 and continue.
102	ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
103	break;
104	case Collation::U0000_TAG:
105	U_ASSERT(c == `0`);
106	// Fetch the normal ce32 for U+0000 and continue.
107	ce32 = d->ce32s[`0`];
108	break;
109	case Collation::OFFSET_TAG:
110	return d->getCEFromOffsetCE32(c, ce32);
111	case Collation::IMPLICIT_TAG:
112	return Collation::unassignedCEFromCodePoint(c);
113	}
114	}
115	return Collation::ceFromSimpleCE32(ce32);
116	}
117
118	uint32_t
119	CollationData::getFirstPrimaryForGroup(int32_t script) const {
120	int32_t index = getScriptIndex(script);
121	return index == `0` ? `0` : (uint32_t)scriptStarts[index] << `16`;
122	}
123
124	uint32_t
125	CollationData::getLastPrimaryForGroup(int32_t script) const {
126	int32_t index = getScriptIndex(script);
127	if(index == `0`) {
128	return `0`;
129	}
130	uint32_t limit = scriptStarts[index + `1`];
131	return (limit << `16`) - `1`;
132	}
133
134	int32_t
135	CollationData::getGroupForPrimary(uint32_t p) const {
136	p >>= `16`;
137	if(p < scriptStarts[`1`] \|\| scriptStarts[scriptStartsLength - `1`] <= p) {
138	return -`1`;
139	}
140	int32_t index = `1`;
141	while(p >= scriptStarts[index + `1`]) { ++index; }
142	for(int32_t i = `0`; i < numScripts; ++i) {
143	if(scriptsIndex[i] == index) {
144	return i;
145	}
146	}
147	for(int32_t i = `0`; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
148	if(scriptsIndex[numScripts + i] == index) {
149	return UCOL_REORDER_CODE_FIRST + i;
150	}
151	}
152	return -`1`;
153	}
154
155	int32_t
156	CollationData::getScriptIndex(int32_t script) const {
157	if(script < `0`) {
158	return `0`;
159	} else if(script < numScripts) {
160	return scriptsIndex[script];
161	} else if(script < UCOL_REORDER_CODE_FIRST) {
162	return `0`;
163	} else {
164	script -= UCOL_REORDER_CODE_FIRST;
165	if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
166	return scriptsIndex[numScripts + script];
167	} else {
168	return `0`;
169	}
170	}
171	}
172
173	int32_t
174	CollationData::getEquivalentScripts(int32_t script,
175	int32_t dest[], int32_t capacity,
176	UErrorCode &errorCode) const {
177	if(U_FAILURE(errorCode)) { return `0`; }
178	int32_t index = getScriptIndex(script);
179	if(index == `0`) { return `0`; }
180	if(script >= UCOL_REORDER_CODE_FIRST) {
181	// Special groups have no aliases.
182	if(capacity > `0`) {
183	dest[`0`] = script;
184	} else {
185	errorCode = U_BUFFER_OVERFLOW_ERROR;
186	}
187	return `1`;
188	}
189
190	int32_t length = `0`;
191	for(int32_t i = `0`; i < numScripts; ++i) {
192	if(scriptsIndex[i] == index) {
193	if(length < capacity) {
194	dest[length] = i;
195	}
196	++length;
197	}
198	}
199	if(length > capacity) {
200	errorCode = U_BUFFER_OVERFLOW_ERROR;
201	}
202	return length;
203	}
204
205	void
206	CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
207	UVector32 &ranges, UErrorCode &errorCode) const {
208	makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
209	}
210
211	void
212	CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
213	UBool latinMustMove,
214	UVector32 &ranges, UErrorCode &errorCode) const {
215	if(U_FAILURE(errorCode)) { return; }
216	ranges.removeAllElements();
217	if(length == `0` \|\| (length == `1` && reorder[`0`] == USCRIPT_UNKNOWN)) {
218	return;
219	}
220
221	// Maps each script-or-group range to a new lead byte.
222	uint8_t table[MAX_NUM_SCRIPT_RANGES];
223	uprv_memset(table, `0`, sizeof(table));
224
225	{
226	// Set "don't care" values for reserved ranges.
227	int32_t index = scriptsIndex[
228	numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
229	if(index != `0`) {
230	table[index] = `0xff`;
231	}
232	index = scriptsIndex[
233	numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
234	if(index != `0`) {
235	table[index] = `0xff`;
236	}
237	}
238
239	// Never reorder special low and high primary lead bytes.
240	U_ASSERT(scriptStartsLength >= `2`);
241	U_ASSERT(scriptStarts[`0`] == `0`);
242	int32_t lowStart = scriptStarts[`1`];
243	U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + `1`) << `8`));
244	int32_t highLimit = scriptStarts[scriptStartsLength - `1`];
245	U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << `8`));
246
247	// Get the set of special reorder codes in the input list.
248	// This supports a fixed number of special reorder codes;
249	// it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
250	uint32_t specials = `0`;
251	for(int32_t i = `0`; i < length; ++i) {
252	int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
253	if(`0` <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
254	specials \|= (uint32_t)`1` << reorderCode;
255	}
256	}
257
258	// Start the reordering with the special low reorder codes that do not occur in the input.
259	for(int32_t i = `0`; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
260	int32_t index = scriptsIndex[numScripts + i];
261	if(index != `0` && (specials & ((uint32_t)`1` << i)) == `0`) {
262	lowStart = addLowScriptRange(table, index, lowStart);
263	}
264	}
265
266	// Skip the reserved range before Latin if Latin is the first script,
267	// so that we do not move it unnecessarily.
268	int32_t skippedReserved = `0`;
269	if(specials == `0` && reorder[`0`] == USCRIPT_LATIN && !latinMustMove) {
270	int32_t index = scriptsIndex[USCRIPT_LATIN];
271	U_ASSERT(index != `0`);
272	int32_t start = scriptStarts[index];
273	U_ASSERT(lowStart <= start);
274	skippedReserved = start - lowStart;
275	lowStart = start;
276	}
277
278	// Reorder according to the input scripts, continuing from the bottom of the primary range.
279	int32_t originalLength = length; // length will be decremented if "others" is in the list.
280	UBool hasReorderToEnd = FALSE;
281	for(int32_t i = `0`; i < length;) {
282	int32_t script = reorder[i++];
283	if(script == USCRIPT_UNKNOWN) {
284	// Put the remaining scripts at the top.
285	hasReorderToEnd = TRUE;
286	while(i < length) {
287	script = reorder[--length];
288	if(script == USCRIPT_UNKNOWN \|\| // Must occur at most once.
289	script == UCOL_REORDER_CODE_DEFAULT) {
290	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
291	return;
292	}
293	int32_t index = getScriptIndex(script);
294	if(index == `0`) { continue; }
295	if(table[index] != `0`) { // Duplicate or equivalent script.
296	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
297	return;
298	}
299	highLimit = addHighScriptRange(table, index, highLimit);
300	}
301	break;
302	}
303	if(script == UCOL_REORDER_CODE_DEFAULT) {
304	// The default code must be the only one in the list, and that is handled by the caller.
305	// Otherwise it must not be used.
306	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307	return;
308	}
309	int32_t index = getScriptIndex(script);
310	if(index == `0`) { continue; }
311	if(table[index] != `0`) { // Duplicate or equivalent script.
312	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313	return;
314	}
315	lowStart = addLowScriptRange(table, index, lowStart);
316	}
317
318	// Put all remaining scripts into the middle.
319	for(int32_t i = `1`; i < scriptStartsLength - `1`; ++i) {
320	int32_t leadByte = table[i];
321	if(leadByte != `0`) { continue; }
322	int32_t start = scriptStarts[i];
323	if(!hasReorderToEnd && start > lowStart) {
324	// No need to move this script.
325	lowStart = start;
326	}
327	lowStart = addLowScriptRange(table, i, lowStart);
328	}
329	if(lowStart > highLimit) {
330	if((lowStart - (skippedReserved & `0xff00`)) <= highLimit) {
331	// Try not skipping the before-Latin reserved range.
332	makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
333	return;
334	}
335	// We need more primary lead bytes than available, despite the reserved ranges.
336	errorCode = U_BUFFER_OVERFLOW_ERROR;
337	return;
338	}
339
340	// Turn lead bytes into a list of (limit, offset) pairs.
341	// Encode each pair in one list element:
342	// Upper 16 bits = limit, lower 16 = signed lead byte offset.
343	int32_t offset = `0`;
344	for(int32_t i = `1`;; ++i) {
345	int32_t nextOffset = offset;
346	while(i < scriptStartsLength - `1`) {
347	int32_t newLeadByte = table[i];
348	if(newLeadByte == `0xff`) {
349	// "Don't care" lead byte for reserved range, continue with current offset.
350	} else {
351	nextOffset = newLeadByte - (scriptStarts[i] >> `8`);
352	if(nextOffset != offset) { break; }
353	}
354	++i;
355	}
356	if(offset != `0` \|\| i < scriptStartsLength - `1`) {
357	ranges.addElement(((int32_t)scriptStarts[i] << `16`) \| (offset & `0xffff`), errorCode);
358	}
359	if(i == scriptStartsLength - `1`) { break; }
360	offset = nextOffset;
361	}
362	}
363
364	int32_t
365	CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
366	int32_t start = scriptStarts[index];
367	if((start & `0xff`) < (lowStart & `0xff`)) {
368	lowStart += `0x100`;
369	}
370	table[index] = (uint8_t)(lowStart >> `8`);
371	int32_t limit = scriptStarts[index + `1`];
372	lowStart = ((lowStart & `0xff00`) + ((limit & `0xff00`) - (start & `0xff00`))) \| (limit & `0xff`);
373	return lowStart;
374	}
375
376	int32_t
377	CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
378	int32_t limit = scriptStarts[index + `1`];
379	if((limit & `0xff`) > (highLimit & `0xff`)) {
380	highLimit -= `0x100`;
381	}
382	int32_t start = scriptStarts[index];
383	highLimit = ((highLimit & `0xff00`) - ((limit & `0xff00`) - (start & `0xff00`))) \| (start & `0xff`);
384	table[index] = (uint8_t)(highLimit >> `8`);
385	return highLimit;
386	}
387
388	U_NAMESPACE_END
389
390	#endif // !UCONFIG_NO_COLLATION
391

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/collationdata.cpp