1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2015, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* collationdata.cpp
9*
10* created on: 2012jul28
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/ucol.h"
19#include "unicode/udata.h"
20#include "unicode/uscript.h"
21#include "cmemory.h"
22#include "collation.h"
23#include "collationdata.h"
24#include "uassert.h"
25#include "utrie2.h"
26#include "uvectr32.h"
27
28U_NAMESPACE_BEGIN
29
30uint32_t
31CollationData::getIndirectCE32(uint32_t ce32) const {
32 U_ASSERT(Collation::isSpecialCE32(ce32));
33 int32_t tag = Collation::tagFromCE32(ce32);
34 if(tag == Collation::DIGIT_TAG) {
35 // Fetch the non-numeric-collation CE32.
36 ce32 = ce32s[Collation::indexFromCE32(ce32)];
37 } else if(tag == Collation::LEAD_SURROGATE_TAG) {
38 ce32 = Collation::UNASSIGNED_CE32;
39 } else if(tag == Collation::U0000_TAG) {
40 // Fetch the normal ce32 for U+0000.
41 ce32 = ce32s[0];
42 }
43 return ce32;
44}
45
46uint32_t
47CollationData::getFinalCE32(uint32_t ce32) const {
48 if(Collation::isSpecialCE32(ce32)) {
49 ce32 = getIndirectCE32(ce32);
50 }
51 return ce32;
52}
53
54int64_t
55CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
56 if(U_FAILURE(errorCode)) { return 0; }
57 // Keep parallel with CollationDataBuilder::getSingleCE().
58 const CollationData *d;
59 uint32_t ce32 = getCE32(c);
60 if(ce32 == Collation::FALLBACK_CE32) {
61 d = base;
62 ce32 = base->getCE32(c);
63 } else {
64 d = this;
65 }
66 while(Collation::isSpecialCE32(ce32)) {
67 switch(Collation::tagFromCE32(ce32)) {
68 case Collation::LATIN_EXPANSION_TAG:
69 case Collation::BUILDER_DATA_TAG:
70 case Collation::PREFIX_TAG:
71 case Collation::CONTRACTION_TAG:
72 case Collation::HANGUL_TAG:
73 case Collation::LEAD_SURROGATE_TAG:
74 errorCode = U_UNSUPPORTED_ERROR;
75 return 0;
76 case Collation::FALLBACK_TAG:
77 case Collation::RESERVED_TAG_3:
78 errorCode = U_INTERNAL_PROGRAM_ERROR;
79 return 0;
80 case Collation::LONG_PRIMARY_TAG:
81 return Collation::ceFromLongPrimaryCE32(ce32);
82 case Collation::LONG_SECONDARY_TAG:
83 return Collation::ceFromLongSecondaryCE32(ce32);
84 case Collation::EXPANSION32_TAG:
85 if(Collation::lengthFromCE32(ce32) == 1) {
86 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
87 break;
88 } else {
89 errorCode = U_UNSUPPORTED_ERROR;
90 return 0;
91 }
92 case Collation::EXPANSION_TAG: {
93 if(Collation::lengthFromCE32(ce32) == 1) {
94 return d->ces[Collation::indexFromCE32(ce32)];
95 } else {
96 errorCode = U_UNSUPPORTED_ERROR;
97 return 0;
98 }
99 }
100 case Collation::DIGIT_TAG:
101 // Fetch the non-numeric-collation CE32 and continue.
102 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
103 break;
104 case Collation::U0000_TAG:
105 U_ASSERT(c == 0);
106 // Fetch the normal ce32 for U+0000 and continue.
107 ce32 = d->ce32s[0];
108 break;
109 case Collation::OFFSET_TAG:
110 return d->getCEFromOffsetCE32(c, ce32);
111 case Collation::IMPLICIT_TAG:
112 return Collation::unassignedCEFromCodePoint(c);
113 }
114 }
115 return Collation::ceFromSimpleCE32(ce32);
116}
117
118uint32_t
119CollationData::getFirstPrimaryForGroup(int32_t script) const {
120 int32_t index = getScriptIndex(script);
121 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
122}
123
124uint32_t
125CollationData::getLastPrimaryForGroup(int32_t script) const {
126 int32_t index = getScriptIndex(script);
127 if(index == 0) {
128 return 0;
129 }
130 uint32_t limit = scriptStarts[index + 1];
131 return (limit << 16) - 1;
132}
133
134int32_t
135CollationData::getGroupForPrimary(uint32_t p) const {
136 p >>= 16;
137 if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
138 return -1;
139 }
140 int32_t index = 1;
141 while(p >= scriptStarts[index + 1]) { ++index; }
142 for(int32_t i = 0; i < numScripts; ++i) {
143 if(scriptsIndex[i] == index) {
144 return i;
145 }
146 }
147 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
148 if(scriptsIndex[numScripts + i] == index) {
149 return UCOL_REORDER_CODE_FIRST + i;
150 }
151 }
152 return -1;
153}
154
155int32_t
156CollationData::getScriptIndex(int32_t script) const {
157 if(script < 0) {
158 return 0;
159 } else if(script < numScripts) {
160 return scriptsIndex[script];
161 } else if(script < UCOL_REORDER_CODE_FIRST) {
162 return 0;
163 } else {
164 script -= UCOL_REORDER_CODE_FIRST;
165 if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
166 return scriptsIndex[numScripts + script];
167 } else {
168 return 0;
169 }
170 }
171}
172
173int32_t
174CollationData::getEquivalentScripts(int32_t script,
175 int32_t dest[], int32_t capacity,
176 UErrorCode &errorCode) const {
177 if(U_FAILURE(errorCode)) { return 0; }
178 int32_t index = getScriptIndex(script);
179 if(index == 0) { return 0; }
180 if(script >= UCOL_REORDER_CODE_FIRST) {
181 // Special groups have no aliases.
182 if(capacity > 0) {
183 dest[0] = script;
184 } else {
185 errorCode = U_BUFFER_OVERFLOW_ERROR;
186 }
187 return 1;
188 }
189
190 int32_t length = 0;
191 for(int32_t i = 0; i < numScripts; ++i) {
192 if(scriptsIndex[i] == index) {
193 if(length < capacity) {
194 dest[length] = i;
195 }
196 ++length;
197 }
198 }
199 if(length > capacity) {
200 errorCode = U_BUFFER_OVERFLOW_ERROR;
201 }
202 return length;
203}
204
205void
206CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
207 UVector32 &ranges, UErrorCode &errorCode) const {
208 makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
209}
210
211void
212CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
213 UBool latinMustMove,
214 UVector32 &ranges, UErrorCode &errorCode) const {
215 if(U_FAILURE(errorCode)) { return; }
216 ranges.removeAllElements();
217 if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
218 return;
219 }
220
221 // Maps each script-or-group range to a new lead byte.
222 uint8_t table[MAX_NUM_SCRIPT_RANGES];
223 uprv_memset(table, 0, sizeof(table));
224
225 {
226 // Set "don't care" values for reserved ranges.
227 int32_t index = scriptsIndex[
228 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
229 if(index != 0) {
230 table[index] = 0xff;
231 }
232 index = scriptsIndex[
233 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
234 if(index != 0) {
235 table[index] = 0xff;
236 }
237 }
238
239 // Never reorder special low and high primary lead bytes.
240 U_ASSERT(scriptStartsLength >= 2);
241 U_ASSERT(scriptStarts[0] == 0);
242 int32_t lowStart = scriptStarts[1];
243 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
244 int32_t highLimit = scriptStarts[scriptStartsLength - 1];
245 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
246
247 // Get the set of special reorder codes in the input list.
248 // This supports a fixed number of special reorder codes;
249 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
250 uint32_t specials = 0;
251 for(int32_t i = 0; i < length; ++i) {
252 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
253 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
254 specials |= (uint32_t)1 << reorderCode;
255 }
256 }
257
258 // Start the reordering with the special low reorder codes that do not occur in the input.
259 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
260 int32_t index = scriptsIndex[numScripts + i];
261 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
262 lowStart = addLowScriptRange(table, index, lowStart);
263 }
264 }
265
266 // Skip the reserved range before Latin if Latin is the first script,
267 // so that we do not move it unnecessarily.
268 int32_t skippedReserved = 0;
269 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
270 int32_t index = scriptsIndex[USCRIPT_LATIN];
271 U_ASSERT(index != 0);
272 int32_t start = scriptStarts[index];
273 U_ASSERT(lowStart <= start);
274 skippedReserved = start - lowStart;
275 lowStart = start;
276 }
277
278 // Reorder according to the input scripts, continuing from the bottom of the primary range.
279 int32_t originalLength = length; // length will be decremented if "others" is in the list.
280 UBool hasReorderToEnd = FALSE;
281 for(int32_t i = 0; i < length;) {
282 int32_t script = reorder[i++];
283 if(script == USCRIPT_UNKNOWN) {
284 // Put the remaining scripts at the top.
285 hasReorderToEnd = TRUE;
286 while(i < length) {
287 script = reorder[--length];
288 if(script == USCRIPT_UNKNOWN || // Must occur at most once.
289 script == UCOL_REORDER_CODE_DEFAULT) {
290 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
291 return;
292 }
293 int32_t index = getScriptIndex(script);
294 if(index == 0) { continue; }
295 if(table[index] != 0) { // Duplicate or equivalent script.
296 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
297 return;
298 }
299 highLimit = addHighScriptRange(table, index, highLimit);
300 }
301 break;
302 }
303 if(script == UCOL_REORDER_CODE_DEFAULT) {
304 // The default code must be the only one in the list, and that is handled by the caller.
305 // Otherwise it must not be used.
306 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307 return;
308 }
309 int32_t index = getScriptIndex(script);
310 if(index == 0) { continue; }
311 if(table[index] != 0) { // Duplicate or equivalent script.
312 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313 return;
314 }
315 lowStart = addLowScriptRange(table, index, lowStart);
316 }
317
318 // Put all remaining scripts into the middle.
319 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
320 int32_t leadByte = table[i];
321 if(leadByte != 0) { continue; }
322 int32_t start = scriptStarts[i];
323 if(!hasReorderToEnd && start > lowStart) {
324 // No need to move this script.
325 lowStart = start;
326 }
327 lowStart = addLowScriptRange(table, i, lowStart);
328 }
329 if(lowStart > highLimit) {
330 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
331 // Try not skipping the before-Latin reserved range.
332 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
333 return;
334 }
335 // We need more primary lead bytes than available, despite the reserved ranges.
336 errorCode = U_BUFFER_OVERFLOW_ERROR;
337 return;
338 }
339
340 // Turn lead bytes into a list of (limit, offset) pairs.
341 // Encode each pair in one list element:
342 // Upper 16 bits = limit, lower 16 = signed lead byte offset.
343 int32_t offset = 0;
344 for(int32_t i = 1;; ++i) {
345 int32_t nextOffset = offset;
346 while(i < scriptStartsLength - 1) {
347 int32_t newLeadByte = table[i];
348 if(newLeadByte == 0xff) {
349 // "Don't care" lead byte for reserved range, continue with current offset.
350 } else {
351 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
352 if(nextOffset != offset) { break; }
353 }
354 ++i;
355 }
356 if(offset != 0 || i < scriptStartsLength - 1) {
357 ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
358 }
359 if(i == scriptStartsLength - 1) { break; }
360 offset = nextOffset;
361 }
362}
363
364int32_t
365CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
366 int32_t start = scriptStarts[index];
367 if((start & 0xff) < (lowStart & 0xff)) {
368 lowStart += 0x100;
369 }
370 table[index] = (uint8_t)(lowStart >> 8);
371 int32_t limit = scriptStarts[index + 1];
372 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
373 return lowStart;
374}
375
376int32_t
377CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
378 int32_t limit = scriptStarts[index + 1];
379 if((limit & 0xff) > (highLimit & 0xff)) {
380 highLimit -= 0x100;
381 }
382 int32_t start = scriptStarts[index];
383 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
384 table[index] = (uint8_t)(highLimit >> 8);
385 return highLimit;
386}
387
388U_NAMESPACE_END
389
390#endif // !UCONFIG_NO_COLLATION
391