1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 2012-2015, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | * collationdatabuilder.cpp |
9 | * |
10 | * (replaced the former ucol_elm.cpp) |
11 | * |
12 | * created on: 2012apr01 |
13 | * created by: Markus W. Scherer |
14 | */ |
15 | |
16 | #include "unicode/utypes.h" |
17 | |
18 | #if !UCONFIG_NO_COLLATION |
19 | |
20 | #include "unicode/localpointer.h" |
21 | #include "unicode/uchar.h" |
22 | #include "unicode/ucharstrie.h" |
23 | #include "unicode/ucharstriebuilder.h" |
24 | #include "unicode/uniset.h" |
25 | #include "unicode/unistr.h" |
26 | #include "unicode/usetiter.h" |
27 | #include "unicode/utf16.h" |
28 | #include "cmemory.h" |
29 | #include "collation.h" |
30 | #include "collationdata.h" |
31 | #include "collationdatabuilder.h" |
32 | #include "collationfastlatinbuilder.h" |
33 | #include "collationiterator.h" |
34 | #include "normalizer2impl.h" |
35 | #include "utrie2.h" |
36 | #include "uvectr32.h" |
37 | #include "uvectr64.h" |
38 | #include "uvector.h" |
39 | |
40 | U_NAMESPACE_BEGIN |
41 | |
42 | CollationDataBuilder::CEModifier::~CEModifier() {} |
43 | |
44 | /** |
45 | * Build-time context and CE32 for a code point. |
46 | * If a code point has contextual mappings, then the default (no-context) mapping |
47 | * and all conditional mappings are stored in a singly-linked list |
48 | * of ConditionalCE32, sorted by context strings. |
49 | * |
50 | * Context strings sort by prefix length, then by prefix, then by contraction suffix. |
51 | * Context strings must be unique and in ascending order. |
52 | */ |
53 | struct ConditionalCE32 : public UMemory { |
54 | ConditionalCE32() |
55 | : context(), |
56 | ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), |
57 | next(-1) {} |
58 | ConditionalCE32(const UnicodeString &ct, uint32_t ce) |
59 | : context(ct), |
60 | ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), |
61 | next(-1) {} |
62 | |
63 | inline UBool hasContext() const { return context.length() > 1; } |
64 | inline int32_t prefixLength() const { return context.charAt(0); } |
65 | |
66 | /** |
67 | * "\0" for the first entry for any code point, with its default CE32. |
68 | * |
69 | * Otherwise one unit with the length of the prefix string, |
70 | * then the prefix string, then the contraction suffix. |
71 | */ |
72 | UnicodeString context; |
73 | /** |
74 | * CE32 for the code point and its context. |
75 | * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). |
76 | */ |
77 | uint32_t ce32; |
78 | /** |
79 | * Default CE32 for all contexts with this same prefix. |
80 | * Initially NO_CE32. Set only while building runtime data structures, |
81 | * and only on one of the nodes of a sub-list with the same prefix. |
82 | */ |
83 | uint32_t defaultCE32; |
84 | /** |
85 | * CE32 for the built contexts. |
86 | * When fetching CEs from the builder, the contexts are built into their runtime form |
87 | * so that the normal collation implementation can process them. |
88 | * The result is cached in the list head. It is reset when the contexts are modified. |
89 | */ |
90 | uint32_t builtCE32; |
91 | /** |
92 | * Index of the next ConditionalCE32. |
93 | * Negative for the end of the list. |
94 | */ |
95 | int32_t next; |
96 | }; |
97 | |
98 | U_CDECL_BEGIN |
99 | |
100 | U_CAPI void U_CALLCONV |
101 | uprv_deleteConditionalCE32(void *obj) { |
102 | delete static_cast<ConditionalCE32 *>(obj); |
103 | } |
104 | |
105 | U_CDECL_END |
106 | |
107 | /** |
108 | * Build-time collation element and character iterator. |
109 | * Uses the runtime CollationIterator for fetching CEs for a string |
110 | * but reads from the builder's unfinished data structures. |
111 | * In particular, this class reads from the unfinished trie |
112 | * and has to avoid CollationIterator::nextCE() and redirect other |
113 | * calls to data->getCE32() and data->getCE32FromSupplementary(). |
114 | * |
115 | * We do this so that we need not implement the collation algorithm |
116 | * again for the builder and make it behave exactly like the runtime code. |
117 | * That would be more difficult to test and maintain than this indirection. |
118 | * |
119 | * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, |
120 | * so the data accesses from those code paths need not be modified. |
121 | * |
122 | * This class iterates directly over whole code points |
123 | * so that the CollationIterator does not need the finished trie |
124 | * for handling the LEAD_SURROGATE_TAG. |
125 | */ |
class DataBuilderCollationIterator : public CollationIterator {
public:
    /** Binds the iterator to the builder b and sets up the Jamo indirection CE32s. */
    DataBuilderCollationIterator(CollationDataBuilder &b);

    virtual ~DataBuilderCollationIterator();

    /**
     * Fetches the CEs for str, beginning at index start, and stores the non-zero ones
     * into ces[] beginning at index cesLength.
     * CEs are written only while the running index is below Collation::MAX_EXPANSION_LENGTH,
     * but they continue to be counted.
     * @return the updated number of CEs (the input cesLength plus the non-zero CEs found)
     */
    int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength);

    /** Resets the iterator state and moves the string position to newOffset. */
    virtual void resetToOffset(int32_t newOffset);
    /** Returns the current string position. */
    virtual int32_t getOffset() const;

    /** Returns the code point at the position and advances, or U_SENTINEL at the end of the string. */
    virtual UChar32 nextCodePoint(UErrorCode &errorCode);
    /** Returns the code point before the position and moves back, or U_SENTINEL at the start. */
    virtual UChar32 previousCodePoint(UErrorCode &errorCode);

protected:
    /** Moves the position forward by num code points. */
    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
    /** Moves the position backward by num code points. */
    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);

    /** Looks up the CE32 for c in the builder's trie. */
    virtual uint32_t getDataCE32(UChar32 c) const;
    /** Resolves a BUILDER_DATA_TAG CE32: a Jamo indirection or a built context CE32. */
    virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode);

    // The builder whose unfinished data structures are read.
    CollationDataBuilder &builder;
    // CollationData view handed to the CollationIterator base class;
    // its array pointers are refreshed from the builder in fetchCEs().
    CollationData builderData;
    // Indirection CE32s, one per Jamo, that builderData.jamoCE32s points to.
    uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
    // The string currently being iterated; set by fetchCEs().
    const UnicodeString *s;
    // Current index into *s.
    int32_t pos;
};
153 | |
154 | DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b) |
155 | : CollationIterator(&builderData, /*numeric=*/ FALSE), |
156 | builder(b), builderData(b.nfcImpl), |
157 | s(NULL), pos(0) { |
158 | builderData.base = builder.base; |
159 | // Set all of the jamoCE32s[] to indirection CE32s. |
160 | for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. |
161 | UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j); |
162 | jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) | |
163 | CollationDataBuilder::IS_BUILDER_JAMO_CE32; |
164 | } |
165 | builderData.jamoCE32s = jamoCE32s; |
166 | } |
167 | |
168 | DataBuilderCollationIterator::~DataBuilderCollationIterator() {} |
169 | |
170 | int32_t |
171 | DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start, |
172 | int64_t ces[], int32_t cesLength) { |
173 | // Set the pointers each time, in case they changed due to reallocation. |
174 | builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer()); |
175 | builderData.ces = builder.ce64s.getBuffer(); |
176 | builderData.contexts = builder.contexts.getBuffer(); |
177 | // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32(). |
178 | reset(); |
179 | s = &str; |
180 | pos = start; |
181 | UErrorCode errorCode = U_ZERO_ERROR; |
182 | while(U_SUCCESS(errorCode) && pos < s->length()) { |
183 | // No need to keep all CEs in the iterator buffer. |
184 | clearCEs(); |
185 | UChar32 c = s->char32At(pos); |
186 | pos += U16_LENGTH(c); |
187 | uint32_t ce32 = utrie2_get32(builder.trie, c); |
188 | const CollationData *d; |
189 | if(ce32 == Collation::FALLBACK_CE32) { |
190 | d = builder.base; |
191 | ce32 = builder.base->getCE32(c); |
192 | } else { |
193 | d = &builderData; |
194 | } |
195 | appendCEsFromCE32(d, c, ce32, /*forward=*/ TRUE, errorCode); |
196 | U_ASSERT(U_SUCCESS(errorCode)); |
197 | for(int32_t i = 0; i < getCEsLength(); ++i) { |
198 | int64_t ce = getCE(i); |
199 | if(ce != 0) { |
200 | if(cesLength < Collation::MAX_EXPANSION_LENGTH) { |
201 | ces[cesLength] = ce; |
202 | } |
203 | ++cesLength; |
204 | } |
205 | } |
206 | } |
207 | return cesLength; |
208 | } |
209 | |
210 | void |
211 | DataBuilderCollationIterator::resetToOffset(int32_t newOffset) { |
212 | reset(); |
213 | pos = newOffset; |
214 | } |
215 | |
216 | int32_t |
217 | DataBuilderCollationIterator::getOffset() const { |
218 | return pos; |
219 | } |
220 | |
221 | UChar32 |
222 | DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { |
223 | if(pos == s->length()) { |
224 | return U_SENTINEL; |
225 | } |
226 | UChar32 c = s->char32At(pos); |
227 | pos += U16_LENGTH(c); |
228 | return c; |
229 | } |
230 | |
231 | UChar32 |
232 | DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { |
233 | if(pos == 0) { |
234 | return U_SENTINEL; |
235 | } |
236 | UChar32 c = s->char32At(pos - 1); |
237 | pos -= U16_LENGTH(c); |
238 | return c; |
239 | } |
240 | |
241 | void |
242 | DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { |
243 | pos = s->moveIndex32(pos, num); |
244 | } |
245 | |
246 | void |
247 | DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { |
248 | pos = s->moveIndex32(pos, -num); |
249 | } |
250 | |
251 | uint32_t |
252 | DataBuilderCollationIterator::getDataCE32(UChar32 c) const { |
253 | return utrie2_get32(builder.trie, c); |
254 | } |
255 | |
256 | uint32_t |
257 | DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) { |
258 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)); |
259 | if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) { |
260 | UChar32 jamo = Collation::indexFromCE32(ce32); |
261 | return utrie2_get32(builder.trie, jamo); |
262 | } else { |
263 | ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32); |
264 | if(cond->builtCE32 == Collation::NO_CE32) { |
265 | // Build the context-sensitive mappings into their runtime form and cache the result. |
266 | cond->builtCE32 = builder.buildContext(cond, errorCode); |
267 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
268 | errorCode = U_ZERO_ERROR; |
269 | builder.clearContexts(); |
270 | cond->builtCE32 = builder.buildContext(cond, errorCode); |
271 | } |
272 | builderData.contexts = builder.contexts.getBuffer(); |
273 | } |
274 | return cond->builtCE32; |
275 | } |
276 | } |
277 | |
278 | // ------------------------------------------------------------------------- *** |
279 | |
280 | CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode) |
281 | : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), |
282 | base(NULL), baseSettings(NULL), |
283 | trie(NULL), |
284 | ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode), |
285 | modified(FALSE), |
286 | fastLatinEnabled(FALSE), fastLatinBuilder(NULL), |
287 | collIter(NULL) { |
288 | // Reserve the first CE32 for U+0000. |
289 | ce32s.addElement(0, errorCode); |
290 | conditionalCE32s.setDeleter(uprv_deleteConditionalCE32); |
291 | } |
292 | |
/** Closes the trie and deletes the fast-Latin builder and the build-time iterator. */
CollationDataBuilder::~CollationDataBuilder() {
    utrie2_close(trie);
    delete fastLatinBuilder;
    delete collIter;
}
298 | |
299 | void |
300 | CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) { |
301 | if(U_FAILURE(errorCode)) { return; } |
302 | if(trie != NULL) { |
303 | errorCode = U_INVALID_STATE_ERROR; |
304 | return; |
305 | } |
306 | if(b == NULL) { |
307 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
308 | return; |
309 | } |
310 | base = b; |
311 | |
312 | // For a tailoring, the default is to fall back to the base. |
313 | trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode); |
314 | |
315 | // Set the Latin-1 letters block so that it is allocated first in the data array, |
316 | // to try to improve locality of reference when sorting Latin-1 text. |
317 | // Do not use utrie2_setRange32() since that will not actually allocate blocks |
318 | // that are filled with the default value. |
319 | // ASCII (0..7F) is already preallocated anyway. |
320 | for(UChar32 c = 0xc0; c <= 0xff; ++c) { |
321 | utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); |
322 | } |
323 | |
324 | // Hangul syllables are not tailorable (except via tailoring Jamos). |
325 | // Always set the Hangul tag to help performance. |
326 | // Do this here, rather than in buildMappings(), |
327 | // so that we see the HANGUL_TAG in various assertions. |
328 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); |
329 | utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); |
330 | |
331 | // Copy the set contents but don't copy/clone the set as a whole because |
332 | // that would copy the isFrozen state too. |
333 | unsafeBackwardSet.addAll(*b->unsafeBackwardSet); |
334 | |
335 | if(U_FAILURE(errorCode)) { return; } |
336 | } |
337 | |
338 | UBool |
339 | CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end, |
340 | uint32_t primary, int32_t step, |
341 | UErrorCode &errorCode) { |
342 | if(U_FAILURE(errorCode)) { return FALSE; } |
343 | U_ASSERT(start <= end); |
344 | // TODO: Do we need to check what values are currently set for start..end? |
345 | // An offset range is worth it only if we can achieve an overlap between |
346 | // adjacent UTrie2 blocks of 32 code points each. |
347 | // An offset CE is also a little more expensive to look up and compute |
348 | // than a simple CE. |
349 | // If the range spans at least three UTrie2 block boundaries (> 64 code points), |
350 | // then we take it. |
351 | // If the range spans one or two block boundaries and there are |
352 | // at least 4 code points on either side, then we take it. |
353 | // (We could additionally require a minimum range length of, say, 16.) |
354 | int32_t blockDelta = (end >> 5) - (start >> 5); |
355 | if(2 <= step && step <= 0x7f && |
356 | (blockDelta >= 3 || |
357 | (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) { |
358 | int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step; |
359 | if(isCompressiblePrimary(primary)) { dataCE |= 0x80; } |
360 | int32_t index = addCE(dataCE, errorCode); |
361 | if(U_FAILURE(errorCode)) { return 0; } |
362 | if(index > Collation::MAX_INDEX) { |
363 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
364 | return 0; |
365 | } |
366 | uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index); |
367 | utrie2_setRange32(trie, start, end, offsetCE32, TRUE, &errorCode); |
368 | modified = TRUE; |
369 | return TRUE; |
370 | } else { |
371 | return FALSE; |
372 | } |
373 | } |
374 | |
375 | uint32_t |
376 | CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, |
377 | uint32_t primary, int32_t step, |
378 | UErrorCode &errorCode) { |
379 | if(U_FAILURE(errorCode)) { return 0; } |
380 | UBool isCompressible = isCompressiblePrimary(primary); |
381 | if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) { |
382 | return Collation::incThreeBytePrimaryByOffset(primary, isCompressible, |
383 | (end - start + 1) * step); |
384 | } else { |
385 | // Short range: Set individual CE32s. |
386 | for(;;) { |
387 | utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode); |
388 | ++start; |
389 | primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step); |
390 | if(start > end) { return primary; } |
391 | } |
392 | modified = TRUE; |
393 | } |
394 | } |
395 | |
396 | uint32_t |
397 | CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const { |
398 | int32_t i = Collation::indexFromCE32(ce32); |
399 | int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i); |
400 | uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); |
401 | return Collation::makeLongPrimaryCE32(p); |
402 | } |
403 | |
404 | UBool |
405 | CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const { |
406 | return base->isCompressibleLeadByte(b); |
407 | } |
408 | |
409 | UBool |
410 | CollationDataBuilder::isAssigned(UChar32 c) const { |
411 | return Collation::isAssignedCE32(utrie2_get32(trie, c)); |
412 | } |
413 | |
414 | uint32_t |
415 | CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const { |
416 | uint32_t ce32 = utrie2_get32(trie, c); |
417 | if(Collation::isLongPrimaryCE32(ce32)) { |
418 | return Collation::primaryFromLongPrimaryCE32(ce32); |
419 | } else { |
420 | return 0; |
421 | } |
422 | } |
423 | |
424 | int64_t |
425 | CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const { |
426 | if(U_FAILURE(errorCode)) { return 0; } |
427 | // Keep parallel with CollationData::getSingleCE(). |
428 | UBool fromBase = FALSE; |
429 | uint32_t ce32 = utrie2_get32(trie, c); |
430 | if(ce32 == Collation::FALLBACK_CE32) { |
431 | fromBase = TRUE; |
432 | ce32 = base->getCE32(c); |
433 | } |
434 | while(Collation::isSpecialCE32(ce32)) { |
435 | switch(Collation::tagFromCE32(ce32)) { |
436 | case Collation::LATIN_EXPANSION_TAG: |
437 | case Collation::BUILDER_DATA_TAG: |
438 | case Collation::PREFIX_TAG: |
439 | case Collation::CONTRACTION_TAG: |
440 | case Collation::HANGUL_TAG: |
441 | case Collation::LEAD_SURROGATE_TAG: |
442 | errorCode = U_UNSUPPORTED_ERROR; |
443 | return 0; |
444 | case Collation::FALLBACK_TAG: |
445 | case Collation::RESERVED_TAG_3: |
446 | errorCode = U_INTERNAL_PROGRAM_ERROR; |
447 | return 0; |
448 | case Collation::LONG_PRIMARY_TAG: |
449 | return Collation::ceFromLongPrimaryCE32(ce32); |
450 | case Collation::LONG_SECONDARY_TAG: |
451 | return Collation::ceFromLongSecondaryCE32(ce32); |
452 | case Collation::EXPANSION32_TAG: |
453 | if(Collation::lengthFromCE32(ce32) == 1) { |
454 | int32_t i = Collation::indexFromCE32(ce32); |
455 | ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i); |
456 | break; |
457 | } else { |
458 | errorCode = U_UNSUPPORTED_ERROR; |
459 | return 0; |
460 | } |
461 | case Collation::EXPANSION_TAG: { |
462 | if(Collation::lengthFromCE32(ce32) == 1) { |
463 | int32_t i = Collation::indexFromCE32(ce32); |
464 | return fromBase ? base->ces[i] : ce64s.elementAti(i); |
465 | } else { |
466 | errorCode = U_UNSUPPORTED_ERROR; |
467 | return 0; |
468 | } |
469 | } |
470 | case Collation::DIGIT_TAG: |
471 | // Fetch the non-numeric-collation CE32 and continue. |
472 | ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32)); |
473 | break; |
474 | case Collation::U0000_TAG: |
475 | U_ASSERT(c == 0); |
476 | // Fetch the normal ce32 for U+0000 and continue. |
477 | ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0); |
478 | break; |
479 | case Collation::OFFSET_TAG: |
480 | ce32 = getCE32FromOffsetCE32(fromBase, c, ce32); |
481 | break; |
482 | case Collation::IMPLICIT_TAG: |
483 | return Collation::unassignedCEFromCodePoint(c); |
484 | } |
485 | } |
486 | return Collation::ceFromSimpleCE32(ce32); |
487 | } |
488 | |
489 | int32_t |
490 | CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) { |
491 | int32_t length = ce64s.size(); |
492 | for(int32_t i = 0; i < length; ++i) { |
493 | if(ce == ce64s.elementAti(i)) { return i; } |
494 | } |
495 | ce64s.addElement(ce, errorCode); |
496 | return length; |
497 | } |
498 | |
499 | int32_t |
500 | CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) { |
501 | int32_t length = ce32s.size(); |
502 | for(int32_t i = 0; i < length; ++i) { |
503 | if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; } |
504 | } |
505 | ce32s.addElement((int32_t)ce32, errorCode); |
506 | return length; |
507 | } |
508 | |
509 | int32_t |
510 | CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32, |
511 | UErrorCode &errorCode) { |
512 | if(U_FAILURE(errorCode)) { return -1; } |
513 | U_ASSERT(!context.isEmpty()); |
514 | int32_t index = conditionalCE32s.size(); |
515 | if(index > Collation::MAX_INDEX) { |
516 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
517 | return -1; |
518 | } |
519 | ConditionalCE32 *cond = new ConditionalCE32(context, ce32); |
520 | if(cond == NULL) { |
521 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
522 | return -1; |
523 | } |
524 | conditionalCE32s.addElement(cond, errorCode); |
525 | return index; |
526 | } |
527 | |
528 | void |
529 | CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s, |
530 | const int64_t ces[], int32_t cesLength, |
531 | UErrorCode &errorCode) { |
532 | uint32_t ce32 = encodeCEs(ces, cesLength, errorCode); |
533 | addCE32(prefix, s, ce32, errorCode); |
534 | } |
535 | |
/**
 * Stores ce32 as the mapping for the string s in the context of prefix.
 * The first code point of s selects the trie entry; a non-empty prefix and/or
 * further code units in s make the mapping contextual, in which case it is
 * stored in the sorted ConditionalCE32 list for that code point.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR if s is empty, and U_INVALID_STATE_ERROR
 * if there is no trie or the trie is frozen.
 */
void
CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s,
                              uint32_t ce32, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(s.isEmpty()) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(trie == NULL || utrie2_isFrozen(trie)) {
        errorCode = U_INVALID_STATE_ERROR;
        return;
    }
    UChar32 c = s.char32At(0);
    int32_t cLength = U16_LENGTH(c);
    uint32_t oldCE32 = utrie2_get32(trie, c);
    // Contextual if there is a prefix or if s continues after its first code point.
    UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
    if(oldCE32 == Collation::FALLBACK_CE32) {
        // First tailoring for c.
        // If c has contextual base mappings or if we add a contextual mapping,
        // then copy the base mappings.
        // Otherwise we just override the base mapping.
        uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c));
        if(hasContext || Collation::ce32HasContext(baseCE32)) {
            oldCE32 = copyFromBaseCE32(c, baseCE32, TRUE, errorCode);
            utrie2_set32(trie, c, oldCE32, &errorCode);
            if(U_FAILURE(errorCode)) { return; }
        }
    }
    if(!hasContext) {
        // No prefix, no contraction.
        if(!isBuilderContextCE32(oldCE32)) {
            utrie2_set32(trie, c, ce32, &errorCode);
        } else {
            // c already has a context list: replace the CE32 in its head
            // and invalidate the cached built form.
            ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
            cond->builtCE32 = Collation::NO_CE32;
            cond->ce32 = ce32;
        }
    } else {
        ConditionalCE32 *cond;
        if(!isBuilderContextCE32(oldCE32)) {
            // Replace the simple oldCE32 with a builder context CE32
            // pointing to a new ConditionalCE32 list head.
            int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode);
            if(U_FAILURE(errorCode)) { return; }
            uint32_t contextCE32 = makeBuilderContextCE32(index);
            utrie2_set32(trie, c, contextCE32, &errorCode);
            contextChars.add(c);
            cond = getConditionalCE32(index);
        } else {
            // Reuse the existing list and invalidate the cached built form.
            cond = getConditionalCE32ForCE32(oldCE32);
            cond->builtCE32 = Collation::NO_CE32;
        }
        // Build the context string: one unit with the prefix length,
        // then the prefix, then the part of s after its first code point.
        UnicodeString suffix(s, cLength);
        UnicodeString context((UChar)prefix.length());
        context.append(prefix).append(suffix);
        unsafeBackwardSet.addAll(suffix);
        // Walk the list, which is kept in ascending order of context strings,
        // to find the insertion point or an existing entry for the same context.
        for(;;) {
            // invariant: context > cond->context
            int32_t next = cond->next;
            if(next < 0) {
                // Append a new ConditionalCE32 after cond.
                int32_t index = addConditionalCE32(context, ce32, errorCode);
                if(U_FAILURE(errorCode)) { return; }
                cond->next = index;
                break;
            }
            ConditionalCE32 *nextCond = getConditionalCE32(next);
            int8_t cmp = context.compare(nextCond->context);
            if(cmp < 0) {
                // Insert a new ConditionalCE32 between cond and nextCond.
                int32_t index = addConditionalCE32(context, ce32, errorCode);
                if(U_FAILURE(errorCode)) { return; }
                cond->next = index;
                getConditionalCE32(index)->next = next;
                break;
            } else if(cmp == 0) {
                // Same context as before, overwrite its ce32.
                nextCond->ce32 = ce32;
                break;
            }
            cond = nextCond;
        }
    }
    modified = TRUE;
}
621 | |
622 | uint32_t |
623 | CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) { |
624 | uint32_t p = (uint32_t)(ce >> 32); |
625 | uint32_t lower32 = (uint32_t)ce; |
626 | uint32_t t = (uint32_t)(ce & 0xffff); |
627 | U_ASSERT((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. |
628 | if((ce & INT64_C(0xffff00ff00ff)) == 0) { |
629 | // normal form ppppsstt |
630 | return p | (lower32 >> 16) | (t >> 8); |
631 | } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) { |
632 | // long-primary form ppppppC1 |
633 | return Collation::makeLongPrimaryCE32(p); |
634 | } else if(p == 0 && (t & 0xff) == 0) { |
635 | // long-secondary form ssssttC2 |
636 | return Collation::makeLongSecondaryCE32(lower32); |
637 | } |
638 | return Collation::NO_CE32; |
639 | } |
640 | |
641 | uint32_t |
642 | CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) { |
643 | // Try to encode one CE as one CE32. |
644 | uint32_t ce32 = encodeOneCEAsCE32(ce); |
645 | if(ce32 != Collation::NO_CE32) { return ce32; } |
646 | int32_t index = addCE(ce, errorCode); |
647 | if(U_FAILURE(errorCode)) { return 0; } |
648 | if(index > Collation::MAX_INDEX) { |
649 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
650 | return 0; |
651 | } |
652 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1); |
653 | } |
654 | |
655 | uint32_t |
656 | CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, |
657 | UErrorCode &errorCode) { |
658 | if(U_FAILURE(errorCode)) { return 0; } |
659 | if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) { |
660 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
661 | return 0; |
662 | } |
663 | if(trie == NULL || utrie2_isFrozen(trie)) { |
664 | errorCode = U_INVALID_STATE_ERROR; |
665 | return 0; |
666 | } |
667 | if(cesLength == 0) { |
668 | // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. |
669 | // Do this here so that callers need not do it. |
670 | return encodeOneCEAsCE32(0); |
671 | } else if(cesLength == 1) { |
672 | return encodeOneCE(ces[0], errorCode); |
673 | } else if(cesLength == 2) { |
674 | // Try to encode two CEs as one CE32. |
675 | int64_t ce0 = ces[0]; |
676 | int64_t ce1 = ces[1]; |
677 | uint32_t p0 = (uint32_t)(ce0 >> 32); |
678 | if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE && |
679 | (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE && |
680 | p0 != 0) { |
681 | // Latin mini expansion |
682 | return |
683 | p0 | |
684 | (((uint32_t)ce0 & 0xff00u) << 8) | |
685 | (uint32_t)(ce1 >> 16) | |
686 | Collation::SPECIAL_CE32_LOW_BYTE | |
687 | Collation::LATIN_EXPANSION_TAG; |
688 | } |
689 | } |
690 | // Try to encode two or more CEs as CE32s. |
691 | int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH]; |
692 | for(int32_t i = 0;; ++i) { |
693 | if(i == cesLength) { |
694 | return encodeExpansion32(newCE32s, cesLength, errorCode); |
695 | } |
696 | uint32_t ce32 = encodeOneCEAsCE32(ces[i]); |
697 | if(ce32 == Collation::NO_CE32) { break; } |
698 | newCE32s[i] = (int32_t)ce32; |
699 | } |
700 | return encodeExpansion(ces, cesLength, errorCode); |
701 | } |
702 | |
703 | uint32_t |
704 | CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) { |
705 | if(U_FAILURE(errorCode)) { return 0; } |
706 | // See if this sequence of CEs has already been stored. |
707 | int64_t first = ces[0]; |
708 | int32_t ce64sMax = ce64s.size() - length; |
709 | for(int32_t i = 0; i <= ce64sMax; ++i) { |
710 | if(first == ce64s.elementAti(i)) { |
711 | if(i > Collation::MAX_INDEX) { |
712 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
713 | return 0; |
714 | } |
715 | for(int32_t j = 1;; ++j) { |
716 | if(j == length) { |
717 | return Collation::makeCE32FromTagIndexAndLength( |
718 | Collation::EXPANSION_TAG, i, length); |
719 | } |
720 | if(ce64s.elementAti(i + j) != ces[j]) { break; } |
721 | } |
722 | } |
723 | } |
724 | // Store the new sequence. |
725 | int32_t i = ce64s.size(); |
726 | if(i > Collation::MAX_INDEX) { |
727 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
728 | return 0; |
729 | } |
730 | for(int32_t j = 0; j < length; ++j) { |
731 | ce64s.addElement(ces[j], errorCode); |
732 | } |
733 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length); |
734 | } |
735 | |
736 | uint32_t |
737 | CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length, |
738 | UErrorCode &errorCode) { |
739 | if(U_FAILURE(errorCode)) { return 0; } |
740 | // See if this sequence of CE32s has already been stored. |
741 | int32_t first = newCE32s[0]; |
742 | int32_t ce32sMax = ce32s.size() - length; |
743 | for(int32_t i = 0; i <= ce32sMax; ++i) { |
744 | if(first == ce32s.elementAti(i)) { |
745 | if(i > Collation::MAX_INDEX) { |
746 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
747 | return 0; |
748 | } |
749 | for(int32_t j = 1;; ++j) { |
750 | if(j == length) { |
751 | return Collation::makeCE32FromTagIndexAndLength( |
752 | Collation::EXPANSION32_TAG, i, length); |
753 | } |
754 | if(ce32s.elementAti(i + j) != newCE32s[j]) { break; } |
755 | } |
756 | } |
757 | } |
758 | // Store the new sequence. |
759 | int32_t i = ce32s.size(); |
760 | if(i > Collation::MAX_INDEX) { |
761 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
762 | return 0; |
763 | } |
764 | for(int32_t j = 0; j < length; ++j) { |
765 | ce32s.addElement(newCE32s[j], errorCode); |
766 | } |
767 | return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length); |
768 | } |
769 | |
/**
 * Copies the base mapping ce32 for code point c into this builder's data structures
 * and returns the corresponding builder CE32.
 * Non-special CE32s and a few self-contained special forms are returned unchanged.
 * Expansions are re-encoded into the builder's arrays.
 * Prefix and contraction mappings are either flattened into a ConditionalCE32 list
 * (withContext is TRUE) or reduced to their default mapping (withContext is FALSE).
 *
 * Sets U_UNSUPPORTED_ERROR for a Hangul syllable CE32.
 *
 * @return the builder CE32; 0 if an error is detected on entry or while copying prefixes
 */
uint32_t
CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
                                       UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(!Collation::isSpecialCE32(ce32)) { return ce32; }
    switch(Collation::tagFromCE32(ce32)) {
    case Collation::LONG_PRIMARY_TAG:
    case Collation::LONG_SECONDARY_TAG:
    case Collation::LATIN_EXPANSION_TAG:
        // copy as is
        break;
    case Collation::EXPANSION32_TAG: {
        // Re-encode the base CE32 sequence in this builder's ce32s.
        const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);
        ce32 = encodeExpansion32(
            reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode);
        break;
    }
    case Collation::EXPANSION_TAG: {
        // Re-encode the base CE sequence in this builder's ce64s.
        const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);
        ce32 = encodeExpansion(baseCEs, length, errorCode);
        break;
    }
    case Collation::PREFIX_TAG: {
        // Flatten prefixes and nested suffixes (contractions)
        // into a linear list of ConditionalCE32.
        const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
        ce32 = CollationData::readCE32(p);  // Default if no prefix match.
        if(!withContext) {
            return copyFromBaseCE32(c, ce32, FALSE, errorCode);
        }
        // head is a temporary list head on the stack; only its next index is used.
        ConditionalCE32 head;
        // Context string for the no-prefix case: a single unit with prefix length 0.
        UnicodeString context((UChar)0);
        int32_t index;
        if(Collation::isContractionCE32(ce32)) {
            index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
        } else {
            ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
            head.next = index = addConditionalCE32(context, ce32, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        ConditionalCE32 *cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
        // Presumably p + 2 skips the units of the default CE32 read above — TODO confirm.
        UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
        while(prefixes.next(errorCode)) {
            // Turn the trie string into a context string:
            // reverse it and prepend one unit with its length.
            context = prefixes.getString();
            context.reverse();
            context.insert(0, (UChar)context.length());
            ce32 = (uint32_t)prefixes.getValue();
            if(Collation::isContractionCE32(ce32)) {
                index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode);
            } else {
                ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
                cond->next = index = addConditionalCE32(context, ce32, errorCode);
            }
            if(U_FAILURE(errorCode)) { return 0; }
            cond = getConditionalCE32(index);
        }
        ce32 = makeBuilderContextCE32(head.next);
        contextChars.add(c);
        break;
    }
    case Collation::CONTRACTION_TAG: {
        if(!withContext) {
            const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
            ce32 = CollationData::readCE32(p);  // Default if no suffix match.
            return copyFromBaseCE32(c, ce32, FALSE, errorCode);
        }
        // head is a temporary list head on the stack; only its next index is used.
        ConditionalCE32 head;
        UnicodeString context((UChar)0);
        copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
        ce32 = makeBuilderContextCE32(head.next);
        contextChars.add(c);
        break;
    }
    case Collation::HANGUL_TAG:
        errorCode = U_UNSUPPORTED_ERROR;  // We forbid tailoring of Hangul syllables.
        break;
    case Collation::OFFSET_TAG:
        // Resolve the offset CE32 against the base data.
        ce32 = getCE32FromOffsetCE32(TRUE, c, ce32);
        break;
    case Collation::IMPLICIT_TAG:
        ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
        break;
    default:
        UPRV_UNREACHABLE;  // require ce32 == base->getFinalCE32(ce32)
    }
    return ce32;
}
859 | |
860 | int32_t |
861 | CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, |
862 | ConditionalCE32 *cond, UErrorCode &errorCode) { |
863 | if(U_FAILURE(errorCode)) { return 0; } |
864 | const UChar *p = base->contexts + Collation::indexFromCE32(ce32); |
865 | int32_t index; |
866 | if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { |
867 | // No match on the single code point. |
868 | // We are underneath a prefix, and the default mapping is just |
869 | // a fallback to the mappings for a shorter prefix. |
870 | U_ASSERT(context.length() > 1); |
871 | index = -1; |
872 | } else { |
873 | ce32 = CollationData::readCE32(p); // Default if no suffix match. |
874 | U_ASSERT(!Collation::isContractionCE32(ce32)); |
875 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); |
876 | cond->next = index = addConditionalCE32(context, ce32, errorCode); |
877 | if(U_FAILURE(errorCode)) { return 0; } |
878 | cond = getConditionalCE32(index); |
879 | } |
880 | |
881 | int32_t suffixStart = context.length(); |
882 | UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); |
883 | while(suffixes.next(errorCode)) { |
884 | context.append(suffixes.getString()); |
885 | ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), TRUE, errorCode); |
886 | cond->next = index = addConditionalCE32(context, ce32, errorCode); |
887 | if(U_FAILURE(errorCode)) { return 0; } |
888 | // No need to update the unsafeBackwardSet because the tailoring set |
889 | // is already a copy of the base set. |
890 | cond = getConditionalCE32(index); |
891 | context.truncate(suffixStart); |
892 | } |
893 | U_ASSERT(index >= 0); |
894 | return index; |
895 | } |
896 | |
897 | class CopyHelper { |
898 | public: |
899 | CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d, |
900 | const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode) |
901 | : src(s), dest(d), modifier(m), |
902 | errorCode(initialErrorCode) {} |
903 | |
904 | UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) { |
905 | ce32 = copyCE32(ce32); |
906 | utrie2_setRange32(dest.trie, start, end, ce32, TRUE, &errorCode); |
907 | if(CollationDataBuilder::isBuilderContextCE32(ce32)) { |
908 | dest.contextChars.add(start, end); |
909 | } |
910 | return U_SUCCESS(errorCode); |
911 | } |
912 | |
913 | uint32_t copyCE32(uint32_t ce32) { |
914 | if(!Collation::isSpecialCE32(ce32)) { |
915 | int64_t ce = modifier.modifyCE32(ce32); |
916 | if(ce != Collation::NO_CE) { |
917 | ce32 = dest.encodeOneCE(ce, errorCode); |
918 | } |
919 | } else { |
920 | int32_t tag = Collation::tagFromCE32(ce32); |
921 | if(tag == Collation::EXPANSION32_TAG) { |
922 | const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer()); |
923 | srcCE32s += Collation::indexFromCE32(ce32); |
924 | int32_t length = Collation::lengthFromCE32(ce32); |
925 | // Inspect the source CE32s. Just copy them if none are modified. |
926 | // Otherwise copy to modifiedCEs, with modifications. |
927 | UBool isModified = FALSE; |
928 | for(int32_t i = 0; i < length; ++i) { |
929 | ce32 = srcCE32s[i]; |
930 | int64_t ce; |
931 | if(Collation::isSpecialCE32(ce32) || |
932 | (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) { |
933 | if(isModified) { |
934 | modifiedCEs[i] = Collation::ceFromCE32(ce32); |
935 | } |
936 | } else { |
937 | if(!isModified) { |
938 | for(int32_t j = 0; j < i; ++j) { |
939 | modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]); |
940 | } |
941 | isModified = TRUE; |
942 | } |
943 | modifiedCEs[i] = ce; |
944 | } |
945 | } |
946 | if(isModified) { |
947 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); |
948 | } else { |
949 | ce32 = dest.encodeExpansion32( |
950 | reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode); |
951 | } |
952 | } else if(tag == Collation::EXPANSION_TAG) { |
953 | const int64_t *srcCEs = src.ce64s.getBuffer(); |
954 | srcCEs += Collation::indexFromCE32(ce32); |
955 | int32_t length = Collation::lengthFromCE32(ce32); |
956 | // Inspect the source CEs. Just copy them if none are modified. |
957 | // Otherwise copy to modifiedCEs, with modifications. |
958 | UBool isModified = FALSE; |
959 | for(int32_t i = 0; i < length; ++i) { |
960 | int64_t srcCE = srcCEs[i]; |
961 | int64_t ce = modifier.modifyCE(srcCE); |
962 | if(ce == Collation::NO_CE) { |
963 | if(isModified) { |
964 | modifiedCEs[i] = srcCE; |
965 | } |
966 | } else { |
967 | if(!isModified) { |
968 | for(int32_t j = 0; j < i; ++j) { |
969 | modifiedCEs[j] = srcCEs[j]; |
970 | } |
971 | isModified = TRUE; |
972 | } |
973 | modifiedCEs[i] = ce; |
974 | } |
975 | } |
976 | if(isModified) { |
977 | ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); |
978 | } else { |
979 | ce32 = dest.encodeExpansion(srcCEs, length, errorCode); |
980 | } |
981 | } else if(tag == Collation::BUILDER_DATA_TAG) { |
982 | // Copy the list of ConditionalCE32. |
983 | ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32); |
984 | U_ASSERT(!cond->hasContext()); |
985 | int32_t destIndex = dest.addConditionalCE32( |
986 | cond->context, copyCE32(cond->ce32), errorCode); |
987 | ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex); |
988 | while(cond->next >= 0) { |
989 | cond = src.getConditionalCE32(cond->next); |
990 | ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex); |
991 | destIndex = dest.addConditionalCE32( |
992 | cond->context, copyCE32(cond->ce32), errorCode); |
993 | int32_t suffixStart = cond->prefixLength() + 1; |
994 | dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart)); |
995 | prevDestCond->next = destIndex; |
996 | } |
997 | } else { |
998 | // Just copy long CEs and Latin mini expansions (and other expected values) as is, |
999 | // assuming that the modifier would not modify them. |
1000 | U_ASSERT(tag == Collation::LONG_PRIMARY_TAG || |
1001 | tag == Collation::LONG_SECONDARY_TAG || |
1002 | tag == Collation::LATIN_EXPANSION_TAG || |
1003 | tag == Collation::HANGUL_TAG); |
1004 | } |
1005 | } |
1006 | return ce32; |
1007 | } |
1008 | |
1009 | const CollationDataBuilder &src; |
1010 | CollationDataBuilder &dest; |
1011 | const CollationDataBuilder::CEModifier &modifier; |
1012 | int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH]; |
1013 | UErrorCode errorCode; |
1014 | }; |
1015 | |
1016 | U_CDECL_BEGIN |
1017 | |
1018 | static UBool U_CALLCONV |
1019 | enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) { |
1020 | return |
1021 | value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 || |
1022 | ((CopyHelper *)context)->copyRangeCE32(start, end, value); |
1023 | } |
1024 | |
1025 | U_CDECL_END |
1026 | |
1027 | void |
1028 | CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, |
1029 | UErrorCode &errorCode) { |
1030 | if(U_FAILURE(errorCode)) { return; } |
1031 | if(trie == NULL || utrie2_isFrozen(trie)) { |
1032 | errorCode = U_INVALID_STATE_ERROR; |
1033 | return; |
1034 | } |
1035 | CopyHelper helper(src, *this, modifier, errorCode); |
1036 | utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper); |
1037 | errorCode = helper.errorCode; |
1038 | // Update the contextChars and the unsafeBackwardSet while copying, |
1039 | // in case a character had conditional mappings in the source builder |
1040 | // and they were removed later. |
1041 | modified |= src.modified; |
1042 | } |
1043 | |
1044 | void |
1045 | CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) { |
1046 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } |
1047 | UnicodeSetIterator iter(set); |
1048 | while(iter.next() && !iter.isString()) { |
1049 | UChar32 c = iter.getCodepoint(); |
1050 | uint32_t ce32 = utrie2_get32(trie, c); |
1051 | if(ce32 == Collation::FALLBACK_CE32) { |
1052 | ce32 = base->getFinalCE32(base->getCE32(c)); |
1053 | ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); |
1054 | utrie2_set32(trie, c, ce32, &errorCode); |
1055 | } |
1056 | } |
1057 | modified = TRUE; |
1058 | } |
1059 | |
1060 | void |
1061 | CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) { |
1062 | if(U_FAILURE(errorCode) || set.isEmpty()) { return; } |
1063 | UnicodeSetIterator iter(set); |
1064 | while(iter.next() && !iter.isString()) { |
1065 | UChar32 c = iter.getCodepoint(); |
1066 | uint32_t ce32 = utrie2_get32(trie, c); |
1067 | if(ce32 == Collation::FALLBACK_CE32) { |
1068 | ce32 = base->getFinalCE32(base->getCE32(c)); |
1069 | if(Collation::ce32HasContext(ce32)) { |
1070 | ce32 = copyFromBaseCE32(c, ce32, FALSE /* without context */, errorCode); |
1071 | utrie2_set32(trie, c, ce32, &errorCode); |
1072 | } |
1073 | } else if(isBuilderContextCE32(ce32)) { |
1074 | ce32 = getConditionalCE32ForCE32(ce32)->ce32; |
1075 | // Simply abandon the list of ConditionalCE32. |
1076 | // The caller will copy this builder in the end, |
1077 | // eliminating unreachable data. |
1078 | utrie2_set32(trie, c, ce32, &errorCode); |
1079 | contextChars.remove(c); |
1080 | } |
1081 | } |
1082 | modified = TRUE; |
1083 | } |
1084 | |
/**
 * Fills jamoCE32s[] with one CE32 per conjoining Jamo
 * (CollationData::JAMO_CE32S_LENGTH entries, indexed as for jamoCpFromIndex()).
 *
 * First pass: reads each Jamo's CE32 from this builder's trie, or from the base
 * when the trie value is FALLBACK_CE32. Base mappings that would need copying
 * (expansions, contexts, implicit) are left as FALLBACK_CE32 placeholders.
 * Second pass: only if some Jamo is assigned and a copy is needed,
 * replaces those placeholders via copyFromBaseCE32().
 *
 * @param jamoCE32s output array; written even when the function returns FALSE
 * @param errorCode standard ICU in/out error code;
 *                  set to U_INTERNAL_PROGRAM_ERROR for an unexpected CE32 tag
 * @return TRUE if there is no base or at least one Jamo has an assigned CE32 in the trie,
 *         and no error occurred; FALSE otherwise (including on failure)
 */
UBool
CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    UBool anyJamoAssigned = base == NULL;  // always set jamoCE32s in the base data
    UBool needToCopyFromBase = FALSE;
    for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
        UChar32 jamo = jamoCpFromIndex(j);
        UBool fromBase = FALSE;
        uint32_t ce32 = utrie2_get32(trie, jamo);
        anyJamoAssigned |= Collation::isAssignedCE32(ce32);
        // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
        // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
        if(ce32 == Collation::FALLBACK_CE32) {
            fromBase = TRUE;
            ce32 = base->getCE32(jamo);
        }
        if(Collation::isSpecialCE32(ce32)) {
            switch(Collation::tagFromCE32(ce32)) {
            case Collation::LONG_PRIMARY_TAG:
            case Collation::LONG_SECONDARY_TAG:
            case Collation::LATIN_EXPANSION_TAG:
                // Copy the ce32 as-is.
                break;
            case Collation::EXPANSION32_TAG:
            case Collation::EXPANSION_TAG:
            case Collation::PREFIX_TAG:
            case Collation::CONTRACTION_TAG:
                // These CE32s carry an index into data arrays;
                // a value taken from the base cannot be stored here unchanged.
                if(fromBase) {
                    // Defer copying until we know if anyJamoAssigned.
                    ce32 = Collation::FALLBACK_CE32;
                    needToCopyFromBase = TRUE;
                }
                break;
            case Collation::IMPLICIT_TAG:
                // An unassigned Jamo should only occur in tests with incomplete bases.
                U_ASSERT(fromBase);
                ce32 = Collation::FALLBACK_CE32;
                needToCopyFromBase = TRUE;
                break;
            case Collation::OFFSET_TAG:
                ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
                break;
            case Collation::FALLBACK_TAG:
            case Collation::RESERVED_TAG_3:
            case Collation::BUILDER_DATA_TAG:
            case Collation::DIGIT_TAG:
            case Collation::U0000_TAG:
            case Collation::HANGUL_TAG:
            case Collation::LEAD_SURROGATE_TAG:
                // None of these tags is handled for a Jamo mapping here.
                errorCode = U_INTERNAL_PROGRAM_ERROR;
                return FALSE;
            }
        }
        jamoCE32s[j] = ce32;
    }
    if(anyJamoAssigned && needToCopyFromBase) {
        // Resolve the placeholders left by the first pass.
        for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {
            if(jamoCE32s[j] == Collation::FALLBACK_CE32) {
                UChar32 jamo = jamoCpFromIndex(j);
                jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo),
                                                /*withContext=*/ TRUE, errorCode);
            }
        }
    }
    return anyJamoAssigned && U_SUCCESS(errorCode);
}
1151 | |
1152 | void |
1153 | CollationDataBuilder::setDigitTags(UErrorCode &errorCode) { |
1154 | UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]" ), errorCode); |
1155 | if(U_FAILURE(errorCode)) { return; } |
1156 | UnicodeSetIterator iter(digits); |
1157 | while(iter.next()) { |
1158 | U_ASSERT(!iter.isString()); |
1159 | UChar32 c = iter.getCodepoint(); |
1160 | uint32_t ce32 = utrie2_get32(trie, c); |
1161 | if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) { |
1162 | int32_t index = addCE32(ce32, errorCode); |
1163 | if(U_FAILURE(errorCode)) { return; } |
1164 | if(index > Collation::MAX_INDEX) { |
1165 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
1166 | return; |
1167 | } |
1168 | ce32 = Collation::makeCE32FromTagIndexAndLength( |
1169 | Collation::DIGIT_TAG, index, u_charDigitValue(c)); |
1170 | utrie2_set32(trie, c, ce32, &errorCode); |
1171 | } |
1172 | } |
1173 | } |
1174 | |
1175 | U_CDECL_BEGIN |
1176 | |
1177 | static UBool U_CALLCONV |
1178 | enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { |
1179 | int32_t *pValue = (int32_t *)context; |
1180 | if(value == Collation::UNASSIGNED_CE32) { |
1181 | value = Collation::LEAD_ALL_UNASSIGNED; |
1182 | } else if(value == Collation::FALLBACK_CE32) { |
1183 | value = Collation::LEAD_ALL_FALLBACK; |
1184 | } else { |
1185 | *pValue = Collation::LEAD_MIXED; |
1186 | return FALSE; |
1187 | } |
1188 | if(*pValue < 0) { |
1189 | *pValue = (int32_t)value; |
1190 | } else if(*pValue != (int32_t)value) { |
1191 | *pValue = Collation::LEAD_MIXED; |
1192 | return FALSE; |
1193 | } |
1194 | return TRUE; |
1195 | } |
1196 | |
1197 | U_CDECL_END |
1198 | |
1199 | void |
1200 | CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) { |
1201 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead) { |
1202 | int32_t value = -1; |
1203 | utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value); |
1204 | utrie2_set32ForLeadSurrogateCodeUnit( |
1205 | trie, lead, |
1206 | Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value, |
1207 | &errorCode); |
1208 | } |
1209 | } |
1210 | |
1211 | void |
1212 | CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) { |
1213 | buildMappings(data, errorCode); |
1214 | if(base != NULL) { |
1215 | data.numericPrimary = base->numericPrimary; |
1216 | data.compressibleBytes = base->compressibleBytes; |
1217 | data.numScripts = base->numScripts; |
1218 | data.scriptsIndex = base->scriptsIndex; |
1219 | data.scriptStarts = base->scriptStarts; |
1220 | data.scriptStartsLength = base->scriptStartsLength; |
1221 | } |
1222 | buildFastLatinTable(data, errorCode); |
1223 | } |
1224 | |
1225 | void |
1226 | CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) { |
1227 | if(U_FAILURE(errorCode)) { return; } |
1228 | if(trie == NULL || utrie2_isFrozen(trie)) { |
1229 | errorCode = U_INVALID_STATE_ERROR; |
1230 | return; |
1231 | } |
1232 | |
1233 | buildContexts(errorCode); |
1234 | |
1235 | uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; |
1236 | int32_t jamoIndex = -1; |
1237 | if(getJamoCE32s(jamoCE32s, errorCode)) { |
1238 | jamoIndex = ce32s.size(); |
1239 | for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) { |
1240 | ce32s.addElement((int32_t)jamoCE32s[i], errorCode); |
1241 | } |
1242 | // Small optimization: Use a bit in the Hangul ce32 |
1243 | // to indicate that none of the Jamo CE32s are isSpecialCE32() |
1244 | // (as it should be in the root collator). |
1245 | // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. |
1246 | // In order to still have good trie compression and keep this code simple, |
1247 | // we only set this flag if a whole block of 588 Hangul syllables starting with |
1248 | // a common leading consonant (Jamo L) has this property. |
1249 | UBool isAnyJamoVTSpecial = FALSE; |
1250 | for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) { |
1251 | if(Collation::isSpecialCE32(jamoCE32s[i])) { |
1252 | isAnyJamoVTSpecial = TRUE; |
1253 | break; |
1254 | } |
1255 | } |
1256 | uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); |
1257 | UChar32 c = Hangul::HANGUL_BASE; |
1258 | for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L |
1259 | uint32_t ce32 = hangulCE32; |
1260 | if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) { |
1261 | ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO; |
1262 | } |
1263 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; |
1264 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); |
1265 | c = limit; |
1266 | } |
1267 | } else { |
1268 | // Copy the Hangul CE32s from the base in blocks per Jamo L, |
1269 | // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. |
1270 | for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) { |
1271 | uint32_t ce32 = base->getCE32(c); |
1272 | U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG)); |
1273 | UChar32 limit = c + Hangul::JAMO_VT_COUNT; |
1274 | utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); |
1275 | c = limit; |
1276 | } |
1277 | } |
1278 | |
1279 | setDigitTags(errorCode); |
1280 | setLeadSurrogates(errorCode); |
1281 | |
1282 | // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. |
1283 | ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0); |
1284 | utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); |
1285 | |
1286 | utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode); |
1287 | if(U_FAILURE(errorCode)) { return; } |
1288 | |
1289 | // Mark each lead surrogate as "unsafe" |
1290 | // if any of its 1024 associated supplementary code points is "unsafe". |
1291 | UChar32 c = 0x10000; |
1292 | for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { |
1293 | if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { |
1294 | unsafeBackwardSet.add(lead); |
1295 | } |
1296 | } |
1297 | unsafeBackwardSet.freeze(); |
1298 | |
1299 | data.trie = trie; |
1300 | data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer()); |
1301 | data.ces = ce64s.getBuffer(); |
1302 | data.contexts = contexts.getBuffer(); |
1303 | |
1304 | data.ce32sLength = ce32s.size(); |
1305 | data.cesLength = ce64s.size(); |
1306 | data.contextsLength = contexts.length(); |
1307 | |
1308 | data.base = base; |
1309 | if(jamoIndex >= 0) { |
1310 | data.jamoCE32s = data.ce32s + jamoIndex; |
1311 | } else { |
1312 | data.jamoCE32s = base->jamoCE32s; |
1313 | } |
1314 | data.unsafeBackwardSet = &unsafeBackwardSet; |
1315 | } |
1316 | |
1317 | void |
1318 | CollationDataBuilder::clearContexts() { |
1319 | contexts.remove(); |
1320 | UnicodeSetIterator iter(contextChars); |
1321 | while(iter.next()) { |
1322 | U_ASSERT(!iter.isString()); |
1323 | uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint()); |
1324 | U_ASSERT(isBuilderContextCE32(ce32)); |
1325 | getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32; |
1326 | } |
1327 | } |
1328 | |
1329 | void |
1330 | CollationDataBuilder::buildContexts(UErrorCode &errorCode) { |
1331 | if(U_FAILURE(errorCode)) { return; } |
1332 | // Ignore abandoned lists and the cached builtCE32, |
1333 | // and build all contexts from scratch. |
1334 | contexts.remove(); |
1335 | UnicodeSetIterator iter(contextChars); |
1336 | while(U_SUCCESS(errorCode) && iter.next()) { |
1337 | U_ASSERT(!iter.isString()); |
1338 | UChar32 c = iter.getCodepoint(); |
1339 | uint32_t ce32 = utrie2_get32(trie, c); |
1340 | if(!isBuilderContextCE32(ce32)) { |
1341 | // Impossible: No context data for c in contextChars. |
1342 | errorCode = U_INTERNAL_PROGRAM_ERROR; |
1343 | return; |
1344 | } |
1345 | ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32); |
1346 | ce32 = buildContext(cond, errorCode); |
1347 | utrie2_set32(trie, c, ce32, &errorCode); |
1348 | } |
1349 | } |
1350 | |
/**
 * Builds the runtime context data for one list of ConditionalCE32.
 *
 * Walks the list in groups of items that share the same prefix.
 * For each group, either takes the single CE32 (no contraction suffix) or builds
 * a contraction trie and a CONTRACTION_TAG CE32 for it. Groups with a non-empty
 * prefix are added to a prefix trie, and the result is then a PREFIX_TAG CE32.
 *
 * @param head the list head; must have no context and at least one following item
 * @param errorCode standard ICU in/out error code;
 *                  set to U_BUFFER_OVERFLOW_ERROR if a context index exceeds Collation::MAX_INDEX
 * @return the CE32 for the whole list: the prefix-less group's CE32 if there are
 *         no non-empty prefixes, otherwise a PREFIX_TAG CE32; 0 on failure
 */
uint32_t
CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    // The list head must have no context.
    U_ASSERT(!head->hasContext());
    // The list head must be followed by one or more nodes that all do have context.
    U_ASSERT(head->next >= 0);
    UCharsTrieBuilder prefixBuilder(errorCode);
    UCharsTrieBuilder contractionBuilder(errorCode);
    // Each iteration handles one group of items with the same prefix.
    // At the end of the loop body, cond is the last item of the group.
    for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
        // After the list head, the prefix or suffix can be empty, but not both.
        U_ASSERT(cond == head || cond->hasContext());
        int32_t prefixLength = cond->prefixLength();
        // The prefix part of the context string, including its leading length unit.
        UnicodeString prefix(cond->context, 0, prefixLength + 1);
        // Collect all contraction suffixes for one prefix.
        ConditionalCE32 *firstCond = cond;
        ConditionalCE32 *lastCond = cond;
        while(cond->next >= 0 &&
                (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) {
            lastCond = cond;
        }
        uint32_t ce32;
        int32_t suffixStart = prefixLength + 1;  // == prefix.length()
        if(lastCond->context.length() == suffixStart) {
            // One prefix without contraction suffix.
            U_ASSERT(firstCond == lastCond);
            ce32 = lastCond->ce32;
            cond = lastCond;
        } else {
            // Build the contractions trie.
            contractionBuilder.clear();
            // Entry for an empty suffix, to be stored before the trie.
            uint32_t emptySuffixCE32 = 0;
            uint32_t flags = 0;
            if(firstCond->context.length() == suffixStart) {
                // There is a mapping for the prefix and the single character c. (p|c)
                // If no other suffix matches, then we return this value.
                emptySuffixCE32 = firstCond->ce32;
                cond = getConditionalCE32(firstCond->next);
            } else {
                // There is no mapping for the prefix and just the single character.
                // (There is no p|c, only p|cd, p|ce etc.)
                flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
                // When the prefix matches but none of the prefix-specific suffixes,
                // then we fall back to the mappings with the next-longest prefix,
                // and ultimately to mappings with no prefix.
                // Each fallback might be another set of contractions.
                // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
                // then in text "pch" we find the ch contraction.
                for(cond = head;; cond = getConditionalCE32(cond->next)) {
                    int32_t length = cond->prefixLength();
                    if(length == prefixLength) { break; }
                    // Use the defaultCE32 of a group whose prefix is a suffix of this prefix;
                    // a later match in list order overrides an earlier one.
                    if(cond->defaultCE32 != Collation::NO_CE32 &&
                            (length==0 || prefix.endsWith(cond->context, 1, length))) {
                        emptySuffixCE32 = cond->defaultCE32;
                    }
                }
                cond = firstCond;
            }
            // Optimization: Set a flag when
            // the first character of every contraction suffix has lccc!=0.
            // Short-circuits contraction matching when a normal letter follows.
            flags |= Collation::CONTRACT_NEXT_CCC;
            // Add all of the non-empty suffixes into the contraction trie.
            for(;;) {
                UnicodeString suffix(cond->context, suffixStart);
                uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0));
                if(fcd16 <= 0xff) {
                    // The upper byte of the FCD16 value is zero for this first character.
                    flags &= ~Collation::CONTRACT_NEXT_CCC;
                }
                fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
                if(fcd16 > 0xff) {
                    // The last suffix character has lccc!=0, allowing for discontiguous contractions.
                    flags |= Collation::CONTRACT_TRAILING_CCC;
                }
                contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode);
                if(cond == lastCond) { break; }
                cond = getConditionalCE32(cond->next);
            }
            int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
            if(U_FAILURE(errorCode)) { return 0; }
            if(index > Collation::MAX_INDEX) {
                errorCode = U_BUFFER_OVERFLOW_ERROR;
                return 0;
            }
            ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags;
        }
        U_ASSERT(cond == lastCond);
        // Remember the group's result; longer prefixes may fall back to it (see above).
        firstCond->defaultCE32 = ce32;
        if(prefixLength == 0) {
            if(cond->next < 0) {
                // No non-empty prefixes, only contractions.
                return ce32;
            }
        } else {
            prefix.remove(0, 1);  // Remove the length unit.
            // The context string holds the prefix reversed
            // (copyFromBaseCE32() reverses prefixes when reading them from a trie).
            prefix.reverse();
            prefixBuilder.add(prefix, (int32_t)ce32, errorCode);
            if(cond->next < 0) { break; }
        }
    }
    U_ASSERT(head->defaultCE32 != Collation::NO_CE32);
    int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    if(index > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index);
}
1461 | |
1462 | int32_t |
1463 | CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, |
1464 | UErrorCode &errorCode) { |
1465 | UnicodeString context; |
1466 | context.append((UChar)(defaultCE32 >> 16)).append((UChar)defaultCE32); |
1467 | UnicodeString trieString; |
1468 | context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode)); |
1469 | if(U_FAILURE(errorCode)) { return -1; } |
1470 | int32_t index = contexts.indexOf(context); |
1471 | if(index < 0) { |
1472 | index = contexts.length(); |
1473 | contexts.append(context); |
1474 | } |
1475 | return index; |
1476 | } |
1477 | |
1478 | void |
1479 | CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) { |
1480 | if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; } |
1481 | |
1482 | delete fastLatinBuilder; |
1483 | fastLatinBuilder = new CollationFastLatinBuilder(errorCode); |
1484 | if(fastLatinBuilder == NULL) { |
1485 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
1486 | return; |
1487 | } |
1488 | if(fastLatinBuilder->forData(data, errorCode)) { |
1489 | const uint16_t *table = fastLatinBuilder->getTable(); |
1490 | int32_t length = fastLatinBuilder->lengthOfTable(); |
1491 | if(base != NULL && length == base->fastLatinTableLength && |
1492 | uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) { |
1493 | // Same fast Latin table as in the base, use that one instead. |
1494 | delete fastLatinBuilder; |
1495 | fastLatinBuilder = NULL; |
1496 | table = base->fastLatinTable; |
1497 | } |
1498 | data.fastLatinTable = table; |
1499 | data.fastLatinTableLength = length; |
1500 | } else { |
1501 | delete fastLatinBuilder; |
1502 | fastLatinBuilder = NULL; |
1503 | } |
1504 | } |
1505 | |
1506 | int32_t |
1507 | CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) { |
1508 | return getCEs(s, 0, ces, cesLength); |
1509 | } |
1510 | |
1511 | int32_t |
1512 | CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s, |
1513 | int64_t ces[], int32_t cesLength) { |
1514 | int32_t prefixLength = prefix.length(); |
1515 | if(prefixLength == 0) { |
1516 | return getCEs(s, 0, ces, cesLength); |
1517 | } else { |
1518 | return getCEs(prefix + s, prefixLength, ces, cesLength); |
1519 | } |
1520 | } |
1521 | |
1522 | int32_t |
1523 | CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start, |
1524 | int64_t ces[], int32_t cesLength) { |
1525 | if(collIter == NULL) { |
1526 | collIter = new DataBuilderCollationIterator(*this); |
1527 | if(collIter == NULL) { return 0; } |
1528 | } |
1529 | return collIter->fetchCEs(s, start, ces, cesLength); |
1530 | } |
1531 | |
1532 | U_NAMESPACE_END |
1533 | |
1534 | #endif // !UCONFIG_NO_COLLATION |
1535 | |