1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2009-2014, International Business Machines Corporation and
6* others. All Rights Reserved.
7*******************************************************************************
8*/
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_COLLATION
13
14#include "unicode/alphaindex.h"
15#include "unicode/coll.h"
16#include "unicode/localpointer.h"
17#include "unicode/normalizer2.h"
18#include "unicode/tblcoll.h"
19#include "unicode/uchar.h"
20#include "unicode/ulocdata.h"
21#include "unicode/uniset.h"
22#include "unicode/uobject.h"
23#include "unicode/usetiter.h"
24#include "unicode/utf16.h"
25
26#include "cmemory.h"
27#include "cstring.h"
28#include "uassert.h"
29#include "uvector.h"
30#include "uvectr64.h"
31
32//#include <string>
33//#include <iostream>
34
35U_NAMESPACE_BEGIN
36
37namespace {
38
39/**
40 * Prefix string for Chinese index buckets.
41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
42 */
43const UChar BASE[1] = { 0xFDD0 };
44const int32_t BASE_LENGTH = 1;
45
46UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
47 const UnicodeString &one, const UnicodeString &other);
48
49} // namespace
50
51static int32_t U_CALLCONV
52collatorComparator(const void *context, const void *left, const void *right);
53
54static int32_t U_CALLCONV
55recordCompareFn(const void *context, const void *left, const void *right);
56
57// UVector<Record *> support function, delete a Record.
58static void U_CALLCONV
59alphaIndex_deleteRecord(void *obj) {
60 delete static_cast<AlphabeticIndex::Record *>(obj);
61}
62
63namespace {
64
65UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
66 UErrorCode &errorCode) {
67 if (U_FAILURE(errorCode)) { return NULL; }
68 if (owned.isValid()) {
69 return owned.orphan();
70 }
71 UnicodeString *p = new UnicodeString(s);
72 if (p == NULL) {
73 errorCode = U_MEMORY_ALLOCATION_ERROR;
74 }
75 return p;
76}
77
78inline UnicodeString *getString(const UVector &list, int32_t i) {
79 return static_cast<UnicodeString *>(list[i]);
80}
81
82inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
83 return static_cast<AlphabeticIndex::Bucket *>(list[i]);
84}
85
86inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
87 return static_cast<AlphabeticIndex::Record *>(list[i]);
88}
89
90/**
91 * Like Java Collections.binarySearch(List, String, Comparator).
92 *
93 * @return the index>=0 where the item was found,
94 * or the index<0 for inserting the string at ~index in sorted order
95 */
96int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
97 if (list.size() == 0) { return ~0; }
98 int32_t start = 0;
99 int32_t limit = list.size();
100 for (;;) {
101 int32_t i = (start + limit) / 2;
102 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
103 UErrorCode errorCode = U_ZERO_ERROR;
104 UCollationResult cmp = coll.compare(s, *si, errorCode);
105 if (cmp == UCOL_EQUAL) {
106 return i;
107 } else if (cmp < 0) {
108 if (i == start) {
109 return ~start; // insert s before *si
110 }
111 limit = i;
112 } else {
113 if (i == start) {
114 return ~(start + 1); // insert s after *si
115 }
116 start = i;
117 }
118 }
119}
120
121} // namespace
122
123// The BucketList is not in the anonymous namespace because only Clang
124// seems to support its use in other classes from there.
125// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
126class BucketList : public UObject {
127public:
128 BucketList(UVector *bucketList, UVector *publicBucketList)
129 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
130 int32_t displayIndex = 0;
131 for (int32_t i = 0; i < publicBucketList->size(); ++i) {
132 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
133 }
134 }
135
136 // The virtual destructor must not be inline.
137 // See ticket #8454 for details.
138 virtual ~BucketList();
139
140 int32_t getBucketCount() const {
141 return immutableVisibleList_->size();
142 }
143
144 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
145 UErrorCode &errorCode) {
146 // binary search
147 int32_t start = 0;
148 int32_t limit = bucketList_->size();
149 while ((start + 1) < limit) {
150 int32_t i = (start + limit) / 2;
151 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
152 UCollationResult nameVsBucket =
153 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
154 if (nameVsBucket < 0) {
155 limit = i;
156 } else {
157 start = i;
158 }
159 }
160 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
161 if (bucket->displayBucket_ != NULL) {
162 bucket = bucket->displayBucket_;
163 }
164 return bucket->displayIndex_;
165 }
166
167 /** All of the buckets, visible and invisible. */
168 UVector *bucketList_;
169 /** Just the visible buckets. */
170 UVector *immutableVisibleList_;
171};
172
173BucketList::~BucketList() {
174 delete bucketList_;
175 if (immutableVisibleList_ != bucketList_) {
176 delete immutableVisibleList_;
177 }
178}
179
180AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
181 delete buckets_;
182 delete collatorPrimaryOnly_;
183}
184
185int32_t
186AlphabeticIndex::ImmutableIndex::getBucketCount() const {
187 return buckets_->getBucketCount();
188}
189
190int32_t
191AlphabeticIndex::ImmutableIndex::getBucketIndex(
192 const UnicodeString &name, UErrorCode &errorCode) const {
193 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
194}
195
196const AlphabeticIndex::Bucket *
197AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
198 if (0 <= index && index < buckets_->getBucketCount()) {
199 return icu::getBucket(*buckets_->immutableVisibleList_, index);
200 } else {
201 return NULL;
202 }
203}
204
205AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
206 : inputList_(NULL),
207 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
208 maxLabelCount_(99),
209 initialLabels_(NULL), firstCharsInScripts_(NULL),
210 collator_(NULL), collatorPrimaryOnly_(NULL),
211 buckets_(NULL) {
212 init(&locale, status);
213}
214
215
216AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
217 : inputList_(NULL),
218 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
219 maxLabelCount_(99),
220 initialLabels_(NULL), firstCharsInScripts_(NULL),
221 collator_(collator), collatorPrimaryOnly_(NULL),
222 buckets_(NULL) {
223 init(NULL, status);
224}
225
226
227
228AlphabeticIndex::~AlphabeticIndex() {
229 delete collator_;
230 delete collatorPrimaryOnly_;
231 delete firstCharsInScripts_;
232 delete buckets_;
233 delete inputList_;
234 delete initialLabels_;
235}
236
237
238AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 return *this;
241 }
242 initialLabels_->addAll(additions);
243 clearBuckets();
244 return *this;
245}
246
247
248AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
249 addIndexExemplars(locale, status);
250 clearBuckets();
251 return *this;
252}
253
254
255AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
256 if (U_FAILURE(errorCode)) { return NULL; }
257 // In C++, the ImmutableIndex must own its copy of the BucketList,
258 // even if it contains no records, for proper memory management.
259 // We could clone the buckets_ if they are not NULL,
260 // but that would be worth it only if this method is called multiple times,
261 // or called after using the old-style bucket iterator API.
262 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
263 LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone());
264 if (immutableBucketList.isNull() || coll.isNull()) {
265 errorCode = U_MEMORY_ALLOCATION_ERROR;
266 return NULL;
267 }
268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
269 if (immIndex == NULL) {
270 errorCode = U_MEMORY_ALLOCATION_ERROR;
271 return NULL;
272 }
273 // The ImmutableIndex adopted its parameter objects.
274 immutableBucketList.orphan();
275 coll.orphan();
276 return immIndex;
277}
278
279int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
280 initBuckets(status);
281 if (U_FAILURE(status)) {
282 return 0;
283 }
284 return buckets_->getBucketCount();
285}
286
287
288int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
289 if (U_FAILURE(status) || inputList_ == NULL) {
290 return 0;
291 }
292 return inputList_->size();
293}
294
295void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
297 if (U_FAILURE(errorCode)) { return; }
298
299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
300 const UnicodeString &overflowBoundary =
301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
302
303 // We make a sorted array of elements.
304 // Some of the input may be redundant.
305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306 // We filter out those cases.
307 UnicodeSetIterator iter(*initialLabels_);
308 while (iter.next()) {
309 const UnicodeString *item = &iter.getString();
310 LocalPointer<UnicodeString> ownedItem;
311 UBool checkDistinct;
312 int32_t itemLength = item->length();
313 if (!item->hasMoreChar32Than(0, itemLength, 1)) {
314 checkDistinct = FALSE;
315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*'
316 item->charAt(itemLength - 2) != 0x2a) {
317 // Use a label if it is marked with one trailing star,
318 // even if the label string sorts the same when all contractions are suppressed.
319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
320 item = ownedItem.getAlias();
321 if (item == NULL) {
322 errorCode = U_MEMORY_ALLOCATION_ERROR;
323 return;
324 }
325 checkDistinct = FALSE;
326 } else {
327 checkDistinct = TRUE;
328 }
329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
330 // Ignore a primary-ignorable or non-alphabetic index character.
331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
332 // Ignore an index character that will land in the overflow bucket.
333 } else if (checkDistinct &&
334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
335 // Ignore a multi-code point index character that does not sort distinctly
336 // from the sequence of its separate characters.
337 } else {
338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
339 if (insertionPoint < 0) {
340 indexCharacters.insertElementAt(
341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
342 } else {
343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
345 indexCharacters.setElementAt(
346 ownedString(*item, ownedItem, errorCode), insertionPoint);
347 }
348 }
349 }
350 }
351 if (U_FAILURE(errorCode)) { return; }
352
353 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
354
355 int32_t size = indexCharacters.size() - 1;
356 if (size > maxLabelCount_) {
357 int32_t count = 0;
358 int32_t old = -1;
359 for (int32_t i = 0; i < indexCharacters.size();) {
360 ++count;
361 int32_t bump = count * maxLabelCount_ / size;
362 if (bump == old) {
363 indexCharacters.removeElementAt(i);
364 } else {
365 old = bump;
366 ++i;
367 }
368 }
369 }
370}
371
372namespace {
373
374const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
375 if (!current.startsWith(BASE, BASE_LENGTH)) {
376 return current;
377 }
378 UChar rest = current.charAt(BASE_LENGTH);
379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
380 int32_t count = rest-0x2800;
381 temp.setTo((UChar)(0x30 + count % 10));
382 if (count >= 10) {
383 count /= 10;
384 temp.insert(0, (UChar)(0x30 + count % 10));
385 if (count >= 10) {
386 count /= 10;
387 temp.insert(0, (UChar)(0x30 + count));
388 }
389 }
390 return temp.append((UChar)0x5283);
391 }
392 return temp.setTo(current, BASE_LENGTH);
393}
394
395UBool hasMultiplePrimaryWeights(
396 const RuleBasedCollator &coll, uint32_t variableTop,
397 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
398 ces.removeAllElements();
399 coll.internalGetCEs(s, ces, errorCode);
400 if (U_FAILURE(errorCode)) { return FALSE; }
401 UBool seenPrimary = FALSE;
402 for (int32_t i = 0; i < ces.size(); ++i) {
403 int64_t ce = ces.elementAti(i);
404 uint32_t p = (uint32_t)(ce >> 32);
405 if (p > variableTop) {
406 // not primary ignorable
407 if (seenPrimary) {
408 return TRUE;
409 }
410 seenPrimary = TRUE;
411 }
412 }
413 return FALSE;
414}
415
416} // namespace
417
418BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
419 // Initialize indexCharacters.
420 UVector indexCharacters(errorCode);
421 indexCharacters.setDeleter(uprv_deleteUObject);
422 initLabels(indexCharacters, errorCode);
423 if (U_FAILURE(errorCode)) { return NULL; }
424
425 // Variables for hasMultiplePrimaryWeights().
426 UVector64 ces(errorCode);
427 uint32_t variableTop;
428 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
429 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
430 } else {
431 variableTop = 0;
432 }
433 UBool hasInvisibleBuckets = FALSE;
434
435 // Helper arrays for Chinese Pinyin collation.
436 Bucket *asciiBuckets[26] = {
437 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
438 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
439 };
440 Bucket *pinyinBuckets[26] = {
441 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
442 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
443 };
444 UBool hasPinyin = FALSE;
445
446 LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode);
447 if (U_FAILURE(errorCode)) {
448 return NULL;
449 }
450 bucketList->setDeleter(uprv_deleteUObject);
451
452 // underflow bucket
453 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
454 if (bucket == NULL) {
455 errorCode = U_MEMORY_ALLOCATION_ERROR;
456 return NULL;
457 }
458 bucketList->addElement(bucket, errorCode);
459 if (U_FAILURE(errorCode)) { return NULL; }
460
461 UnicodeString temp;
462
463 // fix up the list, adding underflow, additions, overflow
464 // Insert inflow labels as needed.
465 int32_t scriptIndex = -1;
466 const UnicodeString *scriptUpperBoundary = &emptyString_;
467 for (int32_t i = 0; i < indexCharacters.size(); ++i) {
468 UnicodeString &current = *getString(indexCharacters, i);
469 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
470 // We crossed the script boundary into a new script.
471 const UnicodeString &inflowBoundary = *scriptUpperBoundary;
472 UBool skippedScript = FALSE;
473 for (;;) {
474 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
475 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
476 break;
477 }
478 skippedScript = TRUE;
479 }
480 if (skippedScript && bucketList->size() > 1) {
481 // We are skipping one or more scripts,
482 // and we are not just getting out of the underflow label.
483 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
484 if (bucket == NULL) {
485 errorCode = U_MEMORY_ALLOCATION_ERROR;
486 return NULL;
487 }
488 bucketList->addElement(bucket, errorCode);
489 }
490 }
491 // Add a bucket with the current label.
492 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
493 if (bucket == NULL) {
494 errorCode = U_MEMORY_ALLOCATION_ERROR;
495 return NULL;
496 }
497 bucketList->addElement(bucket, errorCode);
498 // Remember ASCII and Pinyin buckets for Pinyin redirects.
499 UChar c;
500 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
501 asciiBuckets[c - 0x41] = bucket;
502 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
503 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
504 pinyinBuckets[c - 0x41] = bucket;
505 hasPinyin = TRUE;
506 }
507 // Check for multiple primary weights.
508 if (!current.startsWith(BASE, BASE_LENGTH) &&
509 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
510 ces, errorCode) &&
511 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
512 // "AE-ligature" or "Sch" etc.
513 for (int32_t j = bucketList->size() - 2;; --j) {
514 Bucket *singleBucket = getBucket(*bucketList, j);
515 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
516 // There is no single-character bucket since the last
517 // underflow or inflow label.
518 break;
519 }
520 if (singleBucket->displayBucket_ == NULL &&
521 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
522 singleBucket->lowerBoundary_,
523 ces, errorCode)) {
524 // Add an invisible bucket that redirects strings greater than the expansion
525 // to the previous single-character bucket.
526 // For example, after ... Q R S Sch we add Sch\uFFFF->S
527 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
528 bucket = new Bucket(emptyString_,
529 UnicodeString(current).append((UChar)0xFFFF),
530 U_ALPHAINDEX_NORMAL);
531 if (bucket == NULL) {
532 errorCode = U_MEMORY_ALLOCATION_ERROR;
533 return NULL;
534 }
535 bucket->displayBucket_ = singleBucket;
536 bucketList->addElement(bucket, errorCode);
537 hasInvisibleBuckets = TRUE;
538 break;
539 }
540 }
541 }
542 }
543 if (U_FAILURE(errorCode)) { return NULL; }
544 if (bucketList->size() == 1) {
545 // No real labels, show only the underflow label.
546 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
547 if (bl == NULL) {
548 errorCode = U_MEMORY_ALLOCATION_ERROR;
549 return NULL;
550 }
551 bucketList.orphan();
552 return bl;
553 }
554 // overflow bucket
555 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
556 if (bucket == NULL) {
557 errorCode = U_MEMORY_ALLOCATION_ERROR;
558 return NULL;
559 }
560 bucketList->addElement(bucket, errorCode); // final
561
562 if (hasPinyin) {
563 // Redirect Pinyin buckets.
564 Bucket *asciiBucket = NULL;
565 for (int32_t i = 0; i < 26; ++i) {
566 if (asciiBuckets[i] != NULL) {
567 asciiBucket = asciiBuckets[i];
568 }
569 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
570 pinyinBuckets[i]->displayBucket_ = asciiBucket;
571 hasInvisibleBuckets = TRUE;
572 }
573 }
574 }
575
576 if (U_FAILURE(errorCode)) { return NULL; }
577 if (!hasInvisibleBuckets) {
578 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
579 if (bl == NULL) {
580 errorCode = U_MEMORY_ALLOCATION_ERROR;
581 return NULL;
582 }
583 bucketList.orphan();
584 return bl;
585 }
586 // Merge inflow buckets that are visually adjacent.
587 // Iterate backwards: Merge inflow into overflow rather than the other way around.
588 int32_t i = bucketList->size() - 1;
589 Bucket *nextBucket = getBucket(*bucketList, i);
590 while (--i > 0) {
591 bucket = getBucket(*bucketList, i);
592 if (bucket->displayBucket_ != NULL) {
593 continue; // skip invisible buckets
594 }
595 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
596 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
597 bucket->displayBucket_ = nextBucket;
598 continue;
599 }
600 }
601 nextBucket = bucket;
602 }
603
604 LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode);
605 if (U_FAILURE(errorCode)) {
606 return NULL;
607 }
608 // Do not call publicBucketList->setDeleter():
609 // This vector shares its objects with the bucketList.
610 for (int32_t j = 0; j < bucketList->size(); ++j) {
611 bucket = getBucket(*bucketList, j);
612 if (bucket->displayBucket_ == NULL) {
613 publicBucketList->addElement(bucket, errorCode);
614 }
615 }
616 if (U_FAILURE(errorCode)) { return NULL; }
617 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
618 if (bl == NULL) {
619 errorCode = U_MEMORY_ALLOCATION_ERROR;
620 return NULL;
621 }
622 bucketList.orphan();
623 publicBucketList.orphan();
624 return bl;
625}
626
627/**
628 * Creates an index, and buckets and sorts the list of records into the index.
629 */
630void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
631 if (U_FAILURE(errorCode) || buckets_ != NULL) {
632 return;
633 }
634 buckets_ = createBucketList(errorCode);
635 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
636 return;
637 }
638
639 // Sort the records by name.
640 // Stable sort preserves input order of collation duplicates.
641 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
642
643 // Now, we traverse all of the input, which is now sorted.
644 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
645 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
646 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
647 // we need to improve it for that case.
648
649 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
650 int32_t bucketIndex = 1;
651 Bucket *nextBucket;
652 const UnicodeString *upperBoundary;
653 if (bucketIndex < buckets_->bucketList_->size()) {
654 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
655 upperBoundary = &nextBucket->lowerBoundary_;
656 } else {
657 nextBucket = NULL;
658 upperBoundary = NULL;
659 }
660 for (int32_t i = 0; i < inputList_->size(); ++i) {
661 Record *r = getRecord(*inputList_, i);
662 // if the current bucket isn't the right one, find the one that is
663 // We have a special flag for the last bucket so that we don't look any further
664 while (upperBoundary != NULL &&
665 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
666 currentBucket = nextBucket;
667 // now reset the boundary that we compare against
668 if (bucketIndex < buckets_->bucketList_->size()) {
669 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
670 upperBoundary = &nextBucket->lowerBoundary_;
671 } else {
672 upperBoundary = NULL;
673 }
674 }
675 // now put the record into the bucket.
676 Bucket *bucket = currentBucket;
677 if (bucket->displayBucket_ != NULL) {
678 bucket = bucket->displayBucket_;
679 }
680 if (bucket->records_ == NULL) {
681 bucket->records_ = new UVector(errorCode);
682 if (bucket->records_ == NULL) {
683 errorCode = U_MEMORY_ALLOCATION_ERROR;
684 return;
685 }
686 }
687 bucket->records_->addElement(r, errorCode);
688 }
689}
690
691void AlphabeticIndex::clearBuckets() {
692 if (buckets_ != NULL) {
693 delete buckets_;
694 buckets_ = NULL;
695 internalResetBucketIterator();
696 }
697}
698
699void AlphabeticIndex::internalResetBucketIterator() {
700 labelsIterIndex_ = -1;
701 currentBucket_ = NULL;
702}
703
704
705void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
706 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
707 if (U_FAILURE(status)) {
708 return;
709 }
710
711 UnicodeSet exemplars;
712 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
713 if (U_SUCCESS(status)) {
714 initialLabels_->addAll(exemplars);
715 return;
716 }
717 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
718
719 // The locale data did not include explicit Index characters.
720 // Synthesize a set of them from the locale's standard exemplar characters.
721 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
722 if (U_FAILURE(status)) {
723 return;
724 }
725
726 // question: should we add auxiliary exemplars?
727 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) {
728 exemplars.add(0x61, 0x7A);
729 }
730 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
731 // cut down to small list
732 exemplars.remove(0xAC00, 0xD7A3).
733 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
734 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
735 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
736 add(0xD30C).add(0xD558);
737 }
738 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
739 // cut down to small list
740 // make use of the fact that Ethiopic is allocated in 8's, where
741 // the base is 0 mod 8.
742 UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status);
743 ethiopic.retainAll(exemplars);
744 exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic);
745 }
746
747 // Upper-case any that aren't already so.
748 // (We only do this for synthesized index characters.)
749 UnicodeSetIterator it(exemplars);
750 UnicodeString upperC;
751 while (it.next()) {
752 const UnicodeString &exemplarC = it.getString();
753 upperC = exemplarC;
754 upperC.toUpper(locale);
755 initialLabels_->add(upperC);
756 }
757}
758
759UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
760 UnicodeSet contractions;
761 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
762 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
763 initialLabels_->addAll(contractions);
764 UnicodeSetIterator iter(contractions);
765 while (iter.next()) {
766 const UnicodeString &s = iter.getString();
767 U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
768 UChar c = s.charAt(s.length() - 1);
769 if (0x41 <= c && c <= 0x5A) { // A-Z
770 // There are Pinyin labels, add ASCII A-Z labels as well.
771 initialLabels_->add(0x41, 0x5A); // A-Z
772 break;
773 }
774 }
775 return TRUE;
776}
777
778
779/*
780 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
781 */
782static const UChar CGJ = 0x034F;
783UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
784 UnicodeString result;
785 if (item.length() == 0) {
786 return result;
787 }
788 int32_t i = 0;
789 for (;;) {
790 UChar32 cp = item.char32At(i);
791 result.append(cp);
792 i = item.moveIndex32(i, 1);
793 if (i >= item.length()) {
794 break;
795 }
796 result.append(CGJ);
797 }
798 return result;
799}
800
801
802UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
803 return FALSE;
804}
805
806
807UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
808 return FALSE;
809}
810
811
812const RuleBasedCollator &AlphabeticIndex::getCollator() const {
813 return *collator_;
814}
815
816
817const UnicodeString &AlphabeticIndex::getInflowLabel() const {
818 return inflowLabel_;
819}
820
821const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
822 return overflowLabel_;
823}
824
825
826const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
827 return underflowLabel_;
828}
829
830
831AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
832 inflowLabel_ = label;
833 clearBuckets();
834 return *this;
835}
836
837
838AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
839 overflowLabel_ = label;
840 clearBuckets();
841 return *this;
842}
843
844
845AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
846 underflowLabel_ = label;
847 clearBuckets();
848 return *this;
849}
850
851
852int32_t AlphabeticIndex::getMaxLabelCount() const {
853 return maxLabelCount_;
854}
855
856
857AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
858 if (U_FAILURE(status)) {
859 return *this;
860 }
861 if (maxLabelCount <= 0) {
862 status = U_ILLEGAL_ARGUMENT_ERROR;
863 return *this;
864 }
865 maxLabelCount_ = maxLabelCount;
866 clearBuckets();
867 return *this;
868}
869
870
871//
872// init() - Common code for constructors.
873//
874
875void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
876 if (U_FAILURE(status)) { return; }
877 if (locale == NULL && collator_ == NULL) {
878 status = U_ILLEGAL_ARGUMENT_ERROR;
879 return;
880 }
881
882 initialLabels_ = new UnicodeSet();
883 if (initialLabels_ == NULL) {
884 status = U_MEMORY_ALLOCATION_ERROR;
885 return;
886 }
887
888 inflowLabel_.setTo((UChar)0x2026); // Ellipsis
889 overflowLabel_ = inflowLabel_;
890 underflowLabel_ = inflowLabel_;
891
892 if (collator_ == NULL) {
893 Collator *coll = Collator::createInstance(*locale, status);
894 if (U_FAILURE(status)) {
895 delete coll;
896 return;
897 }
898 if (coll == NULL) {
899 status = U_MEMORY_ALLOCATION_ERROR;
900 return;
901 }
902 collator_ = dynamic_cast<RuleBasedCollator *>(coll);
903 if (collator_ == NULL) {
904 delete coll;
905 status = U_UNSUPPORTED_ERROR;
906 return;
907 }
908 }
909 collatorPrimaryOnly_ = collator_->clone();
910 if (collatorPrimaryOnly_ == NULL) {
911 status = U_MEMORY_ALLOCATION_ERROR;
912 return;
913 }
914 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
915 firstCharsInScripts_ = firstStringsInScript(status);
916 if (U_FAILURE(status)) { return; }
917 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
918 // Guard against a degenerate collator where
919 // some script boundary strings are primary ignorable.
920 for (;;) {
921 if (U_FAILURE(status)) { return; }
922 if (firstCharsInScripts_->isEmpty()) {
923 // AlphabeticIndex requires some non-ignorable script boundary strings.
924 status = U_ILLEGAL_ARGUMENT_ERROR;
925 return;
926 }
927 if (collatorPrimaryOnly_->compare(
928 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
929 emptyString_, status) == UCOL_EQUAL) {
930 firstCharsInScripts_->removeElementAt(0);
931 } else {
932 break;
933 }
934 }
935
936 // Chinese index characters, which are specific to each of the several Chinese tailorings,
937 // take precedence over the single locale data exemplar set per language.
938 if (!addChineseIndexCharacters(status) && locale != NULL) {
939 addIndexExemplars(*locale, status);
940 }
941}
942
943
944//
945// Comparison function for UVector<UnicodeString *> sorting with a collator.
946//
947static int32_t U_CALLCONV
948collatorComparator(const void *context, const void *left, const void *right) {
949 const UElement *leftElement = static_cast<const UElement *>(left);
950 const UElement *rightElement = static_cast<const UElement *>(right);
951 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
952 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
953
954 if (leftString == rightString) {
955 // Catches case where both are NULL
956 return 0;
957 }
958 if (leftString == NULL) {
959 return 1;
960 }
961 if (rightString == NULL) {
962 return -1;
963 }
964 const Collator *col = static_cast<const Collator *>(context);
965 UErrorCode errorCode = U_ZERO_ERROR;
966 return col->compare(*leftString, *rightString, errorCode);
967}
968
969//
970// Comparison function for UVector<Record *> sorting with a collator.
971//
972static int32_t U_CALLCONV
973recordCompareFn(const void *context, const void *left, const void *right) {
974 const UElement *leftElement = static_cast<const UElement *>(left);
975 const UElement *rightElement = static_cast<const UElement *>(right);
976 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
977 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
978 const Collator *col = static_cast<const Collator *>(context);
979 UErrorCode errorCode = U_ZERO_ERROR;
980 return col->compare(leftRec->name_, rightRec->name_, errorCode);
981}
982
983UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
984 if (U_FAILURE(status)) {
985 return NULL;
986 }
987 LocalPointer<UVector> dest(new UVector(status), status);
988 if (U_FAILURE(status)) {
989 return NULL;
990 }
991 dest->setDeleter(uprv_deleteUObject);
992 // Fetch the script-first-primary contractions which are defined in the root collator.
993 // They all start with U+FDD1.
994 UnicodeSet set;
995 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
996 if (U_FAILURE(status)) {
997 return NULL;
998 }
999 if (set.isEmpty()) {
1000 status = U_UNSUPPORTED_ERROR;
1001 return NULL;
1002 }
1003 UnicodeSetIterator iter(set);
1004 while (iter.next()) {
1005 const UnicodeString &boundary = iter.getString();
1006 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
1007 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
1008 // Ignore boundaries for the special reordering groups.
1009 // Take only those for "real scripts" (where the sample character is a Letter,
1010 // and the one for unassigned implicit weights (Cn).
1011 continue;
1012 }
1013 UnicodeString *s = new UnicodeString(boundary);
1014 if (s == NULL) {
1015 status = U_MEMORY_ALLOCATION_ERROR;
1016 return NULL;
1017 }
1018 dest->addElement(s, status);
1019 }
1020 return dest.orphan();
1021}
1022
1023
1024namespace {
1025
1026/**
1027 * Returns true if one index character string is "better" than the other.
1028 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1029 * better, and otherwise binary-less-than is better.
1030 */
1031UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1032 const UnicodeString &one, const UnicodeString &other) {
1033 // This is called with primary-equal strings, but never with one.equals(other).
1034 UErrorCode status = U_ZERO_ERROR;
1035 UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1036 UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1037 if (U_FAILURE(status)) { return FALSE; }
1038 int32_t result = n1.countChar32() - n2.countChar32();
1039 if (result != 0) {
1040 return result < 0;
1041 }
1042 result = n1.compareCodePointOrder(n2);
1043 if (result != 0) {
1044 return result < 0;
1045 }
1046 return one.compareCodePointOrder(other) < 0;
1047}
1048
1049} // namespace
1050
1051//
1052// Constructor & Destructor for AlphabeticIndex::Record
1053//
1054// Records are internal only, instances are not directly surfaced in the public API.
1055// This class is mostly struct-like, with all public fields.
1056
1057AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1058 : name_(name), data_(data) {}
1059
1060AlphabeticIndex::Record::~Record() {
1061}
1062
1063
1064AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1065 if (U_FAILURE(status)) {
1066 return *this;
1067 }
1068 if (inputList_ == NULL) {
1069 inputList_ = new UVector(status);
1070 if (inputList_ == NULL) {
1071 status = U_MEMORY_ALLOCATION_ERROR;
1072 return *this;
1073 }
1074 inputList_->setDeleter(alphaIndex_deleteRecord);
1075 }
1076 Record *r = new Record(name, data);
1077 if (r == NULL) {
1078 status = U_MEMORY_ALLOCATION_ERROR;
1079 return *this;
1080 }
1081 inputList_->addElement(r, status);
1082 clearBuckets();
1083 //std::string ss;
1084 //std::string ss2;
1085 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1086 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1087 return *this;
1088}
1089
1090
1091AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1092 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1093 inputList_->removeAllElements();
1094 clearBuckets();
1095 }
1096 return *this;
1097}
1098
1099int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1100 initBuckets(status);
1101 if (U_FAILURE(status)) {
1102 return 0;
1103 }
1104 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1105}
1106
1107
1108int32_t AlphabeticIndex::getBucketIndex() const {
1109 return labelsIterIndex_;
1110}
1111
1112
1113UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1114 if (U_FAILURE(status)) {
1115 return FALSE;
1116 }
1117 if (buckets_ == NULL && currentBucket_ != NULL) {
1118 status = U_ENUM_OUT_OF_SYNC_ERROR;
1119 return FALSE;
1120 }
1121 initBuckets(status);
1122 if (U_FAILURE(status)) {
1123 return FALSE;
1124 }
1125 ++labelsIterIndex_;
1126 if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1127 labelsIterIndex_ = buckets_->getBucketCount();
1128 return FALSE;
1129 }
1130 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1131 resetRecordIterator();
1132 return TRUE;
1133}
1134
1135const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1136 if (currentBucket_ != NULL) {
1137 return currentBucket_->label_;
1138 } else {
1139 return emptyString_;
1140 }
1141}
1142
1143
1144UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1145 if (currentBucket_ != NULL) {
1146 return currentBucket_->labelType_;
1147 } else {
1148 return U_ALPHAINDEX_NORMAL;
1149 }
1150}
1151
1152
1153int32_t AlphabeticIndex::getBucketRecordCount() const {
1154 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1155 return currentBucket_->records_->size();
1156 } else {
1157 return 0;
1158 }
1159}
1160
1161
1162AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1163 if (U_FAILURE(status)) {
1164 return *this;
1165 }
1166 internalResetBucketIterator();
1167 return *this;
1168}
1169
1170
1171UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1172 if (U_FAILURE(status)) {
1173 return FALSE;
1174 }
1175 if (currentBucket_ == NULL) {
1176 // We are trying to iterate over the items in a bucket, but there is no
1177 // current bucket from the enumeration of buckets.
1178 status = U_INVALID_STATE_ERROR;
1179 return FALSE;
1180 }
1181 if (buckets_ == NULL) {
1182 status = U_ENUM_OUT_OF_SYNC_ERROR;
1183 return FALSE;
1184 }
1185 if (currentBucket_->records_ == NULL) {
1186 return FALSE;
1187 }
1188 ++itemsIterIndex_;
1189 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1190 itemsIterIndex_ = currentBucket_->records_->size();
1191 return FALSE;
1192 }
1193 return TRUE;
1194}
1195
1196
1197const UnicodeString &AlphabeticIndex::getRecordName() const {
1198 const UnicodeString *retStr = &emptyString_;
1199 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1200 itemsIterIndex_ >= 0 &&
1201 itemsIterIndex_ < currentBucket_->records_->size()) {
1202 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1203 retStr = &item->name_;
1204 }
1205 return *retStr;
1206}
1207
1208const void *AlphabeticIndex::getRecordData() const {
1209 const void *retPtr = NULL;
1210 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1211 itemsIterIndex_ >= 0 &&
1212 itemsIterIndex_ < currentBucket_->records_->size()) {
1213 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1214 retPtr = item->data_;
1215 }
1216 return retPtr;
1217}
1218
1219
1220AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1221 itemsIterIndex_ = -1;
1222 return *this;
1223}
1224
1225
1226
1227AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1228 const UnicodeString &lowerBoundary,
1229 UAlphabeticIndexLabelType type)
1230 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1231 displayBucket_(NULL), displayIndex_(-1),
1232 records_(NULL) {
1233}
1234
1235
1236AlphabeticIndex::Bucket::~Bucket() {
1237 delete records_;
1238}
1239
1240U_NAMESPACE_END
1241
1242#endif // !UCONFIG_NO_COLLATION
1243