alphaindex.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/alphaindex.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2009-2014, International Business Machines Corporation and
6	* others. All Rights Reserved.
7	*******************************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_COLLATION
13
14	#include "unicode/alphaindex.h"
15	#include "unicode/coll.h"
16	#include "unicode/localpointer.h"
17	#include "unicode/normalizer2.h"
18	#include "unicode/tblcoll.h"
19	#include "unicode/uchar.h"
20	#include "unicode/ulocdata.h"
21	#include "unicode/uniset.h"
22	#include "unicode/uobject.h"
23	#include "unicode/usetiter.h"
24	#include "unicode/utf16.h"
25
26	#include "cmemory.h"
27	#include "cstring.h"
28	#include "uassert.h"
29	#include "uvector.h"
30	#include "uvectr64.h"
31
32	//#include <string>
33	//#include <iostream>
34
35	U_NAMESPACE_BEGIN
36
37	namespace {
38
39	/**
40	* Prefix string for Chinese index buckets.
41	* See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
42	*/
43	const UChar BASE[`1`] = { `0xFDD0` };
44	const int32_t BASE_LENGTH = `1`;
45
46	UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
47	const UnicodeString &one, const UnicodeString &other);
48
49	} // namespace
50
51	static int32_t U_CALLCONV
52	collatorComparator(const void context, const* void left, const* void *right);
53
54	static int32_t U_CALLCONV
55	recordCompareFn(const void context, const* void left, const* void *right);
56
57	// UVector<Record > support function, delete a Record.*
58	static void U_CALLCONV
59	alphaIndex_deleteRecord(void *obj) {
60	delete static_cast<AlphabeticIndex::Record *>(obj);
61	}
62
63	namespace {
64
65	UnicodeString ownedString(const* UnicodeString &s, LocalPointer<UnicodeString> &owned,
66	UErrorCode &errorCode) {
67	if (U_FAILURE(errorCode)) { return NULL; }
68	if (owned.isValid()) {
69	return owned.orphan();
70	}
71	UnicodeString p = new* UnicodeString (s);
72	if (p == NULL) {
73	errorCode = U_MEMORY_ALLOCATION_ERROR;
74	}
75	return p;
76	}
77
78	inline UnicodeString getString(const* UVector &list, int32_t i) {
79	return static_cast<UnicodeString *>(list [i]);
80	}
81
82	inline AlphabeticIndex::Bucket getBucket(const* UVector &list, int32_t i) {
83	return static_cast<AlphabeticIndex::Bucket *>(list [i]);
84	}
85
86	inline AlphabeticIndex::Record getRecord(const* UVector &list, int32_t i) {
87	return static_cast<AlphabeticIndex::Record *>(list [i]);
88	}
89
90	/**
91	* Like Java Collections.binarySearch(List, String, Comparator).
92	*
93	* @return the index>=0 where the item was found,
94	* or the index<0 for inserting the string at ~index in sorted order
95	*/
96	int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
97	if (list.size() == `0`) { return ~`0`; }
98	int32_t start = `0`;
99	int32_t limit = list.size();
100	for (;;) {
101	int32_t i = (start + limit) / `2`;
102	const UnicodeString si = static_cast<UnicodeString >(list.elementAt(i));
103	UErrorCode errorCode = U_ZERO_ERROR;
104	UCollationResult cmp = coll.compare(s, *si, errorCode);
105	if (cmp == UCOL_EQUAL) {
106	return i;
107	} else if (cmp < `0`) {
108	if (i == start) {
109	return ~start; // insert s before si*
110	}
111	limit = i;
112	} else {
113	if (i == start) {
114	return ~(start + `1`); // insert s after si*
115	}
116	start = i;
117	}
118	}
119	}
120
121	} // namespace
122
123	// The BucketList is not in the anonymous namespace because only Clang
124	// seems to support its use in other classes from there.
125	// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
126	class BucketList : public UObject {
127	public:
128	BucketList(UVector bucketList, UVector publicBucketList)
129	: bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
130	int32_t displayIndex = `0`;
131	for (int32_t i = `0`; i < publicBucketList->size(); ++i) {
132	getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
133	}
134	}
135
136	// The virtual destructor must not be inline.
137	// See ticket #8454 for details.
138	virtual ~BucketList();
139
140	int32_t getBucketCount() const {
141	return immutableVisibleList_->size();
142	}
143
144	int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
145	UErrorCode &errorCode) {
146	// binary search
147	int32_t start = `0`;
148	int32_t limit = bucketList_->size();
149	while ((start + `1`) < limit) {
150	int32_t i = (start + limit) / `2`;
151	const AlphabeticIndex::Bucket bucket = getBucket(bucketList_, i);
152	UCollationResult nameVsBucket =
153	collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
154	if (nameVsBucket < `0`) {
155	limit = i;
156	} else {
157	start = i;
158	}
159	}
160	const AlphabeticIndex::Bucket bucket = getBucket(bucketList_, start);
161	if (bucket->displayBucket_ != NULL) {
162	bucket = bucket->displayBucket_;
163	}
164	return bucket->displayIndex_;
165	}
166
167	/* All of the buckets, visible and invisible. /
168	UVector *bucketList_;
169	/* Just the visible buckets. /
170	UVector *immutableVisibleList_;
171	};
172
173	BucketList::~BucketList() {
174	delete bucketList_;
175	if (immutableVisibleList_ != bucketList_) {
176	delete immutableVisibleList_;
177	}
178	}
179
180	AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
181	delete buckets_;
182	delete collatorPrimaryOnly_;
183	}
184
185	int32_t
186	AlphabeticIndex::ImmutableIndex::getBucketCount() const {
187	return buckets_->getBucketCount();
188	}
189
190	int32_t
191	AlphabeticIndex::ImmutableIndex::getBucketIndex(
192	const UnicodeString &name, UErrorCode &errorCode) const {
193	return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
194	}
195
196	const AlphabeticIndex::Bucket *
197	AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
198	if (`0` <= index && index < buckets_->getBucketCount()) {
199	return icu::getBucket(*buckets_->immutableVisibleList_, index);
200	} else {
201	return NULL;
202	}
203	}
204
205	AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
206	: inputList_(NULL),
207	labelsIterIndex_(-`1`), itemsIterIndex_(`0`), currentBucket_(NULL),
208	maxLabelCount_(`99`),
209	initialLabels_(NULL), firstCharsInScripts_(NULL),
210	collator_(NULL), collatorPrimaryOnly_(NULL),
211	buckets_(NULL) {
212	init(&locale, status);
213	}
214
215
216	AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
217	: inputList_(NULL),
218	labelsIterIndex_(-`1`), itemsIterIndex_(`0`), currentBucket_(NULL),
219	maxLabelCount_(`99`),
220	initialLabels_(NULL), firstCharsInScripts_(NULL),
221	collator_(collator), collatorPrimaryOnly_(NULL),
222	buckets_(NULL) {
223	init(NULL, status);
224	}
225
226
227
228	AlphabeticIndex::~AlphabeticIndex() {
229	delete collator_;
230	delete collatorPrimaryOnly_;
231	delete firstCharsInScripts_;
232	delete buckets_;
233	delete inputList_;
234	delete initialLabels_;
235	}
236
237
238	AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
239	if (U_FAILURE(status)) {
240	return *this;
241	}
242	initialLabels_->addAll(additions);
243	clearBuckets();
244	return *this;
245	}
246
247
248	AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
249	addIndexExemplars(locale, status);
250	clearBuckets();
251	return *this;
252	}
253
254
255	AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
256	if (U_FAILURE(errorCode)) { return NULL; }
257	// In C++, the ImmutableIndex must own its copy of the BucketList,
258	// even if it contains no records, for proper memory management.
259	// We could clone the buckets_ if they are not NULL,
260	// but that would be worth it only if this method is called multiple times,
261	// or called after using the old-style bucket iterator API.
262	LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
263	LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone());
264	if (immutableBucketList.isNull() \|\| coll.isNull()) {
265	errorCode = U_MEMORY_ALLOCATION_ERROR;
266	return NULL;
267	}
268	ImmutableIndex immIndex = new* ImmutableIndex (immutableBucketList.getAlias(), coll.getAlias());
269	if (immIndex == NULL) {
270	errorCode = U_MEMORY_ALLOCATION_ERROR;
271	return NULL;
272	}
273	// The ImmutableIndex adopted its parameter objects.
274	immutableBucketList.orphan();
275	coll.orphan();
276	return immIndex;
277	}
278
279	int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
280	initBuckets(status);
281	if (U_FAILURE(status)) {
282	return `0`;
283	}
284	return buckets_->getBucketCount();
285	}
286
287
288	int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
289	if (U_FAILURE(status) \|\| inputList_ == NULL) {
290	return `0`;
291	}
292	return inputList_->size();
293	}
294
295	void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296	const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
297	if (U_FAILURE(errorCode)) { return; }
298
299	const UnicodeString &firstScriptBoundary = getString(firstCharsInScripts_, `0`);
300	const UnicodeString &overflowBoundary =
301	getString(firstCharsInScripts_, firstCharsInScripts_->size() - `1`);
302
303	// We make a sorted array of elements.
304	// Some of the input may be redundant.
305	// That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306	// We filter out those cases.
307	UnicodeSetIterator iter(*initialLabels_);
308	while (iter.next()) {
309	const UnicodeString *item = &iter.getString();
310	LocalPointer<UnicodeString> ownedItem;
311	UBool checkDistinct;
312	int32_t itemLength = item->length();
313	if (!item->hasMoreChar32Than(`0`, itemLength, `1`)) {
314	checkDistinct = FALSE;
315	} else if(item->charAt(itemLength - `1`) == `0x2a` && // ''*
316	item->charAt(itemLength - `2`) != `0x2a`) {
317	// Use a label if it is marked with one trailing star,
318	// even if the label string sorts the same when all contractions are suppressed.
319	ownedItem.adoptInstead(new UnicodeString (*item, `0`, itemLength - `1`));
320	item = ownedItem.getAlias();
321	if (item == NULL) {
322	errorCode = U_MEMORY_ALLOCATION_ERROR;
323	return;
324	}
325	checkDistinct = FALSE;
326	} else {
327	checkDistinct = TRUE;
328	}
329	if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < `0`) {
330	// Ignore a primary-ignorable or non-alphabetic index character.
331	} else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= `0`) {
332	// Ignore an index character that will land in the overflow bucket.
333	} else if (checkDistinct &&
334	collatorPrimaryOnly_->compare(item, separated(item), errorCode) == `0`) {
335	// Ignore a multi-code point index character that does not sort distinctly
336	// from the sequence of its separate characters.
337	} else {
338	int32_t insertionPoint = binarySearch(indexCharacters, item, collatorPrimaryOnly_);
339	if (insertionPoint < `0`) {
340	indexCharacters.insertElementAt(
341	ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
342	} else {
343	const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344	if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
345	indexCharacters.setElementAt(
346	ownedString(*item, ownedItem, errorCode), insertionPoint);
347	}
348	}
349	}
350	}
351	if (U_FAILURE(errorCode)) { return; }
352
353	// if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
354
355	int32_t size = indexCharacters.size() - `1`;
356	if (size > maxLabelCount_) {
357	int32_t count = `0`;
358	int32_t old = -`1`;
359	for (int32_t i = `0`; i < indexCharacters.size();) {
360	++count;
361	int32_t bump = count * maxLabelCount_ / size;
362	if (bump == old) {
363	indexCharacters.removeElementAt(i);
364	} else {
365	old = bump;
366	++i;
367	}
368	}
369	}
370	}
371
372	namespace {
373
374	const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
375	if (!current.startsWith(BASE, BASE_LENGTH)) {
376	return current;
377	}
378	UChar rest = current.charAt(BASE_LENGTH);
379	if (`0x2800` < rest && rest <= `0x28FF`) { // stroke count
380	int32_t count = rest-`0x2800`;
381	temp.setTo((UChar)(`0x30` + count % `10`));
382	if (count >= `10`) {
383	count /= `10`;
384	temp.insert(`0`, (UChar)(`0x30` + count % `10`));
385	if (count >= `10`) {
386	count /= `10`;
387	temp.insert(`0`, (UChar)(`0x30` + count));
388	}
389	}
390	return temp.append((UChar)`0x5283`);
391	}
392	return temp.setTo(current, BASE_LENGTH);
393	}
394
395	UBool hasMultiplePrimaryWeights(
396	const RuleBasedCollator &coll, uint32_t variableTop,
397	const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
398	ces.removeAllElements();
399	coll.internalGetCEs(s, ces, errorCode);
400	if (U_FAILURE(errorCode)) { return FALSE; }
401	UBool seenPrimary = FALSE;
402	for (int32_t i = `0`; i < ces.size(); ++i) {
403	int64_t ce = ces.elementAti(i);
404	uint32_t p = (uint32_t)(ce >> `32`);
405	if (p > variableTop) {
406	// not primary ignorable
407	if (seenPrimary) {
408	return TRUE;
409	}
410	seenPrimary = TRUE;
411	}
412	}
413	return FALSE;
414	}
415
416	} // namespace
417
418	BucketList AlphabeticIndex::createBucketList(UErrorCode &errorCode) const* {
419	// Initialize indexCharacters.
420	UVector indexCharacters(errorCode);
421	indexCharacters.setDeleter(uprv_deleteUObject);
422	initLabels(indexCharacters, errorCode);
423	if (U_FAILURE(errorCode)) { return NULL; }
424
425	// Variables for hasMultiplePrimaryWeights().
426	UVector64 ces(errorCode);
427	uint32_t variableTop;
428	if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
429	variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
430	} else {
431	variableTop = `0`;
432	}
433	UBool hasInvisibleBuckets = FALSE;
434
435	// Helper arrays for Chinese Pinyin collation.
436	Bucket *asciiBuckets[`26`] = {
437	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
438	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
439	};
440	Bucket *pinyinBuckets[`26`] = {
441	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
442	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
443	};
444	UBool hasPinyin = FALSE;
445
446	LocalPointer<UVector> bucketList(new UVector (errorCode), errorCode);
447	if (U_FAILURE(errorCode)) {
448	return NULL;
449	}
450	bucketList ->setDeleter(uprv_deleteUObject);
451
452	// underflow bucket
453	Bucket bucket = new* Bucket (getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
454	if (bucket == NULL) {
455	errorCode = U_MEMORY_ALLOCATION_ERROR;
456	return NULL;
457	}
458	bucketList ->addElement(bucket, errorCode);
459	if (U_FAILURE(errorCode)) { return NULL; }
460
461	UnicodeString temp;
462
463	// fix up the list, adding underflow, additions, overflow
464	// Insert inflow labels as needed.
465	int32_t scriptIndex = -`1`;
466	const UnicodeString *scriptUpperBoundary = &emptyString_;
467	for (int32_t i = `0`; i < indexCharacters.size(); ++i) {
468	UnicodeString &current = *getString(indexCharacters, i);
469	if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= `0`) {
470	// We crossed the script boundary into a new script.
471	const UnicodeString &inflowBoundary = *scriptUpperBoundary;
472	UBool skippedScript = FALSE;
473	for (;;) {
474	scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
475	if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < `0`) {
476	break;
477	}
478	skippedScript = TRUE;
479	}
480	if (skippedScript && bucketList ->size() > `1`) {
481	// We are skipping one or more scripts,
482	// and we are not just getting out of the underflow label.
483	bucket = new Bucket (getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
484	if (bucket == NULL) {
485	errorCode = U_MEMORY_ALLOCATION_ERROR;
486	return NULL;
487	}
488	bucketList ->addElement(bucket, errorCode);
489	}
490	}
491	// Add a bucket with the current label.
492	bucket = new Bucket (fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
493	if (bucket == NULL) {
494	errorCode = U_MEMORY_ALLOCATION_ERROR;
495	return NULL;
496	}
497	bucketList ->addElement(bucket, errorCode);
498	// Remember ASCII and Pinyin buckets for Pinyin redirects.
499	UChar c;
500	if (current.length() == `1` && `0x41` <= (c = current.charAt(`0`)) && c <= `0x5A`) { // A-Z
501	asciiBuckets[c - `0x41`] = bucket;
502	} else if (current.length() == BASE_LENGTH + `1` && current.startsWith(BASE, BASE_LENGTH) &&
503	`0x41` <= (c = current.charAt(BASE_LENGTH)) && c <= `0x5A`) {
504	pinyinBuckets[c - `0x41`] = bucket;
505	hasPinyin = TRUE;
506	}
507	// Check for multiple primary weights.
508	if (!current.startsWith(BASE, BASE_LENGTH) &&
509	hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
510	ces, errorCode) &&
511	current.charAt(current.length() - `1`) != `0xFFFF` / !current.endsWith("\uffff") /) {
512	// "AE-ligature" or "Sch" etc.
513	for (int32_t j = bucketList ->size() - `2`;; --j) {
514	Bucket singleBucket = getBucket(bucketList, j);
515	if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
516	// There is no single-character bucket since the last
517	// underflow or inflow label.
518	break;
519	}
520	if (singleBucket->displayBucket_ == NULL &&
521	!hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
522	singleBucket->lowerBoundary_,
523	ces, errorCode)) {
524	// Add an invisible bucket that redirects strings greater than the expansion
525	// to the previous single-character bucket.
526	// For example, after ... Q R S Sch we add Sch\uFFFF->S
527	// and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
528	bucket = new Bucket (emptyString_,
529	UnicodeString (current).append((UChar)`0xFFFF`),
530	U_ALPHAINDEX_NORMAL);
531	if (bucket == NULL) {
532	errorCode = U_MEMORY_ALLOCATION_ERROR;
533	return NULL;
534	}
535	bucket->displayBucket_ = singleBucket;
536	bucketList ->addElement(bucket, errorCode);
537	hasInvisibleBuckets = TRUE;
538	break;
539	}
540	}
541	}
542	}
543	if (U_FAILURE(errorCode)) { return NULL; }
544	if (bucketList ->size() == `1`) {
545	// No real labels, show only the underflow label.
546	BucketList bl = new* BucketList (bucketList.getAlias(), bucketList.getAlias());
547	if (bl == NULL) {
548	errorCode = U_MEMORY_ALLOCATION_ERROR;
549	return NULL;
550	}
551	bucketList.orphan();
552	return bl;
553	}
554	// overflow bucket
555	bucket = new Bucket (getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
556	if (bucket == NULL) {
557	errorCode = U_MEMORY_ALLOCATION_ERROR;
558	return NULL;
559	}
560	bucketList ->addElement(bucket, errorCode); // final
561
562	if (hasPinyin) {
563	// Redirect Pinyin buckets.
564	Bucket *asciiBucket = NULL;
565	for (int32_t i = `0`; i < `26`; ++i) {
566	if (asciiBuckets[i] != NULL) {
567	asciiBucket = asciiBuckets[i];
568	}
569	if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
570	pinyinBuckets[i]->displayBucket_ = asciiBucket;
571	hasInvisibleBuckets = TRUE;
572	}
573	}
574	}
575
576	if (U_FAILURE(errorCode)) { return NULL; }
577	if (!hasInvisibleBuckets) {
578	BucketList bl = new* BucketList (bucketList.getAlias(), bucketList.getAlias());
579	if (bl == NULL) {
580	errorCode = U_MEMORY_ALLOCATION_ERROR;
581	return NULL;
582	}
583	bucketList.orphan();
584	return bl;
585	}
586	// Merge inflow buckets that are visually adjacent.
587	// Iterate backwards: Merge inflow into overflow rather than the other way around.
588	int32_t i = bucketList ->size() - `1`;
589	Bucket nextBucket = getBucket(bucketList, i);
590	while (--i > `0`) {
591	bucket = getBucket(*bucketList, i);
592	if (bucket->displayBucket_ != NULL) {
593	continue; // skip invisible buckets
594	}
595	if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
596	if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
597	bucket->displayBucket_ = nextBucket;
598	continue;
599	}
600	}
601	nextBucket = bucket;
602	}
603
604	LocalPointer<UVector> publicBucketList(new UVector (errorCode), errorCode);
605	if (U_FAILURE(errorCode)) {
606	return NULL;
607	}
608	// Do not call publicBucketList->setDeleter():
609	// This vector shares its objects with the bucketList.
610	for (int32_t j = `0`; j < bucketList ->size(); ++j) {
611	bucket = getBucket(*bucketList, j);
612	if (bucket->displayBucket_ == NULL) {
613	publicBucketList ->addElement(bucket, errorCode);
614	}
615	}
616	if (U_FAILURE(errorCode)) { return NULL; }
617	BucketList bl = new* BucketList (bucketList.getAlias(), publicBucketList.getAlias());
618	if (bl == NULL) {
619	errorCode = U_MEMORY_ALLOCATION_ERROR;
620	return NULL;
621	}
622	bucketList.orphan();
623	publicBucketList.orphan();
624	return bl;
625	}
626
627	/**
628	* Creates an index, and buckets and sorts the list of records into the index.
629	*/
630	void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
631	if (U_FAILURE(errorCode) \|\| buckets_ != NULL) {
632	return;
633	}
634	buckets_ = createBucketList(errorCode);
635	if (U_FAILURE(errorCode) \|\| inputList_ == NULL \|\| inputList_->isEmpty()) {
636	return;
637	}
638
639	// Sort the records by name.
640	// Stable sort preserves input order of collation duplicates.
641	inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
642
643	// Now, we traverse all of the input, which is now sorted.
644	// If the item doesn't go in the current bucket, we find the next bucket that contains it.
645	// This makes the process order nlog(n), since we just sort the list and then do a linear process.*
646	// However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
647	// we need to improve it for that case.
648
649	Bucket currentBucket = getBucket(buckets_->bucketList_, `0`);
650	int32_t bucketIndex = `1`;
651	Bucket *nextBucket;
652	const UnicodeString *upperBoundary;
653	if (bucketIndex < buckets_->bucketList_->size()) {
654	nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
655	upperBoundary = &nextBucket->lowerBoundary_;
656	} else {
657	nextBucket = NULL;
658	upperBoundary = NULL;
659	}
660	for (int32_t i = `0`; i < inputList_->size(); ++i) {
661	Record r = getRecord(inputList_, i);
662	// if the current bucket isn't the right one, find the one that is
663	// We have a special flag for the last bucket so that we don't look any further
664	while (upperBoundary != NULL &&
665	collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= `0`) {
666	currentBucket = nextBucket;
667	// now reset the boundary that we compare against
668	if (bucketIndex < buckets_->bucketList_->size()) {
669	nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
670	upperBoundary = &nextBucket->lowerBoundary_;
671	} else {
672	upperBoundary = NULL;
673	}
674	}
675	// now put the record into the bucket.
676	Bucket *bucket = currentBucket;
677	if (bucket->displayBucket_ != NULL) {
678	bucket = bucket->displayBucket_;
679	}
680	if (bucket->records_ == NULL) {
681	bucket->records_ = new UVector (errorCode);
682	if (bucket->records_ == NULL) {
683	errorCode = U_MEMORY_ALLOCATION_ERROR;
684	return;
685	}
686	}
687	bucket->records_->addElement(r, errorCode);
688	}
689	}
690
691	void AlphabeticIndex::clearBuckets() {
692	if (buckets_ != NULL) {
693	delete buckets_;
694	buckets_ = NULL;
695	internalResetBucketIterator();
696	}
697	}
698
699	void AlphabeticIndex::internalResetBucketIterator() {
700	labelsIterIndex_ = -`1`;
701	currentBucket_ = NULL;
702	}
703
704
705	void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
706	LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
707	if (U_FAILURE(status)) {
708	return;
709	}
710
711	UnicodeSet exemplars;
712	ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), `0`, ULOCDATA_ES_INDEX, &status);
713	if (U_SUCCESS(status)) {
714	initialLabels_->addAll(exemplars);
715	return;
716	}
717	status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
718
719	// The locale data did not include explicit Index characters.
720	// Synthesize a set of them from the locale's standard exemplar characters.
721	ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), `0`, ULOCDATA_ES_STANDARD, &status);
722	if (U_FAILURE(status)) {
723	return;
724	}
725
726	// question: should we add auxiliary exemplars?
727	if (exemplars.containsSome(`0x61`, `0x7A`) / a-z / \|\| exemplars.isEmpty()) {
728	exemplars.add(`0x61`, `0x7A`);
729	}
730	if (exemplars.containsSome(`0xAC00`, `0xD7A3`)) { // Hangul syllables
731	// cut down to small list
732	exemplars.remove(`0xAC00`, `0xD7A3`).
733	add(`0xAC00`).add(`0xB098`).add(`0xB2E4`).add(`0xB77C`).
734	add(`0xB9C8`).add(`0xBC14`).add(`0xC0AC`).add(`0xC544`).
735	add(`0xC790`).add(`0xCC28`).add(`0xCE74`).add(`0xD0C0`).
736	add(`0xD30C`).add(`0xD558`);
737	}
738	if (exemplars.containsSome(`0x1200`, `0x137F`)) { // Ethiopic block
739	// cut down to small list
740	// make use of the fact that Ethiopic is allocated in 8's, where
741	// the base is 0 mod 8.
742	UnicodeSet ethiopic(UnicodeString (u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status);
743	ethiopic.retainAll(exemplars);
744	exemplars.remove(u`'ሀ'`, `0x137F`).addAll(ethiopic);
745	}
746
747	// Upper-case any that aren't already so.
748	// (We only do this for synthesized index characters.)
749	UnicodeSetIterator it(exemplars);
750	UnicodeString upperC;
751	while (it.next()) {
752	const UnicodeString &exemplarC = it.getString();
753	upperC = exemplarC;
754	upperC.toUpper(locale);
755	initialLabels_->add(upperC);
756	}
757	}
758
759	UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
760	UnicodeSet contractions;
761	collatorPrimaryOnly_->internalAddContractions(BASE[`0`], contractions, errorCode);
762	if (U_FAILURE(errorCode) \|\| contractions.isEmpty()) { return FALSE; }
763	initialLabels_->addAll(contractions);
764	UnicodeSetIterator iter(contractions);
765	while (iter.next()) {
766	const UnicodeString &s = iter.getString();
767	U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
768	UChar c = s.charAt(s.length() - `1`);
769	if (`0x41` <= c && c <= `0x5A`) { // A-Z
770	// There are Pinyin labels, add ASCII A-Z labels as well.
771	initialLabels_->add(`0x41`, `0x5A`); // A-Z
772	break;
773	}
774	}
775	return TRUE;
776	}
777
778
779	/*
780	* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
781	*/
782	static const UChar CGJ = `0x034F`;
783	UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
784	UnicodeString result;
785	if (item.length() == `0`) {
786	return result;
787	}
788	int32_t i = `0`;
789	for (;;) {
790	UChar32 cp = item.char32At(i);
791	result.append(cp);
792	i = item.moveIndex32(i, `1`);
793	if (i >= item.length()) {
794	break;
795	}
796	result.append(CGJ);
797	}
798	return result;
799	}
800
801
802	UBool AlphabeticIndex::operator==(const AlphabeticIndex& / other /) const {
803	return FALSE;
804	}
805
806
807	UBool AlphabeticIndex::operator!=(const AlphabeticIndex& / other /) const {
808	return FALSE;
809	}
810
811
812	const RuleBasedCollator &AlphabeticIndex::getCollator() const {
813	return *collator_;
814	}
815
816
817	const UnicodeString &AlphabeticIndex::getInflowLabel() const {
818	return inflowLabel_;
819	}
820
821	const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
822	return overflowLabel_;
823	}
824
825
826	const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
827	return underflowLabel_;
828	}
829
830
831	AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/status/) {
832	inflowLabel_ = label;
833	clearBuckets();
834	return *this;
835	}
836
837
838	AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/status/) {
839	overflowLabel_ = label;
840	clearBuckets();
841	return *this;
842	}
843
844
845	AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/status/) {
846	underflowLabel_ = label;
847	clearBuckets();
848	return *this;
849	}
850
851
852	int32_t AlphabeticIndex::getMaxLabelCount() const {
853	return maxLabelCount_;
854	}
855
856
857	AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
858	if (U_FAILURE(status)) {
859	return *this;
860	}
861	if (maxLabelCount <= `0`) {
862	status = U_ILLEGAL_ARGUMENT_ERROR;
863	return *this;
864	}
865	maxLabelCount_ = maxLabelCount;
866	clearBuckets();
867	return *this;
868	}
869
870
871	//
872	// init() - Common code for constructors.
873	//
874
875	void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
876	if (U_FAILURE(status)) { return; }
877	if (locale == NULL && collator_ == NULL) {
878	status = U_ILLEGAL_ARGUMENT_ERROR;
879	return;
880	}
881
882	initialLabels_ = new UnicodeSet ();
883	if (initialLabels_ == NULL) {
884	status = U_MEMORY_ALLOCATION_ERROR;
885	return;
886	}
887
888	inflowLabel_.setTo((UChar)`0x2026`); // Ellipsis
889	overflowLabel_ = inflowLabel_;
890	underflowLabel_ = inflowLabel_;
891
892	if (collator_ == NULL) {
893	Collator coll = Collator::createInstance(locale, status);
894	if (U_FAILURE(status)) {
895	delete coll;
896	return;
897	}
898	if (coll == NULL) {
899	status = U_MEMORY_ALLOCATION_ERROR;
900	return;
901	}
902	collator_ = dynamic_cast<RuleBasedCollator *>(coll);
903	if (collator_ == NULL) {
904	delete coll;
905	status = U_UNSUPPORTED_ERROR;
906	return;
907	}
908	}
909	collatorPrimaryOnly_ = collator_->clone();
910	if (collatorPrimaryOnly_ == NULL) {
911	status = U_MEMORY_ALLOCATION_ERROR;
912	return;
913	}
914	collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
915	firstCharsInScripts_ = firstStringsInScript(status);
916	if (U_FAILURE(status)) { return; }
917	firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
918	// Guard against a degenerate collator where
919	// some script boundary strings are primary ignorable.
920	for (;;) {
921	if (U_FAILURE(status)) { return; }
922	if (firstCharsInScripts_->isEmpty()) {
923	// AlphabeticIndex requires some non-ignorable script boundary strings.
924	status = U_ILLEGAL_ARGUMENT_ERROR;
925	return;
926	}
927	if (collatorPrimaryOnly_->compare(
928	*static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(`0`)),
929	emptyString_, status) == UCOL_EQUAL) {
930	firstCharsInScripts_->removeElementAt(`0`);
931	} else {
932	break;
933	}
934	}
935
936	// Chinese index characters, which are specific to each of the several Chinese tailorings,
937	// take precedence over the single locale data exemplar set per language.
938	if (!addChineseIndexCharacters(status) && locale != NULL) {
939	addIndexExemplars(*locale, status);
940	}
941	}
942
943
944	//
945	// Comparison function for UVector<UnicodeString > sorting with a collator.*
946	//
947	static int32_t U_CALLCONV
948	collatorComparator(const void context, const* void left, const* void *right) {
949	const UElement leftElement = static_cast<const* UElement *>(left);
950	const UElement rightElement = static_cast<const* UElement *>(right);
951	const UnicodeString leftString = static_cast<const* UnicodeString *>(leftElement->pointer);
952	const UnicodeString rightString = static_cast<const* UnicodeString *>(rightElement->pointer);
953
954	if (leftString == rightString) {
955	// Catches case where both are NULL
956	return `0`;
957	}
958	if (leftString == NULL) {
959	return `1`;
960	}
961	if (rightString == NULL) {
962	return -`1`;
963	}
964	const Collator col = static_cast<const* Collator *>(context);
965	UErrorCode errorCode = U_ZERO_ERROR;
966	return col->compare(leftString, rightString, errorCode);
967	}
968
969	//
970	// Comparison function for UVector<Record > sorting with a collator.*
971	//
972	static int32_t U_CALLCONV
973	recordCompareFn(const void context, const* void left, const* void *right) {
974	const UElement leftElement = static_cast<const* UElement *>(left);
975	const UElement rightElement = static_cast<const* UElement *>(right);
976	const AlphabeticIndex::Record leftRec = static_cast<const* AlphabeticIndex::Record *>(leftElement->pointer);
977	const AlphabeticIndex::Record rightRec = static_cast<const* AlphabeticIndex::Record *>(rightElement->pointer);
978	const Collator col = static_cast<const* Collator *>(context);
979	UErrorCode errorCode = U_ZERO_ERROR;
980	return col->compare(leftRec->name_, rightRec->name_, errorCode);
981	}
982
983	UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
984	if (U_FAILURE(status)) {
985	return NULL;
986	}
987	LocalPointer<UVector> dest(new UVector (status), status);
988	if (U_FAILURE(status)) {
989	return NULL;
990	}
991	dest ->setDeleter(uprv_deleteUObject);
992	// Fetch the script-first-primary contractions which are defined in the root collator.
993	// They all start with U+FDD1.
994	UnicodeSet set;
995	collatorPrimaryOnly_->internalAddContractions(`0xFDD1`, set, status);
996	if (U_FAILURE(status)) {
997	return NULL;
998	}
999	if (set.isEmpty()) {
1000	status = U_UNSUPPORTED_ERROR;
1001	return NULL;
1002	}
1003	UnicodeSetIterator iter(set);
1004	while (iter.next()) {
1005	const UnicodeString &boundary = iter.getString();
1006	uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(`1`));
1007	if ((gcMask & (U_GC_L_MASK \| U_GC_CN_MASK)) == `0`) {
1008	// Ignore boundaries for the special reordering groups.
1009	// Take only those for "real scripts" (where the sample character is a Letter,
1010	// and the one for unassigned implicit weights (Cn).
1011	continue;
1012	}
1013	UnicodeString s = new* UnicodeString (boundary);
1014	if (s == NULL) {
1015	status = U_MEMORY_ALLOCATION_ERROR;
1016	return NULL;
1017	}
1018	dest ->addElement(s, status);
1019	}
1020	return dest.orphan();
1021	}
1022
1023
1024	namespace {
1025
1026	/**
1027	* Returns true if one index character string is "better" than the other.
1028	* Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1029	* better, and otherwise binary-less-than is better.
1030	*/
1031	UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1032	const UnicodeString &one, const UnicodeString &other) {
1033	// This is called with primary-equal strings, but never with one.equals(other).
1034	UErrorCode status = U_ZERO_ERROR;
1035	UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1036	UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1037	if (U_FAILURE(status)) { return FALSE; }
1038	int32_t result = n1.countChar32() - n2.countChar32();
1039	if (result != `0`) {
1040	return result < `0`;
1041	}
1042	result = n1.compareCodePointOrder(n2);
1043	if (result != `0`) {
1044	return result < `0`;
1045	}
1046	return one.compareCodePointOrder(other) < `0`;
1047	}
1048
1049	} // namespace
1050
1051	//
1052	// Constructor & Destructor for AlphabeticIndex::Record
1053	//
1054	// Records are internal only, instances are not directly surfaced in the public API.
1055	// This class is mostly struct-like, with all public fields.
1056
1057	AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1058	: name_(name), data_(data) {}
1059
1060	AlphabeticIndex::Record::~Record() {
1061	}
1062
1063
1064	AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1065	if (U_FAILURE(status)) {
1066	return *this;
1067	}
1068	if (inputList_ == NULL) {
1069	inputList_ = new UVector (status);
1070	if (inputList_ == NULL) {
1071	status = U_MEMORY_ALLOCATION_ERROR;
1072	return *this;
1073	}
1074	inputList_->setDeleter(alphaIndex_deleteRecord);
1075	}
1076	Record r = new* Record (name, data);
1077	if (r == NULL) {
1078	status = U_MEMORY_ALLOCATION_ERROR;
1079	return *this;
1080	}
1081	inputList_->addElement(r, status);
1082	clearBuckets();
1083	//std::string ss;
1084	//std::string ss2;
1085	//std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1086	// " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1087	return *this;
1088	}
1089
1090
1091	AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1092	if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1093	inputList_->removeAllElements();
1094	clearBuckets();
1095	}
1096	return *this;
1097	}
1098
1099	int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1100	initBuckets(status);
1101	if (U_FAILURE(status)) {
1102	return `0`;
1103	}
1104	return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1105	}
1106
1107
1108	int32_t AlphabeticIndex::getBucketIndex() const {
1109	return labelsIterIndex_;
1110	}
1111
1112
1113	UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1114	if (U_FAILURE(status)) {
1115	return FALSE;
1116	}
1117	if (buckets_ == NULL && currentBucket_ != NULL) {
1118	status = U_ENUM_OUT_OF_SYNC_ERROR;
1119	return FALSE;
1120	}
1121	initBuckets(status);
1122	if (U_FAILURE(status)) {
1123	return FALSE;
1124	}
1125	++labelsIterIndex_;
1126	if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1127	labelsIterIndex_ = buckets_->getBucketCount();
1128	return FALSE;
1129	}
1130	currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1131	resetRecordIterator();
1132	return TRUE;
1133	}
1134
1135	const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1136	if (currentBucket_ != NULL) {
1137	return currentBucket_->label_;
1138	} else {
1139	return emptyString_;
1140	}
1141	}
1142
1143
1144	UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1145	if (currentBucket_ != NULL) {
1146	return currentBucket_->labelType_;
1147	} else {
1148	return U_ALPHAINDEX_NORMAL;
1149	}
1150	}
1151
1152
1153	int32_t AlphabeticIndex::getBucketRecordCount() const {
1154	if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1155	return currentBucket_->records_->size();
1156	} else {
1157	return `0`;
1158	}
1159	}
1160
1161
1162	AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1163	if (U_FAILURE(status)) {
1164	return *this;
1165	}
1166	internalResetBucketIterator();
1167	return *this;
1168	}
1169
1170
1171	UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1172	if (U_FAILURE(status)) {
1173	return FALSE;
1174	}
1175	if (currentBucket_ == NULL) {
1176	// We are trying to iterate over the items in a bucket, but there is no
1177	// current bucket from the enumeration of buckets.
1178	status = U_INVALID_STATE_ERROR;
1179	return FALSE;
1180	}
1181	if (buckets_ == NULL) {
1182	status = U_ENUM_OUT_OF_SYNC_ERROR;
1183	return FALSE;
1184	}
1185	if (currentBucket_->records_ == NULL) {
1186	return FALSE;
1187	}
1188	++itemsIterIndex_;
1189	if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1190	itemsIterIndex_ = currentBucket_->records_->size();
1191	return FALSE;
1192	}
1193	return TRUE;
1194	}
1195
1196
1197	const UnicodeString &AlphabeticIndex::getRecordName() const {
1198	const UnicodeString *retStr = &emptyString_;
1199	if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1200	itemsIterIndex_ >= `0` &&
1201	itemsIterIndex_ < currentBucket_->records_->size()) {
1202	Record item = static_cast<Record >(currentBucket_->records_->elementAt(itemsIterIndex_));
1203	retStr = &item->name_;
1204	}
1205	return *retStr;
1206	}
1207
1208	const void AlphabeticIndex::getRecordData() const* {
1209	const void *retPtr = NULL;
1210	if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1211	itemsIterIndex_ >= `0` &&
1212	itemsIterIndex_ < currentBucket_->records_->size()) {
1213	Record item = static_cast<Record >(currentBucket_->records_->elementAt(itemsIterIndex_));
1214	retPtr = item->data_;
1215	}
1216	return retPtr;
1217	}
1218
1219
1220	AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1221	itemsIterIndex_ = -`1`;
1222	return *this;
1223	}
1224
1225
1226
1227	AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1228	const UnicodeString &lowerBoundary,
1229	UAlphabeticIndexLabelType type)
1230	: label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1231	displayBucket_(NULL), displayIndex_(-`1`),
1232	records_(NULL) {
1233	}
1234
1235
1236	AlphabeticIndex::Bucket::~Bucket() {
1237	delete records_;
1238	}
1239
1240	U_NAMESPACE_END
1241
1242	#endif // !UCONFIG_NO_COLLATION
1243

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/alphaindex.cpp