coleitr.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/coleitr.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 1996-2014, International Business Machines Corporation and
6	* others. All Rights Reserved.
7	*******************************************************************************
8	*/
9
10	/*
11	* File coleitr.cpp
12	*
13	* Created by: Helena Shih
14	*
15	* Modification History:
16	*
17	* Date Name Description
18	*
19	* 6/23/97 helena Adding comments to make code more readable.
20	* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
21	* 12/10/99 aliu Ported Thai collation support from Java.
22	* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
23	* 02/19/01 swquek Removed CollationElementIterator() since it is
24	* private constructor and no calls are made to it
25	* 2012-2014 markus Rewritten in C++ again.
26	*/
27
28	#include "unicode/utypes.h"
29
30	#if !UCONFIG_NO_COLLATION
31
32	#include "unicode/chariter.h"
33	#include "unicode/coleitr.h"
34	#include "unicode/tblcoll.h"
35	#include "unicode/ustring.h"
36	#include "cmemory.h"
37	#include "collation.h"
38	#include "collationdata.h"
39	#include "collationiterator.h"
40	#include "collationsets.h"
41	#include "collationtailoring.h"
42	#include "uassert.h"
43	#include "uhash.h"
44	#include "utf16collationiterator.h"
45	#include "uvectr32.h"
46
47	/ Constants --------------------------------------------------------------- /
48
49	U_NAMESPACE_BEGIN
50
51	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
52
53	/ CollationElementIterator public constructor/destructor ------------------ /
54
55	CollationElementIterator::CollationElementIterator(
56	const CollationElementIterator& other)
57	: UObject (other), iter_(NULL), rbc_(NULL), otherHalf_(`0`), dir_(`0`), offsets_(NULL) {
58	*this = other;
59	}
60
61	CollationElementIterator::~CollationElementIterator()
62	{
63	delete iter_;
64	delete offsets_;
65	}
66
67	/ CollationElementIterator public methods --------------------------------- /
68
69	namespace {
70
71	uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
72	return (p & `0xffff0000`) \| ((lower32 >> `16`) & `0xff00`) \| ((lower32 >> `8`) & `0xff`);
73	}
74	uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
75	return (p << `16`) \| ((lower32 >> `8`) & `0xff00`) \| (lower32 & `0x3f`);
76	}
77	UBool ceNeedsTwoParts(int64_t ce) {
78	return (ce & INT64_C(`0xffff00ff003f`)) != `0`;
79	}
80
81	} // namespace
82
83	int32_t CollationElementIterator::getOffset() const
84	{
85	if (dir_ < `0` && offsets_ != NULL && !offsets_->isEmpty()) {
86	// CollationIterator::previousCE() decrements the CEs length
87	// while it pops CEs from its internal buffer.
88	int32_t i = iter_->getCEsLength();
89	if (otherHalf_ != `0`) {
90	// Return the trailing CE offset while we are in the middle of a 64-bit CE.
91	++i;
92	}
93	U_ASSERT(i < offsets_->size());
94	return offsets_->elementAti(i);
95	}
96	return iter_->getOffset();
97	}
98
99	/**
100	* Get the ordering priority of the next character in the string.
101	* @return the next character's ordering. Returns NULLORDER if an error has
102	* occured or if the end of string has been reached
103	*/
104	int32_t CollationElementIterator::next(UErrorCode& status)
105	{
106	if (U_FAILURE(status)) { return NULLORDER; }
107	if (dir_ > `1`) {
108	// Continue forward iteration. Test this first.
109	if (otherHalf_ != `0`) {
110	uint32_t oh = otherHalf_;
111	otherHalf_ = `0`;
112	return oh;
113	}
114	} else if (dir_ == `1`) {
115	// next() after setOffset()
116	dir_ = `2`;
117	} else if (dir_ == `0`) {
118	// The iter_ is already reset to the start of the text.
119	dir_ = `2`;
120	} else / dir_ < 0 / {
121	// illegal change of direction
122	status = U_INVALID_STATE_ERROR;
123	return NULLORDER;
124	}
125	// No need to keep all CEs in the buffer when we iterate.
126	iter_->clearCEsIfNoneRemaining();
127	int64_t ce = iter_->nextCE(status);
128	if (ce == Collation::NO_CE) { return NULLORDER; }
129	// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
130	uint32_t p = (uint32_t)(ce >> `32`);
131	uint32_t lower32 = (uint32_t)ce;
132	uint32_t firstHalf = getFirstHalf(p, lower32);
133	uint32_t secondHalf = getSecondHalf(p, lower32);
134	if (secondHalf != `0`) {
135	otherHalf_ = secondHalf \| `0xc0`; // continuation CE
136	}
137	return firstHalf;
138	}
139
140	UBool CollationElementIterator::operator!=(
141	const CollationElementIterator& other) const
142	{
143	return !(*this == other);
144	}
145
146	UBool CollationElementIterator::operator==(
147	const CollationElementIterator& that) const
148	{
149	if (this == &that) {
150	return TRUE;
151	}
152
153	return
154	(rbc_ == that.rbc_ \|\| rbc_ == that.rbc_) &&
155	otherHalf_ == that.otherHalf_ &&
156	normalizeDir() == that.normalizeDir() &&
157	string_ == that.string_ &&
158	iter_ == that.iter_;
159	}
160
161	/**
162	* Get the ordering priority of the previous collation element in the string.
163	* @param status the error code status.
164	* @return the previous element's ordering. Returns NULLORDER if an error has
165	* occured or if the start of string has been reached.
166	*/
167	int32_t CollationElementIterator::previous(UErrorCode& status)
168	{
169	if (U_FAILURE(status)) { return NULLORDER; }
170	if (dir_ < `0`) {
171	// Continue backwards iteration. Test this first.
172	if (otherHalf_ != `0`) {
173	uint32_t oh = otherHalf_;
174	otherHalf_ = `0`;
175	return oh;
176	}
177	} else if (dir_ == `0`) {
178	iter_->resetToOffset(string_.length());
179	dir_ = -`1`;
180	} else if (dir_ == `1`) {
181	// previous() after setOffset()
182	dir_ = -`1`;
183	} else / dir_ > 1 / {
184	// illegal change of direction
185	status = U_INVALID_STATE_ERROR;
186	return NULLORDER;
187	}
188	if (offsets_ == NULL) {
189	offsets_ = new UVector32 (status);
190	if (offsets_ == NULL) {
191	status = U_MEMORY_ALLOCATION_ERROR;
192	return NULLORDER;
193	}
194	}
195	// If we already have expansion CEs, then we also have offsets.
196	// Otherwise remember the trailing offset in case we need to
197	// write offsets for an artificial expansion.
198	int32_t limitOffset = iter_->getCEsLength() == `0` ? iter_->getOffset() : `0`;
199	int64_t ce = iter_->previousCE(*offsets_, status);
200	if (ce == Collation::NO_CE) { return NULLORDER; }
201	// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
202	uint32_t p = (uint32_t)(ce >> `32`);
203	uint32_t lower32 = (uint32_t)ce;
204	uint32_t firstHalf = getFirstHalf(p, lower32);
205	uint32_t secondHalf = getSecondHalf(p, lower32);
206	if (secondHalf != `0`) {
207	if (offsets_->isEmpty()) {
208	// When we convert a single 64-bit CE into two 32-bit CEs,
209	// we need to make this artificial expansion behave like a normal expansion.
210	// See CollationIterator::previousCE().
211	offsets_->addElement(iter_->getOffset(), status);
212	offsets_->addElement(limitOffset, status);
213	}
214	otherHalf_ = firstHalf;
215	return secondHalf \| `0xc0`; // continuation CE
216	}
217	return firstHalf;
218	}
219
220	/**
221	* Resets the cursor to the beginning of the string.
222	*/
223	void CollationElementIterator::reset()
224	{
225	iter_ ->resetToOffset(`0`);
226	otherHalf_ = `0`;
227	dir_ = `0`;
228	}
229
230	void CollationElementIterator::setOffset(int32_t newOffset,
231	UErrorCode& status)
232	{
233	if (U_FAILURE(status)) { return; }
234	if (`0` < newOffset && newOffset < string_.length()) {
235	int32_t offset = newOffset;
236	do {
237	UChar c = string_.charAt(offset);
238	if (!rbc_->isUnsafe(c) \|\|
239	(U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
240	break;
241	}
242	// Back up to before this unsafe character.
243	--offset;
244	} while (offset > `0`);
245	if (offset < newOffset) {
246	// We might have backed up more than necessary.
247	// For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
248	// but for text "chu" setOffset(2) should remain at 2
249	// although we initially back up to offset 0.
250	// Find the last safe offset no greater than newOffset by iterating forward.
251	int32_t lastSafeOffset = offset;
252	do {
253	iter_->resetToOffset(lastSafeOffset);
254	do {
255	iter_->nextCE(status);
256	if (U_FAILURE(status)) { return; }
257	} while ((offset = iter_->getOffset()) == lastSafeOffset);
258	if (offset <= newOffset) {
259	lastSafeOffset = offset;
260	}
261	} while (offset < newOffset);
262	newOffset = lastSafeOffset;
263	}
264	}
265	iter_->resetToOffset(newOffset);
266	otherHalf_ = `0`;
267	dir_ = `1`;
268	}
269
270	/**
271	* Sets the source to the new source string.
272	*/
273	void CollationElementIterator::setText(const UnicodeString& source,
274	UErrorCode& status)
275	{
276	if (U_FAILURE(status)) {
277	return;
278	}
279
280	string_ = source;
281	const UChar *s = string_.getBuffer();
282	CollationIterator *newIter;
283	UBool numeric = rbc_->settings->isNumeric();
284	if (rbc_->settings->dontCheckFCD()) {
285	newIter = new UTF16CollationIterator (rbc_->data, numeric, s, s, s + string_.length());
286	} else {
287	newIter = new FCDUTF16CollationIterator (rbc_->data, numeric, s, s, s + string_.length());
288	}
289	if (newIter == NULL) {
290	status = U_MEMORY_ALLOCATION_ERROR;
291	return;
292	}
293	delete iter_;
294	iter_ = newIter;
295	otherHalf_ = `0`;
296	dir_ = `0`;
297	}
298
299	// Sets the source to the new character iterator.
300	void CollationElementIterator::setText(CharacterIterator& source,
301	UErrorCode& status)
302	{
303	if (U_FAILURE(status))
304	return;
305
306	source.getText(string_);
307	setText(string_, status);
308	}
309
310	int32_t CollationElementIterator::strengthOrder(int32_t order) const
311	{
312	UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
313	// Mask off the unwanted differences.
314	if (s == UCOL_PRIMARY) {
315	order &= `0xffff0000`;
316	}
317	else if (s == UCOL_SECONDARY) {
318	order &= `0xffffff00`;
319	}
320
321	return order;
322	}
323
324	/ CollationElementIterator private constructors/destructors --------------- /
325
326	/**
327	* This is the "real" constructor for this class; it constructs an iterator
328	* over the source text using the specified collator
329	*/
330	CollationElementIterator::CollationElementIterator(
331	const UnicodeString &source,
332	const RuleBasedCollator *coll,
333	UErrorCode &status)
334	: iter_(NULL), rbc_(coll), otherHalf_(`0`), dir_(`0`), offsets_(NULL) {
335	setText(source, status);
336	}
337
338	/**
339	* This is the "real" constructor for this class; it constructs an iterator over
340	* the source text using the specified collator
341	*/
342	CollationElementIterator::CollationElementIterator(
343	const CharacterIterator &source,
344	const RuleBasedCollator *coll,
345	UErrorCode &status)
346	: iter_(NULL), rbc_(coll), otherHalf_(`0`), dir_(`0`), offsets_(NULL) {
347	// We only call source.getText() which should be const anyway.
348	setText(const_cast<CharacterIterator &>(source), status);
349	}
350
351	/ CollationElementIterator private methods -------------------------------- /
352
353	const CollationElementIterator& CollationElementIterator::operator=(
354	const CollationElementIterator& other)
355	{
356	if (this == &other) {
357	return *this;
358	}
359
360	CollationIterator *newIter;
361	const FCDUTF16CollationIterator *otherFCDIter =
362	dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
363	if(otherFCDIter != NULL) {
364	newIter = new FCDUTF16CollationIterator (*otherFCDIter, string_.getBuffer());
365	} else {
366	const UTF16CollationIterator *otherIter =
367	dynamic_cast<const UTF16CollationIterator *>(other.iter_);
368	if(otherIter != NULL) {
369	newIter = new UTF16CollationIterator (*otherIter, string_.getBuffer());
370	} else {
371	newIter = NULL;
372	}
373	}
374	if(newIter != NULL) {
375	delete iter_;
376	iter_ = newIter;
377	rbc_ = other.rbc_;
378	otherHalf_ = other.otherHalf_;
379	dir_ = other.dir_;
380
381	string_ = other.string_;
382	}
383	if(other.dir_ < `0` && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
384	UErrorCode errorCode = U_ZERO_ERROR;
385	if(offsets_ == NULL) {
386	offsets_ = new UVector32 (other.offsets_->size(), errorCode);
387	}
388	if(offsets_ != NULL) {
389	offsets_->assign(*other.offsets_, errorCode);
390	}
391	}
392	return *this;
393	}
394
395	namespace {
396
397	class MaxExpSink : public ContractionsAndExpansions::CESink {
398	public:
399	MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
400	virtual ~MaxExpSink();
401	virtual void handleCE(int64_t /ce/) {}
402	virtual void handleExpansion(const int64_t ces[], int32_t length) {
403	if (length <= `1`) {
404	// We do not need to add single CEs into the map.
405	return;
406	}
407	int32_t count = `0`; // number of CE "halves"
408	for (int32_t i = `0`; i < length; ++i) {
409	count += ceNeedsTwoParts(ces[i]) ? `2` : `1`;
410	}
411	// last "half" of the last CE
412	int64_t ce = ces[length - `1`];
413	uint32_t p = (uint32_t)(ce >> `32`);
414	uint32_t lower32 = (uint32_t)ce;
415	uint32_t lastHalf = getSecondHalf(p, lower32);
416	if (lastHalf == `0`) {
417	lastHalf = getFirstHalf(p, lower32);
418	U_ASSERT(lastHalf != `0`);
419	} else {
420	lastHalf \|= `0xc0`; // old-style continuation CE
421	}
422	if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
423	uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
424	}
425	}
426
427	private:
428	UHashtable *maxExpansions;
429	UErrorCode &errorCode;
430	};
431
432	MaxExpSink::~MaxExpSink() {}
433
434	} // namespace
435
436	UHashtable *
437	CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
438	if (U_FAILURE(errorCode)) { return NULL; }
439	UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
440	uhash_compareLong, &errorCode);
441	if (U_FAILURE(errorCode)) { return NULL; }
442	MaxExpSink sink(maxExpansions, errorCode);
443	ContractionsAndExpansions (NULL, NULL, &sink, TRUE).forData(data, errorCode);
444	if (U_FAILURE(errorCode)) {
445	uhash_close(maxExpansions);
446	return NULL;
447	}
448	return maxExpansions;
449	}
450
451	int32_t
452	CollationElementIterator::getMaxExpansion(int32_t order) const {
453	return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
454	}
455
456	int32_t
457	CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
458	if (order == `0`) { return `1`; }
459	int32_t max;
460	if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != `0`) {
461	return max;
462	}
463	if ((order & `0xc0`) == `0xc0`) {
464	// old-style continuation CE
465	return `2`;
466	} else {
467	return `1`;
468	}
469	}
470
471	U_NAMESPACE_END
472
473	#endif /* #if !UCONFIG_NO_COLLATION */
474

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/coleitr.cpp