1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 1996-2014, International Business Machines Corporation and
6* others. All Rights Reserved.
7*******************************************************************************
8*/
9
10/*
11* File coleitr.cpp
12*
13* Created by: Helena Shih
14*
15* Modification History:
16*
17* Date Name Description
18*
19* 6/23/97 helena Adding comments to make code more readable.
20* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
21* 12/10/99 aliu Ported Thai collation support from Java.
22* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
23* 02/19/01 swquek Removed CollationElementIterator() since it is
24* private constructor and no calls are made to it
25* 2012-2014 markus Rewritten in C++ again.
26*/
27
28#include "unicode/utypes.h"
29
30#if !UCONFIG_NO_COLLATION
31
32#include "unicode/chariter.h"
33#include "unicode/coleitr.h"
34#include "unicode/tblcoll.h"
35#include "unicode/ustring.h"
36#include "cmemory.h"
37#include "collation.h"
38#include "collationdata.h"
39#include "collationiterator.h"
40#include "collationsets.h"
41#include "collationtailoring.h"
42#include "uassert.h"
43#include "uhash.h"
44#include "utf16collationiterator.h"
45#include "uvectr32.h"
46
47/* Constants --------------------------------------------------------------- */
48
49U_NAMESPACE_BEGIN
50
51UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
52
53/* CollationElementIterator public constructor/destructor ------------------ */
54
55CollationElementIterator::CollationElementIterator(
56 const CollationElementIterator& other)
57 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
58 *this = other;
59}
60
61CollationElementIterator::~CollationElementIterator()
62{
63 delete iter_;
64 delete offsets_;
65}
66
67/* CollationElementIterator public methods --------------------------------- */
68
69namespace {
70
71uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
72 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
73}
74uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
75 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
76}
77UBool ceNeedsTwoParts(int64_t ce) {
78 return (ce & INT64_C(0xffff00ff003f)) != 0;
79}
80
81} // namespace
82
83int32_t CollationElementIterator::getOffset() const
84{
85 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
86 // CollationIterator::previousCE() decrements the CEs length
87 // while it pops CEs from its internal buffer.
88 int32_t i = iter_->getCEsLength();
89 if (otherHalf_ != 0) {
90 // Return the trailing CE offset while we are in the middle of a 64-bit CE.
91 ++i;
92 }
93 U_ASSERT(i < offsets_->size());
94 return offsets_->elementAti(i);
95 }
96 return iter_->getOffset();
97}
98
99/**
100* Get the ordering priority of the next character in the string.
101* @return the next character's ordering. Returns NULLORDER if an error has
102* occured or if the end of string has been reached
103*/
104int32_t CollationElementIterator::next(UErrorCode& status)
105{
106 if (U_FAILURE(status)) { return NULLORDER; }
107 if (dir_ > 1) {
108 // Continue forward iteration. Test this first.
109 if (otherHalf_ != 0) {
110 uint32_t oh = otherHalf_;
111 otherHalf_ = 0;
112 return oh;
113 }
114 } else if (dir_ == 1) {
115 // next() after setOffset()
116 dir_ = 2;
117 } else if (dir_ == 0) {
118 // The iter_ is already reset to the start of the text.
119 dir_ = 2;
120 } else /* dir_ < 0 */ {
121 // illegal change of direction
122 status = U_INVALID_STATE_ERROR;
123 return NULLORDER;
124 }
125 // No need to keep all CEs in the buffer when we iterate.
126 iter_->clearCEsIfNoneRemaining();
127 int64_t ce = iter_->nextCE(status);
128 if (ce == Collation::NO_CE) { return NULLORDER; }
129 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
130 uint32_t p = (uint32_t)(ce >> 32);
131 uint32_t lower32 = (uint32_t)ce;
132 uint32_t firstHalf = getFirstHalf(p, lower32);
133 uint32_t secondHalf = getSecondHalf(p, lower32);
134 if (secondHalf != 0) {
135 otherHalf_ = secondHalf | 0xc0; // continuation CE
136 }
137 return firstHalf;
138}
139
140UBool CollationElementIterator::operator!=(
141 const CollationElementIterator& other) const
142{
143 return !(*this == other);
144}
145
146UBool CollationElementIterator::operator==(
147 const CollationElementIterator& that) const
148{
149 if (this == &that) {
150 return TRUE;
151 }
152
153 return
154 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
155 otherHalf_ == that.otherHalf_ &&
156 normalizeDir() == that.normalizeDir() &&
157 string_ == that.string_ &&
158 *iter_ == *that.iter_;
159}
160
161/**
162* Get the ordering priority of the previous collation element in the string.
163* @param status the error code status.
164* @return the previous element's ordering. Returns NULLORDER if an error has
165* occured or if the start of string has been reached.
166*/
167int32_t CollationElementIterator::previous(UErrorCode& status)
168{
169 if (U_FAILURE(status)) { return NULLORDER; }
170 if (dir_ < 0) {
171 // Continue backwards iteration. Test this first.
172 if (otherHalf_ != 0) {
173 uint32_t oh = otherHalf_;
174 otherHalf_ = 0;
175 return oh;
176 }
177 } else if (dir_ == 0) {
178 iter_->resetToOffset(string_.length());
179 dir_ = -1;
180 } else if (dir_ == 1) {
181 // previous() after setOffset()
182 dir_ = -1;
183 } else /* dir_ > 1 */ {
184 // illegal change of direction
185 status = U_INVALID_STATE_ERROR;
186 return NULLORDER;
187 }
188 if (offsets_ == NULL) {
189 offsets_ = new UVector32(status);
190 if (offsets_ == NULL) {
191 status = U_MEMORY_ALLOCATION_ERROR;
192 return NULLORDER;
193 }
194 }
195 // If we already have expansion CEs, then we also have offsets.
196 // Otherwise remember the trailing offset in case we need to
197 // write offsets for an artificial expansion.
198 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
199 int64_t ce = iter_->previousCE(*offsets_, status);
200 if (ce == Collation::NO_CE) { return NULLORDER; }
201 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
202 uint32_t p = (uint32_t)(ce >> 32);
203 uint32_t lower32 = (uint32_t)ce;
204 uint32_t firstHalf = getFirstHalf(p, lower32);
205 uint32_t secondHalf = getSecondHalf(p, lower32);
206 if (secondHalf != 0) {
207 if (offsets_->isEmpty()) {
208 // When we convert a single 64-bit CE into two 32-bit CEs,
209 // we need to make this artificial expansion behave like a normal expansion.
210 // See CollationIterator::previousCE().
211 offsets_->addElement(iter_->getOffset(), status);
212 offsets_->addElement(limitOffset, status);
213 }
214 otherHalf_ = firstHalf;
215 return secondHalf | 0xc0; // continuation CE
216 }
217 return firstHalf;
218}
219
220/**
221* Resets the cursor to the beginning of the string.
222*/
223void CollationElementIterator::reset()
224{
225 iter_ ->resetToOffset(0);
226 otherHalf_ = 0;
227 dir_ = 0;
228}
229
230void CollationElementIterator::setOffset(int32_t newOffset,
231 UErrorCode& status)
232{
233 if (U_FAILURE(status)) { return; }
234 if (0 < newOffset && newOffset < string_.length()) {
235 int32_t offset = newOffset;
236 do {
237 UChar c = string_.charAt(offset);
238 if (!rbc_->isUnsafe(c) ||
239 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
240 break;
241 }
242 // Back up to before this unsafe character.
243 --offset;
244 } while (offset > 0);
245 if (offset < newOffset) {
246 // We might have backed up more than necessary.
247 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
248 // but for text "chu" setOffset(2) should remain at 2
249 // although we initially back up to offset 0.
250 // Find the last safe offset no greater than newOffset by iterating forward.
251 int32_t lastSafeOffset = offset;
252 do {
253 iter_->resetToOffset(lastSafeOffset);
254 do {
255 iter_->nextCE(status);
256 if (U_FAILURE(status)) { return; }
257 } while ((offset = iter_->getOffset()) == lastSafeOffset);
258 if (offset <= newOffset) {
259 lastSafeOffset = offset;
260 }
261 } while (offset < newOffset);
262 newOffset = lastSafeOffset;
263 }
264 }
265 iter_->resetToOffset(newOffset);
266 otherHalf_ = 0;
267 dir_ = 1;
268}
269
270/**
271* Sets the source to the new source string.
272*/
273void CollationElementIterator::setText(const UnicodeString& source,
274 UErrorCode& status)
275{
276 if (U_FAILURE(status)) {
277 return;
278 }
279
280 string_ = source;
281 const UChar *s = string_.getBuffer();
282 CollationIterator *newIter;
283 UBool numeric = rbc_->settings->isNumeric();
284 if (rbc_->settings->dontCheckFCD()) {
285 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
286 } else {
287 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
288 }
289 if (newIter == NULL) {
290 status = U_MEMORY_ALLOCATION_ERROR;
291 return;
292 }
293 delete iter_;
294 iter_ = newIter;
295 otherHalf_ = 0;
296 dir_ = 0;
297}
298
299// Sets the source to the new character iterator.
300void CollationElementIterator::setText(CharacterIterator& source,
301 UErrorCode& status)
302{
303 if (U_FAILURE(status))
304 return;
305
306 source.getText(string_);
307 setText(string_, status);
308}
309
310int32_t CollationElementIterator::strengthOrder(int32_t order) const
311{
312 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
313 // Mask off the unwanted differences.
314 if (s == UCOL_PRIMARY) {
315 order &= 0xffff0000;
316 }
317 else if (s == UCOL_SECONDARY) {
318 order &= 0xffffff00;
319 }
320
321 return order;
322}
323
324/* CollationElementIterator private constructors/destructors --------------- */
325
326/**
327* This is the "real" constructor for this class; it constructs an iterator
328* over the source text using the specified collator
329*/
330CollationElementIterator::CollationElementIterator(
331 const UnicodeString &source,
332 const RuleBasedCollator *coll,
333 UErrorCode &status)
334 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
335 setText(source, status);
336}
337
338/**
339* This is the "real" constructor for this class; it constructs an iterator over
340* the source text using the specified collator
341*/
342CollationElementIterator::CollationElementIterator(
343 const CharacterIterator &source,
344 const RuleBasedCollator *coll,
345 UErrorCode &status)
346 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
347 // We only call source.getText() which should be const anyway.
348 setText(const_cast<CharacterIterator &>(source), status);
349}
350
351/* CollationElementIterator private methods -------------------------------- */
352
353const CollationElementIterator& CollationElementIterator::operator=(
354 const CollationElementIterator& other)
355{
356 if (this == &other) {
357 return *this;
358 }
359
360 CollationIterator *newIter;
361 const FCDUTF16CollationIterator *otherFCDIter =
362 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
363 if(otherFCDIter != NULL) {
364 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
365 } else {
366 const UTF16CollationIterator *otherIter =
367 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
368 if(otherIter != NULL) {
369 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
370 } else {
371 newIter = NULL;
372 }
373 }
374 if(newIter != NULL) {
375 delete iter_;
376 iter_ = newIter;
377 rbc_ = other.rbc_;
378 otherHalf_ = other.otherHalf_;
379 dir_ = other.dir_;
380
381 string_ = other.string_;
382 }
383 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
384 UErrorCode errorCode = U_ZERO_ERROR;
385 if(offsets_ == NULL) {
386 offsets_ = new UVector32(other.offsets_->size(), errorCode);
387 }
388 if(offsets_ != NULL) {
389 offsets_->assign(*other.offsets_, errorCode);
390 }
391 }
392 return *this;
393}
394
395namespace {
396
397class MaxExpSink : public ContractionsAndExpansions::CESink {
398public:
399 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
400 virtual ~MaxExpSink();
401 virtual void handleCE(int64_t /*ce*/) {}
402 virtual void handleExpansion(const int64_t ces[], int32_t length) {
403 if (length <= 1) {
404 // We do not need to add single CEs into the map.
405 return;
406 }
407 int32_t count = 0; // number of CE "halves"
408 for (int32_t i = 0; i < length; ++i) {
409 count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
410 }
411 // last "half" of the last CE
412 int64_t ce = ces[length - 1];
413 uint32_t p = (uint32_t)(ce >> 32);
414 uint32_t lower32 = (uint32_t)ce;
415 uint32_t lastHalf = getSecondHalf(p, lower32);
416 if (lastHalf == 0) {
417 lastHalf = getFirstHalf(p, lower32);
418 U_ASSERT(lastHalf != 0);
419 } else {
420 lastHalf |= 0xc0; // old-style continuation CE
421 }
422 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
423 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
424 }
425 }
426
427private:
428 UHashtable *maxExpansions;
429 UErrorCode &errorCode;
430};
431
432MaxExpSink::~MaxExpSink() {}
433
434} // namespace
435
436UHashtable *
437CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
438 if (U_FAILURE(errorCode)) { return NULL; }
439 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
440 uhash_compareLong, &errorCode);
441 if (U_FAILURE(errorCode)) { return NULL; }
442 MaxExpSink sink(maxExpansions, errorCode);
443 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
444 if (U_FAILURE(errorCode)) {
445 uhash_close(maxExpansions);
446 return NULL;
447 }
448 return maxExpansions;
449}
450
451int32_t
452CollationElementIterator::getMaxExpansion(int32_t order) const {
453 return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
454}
455
456int32_t
457CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
458 if (order == 0) { return 1; }
459 int32_t max;
460 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
461 return max;
462 }
463 if ((order & 0xc0) == 0xc0) {
464 // old-style continuation CE
465 return 2;
466 } else {
467 return 1;
468 }
469}
470
471U_NAMESPACE_END
472
473#endif /* #if !UCONFIG_NO_COLLATION */
474