1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 1996-2014, International Business Machines Corporation and |
6 | * others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | */ |
9 | |
10 | /* |
11 | * File coleitr.cpp |
12 | * |
13 | * Created by: Helena Shih |
14 | * |
15 | * Modification History: |
16 | * |
17 | * Date Name Description |
18 | * |
19 | * 6/23/97 helena Adding comments to make code more readable. |
20 | * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java |
21 | * 12/10/99 aliu Ported Thai collation support from Java. |
22 | * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) |
23 | * 02/19/01 swquek Removed CollationElementIterator() since it is |
24 | * private constructor and no calls are made to it |
25 | * 2012-2014 markus Rewritten in C++ again. |
26 | */ |
27 | |
28 | #include "unicode/utypes.h" |
29 | |
30 | #if !UCONFIG_NO_COLLATION |
31 | |
32 | #include "unicode/chariter.h" |
33 | #include "unicode/coleitr.h" |
34 | #include "unicode/tblcoll.h" |
35 | #include "unicode/ustring.h" |
36 | #include "cmemory.h" |
37 | #include "collation.h" |
38 | #include "collationdata.h" |
39 | #include "collationiterator.h" |
40 | #include "collationsets.h" |
41 | #include "collationtailoring.h" |
42 | #include "uassert.h" |
43 | #include "uhash.h" |
44 | #include "utf16collationiterator.h" |
45 | #include "uvectr32.h" |
46 | |
47 | /* Constants --------------------------------------------------------------- */ |
48 | |
49 | U_NAMESPACE_BEGIN |
50 | |
51 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) |
52 | |
53 | /* CollationElementIterator public constructor/destructor ------------------ */ |
54 | |
55 | CollationElementIterator::CollationElementIterator( |
56 | const CollationElementIterator& other) |
57 | : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { |
58 | *this = other; |
59 | } |
60 | |
61 | CollationElementIterator::~CollationElementIterator() |
62 | { |
63 | delete iter_; |
64 | delete offsets_; |
65 | } |
66 | |
67 | /* CollationElementIterator public methods --------------------------------- */ |
68 | |
69 | namespace { |
70 | |
71 | uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { |
72 | return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); |
73 | } |
74 | uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { |
75 | return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); |
76 | } |
77 | UBool ceNeedsTwoParts(int64_t ce) { |
78 | return (ce & INT64_C(0xffff00ff003f)) != 0; |
79 | } |
80 | |
81 | } // namespace |
82 | |
83 | int32_t CollationElementIterator::getOffset() const |
84 | { |
85 | if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { |
86 | // CollationIterator::previousCE() decrements the CEs length |
87 | // while it pops CEs from its internal buffer. |
88 | int32_t i = iter_->getCEsLength(); |
89 | if (otherHalf_ != 0) { |
90 | // Return the trailing CE offset while we are in the middle of a 64-bit CE. |
91 | ++i; |
92 | } |
93 | U_ASSERT(i < offsets_->size()); |
94 | return offsets_->elementAti(i); |
95 | } |
96 | return iter_->getOffset(); |
97 | } |
98 | |
99 | /** |
100 | * Get the ordering priority of the next character in the string. |
101 | * @return the next character's ordering. Returns NULLORDER if an error has |
102 | * occured or if the end of string has been reached |
103 | */ |
104 | int32_t CollationElementIterator::next(UErrorCode& status) |
105 | { |
106 | if (U_FAILURE(status)) { return NULLORDER; } |
107 | if (dir_ > 1) { |
108 | // Continue forward iteration. Test this first. |
109 | if (otherHalf_ != 0) { |
110 | uint32_t oh = otherHalf_; |
111 | otherHalf_ = 0; |
112 | return oh; |
113 | } |
114 | } else if (dir_ == 1) { |
115 | // next() after setOffset() |
116 | dir_ = 2; |
117 | } else if (dir_ == 0) { |
118 | // The iter_ is already reset to the start of the text. |
119 | dir_ = 2; |
120 | } else /* dir_ < 0 */ { |
121 | // illegal change of direction |
122 | status = U_INVALID_STATE_ERROR; |
123 | return NULLORDER; |
124 | } |
125 | // No need to keep all CEs in the buffer when we iterate. |
126 | iter_->clearCEsIfNoneRemaining(); |
127 | int64_t ce = iter_->nextCE(status); |
128 | if (ce == Collation::NO_CE) { return NULLORDER; } |
129 | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. |
130 | uint32_t p = (uint32_t)(ce >> 32); |
131 | uint32_t lower32 = (uint32_t)ce; |
132 | uint32_t firstHalf = getFirstHalf(p, lower32); |
133 | uint32_t secondHalf = getSecondHalf(p, lower32); |
134 | if (secondHalf != 0) { |
135 | otherHalf_ = secondHalf | 0xc0; // continuation CE |
136 | } |
137 | return firstHalf; |
138 | } |
139 | |
140 | UBool CollationElementIterator::operator!=( |
141 | const CollationElementIterator& other) const |
142 | { |
143 | return !(*this == other); |
144 | } |
145 | |
146 | UBool CollationElementIterator::operator==( |
147 | const CollationElementIterator& that) const |
148 | { |
149 | if (this == &that) { |
150 | return TRUE; |
151 | } |
152 | |
153 | return |
154 | (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && |
155 | otherHalf_ == that.otherHalf_ && |
156 | normalizeDir() == that.normalizeDir() && |
157 | string_ == that.string_ && |
158 | *iter_ == *that.iter_; |
159 | } |
160 | |
161 | /** |
162 | * Get the ordering priority of the previous collation element in the string. |
163 | * @param status the error code status. |
164 | * @return the previous element's ordering. Returns NULLORDER if an error has |
165 | * occured or if the start of string has been reached. |
166 | */ |
167 | int32_t CollationElementIterator::previous(UErrorCode& status) |
168 | { |
169 | if (U_FAILURE(status)) { return NULLORDER; } |
170 | if (dir_ < 0) { |
171 | // Continue backwards iteration. Test this first. |
172 | if (otherHalf_ != 0) { |
173 | uint32_t oh = otherHalf_; |
174 | otherHalf_ = 0; |
175 | return oh; |
176 | } |
177 | } else if (dir_ == 0) { |
178 | iter_->resetToOffset(string_.length()); |
179 | dir_ = -1; |
180 | } else if (dir_ == 1) { |
181 | // previous() after setOffset() |
182 | dir_ = -1; |
183 | } else /* dir_ > 1 */ { |
184 | // illegal change of direction |
185 | status = U_INVALID_STATE_ERROR; |
186 | return NULLORDER; |
187 | } |
188 | if (offsets_ == NULL) { |
189 | offsets_ = new UVector32(status); |
190 | if (offsets_ == NULL) { |
191 | status = U_MEMORY_ALLOCATION_ERROR; |
192 | return NULLORDER; |
193 | } |
194 | } |
195 | // If we already have expansion CEs, then we also have offsets. |
196 | // Otherwise remember the trailing offset in case we need to |
197 | // write offsets for an artificial expansion. |
198 | int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; |
199 | int64_t ce = iter_->previousCE(*offsets_, status); |
200 | if (ce == Collation::NO_CE) { return NULLORDER; } |
201 | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. |
202 | uint32_t p = (uint32_t)(ce >> 32); |
203 | uint32_t lower32 = (uint32_t)ce; |
204 | uint32_t firstHalf = getFirstHalf(p, lower32); |
205 | uint32_t secondHalf = getSecondHalf(p, lower32); |
206 | if (secondHalf != 0) { |
207 | if (offsets_->isEmpty()) { |
208 | // When we convert a single 64-bit CE into two 32-bit CEs, |
209 | // we need to make this artificial expansion behave like a normal expansion. |
210 | // See CollationIterator::previousCE(). |
211 | offsets_->addElement(iter_->getOffset(), status); |
212 | offsets_->addElement(limitOffset, status); |
213 | } |
214 | otherHalf_ = firstHalf; |
215 | return secondHalf | 0xc0; // continuation CE |
216 | } |
217 | return firstHalf; |
218 | } |
219 | |
220 | /** |
221 | * Resets the cursor to the beginning of the string. |
222 | */ |
223 | void CollationElementIterator::reset() |
224 | { |
225 | iter_ ->resetToOffset(0); |
226 | otherHalf_ = 0; |
227 | dir_ = 0; |
228 | } |
229 | |
230 | void CollationElementIterator::setOffset(int32_t newOffset, |
231 | UErrorCode& status) |
232 | { |
233 | if (U_FAILURE(status)) { return; } |
234 | if (0 < newOffset && newOffset < string_.length()) { |
235 | int32_t offset = newOffset; |
236 | do { |
237 | UChar c = string_.charAt(offset); |
238 | if (!rbc_->isUnsafe(c) || |
239 | (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { |
240 | break; |
241 | } |
242 | // Back up to before this unsafe character. |
243 | --offset; |
244 | } while (offset > 0); |
245 | if (offset < newOffset) { |
246 | // We might have backed up more than necessary. |
247 | // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, |
248 | // but for text "chu" setOffset(2) should remain at 2 |
249 | // although we initially back up to offset 0. |
250 | // Find the last safe offset no greater than newOffset by iterating forward. |
251 | int32_t lastSafeOffset = offset; |
252 | do { |
253 | iter_->resetToOffset(lastSafeOffset); |
254 | do { |
255 | iter_->nextCE(status); |
256 | if (U_FAILURE(status)) { return; } |
257 | } while ((offset = iter_->getOffset()) == lastSafeOffset); |
258 | if (offset <= newOffset) { |
259 | lastSafeOffset = offset; |
260 | } |
261 | } while (offset < newOffset); |
262 | newOffset = lastSafeOffset; |
263 | } |
264 | } |
265 | iter_->resetToOffset(newOffset); |
266 | otherHalf_ = 0; |
267 | dir_ = 1; |
268 | } |
269 | |
270 | /** |
271 | * Sets the source to the new source string. |
272 | */ |
273 | void CollationElementIterator::setText(const UnicodeString& source, |
274 | UErrorCode& status) |
275 | { |
276 | if (U_FAILURE(status)) { |
277 | return; |
278 | } |
279 | |
280 | string_ = source; |
281 | const UChar *s = string_.getBuffer(); |
282 | CollationIterator *newIter; |
283 | UBool numeric = rbc_->settings->isNumeric(); |
284 | if (rbc_->settings->dontCheckFCD()) { |
285 | newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); |
286 | } else { |
287 | newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); |
288 | } |
289 | if (newIter == NULL) { |
290 | status = U_MEMORY_ALLOCATION_ERROR; |
291 | return; |
292 | } |
293 | delete iter_; |
294 | iter_ = newIter; |
295 | otherHalf_ = 0; |
296 | dir_ = 0; |
297 | } |
298 | |
299 | // Sets the source to the new character iterator. |
300 | void CollationElementIterator::setText(CharacterIterator& source, |
301 | UErrorCode& status) |
302 | { |
303 | if (U_FAILURE(status)) |
304 | return; |
305 | |
306 | source.getText(string_); |
307 | setText(string_, status); |
308 | } |
309 | |
310 | int32_t CollationElementIterator::strengthOrder(int32_t order) const |
311 | { |
312 | UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); |
313 | // Mask off the unwanted differences. |
314 | if (s == UCOL_PRIMARY) { |
315 | order &= 0xffff0000; |
316 | } |
317 | else if (s == UCOL_SECONDARY) { |
318 | order &= 0xffffff00; |
319 | } |
320 | |
321 | return order; |
322 | } |
323 | |
324 | /* CollationElementIterator private constructors/destructors --------------- */ |
325 | |
326 | /** |
327 | * This is the "real" constructor for this class; it constructs an iterator |
328 | * over the source text using the specified collator |
329 | */ |
330 | CollationElementIterator::CollationElementIterator( |
331 | const UnicodeString &source, |
332 | const RuleBasedCollator *coll, |
333 | UErrorCode &status) |
334 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
335 | setText(source, status); |
336 | } |
337 | |
338 | /** |
339 | * This is the "real" constructor for this class; it constructs an iterator over |
340 | * the source text using the specified collator |
341 | */ |
342 | CollationElementIterator::CollationElementIterator( |
343 | const CharacterIterator &source, |
344 | const RuleBasedCollator *coll, |
345 | UErrorCode &status) |
346 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
347 | // We only call source.getText() which should be const anyway. |
348 | setText(const_cast<CharacterIterator &>(source), status); |
349 | } |
350 | |
351 | /* CollationElementIterator private methods -------------------------------- */ |
352 | |
353 | const CollationElementIterator& CollationElementIterator::operator=( |
354 | const CollationElementIterator& other) |
355 | { |
356 | if (this == &other) { |
357 | return *this; |
358 | } |
359 | |
360 | CollationIterator *newIter; |
361 | const FCDUTF16CollationIterator *otherFCDIter = |
362 | dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); |
363 | if(otherFCDIter != NULL) { |
364 | newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); |
365 | } else { |
366 | const UTF16CollationIterator *otherIter = |
367 | dynamic_cast<const UTF16CollationIterator *>(other.iter_); |
368 | if(otherIter != NULL) { |
369 | newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); |
370 | } else { |
371 | newIter = NULL; |
372 | } |
373 | } |
374 | if(newIter != NULL) { |
375 | delete iter_; |
376 | iter_ = newIter; |
377 | rbc_ = other.rbc_; |
378 | otherHalf_ = other.otherHalf_; |
379 | dir_ = other.dir_; |
380 | |
381 | string_ = other.string_; |
382 | } |
383 | if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { |
384 | UErrorCode errorCode = U_ZERO_ERROR; |
385 | if(offsets_ == NULL) { |
386 | offsets_ = new UVector32(other.offsets_->size(), errorCode); |
387 | } |
388 | if(offsets_ != NULL) { |
389 | offsets_->assign(*other.offsets_, errorCode); |
390 | } |
391 | } |
392 | return *this; |
393 | } |
394 | |
395 | namespace { |
396 | |
397 | class MaxExpSink : public ContractionsAndExpansions::CESink { |
398 | public: |
399 | MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} |
400 | virtual ~MaxExpSink(); |
401 | virtual void handleCE(int64_t /*ce*/) {} |
402 | virtual void handleExpansion(const int64_t ces[], int32_t length) { |
403 | if (length <= 1) { |
404 | // We do not need to add single CEs into the map. |
405 | return; |
406 | } |
407 | int32_t count = 0; // number of CE "halves" |
408 | for (int32_t i = 0; i < length; ++i) { |
409 | count += ceNeedsTwoParts(ces[i]) ? 2 : 1; |
410 | } |
411 | // last "half" of the last CE |
412 | int64_t ce = ces[length - 1]; |
413 | uint32_t p = (uint32_t)(ce >> 32); |
414 | uint32_t lower32 = (uint32_t)ce; |
415 | uint32_t lastHalf = getSecondHalf(p, lower32); |
416 | if (lastHalf == 0) { |
417 | lastHalf = getFirstHalf(p, lower32); |
418 | U_ASSERT(lastHalf != 0); |
419 | } else { |
420 | lastHalf |= 0xc0; // old-style continuation CE |
421 | } |
422 | if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { |
423 | uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); |
424 | } |
425 | } |
426 | |
427 | private: |
428 | UHashtable *maxExpansions; |
429 | UErrorCode &errorCode; |
430 | }; |
431 | |
432 | MaxExpSink::~MaxExpSink() {} |
433 | |
434 | } // namespace |
435 | |
436 | UHashtable * |
437 | CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { |
438 | if (U_FAILURE(errorCode)) { return NULL; } |
439 | UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, |
440 | uhash_compareLong, &errorCode); |
441 | if (U_FAILURE(errorCode)) { return NULL; } |
442 | MaxExpSink sink(maxExpansions, errorCode); |
443 | ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); |
444 | if (U_FAILURE(errorCode)) { |
445 | uhash_close(maxExpansions); |
446 | return NULL; |
447 | } |
448 | return maxExpansions; |
449 | } |
450 | |
451 | int32_t |
452 | CollationElementIterator::getMaxExpansion(int32_t order) const { |
453 | return getMaxExpansion(rbc_->tailoring->maxExpansions, order); |
454 | } |
455 | |
456 | int32_t |
457 | CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { |
458 | if (order == 0) { return 1; } |
459 | int32_t max; |
460 | if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { |
461 | return max; |
462 | } |
463 | if ((order & 0xc0) == 0xc0) { |
464 | // old-style continuation CE |
465 | return 2; |
466 | } else { |
467 | return 1; |
468 | } |
469 | } |
470 | |
471 | U_NAMESPACE_END |
472 | |
473 | #endif /* #if !UCONFIG_NO_COLLATION */ |
474 | |