1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2001-2015, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: ustrcase.cpp |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2002feb20 |
16 | * created by: Markus W. Scherer |
17 | * |
18 | * Implementation file for string casing C API functions. |
19 | * Uses functions from uchar.c for basic functionality that requires access |
20 | * to the Unicode Character Database (uprops.dat). |
21 | */ |
22 | |
23 | #include "unicode/utypes.h" |
24 | #include "unicode/brkiter.h" |
25 | #include "unicode/casemap.h" |
26 | #include "unicode/edits.h" |
27 | #include "unicode/stringoptions.h" |
28 | #include "unicode/ustring.h" |
29 | #include "unicode/ucasemap.h" |
30 | #include "unicode/ubrk.h" |
31 | #include "unicode/utf.h" |
32 | #include "unicode/utf16.h" |
33 | #include "cmemory.h" |
34 | #include "ucase.h" |
35 | #include "ucasemap_imp.h" |
36 | #include "ustr_imp.h" |
37 | #include "uassert.h" |
38 | |
/**
 * Code point for COMBINING ACUTE ACCENT (U+0301), written as a char16_t literal.
 * The Dutch IJ titlecasing code compares source code units against it
 * to recognize a decomposed i/I or j/J with acute.
 * @internal
 */
#define ACUTE u'\u0301'
44 | |
45 | U_NAMESPACE_BEGIN |
46 | |
47 | namespace { |
48 | |
49 | int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity, |
50 | Edits *edits, UErrorCode &errorCode) { |
51 | if (U_SUCCESS(errorCode)) { |
52 | if (destIndex > destCapacity) { |
53 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
54 | } else if (edits != nullptr) { |
55 | edits->copyErrorTo(errorCode); |
56 | } |
57 | } |
58 | return destIndex; |
59 | } |
60 | |
61 | /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ |
62 | inline int32_t |
63 | appendResult(char16_t *dest, int32_t destIndex, int32_t destCapacity, |
64 | int32_t result, const char16_t *s, |
65 | int32_t cpLength, uint32_t options, icu::Edits *edits) { |
66 | UChar32 c; |
67 | int32_t length; |
68 | |
69 | /* decode the result */ |
70 | if(result<0) { |
71 | /* (not) original code point */ |
72 | if(edits!=nullptr) { |
73 | edits->addUnchanged(cpLength); |
74 | } |
75 | if(options & U_OMIT_UNCHANGED_TEXT) { |
76 | return destIndex; |
77 | } |
78 | c=~result; |
79 | if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath |
80 | dest[destIndex++]=(char16_t)c; |
81 | return destIndex; |
82 | } |
83 | length=cpLength; |
84 | } else { |
85 | if(result<=UCASE_MAX_STRING_LENGTH) { |
86 | c=U_SENTINEL; |
87 | length=result; |
88 | } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath |
89 | dest[destIndex++]=(char16_t)result; |
90 | if(edits!=nullptr) { |
91 | edits->addReplace(cpLength, 1); |
92 | } |
93 | return destIndex; |
94 | } else { |
95 | c=result; |
96 | length=U16_LENGTH(c); |
97 | } |
98 | if(edits!=nullptr) { |
99 | edits->addReplace(cpLength, length); |
100 | } |
101 | } |
102 | if(length>(INT32_MAX-destIndex)) { |
103 | return -1; // integer overflow |
104 | } |
105 | |
106 | if(destIndex<destCapacity) { |
107 | /* append the result */ |
108 | if(c>=0) { |
109 | /* code point */ |
110 | UBool isError=false; |
111 | U16_APPEND(dest, destIndex, destCapacity, c, isError); |
112 | if(isError) { |
113 | /* overflow, nothing written */ |
114 | destIndex+=length; |
115 | } |
116 | } else { |
117 | /* string */ |
118 | if((destIndex+length)<=destCapacity) { |
119 | while(length>0) { |
120 | dest[destIndex++]=*s++; |
121 | --length; |
122 | } |
123 | } else { |
124 | /* overflow */ |
125 | destIndex+=length; |
126 | } |
127 | } |
128 | } else { |
129 | /* preflight */ |
130 | destIndex+=length; |
131 | } |
132 | return destIndex; |
133 | } |
134 | |
/**
 * Appends one UTF-16 code unit.
 * The unit is stored only if destIndex is below destCapacity;
 * otherwise only the index is advanced (preflighting).
 *
 * @return destIndex+1, or -1 if the index cannot be incremented without int32_t overflow
 */
inline int32_t
appendUChar(char16_t *dest, int32_t destIndex, int32_t destCapacity, char16_t c) {
    const bool fits = destIndex < destCapacity;
    if (!fits && destIndex == INT32_MAX) {
        return -1;  // integer overflow
    }
    if (fits) {
        dest[destIndex] = c;
    }
    return destIndex + 1;
}
144 | |
145 | int32_t |
146 | appendNonEmptyUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity, |
147 | const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) { |
148 | if(edits!=nullptr) { |
149 | edits->addUnchanged(length); |
150 | } |
151 | if(options & U_OMIT_UNCHANGED_TEXT) { |
152 | return destIndex; |
153 | } |
154 | if(length>(INT32_MAX-destIndex)) { |
155 | return -1; // integer overflow |
156 | } |
157 | if((destIndex+length)<=destCapacity) { |
158 | u_memcpy(dest+destIndex, s, length); |
159 | } |
160 | return destIndex + length; |
161 | } |
162 | |
163 | inline int32_t |
164 | appendUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity, |
165 | const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) { |
166 | if (length <= 0) { |
167 | return destIndex; |
168 | } |
169 | return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits); |
170 | } |
171 | |
172 | UChar32 U_CALLCONV |
173 | utf16_caseContextIterator(void *context, int8_t dir) { |
174 | UCaseContext *csc=(UCaseContext *)context; |
175 | UChar32 c; |
176 | |
177 | if(dir<0) { |
178 | /* reset for backward iteration */ |
179 | csc->index=csc->cpStart; |
180 | csc->dir=dir; |
181 | } else if(dir>0) { |
182 | /* reset for forward iteration */ |
183 | csc->index=csc->cpLimit; |
184 | csc->dir=dir; |
185 | } else { |
186 | /* continue current iteration direction */ |
187 | dir=csc->dir; |
188 | } |
189 | |
190 | if(dir<0) { |
191 | if(csc->start<csc->index) { |
192 | U16_PREV((const char16_t *)csc->p, csc->start, csc->index, c); |
193 | return c; |
194 | } |
195 | } else { |
196 | if(csc->index<csc->limit) { |
197 | U16_NEXT((const char16_t *)csc->p, csc->index, csc->limit, c); |
198 | return c; |
199 | } |
200 | } |
201 | return U_SENTINEL; |
202 | } |
203 | |
204 | /** |
205 | * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. |
206 | * caseLocale < 0: Case-folds [srcStart..srcLimit[. |
207 | */ |
208 | int32_t toLower(int32_t caseLocale, uint32_t options, |
209 | char16_t *dest, int32_t destCapacity, |
210 | const char16_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, |
211 | icu::Edits *edits, UErrorCode &errorCode) { |
212 | const int8_t *latinToLower; |
213 | if (caseLocale == UCASE_LOC_ROOT || |
214 | (caseLocale >= 0 ? |
215 | !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) : |
216 | (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) { |
217 | latinToLower = LatinCase::TO_LOWER_NORMAL; |
218 | } else { |
219 | latinToLower = LatinCase::TO_LOWER_TR_LT; |
220 | } |
221 | const UTrie2 *trie = ucase_getTrie(); |
222 | int32_t destIndex = 0; |
223 | int32_t prev = srcStart; |
224 | int32_t srcIndex = srcStart; |
225 | for (;;) { |
226 | // fast path for simple cases |
227 | char16_t lead = 0; |
228 | while (srcIndex < srcLimit) { |
229 | lead = src[srcIndex]; |
230 | int32_t delta; |
231 | if (lead < LatinCase::LONG_S) { |
232 | int8_t d = latinToLower[lead]; |
233 | if (d == LatinCase::EXC) { break; } |
234 | ++srcIndex; |
235 | if (d == 0) { continue; } |
236 | delta = d; |
237 | } else if (lead >= 0xd800) { |
238 | break; // surrogate or higher |
239 | } else { |
240 | uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead); |
241 | if (UCASE_HAS_EXCEPTION(props)) { break; } |
242 | ++srcIndex; |
243 | if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) { |
244 | continue; |
245 | } |
246 | } |
247 | lead += static_cast<char16_t>(delta); |
248 | destIndex = appendUnchanged(dest, destIndex, destCapacity, |
249 | src + prev, srcIndex - 1 - prev, options, edits); |
250 | if (destIndex >= 0) { |
251 | destIndex = appendUChar(dest, destIndex, destCapacity, lead); |
252 | if (edits != nullptr) { |
253 | edits->addReplace(1, 1); |
254 | } |
255 | } |
256 | if (destIndex < 0) { |
257 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
258 | return 0; |
259 | } |
260 | prev = srcIndex; |
261 | } |
262 | if (srcIndex >= srcLimit) { |
263 | break; |
264 | } |
265 | // slow path |
266 | int32_t cpStart = srcIndex++; |
267 | char16_t trail; |
268 | UChar32 c; |
269 | if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) { |
270 | c = U16_GET_SUPPLEMENTARY(lead, trail); |
271 | ++srcIndex; |
272 | } else { |
273 | c = lead; |
274 | } |
275 | const char16_t *s; |
276 | if (caseLocale >= 0) { |
277 | csc->cpStart = cpStart; |
278 | csc->cpLimit = srcIndex; |
279 | c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale); |
280 | } else { |
281 | c = ucase_toFullFolding(c, &s, options); |
282 | } |
283 | if (c >= 0) { |
284 | destIndex = appendUnchanged(dest, destIndex, destCapacity, |
285 | src + prev, cpStart - prev, options, edits); |
286 | if (destIndex >= 0) { |
287 | destIndex = appendResult(dest, destIndex, destCapacity, c, s, |
288 | srcIndex - cpStart, options, edits); |
289 | } |
290 | if (destIndex < 0) { |
291 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
292 | return 0; |
293 | } |
294 | prev = srcIndex; |
295 | } |
296 | } |
297 | destIndex = appendUnchanged(dest, destIndex, destCapacity, |
298 | src + prev, srcIndex - prev, options, edits); |
299 | if (destIndex < 0) { |
300 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
301 | return 0; |
302 | } |
303 | return destIndex; |
304 | } |
305 | |
306 | int32_t toUpper(int32_t caseLocale, uint32_t options, |
307 | char16_t *dest, int32_t destCapacity, |
308 | const char16_t *src, UCaseContext *csc, int32_t srcLength, |
309 | icu::Edits *edits, UErrorCode &errorCode) { |
310 | const int8_t *latinToUpper; |
311 | if (caseLocale == UCASE_LOC_TURKISH) { |
312 | latinToUpper = LatinCase::TO_UPPER_TR; |
313 | } else { |
314 | latinToUpper = LatinCase::TO_UPPER_NORMAL; |
315 | } |
316 | const UTrie2 *trie = ucase_getTrie(); |
317 | int32_t destIndex = 0; |
318 | int32_t prev = 0; |
319 | int32_t srcIndex = 0; |
320 | for (;;) { |
321 | // fast path for simple cases |
322 | char16_t lead = 0; |
323 | while (srcIndex < srcLength) { |
324 | lead = src[srcIndex]; |
325 | int32_t delta; |
326 | if (lead < LatinCase::LONG_S) { |
327 | int8_t d = latinToUpper[lead]; |
328 | if (d == LatinCase::EXC) { break; } |
329 | ++srcIndex; |
330 | if (d == 0) { continue; } |
331 | delta = d; |
332 | } else if (lead >= 0xd800) { |
333 | break; // surrogate or higher |
334 | } else { |
335 | uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead); |
336 | if (UCASE_HAS_EXCEPTION(props)) { break; } |
337 | ++srcIndex; |
338 | if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) { |
339 | continue; |
340 | } |
341 | } |
342 | lead += static_cast<char16_t>(delta); |
343 | destIndex = appendUnchanged(dest, destIndex, destCapacity, |
344 | src + prev, srcIndex - 1 - prev, options, edits); |
345 | if (destIndex >= 0) { |
346 | destIndex = appendUChar(dest, destIndex, destCapacity, lead); |
347 | if (edits != nullptr) { |
348 | edits->addReplace(1, 1); |
349 | } |
350 | } |
351 | if (destIndex < 0) { |
352 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
353 | return 0; |
354 | } |
355 | prev = srcIndex; |
356 | } |
357 | if (srcIndex >= srcLength) { |
358 | break; |
359 | } |
360 | // slow path |
361 | int32_t cpStart; |
362 | csc->cpStart = cpStart = srcIndex++; |
363 | char16_t trail; |
364 | UChar32 c; |
365 | if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) { |
366 | c = U16_GET_SUPPLEMENTARY(lead, trail); |
367 | ++srcIndex; |
368 | } else { |
369 | c = lead; |
370 | } |
371 | csc->cpLimit = srcIndex; |
372 | const char16_t *s; |
373 | c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale); |
374 | if (c >= 0) { |
375 | destIndex = appendUnchanged(dest, destIndex, destCapacity, |
376 | src + prev, cpStart - prev, options, edits); |
377 | if (destIndex >= 0) { |
378 | destIndex = appendResult(dest, destIndex, destCapacity, c, s, |
379 | srcIndex - cpStart, options, edits); |
380 | } |
381 | if (destIndex < 0) { |
382 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
383 | return 0; |
384 | } |
385 | prev = srcIndex; |
386 | } |
387 | } |
388 | destIndex = appendUnchanged(dest, destIndex, destCapacity, |
389 | src + prev, srcIndex - prev, options, edits); |
390 | if (destIndex < 0) { |
391 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
392 | return 0; |
393 | } |
394 | return destIndex; |
395 | } |
396 | |
397 | } // namespace |
398 | |
399 | U_NAMESPACE_END |
400 | |
401 | U_NAMESPACE_USE |
402 | |
403 | #if !UCONFIG_NO_BREAK_ITERATION |
404 | |
405 | namespace { |
406 | |
407 | /** |
408 | * Input: c is a letter I with or without acute accent. |
409 | * start is the index in src after c, and is less than segmentLimit. |
410 | * If a plain i/I is followed by a plain j/J, |
411 | * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, |
412 | * then we output accordingly. |
413 | * |
414 | * @return the src index after the titlecased sequence, or the start index if no Dutch IJ |
415 | */ |
416 | int32_t maybeTitleDutchIJ(const char16_t *src, UChar32 c, int32_t start, int32_t segmentLimit, |
417 | char16_t *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options, |
418 | icu::Edits *edits) { |
419 | U_ASSERT(start < segmentLimit); |
420 | |
421 | int32_t index = start; |
422 | bool withAcute = false; |
423 | |
424 | // If the conditions are met, then the following variables tell us what to output. |
425 | int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) |
426 | bool doTitleJ = false; // true if the j needs to be titlecased |
427 | int32_t unchanged2 = 0; // after the j (0 or 1) |
428 | |
429 | // next character after the first letter |
430 | char16_t c2 = src[index++]; |
431 | |
432 | // Is the first letter an i/I with accent? |
433 | if (c == u'I') { |
434 | if (c2 == ACUTE) { |
435 | withAcute = true; |
436 | unchanged1 = 1; |
437 | if (index == segmentLimit) { return start; } |
438 | c2 = src[index++]; |
439 | } |
440 | } else { // Í |
441 | withAcute = true; |
442 | } |
443 | |
444 | // Is the next character a j/J? |
445 | if (c2 == u'j') { |
446 | doTitleJ = true; |
447 | } else if (c2 == u'J') { |
448 | ++unchanged1; |
449 | } else { |
450 | return start; |
451 | } |
452 | |
453 | // A plain i/I must be followed by a plain j/J. |
454 | // An i/I with acute must be followed by a j/J with acute. |
455 | if (withAcute) { |
456 | if (index == segmentLimit || src[index++] != ACUTE) { return start; } |
457 | if (doTitleJ) { |
458 | unchanged2 = 1; |
459 | } else { |
460 | ++unchanged1; |
461 | } |
462 | } |
463 | |
464 | // There must not be another combining mark. |
465 | if (index < segmentLimit) { |
466 | int32_t cp; |
467 | int32_t i = index; |
468 | U16_NEXT(src, i, segmentLimit, cp); |
469 | uint32_t typeMask = U_GET_GC_MASK(cp); |
470 | if ((typeMask & U_GC_M_MASK) != 0) { |
471 | return start; |
472 | } |
473 | } |
474 | |
475 | // Output the rest of the Dutch IJ. |
476 | destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits); |
477 | start += unchanged1; |
478 | if (doTitleJ) { |
479 | destIndex = appendUChar(dest, destIndex, destCapacity, u'J'); |
480 | if (edits != nullptr) { |
481 | edits->addReplace(1, 1); |
482 | } |
483 | ++start; |
484 | } |
485 | destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits); |
486 | |
487 | U_ASSERT(start + unchanged2 == index); |
488 | return index; |
489 | } |
490 | |
491 | } // namespace |
492 | |
493 | U_CFUNC int32_t U_CALLCONV |
494 | ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, |
495 | char16_t *dest, int32_t destCapacity, |
496 | const char16_t *src, int32_t srcLength, |
497 | icu::Edits *edits, |
498 | UErrorCode &errorCode) { |
499 | if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { |
500 | return 0; |
501 | } |
502 | |
503 | /* set up local variables */ |
504 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
505 | csc.p=(void *)src; |
506 | csc.limit=srcLength; |
507 | int32_t destIndex=0; |
508 | int32_t prev=0; |
509 | bool isFirstIndex=true; |
510 | |
511 | /* titlecasing loop */ |
512 | while(prev<srcLength) { |
513 | /* find next index where to titlecase */ |
514 | int32_t index; |
515 | if(isFirstIndex) { |
516 | isFirstIndex=false; |
517 | index=iter->first(); |
518 | } else { |
519 | index=iter->next(); |
520 | } |
521 | if(index==UBRK_DONE || index>srcLength) { |
522 | index=srcLength; |
523 | } |
524 | |
525 | /* |
526 | * Segment [prev..index[ into 3 parts: |
527 | * a) skipped characters (copy as-is) [prev..titleStart[ |
528 | * b) first letter (titlecase) [titleStart..titleLimit[ |
529 | * c) subsequent characters (lowercase) [titleLimit..index[ |
530 | */ |
531 | if(prev<index) { |
532 | // Find and copy skipped characters [prev..titleStart[ |
533 | int32_t titleStart=prev; |
534 | int32_t titleLimit=prev; |
535 | UChar32 c; |
536 | U16_NEXT(src, titleLimit, index, c); |
537 | if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) { |
538 | // Adjust the titlecasing index to the next cased character, |
539 | // or to the next letter/number/symbol/private use. |
540 | // Stop with titleStart<titleLimit<=index |
541 | // if there is a character to be titlecased, |
542 | // or else stop with titleStart==titleLimit==index. |
543 | bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; |
544 | while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) { |
545 | titleStart=titleLimit; |
546 | if(titleLimit==index) { |
547 | break; |
548 | } |
549 | U16_NEXT(src, titleLimit, index, c); |
550 | } |
551 | if (prev < titleStart) { |
552 | destIndex=appendUnchanged(dest, destIndex, destCapacity, |
553 | src+prev, titleStart-prev, options, edits); |
554 | if(destIndex<0) { |
555 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
556 | return 0; |
557 | } |
558 | } |
559 | } |
560 | |
561 | if(titleStart<titleLimit) { |
562 | /* titlecase c which is from [titleStart..titleLimit[ */ |
563 | csc.cpStart=titleStart; |
564 | csc.cpLimit=titleLimit; |
565 | const char16_t *s; |
566 | c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale); |
567 | destIndex=appendResult(dest, destIndex, destCapacity, c, s, |
568 | titleLimit-titleStart, options, edits); |
569 | if(destIndex<0) { |
570 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
571 | return 0; |
572 | } |
573 | |
574 | /* Special case Dutch IJ titlecasing */ |
575 | if (titleStart+1 < index && |
576 | caseLocale == UCASE_LOC_DUTCH) { |
577 | if (c < 0) { |
578 | c = ~c; |
579 | } |
580 | |
581 | if (c == u'I' || c == u'Í') { |
582 | titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, |
583 | dest, destIndex, destCapacity, options, |
584 | edits); |
585 | } |
586 | } |
587 | |
588 | /* lowercase [titleLimit..index[ */ |
589 | if(titleLimit<index) { |
590 | if((options&U_TITLECASE_NO_LOWERCASE)==0) { |
591 | /* Normal operation: Lowercase the rest of the word. */ |
592 | destIndex+= |
593 | toLower( |
594 | caseLocale, options, |
595 | (dest==nullptr) ? nullptr: dest+destIndex, destCapacity-destIndex, |
596 | src, &csc, titleLimit, index, |
597 | edits, errorCode); |
598 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
599 | errorCode=U_ZERO_ERROR; |
600 | } |
601 | if(U_FAILURE(errorCode)) { |
602 | return destIndex; |
603 | } |
604 | } else { |
605 | /* Optionally just copy the rest of the word unchanged. */ |
606 | destIndex=appendUnchanged(dest, destIndex, destCapacity, |
607 | src+titleLimit, index-titleLimit, options, edits); |
608 | if(destIndex<0) { |
609 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
610 | return 0; |
611 | } |
612 | } |
613 | } |
614 | } |
615 | } |
616 | |
617 | prev=index; |
618 | } |
619 | |
620 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
621 | } |
622 | |
623 | #endif // !UCONFIG_NO_BREAK_ITERATION |
624 | |
625 | U_NAMESPACE_BEGIN |
626 | namespace GreekUpper { |
627 | |
628 | // Data generated by prototype code, see |
629 | // https://icu.unicode.org/design/case/greek-upper |
630 | // TODO: Move this data into ucase.icu. |
// Per-code point data for U+0370..U+03FF, indexed by (code point - 0x370);
// see getLetterData().
// An entry is either 0 or a value in the range 0x0370..0x03FF combined with HAS_* flags;
// the flags are defined outside of this file section.
// NOTE(review): the value bits look like the uppercase base letter with diacritics removed --
// confirm against the Greek uppercasing design document referenced near this table.
static const uint16_t data0370[] = {
    // U+0370..03FF
    // U+0370
    0x0370,
    0x0370,
    0x0372,
    0x0372,
    0,
    0,
    0x0376,
    0x0376,
    0,
    0,
    0x037A,
    0x03FD,
    0x03FE,
    0x03FF,
    0,
    0x037F,
    // U+0380
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+0390
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03A0
    0x03A0,
    0x03A1,
    0,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+03B0
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03C0
    0x03A0,
    0x03A1,
    0x03A3,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03CF,
    // U+03D0
    0x0392,
    0x0398,
    0x03D2,
    0x03D2 | HAS_ACCENT,
    0x03D2 | HAS_DIALYTIKA,
    0x03A6,
    0x03A0,
    0x03CF,
    0x03D8,
    0x03D8,
    0x03DA,
    0x03DA,
    0x03DC,
    0x03DC,
    0x03DE,
    0x03DE,
    // U+03E0
    0x03E0,
    0x03E0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    // U+03F0
    0x039A,
    0x03A1,
    0x03F9,
    0x037F,
    0x03F4,
    0x0395 | HAS_VOWEL,
    0,
    0x03F7,
    0x03F7,
    0x03F9,
    0x03FA,
    0x03FA,
    0x03FC,
    0x03FD,
    0x03FE,
    0x03FF,
};
778 | |
// Per-code point data for U+1F00..U+1FFF, indexed by (code point - 0x1f00);
// see getLetterData().
// An entry is either 0 or a value in the range 0x0391..0x03A9 combined with HAS_* flags;
// the flags are defined outside of this file section.
// NOTE(review): the value bits look like the uppercase base letter with diacritics removed --
// confirm against the Greek uppercasing design document referenced near this table.
static const uint16_t data1F00[] = {
    // U+1F00..1FFF
    // U+1F00
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    // U+1F10
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F20
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    // U+1F30
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+1F40
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F50
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    // U+1F60
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+1F70
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F80
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1F90
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FA0
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FB0
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0x0399 | HAS_VOWEL,
    0,
    // U+1FC0
    0,
    0,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
    // U+1FD0
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0,
    0,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0,
    0,
    // U+1FE0
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A1,
    0x03A1,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A1,
    0,
    0,
    0,
    // U+1FF0
    0,
    0,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
};
1038 | |
1039 | // U+2126 Ohm sign |
1040 | static const uint16_t data2126 = 0x03A9 | HAS_VOWEL; |
1041 | |
1042 | uint32_t getLetterData(UChar32 c) { |
1043 | if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) { |
1044 | return 0; |
1045 | } else if (c <= 0x3ff) { |
1046 | return data0370[c - 0x370]; |
1047 | } else if (c <= 0x1fff) { |
1048 | return data1F00[c - 0x1f00]; |
1049 | } else if (c == 0x2126) { |
1050 | return data2126; |
1051 | } else { |
1052 | return 0; |
1053 | } |
1054 | } |
1055 | |
1056 | uint32_t getDiacriticData(UChar32 c) { |
1057 | switch (c) { |
1058 | case 0x0300: // varia |
1059 | case 0x0301: // tonos = oxia |
1060 | case 0x0342: // perispomeni |
1061 | case 0x0302: // circumflex can look like perispomeni |
1062 | case 0x0303: // tilde can look like perispomeni |
1063 | case 0x0311: // inverted breve can look like perispomeni |
1064 | return HAS_ACCENT; |
1065 | case 0x0308: // dialytika = diaeresis |
1066 | return HAS_COMBINING_DIALYTIKA; |
1067 | case 0x0344: // dialytika tonos |
1068 | return HAS_COMBINING_DIALYTIKA | HAS_ACCENT; |
1069 | case 0x0345: // ypogegrammeni = iota subscript |
1070 | return HAS_YPOGEGRAMMENI; |
1071 | case 0x0304: // macron |
1072 | case 0x0306: // breve |
1073 | case 0x0313: // comma above |
1074 | case 0x0314: // reversed comma above |
1075 | case 0x0343: // koronis |
1076 | return HAS_OTHER_GREEK_DIACRITIC; |
1077 | default: |
1078 | return 0; |
1079 | } |
1080 | } |
1081 | |
1082 | UBool isFollowedByCasedLetter(const char16_t *s, int32_t i, int32_t length) { |
1083 | while (i < length) { |
1084 | UChar32 c; |
1085 | U16_NEXT(s, i, length, c); |
1086 | int32_t type = ucase_getTypeOrIgnorable(c); |
1087 | if ((type & UCASE_IGNORABLE) != 0) { |
1088 | // Case-ignorable, continue with the loop. |
1089 | } else if (type != UCASE_NONE) { |
1090 | return true; // Followed by cased letter. |
1091 | } else { |
1092 | return false; // Uncased and not case-ignorable. |
1093 | } |
1094 | } |
1095 | return false; // Not followed by cased letter. |
1096 | } |
1097 | |
/**
 * Greek string uppercasing with a state machine.
 * Probably simpler than a stateless function that has to figure out complex context-before
 * for each character.
 * TODO: Try to re-consolidate one way or another with the non-Greek function.
 *
 * Each loop iteration consumes one code point; if that code point has Greek letter data,
 * the combining diacritics recognized by getDiacriticData() that directly follow it
 * are consumed in the same iteration.
 *
 * @param options      Option bits; U_OMIT_UNCHANGED_TEXT is tested here, and the whole value
 *                     is forwarded to appendResult() for code points without Greek letter data.
 * @param dest         Output buffer, written via appendUChar()/appendResult().
 * @param destCapacity Capacity of dest.
 * @param src          Source text.
 * @param srcLength    Length of src; used directly as the loop limit.
 * @param edits        May be nullptr; otherwise receives addReplace()/addUnchanged() records.
 * @param errorCode    Set to U_INDEX_OUTOFBOUNDS_ERROR when an append helper returns
 *                     a negative index.
 * @return the destination index after the last append, or 0 after setting errorCode
 */
int32_t toUpper(uint32_t options,
                char16_t *dest, int32_t destCapacity,
                const char16_t *src, int32_t srcLength,
                Edits *edits,
                UErrorCode &errorCode) {
    int32_t destIndex=0;
    // Context flags carried over from the previous iteration (AFTER_CASED, AFTER_VOWEL_WITH_ACCENT).
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U16_NEXT(src, nextIndex, srcLength, c);
        // Flags for the next iteration; becomes "state" at the bottom of the loop.
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // Their flags are merged into data; each combining ypogegrammeni is counted.
            while (nextIndex < srcLength) {
                uint32_t diacriticData = getDiacriticData(src[nextIndex]);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    ++nextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            // Remember an accented vowel without dialytika for the next letter (see above).
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = false;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex was already advanced past c by U16_NEXT above,
                // so i == nextIndex looks unreachable and the tonos would always be
                // appended as U+0301 -- confirm the intended condition.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = true;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                // Clearing both dialytika flags prevents appending U+0308 below.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }

            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = true;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                // i2 walks the source in parallel with the units that would be written:
                // base letter, then optional U+0308, then optional U+0301.
                change = src[i] != upper || numYpogegrammeni > 0;
                int32_t i2 = i + 1;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= i2 >= nextIndex || src[i2] != 0x308;
                    ++i2;
                }
                if (addTonos) {
                    change |= i2 >= nextIndex || src[i2] != 0x301;
                    ++i2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni;
                change |= oldLength != newLength;
                if (change) {
                    if (edits != nullptr) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != nullptr) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }

            if (change) {
                // Output order: base letter, dialytika, tonos, then one capital iota
                // per ypogegrammeni. A negative destIndex stops further appends.
                destIndex=appendUChar(dest, destIndex, destCapacity, (char16_t)upper);
                if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
                }
                if (destIndex >= 0 && addTonos) {
                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
                }
                while (destIndex >= 0 && numYpogegrammeni > 0) {
                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
                    --numYpogegrammeni;
                }
                if(destIndex<0) {
                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                    return 0;
                }
            }
        } else {
            // No Greek letter data: use the regular full uppercase mapping for the Greek locale.
            const char16_t *s;
            c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
            destIndex = appendResult(dest, destIndex, destCapacity, c, s,
                                     nextIndex - i, options, edits);
            if (destIndex < 0) {
                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
        }
        i = nextIndex;
        state = nextState;
    }

    return destIndex;
}
1246 | |
1247 | } // namespace GreekUpper |
1248 | U_NAMESPACE_END |
1249 | |
1250 | /* functions available in the common library (for unistr_case.cpp) */ |
1251 | |
1252 | U_CFUNC int32_t U_CALLCONV |
1253 | ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
1254 | char16_t *dest, int32_t destCapacity, |
1255 | const char16_t *src, int32_t srcLength, |
1256 | icu::Edits *edits, |
1257 | UErrorCode &errorCode) { |
1258 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
1259 | csc.p=(void *)src; |
1260 | csc.limit=srcLength; |
1261 | int32_t destIndex = toLower( |
1262 | caseLocale, options, |
1263 | dest, destCapacity, |
1264 | src, &csc, 0, srcLength, |
1265 | edits, errorCode); |
1266 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
1267 | } |
1268 | |
1269 | U_CFUNC int32_t U_CALLCONV |
1270 | ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
1271 | char16_t *dest, int32_t destCapacity, |
1272 | const char16_t *src, int32_t srcLength, |
1273 | icu::Edits *edits, |
1274 | UErrorCode &errorCode) { |
1275 | int32_t destIndex; |
1276 | if (caseLocale == UCASE_LOC_GREEK) { |
1277 | destIndex = GreekUpper::toUpper(options, dest, destCapacity, |
1278 | src, srcLength, edits, errorCode); |
1279 | } else { |
1280 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
1281 | csc.p=(void *)src; |
1282 | csc.limit=srcLength; |
1283 | destIndex = toUpper( |
1284 | caseLocale, options, |
1285 | dest, destCapacity, |
1286 | src, &csc, srcLength, |
1287 | edits, errorCode); |
1288 | } |
1289 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
1290 | } |
1291 | |
1292 | U_CFUNC int32_t U_CALLCONV |
1293 | ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
1294 | char16_t *dest, int32_t destCapacity, |
1295 | const char16_t *src, int32_t srcLength, |
1296 | icu::Edits *edits, |
1297 | UErrorCode &errorCode) { |
1298 | int32_t destIndex = toLower( |
1299 | -1, options, |
1300 | dest, destCapacity, |
1301 | src, nullptr, 0, srcLength, |
1302 | edits, errorCode); |
1303 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
1304 | } |
1305 | |
1306 | U_CFUNC int32_t |
1307 | ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM |
1308 | char16_t *dest, int32_t destCapacity, |
1309 | const char16_t *src, int32_t srcLength, |
1310 | UStringCaseMapper *stringCaseMapper, |
1311 | icu::Edits *edits, |
1312 | UErrorCode &errorCode) { |
1313 | int32_t destLength; |
1314 | |
1315 | /* check argument values */ |
1316 | if(U_FAILURE(errorCode)) { |
1317 | return 0; |
1318 | } |
1319 | if( destCapacity<0 || |
1320 | (dest==nullptr && destCapacity>0) || |
1321 | src==nullptr || |
1322 | srcLength<-1 |
1323 | ) { |
1324 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1325 | return 0; |
1326 | } |
1327 | |
1328 | /* get the string length */ |
1329 | if(srcLength==-1) { |
1330 | srcLength=u_strlen(src); |
1331 | } |
1332 | |
1333 | /* check for overlapping source and destination */ |
1334 | if( dest!=nullptr && |
1335 | ((src>=dest && src<(dest+destCapacity)) || |
1336 | (dest>=src && dest<(src+srcLength))) |
1337 | ) { |
1338 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1339 | return 0; |
1340 | } |
1341 | |
1342 | if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
1343 | edits->reset(); |
1344 | } |
1345 | destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR |
1346 | dest, destCapacity, src, srcLength, edits, errorCode); |
1347 | return u_terminateUChars(dest, destCapacity, destLength, &errorCode); |
1348 | } |
1349 | |
1350 | U_CFUNC int32_t |
1351 | ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM |
1352 | char16_t *dest, int32_t destCapacity, |
1353 | const char16_t *src, int32_t srcLength, |
1354 | UStringCaseMapper *stringCaseMapper, |
1355 | UErrorCode &errorCode) { |
1356 | char16_t buffer[300]; |
1357 | char16_t *temp; |
1358 | |
1359 | int32_t destLength; |
1360 | |
1361 | /* check argument values */ |
1362 | if(U_FAILURE(errorCode)) { |
1363 | return 0; |
1364 | } |
1365 | if( destCapacity<0 || |
1366 | (dest==nullptr && destCapacity>0) || |
1367 | src==nullptr || |
1368 | srcLength<-1 |
1369 | ) { |
1370 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1371 | return 0; |
1372 | } |
1373 | |
1374 | /* get the string length */ |
1375 | if(srcLength==-1) { |
1376 | srcLength=u_strlen(src); |
1377 | } |
1378 | |
1379 | /* check for overlapping source and destination */ |
1380 | if( dest!=nullptr && |
1381 | ((src>=dest && src<(dest+destCapacity)) || |
1382 | (dest>=src && dest<(src+srcLength))) |
1383 | ) { |
1384 | /* overlap: provide a temporary destination buffer and later copy the result */ |
1385 | if(destCapacity<=UPRV_LENGTHOF(buffer)) { |
1386 | /* the stack buffer is large enough */ |
1387 | temp=buffer; |
1388 | } else { |
1389 | /* allocate a buffer */ |
1390 | temp=(char16_t *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); |
1391 | if(temp==nullptr) { |
1392 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
1393 | return 0; |
1394 | } |
1395 | } |
1396 | } else { |
1397 | temp=dest; |
1398 | } |
1399 | |
1400 | destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR |
1401 | temp, destCapacity, src, srcLength, nullptr, errorCode); |
1402 | if(temp!=dest) { |
1403 | /* copy the result string to the destination buffer */ |
1404 | if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) { |
1405 | u_memmove(dest, temp, destLength); |
1406 | } |
1407 | if(temp!=buffer) { |
1408 | uprv_free(temp); |
1409 | } |
1410 | } |
1411 | |
1412 | return u_terminateUChars(dest, destCapacity, destLength, &errorCode); |
1413 | } |
1414 | |
1415 | /* public API functions */ |
1416 | |
1417 | U_CAPI int32_t U_EXPORT2 |
1418 | u_strFoldCase(char16_t *dest, int32_t destCapacity, |
1419 | const char16_t *src, int32_t srcLength, |
1420 | uint32_t options, |
1421 | UErrorCode *pErrorCode) { |
1422 | return ustrcase_mapWithOverlap( |
1423 | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL |
1424 | dest, destCapacity, |
1425 | src, srcLength, |
1426 | ustrcase_internalFold, *pErrorCode); |
1427 | } |
1428 | |
1429 | U_NAMESPACE_BEGIN |
1430 | |
1431 | int32_t CaseMap::fold( |
1432 | uint32_t options, |
1433 | const char16_t *src, int32_t srcLength, |
1434 | char16_t *dest, int32_t destCapacity, Edits *edits, |
1435 | UErrorCode &errorCode) { |
1436 | return ustrcase_map( |
1437 | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL |
1438 | dest, destCapacity, |
1439 | src, srcLength, |
1440 | ustrcase_internalFold, edits, errorCode); |
1441 | } |
1442 | |
1443 | U_NAMESPACE_END |
1444 | |
1445 | /* case-insensitive string comparisons -------------------------------------- */ |
1446 | |
1447 | /* |
1448 | * This function is a copy of unorm_cmpEquivFold() minus the parts for |
1449 | * canonical equivalence. |
1450 | * Keep the functions in sync, and see there for how this works. |
1451 | * The duplication is for modularization: |
1452 | * It makes caseless (but not canonical caseless) matches independent of |
1453 | * the normalization code. |
1454 | */ |
1455 | |
/*
 * Saved iteration state of an outer level while a nested buffer is being read:
 * where that level starts, the current read position, and where it ends.
 */
struct CmpEquivLevel {
    const char16_t *start;
    const char16_t *s;
    const char16_t *limit;
};
typedef struct CmpEquivLevel CmpEquivLevel;
1461 | |
/**
 * Internal implementation code comparing string with case fold.
 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
 *
 * Both strings are read one code unit at a time. When two code units differ,
 * the code point at that position is case-folded (string 1 first, then string 2)
 * and reading continues from a small fold buffer; the original position is
 * saved on a one-element stack and restored when the fold buffer is exhausted.
 *
 * @param s1        input string 1
 * @param length1   length of string 1, or -1 (NUL terminated)
 * @param s2        input string 2
 * @param length2   length of string 2, or -1 (NUL terminated)
 * @param options   compare options
 * @param matchLen1 (output) length of partial prefix match in s1;
 *                  may be nullptr, in which case neither match length is written
 * @param matchLen2 (output) length of partial prefix match in s2;
 *                  asserted to be non-null whenever matchLen1 is non-null
 * @param pErrorCode receives error status; dereferenced without a null check
 * @return The result of comparison: 0 if both strings end together,
 *         -1 if string 1 ends first, 1 if string 2 ends first,
 *         otherwise the difference of the first differing (adjusted) code units.
 *         Returns 0 immediately if *pErrorCode already indicates a failure.
 */
static int32_t _cmpFold(
            const char16_t *s1, int32_t length1,
            const char16_t *s2, int32_t length2,
            uint32_t options,
            int32_t *matchLen1, int32_t *matchLen2,
            UErrorCode *pErrorCode) {
    int32_t cmpRes = 0;

    /* current-level start/limit - s1/s2 as current */
    const char16_t *start1, *start2, *limit1, *limit2;

    /* points to the original start address */
    const char16_t *org1, *org2;

    /* points to the end of match + 1 */
    const char16_t *m1, *m2;

    /* case folding variables */
    const char16_t *p;
    int32_t length;

    /* stacks of previous-level start/current/limit */
    CmpEquivLevel stack1[2], stack2[2];

    /* case folding buffers, only use current-level start/limit */
    char16_t fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];

    /* track which is the current level per string */
    int32_t level1, level2;

    /* current code units, and code points for lookups */
    UChar32 c1, c2, cp1, cp2;

    /* no argument error checking because this itself is not an API */

    /*
     * assume that at least the option U_COMPARE_IGNORE_CASE is set
     * otherwise this function would have to behave exactly as uprv_strCompare()
     */
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    if(matchLen1) {
        U_ASSERT(matchLen2 !=nullptr);
        *matchLen1=0;
        *matchLen2=0;
    }

    /* a nullptr limit marks a NUL-terminated string */
    start1=m1=org1=s1;
    if(length1==-1) {
        limit1=nullptr;
    } else {
        limit1=s1+length1;
    }

    start2=m2=org2=s2;
    if(length2==-1) {
        limit2=nullptr;
    } else {
        limit2=s2+length2;
    }

    level1=level2=0;
    c1=c2=-1;

    /* comparison loop */
    for(;;) {
        /*
         * here a code unit value of -1 means "get another code unit"
         * below it will mean "this source is finished"
         */

        if(c1<0) {
            /* get next code unit from string 1, post-increment */
            for(;;) {
                /*
                 * end of the current level: either the limit was reached, or a NUL
                 * was read and the string is NUL-terminated or _STRNCMP_STYLE is set
                 */
                if(s1==limit1 || ((c1=*s1)==0 && (limit1==nullptr || (options&_STRNCMP_STYLE)))) {
                    if(level1==0) {
                        c1=-1;
                        break;
                    }
                } else {
                    ++s1;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level1;
                    start1=stack1[level1].start;    /*Not uninitialized*/
                } while(start1==nullptr);
                s1=stack1[level1].s;                /*Not uninitialized*/
                limit1=stack1[level1].limit;        /*Not uninitialized*/
            }
        }

        if(c2<0) {
            /* get next code unit from string 2, post-increment */
            for(;;) {
                if(s2==limit2 || ((c2=*s2)==0 && (limit2==nullptr || (options&_STRNCMP_STYLE)))) {
                    if(level2==0) {
                        c2=-1;
                        break;
                    }
                } else {
                    ++s2;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level2;
                    start2=stack2[level2].start;    /*Not uninitialized*/
                } while(start2==nullptr);
                s2=stack2[level2].s;                /*Not uninitialized*/
                limit2=stack2[level2].limit;        /*Not uninitialized*/
            }
        }

        /*
         * compare c1 and c2
         * either variable c1, c2 is -1 only if the corresponding string is finished
         */
        if(c1==c2) {
            const char16_t *next1, *next2;

            if(c1<0) {
                cmpRes=0;   /* c1==c2==-1 indicating end of strings */
                break;
            }

            /*
             * Note: Move the match positions in both strings at the same time
             *       only when corresponding code point(s) in the original strings
             *       are fully consumed. For example, when comparing s1="Fust" and
             *       s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
             *       the first code point in the case-folded data. But the second "s"
             *       has no matching code point in s1, so this implementation returns
             *       2 as the prefix match length ("Fu").
             */
            next1=next2=nullptr;
            if(level1==0) {
                next1=s1;
            } else if(s1==limit1) {
                /* Note: This implementation only uses a single level of stack.
                 *       If this code needs to be changed to use multiple levels
                 *       of stacks, the code above should check if the current
                 *       code is at the end of all stacks.
                 */
                U_ASSERT(level1==1);

                /* is s1 at the end of the current stack? */
                next1=stack1[0].s;
            }

            /* advance the match positions only if both sides are at a code point boundary */
            if (next1!=nullptr) {
                if(level2==0) {
                    next2=s2;
                } else if(s2==limit2) {
                    U_ASSERT(level2==1);

                    /* is s2 at the end of the current stack? */
                    next2=stack2[0].s;
                }
                if(next2!=nullptr) {
                    m1=next1;
                    m2=next2;
                }
            }
            c1=c2=-1;       /* make us fetch new code units */
            continue;
        } else if(c1<0) {
            cmpRes=-1;      /* string 1 ends before string 2 */
            break;
        } else if(c2<0) {
            cmpRes=1;       /* string 2 ends before string 1 */
            break;
        }
        /* c1!=c2 && c1>=0 && c2>=0 */

        /* get complete code points for c1, c2 for lookups if either is a surrogate */
        cp1=c1;
        if(U_IS_SURROGATE(c1)) {
            char16_t c;

            if(U_IS_SURROGATE_LEAD(c1)) {
                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
                    /* advance ++s1; only below if cp1 decomposes/case-folds */
                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
                }
            } else /* isTrail(c1) */ {
                /* s1 already points past c1, so the preceding unit is at s1-2 */
                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
                }
            }
        }

        cp2=c2;
        if(U_IS_SURROGATE(c2)) {
            char16_t c;

            if(U_IS_SURROGATE_LEAD(c2)) {
                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
                    /* advance ++s2; only below if cp2 decomposes/case-folds */
                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
                }
            } else /* isTrail(c2) */ {
                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
                }
            }
        }

        /*
         * go down one level for each string
         * continue with the main loop as soon as there is a real change
         */

        if( level1==0 &&
            (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
        ) {
            /* cp1 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c1)) {
                if(U_IS_SURROGATE_LEAD(c1)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    --m2;
                    c2=*(s2-1);
                }
            }

            /* push current level pointers */
            stack1[0].start=start1;
            stack1[0].s=s1;
            stack1[0].limit=limit1;
            ++level1;

            /* copy the folding result to fold1[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold1, p, length);
            } else {
                /* "length" is a single code point here: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold1, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start1=s1=fold1;
            limit1=fold1+length;

            /* get ready to read from decomposition, continue with loop */
            c1=-1;
            continue;
        }

        if( level2==0 &&
            (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
        ) {
            /* cp2 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c2)) {
                if(U_IS_SURROGATE_LEAD(c2)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s1;
                    /*
                     * NOTE(review): this branch steps s1 back but decrements m2, exactly
                     * like the mirrored branch for string 1 above -- confirm whether
                     * m1 was intended here.
                     */
                    --m2;
                    c1=*(s1-1);
                }
            }

            /* push current level pointers */
            stack2[0].start=start2;
            stack2[0].s=s2;
            stack2[0].limit=limit2;
            ++level2;

            /* copy the folding result to fold2[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold2, p, length);
            } else {
                /* "length" is a single code point here: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold2, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start2=s2=fold2;
            limit2=fold2+length;

            /* get ready to read from decomposition, continue with loop */
            c2=-1;
            continue;
        }

        /*
         * no decomposition/case folding, max level for both sides:
         * return difference result
         *
         * code point order comparison must not just return cp1-cp2
         * because when single surrogates are present then the surrogate pairs
         * that formed cp1 and cp2 may be from different string indexes
         *
         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
         * c1=d800 cp1=10001 c2=dc00 cp2=10000
         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
         *
         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
         * so we have slightly different pointer/start/limit comparisons here
         */

        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
            if(
                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        cmpRes=c1-c2;
        break;
    }

    /* report how many original code units of each string were matched */
    if(matchLen1) {
        *matchLen1=static_cast<int32_t>(m1-org1);
        *matchLen2=static_cast<int32_t>(m2-org2);
    }
    return cmpRes;
}
1837 | |
1838 | /* internal function */ |
1839 | U_CFUNC int32_t |
1840 | u_strcmpFold(const char16_t *s1, int32_t length1, |
1841 | const char16_t *s2, int32_t length2, |
1842 | uint32_t options, |
1843 | UErrorCode *pErrorCode) { |
1844 | return _cmpFold(s1, length1, s2, length2, options, nullptr, nullptr, pErrorCode); |
1845 | } |
1846 | |
1847 | /* public API functions */ |
1848 | |
1849 | U_CAPI int32_t U_EXPORT2 |
1850 | u_strCaseCompare(const char16_t *s1, int32_t length1, |
1851 | const char16_t *s2, int32_t length2, |
1852 | uint32_t options, |
1853 | UErrorCode *pErrorCode) { |
1854 | /* argument checking */ |
1855 | if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { |
1856 | return 0; |
1857 | } |
1858 | if(s1==nullptr || length1<-1 || s2==nullptr || length2<-1) { |
1859 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1860 | return 0; |
1861 | } |
1862 | return u_strcmpFold(s1, length1, s2, length2, |
1863 | options|U_COMPARE_IGNORE_CASE, |
1864 | pErrorCode); |
1865 | } |
1866 | |
1867 | U_CAPI int32_t U_EXPORT2 |
1868 | u_strcasecmp(const char16_t *s1, const char16_t *s2, uint32_t options) { |
1869 | UErrorCode errorCode=U_ZERO_ERROR; |
1870 | return u_strcmpFold(s1, -1, s2, -1, |
1871 | options|U_COMPARE_IGNORE_CASE, |
1872 | &errorCode); |
1873 | } |
1874 | |
1875 | U_CAPI int32_t U_EXPORT2 |
1876 | u_memcasecmp(const char16_t *s1, const char16_t *s2, int32_t length, uint32_t options) { |
1877 | UErrorCode errorCode=U_ZERO_ERROR; |
1878 | return u_strcmpFold(s1, length, s2, length, |
1879 | options|U_COMPARE_IGNORE_CASE, |
1880 | &errorCode); |
1881 | } |
1882 | |
1883 | U_CAPI int32_t U_EXPORT2 |
1884 | u_strncasecmp(const char16_t *s1, const char16_t *s2, int32_t n, uint32_t options) { |
1885 | UErrorCode errorCode=U_ZERO_ERROR; |
1886 | return u_strcmpFold(s1, n, s2, n, |
1887 | options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), |
1888 | &errorCode); |
1889 | } |
1890 | |
1891 | /* internal API - detect length of shared prefix */ |
1892 | U_CAPI void |
1893 | u_caseInsensitivePrefixMatch(const char16_t *s1, int32_t length1, |
1894 | const char16_t *s2, int32_t length2, |
1895 | uint32_t options, |
1896 | int32_t *matchLen1, int32_t *matchLen2, |
1897 | UErrorCode *pErrorCode) { |
1898 | _cmpFold(s1, length1, s2, length2, options, |
1899 | matchLen1, matchLen2, pErrorCode); |
1900 | } |
1901 | |