1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2005-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: ucasemap.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2005may06
16* created by: Markus W. Scherer
17*
18* Case mapping service object and functions using it.
19*/
20
21#include "unicode/utypes.h"
22#include "unicode/brkiter.h"
23#include "unicode/bytestream.h"
24#include "unicode/casemap.h"
25#include "unicode/edits.h"
26#include "unicode/stringoptions.h"
27#include "unicode/stringpiece.h"
28#include "unicode/ubrk.h"
29#include "unicode/uloc.h"
30#include "unicode/ustring.h"
31#include "unicode/ucasemap.h"
32#if !UCONFIG_NO_BREAK_ITERATION
33#include "unicode/utext.h"
34#endif
35#include "unicode/utf.h"
36#include "unicode/utf8.h"
37#include "unicode/utf16.h"
38#include "bytesinkutil.h"
39#include "cmemory.h"
40#include "cstring.h"
41#include "uassert.h"
42#include "ucase.h"
43#include "ucasemap_imp.h"
44#include "ustr_imp.h"
45
46U_NAMESPACE_USE
47
48/* UCaseMap service object -------------------------------------------------- */
49
50UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51#if !UCONFIG_NO_BREAK_ITERATION
52 iter(nullptr),
53#endif
54 caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55 ucasemap_setLocale(this, localeID, pErrorCode);
56}
57
58UCaseMap::~UCaseMap() {
59#if !UCONFIG_NO_BREAK_ITERATION
60 delete iter;
61#endif
62}
63
64U_CAPI UCaseMap * U_EXPORT2
65ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66 if(U_FAILURE(*pErrorCode)) {
67 return nullptr;
68 }
69 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70 if(csm==nullptr) {
71 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72 return nullptr;
73 } else if (U_FAILURE(*pErrorCode)) {
74 delete csm;
75 return nullptr;
76 }
77 return csm;
78}
79
80U_CAPI void U_EXPORT2
81ucasemap_close(UCaseMap *csm) {
82 delete csm;
83}
84
85U_CAPI const char * U_EXPORT2
86ucasemap_getLocale(const UCaseMap *csm) {
87 return csm->locale;
88}
89
90U_CAPI uint32_t U_EXPORT2
91ucasemap_getOptions(const UCaseMap *csm) {
92 return csm->options;
93}
94
95U_CAPI void U_EXPORT2
96ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97 if(U_FAILURE(*pErrorCode)) {
98 return;
99 }
100 if (locale != nullptr && *locale == 0) {
101 csm->locale[0] = 0;
102 csm->caseLocale = UCASE_LOC_ROOT;
103 return;
104 }
105
106 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108 *pErrorCode=U_ZERO_ERROR;
109 /* we only really need the language code for case mappings */
110 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111 }
112 if(length==sizeof(csm->locale)) {
113 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114 }
115 if(U_SUCCESS(*pErrorCode)) {
116 csm->caseLocale = ucase_getCaseLocale(csm->locale);
117 } else {
118 csm->locale[0]=0;
119 csm->caseLocale = UCASE_LOC_ROOT;
120 }
121}
122
123U_CAPI void U_EXPORT2
124ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
125 if(U_FAILURE(*pErrorCode)) {
126 return;
127 }
128 csm->options=options;
129}
130
131/* UTF-8 string case mappings ----------------------------------------------- */
132
133/* TODO(markus): Move to a new, separate utf8case.cpp file. */
134
135namespace {
136
137/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
138inline UBool
139appendResult(int32_t cpLength, int32_t result, const char16_t *s,
140 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
141 U_ASSERT(U_SUCCESS(errorCode));
142
143 /* decode the result */
144 if(result<0) {
145 /* (not) original code point */
146 if(edits!=nullptr) {
147 edits->addUnchanged(cpLength);
148 }
149 if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
150 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
151 }
152 } else {
153 if(result<=UCASE_MAX_STRING_LENGTH) {
154 // string: "result" is the UTF-16 length
155 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
156 } else {
157 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
158 }
159 }
160 return true;
161}
162
163// See unicode/utf8.h U8_APPEND_UNSAFE().
164inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
165inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
166
167UChar32 U_CALLCONV
168utf8_caseContextIterator(void *context, int8_t dir) {
169 UCaseContext *csc=(UCaseContext *)context;
170 UChar32 c;
171
172 if(dir<0) {
173 /* reset for backward iteration */
174 csc->index=csc->cpStart;
175 csc->dir=dir;
176 } else if(dir>0) {
177 /* reset for forward iteration */
178 csc->index=csc->cpLimit;
179 csc->dir=dir;
180 } else {
181 /* continue current iteration direction */
182 dir=csc->dir;
183 }
184
185 if(dir<0) {
186 if(csc->start<csc->index) {
187 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
188 return c;
189 }
190 } else {
191 if(csc->index<csc->limit) {
192 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
193 return c;
194 }
195 }
196 return U_SENTINEL;
197}
198
/**
 * Lowercases or case-folds the UTF-8 text src[srcStart..srcLimit[ into sink.
 *
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
 *
 * Runs of unchanged bytes are not written one by one: prev marks the start of
 * the pending unchanged run, which is flushed via ByteSinkUtil::appendUnchanged()
 * right before each changed piece and once more at the end.
 *
 * @param caseLocale Case locale constant, or negative for case folding.
 * @param options    Option bits; passed to appendUnchanged()/appendResult() and,
 *                   for case folding, to ucase_toFullFolding().
 * @param src        UTF-8 text.
 * @param csc        Case context; only used when caseLocale >= 0.
 * @param srcStart   Index of the first byte to map.
 * @param srcLimit   Index after the last byte to map.
 * @param sink       Output sink.
 * @param edits      Optional edits recorder, may be nullptr.
 * @param errorCode  In/out ICU error code; the loop stops once it indicates a failure.
 */
void toLower(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Pick the Latin fast-path delta table:
    // the "normal" one for the root locale, for lowercasing in any locale other than
    // Turkish/Lithuanian, and for case folding with default folding options;
    // otherwise the Turkish/Lithuanian one.
    const int8_t *latinToLower;
    if (caseLocale == UCASE_LOC_ROOT ||
            (caseLocale >= 0 ?
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
        latinToLower = LatinCase::TO_LOWER_NORMAL;
    } else {
        latinToLower = LatinCase::TO_LOWER_TR_LT;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = srcStart;  // start of the pending run of unchanged bytes
    int32_t srcIndex = srcStart;
    for (;;) {
        // fast path for simple cases
        // The inner loop exits either with c<0 (end of input or error)
        // or with a code point c at [cpStart..srcIndex[ that needs the slow path.
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: the table holds the delta to add, 0 for no change,
                // or EXC to request the slow path.
                int8_t d = latinToLower[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // Two-byte sequence with lead 0xc2..0xc5 and a valid trail byte:
                // decode inline and use the same delta table.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToLower[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Other leads below 0xe3 fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLimit &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode one code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLimit, c);
            if (c < 0) {
                // ill-formed UTF-8
                // The bytes stay part of the pending unchanged run.
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            // Simple mapping: flush the unchanged run, then write c + delta.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        const char16_t *s;
        if (caseLocale >= 0) {
            // Tell the context iterator where the current code point is.
            csc->cpStart = cpStart;
            csc->cpLimit = srcIndex;
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
        } else {
            c = ucase_toFullFolding(c, &s, options);
        }
        if (c >= 0) {
            // Only a non-negative result is written here;
            // otherwise the code point stays in the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush the trailing unchanged run.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
313
/**
 * Uppercases the UTF-8 text src[0..srcLength[ into sink.
 *
 * Runs of unchanged bytes are not written one by one: prev marks the start of
 * the pending unchanged run, which is flushed via ByteSinkUtil::appendUnchanged()
 * right before each changed piece and once more at the end.
 *
 * @param caseLocale Case locale constant; Turkish selects a different Latin fast-path table.
 * @param options    Option bits passed to appendUnchanged()/appendResult().
 * @param src        UTF-8 text.
 * @param csc        Case context used by the slow path.
 * @param srcLength  Number of bytes in src.
 * @param sink       Output sink.
 * @param edits      Optional edits recorder, may be nullptr.
 * @param errorCode  In/out ICU error code; the loop stops once it indicates a failure.
 */
void toUpper(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    const int8_t *latinToUpper;
    if (caseLocale == UCASE_LOC_TURKISH) {
        latinToUpper = LatinCase::TO_UPPER_TR;
    } else {
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = 0;  // start of the pending run of unchanged bytes
    int32_t srcIndex = 0;
    for (;;) {
        // fast path for simple cases
        // The inner loop exits either with c<0 (end of input or error)
        // or with a code point c at [cpStart..srcIndex[ that needs the slow path.
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: the table holds the delta to add, 0 for no change,
                // or EXC to request the slow path.
                int8_t d = latinToUpper[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // Two-byte sequence with lead 0xc2..0xc5 and a valid trail byte:
                // decode inline and use the same delta table.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToUpper[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Other leads below 0xe3 fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLength &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode one code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLength, c);
            if (c < 0) {
                // ill-formed UTF-8
                // The bytes stay part of the pending unchanged run.
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            // Simple mapping: flush the unchanged run, then write c + delta.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        // Tell the context iterator where the current code point is.
        csc->cpStart = cpStart;
        csc->cpLimit = srcIndex;
        const char16_t *s;
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
        if (c >= 0) {
            // Only a non-negative result is written here;
            // otherwise the code point stays in the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush the trailing unchanged run.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
417
418} // namespace
419
420#if !UCONFIG_NO_BREAK_ITERATION
421
422namespace {
423
// The two UTF-8 bytes of U+0301 (combining acute accent),
// used for matching a decomposed acute byte by byte.
constexpr uint8_t ACUTE_BYTE0 = static_cast<uint8_t>(u8"\u0301"[0]);
constexpr uint8_t ACUTE_BYTE1 = static_cast<uint8_t>(u8"\u0301"[1]);
427
428/**
429 * Input: c is a letter I with or without acute accent.
430 * start is the index in src after c, and is less than segmentLimit.
431 * If a plain i/I is followed by a plain j/J,
432 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
433 * then we output accordingly.
434 *
435 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
436 */
437int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
438 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
439 U_ASSERT(start < segmentLimit);
440
441 int32_t index = start;
442 bool withAcute = false;
443
444 // If the conditions are met, then the following variables tell us what to output.
445 int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
446 bool doTitleJ = false; // true if the j needs to be titlecased
447 int32_t unchanged2 = 0; // after the j (0 or 1)
448
449 // next character after the first letter
450 UChar32 c2;
451 c2 = src[index++];
452
453 // Is the first letter an i/I with accent?
454 if (c == u'I') {
455 if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
456 withAcute = true;
457 unchanged1 = 2; // ACUTE is 2 code units in UTF-8
458 if (index == segmentLimit) { return start; }
459 c2 = src[index++];
460 }
461 } else { // Í
462 withAcute = true;
463 }
464
465 // Is the next character a j/J?
466 if (c2 == u'j') {
467 doTitleJ = true;
468 } else if (c2 == u'J') {
469 ++unchanged1;
470 } else {
471 return start;
472 }
473
474 // A plain i/I must be followed by a plain j/J.
475 // An i/I with acute must be followed by a j/J with acute.
476 if (withAcute) {
477 if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
478 return start;
479 }
480 if (doTitleJ) {
481 unchanged2 = 2; // ACUTE is 2 code units in UTF-8
482 } else {
483 unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8
484 }
485 }
486
487 // There must not be another combining mark.
488 if (index < segmentLimit) {
489 int32_t cp;
490 int32_t i = index;
491 U8_NEXT(src, i, segmentLimit, cp);
492 uint32_t typeMask = U_GET_GC_MASK(cp);
493 if ((typeMask & U_GC_M_MASK) != 0) {
494 return start;
495 }
496 }
497
498 // Output the rest of the Dutch IJ.
499 ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
500 start += unchanged1;
501 if (doTitleJ) {
502 ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
503 ++start;
504 }
505 ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
506
507 U_ASSERT(start + unchanged2 == index);
508 return index;
509}
510
511} // namespace
512
513U_CFUNC void U_CALLCONV
514ucasemap_internalUTF8ToTitle(
515 int32_t caseLocale, uint32_t options, BreakIterator *iter,
516 const uint8_t *src, int32_t srcLength,
517 ByteSink &sink, icu::Edits *edits,
518 UErrorCode &errorCode) {
519 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
520 return;
521 }
522
523 /* set up local variables */
524 UCaseContext csc=UCASECONTEXT_INITIALIZER;
525 csc.p=(void *)src;
526 csc.limit=srcLength;
527 int32_t prev=0;
528 UBool isFirstIndex=true;
529
530 /* titlecasing loop */
531 while(prev<srcLength) {
532 /* find next index where to titlecase */
533 int32_t index;
534 if(isFirstIndex) {
535 isFirstIndex=false;
536 index=iter->first();
537 } else {
538 index=iter->next();
539 }
540 if(index==UBRK_DONE || index>srcLength) {
541 index=srcLength;
542 }
543
544 /*
545 * Segment [prev..index[ into 3 parts:
546 * a) skipped characters (copy as-is) [prev..titleStart[
547 * b) first letter (titlecase) [titleStart..titleLimit[
548 * c) subsequent characters (lowercase) [titleLimit..index[
549 */
550 if(prev<index) {
551 /* find and copy skipped characters [prev..titleStart[ */
552 int32_t titleStart=prev;
553 int32_t titleLimit=prev;
554 UChar32 c;
555 U8_NEXT(src, titleLimit, index, c);
556 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
557 // Adjust the titlecasing index to the next cased character,
558 // or to the next letter/number/symbol/private use.
559 // Stop with titleStart<titleLimit<=index
560 // if there is a character to be titlecased,
561 // or else stop with titleStart==titleLimit==index.
562 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
563 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
564 titleStart=titleLimit;
565 if(titleLimit==index) {
566 break;
567 }
568 U8_NEXT(src, titleLimit, index, c);
569 }
570 if (prev < titleStart) {
571 if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
572 sink, options, edits, errorCode)) {
573 return;
574 }
575 }
576 }
577
578 if(titleStart<titleLimit) {
579 /* titlecase c which is from [titleStart..titleLimit[ */
580 if(c>=0) {
581 csc.cpStart=titleStart;
582 csc.cpLimit=titleLimit;
583 const char16_t *s;
584 c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
585 if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
586 return;
587 }
588 } else {
589 // Malformed UTF-8.
590 if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
591 sink, options, edits, errorCode)) {
592 return;
593 }
594 }
595
596 /* Special case Dutch IJ titlecasing */
597 if (titleLimit < index &&
598 caseLocale == UCASE_LOC_DUTCH) {
599 if (c < 0) {
600 c = ~c;
601 }
602
603 if (c == u'I' || c == u'Í') {
604 titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
605 }
606 }
607
608 /* lowercase [titleLimit..index[ */
609 if(titleLimit<index) {
610 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
611 /* Normal operation: Lowercase the rest of the word. */
612 toLower(caseLocale, options,
613 src, &csc, titleLimit, index,
614 sink, edits, errorCode);
615 if(U_FAILURE(errorCode)) {
616 return;
617 }
618 } else {
619 /* Optionally just copy the rest of the word unchanged. */
620 if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
621 sink, options, edits, errorCode)) {
622 return;
623 }
624 }
625 }
626 }
627 }
628
629 prev=index;
630 }
631}
632
633#endif
634
635U_NAMESPACE_BEGIN
636namespace GreekUpper {
637
638UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
639 while (i < length) {
640 UChar32 c;
641 U8_NEXT(s, i, length, c);
642 int32_t type = ucase_getTypeOrIgnorable(c);
643 if ((type & UCASE_IGNORABLE) != 0) {
644 // Case-ignorable, continue with the loop.
645 } else if (type != UCASE_NONE) {
646 return true; // Followed by cased letter.
647 } else {
648 return false; // Uncased and not case-ignorable.
649 }
650 }
651 return false; // Not followed by cased letter.
652}
653
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
/**
 * Uppercases the UTF-8 text src[0..srcLength[ with Greek-specific rules and writes it to sink.
 *
 * Each iteration handles one code point plus, for a Greek letter, the Greek
 * diacritics that directly follow it. For a Greek letter the output is the
 * uppercase base letter from getLetterData(), optionally followed by a
 * dialytika (U+0308), a tonos (U+0301), and one capital iota (U+0399) per
 * ypogegrammeni. Everything else goes through ucase_toFullUpper(), and
 * ill-formed UTF-8 is copied unchanged.
 *
 * @param options   Option bits; U_OMIT_UNCHANGED_TEXT suppresses unchanged output.
 * @param src       UTF-8 text.
 * @param srcLength Number of bytes in src.
 * @param sink      Output sink.
 * @param edits     Optional edits recorder, may be nullptr.
 * @param errorCode In/out ICU error code.
 */
void toUpper(uint32_t options,
             const uint8_t *src, int32_t srcLength,
             ByteSink &sink, Edits *edits,
             UErrorCode &errorCode) {
    // Flags carried over from the previous character (AFTER_CASED, AFTER_VOWEL_WITH_ACCENT).
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U8_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            // Letter with Greek-specific data.
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // nextNextIndex looks one code point ahead; nextIndex only advances
            // once that code point turned out to be a Greek diacritic.
            int32_t nextNextIndex = nextIndex;
            while (nextIndex < srcLength) {
                UChar32 c2;
                U8_NEXT(src, nextNextIndex, srcLength, c2);
                uint32_t diacriticData = getDiacriticData(c2);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    nextIndex = nextNextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = false;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex starts at i and U8_NEXT above has already
                // advanced it, so i == nextIndex looks like it can never be true here
                // and the tonos is always appended as U+0301 — confirm intent.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = true;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }

            // From here on, "change" decides whether output is written at all.
            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = true;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                // Compare the bytes that would be written with the source bytes
                // [i..nextIndex[; i2 tracks the end of the expected output
                // (excluding the trailing capital iotas).
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
                change = (i + 2) > nextIndex ||
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
                        numYpogegrammeni > 0;
                int32_t i2 = i + 2;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0308"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0308"[1];
                    i2 += 2;
                }
                if (addTonos) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0301"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0301"[1];
                    i2 += 2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
                change |= oldLength != newLength;
                if (change) {
                    if (edits != nullptr) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != nullptr) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }

            if (change) {
                ByteSinkUtil::appendTwoBytes(upper, sink);
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
                }
                if (addTonos) {
                    sink.AppendU8(u8"\u0301", 2);
                }
                while (numYpogegrammeni > 0) {
                    sink.AppendU8(u8"\u0399", 2);
                    --numYpogegrammeni;
                }
            }
        } else if(c>=0) {
            // No Greek-specific data: regular full uppercase mapping, without context.
            const char16_t *s;
            c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
                return;
            }
        } else {
            // Malformed UTF-8.
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
                                               sink, options, edits, errorCode)) {
                return;
            }
        }
        i = nextIndex;
        state = nextState;
    }
}
802
803} // namespace GreekUpper
804U_NAMESPACE_END
805
806static void U_CALLCONV
807ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
808 const uint8_t *src, int32_t srcLength,
809 icu::ByteSink &sink, icu::Edits *edits,
810 UErrorCode &errorCode) {
811 UCaseContext csc=UCASECONTEXT_INITIALIZER;
812 csc.p=(void *)src;
813 csc.limit=srcLength;
814 toLower(
815 caseLocale, options,
816 src, &csc, 0, srcLength,
817 sink, edits, errorCode);
818}
819
820static void U_CALLCONV
821ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
822 const uint8_t *src, int32_t srcLength,
823 icu::ByteSink &sink, icu::Edits *edits,
824 UErrorCode &errorCode) {
825 if (caseLocale == UCASE_LOC_GREEK) {
826 GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
827 } else {
828 UCaseContext csc=UCASECONTEXT_INITIALIZER;
829 csc.p=(void *)src;
830 csc.limit=srcLength;
831 toUpper(
832 caseLocale, options,
833 src, &csc, srcLength,
834 sink, edits, errorCode);
835 }
836}
837
838static void U_CALLCONV
839ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
840 const uint8_t *src, int32_t srcLength,
841 icu::ByteSink &sink, icu::Edits *edits,
842 UErrorCode &errorCode) {
843 toLower(
844 -1, options,
845 src, nullptr, 0, srcLength,
846 sink, edits, errorCode);
847}
848
849void
850ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
851 const char *src, int32_t srcLength,
852 UTF8CaseMapper *stringCaseMapper,
853 icu::ByteSink &sink, icu::Edits *edits,
854 UErrorCode &errorCode) {
855 /* check argument values */
856 if (U_FAILURE(errorCode)) {
857 return;
858 }
859 if ((src == nullptr && srcLength != 0) || srcLength < -1) {
860 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
861 return;
862 }
863
864 // Get the string length.
865 if (srcLength == -1) {
866 srcLength = (int32_t)uprv_strlen((const char *)src);
867 }
868
869 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
870 edits->reset();
871 }
872 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
873 (const uint8_t *)src, srcLength, sink, edits, errorCode);
874 sink.Flush();
875 if (U_SUCCESS(errorCode)) {
876 if (edits != nullptr) {
877 edits->copyErrorTo(errorCode);
878 }
879 }
880}
881
882int32_t
883ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
884 char *dest, int32_t destCapacity,
885 const char *src, int32_t srcLength,
886 UTF8CaseMapper *stringCaseMapper,
887 icu::Edits *edits,
888 UErrorCode &errorCode) {
889 /* check argument values */
890 if(U_FAILURE(errorCode)) {
891 return 0;
892 }
893 if( destCapacity<0 ||
894 (dest==nullptr && destCapacity>0) ||
895 (src==nullptr && srcLength!=0) || srcLength<-1
896 ) {
897 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
898 return 0;
899 }
900
901 /* get the string length */
902 if(srcLength==-1) {
903 srcLength=(int32_t)uprv_strlen((const char *)src);
904 }
905
906 /* check for overlapping source and destination */
907 if( dest!=nullptr &&
908 ((src>=dest && src<(dest+destCapacity)) ||
909 (dest>=src && dest<(src+srcLength)))
910 ) {
911 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
912 return 0;
913 }
914
915 CheckedArrayByteSink sink(dest, destCapacity);
916 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
917 edits->reset();
918 }
919 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
920 (const uint8_t *)src, srcLength, sink, edits, errorCode);
921 sink.Flush();
922 if (U_SUCCESS(errorCode)) {
923 if (sink.Overflowed()) {
924 errorCode = U_BUFFER_OVERFLOW_ERROR;
925 } else if (edits != nullptr) {
926 edits->copyErrorTo(errorCode);
927 }
928 }
929 return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
930}
931
932/* public API functions */
933
934U_CAPI int32_t U_EXPORT2
935ucasemap_utf8ToLower(const UCaseMap *csm,
936 char *dest, int32_t destCapacity,
937 const char *src, int32_t srcLength,
938 UErrorCode *pErrorCode) {
939 return ucasemap_mapUTF8(
940 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
941 dest, destCapacity,
942 src, srcLength,
943 ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
944}
945
946U_CAPI int32_t U_EXPORT2
947ucasemap_utf8ToUpper(const UCaseMap *csm,
948 char *dest, int32_t destCapacity,
949 const char *src, int32_t srcLength,
950 UErrorCode *pErrorCode) {
951 return ucasemap_mapUTF8(
952 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
953 dest, destCapacity,
954 src, srcLength,
955 ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
956}
957
958U_CAPI int32_t U_EXPORT2
959ucasemap_utf8FoldCase(const UCaseMap *csm,
960 char *dest, int32_t destCapacity,
961 const char *src, int32_t srcLength,
962 UErrorCode *pErrorCode) {
963 return ucasemap_mapUTF8(
964 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
965 dest, destCapacity,
966 src, srcLength,
967 ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
968}
969
970U_NAMESPACE_BEGIN
971
972void CaseMap::utf8ToLower(
973 const char *locale, uint32_t options,
974 StringPiece src, ByteSink &sink, Edits *edits,
975 UErrorCode &errorCode) {
976 ucasemap_mapUTF8(
977 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
978 src.data(), src.length(),
979 ucasemap_internalUTF8ToLower, sink, edits, errorCode);
980}
981
982void CaseMap::utf8ToUpper(
983 const char *locale, uint32_t options,
984 StringPiece src, ByteSink &sink, Edits *edits,
985 UErrorCode &errorCode) {
986 ucasemap_mapUTF8(
987 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
988 src.data(), src.length(),
989 ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
990}
991
992void CaseMap::utf8Fold(
993 uint32_t options,
994 StringPiece src, ByteSink &sink, Edits *edits,
995 UErrorCode &errorCode) {
996 ucasemap_mapUTF8(
997 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
998 src.data(), src.length(),
999 ucasemap_internalUTF8Fold, sink, edits, errorCode);
1000}
1001
1002int32_t CaseMap::utf8ToLower(
1003 const char *locale, uint32_t options,
1004 const char *src, int32_t srcLength,
1005 char *dest, int32_t destCapacity, Edits *edits,
1006 UErrorCode &errorCode) {
1007 return ucasemap_mapUTF8(
1008 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1009 dest, destCapacity,
1010 src, srcLength,
1011 ucasemap_internalUTF8ToLower, edits, errorCode);
1012}
1013
1014int32_t CaseMap::utf8ToUpper(
1015 const char *locale, uint32_t options,
1016 const char *src, int32_t srcLength,
1017 char *dest, int32_t destCapacity, Edits *edits,
1018 UErrorCode &errorCode) {
1019 return ucasemap_mapUTF8(
1020 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1021 dest, destCapacity,
1022 src, srcLength,
1023 ucasemap_internalUTF8ToUpper, edits, errorCode);
1024}
1025
1026int32_t CaseMap::utf8Fold(
1027 uint32_t options,
1028 const char *src, int32_t srcLength,
1029 char *dest, int32_t destCapacity, Edits *edits,
1030 UErrorCode &errorCode) {
1031 return ucasemap_mapUTF8(
1032 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1033 dest, destCapacity,
1034 src, srcLength,
1035 ucasemap_internalUTF8Fold, edits, errorCode);
1036}
1037
1038U_NAMESPACE_END
1039