1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2004-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: ucase.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug30
16* created by: Markus W. Scherer
17*
18* Low-level Unicode character/string case mapping code.
19* Much code moved here (and modified) from uchar.c.
20*/
21
22#include "unicode/utypes.h"
23#include "unicode/unistr.h"
24#include "unicode/uset.h"
25#include "unicode/utf16.h"
26#include "cmemory.h"
27#include "uassert.h"
28#include "ucase.h"
29#include "umutex.h"
30#include "utrie2.h"
31
32/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
33#define INCLUDED_FROM_UCASE_CPP
34#include "ucase_props_data.h"
35
36/* set of property starts for UnicodeSet ------------------------------------ */
37
38static UBool U_CALLCONV
39_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
40 /* add the start code point to the USet */
41 const USetAdder *sa=(const USetAdder *)context;
42 sa->add(sa->set, start);
43 return true;
44}
45
46U_CFUNC void U_EXPORT2
47ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
48 if(U_FAILURE(*pErrorCode)) {
49 return;
50 }
51
52 /* add the start code point of each same-value range of the trie */
53 utrie2_enum(&ucase_props_singleton.trie, nullptr, _enumPropertyStartsRange, sa);
54
55 /* add code points with hardcoded properties, plus the ones following them */
56
57 /* (none right now, see comment below) */
58
59 /*
60 * Omit code points with hardcoded specialcasing properties
61 * because we do not build property UnicodeSets for them right now.
62 */
63}
64
65/* data access primitives --------------------------------------------------- */
66
67U_CAPI const struct UCaseProps * U_EXPORT2
68ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
69 *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
70 *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
71 return &ucase_props_singleton;
72}
73
74U_CFUNC const UTrie2 * U_EXPORT2
75ucase_getTrie() {
76 return &ucase_props_singleton.trie;
77}
78
/*
 * Pointer into csp->exceptions for a properties word:
 * the element offset is props>>UCASE_EXC_SHIFT.
 * Callers in this file use it only after UCASE_HAS_EXCEPTION(props).
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
80
/*
 * Number of set bits (population count) of each 8-bit value 0..255.
 * SLOT_OFFSET() uses this table to count how many slot flags are set
 * below a given slot index.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
100
/* Nonzero if bit idx is set in flags, i.e. the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * Number of flag bits set below bit idx, looked up in flagsOffset[].
 * NOTE(review): the table has 256 entries, so this assumes idx<=8 — confirm
 * against the UCASE_EXC_* slot indexes.
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
103
/*
 * Get the value of an optional-value slot.
 * The caller must have checked HAS_SLOT(excWord, idx) beforehand;
 * this macro does not test for the slot itself.
 *
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, each slot is one uint16_t.
 * Otherwise each slot is two uint16_t, combined with the first one as the
 * upper 16 bits.
 *
 * @param excWord (in) initial exceptions word
 * @param idx     (in) desired slot index
 * @param pExc16  (in/out) const uint16_t * after excWord=*pExc16++;
 *                moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value   (out) int32_t or uint32_t that receives the slot value
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16); \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16)++; \
        (value)=((value)<<16)|*(pExc16); \
    } \
} UPRV_BLOCK_MACRO_END
123
124/* simple case mappings ----------------------------------------------------- */
125
126U_CAPI UChar32 U_EXPORT2
127ucase_tolower(UChar32 c) {
128 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
129 if(!UCASE_HAS_EXCEPTION(props)) {
130 if(UCASE_IS_UPPER_OR_TITLE(props)) {
131 c+=UCASE_GET_DELTA(props);
132 }
133 } else {
134 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
135 uint16_t excWord=*pe++;
136 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
137 int32_t delta;
138 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
139 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
140 }
141 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143 }
144 }
145 return c;
146}
147
148U_CAPI UChar32 U_EXPORT2
149ucase_toupper(UChar32 c) {
150 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
151 if(!UCASE_HAS_EXCEPTION(props)) {
152 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153 c+=UCASE_GET_DELTA(props);
154 }
155 } else {
156 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
157 uint16_t excWord=*pe++;
158 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
159 int32_t delta;
160 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
161 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
162 }
163 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
164 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
165 }
166 }
167 return c;
168}
169
170U_CAPI UChar32 U_EXPORT2
171ucase_totitle(UChar32 c) {
172 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
173 if(!UCASE_HAS_EXCEPTION(props)) {
174 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
175 c+=UCASE_GET_DELTA(props);
176 }
177 } else {
178 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
179 uint16_t excWord=*pe++;
180 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
181 int32_t delta;
182 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
183 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
184 }
185 int32_t idx;
186 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
187 idx=UCASE_EXC_TITLE;
188 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
189 idx=UCASE_EXC_UPPER;
190 } else {
191 return c;
192 }
193 GET_SLOT_VALUE(excWord, idx, pe, c);
194 }
195 return c;
196}
197
/*
 * Hardcoded UTF-16 strings: i, j or i-ogonek followed by U+0307
 * (combining dot above), optionally followed by a grave (U+0300),
 * acute (U+0301) or tilde (U+0303) accent.
 * iDot is used by ucase_addCaseClosure() for U+0130.
 */
static const char16_t iDot[2] = { 0x69, 0x307 };
static const char16_t jDot[2] = { 0x6a, 0x307 };
/* NOTE(review): declared with 3 elements but only 2 initializers (the third is zero) — confirm whether [2] was intended. */
static const char16_t iOgonekDot[3] = { 0x12f, 0x307 };
static const char16_t iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const char16_t iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
204
205
206U_CFUNC void U_EXPORT2
207ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
208 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
209 if(!UCASE_HAS_EXCEPTION(props)) {
210 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
211 /* add the one simple case mapping, no matter what type it is */
212 int32_t delta=UCASE_GET_DELTA(props);
213 if(delta!=0) {
214 sa->add(sa->set, c+delta);
215 }
216 }
217 } else {
218 /*
219 * c has exceptions, so there may be multiple simple and/or
220 * full case mappings. Add them all.
221 */
222 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
223 uint16_t excWord=*pe++;
224 const uint16_t *pe0=pe;
225
226 // Hardcode the case closure of i and its relatives and ignore the
227 // data file data for these characters.
228 // The Turkic dotless i and dotted I with their case mapping conditions
229 // and case folding option make the related characters behave specially.
230 // This code matches their closure behavior to their case folding behavior.
231 if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
232 // These characters have Turkic case foldings. Hardcode their closure.
233 if (c == 0x49) {
234 // Regular i and I are in one equivalence class.
235 sa->add(sa->set, 0x69);
236 return;
237 } else if (c == 0x130) {
238 // Dotted I is in a class with <0069 0307>
239 // (for canonical equivalence with <0049 0307>).
240 sa->addString(sa->set, iDot, 2);
241 return;
242 }
243 } else if (c == 0x69) {
244 sa->add(sa->set, 0x49);
245 return;
246 } else if (c == 0x131) {
247 // Dotless i is in a class by itself.
248 return;
249 }
250
251 /* add all simple case mappings */
252 for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
253 if(HAS_SLOT(excWord, idx)) {
254 pe=pe0;
255 UChar32 mapping;
256 GET_SLOT_VALUE(excWord, idx, pe, mapping);
257 sa->add(sa->set, mapping);
258 }
259 }
260 if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
261 pe=pe0;
262 int32_t delta;
263 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
264 sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
265 }
266
267 /* get the closure string pointer & length */
268 const char16_t *closure;
269 int32_t closureLength;
270 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
271 pe=pe0;
272 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
273 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
274 closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
275 } else {
276 closureLength=0;
277 closure=nullptr;
278 }
279
280 /* add the full case folding */
281 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
282 pe=pe0;
283 int32_t fullLength;
284 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
285
286 /* start of full case mapping strings */
287 ++pe;
288
289 fullLength&=0xffff; /* bits 16 and higher are reserved */
290
291 /* skip the lowercase result string */
292 pe+=fullLength&UCASE_FULL_LOWER;
293 fullLength>>=4;
294
295 /* add the full case folding string */
296 int32_t length=fullLength&0xf;
297 if(length!=0) {
298 sa->addString(sa->set, (const char16_t *)pe, length);
299 pe+=length;
300 }
301
302 /* skip the uppercase and titlecase strings */
303 fullLength>>=4;
304 pe+=fullLength&0xf;
305 fullLength>>=4;
306 pe+=fullLength;
307
308 closure=(const char16_t *)pe; /* behind full case mappings */
309 }
310
311 /* add each code point in the closure string */
312 for(int32_t idx=0; idx<closureLength;) {
313 UChar32 mapping;
314 U16_NEXT_UNSAFE(closure, idx, mapping);
315 sa->add(sa->set, mapping);
316 }
317 }
318}
319
320namespace {
321
322/**
323 * Add the simple case closure mapping,
324 * except if there is not actually an scf relationship between the two characters.
325 * TODO: Unicode should probably add the corresponding scf mappings.
326 * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
327 * If & when those scf mappings are added, we should be able to remove all of these exceptions.
328 */
329void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
330 switch (c) {
331 case 0x0390:
332 if (t == 0x1FD3) { return; }
333 break;
334 case 0x03B0:
335 if (t == 0x1FE3) { return; }
336 break;
337 case 0x1FD3:
338 if (t == 0x0390) { return; }
339 break;
340 case 0x1FE3:
341 if (t == 0x03B0) { return; }
342 break;
343 case 0xFB05:
344 if (t == 0xFB06) { return; }
345 break;
346 case 0xFB06:
347 if (t == 0xFB05) { return; }
348 break;
349 default:
350 break;
351 }
352 sa->add(sa->set, t);
353}
354
355} // namespace
356
357U_CFUNC void U_EXPORT2
358ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
359 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
360 if(!UCASE_HAS_EXCEPTION(props)) {
361 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
362 /* add the one simple case mapping, no matter what type it is */
363 int32_t delta=UCASE_GET_DELTA(props);
364 if(delta!=0) {
365 sa->add(sa->set, c+delta);
366 }
367 }
368 } else {
369 // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
370 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
371 uint16_t excWord=*pe++;
372 const uint16_t *pe0=pe;
373
374 // Hardcode the case closure of i and its relatives and ignore the
375 // data file data for these characters, like in ucase_addCaseClosure().
376 if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
377 // These characters have Turkic case foldings. Hardcode their closure.
378 if (c == 0x49) {
379 // Regular i and I are in one equivalence class.
380 sa->add(sa->set, 0x69);
381 return;
382 } else if (c == 0x130) {
383 // For scf=Simple_Case_Folding, dotted I is in a class by itself.
384 return;
385 }
386 } else if (c == 0x69) {
387 sa->add(sa->set, 0x49);
388 return;
389 } else if (c == 0x131) {
390 // Dotless i is in a class by itself.
391 return;
392 }
393
394 // Add all simple case mappings.
395 for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
396 if(HAS_SLOT(excWord, idx)) {
397 pe=pe0;
398 UChar32 mapping;
399 GET_SLOT_VALUE(excWord, idx, pe, mapping);
400 addOneSimpleCaseClosure(c, mapping, sa);
401 }
402 }
403 if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
404 pe=pe0;
405 int32_t delta;
406 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
407 UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
408 addOneSimpleCaseClosure(c, mapping, sa);
409 }
410
411 /* get the closure string pointer & length */
412 const char16_t *closure;
413 int32_t closureLength;
414 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
415 pe=pe0;
416 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
417 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
418 closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
419 } else {
420 closureLength=0;
421 closure=nullptr;
422 }
423
424 // Skip the full case mappings.
425 if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
426 pe=pe0;
427 int32_t fullLength;
428 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
429
430 /* start of full case mapping strings */
431 ++pe;
432
433 fullLength&=0xffff; /* bits 16 and higher are reserved */
434
435 // Skip all 4 full case mappings.
436 pe+=fullLength&UCASE_FULL_LOWER;
437 fullLength>>=4;
438 pe+=fullLength&0xf;
439 fullLength>>=4;
440 pe+=fullLength&0xf;
441 fullLength>>=4;
442 pe+=fullLength;
443
444 closure=(const char16_t *)pe; /* behind full case mappings */
445 }
446
447 // Add each code point in the closure string whose scf maps back to c.
448 for(int32_t idx=0; idx<closureLength;) {
449 UChar32 mapping;
450 U16_NEXT_UNSAFE(closure, idx, mapping);
451 addOneSimpleCaseClosure(c, mapping, sa);
452 }
453 }
454}
455
/*
 * Compare s, which has exactly `length` code units, with t, which holds at
 * most `max` code units and is NUL-terminated if it is shorter than that.
 * Requires length>0 and max>0 and length<=max.
 *
 * Returns 0 if the strings are equal, a positive value if s sorts after t
 * (including when t ends before s does), and a negative value if s sorts
 * before t (including when s is a proper prefix of t).
 */
static inline int32_t
strcmpMax(const char16_t *s, int32_t length, const char16_t *t, int32_t max) {
    /* length<=max is required, so this is how many more units t may have than s */
    const int32_t surplus=max-length;
    int32_t i=0;

    do {
        const int32_t sUnit=s[i];
        const int32_t tUnit=t[i];
        if(tUnit==0) {
            return 1; /* reached the end of t but not of s */
        }
        if(sUnit!=tUnit) {
            return sUnit-tUnit; /* difference of the first mismatching units */
        }
    } while(++i<length);

    /* all of s matched; the strings are equal if t ends here too */
    if(surplus==0 || t[i]==0) {
        return 0;
    }
    return -surplus; /* t is longer: report the length difference */
}
484
485U_CFUNC UBool U_EXPORT2
486ucase_addStringCaseClosure(const char16_t *s, int32_t length, const USetAdder *sa) {
487 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
488
489 if(ucase_props_singleton.unfold==nullptr || s==nullptr) {
490 return false; /* no reverse case folding data, or no string */
491 }
492 if(length<=1) {
493 /* the string is too short to find any match */
494 /*
495 * more precise would be:
496 * if(!u_strHasMoreChar32Than(s, length, 1))
497 * but this does not make much practical difference because
498 * a single supplementary code point would just not be found
499 */
500 return false;
501 }
502
503 const uint16_t *unfold=ucase_props_singleton.unfold;
504 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
505 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
506 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
507 unfold+=unfoldRowWidth;
508
509 if(length>unfoldStringWidth) {
510 /* the string is too long to find any match */
511 return false;
512 }
513
514 /* do a binary search for the string */
515 start=0;
516 limit=unfoldRows;
517 while(start<limit) {
518 i=(start+limit)/2;
519 const char16_t *p=reinterpret_cast<const char16_t *>(unfold+(i*unfoldRowWidth));
520 result=strcmpMax(s, length, p, unfoldStringWidth);
521
522 if(result==0) {
523 /* found the string: add each code point, and its case closure */
524 UChar32 c;
525
526 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
527 U16_NEXT_UNSAFE(p, i, c);
528 sa->add(sa->set, c);
529 ucase_addCaseClosure(c, sa);
530 }
531 return true;
532 } else if(result<0) {
533 limit=i;
534 } else /* result>0 */ {
535 start=i+1;
536 }
537 }
538
539 return false; /* string not found */
540}
541
542U_NAMESPACE_BEGIN
543
/**
 * Sets up iteration over the unfold table of the case properties singleton.
 *
 * The dimension members are read from the table's header row through the
 * `unfold` member, so `unfold` has to be initialized before them
 * (NOTE(review): this relies on `unfold` being declared first in the class —
 * confirm against the class declaration).
 * Unlike ucase_addStringCaseClosure(), this does not check the table
 * pointer for nullptr.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const char16_t *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    // Skip the header row so that row 0 is the first data row.
    unfold+=unfoldRowWidth;
}
553
554UChar32
555FullCaseFoldingIterator::next(UnicodeString &full) {
556 // Advance past the last-delivered code point.
557 const char16_t *p=unfold+(currentRow*unfoldRowWidth);
558 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
559 ++currentRow;
560 p+=unfoldRowWidth;
561 rowCpIndex=unfoldStringWidth;
562 }
563 if(currentRow>=unfoldRows) { return U_SENTINEL; }
564 // Set "full" to the NUL-terminated string in the first unfold column.
565 int32_t length=unfoldStringWidth;
566 while(length>0 && p[length-1]==0) { --length; }
567 full.setTo(false, p, length);
568 // Return the code point.
569 UChar32 c;
570 U16_NEXT_UNSAFE(p, rowCpIndex, c);
571 return c;
572}
573
574namespace LatinCase {
575
/*
 * Lowercasing table with LIMIT entries, laid out here in rows of 16.
 * NOTE(review): presumably indexed by code point, each entry being the delta
 * to add for lowercasing (e.g. 32 at index 0x41), with EXC marking code
 * points that need the regular lookup — confirm against the declaration
 * of LIMIT and EXC.
 */
const int8_t TO_LOWER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
607
/*
 * Variant of TO_LOWER_NORMAL with additional EXC entries
 * (at indexes 0x49, 0x4A, 0xCC, 0xCD, 0x128 and 0x12E).
 * NOTE(review): going by the name, this is the lowercasing table for
 * Turkic and Lithuanian locales — confirm against the callers.
 */
const int8_t TO_LOWER_TR_LT[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
639
/*
 * Uppercasing table with LIMIT entries, laid out here in rows of 16.
 * NOTE(review): presumably indexed by code point, each entry being the delta
 * to add for uppercasing (e.g. -32 at index 0x61), with EXC marking code
 * points that need the regular lookup — confirm against the declaration
 * of LIMIT and EXC.
 */
const int8_t TO_UPPER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
671
/*
 * Variant of TO_UPPER_NORMAL that differs in one entry:
 * index 0x69 is EXC instead of -32.
 * NOTE(review): going by the name, this is the uppercasing table for
 * Turkic locales — confirm against the callers.
 */
const int8_t TO_UPPER_TR[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
703
704} // namespace LatinCase
705
706U_NAMESPACE_END
707
708/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
709U_CAPI int32_t U_EXPORT2
710ucase_getType(UChar32 c) {
711 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
712 return UCASE_GET_TYPE(props);
713}
714
715/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
716U_CAPI int32_t U_EXPORT2
717ucase_getTypeOrIgnorable(UChar32 c) {
718 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
719 return UCASE_GET_TYPE_AND_IGNORABLE(props);
720}
721
722/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
723static inline int32_t
724getDotType(UChar32 c) {
725 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
726 if(!UCASE_HAS_EXCEPTION(props)) {
727 return props&UCASE_DOT_MASK;
728 } else {
729 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
730 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
731 }
732}
733
734U_CAPI UBool U_EXPORT2
735ucase_isSoftDotted(UChar32 c) {
736 return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
737}
738
739U_CAPI UBool U_EXPORT2
740ucase_isCaseSensitive(UChar32 c) {
741 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
742 if(!UCASE_HAS_EXCEPTION(props)) {
743 return (UBool)((props&UCASE_SENSITIVE)!=0);
744 } else {
745 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
746 return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
747 }
748}
749
750/* string casing ------------------------------------------------------------ */
751
752/*
753 * These internal functions form the core of string case mappings.
754 * They map single code points to result code points or strings and take
755 * all necessary conditions (context, locale ID, options) into account.
756 *
757 * They do not iterate over the source or write to the destination
758 * so that the same functions are useful for non-standard string storage,
759 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
760 * For the same reason, the "surrounding text" context is passed in as a
761 * UCaseContextIterator which does not make any assumptions about
762 * the underlying storage.
763 *
764 * This section contains helper functions that check for conditions
765 * in the input text surrounding the current code point
766 * according to SpecialCasing.txt.
767 *
768 * Each helper function gets the index
769 * - after the current code point if it looks at following text
770 * - before the current code point if it looks at preceding text
771 *
772 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
773 *
774 * Final_Sigma
775 * C is preceded by a sequence consisting of
776 * a cased letter and a case-ignorable sequence,
777 * and C is not followed by a sequence consisting of
778 * an ignorable sequence and then a cased letter.
779 *
780 * More_Above
781 * C is followed by one or more characters of combining class 230 (ABOVE)
782 * in the combining character sequence.
783 *
784 * After_Soft_Dotted
785 * The last preceding character with combining class of zero before C
786 * was Soft_Dotted,
787 * and there is no intervening combining character class 230 (ABOVE).
788 *
789 * Before_Dot
790 * C is followed by combining dot above (U+0307).
791 * Any sequence of characters with a combining class that is neither 0 nor 230
792 * may intervene between the current character and the combining dot above.
793 *
794 * The erratum from 2002-10-31 adds the condition
795 *
796 * After_I
797 * The last preceding base character was an uppercase I, and there is no
798 * intervening combining character class 230 (ABOVE).
799 *
800 * (See Jitterbug 2344 and the comments on After_I below.)
801 *
802 * Helper definitions in Unicode 3.2 UAX 21:
803 *
804 * D1. A character C is defined to be cased
805 * if it meets any of the following criteria:
806 *
807 * - The general category of C is Titlecase Letter (Lt)
808 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
809 * - Given D = NFD(C), then it is not the case that:
810 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
811 * (This third criterion does not add any characters to the list
812 * for Unicode 3.2. Ignored.)
813 *
814 * D2. A character C is defined to be case-ignorable
815 * if it meets either of the following criteria:
816 *
817 * - The general category of C is
818 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
819 * Letter Modifier (Lm), or Symbol Modifier (Sk)
820 * - C is one of the following characters
821 * U+0027 APOSTROPHE
822 * U+00AD SOFT HYPHEN (SHY)
823 * U+2019 RIGHT SINGLE QUOTATION MARK
824 * (the preferred character for apostrophe)
825 *
826 * D3. A case-ignorable sequence is a sequence of
827 * zero or more case-ignorable characters.
828 */
829
/*
 * Case-insensitive tests for single ASCII letters, used by
 * ucase_getCaseLocale() to recognize language codes.
 * Each macro evaluates its argument twice, so pass a plain variable.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_y(c) ((c)=='y' || (c)=='Y')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? true for '_', '-' and the terminating NUL; evaluates c up to three times */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
842
843/**
844 * Requires non-nullptr locale ID but otherwise does the equivalent of
845 * checking for language codes as if uloc_getLanguage() were called:
846 * Accepts both 2- and 3-letter codes and accepts case variants.
847 */
848U_CFUNC int32_t
849ucase_getCaseLocale(const char *locale) {
850 /*
851 * This function used to use uloc_getLanguage(), but the current code
852 * removes the dependency of this low-level code on uloc implementation code
853 * and is faster because not the whole locale ID has to be
854 * examined and copied/transformed.
855 *
856 * Because this code does not want to depend on uloc, the caller must
857 * pass in a non-nullptr locale, i.e., may need to call uloc_getDefault().
858 */
859 char c=*locale++;
860 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
861 // and for Chinese "zh": Very common but no special case mapping behavior.
862 // Then check lowercase vs. uppercase to reduce the number of comparisons
863 // for other locales without special behavior.
864 if(c=='e') {
865 /* el or ell? */
866 c=*locale++;
867 if(is_l(c)) {
868 c=*locale++;
869 if(is_l(c)) {
870 c=*locale;
871 }
872 if(is_sep(c)) {
873 return UCASE_LOC_GREEK;
874 }
875 }
876 // en, es, ... -> root
877 } else if(c=='z') {
878 return UCASE_LOC_ROOT;
879#if U_CHARSET_FAMILY==U_ASCII_FAMILY
880 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
881#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
882 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
883#else
884# error Unknown charset family!
885#endif
886 // lowercase c
887 if(c=='t') {
888 /* tr or tur? */
889 c=*locale++;
890 if(is_u(c)) {
891 c=*locale++;
892 }
893 if(is_r(c)) {
894 c=*locale;
895 if(is_sep(c)) {
896 return UCASE_LOC_TURKISH;
897 }
898 }
899 } else if(c=='a') {
900 /* az or aze? */
901 c=*locale++;
902 if(is_z(c)) {
903 c=*locale++;
904 if(is_e(c)) {
905 c=*locale;
906 }
907 if(is_sep(c)) {
908 return UCASE_LOC_TURKISH;
909 }
910 }
911 } else if(c=='l') {
912 /* lt or lit? */
913 c=*locale++;
914 if(is_i(c)) {
915 c=*locale++;
916 }
917 if(is_t(c)) {
918 c=*locale;
919 if(is_sep(c)) {
920 return UCASE_LOC_LITHUANIAN;
921 }
922 }
923 } else if(c=='n') {
924 /* nl or nld? */
925 c=*locale++;
926 if(is_l(c)) {
927 c=*locale++;
928 if(is_d(c)) {
929 c=*locale;
930 }
931 if(is_sep(c)) {
932 return UCASE_LOC_DUTCH;
933 }
934 }
935 } else if(c=='h') {
936 /* hy or hye? *not* hyw */
937 c=*locale++;
938 if(is_y(c)) {
939 c=*locale++;
940 if(is_e(c)) {
941 c=*locale;
942 }
943 if(is_sep(c)) {
944 return UCASE_LOC_ARMENIAN;
945 }
946 }
947 }
948 } else {
949 // uppercase c
950 // Same code as for lowercase c but also check for 'E'.
951 if(c=='T') {
952 /* tr or tur? */
953 c=*locale++;
954 if(is_u(c)) {
955 c=*locale++;
956 }
957 if(is_r(c)) {
958 c=*locale;
959 if(is_sep(c)) {
960 return UCASE_LOC_TURKISH;
961 }
962 }
963 } else if(c=='A') {
964 /* az or aze? */
965 c=*locale++;
966 if(is_z(c)) {
967 c=*locale++;
968 if(is_e(c)) {
969 c=*locale;
970 }
971 if(is_sep(c)) {
972 return UCASE_LOC_TURKISH;
973 }
974 }
975 } else if(c=='L') {
976 /* lt or lit? */
977 c=*locale++;
978 if(is_i(c)) {
979 c=*locale++;
980 }
981 if(is_t(c)) {
982 c=*locale;
983 if(is_sep(c)) {
984 return UCASE_LOC_LITHUANIAN;
985 }
986 }
987 } else if(c=='E') {
988 /* el or ell? */
989 c=*locale++;
990 if(is_l(c)) {
991 c=*locale++;
992 if(is_l(c)) {
993 c=*locale;
994 }
995 if(is_sep(c)) {
996 return UCASE_LOC_GREEK;
997 }
998 }
999 } else if(c=='N') {
1000 /* nl or nld? */
1001 c=*locale++;
1002 if(is_l(c)) {
1003 c=*locale++;
1004 if(is_d(c)) {
1005 c=*locale;
1006 }
1007 if(is_sep(c)) {
1008 return UCASE_LOC_DUTCH;
1009 }
1010 }
1011 } else if(c=='H') {
1012 /* hy or hye? *not* hyw */
1013 c=*locale++;
1014 if(is_y(c)) {
1015 c=*locale++;
1016 if(is_e(c)) {
1017 c=*locale;
1018 }
1019 if(is_sep(c)) {
1020 return UCASE_LOC_ARMENIAN;
1021 }
1022 }
1023 }
1024 }
1025 return UCASE_LOC_ROOT;
1026}
1027
1028/*
1029 * Is followed by
1030 * {case-ignorable}* cased
1031 * ?
1032 * (dir determines looking forward/backward)
1033 * If a character is case-ignorable, it is skipped regardless of whether
1034 * it is also cased or not.
1035 */
1036static UBool
1037isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
1038 UChar32 c;
1039
1040 if(iter==nullptr) {
1041 return false;
1042 }
1043
1044 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
1045 int32_t type=ucase_getTypeOrIgnorable(c);
1046 if(type&4) {
1047 /* case-ignorable, continue with the loop */
1048 } else if(type!=UCASE_NONE) {
1049 return true; /* followed by cased letter */
1050 } else {
1051 return false; /* uncased and not case-ignorable */
1052 }
1053 }
1054
1055 return false; /* not followed by cased letter */
1056}
1057
1058/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
1059static UBool
1060isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
1061 UChar32 c;
1062 int32_t dotType;
1063 int8_t dir;
1064
1065 if(iter==nullptr) {
1066 return false;
1067 }
1068
1069 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1070 dotType=getDotType(c);
1071 if(dotType==UCASE_SOFT_DOTTED) {
1072 return true; /* preceded by TYPE_i */
1073 } else if(dotType!=UCASE_OTHER_ACCENT) {
1074 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
1075 }
1076 }
1077
1078 return false; /* not preceded by TYPE_i */
1079}
1080
1081/*
1082 * See Jitterbug 2344:
1083 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
1084 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
1085 * we made those releases compatible with Unicode 3.2 which had not fixed
1086 * a related bug in SpecialCasing.txt.
1087 *
1088 * From the Jitterbug 2344 text:
1089 * ... this bug is listed as a Unicode erratum
1090 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
1091 * <quote>
1092 * There are two errors in SpecialCasing.txt.
1093 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
1094 * 2. An incorrect context definition. Correct as follows:
1095 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
1096 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
1097 * ---
1098 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1099 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1100 * where the context After_I is defined as:
1101 * The last preceding base character was an uppercase I, and there is no
1102 * intervening combining character class 230 (ABOVE).
1103 * </quote>
1104 *
1105 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
1106 *
1107 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1108 * # This matches the behavior of the canonically equivalent I-dot_above
1109 *
1110 * See also the description in this place in older versions of uchar.c (revision 1.100).
1111 *
1112 * Markus W. Scherer 2003-feb-15
1113 */
1114
1115/* Is preceded by base character 'I' with no intervening cc=230 ? */
1116static UBool
1117isPrecededBy_I(UCaseContextIterator *iter, void *context) {
1118 UChar32 c;
1119 int32_t dotType;
1120 int8_t dir;
1121
1122 if(iter==nullptr) {
1123 return false;
1124 }
1125
1126 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1127 if(c==0x49) {
1128 return true; /* preceded by I */
1129 }
1130 dotType=getDotType(c);
1131 if(dotType!=UCASE_OTHER_ACCENT) {
1132 return false; /* preceded by different base character (not I), or intervening cc==230 */
1133 }
1134 }
1135
1136 return false; /* not preceded by I */
1137}
1138
1139/* Is followed by one or more cc==230 ? */
1140static UBool
1141isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1142 UChar32 c;
1143 int32_t dotType;
1144 int8_t dir;
1145
1146 if(iter==nullptr) {
1147 return false;
1148 }
1149
1150 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1151 dotType=getDotType(c);
1152 if(dotType==UCASE_ABOVE) {
1153 return true; /* at least one cc==230 following */
1154 } else if(dotType!=UCASE_OTHER_ACCENT) {
1155 return false; /* next base character, no more cc==230 following */
1156 }
1157 }
1158
1159 return false; /* no more cc==230 following */
1160}
1161
1162/* Is followed by a dot above (without cc==230 in between) ? */
1163static UBool
1164isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1165 UChar32 c;
1166 int32_t dotType;
1167 int8_t dir;
1168
1169 if(iter==nullptr) {
1170 return false;
1171 }
1172
1173 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1174 if(c==0x307) {
1175 return true;
1176 }
1177 dotType=getDotType(c);
1178 if(dotType!=UCASE_OTHER_ACCENT) {
1179 return false; /* next base character or cc==230 in between */
1180 }
1181 }
1182
1183 return false; /* no dot above following */
1184}
1185
1186U_CAPI int32_t U_EXPORT2
1187ucase_toFullLower(UChar32 c,
1188 UCaseContextIterator *iter, void *context,
1189 const char16_t **pString,
1190 int32_t loc) {
1191 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1192 U_ASSERT(c >= 0);
1193 UChar32 result=c;
1194 // Reset the output pointer in case it was uninitialized.
1195 *pString=nullptr;
1196 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1197 if(!UCASE_HAS_EXCEPTION(props)) {
1198 if(UCASE_IS_UPPER_OR_TITLE(props)) {
1199 result=c+UCASE_GET_DELTA(props);
1200 }
1201 } else {
1202 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1203 uint16_t excWord=*pe++;
1204 int32_t full;
1205
1206 pe2=pe;
1207
1208 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1209 /* use hardcoded conditions and mappings */
1210
1211 /*
1212 * Test for conditional mappings first
1213 * (otherwise the unconditional default mappings are always taken),
1214 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1215 * then get the UnicodeData.txt mappings.
1216 */
1217 if( loc==UCASE_LOC_LITHUANIAN &&
1218 /* base characters, find accents above */
1219 (((c==0x49 || c==0x4a || c==0x12e) &&
1220 isFollowedByMoreAbove(iter, context)) ||
1221 /* precomposed with accent above, no need to find one */
1222 (c==0xcc || c==0xcd || c==0x128))
1223 ) {
1224 /*
1225 # Lithuanian
1226
1227 # Lithuanian retains the dot in a lowercase i when followed by accents.
1228
1229 # Introduce an explicit dot above when lowercasing capital I's and J's
1230 # whenever there are more accents above.
1231 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1232
1233 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1234 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1235 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1236 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1237 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1238 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1239 */
1240 switch(c) {
1241 case 0x49: /* LATIN CAPITAL LETTER I */
1242 *pString=iDot;
1243 return 2;
1244 case 0x4a: /* LATIN CAPITAL LETTER J */
1245 *pString=jDot;
1246 return 2;
1247 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1248 *pString=iOgonekDot;
1249 return 2;
1250 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1251 *pString=iDotGrave;
1252 return 3;
1253 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1254 *pString=iDotAcute;
1255 return 3;
1256 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1257 *pString=iDotTilde;
1258 return 3;
1259 default:
1260 return 0; /* will not occur */
1261 }
1262 /* # Turkish and Azeri */
1263 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1264 /*
1265 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1266 # The following rules handle those cases.
1267
1268 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1269 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1270 */
1271 return 0x69;
1272 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1273 /*
1274 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1275 # This matches the behavior of the canonically equivalent I-dot_above
1276
1277 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1278 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1279 */
1280 return 0; /* remove the dot (continue without output) */
1281 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1282 /*
1283 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1284
1285 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1286 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1287 */
1288 return 0x131;
1289 } else if(c==0x130) {
1290 /*
1291 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1292
1293 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1294 */
1295 *pString=iDot;
1296 return 2;
1297 } else if( c==0x3a3 &&
1298 !isFollowedByCasedLetter(iter, context, 1) &&
1299 isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1300 ) {
1301 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1302 /*
1303 # Special case for final form of sigma
1304
1305 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1306 */
1307 return 0x3c2; /* greek small final sigma */
1308 } else {
1309 /* no known conditional special case mapping, use a normal mapping */
1310 }
1311 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1312 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1313 full&=UCASE_FULL_LOWER;
1314 if(full!=0) {
1315 /* set the output pointer to the lowercase mapping */
1316 *pString=reinterpret_cast<const char16_t *>(pe+1);
1317
1318 /* return the string length */
1319 return full;
1320 }
1321 }
1322
1323 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1324 int32_t delta;
1325 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1326 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1327 }
1328 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1329 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1330 }
1331 }
1332
1333 return (result==c) ? ~result : result;
1334}
1335
1336/* internal */
1337static int32_t
1338toUpperOrTitle(UChar32 c,
1339 UCaseContextIterator *iter, void *context,
1340 const char16_t **pString,
1341 int32_t loc,
1342 UBool upperNotTitle) {
1343 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1344 U_ASSERT(c >= 0);
1345 UChar32 result=c;
1346 // Reset the output pointer in case it was uninitialized.
1347 *pString=nullptr;
1348 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1349 if(!UCASE_HAS_EXCEPTION(props)) {
1350 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1351 result=c+UCASE_GET_DELTA(props);
1352 }
1353 } else {
1354 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1355 uint16_t excWord=*pe++;
1356 int32_t full, idx;
1357
1358 pe2=pe;
1359
1360 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1361 /* use hardcoded conditions and mappings */
1362 if(loc==UCASE_LOC_TURKISH && c==0x69) {
1363 /*
1364 # Turkish and Azeri
1365
1366 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1367 # The following rules handle those cases.
1368
1369 # When uppercasing, i turns into a dotted capital I
1370
1371 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1372 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1373 */
1374 return 0x130;
1375 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1376 /*
1377 # Lithuanian
1378
1379 # Lithuanian retains the dot in a lowercase i when followed by accents.
1380
1381 # Remove DOT ABOVE after "i" with upper or titlecase
1382
1383 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1384 */
1385 return 0; /* remove the dot (continue without output) */
1386 } else if(c==0x0587) {
1387 // See ICU-13416:
1388 // և ligature ech-yiwn
1389 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1390 // but to ԵՎ=ech+vew in Eastern Armenian.
1391 if(loc==UCASE_LOC_ARMENIAN) {
1392 *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1393 } else {
1394 *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1395 }
1396 return 2;
1397 } else {
1398 /* no known conditional special case mapping, use a normal mapping */
1399 }
1400 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1401 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1402
1403 /* start of full case mapping strings */
1404 ++pe;
1405
1406 /* skip the lowercase and case-folding result strings */
1407 pe+=full&UCASE_FULL_LOWER;
1408 full>>=4;
1409 pe+=full&0xf;
1410 full>>=4;
1411
1412 if(upperNotTitle) {
1413 full&=0xf;
1414 } else {
1415 /* skip the uppercase result string */
1416 pe+=full&0xf;
1417 full=(full>>4)&0xf;
1418 }
1419
1420 if(full!=0) {
1421 /* set the output pointer to the result string */
1422 *pString=reinterpret_cast<const char16_t *>(pe);
1423
1424 /* return the string length */
1425 return full;
1426 }
1427 }
1428
1429 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1430 int32_t delta;
1431 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1432 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1433 }
1434 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1435 idx=UCASE_EXC_TITLE;
1436 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1437 /* here, titlecase is same as uppercase */
1438 idx=UCASE_EXC_UPPER;
1439 } else {
1440 return ~c;
1441 }
1442 GET_SLOT_VALUE(excWord, idx, pe2, result);
1443 }
1444
1445 return (result==c) ? ~result : result;
1446}
1447
1448U_CAPI int32_t U_EXPORT2
1449ucase_toFullUpper(UChar32 c,
1450 UCaseContextIterator *iter, void *context,
1451 const char16_t **pString,
1452 int32_t caseLocale) {
1453 return toUpperOrTitle(c, iter, context, pString, caseLocale, true);
1454}
1455
1456U_CAPI int32_t U_EXPORT2
1457ucase_toFullTitle(UChar32 c,
1458 UCaseContextIterator *iter, void *context,
1459 const char16_t **pString,
1460 int32_t caseLocale) {
1461 return toUpperOrTitle(c, iter, context, pString, caseLocale, false);
1462}
1463
1464/* case folding ------------------------------------------------------------- */
1465
1466/*
1467 * Case folding is similar to lowercasing.
1468 * The result may be a simple mapping, i.e., a single code point, or
1469 * a full mapping, i.e., a string.
1470 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1471 * then only the lowercase mapping is stored.
1472 *
1473 * Some special cases are hardcoded because their conditions cannot be
1474 * parsed and processed from CaseFolding.txt.
1475 *
1476 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1477
1478# C: common case folding, common mappings shared by both simple and full mappings.
1479# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1480# S: simple case folding, mappings to single characters where different from F.
1481# T: special case for uppercase I and dotted uppercase I
1482# - For non-Turkic languages, this mapping is normally not used.
1483# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1484#
1485# Usage:
1486# A. To do a simple case folding, use the mappings with status C + S.
1487# B. To do a full case folding, use the mappings with status C + F.
1488#
1489# The mappings with status T can be used or omitted depending on the desired case-folding
1490# behavior. (The default option is to exclude them.)
1491
1492 * Unicode 3.2 has 'T' mappings as follows:
1493
14940049; T; 0131; # LATIN CAPITAL LETTER I
14950130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1496
1497 * while the default mappings for these code points are:
1498
14990049; C; 0069; # LATIN CAPITAL LETTER I
15000130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1501
1502 * U+0130 has no simple case folding (simple-case-folds to itself).
1503 */
1504
1505/* return the simple case folding mapping for c */
1506U_CAPI UChar32 U_EXPORT2
1507ucase_fold(UChar32 c, uint32_t options) {
1508 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1509 if(!UCASE_HAS_EXCEPTION(props)) {
1510 if(UCASE_IS_UPPER_OR_TITLE(props)) {
1511 c+=UCASE_GET_DELTA(props);
1512 }
1513 } else {
1514 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1515 uint16_t excWord=*pe++;
1516 int32_t idx;
1517 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1518 /* special case folding mappings, hardcoded */
1519 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1520 /* default mappings */
1521 if(c==0x49) {
1522 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1523 return 0x69;
1524 } else if(c==0x130) {
1525 /* no simple case folding for U+0130 */
1526 return c;
1527 }
1528 } else {
1529 /* Turkic mappings */
1530 if(c==0x49) {
1531 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1532 return 0x131;
1533 } else if(c==0x130) {
1534 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1535 return 0x69;
1536 }
1537 }
1538 }
1539 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1540 return c;
1541 }
1542 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1543 int32_t delta;
1544 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1545 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1546 }
1547 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1548 idx=UCASE_EXC_FOLD;
1549 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1550 idx=UCASE_EXC_LOWER;
1551 } else {
1552 return c;
1553 }
1554 GET_SLOT_VALUE(excWord, idx, pe, c);
1555 }
1556 return c;
1557}
1558
1559/*
1560 * Issue for canonical caseless match (UAX #21):
1561 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1562 * canonical equivalence, unlike default-option casefolding.
1563 * For example, I-grave and I + grave fold to strings that are not canonically
1564 * equivalent.
1565 * For more details, see the comment in unorm_compare() in unorm.cpp
1566 * and the intermediate prototype changes for Jitterbug 2021.
1567 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1568 *
1569 * This did not get fixed because it appears that it is not possible to fix
1570 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1571 * together in a way that they still fold to common result strings.
1572 */
1573
1574U_CAPI int32_t U_EXPORT2
1575ucase_toFullFolding(UChar32 c,
1576 const char16_t **pString,
1577 uint32_t options) {
1578 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1579 U_ASSERT(c >= 0);
1580 UChar32 result=c;
1581 // Reset the output pointer in case it was uninitialized.
1582 *pString=nullptr;
1583 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1584 if(!UCASE_HAS_EXCEPTION(props)) {
1585 if(UCASE_IS_UPPER_OR_TITLE(props)) {
1586 result=c+UCASE_GET_DELTA(props);
1587 }
1588 } else {
1589 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1590 uint16_t excWord=*pe++;
1591 int32_t full, idx;
1592
1593 pe2=pe;
1594
1595 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1596 /* use hardcoded conditions and mappings */
1597 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1598 /* default mappings */
1599 if(c==0x49) {
1600 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1601 return 0x69;
1602 } else if(c==0x130) {
1603 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1604 *pString=iDot;
1605 return 2;
1606 }
1607 } else {
1608 /* Turkic mappings */
1609 if(c==0x49) {
1610 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1611 return 0x131;
1612 } else if(c==0x130) {
1613 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1614 return 0x69;
1615 }
1616 }
1617 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1618 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1619
1620 /* start of full case mapping strings */
1621 ++pe;
1622
1623 /* skip the lowercase result string */
1624 pe+=full&UCASE_FULL_LOWER;
1625 full=(full>>4)&0xf;
1626
1627 if(full!=0) {
1628 /* set the output pointer to the result string */
1629 *pString=reinterpret_cast<const char16_t *>(pe);
1630
1631 /* return the string length */
1632 return full;
1633 }
1634 }
1635
1636 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1637 return ~c;
1638 }
1639 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1640 int32_t delta;
1641 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1642 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1643 }
1644 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1645 idx=UCASE_EXC_FOLD;
1646 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1647 idx=UCASE_EXC_LOWER;
1648 } else {
1649 return ~c;
1650 }
1651 GET_SLOT_VALUE(excWord, idx, pe2, result);
1652 }
1653
1654 return (result==c) ? ~result : result;
1655}
1656
1657/* case mapping properties API ---------------------------------------------- */
1658
1659/* public API (see uchar.h) */
1660
1661U_CAPI UBool U_EXPORT2
1662u_isULowercase(UChar32 c) {
1663 return (UBool)(UCASE_LOWER==ucase_getType(c));
1664}
1665
1666U_CAPI UBool U_EXPORT2
1667u_isUUppercase(UChar32 c) {
1668 return (UBool)(UCASE_UPPER==ucase_getType(c));
1669}
1670
1671/* Transforms the Unicode character to its lower case equivalent.*/
1672U_CAPI UChar32 U_EXPORT2
1673u_tolower(UChar32 c) {
1674 return ucase_tolower(c);
1675}
1676
1677/* Transforms the Unicode character to its upper case equivalent.*/
1678U_CAPI UChar32 U_EXPORT2
1679u_toupper(UChar32 c) {
1680 return ucase_toupper(c);
1681}
1682
1683/* Transforms the Unicode character to its title case equivalent.*/
1684U_CAPI UChar32 U_EXPORT2
1685u_totitle(UChar32 c) {
1686 return ucase_totitle(c);
1687}
1688
1689/* return the simple case folding mapping for c */
1690U_CAPI UChar32 U_EXPORT2
1691u_foldCase(UChar32 c, uint32_t options) {
1692 return ucase_fold(c, options);
1693}
1694
1695U_CFUNC int32_t U_EXPORT2
1696ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1697 /* case mapping properties */
1698 const char16_t *resultString;
1699 switch(which) {
1700 case UCHAR_LOWERCASE:
1701 return (UBool)(UCASE_LOWER==ucase_getType(c));
1702 case UCHAR_UPPERCASE:
1703 return (UBool)(UCASE_UPPER==ucase_getType(c));
1704 case UCHAR_SOFT_DOTTED:
1705 return ucase_isSoftDotted(c);
1706 case UCHAR_CASE_SENSITIVE:
1707 return ucase_isCaseSensitive(c);
1708 case UCHAR_CASED:
1709 return (UBool)(UCASE_NONE!=ucase_getType(c));
1710 case UCHAR_CASE_IGNORABLE:
1711 return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1712 /*
1713 * Note: The following Changes_When_Xyz are defined as testing whether
1714 * the NFD form of the input changes when Xyz-case-mapped.
1715 * However, this simpler implementation of these properties,
1716 * ignoring NFD, passes the tests.
1717 * The implementation needs to be changed if the tests start failing.
1718 * When that happens, optimizations should be used to work with the
1719 * per-single-code point ucase_toFullXyz() functions unless
1720 * the NFD form has more than one code point,
1721 * and the property starts set needs to be the union of the
1722 * start sets for normalization and case mappings.
1723 */
1724 case UCHAR_CHANGES_WHEN_LOWERCASED:
1725 return (UBool)(ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1726 case UCHAR_CHANGES_WHEN_UPPERCASED:
1727 return (UBool)(ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1728 case UCHAR_CHANGES_WHEN_TITLECASED:
1729 return (UBool)(ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1730 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1731 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1732 return (UBool)(
1733 ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1734 ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1735 ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1736 default:
1737 return false;
1738 }
1739}
1740