ucase.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucase.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2004-2014, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: ucase.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2004aug30
16	* created by: Markus W. Scherer
17	*
18	* Low-level Unicode character/string case mapping code.
19	* Much code moved here (and modified) from uchar.c.
20	*/
21
22	#include "unicode/utypes.h"
23	#include "unicode/unistr.h"
24	#include "unicode/uset.h"
25	#include "unicode/udata.h" /* UDataInfo */
26	#include "unicode/utf16.h"
27	#include "ucmndata.h" /* DataHeader */
28	#include "udatamem.h"
29	#include "umutex.h"
30	#include "uassert.h"
31	#include "cmemory.h"
32	#include "utrie2.h"
33	#include "ucase.h"
34
35	struct UCaseProps {
36	UDataMemory *mem;
37	const int32_t *indexes;
38	const uint16_t *exceptions;
39	const uint16_t *unfold;
40
41	UTrie2 trie;
42	uint8_t formatVersion[`4`];
43	};
44
/* ucase_props_data.h is machine-generated by gencase --csource */
46	#define INCLUDED_FROM_UCASE_CPP
47	#include "ucase_props_data.h"
48
/* set of property starts for UnicodeSet ------------------------------------ */
50
51	static UBool U_CALLCONV
52	_enumPropertyStartsRange(const void context, UChar32 start, UChar32 /end/, uint32_t /value/*) {
53	/ add the start code point to the USet /
54	const USetAdder sa=(const* USetAdder *)context;
55	sa->add(sa->set, start);
56	return TRUE;
57	}
58
59	U_CFUNC void U_EXPORT2
60	ucase_addPropertyStarts(const USetAdder sa, UErrorCode pErrorCode) {
61	if(U_FAILURE(*pErrorCode)) {
62	return;
63	}
64
65	/ add the start code point of each same-value range of the trie /
66	utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
67
68	/ add code points with hardcoded properties, plus the ones following them /
69
70	/ (none right now, see comment below) /
71
72	/*
73	* Omit code points with hardcoded specialcasing properties
74	* because we do not build property UnicodeSets for them right now.
75	*/
76	}
77
/* data access primitives --------------------------------------------------- */
79
80	U_CFUNC const UTrie2 * U_EXPORT2
81	ucase_getTrie() {
82	return &ucase_props_singleton.trie;
83	}
84
/*
 * Pointer to the exception data for a properties word:
 * the exceptions array of csp, offset by the bits of props above UCASE_EXC_SHIFT.
 * Only meaningful when UCASE_HAS_EXCEPTION(props).
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
86
/*
 * number of bits in an 8-bit integer value
 * (flagsOffset[b] is the count of 1-bits in b; SLOT_OFFSET() uses it to count
 * the slots that precede a given slot index)
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
106
/* Non-zero if bit idx is set in flags, i.e. the exception word has slot idx. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of slots present below slot idx: the count of set bits in flags below bit idx. */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
109
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Without UCASE_EXC_DOUBLE_SLOTS each slot is one uint16_t; with it each slot
 * is two uint16_t units, combined with the first unit as the high 16 bits.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END
129
/* simple case mappings ----------------------------------------------------- */
131
132	U_CAPI UChar32 U_EXPORT2
133	ucase_tolower(UChar32 c) {
134	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
135	if(!UCASE_HAS_EXCEPTION(props)) {
136	if(UCASE_IS_UPPER_OR_TITLE(props)) {
137	c+=UCASE_GET_DELTA(props);
138	}
139	} else {
140	const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
141	uint16_t excWord=*pe++;
142	if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
143	int32_t delta;
144	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
145	return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta;
146	}
147	if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
148	GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
149	}
150	}
151	return c;
152	}
153
154	U_CAPI UChar32 U_EXPORT2
155	ucase_toupper(UChar32 c) {
156	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
157	if(!UCASE_HAS_EXCEPTION(props)) {
158	if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
159	c+=UCASE_GET_DELTA(props);
160	}
161	} else {
162	const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
163	uint16_t excWord=*pe++;
164	if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
165	int32_t delta;
166	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
167	return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta;
168	}
169	if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
170	GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
171	}
172	}
173	return c;
174	}
175
176	U_CAPI UChar32 U_EXPORT2
177	ucase_totitle(UChar32 c) {
178	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
179	if(!UCASE_HAS_EXCEPTION(props)) {
180	if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
181	c+=UCASE_GET_DELTA(props);
182	}
183	} else {
184	const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
185	uint16_t excWord=*pe++;
186	if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
187	int32_t delta;
188	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
189	return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta;
190	}
191	int32_t idx;
192	if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
193	idx=UCASE_EXC_TITLE;
194	} else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
195	idx=UCASE_EXC_UPPER;
196	} else {
197	return c;
198	}
199	GET_SLOT_VALUE(excWord, idx, pe, c);
200	}
201	return c;
202	}
203
204	static const UChar iDot[`2`] = { `0x69`, `0x307` };
205	static const UChar jDot[`2`] = { `0x6a`, `0x307` };
206	static const UChar iOgonekDot[`3`] = { `0x12f`, `0x307` };
207	static const UChar iDotGrave[`3`] = { `0x69`, `0x307`, `0x300` };
208	static const UChar iDotAcute[`3`] = { `0x69`, `0x307`, `0x301` };
209	static const UChar iDotTilde[`3`] = { `0x69`, `0x307`, `0x303` };
210
211
212	U_CFUNC void U_EXPORT2
213	ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
214	uint16_t props;
215
216	/*
217	* Hardcode the case closure of i and its relatives and ignore the
218	* data file data for these characters.
219	* The Turkic dotless i and dotted I with their case mapping conditions
220	* and case folding option make the related characters behave specially.
221	* This code matches their closure behavior to their case folding behavior.
222	*/
223
224	switch(c) {
225	case `0x49`:
226	/ regular i and I are in one equivalence class /
227	sa->add(sa->set, `0x69`);
228	return;
229	case `0x69`:
230	sa->add(sa->set, `0x49`);
231	return;
232	case `0x130`:
233	/ dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) /
234	sa->addString(sa->set, iDot, `2`);
235	return;
236	case `0x131`:
237	/ dotless i is in a class by itself /
238	return;
239	default:
240	/ otherwise use the data file data /
241	break;
242	}
243
244	props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
245	if(!UCASE_HAS_EXCEPTION(props)) {
246	if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
247	/ add the one simple case mapping, no matter what type it is /
248	int32_t delta=UCASE_GET_DELTA(props);
249	if(delta!=`0`) {
250	sa->add(sa->set, c+delta);
251	}
252	}
253	} else {
254	/*
255	* c has exceptions, so there may be multiple simple and/or
256	* full case mappings. Add them all.
257	*/
258	const uint16_t pe0, pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
259	const UChar *closure;
260	uint16_t excWord=*pe++;
261	int32_t idx, closureLength, fullLength, length;
262
263	pe0=pe;
264
265	/ add all simple case mappings /
266	for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
267	if(HAS_SLOT(excWord, idx)) {
268	pe=pe0;
269	GET_SLOT_VALUE(excWord, idx, pe, c);
270	sa->add(sa->set, c);
271	}
272	}
273	if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
274	pe=pe0;
275	int32_t delta;
276	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
277	sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta);
278	}
279
280	/ get the closure string pointer & length /
281	if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
282	pe=pe0;
283	GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
284	closureLength&=UCASE_CLOSURE_MAX_LENGTH; / higher bits are reserved /
285	closure=(const UChar )pe+`1`; /* behind this slot, unless there are full case mappings /
286	} else {
287	closureLength=`0`;
288	closure=NULL;
289	}
290
291	/ add the full case folding /
292	if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
293	pe=pe0;
294	GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
295
296	/ start of full case mapping strings /
297	++pe;
298
299	fullLength&=`0xffff`; / bits 16 and higher are reserved /
300
301	/ skip the lowercase result string /
302	pe+=fullLength&UCASE_FULL_LOWER;
303	fullLength>>=`4`;
304
305	/ add the full case folding string /
306	length=fullLength&`0xf`;
307	if(length!=`0`) {
308	sa->addString(sa->set, (const UChar *)pe, length);
309	pe+=length;
310	}
311
312	/ skip the uppercase and titlecase strings /
313	fullLength>>=`4`;
314	pe+=fullLength&`0xf`;
315	fullLength>>=`4`;
316	pe+=fullLength;
317
318	closure=(const UChar )pe; /* behind full case mappings /
319	}
320
321	/ add each code point in the closure string /
322	for(idx=`0`; idx<closureLength;) {
323	U16_NEXT_UNSAFE(closure, idx, c);
324	sa->add(sa->set, c);
325	}
326	}
327	}
328
329	/*
330	* compare s, which has a length, with t, which has a maximum length or is NUL-terminated
331	* must be length>0 and max>0 and length<=max
332	*/
333	static inline int32_t
334	strcmpMax(const UChar s, int32_t length, const* UChar *t, int32_t max) {
335	int32_t c1, c2;
336
337	max-=length; / we require length<=max, so no need to decrement max in the loop /
338	do {
339	c1=*s++;
340	c2=*t++;
341	if(c2==`0`) {
342	return `1`; / reached the end of t but not of s /
343	}
344	c1-=c2;
345	if(c1!=`0`) {
346	return c1; / return difference result /
347	}
348	} while(--length>`0`);
349	/ ends with length==0 /
350
351	if(max==`0` \|\| *t==`0`) {
352	return `0`; / equal to length of both strings /
353	} else {
354	return -max; / return lengh difference /
355	}
356	}
357
358	U_CFUNC UBool U_EXPORT2
359	ucase_addStringCaseClosure(const UChar s, int32_t length, const* USetAdder *sa) {
360	int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
361
362	if(ucase_props_singleton.unfold==NULL \|\| s==NULL) {
363	return FALSE; / no reverse case folding data, or no string /
364	}
365	if(length<=`1`) {
366	/ the string is too short to find any match /
367	/*
368	* more precise would be:
369	* if(!u_strHasMoreChar32Than(s, length, 1))
370	* but this does not make much practical difference because
371	* a single supplementary code point would just not be found
372	*/
373	return FALSE;
374	}
375
376	const uint16_t *unfold=ucase_props_singleton.unfold;
377	unfoldRows=unfold[UCASE_UNFOLD_ROWS];
378	unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
379	unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
380	unfold+=unfoldRowWidth;
381
382	if(length>unfoldStringWidth) {
383	/ the string is too long to find any match /
384	return FALSE;
385	}
386
387	/ do a binary search for the string /
388	start=`0`;
389	limit=unfoldRows;
390	while(start<limit) {
391	i=(start+limit)/`2`;
392	const UChar p=reinterpret_cast<const* UChar >(unfold+(iunfoldRowWidth));
393	result=strcmpMax(s, length, p, unfoldStringWidth);
394
395	if(result==`0`) {
396	/ found the string: add each code point, and its case closure /
397	UChar32 c;
398
399	for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=`0`;) {
400	U16_NEXT_UNSAFE(p, i, c);
401	sa->add(sa->set, c);
402	ucase_addCaseClosure(c, sa);
403	}
404	return TRUE;
405	} else if(result<`0`) {
406	limit=i;
407	} else / result>0 / {
408	start=i+`1`;
409	}
410	}
411
412	return FALSE; / string not found /
413	}
414
415	U_NAMESPACE_BEGIN
416
417	FullCaseFoldingIterator::FullCaseFoldingIterator()
418	: unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
419	unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
420	unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
421	unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
422	currentRow(`0`),
423	rowCpIndex(unfoldStringWidth) {
424	unfold+=unfoldRowWidth;
425	}
426
427	UChar32
428	FullCaseFoldingIterator::next(UnicodeString &full) {
429	// Advance past the last-delivered code point.
430	const UChar p=unfold+(currentRowunfoldRowWidth);
431	if(rowCpIndex>=unfoldRowWidth \|\| p[rowCpIndex]==`0`) {
432	++currentRow;
433	p+=unfoldRowWidth;
434	rowCpIndex=unfoldStringWidth;
435	}
436	if(currentRow>=unfoldRows) { return U_SENTINEL; }
437	// Set "full" to the NUL-terminated string in the first unfold column.
438	int32_t length=unfoldStringWidth;
439	while(length>`0` && p[length-`1`]==`0`) { --length; }
440	full.setTo(FALSE, p, length);
441	// Return the code point.
442	UChar32 c;
443	U16_NEXT_UNSAFE(p, rowCpIndex, c);
444	return c;
445	}
446
447	namespace LatinCase {
448
449	const int8_t TO_LOWER_NORMAL[LIMIT] = {
450	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
451	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
452	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
453	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
454
455	`0`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`,
456	`32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `0`, `0`, `0`, `0`, `0`,
457	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
458	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
459
460	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
461	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
462	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
463	`0`, `0`, `0`, `0`, `0`, EXC, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
464
465	`32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`,
466	`32`, `32`, `32`, `32`, `32`, `32`, `32`, `0`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, EXC,
467	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
468	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
469
470	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
471	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
472	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
473	EXC, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`,
474
475	`0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, EXC, `1`, `0`, `1`, `0`, `1`, `0`,
476	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
477	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
478	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, -`121`, `1`, `0`, `1`, `0`, `1`, `0`, EXC
479	};
480
481	const int8_t TO_LOWER_TR_LT[LIMIT] = {
482	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
483	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
484	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
485	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
486
487	`0`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, EXC, EXC, `32`, `32`, `32`, `32`, `32`,
488	`32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `0`, `0`, `0`, `0`, `0`,
489	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
490	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
491
492	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
493	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
494	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
495	`0`, `0`, `0`, `0`, `0`, EXC, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
496
497	`32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, EXC, EXC, `32`, `32`,
498	`32`, `32`, `32`, `32`, `32`, `32`, `32`, `0`, `32`, `32`, `32`, `32`, `32`, `32`, `32`, EXC,
499	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
500	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
501
502	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
503	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
504	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, EXC, `0`, `1`, `0`, `1`, `0`, EXC, `0`,
505	EXC, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`,
506
507	`0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, EXC, `1`, `0`, `1`, `0`, `1`, `0`,
508	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
509	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`,
510	`1`, `0`, `1`, `0`, `1`, `0`, `1`, `0`, -`121`, `1`, `0`, `1`, `0`, `1`, `0`, EXC
511	};
512
513	const int8_t TO_UPPER_NORMAL[LIMIT] = {
514	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
515	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
516	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
517	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
518
519	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
520	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
521	`0`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`,
522	-`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, `0`, `0`, `0`, `0`, `0`,
523
524	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
525	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
526	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
527	`0`, `0`, `0`, `0`, `0`, EXC, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
528
529	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
530	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, EXC,
531	-`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`,
532	-`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, `0`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, `121`,
533
534	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
535	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
536	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
537	`0`, EXC, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`,
538
539	-`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, EXC, `0`, -`1`, `0`, -`1`, `0`, -`1`,
540	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
541	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
542	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, `0`, -`1`, `0`, -`1`, `0`, -`1`, EXC
543	};
544
545	const int8_t TO_UPPER_TR[LIMIT] = {
546	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
547	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
548	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
549	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
550
551	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
552	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
553	`0`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, EXC, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`,
554	-`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, `0`, `0`, `0`, `0`, `0`,
555
556	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
557	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
558	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
559	`0`, `0`, `0`, `0`, `0`, EXC, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
560
561	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
562	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, EXC,
563	-`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`,
564	-`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, `0`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, -`32`, `121`,
565
566	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
567	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
568	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
569	`0`, EXC, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`,
570
571	-`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, EXC, `0`, -`1`, `0`, -`1`, `0`, -`1`,
572	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
573	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`,
574	`0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, -`1`, `0`, `0`, -`1`, `0`, -`1`, `0`, -`1`, EXC
575	};
576
577	} // namespace LatinCase
578
579	U_NAMESPACE_END
580
581	/* @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE /
582	U_CAPI int32_t U_EXPORT2
583	ucase_getType(UChar32 c) {
584	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
585	return UCASE_GET_TYPE(props);
586	}
587
588	/* @return same as ucase_getType() and set bit 2 if c is case-ignorable /
589	U_CAPI int32_t U_EXPORT2
590	ucase_getTypeOrIgnorable(UChar32 c) {
591	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
592	return UCASE_GET_TYPE_AND_IGNORABLE(props);
593	}
594
595	/* @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT /
596	static inline int32_t
597	getDotType(UChar32 c) {
598	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
599	if(!UCASE_HAS_EXCEPTION(props)) {
600	return props&UCASE_DOT_MASK;
601	} else {
602	const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
603	return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
604	}
605	}
606
607	U_CAPI UBool U_EXPORT2
608	ucase_isSoftDotted(UChar32 c) {
609	return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
610	}
611
612	U_CAPI UBool U_EXPORT2
613	ucase_isCaseSensitive(UChar32 c) {
614	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
615	if(!UCASE_HAS_EXCEPTION(props)) {
616	return (UBool)((props&UCASE_SENSITIVE)!=`0`);
617	} else {
618	const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
619	return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=`0`);
620	}
621	}
622
/* string casing ------------------------------------------------------------ */
624
625	/*
626	* These internal functions form the core of string case mappings.
627	* They map single code points to result code points or strings and take
628	* all necessary conditions (context, locale ID, options) into account.
629	*
630	* They do not iterate over the source or write to the destination
631	* so that the same functions are useful for non-standard string storage,
632	* such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
633	* For the same reason, the "surrounding text" context is passed in as a
634	* UCaseContextIterator which does not make any assumptions about
635	* the underlying storage.
636	*
637	* This section contains helper functions that check for conditions
638	* in the input text surrounding the current code point
639	* according to SpecialCasing.txt.
640	*
641	* Each helper function gets the index
642	* - after the current code point if it looks at following text
643	* - before the current code point if it looks at preceding text
644	*
645	* Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
646	*
647	* Final_Sigma
648	* C is preceded by a sequence consisting of
649	* a cased letter and a case-ignorable sequence,
650	* and C is not followed by a sequence consisting of
651	* an ignorable sequence and then a cased letter.
652	*
653	* More_Above
654	* C is followed by one or more characters of combining class 230 (ABOVE)
655	* in the combining character sequence.
656	*
657	* After_Soft_Dotted
658	* The last preceding character with combining class of zero before C
659	* was Soft_Dotted,
660	* and there is no intervening combining character class 230 (ABOVE).
661	*
662	* Before_Dot
663	* C is followed by combining dot above (U+0307).
664	* Any sequence of characters with a combining class that is neither 0 nor 230
665	* may intervene between the current character and the combining dot above.
666	*
667	* The erratum from 2002-10-31 adds the condition
668	*
669	* After_I
670	* The last preceding base character was an uppercase I, and there is no
671	* intervening combining character class 230 (ABOVE).
672	*
673	* (See Jitterbug 2344 and the comments on After_I below.)
674	*
675	* Helper definitions in Unicode 3.2 UAX 21:
676	*
677	* D1. A character C is defined to be cased
678	* if it meets any of the following criteria:
679	*
680	* - The general category of C is Titlecase Letter (Lt)
681	* - In [CoreProps], C has one of the properties Uppercase, or Lowercase
682	* - Given D = NFD(C), then it is not the case that:
683	* D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
684	* (This third criterium does not add any characters to the list
685	* for Unicode 3.2. Ignored.)
686	*
687	* D2. A character C is defined to be case-ignorable
688	* if it meets either of the following criteria:
689	*
690	* - The general category of C is
691	* Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
692	* Letter Modifier (Lm), or Symbol Modifier (Sk)
693	* - C is one of the following characters
694	* U+0027 APOSTROPHE
695	* U+00AD SOFT HYPHEN (SHY)
696	* U+2019 RIGHT SINGLE QUOTATION MARK
697	* (the preferred character for apostrophe)
698	*
699	* D3. A case-ignorable sequence is a sequence of
700	* zero or more case-ignorable characters.
701	*/
702
/*
 * Case-insensitive tests for single letters of a locale ID.
 * Each macro evaluates its argument twice, so pass a side-effect-free expression.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? (underscore, hyphen, or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
714
715	/**
716	* Requires non-NULL locale ID but otherwise does the equivalent of
717	* checking for language codes as if uloc_getLanguage() were called:
718	* Accepts both 2- and 3-letter codes and accepts case variants.
719	*/
720	U_CFUNC int32_t
721	ucase_getCaseLocale(const char *locale) {
722	/*
723	* This function used to use uloc_getLanguage(), but the current code
724	* removes the dependency of this low-level code on uloc implementation code
725	* and is faster because not the whole locale ID has to be
726	* examined and copied/transformed.
727	*
728	* Because this code does not want to depend on uloc, the caller must
729	* pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
730	*/
731	char c=*locale++;
732	// Fastpath for English "en" which is often used for default (=root locale) case mappings,
733	// and for Chinese "zh": Very common but no special case mapping behavior.
734	// Then check lowercase vs. uppercase to reduce the number of comparisons
735	// for other locales without special behavior.
736	if(c==`'e'`) {
737	/ el or ell? /
738	c=*locale++;
739	if(is_l(c)) {
740	c=*locale++;
741	if(is_l(c)) {
742	c=*locale;
743	}
744	if(is_sep(c)) {
745	return UCASE_LOC_GREEK;
746	}
747	}
748	// en, es, ... -> root
749	} else if(c==`'z'`) {
750	return UCASE_LOC_ROOT;
751	#if U_CHARSET_FAMILY==U_ASCII_FAMILY
752	} else if(c>=`'a'`) { // ASCII a-z = 0x61..0x7a, after A-Z
753	#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
754	} else if(c<=`'z'`) { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
755	#else
756	# error Unknown charset family!
757	#endif
758	// lowercase c
759	if(c==`'t'`) {
760	/ tr or tur? /
761	c=*locale++;
762	if(is_u(c)) {
763	c=*locale++;
764	}
765	if(is_r(c)) {
766	c=*locale;
767	if(is_sep(c)) {
768	return UCASE_LOC_TURKISH;
769	}
770	}
771	} else if(c==`'a'`) {
772	/ az or aze? /
773	c=*locale++;
774	if(is_z(c)) {
775	c=*locale++;
776	if(is_e(c)) {
777	c=*locale;
778	}
779	if(is_sep(c)) {
780	return UCASE_LOC_TURKISH;
781	}
782	}
783	} else if(c==`'l'`) {
784	/ lt or lit? /
785	c=*locale++;
786	if(is_i(c)) {
787	c=*locale++;
788	}
789	if(is_t(c)) {
790	c=*locale;
791	if(is_sep(c)) {
792	return UCASE_LOC_LITHUANIAN;
793	}
794	}
795	} else if(c==`'n'`) {
796	/ nl or nld? /
797	c=*locale++;
798	if(is_l(c)) {
799	c=*locale++;
800	if(is_d(c)) {
801	c=*locale;
802	}
803	if(is_sep(c)) {
804	return UCASE_LOC_DUTCH;
805	}
806	}
807	}
808	} else {
809	// uppercase c
810	// Same code as for lowercase c but also check for 'E'.
811	if(c==`'T'`) {
812	/ tr or tur? /
813	c=*locale++;
814	if(is_u(c)) {
815	c=*locale++;
816	}
817	if(is_r(c)) {
818	c=*locale;
819	if(is_sep(c)) {
820	return UCASE_LOC_TURKISH;
821	}
822	}
823	} else if(c==`'A'`) {
824	/ az or aze? /
825	c=*locale++;
826	if(is_z(c)) {
827	c=*locale++;
828	if(is_e(c)) {
829	c=*locale;
830	}
831	if(is_sep(c)) {
832	return UCASE_LOC_TURKISH;
833	}
834	}
835	} else if(c==`'L'`) {
836	/ lt or lit? /
837	c=*locale++;
838	if(is_i(c)) {
839	c=*locale++;
840	}
841	if(is_t(c)) {
842	c=*locale;
843	if(is_sep(c)) {
844	return UCASE_LOC_LITHUANIAN;
845	}
846	}
847	} else if(c==`'E'`) {
848	/ el or ell? /
849	c=*locale++;
850	if(is_l(c)) {
851	c=*locale++;
852	if(is_l(c)) {
853	c=*locale;
854	}
855	if(is_sep(c)) {
856	return UCASE_LOC_GREEK;
857	}
858	}
859	} else if(c==`'N'`) {
860	/ nl or nld? /
861	c=*locale++;
862	if(is_l(c)) {
863	c=*locale++;
864	if(is_d(c)) {
865	c=*locale;
866	}
867	if(is_sep(c)) {
868	return UCASE_LOC_DUTCH;
869	}
870	}
871	}
872	}
873	return UCASE_LOC_ROOT;
874	}
875
876	/*
877	* Is followed by
878	* {case-ignorable}* cased
879	* ?
880	* (dir determines looking forward/backward)
881	* If a character is case-ignorable, it is skipped regardless of whether
882	* it is also cased or not.
883	*/
884	static UBool
885	isFollowedByCasedLetter(UCaseContextIterator iter, void* *context, int8_t dir) {
886	UChar32 c;
887
888	if(iter==NULL) {
889	return FALSE;
890	}
891
892	for(/ dir!=0 sets direction /; (c=iter(context, dir))>=`0`; dir=`0`) {
893	int32_t type=ucase_getTypeOrIgnorable(c);
894	if(type&`4`) {
895	/ case-ignorable, continue with the loop /
896	} else if(type!=UCASE_NONE) {
897	return TRUE; / followed by cased letter /
898	} else {
899	return FALSE; / uncased and not case-ignorable /
900	}
901	}
902
903	return FALSE; / not followed by cased letter /
904	}
905
906	/ Is preceded by Soft_Dotted character with no intervening cc=230 ? /
907	static UBool
908	isPrecededBySoftDotted(UCaseContextIterator iter, void* *context) {
909	UChar32 c;
910	int32_t dotType;
911	int8_t dir;
912
913	if(iter==NULL) {
914	return FALSE;
915	}
916
917	for(dir=-`1`; (c=iter(context, dir))>=`0`; dir=`0`) {
918	dotType=getDotType(c);
919	if(dotType==UCASE_SOFT_DOTTED) {
920	return TRUE; / preceded by TYPE_i /
921	} else if(dotType!=UCASE_OTHER_ACCENT) {
922	return FALSE; / preceded by different base character (not TYPE_i), or intervening cc==230 /
923	}
924	}
925
926	return FALSE; / not preceded by TYPE_i /
927	}
928
929	/*
930	* See Jitterbug 2344:
931	* The condition After_I for Turkic-lowercasing of U+0307 combining dot above
932	* is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
933	* we made those releases compatible with Unicode 3.2 which had not fixed
934	* a related bug in SpecialCasing.txt.
935	*
936	* From the Jitterbug 2344 text:
937	* ... this bug is listed as a Unicode erratum
938	* from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
939	* <quote>
940	* There are two errors in SpecialCasing.txt.
941	* 1. Missing semicolons on two lines. ... [irrelevant for ICU]
942	* 2. An incorrect context definition. Correct as follows:
943	* < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
944	* < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
945	* ---
946	* > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
947	* > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
948	* where the context After_I is defined as:
949	* The last preceding base character was an uppercase I, and there is no
950	* intervening combining character class 230 (ABOVE).
951	* </quote>
952	*
953	* Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
954	*
955	* # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
956	* # This matches the behavior of the canonically equivalent I-dot_above
957	*
958	* See also the description in this place in older versions of uchar.c (revision 1.100).
959	*
960	* Markus W. Scherer 2003-feb-15
961	*/
962
963	/ Is preceded by base character 'I' with no intervening cc=230 ? /
964	static UBool
965	isPrecededBy_I(UCaseContextIterator iter, void* *context) {
966	UChar32 c;
967	int32_t dotType;
968	int8_t dir;
969
970	if(iter==NULL) {
971	return FALSE;
972	}
973
974	for(dir=-`1`; (c=iter(context, dir))>=`0`; dir=`0`) {
975	if(c==`0x49`) {
976	return TRUE; / preceded by I /
977	}
978	dotType=getDotType(c);
979	if(dotType!=UCASE_OTHER_ACCENT) {
980	return FALSE; / preceded by different base character (not I), or intervening cc==230 /
981	}
982	}
983
984	return FALSE; / not preceded by I /
985	}
986
987	/ Is followed by one or more cc==230 ? /
988	static UBool
989	isFollowedByMoreAbove(UCaseContextIterator iter, void* *context) {
990	UChar32 c;
991	int32_t dotType;
992	int8_t dir;
993
994	if(iter==NULL) {
995	return FALSE;
996	}
997
998	for(dir=`1`; (c=iter(context, dir))>=`0`; dir=`0`) {
999	dotType=getDotType(c);
1000	if(dotType==UCASE_ABOVE) {
1001	return TRUE; / at least one cc==230 following /
1002	} else if(dotType!=UCASE_OTHER_ACCENT) {
1003	return FALSE; / next base character, no more cc==230 following /
1004	}
1005	}
1006
1007	return FALSE; / no more cc==230 following /
1008	}
1009
1010	/ Is followed by a dot above (without cc==230 in between) ? /
1011	static UBool
1012	isFollowedByDotAbove(UCaseContextIterator iter, void* *context) {
1013	UChar32 c;
1014	int32_t dotType;
1015	int8_t dir;
1016
1017	if(iter==NULL) {
1018	return FALSE;
1019	}
1020
1021	for(dir=`1`; (c=iter(context, dir))>=`0`; dir=`0`) {
1022	if(c==`0x307`) {
1023	return TRUE;
1024	}
1025	dotType=getDotType(c);
1026	if(dotType!=UCASE_OTHER_ACCENT) {
1027	return FALSE; / next base character or cc==230 in between /
1028	}
1029	}
1030
1031	return FALSE; / no dot above following /
1032	}
1033
1034	U_CAPI int32_t U_EXPORT2
1035	ucase_toFullLower(UChar32 c,
1036	UCaseContextIterator iter, void* *context,
1037	const UChar **pString,
1038	int32_t loc) {
1039	// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1040	U_ASSERT(c >= `0`);
1041	UChar32 result=c;
1042	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1043	if(!UCASE_HAS_EXCEPTION(props)) {
1044	if(UCASE_IS_UPPER_OR_TITLE(props)) {
1045	result=c+UCASE_GET_DELTA(props);
1046	}
1047	} else {
1048	const uint16_t pe=GET_EXCEPTIONS(&ucase_props_singleton, props), pe2;
1049	uint16_t excWord=*pe++;
1050	int32_t full;
1051
1052	pe2=pe;
1053
1054	if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1055	/ use hardcoded conditions and mappings /
1056
1057	/*
1058	* Test for conditional mappings first
1059	* (otherwise the unconditional default mappings are always taken),
1060	* then test for characters that have unconditional mappings in SpecialCasing.txt,
1061	* then get the UnicodeData.txt mappings.
1062	*/
1063	if( loc==UCASE_LOC_LITHUANIAN &&
1064	/ base characters, find accents above /
1065	(((c==`0x49` \|\| c==`0x4a` \|\| c==`0x12e`) &&
1066	isFollowedByMoreAbove(iter, context)) \|\|
1067	/ precomposed with accent above, no need to find one /
1068	(c==`0xcc` \|\| c==`0xcd` \|\| c==`0x128`))
1069	) {
1070	/*
1071	# Lithuanian
1072
1073	# Lithuanian retains the dot in a lowercase i when followed by accents.
1074
1075	# Introduce an explicit dot above when lowercasing capital I's and J's
1076	# whenever there are more accents above.
1077	# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1078
1079	0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1080	004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1081	012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1082	00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1083	00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1084	0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1085	*/
1086	switch(c) {
1087	case `0x49`: / LATIN CAPITAL LETTER I /
1088	*pString=iDot;
1089	return `2`;
1090	case `0x4a`: / LATIN CAPITAL LETTER J /
1091	*pString=jDot;
1092	return `2`;
1093	case `0x12e`: / LATIN CAPITAL LETTER I WITH OGONEK /
1094	*pString=iOgonekDot;
1095	return `2`;
1096	case `0xcc`: / LATIN CAPITAL LETTER I WITH GRAVE /
1097	*pString=iDotGrave;
1098	return `3`;
1099	case `0xcd`: / LATIN CAPITAL LETTER I WITH ACUTE /
1100	*pString=iDotAcute;
1101	return `3`;
1102	case `0x128`: / LATIN CAPITAL LETTER I WITH TILDE /
1103	*pString=iDotTilde;
1104	return `3`;
1105	default:
1106	return `0`; / will not occur /
1107	}
1108	/ # Turkish and Azeri /
1109	} else if(loc==UCASE_LOC_TURKISH && c==`0x130`) {
1110	/*
1111	# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1112	# The following rules handle those cases.
1113
1114	0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1115	0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1116	*/
1117	return `0x69`;
1118	} else if(loc==UCASE_LOC_TURKISH && c==`0x307` && isPrecededBy_I(iter, context)) {
1119	/*
1120	# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1121	# This matches the behavior of the canonically equivalent I-dot_above
1122
1123	0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1124	0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1125	*/
1126	pString=nullptr*;
1127	return `0`; / remove the dot (continue without output) /
1128	} else if(loc==UCASE_LOC_TURKISH && c==`0x49` && !isFollowedByDotAbove(iter, context)) {
1129	/*
1130	# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1131
1132	0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1133	0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1134	*/
1135	return `0x131`;
1136	} else if(c==`0x130`) {
1137	/*
1138	# Preserve canonical equivalence for I with dot. Turkic is handled below.
1139
1140	0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1141	*/
1142	*pString=iDot;
1143	return `2`;
1144	} else if( c==`0x3a3` &&
1145	!isFollowedByCasedLetter(iter, context, `1`) &&
1146	isFollowedByCasedLetter(iter, context, -`1`) / -1=preceded /
1147	) {
1148	/ greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) /
1149	/*
1150	# Special case for final form of sigma
1151
1152	03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1153	*/
1154	return `0x3c2`; / greek small final sigma /
1155	} else {
1156	/ no known conditional special case mapping, use a normal mapping /
1157	}
1158	} else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1159	GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1160	full&=UCASE_FULL_LOWER;
1161	if(full!=`0`) {
1162	/ set the output pointer to the lowercase mapping /
1163	pString=reinterpret_cast<const* UChar *>(pe+`1`);
1164
1165	/ return the string length /
1166	return full;
1167	}
1168	}
1169
1170	if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1171	int32_t delta;
1172	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1173	return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta;
1174	}
1175	if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1176	GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1177	}
1178	}
1179
1180	return (result==c) ? ~result : result;
1181	}
1182
1183	/ internal /
1184	static int32_t
1185	toUpperOrTitle(UChar32 c,
1186	UCaseContextIterator iter, void* *context,
1187	const UChar **pString,
1188	int32_t loc,
1189	UBool upperNotTitle) {
1190	// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1191	U_ASSERT(c >= `0`);
1192	UChar32 result=c;
1193	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1194	if(!UCASE_HAS_EXCEPTION(props)) {
1195	if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1196	result=c+UCASE_GET_DELTA(props);
1197	}
1198	} else {
1199	const uint16_t pe=GET_EXCEPTIONS(&ucase_props_singleton, props), pe2;
1200	uint16_t excWord=*pe++;
1201	int32_t full, idx;
1202
1203	pe2=pe;
1204
1205	if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1206	/ use hardcoded conditions and mappings /
1207	if(loc==UCASE_LOC_TURKISH && c==`0x69`) {
1208	/*
1209	# Turkish and Azeri
1210
1211	# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1212	# The following rules handle those cases.
1213
1214	# When uppercasing, i turns into a dotted capital I
1215
1216	0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1217	0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1218	*/
1219	return `0x130`;
1220	} else if(loc==UCASE_LOC_LITHUANIAN && c==`0x307` && isPrecededBySoftDotted(iter, context)) {
1221	/*
1222	# Lithuanian
1223
1224	# Lithuanian retains the dot in a lowercase i when followed by accents.
1225
1226	# Remove DOT ABOVE after "i" with upper or titlecase
1227
1228	0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1229	*/
1230	pString=nullptr*;
1231	return `0`; / remove the dot (continue without output) /
1232	} else {
1233	/ no known conditional special case mapping, use a normal mapping /
1234	}
1235	} else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1236	GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1237
1238	/ start of full case mapping strings /
1239	++pe;
1240
1241	/ skip the lowercase and case-folding result strings /
1242	pe+=full&UCASE_FULL_LOWER;
1243	full>>=`4`;
1244	pe+=full&`0xf`;
1245	full>>=`4`;
1246
1247	if(upperNotTitle) {
1248	full&=`0xf`;
1249	} else {
1250	/ skip the uppercase result string /
1251	pe+=full&`0xf`;
1252	full=(full>>`4`)&`0xf`;
1253	}
1254
1255	if(full!=`0`) {
1256	/ set the output pointer to the result string /
1257	pString=reinterpret_cast<const* UChar *>(pe);
1258
1259	/ return the string length /
1260	return full;
1261	}
1262	}
1263
1264	if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1265	int32_t delta;
1266	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1267	return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta;
1268	}
1269	if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1270	idx=UCASE_EXC_TITLE;
1271	} else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1272	/ here, titlecase is same as uppercase /
1273	idx=UCASE_EXC_UPPER;
1274	} else {
1275	return ~c;
1276	}
1277	GET_SLOT_VALUE(excWord, idx, pe2, result);
1278	}
1279
1280	return (result==c) ? ~result : result;
1281	}
1282
1283	U_CAPI int32_t U_EXPORT2
1284	ucase_toFullUpper(UChar32 c,
1285	UCaseContextIterator iter, void* *context,
1286	const UChar **pString,
1287	int32_t caseLocale) {
1288	return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
1289	}
1290
1291	U_CAPI int32_t U_EXPORT2
1292	ucase_toFullTitle(UChar32 c,
1293	UCaseContextIterator iter, void* *context,
1294	const UChar **pString,
1295	int32_t caseLocale) {
1296	return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
1297	}
1298
/* case folding ------------------------------------------------------------- */
1300
1301	/*
1302	* Case folding is similar to lowercasing.
1303	* The result may be a simple mapping, i.e., a single code point, or
1304	* a full mapping, i.e., a string.
1305	* If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1306	* then only the lowercase mapping is stored.
1307	*
1308	* Some special cases are hardcoded because their conditions cannot be
1309	* parsed and processed from CaseFolding.txt.
1310	*
1311	* Unicode 3.2 CaseFolding.txt specifies for its status field:
1312
1313	# C: common case folding, common mappings shared by both simple and full mappings.
1314	# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1315	# S: simple case folding, mappings to single characters where different from F.
1316	# T: special case for uppercase I and dotted uppercase I
1317	# - For non-Turkic languages, this mapping is normally not used.
1318	# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1319	#
1320	# Usage:
1321	# A. To do a simple case folding, use the mappings with status C + S.
1322	# B. To do a full case folding, use the mappings with status C + F.
1323	#
1324	# The mappings with status T can be used or omitted depending on the desired case-folding
1325	# behavior. (The default option is to exclude them.)
1326
1327	* Unicode 3.2 has 'T' mappings as follows:
1328
1329	0049; T; 0131; # LATIN CAPITAL LETTER I
1330	0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1331
1332	* while the default mappings for these code points are:
1333
1334	0049; C; 0069; # LATIN CAPITAL LETTER I
1335	0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1336
1337	* U+0130 has no simple case folding (simple-case-folds to itself).
1338	*/
1339
1340	/ return the simple case folding mapping for c /
1341	U_CAPI UChar32 U_EXPORT2
1342	ucase_fold(UChar32 c, uint32_t options) {
1343	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1344	if(!UCASE_HAS_EXCEPTION(props)) {
1345	if(UCASE_IS_UPPER_OR_TITLE(props)) {
1346	c+=UCASE_GET_DELTA(props);
1347	}
1348	} else {
1349	const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1350	uint16_t excWord=*pe++;
1351	int32_t idx;
1352	if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1353	/ special case folding mappings, hardcoded /
1354	if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1355	/ default mappings /
1356	if(c==`0x49`) {
1357	/ 0049; C; 0069; # LATIN CAPITAL LETTER I /
1358	return `0x69`;
1359	} else if(c==`0x130`) {
1360	/ no simple case folding for U+0130 /
1361	return c;
1362	}
1363	} else {
1364	/ Turkic mappings /
1365	if(c==`0x49`) {
1366	/ 0049; T; 0131; # LATIN CAPITAL LETTER I /
1367	return `0x131`;
1368	} else if(c==`0x130`) {
1369	/ 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE /
1370	return `0x69`;
1371	}
1372	}
1373	}
1374	if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=`0`) {
1375	return c;
1376	}
1377	if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1378	int32_t delta;
1379	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1380	return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta;
1381	}
1382	if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1383	idx=UCASE_EXC_FOLD;
1384	} else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1385	idx=UCASE_EXC_LOWER;
1386	} else {
1387	return c;
1388	}
1389	GET_SLOT_VALUE(excWord, idx, pe, c);
1390	}
1391	return c;
1392	}
1393
1394	/*
1395	* Issue for canonical caseless match (UAX #21):
1396	* Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1397	* canonical equivalence, unlike default-option casefolding.
1398	* For example, I-grave and I + grave fold to strings that are not canonically
1399	* equivalent.
1400	* For more details, see the comment in unorm_compare() in unorm.cpp
1401	* and the intermediate prototype changes for Jitterbug 2021.
1402	* (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1403	*
1404	* This did not get fixed because it appears that it is not possible to fix
1405	* it for uppercase and lowercase characters (I-grave vs. i-grave)
1406	* together in a way that they still fold to common result strings.
1407	*/
1408
1409	U_CAPI int32_t U_EXPORT2
1410	ucase_toFullFolding(UChar32 c,
1411	const UChar **pString,
1412	uint32_t options) {
1413	// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1414	U_ASSERT(c >= `0`);
1415	UChar32 result=c;
1416	uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1417	if(!UCASE_HAS_EXCEPTION(props)) {
1418	if(UCASE_IS_UPPER_OR_TITLE(props)) {
1419	result=c+UCASE_GET_DELTA(props);
1420	}
1421	} else {
1422	const uint16_t pe=GET_EXCEPTIONS(&ucase_props_singleton, props), pe2;
1423	uint16_t excWord=*pe++;
1424	int32_t full, idx;
1425
1426	pe2=pe;
1427
1428	if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1429	/ use hardcoded conditions and mappings /
1430	if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1431	/ default mappings /
1432	if(c==`0x49`) {
1433	/ 0049; C; 0069; # LATIN CAPITAL LETTER I /
1434	return `0x69`;
1435	} else if(c==`0x130`) {
1436	/ 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE /
1437	*pString=iDot;
1438	return `2`;
1439	}
1440	} else {
1441	/ Turkic mappings /
1442	if(c==`0x49`) {
1443	/ 0049; T; 0131; # LATIN CAPITAL LETTER I /
1444	return `0x131`;
1445	} else if(c==`0x130`) {
1446	/ 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE /
1447	return `0x69`;
1448	}
1449	}
1450	} else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1451	GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1452
1453	/ start of full case mapping strings /
1454	++pe;
1455
1456	/ skip the lowercase result string /
1457	pe+=full&UCASE_FULL_LOWER;
1458	full=(full>>`4`)&`0xf`;
1459
1460	if(full!=`0`) {
1461	/ set the output pointer to the result string /
1462	pString=reinterpret_cast<const* UChar *>(pe);
1463
1464	/ return the string length /
1465	return full;
1466	}
1467	}
1468
1469	if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=`0`) {
1470	return ~c;
1471	}
1472	if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1473	int32_t delta;
1474	GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1475	return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==`0` ? c+delta : c-delta;
1476	}
1477	if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1478	idx=UCASE_EXC_FOLD;
1479	} else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1480	idx=UCASE_EXC_LOWER;
1481	} else {
1482	return ~c;
1483	}
1484	GET_SLOT_VALUE(excWord, idx, pe2, result);
1485	}
1486
1487	return (result==c) ? ~result : result;
1488	}
1489
/* case mapping properties API ---------------------------------------------- */

/* public API (see uchar.h) */
1493
1494	U_CAPI UBool U_EXPORT2
1495	u_isULowercase(UChar32 c) {
1496	return (UBool)(UCASE_LOWER==ucase_getType(c));
1497	}
1498
1499	U_CAPI UBool U_EXPORT2
1500	u_isUUppercase(UChar32 c) {
1501	return (UBool)(UCASE_UPPER==ucase_getType(c));
1502	}
1503
1504	/ Transforms the Unicode character to its lower case equivalent./
1505	U_CAPI UChar32 U_EXPORT2
1506	u_tolower(UChar32 c) {
1507	return ucase_tolower(c);
1508	}
1509
1510	/ Transforms the Unicode character to its upper case equivalent./
1511	U_CAPI UChar32 U_EXPORT2
1512	u_toupper(UChar32 c) {
1513	return ucase_toupper(c);
1514	}
1515
1516	/ Transforms the Unicode character to its title case equivalent./
1517	U_CAPI UChar32 U_EXPORT2
1518	u_totitle(UChar32 c) {
1519	return ucase_totitle(c);
1520	}
1521
1522	/ return the simple case folding mapping for c /
1523	U_CAPI UChar32 U_EXPORT2
1524	u_foldCase(UChar32 c, uint32_t options) {
1525	return ucase_fold(c, options);
1526	}
1527
1528	U_CFUNC int32_t U_EXPORT2
1529	ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1530	/ case mapping properties /
1531	const UChar *resultString;
1532	switch(which) {
1533	case UCHAR_LOWERCASE:
1534	return (UBool)(UCASE_LOWER==ucase_getType(c));
1535	case UCHAR_UPPERCASE:
1536	return (UBool)(UCASE_UPPER==ucase_getType(c));
1537	case UCHAR_SOFT_DOTTED:
1538	return ucase_isSoftDotted(c);
1539	case UCHAR_CASE_SENSITIVE:
1540	return ucase_isCaseSensitive(c);
1541	case UCHAR_CASED:
1542	return (UBool)(UCASE_NONE!=ucase_getType(c));
1543	case UCHAR_CASE_IGNORABLE:
1544	return (UBool)(ucase_getTypeOrIgnorable(c)>>`2`);
1545	/*
1546	* Note: The following Changes_When_Xyz are defined as testing whether
1547	* the NFD form of the input changes when Xyz-case-mapped.
1548	* However, this simpler implementation of these properties,
1549	* ignoring NFD, passes the tests.
1550	* The implementation needs to be changed if the tests start failing.
1551	* When that happens, optimizations should be used to work with the
1552	* per-single-code point ucase_toFullXyz() functions unless
1553	* the NFD form has more than one code point,
1554	* and the property starts set needs to be the union of the
1555	* start sets for normalization and case mappings.
1556	*/
1557	case UCHAR_CHANGES_WHEN_LOWERCASED:
1558	return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=`0`);
1559	case UCHAR_CHANGES_WHEN_UPPERCASED:
1560	return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=`0`);
1561	case UCHAR_CHANGES_WHEN_TITLECASED:
1562	return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=`0`);
1563	/ case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c /
1564	case UCHAR_CHANGES_WHEN_CASEMAPPED:
1565	return (UBool)(
1566	ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=`0` \|\|
1567	ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=`0` \|\|
1568	ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=`0`);
1569	default:
1570	return FALSE;
1571	}
1572	}
1573

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucase.cpp