ucasemap.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucasemap.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2005-2016, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: ucasemap.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2005may06
16	* created by: Markus W. Scherer
17	*
18	* Case mapping service object and functions using it.
19	*/
20
21	#include "unicode/utypes.h"
22	#include "unicode/brkiter.h"
23	#include "unicode/bytestream.h"
24	#include "unicode/casemap.h"
25	#include "unicode/edits.h"
26	#include "unicode/stringoptions.h"
27	#include "unicode/stringpiece.h"
28	#include "unicode/ubrk.h"
29	#include "unicode/uloc.h"
30	#include "unicode/ustring.h"
31	#include "unicode/ucasemap.h"
32	#if !UCONFIG_NO_BREAK_ITERATION
33	#include "unicode/utext.h"
34	#endif
35	#include "unicode/utf.h"
36	#include "unicode/utf8.h"
37	#include "unicode/utf16.h"
38	#include "bytesinkutil.h"
39	#include "cmemory.h"
40	#include "cstring.h"
41	#include "uassert.h"
42	#include "ucase.h"
43	#include "ucasemap_imp.h"
44	#include "ustr_imp.h"
45
46	U_NAMESPACE_USE
47
/* UCaseMap service object -------------------------------------------------- */
49
50	UCaseMap::UCaseMap(const char localeID, uint32_t opts, UErrorCode pErrorCode) :
51	#if !UCONFIG_NO_BREAK_ITERATION
52	iter(NULL),
53	#endif
54	caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55	ucasemap_setLocale(this, localeID, pErrorCode);
56	}
57
58	UCaseMap::~UCaseMap() {
59	#if !UCONFIG_NO_BREAK_ITERATION
60	delete iter;
61	#endif
62	}
63
64	U_CAPI UCaseMap * U_EXPORT2
65	ucasemap_open(const char locale, uint32_t options, UErrorCode pErrorCode) {
66	if(U_FAILURE(*pErrorCode)) {
67	return NULL;
68	}
69	UCaseMap csm = new* UCaseMap (locale, options, pErrorCode);
70	if(csm==NULL) {
71	*pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72	return NULL;
73	} else if (U_FAILURE(*pErrorCode)) {
74	delete csm;
75	return NULL;
76	}
77	return csm;
78	}
79
80	U_CAPI void U_EXPORT2
81	ucasemap_close(UCaseMap *csm) {
82	delete csm;
83	}
84
85	U_CAPI const char * U_EXPORT2
86	ucasemap_getLocale(const UCaseMap *csm) {
87	return csm->locale;
88	}
89
90	U_CAPI uint32_t U_EXPORT2
91	ucasemap_getOptions(const UCaseMap *csm) {
92	return csm->options;
93	}
94
95	U_CAPI void U_EXPORT2
96	ucasemap_setLocale(UCaseMap csm, const* char locale, UErrorCode pErrorCode) {
97	if(U_FAILURE(*pErrorCode)) {
98	return;
99	}
100	if (locale != NULL && *locale == `0`) {
101	csm->locale[`0`] = `0`;
102	csm->caseLocale = UCASE_LOC_ROOT;
103	return;
104	}
105
106	int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107	if(pErrorCode==U_BUFFER_OVERFLOW_ERROR \|\| length==sizeof*(csm->locale)) {
108	*pErrorCode=U_ZERO_ERROR;
109	/ we only really need the language code for case mappings /
110	length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111	}
112	if(length==sizeof(csm->locale)) {
113	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114	}
115	if(U_SUCCESS(*pErrorCode)) {
116	csm->caseLocale=UCASE_LOC_UNKNOWN;
117	csm->caseLocale = ucase_getCaseLocale(csm->locale);
118	} else {
119	csm->locale[`0`]=`0`;
120	csm->caseLocale = UCASE_LOC_ROOT;
121	}
122	}
123
124	U_CAPI void U_EXPORT2
125	ucasemap_setOptions(UCaseMap csm, uint32_t options, UErrorCode pErrorCode) {
126	if(U_FAILURE(*pErrorCode)) {
127	return;
128	}
129	csm->options=options;
130	}
131
/* UTF-8 string case mappings ----------------------------------------------- */

/* TODO(markus): Move to a new, separate utf8case.cpp file. */
135
136	namespace {
137
138	/ append a full case mapping result, see UCASE_MAX_STRING_LENGTH /
139	inline UBool
140	appendResult(int32_t cpLength, int32_t result, const UChar *s,
141	ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
142	U_ASSERT(U_SUCCESS(errorCode));
143
144	/ decode the result /
145	if(result<`0`) {
146	/ (not) original code point /
147	if(edits!=NULL) {
148	edits->addUnchanged(cpLength);
149	}
150	if((options & U_OMIT_UNCHANGED_TEXT) == `0`) {
151	ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
152	}
153	} else {
154	if(result<=UCASE_MAX_STRING_LENGTH) {
155	// string: "result" is the UTF-16 length
156	return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
157	} else {
158	ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
159	}
160	}
161	return TRUE;
162	}
163
164	// See unicode/utf8.h U8_APPEND_UNSAFE().
165	inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> `6`) \| `0xc0`); }
166	inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & `0x3f`) \| `0x80`); }
167
168	UChar32 U_CALLCONV
169	utf8_caseContextIterator(void *context, int8_t dir) {
170	UCaseContext csc=(UCaseContext )context;
171	UChar32 c;
172
173	if(dir<`0`) {
174	/ reset for backward iteration /
175	csc->index=csc->cpStart;
176	csc->dir=dir;
177	} else if(dir>`0`) {
178	/ reset for forward iteration /
179	csc->index=csc->cpLimit;
180	csc->dir=dir;
181	} else {
182	/ continue current iteration direction /
183	dir=csc->dir;
184	}
185
186	if(dir<`0`) {
187	if(csc->start<csc->index) {
188	U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
189	return c;
190	}
191	} else {
192	if(csc->index<csc->limit) {
193	U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
194	return c;
195	}
196	}
197	return U_SENTINEL;
198	}
199
200	/**
201	* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
202	* caseLocale < 0: Case-folds [srcStart..srcLimit[.
203	*/
204	void toLower(int32_t caseLocale, uint32_t options,
205	const uint8_t src, UCaseContext csc, int32_t srcStart, int32_t srcLimit,
206	icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
207	const int8_t *latinToLower;
208	if (caseLocale == UCASE_LOC_ROOT \|\|
209	(caseLocale >= `0` ?
210	!(caseLocale == UCASE_LOC_TURKISH \|\| caseLocale == UCASE_LOC_LITHUANIAN) :
211	(options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
212	latinToLower = LatinCase::TO_LOWER_NORMAL;
213	} else {
214	latinToLower = LatinCase::TO_LOWER_TR_LT;
215	}
216	const UTrie2 *trie = ucase_getTrie();
217	int32_t prev = srcStart;
218	int32_t srcIndex = srcStart;
219	for (;;) {
220	// fast path for simple cases
221	int32_t cpStart;
222	UChar32 c;
223	for (;;) {
224	if (U_FAILURE(errorCode) \|\| srcIndex >= srcLimit) {
225	c = U_SENTINEL;
226	break;
227	}
228	uint8_t lead = src[srcIndex++];
229	if (lead <= `0x7f`) {
230	int8_t d = latinToLower[lead];
231	if (d == LatinCase::EXC) {
232	cpStart = srcIndex - `1`;
233	c = lead;
234	break;
235	}
236	if (d == `0`) { continue; }
237	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `1` - prev,
238	sink, options, edits, errorCode);
239	char ascii = (char)(lead + d);
240	sink.Append(&ascii, `1`);
241	if (edits != nullptr) {
242	edits->addReplace(`1`, `1`);
243	}
244	prev = srcIndex;
245	continue;
246	} else if (lead < `0xe3`) {
247	uint8_t t;
248	if (`0xc2` <= lead && lead <= `0xc5` && srcIndex < srcLimit &&
249	(t = src[srcIndex] - `0x80`) <= `0x3f`) {
250	// U+0080..U+017F
251	++srcIndex;
252	c = ((lead - `0xc0`) << `6`) \| t;
253	int8_t d = latinToLower[c];
254	if (d == LatinCase::EXC) {
255	cpStart = srcIndex - `2`;
256	break;
257	}
258	if (d == `0`) { continue; }
259	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `2` - prev,
260	sink, options, edits, errorCode);
261	ByteSinkUtil::appendTwoBytes(c + d, sink);
262	if (edits != nullptr) {
263	edits->addReplace(`2`, `2`);
264	}
265	prev = srcIndex;
266	continue;
267	}
268	} else if ((lead <= `0xe9` \|\| lead == `0xeb` \|\| lead == `0xec`) &&
269	(srcIndex + `2`) <= srcLimit &&
270	U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + `1`])) {
271	// most of CJK: no case mappings
272	srcIndex += `2`;
273	continue;
274	}
275	cpStart = --srcIndex;
276	U8_NEXT(src, srcIndex, srcLimit, c);
277	if (c < `0`) {
278	// ill-formed UTF-8
279	continue;
280	}
281	uint16_t props = UTRIE2_GET16(trie, c);
282	if (UCASE_HAS_EXCEPTION(props)) { break; }
283	int32_t delta;
284	if (!UCASE_IS_UPPER_OR_TITLE(props) \|\| (delta = UCASE_GET_DELTA(props)) == `0`) {
285	continue;
286	}
287	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
288	sink, options, edits, errorCode);
289	ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
290	prev = srcIndex;
291	}
292	if (c < `0`) {
293	break;
294	}
295	// slow path
296	const UChar *s;
297	if (caseLocale >= `0`) {
298	csc->cpStart = cpStart;
299	csc->cpLimit = srcIndex;
300	c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
301	} else {
302	c = ucase_toFullFolding(c, &s, options);
303	}
304	if (c >= `0`) {
305	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
306	sink, options, edits, errorCode);
307	appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
308	prev = srcIndex;
309	}
310	}
311	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
312	sink, options, edits, errorCode);
313	}
314
315	void toUpper(int32_t caseLocale, uint32_t options,
316	const uint8_t src, UCaseContext csc, int32_t srcLength,
317	icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
318	const int8_t *latinToUpper;
319	if (caseLocale == UCASE_LOC_TURKISH) {
320	latinToUpper = LatinCase::TO_UPPER_TR;
321	} else {
322	latinToUpper = LatinCase::TO_UPPER_NORMAL;
323	}
324	const UTrie2 *trie = ucase_getTrie();
325	int32_t prev = `0`;
326	int32_t srcIndex = `0`;
327	for (;;) {
328	// fast path for simple cases
329	int32_t cpStart;
330	UChar32 c;
331	for (;;) {
332	if (U_FAILURE(errorCode) \|\| srcIndex >= srcLength) {
333	c = U_SENTINEL;
334	break;
335	}
336	uint8_t lead = src[srcIndex++];
337	if (lead <= `0x7f`) {
338	int8_t d = latinToUpper[lead];
339	if (d == LatinCase::EXC) {
340	cpStart = srcIndex - `1`;
341	c = lead;
342	break;
343	}
344	if (d == `0`) { continue; }
345	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `1` - prev,
346	sink, options, edits, errorCode);
347	char ascii = (char)(lead + d);
348	sink.Append(&ascii, `1`);
349	if (edits != nullptr) {
350	edits->addReplace(`1`, `1`);
351	}
352	prev = srcIndex;
353	continue;
354	} else if (lead < `0xe3`) {
355	uint8_t t;
356	if (`0xc2` <= lead && lead <= `0xc5` && srcIndex < srcLength &&
357	(t = src[srcIndex] - `0x80`) <= `0x3f`) {
358	// U+0080..U+017F
359	++srcIndex;
360	c = ((lead - `0xc0`) << `6`) \| t;
361	int8_t d = latinToUpper[c];
362	if (d == LatinCase::EXC) {
363	cpStart = srcIndex - `2`;
364	break;
365	}
366	if (d == `0`) { continue; }
367	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `2` - prev,
368	sink, options, edits, errorCode);
369	ByteSinkUtil::appendTwoBytes(c + d, sink);
370	if (edits != nullptr) {
371	edits->addReplace(`2`, `2`);
372	}
373	prev = srcIndex;
374	continue;
375	}
376	} else if ((lead <= `0xe9` \|\| lead == `0xeb` \|\| lead == `0xec`) &&
377	(srcIndex + `2`) <= srcLength &&
378	U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + `1`])) {
379	// most of CJK: no case mappings
380	srcIndex += `2`;
381	continue;
382	}
383	cpStart = --srcIndex;
384	U8_NEXT(src, srcIndex, srcLength, c);
385	if (c < `0`) {
386	// ill-formed UTF-8
387	continue;
388	}
389	uint16_t props = UTRIE2_GET16(trie, c);
390	if (UCASE_HAS_EXCEPTION(props)) { break; }
391	int32_t delta;
392	if (UCASE_GET_TYPE(props) != UCASE_LOWER \|\| (delta = UCASE_GET_DELTA(props)) == `0`) {
393	continue;
394	}
395	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
396	sink, options, edits, errorCode);
397	ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
398	prev = srcIndex;
399	}
400	if (c < `0`) {
401	break;
402	}
403	// slow path
404	csc->cpStart = cpStart;
405	csc->cpLimit = srcIndex;
406	const UChar *s;
407	c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
408	if (c >= `0`) {
409	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
410	sink, options, edits, errorCode);
411	appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
412	prev = srcIndex;
413	}
414	}
415	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
416	sink, options, edits, errorCode);
417	}
418
419	} // namespace
420
421	#if !UCONFIG_NO_BREAK_ITERATION
422
423	U_CFUNC void U_CALLCONV
424	ucasemap_internalUTF8ToTitle(
425	int32_t caseLocale, uint32_t options, BreakIterator *iter,
426	const uint8_t *src, int32_t srcLength,
427	ByteSink &sink, icu::Edits *edits,
428	UErrorCode &errorCode) {
429	if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
430	return;
431	}
432
433	/ set up local variables /
434	UCaseContext csc=UCASECONTEXT_INITIALIZER;
435	csc.p=(void *)src;
436	csc.limit=srcLength;
437	int32_t prev=`0`;
438	UBool isFirstIndex=TRUE;
439
440	/ titlecasing loop /
441	while(prev<srcLength) {
442	/ find next index where to titlecase /
443	int32_t index;
444	if(isFirstIndex) {
445	isFirstIndex=FALSE;
446	index=iter->first();
447	} else {
448	index=iter->next();
449	}
450	if(index==UBRK_DONE \|\| index>srcLength) {
451	index=srcLength;
452	}
453
454	/*
455	* Segment [prev..index[ into 3 parts:
456	* a) skipped characters (copy as-is) [prev..titleStart[
457	* b) first letter (titlecase) [titleStart..titleLimit[
458	* c) subsequent characters (lowercase) [titleLimit..index[
459	*/
460	if(prev<index) {
461	/ find and copy skipped characters [prev..titleStart[ /
462	int32_t titleStart=prev;
463	int32_t titleLimit=prev;
464	UChar32 c;
465	U8_NEXT(src, titleLimit, index, c);
466	if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==`0`) {
467	// Adjust the titlecasing index to the next cased character,
468	// or to the next letter/number/symbol/private use.
469	// Stop with titleStart<titleLimit<=index
470	// if there is a character to be titlecased,
471	// or else stop with titleStart==titleLimit==index.
472	UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != `0`;
473	while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
474	titleStart=titleLimit;
475	if(titleLimit==index) {
476	break;
477	}
478	U8_NEXT(src, titleLimit, index, c);
479	}
480	if (prev < titleStart) {
481	if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
482	sink, options, edits, errorCode)) {
483	return;
484	}
485	}
486	}
487
488	if(titleStart<titleLimit) {
489	/ titlecase c which is from [titleStart..titleLimit[ /
490	if(c>=`0`) {
491	csc.cpStart=titleStart;
492	csc.cpLimit=titleLimit;
493	const UChar *s;
494	c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
495	if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
496	return;
497	}
498	} else {
499	// Malformed UTF-8.
500	if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
501	sink, options, edits, errorCode)) {
502	return;
503	}
504	}
505
506	/ Special case Dutch IJ titlecasing /
507	if (titleStart+`1` < index &&
508	caseLocale == UCASE_LOC_DUTCH &&
509	(src[titleStart] == `0x0049` \|\| src[titleStart] == `0x0069`)) {
510	if (src[titleStart+`1`] == `0x006A`) {
511	ByteSinkUtil::appendCodePoint(`1`, `0x004A`, sink, edits);
512	titleLimit++;
513	} else if (src[titleStart+`1`] == `0x004A`) {
514	// Keep the capital J from getting lowercased.
515	if (!ByteSinkUtil::appendUnchanged(src+titleStart+`1`, `1`,
516	sink, options, edits, errorCode)) {
517	return;
518	}
519	titleLimit++;
520	}
521	}
522
523	/ lowercase [titleLimit..index[ /
524	if(titleLimit<index) {
525	if((options&U_TITLECASE_NO_LOWERCASE)==`0`) {
526	/ Normal operation: Lowercase the rest of the word. /
527	toLower(caseLocale, options,
528	src, &csc, titleLimit, index,
529	sink, edits, errorCode);
530	if(U_FAILURE(errorCode)) {
531	return;
532	}
533	} else {
534	/ Optionally just copy the rest of the word unchanged. /
535	if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
536	sink, options, edits, errorCode)) {
537	return;
538	}
539	}
540	}
541	}
542	}
543
544	prev=index;
545	}
546	}
547
548	#endif
549
550	U_NAMESPACE_BEGIN
551	namespace GreekUpper {
552
553	UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
554	while (i < length) {
555	UChar32 c;
556	U8_NEXT(s, i, length, c);
557	int32_t type = ucase_getTypeOrIgnorable(c);
558	if ((type & UCASE_IGNORABLE) != `0`) {
559	// Case-ignorable, continue with the loop.
560	} else if (type != UCASE_NONE) {
561	return TRUE; // Followed by cased letter.
562	} else {
563	return FALSE; // Uncased and not case-ignorable.
564	}
565	}
566	return FALSE; // Not followed by cased letter.
567	}
568
569	// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
570	void toUpper(uint32_t options,
571	const uint8_t *src, int32_t srcLength,
572	ByteSink &sink, Edits *edits,
573	UErrorCode &errorCode) {
574	uint32_t state = `0`;
575	for (int32_t i = `0`; i < srcLength;) {
576	int32_t nextIndex = i;
577	UChar32 c;
578	U8_NEXT(src, nextIndex, srcLength, c);
579	uint32_t nextState = `0`;
580	int32_t type = ucase_getTypeOrIgnorable(c);
581	if ((type & UCASE_IGNORABLE) != `0`) {
582	// c is case-ignorable
583	nextState \|= (state & AFTER_CASED);
584	} else if (type != UCASE_NONE) {
585	// c is cased
586	nextState \|= AFTER_CASED;
587	}
588	uint32_t data = getLetterData(c);
589	if (data > `0`) {
590	uint32_t upper = data & UPPER_MASK;
591	// Add a dialytika to this iota or ypsilon vowel
592	// if we removed a tonos from the previous vowel,
593	// and that previous vowel did not also have (or gain) a dialytika.
594	// Adding one only to the final vowel in a longer sequence
595	// (which does not occur in normal writing) would require lookahead.
596	// Set the same flag as for preserving an existing dialytika.
597	if ((data & HAS_VOWEL) != `0` && (state & AFTER_VOWEL_WITH_ACCENT) != `0` &&
598	(upper == `0x399` \|\| upper == `0x3A5`)) {
599	data \|= HAS_DIALYTIKA;
600	}
601	int32_t numYpogegrammeni = `0`; // Map each one to a trailing, spacing, capital iota.
602	if ((data & HAS_YPOGEGRAMMENI) != `0`) {
603	numYpogegrammeni = `1`;
604	}
605	// Skip combining diacritics after this Greek letter.
606	int32_t nextNextIndex = nextIndex;
607	while (nextIndex < srcLength) {
608	UChar32 c2;
609	U8_NEXT(src, nextNextIndex, srcLength, c2);
610	uint32_t diacriticData = getDiacriticData(c2);
611	if (diacriticData != `0`) {
612	data \|= diacriticData;
613	if ((diacriticData & HAS_YPOGEGRAMMENI) != `0`) {
614	++numYpogegrammeni;
615	}
616	nextIndex = nextNextIndex;
617	} else {
618	break; // not a Greek diacritic
619	}
620	}
621	if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
622	nextState \|= AFTER_VOWEL_WITH_ACCENT;
623	}
624	// Map according to Greek rules.
625	UBool addTonos = FALSE;
626	if (upper == `0x397` &&
627	(data & HAS_ACCENT) != `0` &&
628	numYpogegrammeni == `0` &&
629	(state & AFTER_CASED) == `0` &&
630	!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
631	// Keep disjunctive "or" with (only) a tonos.
632	// We use the same "word boundary" conditions as for the Final_Sigma test.
633	if (i == nextIndex) {
634	upper = `0x389`; // Preserve the precomposed form.
635	} else {
636	addTonos = TRUE;
637	}
638	} else if ((data & HAS_DIALYTIKA) != `0`) {
639	// Preserve a vowel with dialytika in precomposed form if it exists.
640	if (upper == `0x399`) {
641	upper = `0x3AA`;
642	data &= ~HAS_EITHER_DIALYTIKA;
643	} else if (upper == `0x3A5`) {
644	upper = `0x3AB`;
645	data &= ~HAS_EITHER_DIALYTIKA;
646	}
647	}
648
649	UBool change;
650	if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == `0`) {
651	change = TRUE; // common, simple usage
652	} else {
653	// Find out first whether we are changing the text.
654	U_ASSERT(`0x370` <= upper && upper <= `0x3ff`); // 2-byte UTF-8, main Greek block
655	change = (i + `2`) > nextIndex \|\|
656	src[i] != getTwoByteLead(upper) \|\| src[i + `1`] != getTwoByteTrail(upper) \|\|
657	numYpogegrammeni > `0`;
658	int32_t i2 = i + `2`;
659	if ((data & HAS_EITHER_DIALYTIKA) != `0`) {
660	change \|= (i2 + `2`) > nextIndex \|\|
661	src[i2] != (uint8_t)u8"\u0308"[`0`] \|\|
662	src[i2 + `1`] != (uint8_t)u8"\u0308"[`1`];
663	i2 += `2`;
664	}
665	if (addTonos) {
666	change \|= (i2 + `2`) > nextIndex \|\|
667	src[i2] != (uint8_t)u8"\u0301"[`0`] \|\|
668	src[i2 + `1`] != (uint8_t)u8"\u0301"[`1`];
669	i2 += `2`;
670	}
671	int32_t oldLength = nextIndex - i;
672	int32_t newLength = (i2 - i) + numYpogegrammeni * `2`; // 2 bytes per U+0399
673	change \|= oldLength != newLength;
674	if (change) {
675	if (edits != NULL) {
676	edits->addReplace(oldLength, newLength);
677	}
678	} else {
679	if (edits != NULL) {
680	edits->addUnchanged(oldLength);
681	}
682	// Write unchanged text?
683	change = (options & U_OMIT_UNCHANGED_TEXT) == `0`;
684	}
685	}
686
687	if (change) {
688	ByteSinkUtil::appendTwoBytes(upper, sink);
689	if ((data & HAS_EITHER_DIALYTIKA) != `0`) {
690	sink.Append(u8"\u0308", `2`); // restore or add a dialytika
691	}
692	if (addTonos) {
693	sink.Append(u8"\u0301", `2`);
694	}
695	while (numYpogegrammeni > `0`) {
696	sink.Append(u8"\u0399", `2`);
697	--numYpogegrammeni;
698	}
699	}
700	} else if(c>=`0`) {
701	const UChar *s;
702	c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
703	if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
704	return;
705	}
706	} else {
707	// Malformed UTF-8.
708	if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
709	sink, options, edits, errorCode)) {
710	return;
711	}
712	}
713	i = nextIndex;
714	state = nextState;
715	}
716	}
717
718	} // namespace GreekUpper
719	U_NAMESPACE_END
720
721	static void U_CALLCONV
722	ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
723	const uint8_t *src, int32_t srcLength,
724	icu::ByteSink &sink, icu::Edits *edits,
725	UErrorCode &errorCode) {
726	UCaseContext csc=UCASECONTEXT_INITIALIZER;
727	csc.p=(void *)src;
728	csc.limit=srcLength;
729	toLower(
730	caseLocale, options,
731	src, &csc, `0`, srcLength,
732	sink, edits, errorCode);
733	}
734
735	static void U_CALLCONV
736	ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
737	const uint8_t *src, int32_t srcLength,
738	icu::ByteSink &sink, icu::Edits *edits,
739	UErrorCode &errorCode) {
740	if (caseLocale == UCASE_LOC_GREEK) {
741	GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
742	} else {
743	UCaseContext csc=UCASECONTEXT_INITIALIZER;
744	csc.p=(void *)src;
745	csc.limit=srcLength;
746	toUpper(
747	caseLocale, options,
748	src, &csc, srcLength,
749	sink, edits, errorCode);
750	}
751	}
752
753	static void U_CALLCONV
754	ucasemap_internalUTF8Fold(int32_t / caseLocale /, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
755	const uint8_t *src, int32_t srcLength,
756	icu::ByteSink &sink, icu::Edits *edits,
757	UErrorCode &errorCode) {
758	toLower(
759	-`1`, options,
760	src, nullptr, `0`, srcLength,
761	sink, edits, errorCode);
762	}
763
764	void
765	ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
766	const char *src, int32_t srcLength,
767	UTF8CaseMapper *stringCaseMapper,
768	icu::ByteSink &sink, icu::Edits *edits,
769	UErrorCode &errorCode) {
770	/ check argument values /
771	if (U_FAILURE(errorCode)) {
772	return;
773	}
774	if ((src == nullptr && srcLength != `0`) \|\| srcLength < -`1`) {
775	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
776	return;
777	}
778
779	// Get the string length.
780	if (srcLength == -`1`) {
781	srcLength = (int32_t)uprv_strlen((const char *)src);
782	}
783
784	if (edits != nullptr && (options & U_EDITS_NO_RESET) == `0`) {
785	edits->reset();
786	}
787	stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
788	(const uint8_t *)src, srcLength, sink, edits, errorCode);
789	sink.Flush();
790	if (U_SUCCESS(errorCode)) {
791	if (edits != nullptr) {
792	edits->copyErrorTo(errorCode);
793	}
794	}
795	}
796
797	int32_t
798	ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
799	char *dest, int32_t destCapacity,
800	const char *src, int32_t srcLength,
801	UTF8CaseMapper *stringCaseMapper,
802	icu::Edits *edits,
803	UErrorCode &errorCode) {
804	/ check argument values /
805	if(U_FAILURE(errorCode)) {
806	return `0`;
807	}
808	if( destCapacity<`0` \|\|
809	(dest==NULL && destCapacity>`0`) \|\|
810	(src==NULL && srcLength!=`0`) \|\| srcLength<-`1`
811	) {
812	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
813	return `0`;
814	}
815
816	/ get the string length /
817	if(srcLength==-`1`) {
818	srcLength=(int32_t)uprv_strlen((const char *)src);
819	}
820
821	/ check for overlapping source and destination /
822	if( dest!=NULL &&
823	((src>=dest && src<(dest+destCapacity)) \|\|
824	(dest>=src && dest<(src+srcLength)))
825	) {
826	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
827	return `0`;
828	}
829
830	CheckedArrayByteSink sink(dest, destCapacity);
831	if (edits != nullptr && (options & U_EDITS_NO_RESET) == `0`) {
832	edits->reset();
833	}
834	stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
835	(const uint8_t *)src, srcLength, sink, edits, errorCode);
836	sink.Flush();
837	if (U_SUCCESS(errorCode)) {
838	if (sink.Overflowed()) {
839	errorCode = U_BUFFER_OVERFLOW_ERROR;
840	} else if (edits != nullptr) {
841	edits->copyErrorTo(errorCode);
842	}
843	}
844	return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
845	}
846
/* public API functions */
848
849	U_CAPI int32_t U_EXPORT2
850	ucasemap_utf8ToLower(const UCaseMap *csm,
851	char *dest, int32_t destCapacity,
852	const char *src, int32_t srcLength,
853	UErrorCode *pErrorCode) {
854	return ucasemap_mapUTF8(
855	csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
856	dest, destCapacity,
857	src, srcLength,
858	ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
859	}
860
861	U_CAPI int32_t U_EXPORT2
862	ucasemap_utf8ToUpper(const UCaseMap *csm,
863	char *dest, int32_t destCapacity,
864	const char *src, int32_t srcLength,
865	UErrorCode *pErrorCode) {
866	return ucasemap_mapUTF8(
867	csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
868	dest, destCapacity,
869	src, srcLength,
870	ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
871	}
872
873	U_CAPI int32_t U_EXPORT2
874	ucasemap_utf8FoldCase(const UCaseMap *csm,
875	char *dest, int32_t destCapacity,
876	const char *src, int32_t srcLength,
877	UErrorCode *pErrorCode) {
878	return ucasemap_mapUTF8(
879	UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
880	dest, destCapacity,
881	src, srcLength,
882	ucasemap_internalUTF8Fold, NULL, *pErrorCode);
883	}
884
885	U_NAMESPACE_BEGIN
886
887	void CaseMap::utf8ToLower(
888	const char *locale, uint32_t options,
889	StringPiece src, ByteSink &sink, Edits *edits,
890	UErrorCode &errorCode) {
891	ucasemap_mapUTF8(
892	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
893	src.data(), src.length(),
894	ucasemap_internalUTF8ToLower, sink, edits, errorCode);
895	}
896
897	void CaseMap::utf8ToUpper(
898	const char *locale, uint32_t options,
899	StringPiece src, ByteSink &sink, Edits *edits,
900	UErrorCode &errorCode) {
901	ucasemap_mapUTF8(
902	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
903	src.data(), src.length(),
904	ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
905	}
906
907	void CaseMap::utf8Fold(
908	uint32_t options,
909	StringPiece src, ByteSink &sink, Edits *edits,
910	UErrorCode &errorCode) {
911	ucasemap_mapUTF8(
912	UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
913	src.data(), src.length(),
914	ucasemap_internalUTF8Fold, sink, edits, errorCode);
915	}
916
917	int32_t CaseMap::utf8ToLower(
918	const char *locale, uint32_t options,
919	const char *src, int32_t srcLength,
920	char dest, int32_t destCapacity, Edits edits,
921	UErrorCode &errorCode) {
922	return ucasemap_mapUTF8(
923	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
924	dest, destCapacity,
925	src, srcLength,
926	ucasemap_internalUTF8ToLower, edits, errorCode);
927	}
928
929	int32_t CaseMap::utf8ToUpper(
930	const char *locale, uint32_t options,
931	const char *src, int32_t srcLength,
932	char dest, int32_t destCapacity, Edits edits,
933	UErrorCode &errorCode) {
934	return ucasemap_mapUTF8(
935	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
936	dest, destCapacity,
937	src, srcLength,
938	ucasemap_internalUTF8ToUpper, edits, errorCode);
939	}
940
941	int32_t CaseMap::utf8Fold(
942	uint32_t options,
943	const char *src, int32_t srcLength,
944	char dest, int32_t destCapacity, Edits edits,
945	UErrorCode &errorCode) {
946	return ucasemap_mapUTF8(
947	UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
948	dest, destCapacity,
949	src, srcLength,
950	ucasemap_internalUTF8Fold, edits, errorCode);
951	}
952
953	U_NAMESPACE_END
954

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucasemap.cpp