ucasemap.cpp source code [Godot/thirdparty/icu4c/common/ucasemap.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2005-2016, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: ucasemap.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2005may06
16	* created by: Markus W. Scherer
17	*
18	* Case mapping service object and functions using it.
19	*/
20
21	#include "unicode/utypes.h"
22	#include "unicode/brkiter.h"
23	#include "unicode/bytestream.h"
24	#include "unicode/casemap.h"
25	#include "unicode/edits.h"
26	#include "unicode/stringoptions.h"
27	#include "unicode/stringpiece.h"
28	#include "unicode/ubrk.h"
29	#include "unicode/uloc.h"
30	#include "unicode/ustring.h"
31	#include "unicode/ucasemap.h"
32	#if !UCONFIG_NO_BREAK_ITERATION
33	#include "unicode/utext.h"
34	#endif
35	#include "unicode/utf.h"
36	#include "unicode/utf8.h"
37	#include "unicode/utf16.h"
38	#include "bytesinkutil.h"
39	#include "cmemory.h"
40	#include "cstring.h"
41	#include "uassert.h"
42	#include "ucase.h"
43	#include "ucasemap_imp.h"
44	#include "ustr_imp.h"
45
46	U_NAMESPACE_USE
47
/* UCaseMap service object -------------------------------------------------- */
49
50	UCaseMap::UCaseMap(const char localeID, uint32_t opts, UErrorCode pErrorCode) :
51	#if !UCONFIG_NO_BREAK_ITERATION
52	iter(nullptr),
53	#endif
54	caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55	ucasemap_setLocale(this, localeID, pErrorCode);
56	}
57
58	UCaseMap::~UCaseMap() {
59	#if !UCONFIG_NO_BREAK_ITERATION
60	delete iter;
61	#endif
62	}
63
64	U_CAPI UCaseMap * U_EXPORT2
65	ucasemap_open(const char locale, uint32_t options, UErrorCode pErrorCode) {
66	if(U_FAILURE(*pErrorCode)) {
67	return nullptr;
68	}
69	UCaseMap csm = new* UCaseMap (locale, options, pErrorCode);
70	if(csm==nullptr) {
71	*pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72	return nullptr;
73	} else if (U_FAILURE(*pErrorCode)) {
74	delete csm;
75	return nullptr;
76	}
77	return csm;
78	}
79
80	U_CAPI void U_EXPORT2
81	ucasemap_close(UCaseMap *csm) {
82	delete csm;
83	}
84
85	U_CAPI const char * U_EXPORT2
86	ucasemap_getLocale(const UCaseMap *csm) {
87	return csm->locale;
88	}
89
90	U_CAPI uint32_t U_EXPORT2
91	ucasemap_getOptions(const UCaseMap *csm) {
92	return csm->options;
93	}
94
95	U_CAPI void U_EXPORT2
96	ucasemap_setLocale(UCaseMap csm, const* char locale, UErrorCode pErrorCode) {
97	if(U_FAILURE(*pErrorCode)) {
98	return;
99	}
100	if (locale != nullptr && *locale == `0`) {
101	csm->locale[`0`] = `0`;
102	csm->caseLocale = UCASE_LOC_ROOT;
103	return;
104	}
105
106	int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107	if(pErrorCode==U_BUFFER_OVERFLOW_ERROR \|\| length==sizeof*(csm->locale)) {
108	*pErrorCode=U_ZERO_ERROR;
109	/ we only really need the language code for case mappings /
110	length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111	}
112	if(length==sizeof(csm->locale)) {
113	*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114	}
115	if(U_SUCCESS(*pErrorCode)) {
116	csm->caseLocale = ucase_getCaseLocale(csm->locale);
117	} else {
118	csm->locale[`0`]=`0`;
119	csm->caseLocale = UCASE_LOC_ROOT;
120	}
121	}
122
123	U_CAPI void U_EXPORT2
124	ucasemap_setOptions(UCaseMap csm, uint32_t options, UErrorCode pErrorCode) {
125	if(U_FAILURE(*pErrorCode)) {
126	return;
127	}
128	csm->options=options;
129	}
130
/* UTF-8 string case mappings ----------------------------------------------- */
132
/* TODO(markus): Move to a new, separate utf8case.cpp file. */
134
135	namespace {
136
137	/ append a full case mapping result, see UCASE_MAX_STRING_LENGTH /
138	inline UBool
139	appendResult(int32_t cpLength, int32_t result, const char16_t *s,
140	ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
141	U_ASSERT(U_SUCCESS(errorCode));
142
143	/ decode the result /
144	if(result<`0`) {
145	/ (not) original code point /
146	if(edits!=nullptr) {
147	edits->addUnchanged(cpLength);
148	}
149	if((options & U_OMIT_UNCHANGED_TEXT) == `0`) {
150	ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
151	}
152	} else {
153	if(result<=UCASE_MAX_STRING_LENGTH) {
154	// string: "result" is the UTF-16 length
155	return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
156	} else {
157	ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
158	}
159	}
160	return true;
161	}
162
163	// See unicode/utf8.h U8_APPEND_UNSAFE().
164	inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> `6`) \| `0xc0`); }
165	inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & `0x3f`) \| `0x80`); }
166
167	UChar32 U_CALLCONV
168	utf8_caseContextIterator(void *context, int8_t dir) {
169	UCaseContext csc=(UCaseContext )context;
170	UChar32 c;
171
172	if(dir<`0`) {
173	/ reset for backward iteration /
174	csc->index=csc->cpStart;
175	csc->dir=dir;
176	} else if(dir>`0`) {
177	/ reset for forward iteration /
178	csc->index=csc->cpLimit;
179	csc->dir=dir;
180	} else {
181	/ continue current iteration direction /
182	dir=csc->dir;
183	}
184
185	if(dir<`0`) {
186	if(csc->start<csc->index) {
187	U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
188	return c;
189	}
190	} else {
191	if(csc->index<csc->limit) {
192	U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
193	return c;
194	}
195	}
196	return U_SENTINEL;
197	}
198
199	/**
200	* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
201	* caseLocale < 0: Case-folds [srcStart..srcLimit[.
202	*/
203	void toLower(int32_t caseLocale, uint32_t options,
204	const uint8_t src, UCaseContext csc, int32_t srcStart, int32_t srcLimit,
205	icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
206	const int8_t *latinToLower;
207	if (caseLocale == UCASE_LOC_ROOT \|\|
208	(caseLocale >= `0` ?
209	!(caseLocale == UCASE_LOC_TURKISH \|\| caseLocale == UCASE_LOC_LITHUANIAN) :
210	(options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
211	latinToLower = LatinCase::TO_LOWER_NORMAL;
212	} else {
213	latinToLower = LatinCase::TO_LOWER_TR_LT;
214	}
215	const UTrie2 *trie = ucase_getTrie();
216	int32_t prev = srcStart;
217	int32_t srcIndex = srcStart;
218	for (;;) {
219	// fast path for simple cases
220	int32_t cpStart;
221	UChar32 c;
222	for (;;) {
223	if (U_FAILURE(errorCode) \|\| srcIndex >= srcLimit) {
224	c = U_SENTINEL;
225	break;
226	}
227	uint8_t lead = src[srcIndex++];
228	if (lead <= `0x7f`) {
229	int8_t d = latinToLower[lead];
230	if (d == LatinCase::EXC) {
231	cpStart = srcIndex - `1`;
232	c = lead;
233	break;
234	}
235	if (d == `0`) { continue; }
236	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `1` - prev,
237	sink, options, edits, errorCode);
238	char ascii = (char)(lead + d);
239	sink.Append(&ascii, `1`);
240	if (edits != nullptr) {
241	edits->addReplace(`1`, `1`);
242	}
243	prev = srcIndex;
244	continue;
245	} else if (lead < `0xe3`) {
246	uint8_t t;
247	if (`0xc2` <= lead && lead <= `0xc5` && srcIndex < srcLimit &&
248	(t = src[srcIndex] - `0x80`) <= `0x3f`) {
249	// U+0080..U+017F
250	++srcIndex;
251	c = ((lead - `0xc0`) << `6`) \| t;
252	int8_t d = latinToLower[c];
253	if (d == LatinCase::EXC) {
254	cpStart = srcIndex - `2`;
255	break;
256	}
257	if (d == `0`) { continue; }
258	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `2` - prev,
259	sink, options, edits, errorCode);
260	ByteSinkUtil::appendTwoBytes(c + d, sink);
261	if (edits != nullptr) {
262	edits->addReplace(`2`, `2`);
263	}
264	prev = srcIndex;
265	continue;
266	}
267	} else if ((lead <= `0xe9` \|\| lead == `0xeb` \|\| lead == `0xec`) &&
268	(srcIndex + `2`) <= srcLimit &&
269	U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + `1`])) {
270	// most of CJK: no case mappings
271	srcIndex += `2`;
272	continue;
273	}
274	cpStart = --srcIndex;
275	U8_NEXT(src, srcIndex, srcLimit, c);
276	if (c < `0`) {
277	// ill-formed UTF-8
278	continue;
279	}
280	uint16_t props = UTRIE2_GET16(trie, c);
281	if (UCASE_HAS_EXCEPTION(props)) { break; }
282	int32_t delta;
283	if (!UCASE_IS_UPPER_OR_TITLE(props) \|\| (delta = UCASE_GET_DELTA(props)) == `0`) {
284	continue;
285	}
286	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
287	sink, options, edits, errorCode);
288	ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
289	prev = srcIndex;
290	}
291	if (c < `0`) {
292	break;
293	}
294	// slow path
295	const char16_t *s;
296	if (caseLocale >= `0`) {
297	csc->cpStart = cpStart;
298	csc->cpLimit = srcIndex;
299	c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
300	} else {
301	c = ucase_toFullFolding(c, &s, options);
302	}
303	if (c >= `0`) {
304	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
305	sink, options, edits, errorCode);
306	appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
307	prev = srcIndex;
308	}
309	}
310	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
311	sink, options, edits, errorCode);
312	}
313
314	void toUpper(int32_t caseLocale, uint32_t options,
315	const uint8_t src, UCaseContext csc, int32_t srcLength,
316	icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
317	const int8_t *latinToUpper;
318	if (caseLocale == UCASE_LOC_TURKISH) {
319	latinToUpper = LatinCase::TO_UPPER_TR;
320	} else {
321	latinToUpper = LatinCase::TO_UPPER_NORMAL;
322	}
323	const UTrie2 *trie = ucase_getTrie();
324	int32_t prev = `0`;
325	int32_t srcIndex = `0`;
326	for (;;) {
327	// fast path for simple cases
328	int32_t cpStart;
329	UChar32 c;
330	for (;;) {
331	if (U_FAILURE(errorCode) \|\| srcIndex >= srcLength) {
332	c = U_SENTINEL;
333	break;
334	}
335	uint8_t lead = src[srcIndex++];
336	if (lead <= `0x7f`) {
337	int8_t d = latinToUpper[lead];
338	if (d == LatinCase::EXC) {
339	cpStart = srcIndex - `1`;
340	c = lead;
341	break;
342	}
343	if (d == `0`) { continue; }
344	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `1` - prev,
345	sink, options, edits, errorCode);
346	char ascii = (char)(lead + d);
347	sink.Append(&ascii, `1`);
348	if (edits != nullptr) {
349	edits->addReplace(`1`, `1`);
350	}
351	prev = srcIndex;
352	continue;
353	} else if (lead < `0xe3`) {
354	uint8_t t;
355	if (`0xc2` <= lead && lead <= `0xc5` && srcIndex < srcLength &&
356	(t = src[srcIndex] - `0x80`) <= `0x3f`) {
357	// U+0080..U+017F
358	++srcIndex;
359	c = ((lead - `0xc0`) << `6`) \| t;
360	int8_t d = latinToUpper[c];
361	if (d == LatinCase::EXC) {
362	cpStart = srcIndex - `2`;
363	break;
364	}
365	if (d == `0`) { continue; }
366	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - `2` - prev,
367	sink, options, edits, errorCode);
368	ByteSinkUtil::appendTwoBytes(c + d, sink);
369	if (edits != nullptr) {
370	edits->addReplace(`2`, `2`);
371	}
372	prev = srcIndex;
373	continue;
374	}
375	} else if ((lead <= `0xe9` \|\| lead == `0xeb` \|\| lead == `0xec`) &&
376	(srcIndex + `2`) <= srcLength &&
377	U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + `1`])) {
378	// most of CJK: no case mappings
379	srcIndex += `2`;
380	continue;
381	}
382	cpStart = --srcIndex;
383	U8_NEXT(src, srcIndex, srcLength, c);
384	if (c < `0`) {
385	// ill-formed UTF-8
386	continue;
387	}
388	uint16_t props = UTRIE2_GET16(trie, c);
389	if (UCASE_HAS_EXCEPTION(props)) { break; }
390	int32_t delta;
391	if (UCASE_GET_TYPE(props) != UCASE_LOWER \|\| (delta = UCASE_GET_DELTA(props)) == `0`) {
392	continue;
393	}
394	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
395	sink, options, edits, errorCode);
396	ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
397	prev = srcIndex;
398	}
399	if (c < `0`) {
400	break;
401	}
402	// slow path
403	csc->cpStart = cpStart;
404	csc->cpLimit = srcIndex;
405	const char16_t *s;
406	c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
407	if (c >= `0`) {
408	ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
409	sink, options, edits, errorCode);
410	appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
411	prev = srcIndex;
412	}
413	}
414	ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
415	sink, options, edits, errorCode);
416	}
417
418	} // namespace
419
420	#if !UCONFIG_NO_BREAK_ITERATION
421
422	namespace {
423
// The two UTF-8 bytes of U+0301 (COMBINING ACUTE ACCENT), used when matching
// an accented Dutch IJ byte by byte.
constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];

constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
427
428	/**
429	* Input: c is a letter I with or without acute accent.
430	* start is the index in src after c, and is less than segmentLimit.
431	* If a plain i/I is followed by a plain j/J,
432	* or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
433	* then we output accordingly.
434	*
435	* @return the src index after the titlecased sequence, or the start index if no Dutch IJ
436	*/
437	int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
438	ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
439	U_ASSERT(start < segmentLimit);
440
441	int32_t index = start;
442	bool withAcute = false;
443
444	// If the conditions are met, then the following variables tell us what to output.
445	int32_t unchanged1 = `0`; // code units before the j, or the whole sequence (0..3)
446	bool doTitleJ = false; // true if the j needs to be titlecased
447	int32_t unchanged2 = `0`; // after the j (0 or 1)
448
449	// next character after the first letter
450	UChar32 c2;
451	c2 = src[index++];
452
453	// Is the first letter an i/I with accent?
454	if (c == u`'I'`) {
455	if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
456	withAcute = true;
457	unchanged1 = `2`; // ACUTE is 2 code units in UTF-8
458	if (index == segmentLimit) { return start; }
459	c2 = src[index++];
460	}
461	} else { // Í
462	withAcute = true;
463	}
464
465	// Is the next character a j/J?
466	if (c2 == u`'j'`) {
467	doTitleJ = true;
468	} else if (c2 == u`'J'`) {
469	++unchanged1;
470	} else {
471	return start;
472	}
473
474	// A plain i/I must be followed by a plain j/J.
475	// An i/I with acute must be followed by a j/J with acute.
476	if (withAcute) {
477	if ((index + `1`) >= segmentLimit \|\| src[index++] != ACUTE_BYTE0 \|\| src[index++] != ACUTE_BYTE1) {
478	return start;
479	}
480	if (doTitleJ) {
481	unchanged2 = `2`; // ACUTE is 2 code units in UTF-8
482	} else {
483	unchanged1 = unchanged1 + `2`; // ACUTE is 2 code units in UTF-8
484	}
485	}
486
487	// There must not be another combining mark.
488	if (index < segmentLimit) {
489	int32_t cp;
490	int32_t i = index;
491	U8_NEXT(src, i, segmentLimit, cp);
492	uint32_t typeMask = U_GET_GC_MASK(cp);
493	if ((typeMask & U_GC_M_MASK) != `0`) {
494	return start;
495	}
496	}
497
498	// Output the rest of the Dutch IJ.
499	ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
500	start += unchanged1;
501	if (doTitleJ) {
502	ByteSinkUtil::appendCodePoint(`1`, u`'J'`, sink, edits);
503	++start;
504	}
505	ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
506
507	U_ASSERT(start + unchanged2 == index);
508	return index;
509	}
510
511	} // namespace
512
513	U_CFUNC void U_CALLCONV
514	ucasemap_internalUTF8ToTitle(
515	int32_t caseLocale, uint32_t options, BreakIterator *iter,
516	const uint8_t *src, int32_t srcLength,
517	ByteSink &sink, icu::Edits *edits,
518	UErrorCode &errorCode) {
519	if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
520	return;
521	}
522
523	/ set up local variables /
524	UCaseContext csc=UCASECONTEXT_INITIALIZER;
525	csc.p=(void *)src;
526	csc.limit=srcLength;
527	int32_t prev=`0`;
528	UBool isFirstIndex=true;
529
530	/ titlecasing loop /
531	while(prev<srcLength) {
532	/ find next index where to titlecase /
533	int32_t index;
534	if(isFirstIndex) {
535	isFirstIndex=false;
536	index=iter->first();
537	} else {
538	index=iter->next();
539	}
540	if(index==UBRK_DONE \|\| index>srcLength) {
541	index=srcLength;
542	}
543
544	/*
545	* Segment [prev..index[ into 3 parts:
546	* a) skipped characters (copy as-is) [prev..titleStart[
547	* b) first letter (titlecase) [titleStart..titleLimit[
548	* c) subsequent characters (lowercase) [titleLimit..index[
549	*/
550	if(prev<index) {
551	/ find and copy skipped characters [prev..titleStart[ /
552	int32_t titleStart=prev;
553	int32_t titleLimit=prev;
554	UChar32 c;
555	U8_NEXT(src, titleLimit, index, c);
556	if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==`0`) {
557	// Adjust the titlecasing index to the next cased character,
558	// or to the next letter/number/symbol/private use.
559	// Stop with titleStart<titleLimit<=index
560	// if there is a character to be titlecased,
561	// or else stop with titleStart==titleLimit==index.
562	UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != `0`;
563	while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
564	titleStart=titleLimit;
565	if(titleLimit==index) {
566	break;
567	}
568	U8_NEXT(src, titleLimit, index, c);
569	}
570	if (prev < titleStart) {
571	if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
572	sink, options, edits, errorCode)) {
573	return;
574	}
575	}
576	}
577
578	if(titleStart<titleLimit) {
579	/ titlecase c which is from [titleStart..titleLimit[ /
580	if(c>=`0`) {
581	csc.cpStart=titleStart;
582	csc.cpLimit=titleLimit;
583	const char16_t *s;
584	c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
585	if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
586	return;
587	}
588	} else {
589	// Malformed UTF-8.
590	if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
591	sink, options, edits, errorCode)) {
592	return;
593	}
594	}
595
596	/ Special case Dutch IJ titlecasing /
597	if (titleLimit < index &&
598	caseLocale == UCASE_LOC_DUTCH) {
599	if (c < `0`) {
600	c = ~c;
601	}
602
603	if (c == u`'I'` \|\| c == u`'Í'`) {
604	titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
605	}
606	}
607
608	/ lowercase [titleLimit..index[ /
609	if(titleLimit<index) {
610	if((options&U_TITLECASE_NO_LOWERCASE)==`0`) {
611	/ Normal operation: Lowercase the rest of the word. /
612	toLower(caseLocale, options,
613	src, &csc, titleLimit, index,
614	sink, edits, errorCode);
615	if(U_FAILURE(errorCode)) {
616	return;
617	}
618	} else {
619	/ Optionally just copy the rest of the word unchanged. /
620	if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
621	sink, options, edits, errorCode)) {
622	return;
623	}
624	}
625	}
626	}
627	}
628
629	prev=index;
630	}
631	}
632
633	#endif
634
635	U_NAMESPACE_BEGIN
636	namespace GreekUpper {
637
638	UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
639	while (i < length) {
640	UChar32 c;
641	U8_NEXT(s, i, length, c);
642	int32_t type = ucase_getTypeOrIgnorable(c);
643	if ((type & UCASE_IGNORABLE) != `0`) {
644	// Case-ignorable, continue with the loop.
645	} else if (type != UCASE_NONE) {
646	return true; // Followed by cased letter.
647	} else {
648	return false; // Uncased and not case-ignorable.
649	}
650	}
651	return false; // Not followed by cased letter.
652	}
653
654	// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
655	void toUpper(uint32_t options,
656	const uint8_t *src, int32_t srcLength,
657	ByteSink &sink, Edits *edits,
658	UErrorCode &errorCode) {
659	uint32_t state = `0`;
660	for (int32_t i = `0`; i < srcLength;) {
661	int32_t nextIndex = i;
662	UChar32 c;
663	U8_NEXT(src, nextIndex, srcLength, c);
664	uint32_t nextState = `0`;
665	int32_t type = ucase_getTypeOrIgnorable(c);
666	if ((type & UCASE_IGNORABLE) != `0`) {
667	// c is case-ignorable
668	nextState \|= (state & AFTER_CASED);
669	} else if (type != UCASE_NONE) {
670	// c is cased
671	nextState \|= AFTER_CASED;
672	}
673	uint32_t data = getLetterData(c);
674	if (data > `0`) {
675	uint32_t upper = data & UPPER_MASK;
676	// Add a dialytika to this iota or ypsilon vowel
677	// if we removed a tonos from the previous vowel,
678	// and that previous vowel did not also have (or gain) a dialytika.
679	// Adding one only to the final vowel in a longer sequence
680	// (which does not occur in normal writing) would require lookahead.
681	// Set the same flag as for preserving an existing dialytika.
682	if ((data & HAS_VOWEL) != `0` && (state & AFTER_VOWEL_WITH_ACCENT) != `0` &&
683	(upper == `0x399` \|\| upper == `0x3A5`)) {
684	data \|= HAS_DIALYTIKA;
685	}
686	int32_t numYpogegrammeni = `0`; // Map each one to a trailing, spacing, capital iota.
687	if ((data & HAS_YPOGEGRAMMENI) != `0`) {
688	numYpogegrammeni = `1`;
689	}
690	// Skip combining diacritics after this Greek letter.
691	int32_t nextNextIndex = nextIndex;
692	while (nextIndex < srcLength) {
693	UChar32 c2;
694	U8_NEXT(src, nextNextIndex, srcLength, c2);
695	uint32_t diacriticData = getDiacriticData(c2);
696	if (diacriticData != `0`) {
697	data \|= diacriticData;
698	if ((diacriticData & HAS_YPOGEGRAMMENI) != `0`) {
699	++numYpogegrammeni;
700	}
701	nextIndex = nextNextIndex;
702	} else {
703	break; // not a Greek diacritic
704	}
705	}
706	if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
707	nextState \|= AFTER_VOWEL_WITH_ACCENT;
708	}
709	// Map according to Greek rules.
710	UBool addTonos = false;
711	if (upper == `0x397` &&
712	(data & HAS_ACCENT) != `0` &&
713	numYpogegrammeni == `0` &&
714	(state & AFTER_CASED) == `0` &&
715	!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
716	// Keep disjunctive "or" with (only) a tonos.
717	// We use the same "word boundary" conditions as for the Final_Sigma test.
718	if (i == nextIndex) {
719	upper = `0x389`; // Preserve the precomposed form.
720	} else {
721	addTonos = true;
722	}
723	} else if ((data & HAS_DIALYTIKA) != `0`) {
724	// Preserve a vowel with dialytika in precomposed form if it exists.
725	if (upper == `0x399`) {
726	upper = `0x3AA`;
727	data &= ~HAS_EITHER_DIALYTIKA;
728	} else if (upper == `0x3A5`) {
729	upper = `0x3AB`;
730	data &= ~HAS_EITHER_DIALYTIKA;
731	}
732	}
733
734	UBool change;
735	if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == `0`) {
736	change = true; // common, simple usage
737	} else {
738	// Find out first whether we are changing the text.
739	U_ASSERT(`0x370` <= upper && upper <= `0x3ff`); // 2-byte UTF-8, main Greek block
740	change = (i + `2`) > nextIndex \|\|
741	src[i] != getTwoByteLead(upper) \|\| src[i + `1`] != getTwoByteTrail(upper) \|\|
742	numYpogegrammeni > `0`;
743	int32_t i2 = i + `2`;
744	if ((data & HAS_EITHER_DIALYTIKA) != `0`) {
745	change \|= (i2 + `2`) > nextIndex \|\|
746	src[i2] != (uint8_t)u8"\u0308"[`0`] \|\|
747	src[i2 + `1`] != (uint8_t)u8"\u0308"[`1`];
748	i2 += `2`;
749	}
750	if (addTonos) {
751	change \|= (i2 + `2`) > nextIndex \|\|
752	src[i2] != (uint8_t)u8"\u0301"[`0`] \|\|
753	src[i2 + `1`] != (uint8_t)u8"\u0301"[`1`];
754	i2 += `2`;
755	}
756	int32_t oldLength = nextIndex - i;
757	int32_t newLength = (i2 - i) + numYpogegrammeni * `2`; // 2 bytes per U+0399
758	change \|= oldLength != newLength;
759	if (change) {
760	if (edits != nullptr) {
761	edits->addReplace(oldLength, newLength);
762	}
763	} else {
764	if (edits != nullptr) {
765	edits->addUnchanged(oldLength);
766	}
767	// Write unchanged text?
768	change = (options & U_OMIT_UNCHANGED_TEXT) == `0`;
769	}
770	}
771
772	if (change) {
773	ByteSinkUtil::appendTwoBytes(upper, sink);
774	if ((data & HAS_EITHER_DIALYTIKA) != `0`) {
775	sink.AppendU8(u8"\u0308", `2`); // restore or add a dialytika
776	}
777	if (addTonos) {
778	sink.AppendU8(u8"\u0301", `2`);
779	}
780	while (numYpogegrammeni > `0`) {
781	sink.AppendU8(u8"\u0399", `2`);
782	--numYpogegrammeni;
783	}
784	}
785	} else if(c>=`0`) {
786	const char16_t *s;
787	c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
788	if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
789	return;
790	}
791	} else {
792	// Malformed UTF-8.
793	if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
794	sink, options, edits, errorCode)) {
795	return;
796	}
797	}
798	i = nextIndex;
799	state = nextState;
800	}
801	}
802
803	} // namespace GreekUpper
804	U_NAMESPACE_END
805
806	static void U_CALLCONV
807	ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
808	const uint8_t *src, int32_t srcLength,
809	icu::ByteSink &sink, icu::Edits *edits,
810	UErrorCode &errorCode) {
811	UCaseContext csc=UCASECONTEXT_INITIALIZER;
812	csc.p=(void *)src;
813	csc.limit=srcLength;
814	toLower(
815	caseLocale, options,
816	src, &csc, `0`, srcLength,
817	sink, edits, errorCode);
818	}
819
820	static void U_CALLCONV
821	ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
822	const uint8_t *src, int32_t srcLength,
823	icu::ByteSink &sink, icu::Edits *edits,
824	UErrorCode &errorCode) {
825	if (caseLocale == UCASE_LOC_GREEK) {
826	GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
827	} else {
828	UCaseContext csc=UCASECONTEXT_INITIALIZER;
829	csc.p=(void *)src;
830	csc.limit=srcLength;
831	toUpper(
832	caseLocale, options,
833	src, &csc, srcLength,
834	sink, edits, errorCode);
835	}
836	}
837
838	static void U_CALLCONV
839	ucasemap_internalUTF8Fold(int32_t / caseLocale /, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
840	const uint8_t *src, int32_t srcLength,
841	icu::ByteSink &sink, icu::Edits *edits,
842	UErrorCode &errorCode) {
843	toLower(
844	-`1`, options,
845	src, nullptr, `0`, srcLength,
846	sink, edits, errorCode);
847	}
848
849	void
850	ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
851	const char *src, int32_t srcLength,
852	UTF8CaseMapper *stringCaseMapper,
853	icu::ByteSink &sink, icu::Edits *edits,
854	UErrorCode &errorCode) {
855	/ check argument values /
856	if (U_FAILURE(errorCode)) {
857	return;
858	}
859	if ((src == nullptr && srcLength != `0`) \|\| srcLength < -`1`) {
860	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
861	return;
862	}
863
864	// Get the string length.
865	if (srcLength == -`1`) {
866	srcLength = (int32_t)uprv_strlen((const char *)src);
867	}
868
869	if (edits != nullptr && (options & U_EDITS_NO_RESET) == `0`) {
870	edits->reset();
871	}
872	stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
873	(const uint8_t *)src, srcLength, sink, edits, errorCode);
874	sink.Flush();
875	if (U_SUCCESS(errorCode)) {
876	if (edits != nullptr) {
877	edits->copyErrorTo(errorCode);
878	}
879	}
880	}
881
882	int32_t
883	ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
884	char *dest, int32_t destCapacity,
885	const char *src, int32_t srcLength,
886	UTF8CaseMapper *stringCaseMapper,
887	icu::Edits *edits,
888	UErrorCode &errorCode) {
889	/ check argument values /
890	if(U_FAILURE(errorCode)) {
891	return `0`;
892	}
893	if( destCapacity<`0` \|\|
894	(dest==nullptr && destCapacity>`0`) \|\|
895	(src==nullptr && srcLength!=`0`) \|\| srcLength<-`1`
896	) {
897	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
898	return `0`;
899	}
900
901	/ get the string length /
902	if(srcLength==-`1`) {
903	srcLength=(int32_t)uprv_strlen((const char *)src);
904	}
905
906	/ check for overlapping source and destination /
907	if( dest!=nullptr &&
908	((src>=dest && src<(dest+destCapacity)) \|\|
909	(dest>=src && dest<(src+srcLength)))
910	) {
911	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
912	return `0`;
913	}
914
915	CheckedArrayByteSink sink(dest, destCapacity);
916	if (edits != nullptr && (options & U_EDITS_NO_RESET) == `0`) {
917	edits->reset();
918	}
919	stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
920	(const uint8_t *)src, srcLength, sink, edits, errorCode);
921	sink.Flush();
922	if (U_SUCCESS(errorCode)) {
923	if (sink.Overflowed()) {
924	errorCode = U_BUFFER_OVERFLOW_ERROR;
925	} else if (edits != nullptr) {
926	edits->copyErrorTo(errorCode);
927	}
928	}
929	return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
930	}
931
/* public API functions */
933
934	U_CAPI int32_t U_EXPORT2
935	ucasemap_utf8ToLower(const UCaseMap *csm,
936	char *dest, int32_t destCapacity,
937	const char *src, int32_t srcLength,
938	UErrorCode *pErrorCode) {
939	return ucasemap_mapUTF8(
940	csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
941	dest, destCapacity,
942	src, srcLength,
943	ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
944	}
945
946	U_CAPI int32_t U_EXPORT2
947	ucasemap_utf8ToUpper(const UCaseMap *csm,
948	char *dest, int32_t destCapacity,
949	const char *src, int32_t srcLength,
950	UErrorCode *pErrorCode) {
951	return ucasemap_mapUTF8(
952	csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
953	dest, destCapacity,
954	src, srcLength,
955	ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
956	}
957
958	U_CAPI int32_t U_EXPORT2
959	ucasemap_utf8FoldCase(const UCaseMap *csm,
960	char *dest, int32_t destCapacity,
961	const char *src, int32_t srcLength,
962	UErrorCode *pErrorCode) {
963	return ucasemap_mapUTF8(
964	UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
965	dest, destCapacity,
966	src, srcLength,
967	ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
968	}
969
970	U_NAMESPACE_BEGIN
971
972	void CaseMap::utf8ToLower(
973	const char *locale, uint32_t options,
974	StringPiece src, ByteSink &sink, Edits *edits,
975	UErrorCode &errorCode) {
976	ucasemap_mapUTF8(
977	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
978	src.data(), src.length(),
979	ucasemap_internalUTF8ToLower, sink, edits, errorCode);
980	}
981
982	void CaseMap::utf8ToUpper(
983	const char *locale, uint32_t options,
984	StringPiece src, ByteSink &sink, Edits *edits,
985	UErrorCode &errorCode) {
986	ucasemap_mapUTF8(
987	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
988	src.data(), src.length(),
989	ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
990	}
991
992	void CaseMap::utf8Fold(
993	uint32_t options,
994	StringPiece src, ByteSink &sink, Edits *edits,
995	UErrorCode &errorCode) {
996	ucasemap_mapUTF8(
997	UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
998	src.data(), src.length(),
999	ucasemap_internalUTF8Fold, sink, edits, errorCode);
1000	}
1001
1002	int32_t CaseMap::utf8ToLower(
1003	const char *locale, uint32_t options,
1004	const char *src, int32_t srcLength,
1005	char dest, int32_t destCapacity, Edits edits,
1006	UErrorCode &errorCode) {
1007	return ucasemap_mapUTF8(
1008	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1009	dest, destCapacity,
1010	src, srcLength,
1011	ucasemap_internalUTF8ToLower, edits, errorCode);
1012	}
1013
1014	int32_t CaseMap::utf8ToUpper(
1015	const char *locale, uint32_t options,
1016	const char *src, int32_t srcLength,
1017	char dest, int32_t destCapacity, Edits edits,
1018	UErrorCode &errorCode) {
1019	return ucasemap_mapUTF8(
1020	ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1021	dest, destCapacity,
1022	src, srcLength,
1023	ucasemap_internalUTF8ToUpper, edits, errorCode);
1024	}
1025
1026	int32_t CaseMap::utf8Fold(
1027	uint32_t options,
1028	const char *src, int32_t srcLength,
1029	char dest, int32_t destCapacity, Edits edits,
1030	UErrorCode &errorCode) {
1031	return ucasemap_mapUTF8(
1032	UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1033	dest, destCapacity,
1034	src, srcLength,
1035	ucasemap_internalUTF8Fold, edits, errorCode);
1036	}
1037
1038	U_NAMESPACE_END
1039

Browse the source code of Godot/thirdparty/icu4c/common/ucasemap.cpp