normalizer2impl.cpp source code [Godot/thirdparty/icu4c/common/normalizer2impl.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2009-2014, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: normalizer2impl.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2009nov22
16	* created by: Markus W. Scherer
17	*/
18
19	// #define UCPTRIE_DEBUG
20
21	#include "unicode/utypes.h"
22
23	#if !UCONFIG_NO_NORMALIZATION
24
25	#include "unicode/bytestream.h"
26	#include "unicode/edits.h"
27	#include "unicode/normalizer2.h"
28	#include "unicode/stringoptions.h"
29	#include "unicode/ucptrie.h"
30	#include "unicode/udata.h"
31	#include "unicode/umutablecptrie.h"
32	#include "unicode/ustring.h"
33	#include "unicode/utf16.h"
34	#include "unicode/utf8.h"
35	#include "bytesinkutil.h"
36	#include "cmemory.h"
37	#include "mutex.h"
38	#include "normalizer2impl.h"
39	#include "putilimp.h"
40	#include "uassert.h"
41	#include "ucptrie_impl.h"
42	#include "uset_imp.h"
43	#include "uvector.h"
44
45	U_NAMESPACE_BEGIN
46
47	namespace {
48
49	/**
50	* UTF-8 lead byte for minNoMaybeCP.
51	* Can be lower than the actual lead byte for c.
52	* Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
53	*/
54	inline uint8_t leadByteForCP(UChar32 c) {
55	if (c <= `0x7f`) {
56	return (uint8_t)c;
57	} else if (c <= `0x7ff`) {
58	return (uint8_t)(`0xc0`+(c>>`6`));
59	} else {
60	// Should not occur because ccc(U+0300)!=0.
61	return `0xe0`;
62	}
63	}
64
65	/**
66	* Returns the code point from one single well-formed UTF-8 byte sequence
67	* between cpStart and cpLimit.
68	*
69	* Trie UTF-8 macros do not assemble whole code points (for efficiency).
70	* When we do need the code point, we call this function.
71	* We should not need it for normalization-inert data (norm16==0).
72	* Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
73	*/
74	UChar32 codePointFromValidUTF8(const uint8_t cpStart, const* uint8_t *cpLimit) {
75	// Similar to U8_NEXT_UNSAFE(s, i, c).
76	U_ASSERT(cpStart < cpLimit);
77	uint8_t c = *cpStart;
78	switch(cpLimit-cpStart) {
79	case `1`:
80	return c;
81	case `2`:
82	return ((c&`0x1f`)<<`6`) \| (cpStart[`1`]&`0x3f`);
83	case `3`:
84	// no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (char16_t)
85	return (char16_t)((c<<`12`) \| ((cpStart[`1`]&`0x3f`)<<`6`) \| (cpStart[`2`]&`0x3f`));
86	case `4`:
87	return ((c&`7`)<<`18`) \| ((cpStart[`1`]&`0x3f`)<<`12`) \| ((cpStart[`2`]&`0x3f`)<<`6`) \| (cpStart[`3`]&`0x3f`);
88	default:
89	UPRV_UNREACHABLE_EXIT; // Should not occur.
90	}
91	}
92
93	/**
94	* Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
95	* Otherwise returns a negative value.
96	*/
97	UChar32 previousHangulOrJamo(const uint8_t start, const* uint8_t *p) {
98	if ((p - start) >= `3`) {
99	p -= `3`;
100	uint8_t l = *p;
101	uint8_t t1, t2;
102	if (`0xe1` <= l && l <= `0xed` &&
103	(t1 = (uint8_t)(p[`1`] - `0x80`)) <= `0x3f` &&
104	(t2 = (uint8_t)(p[`2`] - `0x80`)) <= `0x3f` &&
105	(l < `0xed` \|\| t1 <= `0x1f`)) {
106	return ((l & `0xf`) << `12`) \| (t1 << `6`) \| t2;
107	}
108	}
109	return U_SENTINEL;
110	}
111
112	/**
113	* Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
114	* Otherwise returns a negative value.
115	*/
116	int32_t getJamoTMinusBase(const uint8_t src, const* uint8_t *limit) {
117	// Jamo T: E1 86 A8..E1 87 82
118	if ((limit - src) >= `3` && *src == `0xe1`) {
119	if (src[`1`] == `0x86`) {
120	uint8_t t = src[`2`];
121	// The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
122	// Offset 0 does not correspond to any conjoining Jamo.
123	if (`0xa8` <= t && t <= `0xbf`) {
124	return t - `0xa7`;
125	}
126	} else if (src[`1`] == `0x87`) {
127	uint8_t t = src[`2`];
128	if ((int8_t)t <= (int8_t)`0x82u`) {
129	return t - (`0xa7` - `0x40`);
130	}
131	}
132	}
133	return -`1`;
134	}
135
136	void
137	appendCodePointDelta(const uint8_t cpStart, const* uint8_t *cpLimit, int32_t delta,
138	ByteSink &sink, Edits *edits) {
139	char buffer[U8_MAX_LENGTH];
140	int32_t length;
141	int32_t cpLength = (int32_t)(cpLimit - cpStart);
142	if (cpLength == `1`) {
143	// The builder makes ASCII map to ASCII.
144	buffer[`0`] = (uint8_t)(*cpStart + delta);
145	length = `1`;
146	} else {
147	int32_t trail = *(cpLimit-`1`) + delta;
148	if (`0x80` <= trail && trail <= `0xbf`) {
149	// The delta only changes the last trail byte.
150	--cpLimit;
151	length = `0`;
152	do { buffer[length++] = cpStart++; } while* (cpStart < cpLimit);
153	buffer[length++] = (uint8_t)trail;
154	} else {
155	// Decode the code point, add the delta, re-encode.
156	UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
157	length = `0`;
158	U8_APPEND_UNSAFE(buffer, length, c);
159	}
160	}
161	if (edits != nullptr) {
162	edits->addReplace(cpLength, length);
163	}
164	sink.Append(buffer, length);
165	}
166
167	} // namespace
168
169	// ReorderingBuffer -------------------------------------------------------- ***
170
171	ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
172	UErrorCode &errorCode) :
173	impl(ni), str(dest),
174	start(str.getBuffer(`8`)), reorderStart(start), limit(start),
175	remainingCapacity(str.getCapacity()), lastCC(`0`) {
176	if (start == nullptr && U_SUCCESS(errorCode)) {
177	// getBuffer() already did str.setToBogus()
178	errorCode = U_MEMORY_ALLOCATION_ERROR;
179	}
180	}
181
182	UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
183	int32_t length=str.length();
184	start=str.getBuffer(destCapacity);
185	if(start==nullptr) {
186	// getBuffer() already did str.setToBogus()
187	errorCode=U_MEMORY_ALLOCATION_ERROR;
188	return false;
189	}
190	limit=start+length;
191	remainingCapacity=str.getCapacity()-length;
192	reorderStart=start;
193	if(start==limit) {
194	lastCC=`0`;
195	} else {
196	setIterator();
197	lastCC=previousCC();
198	// Set reorderStart after the last code point with cc<=1 if there is one.
199	if(lastCC>`1`) {
200	while(previousCC()>`1`) {}
201	}
202	reorderStart=codePointLimit;
203	}
204	return true;
205	}
206
207	UBool ReorderingBuffer::equals(const char16_t otherStart, const* char16_t otherLimit) const* {
208	int32_t length=(int32_t)(limit-start);
209	return
210	length==(int32_t)(otherLimit-otherStart) &&
211	`0`==u_memcmp(start, otherStart, length);
212	}
213
214	UBool ReorderingBuffer::equals(const uint8_t otherStart, const* uint8_t otherLimit) const* {
215	U_ASSERT((otherLimit - otherStart) <= INT32_MAX); // ensured by caller
216	int32_t length = (int32_t)(limit - start);
217	int32_t otherLength = (int32_t)(otherLimit - otherStart);
218	// For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
219	if (otherLength < length \|\| (otherLength / `3`) > length) {
220	return false;
221	}
222	// Compare valid strings from between normalization boundaries.
223	// (Invalid sequences are normalization-inert.)
224	for (int32_t i = `0`, j = `0`;;) {
225	if (i >= length) {
226	return j >= otherLength;
227	} else if (j >= otherLength) {
228	return false;
229	}
230	// Not at the end of either string yet.
231	UChar32 c, other;
232	U16_NEXT_UNSAFE(start, i, c);
233	U8_NEXT_UNSAFE(otherStart, j, other);
234	if (c != other) {
235	return false;
236	}
237	}
238	}
239
240	UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
241	if(remainingCapacity<`2` && !resize(`2`, errorCode)) {
242	return false;
243	}
244	if(lastCC<=cc \|\| cc==`0`) {
245	limit[`0`]=U16_LEAD(c);
246	limit[`1`]=U16_TRAIL(c);
247	limit+=`2`;
248	lastCC=cc;
249	if(cc<=`1`) {
250	reorderStart=limit;
251	}
252	} else {
253	insert(c, cc);
254	}
255	remainingCapacity-=`2`;
256	return true;
257	}
258
259	UBool ReorderingBuffer::append(const char16_t *s, int32_t length, UBool isNFD,
260	uint8_t leadCC, uint8_t trailCC,
261	UErrorCode &errorCode) {
262	if(length==`0`) {
263	return true;
264	}
265	if(remainingCapacity<length && !resize(length, errorCode)) {
266	return false;
267	}
268	remainingCapacity-=length;
269	if(lastCC<=leadCC \|\| leadCC==`0`) {
270	if(trailCC<=`1`) {
271	reorderStart=limit+length;
272	} else if(leadCC<=`1`) {
273	reorderStart=limit+`1`; // Ok if not a code point boundary.
274	}
275	const char16_t *sLimit=s+length;
276	do { limit++=s++; } while(s!=sLimit);
277	lastCC=trailCC;
278	} else {
279	int32_t i=`0`;
280	UChar32 c;
281	U16_NEXT(s, i, length, c);
282	insert(c, leadCC); // insert first code point
283	while(i<length) {
284	U16_NEXT(s, i, length, c);
285	if(i<length) {
286	if (isNFD) {
287	leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c));
288	} else {
289	leadCC = impl.getCC(impl.getNorm16(c));
290	}
291	} else {
292	leadCC=trailCC;
293	}
294	append(c, leadCC, errorCode);
295	}
296	}
297	return true;
298	}
299
300	UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
301	int32_t cpLength=U16_LENGTH(c);
302	if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
303	return false;
304	}
305	remainingCapacity-=cpLength;
306	if(cpLength==`1`) {
307	limit++=(char16_t*)c;
308	} else {
309	limit[`0`]=U16_LEAD(c);
310	limit[`1`]=U16_TRAIL(c);
311	limit+=`2`;
312	}
313	lastCC=`0`;
314	reorderStart=limit;
315	return true;
316	}
317
318	UBool ReorderingBuffer::appendZeroCC(const char16_t s, const* char16_t *sLimit, UErrorCode &errorCode) {
319	if(s==sLimit) {
320	return true;
321	}
322	int32_t length=(int32_t)(sLimit-s);
323	if(remainingCapacity<length && !resize(length, errorCode)) {
324	return false;
325	}
326	u_memcpy(limit, s, length);
327	limit+=length;
328	remainingCapacity-=length;
329	lastCC=`0`;
330	reorderStart=limit;
331	return true;
332	}
333
334	void ReorderingBuffer::remove() {
335	reorderStart=limit=start;
336	remainingCapacity=str.getCapacity();
337	lastCC=`0`;
338	}
339
340	void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
341	if(suffixLength<(limit-start)) {
342	limit-=suffixLength;
343	remainingCapacity+=suffixLength;
344	} else {
345	limit=start;
346	remainingCapacity=str.getCapacity();
347	}
348	lastCC=`0`;
349	reorderStart=limit;
350	}
351
352	UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
353	int32_t reorderStartIndex=(int32_t)(reorderStart-start);
354	int32_t length=(int32_t)(limit-start);
355	str.releaseBuffer(length);
356	int32_t newCapacity=length+appendLength;
357	int32_t doubleCapacity=`2`*str.getCapacity();
358	if(newCapacity<doubleCapacity) {
359	newCapacity=doubleCapacity;
360	}
361	if(newCapacity<`256`) {
362	newCapacity=`256`;
363	}
364	start=str.getBuffer(newCapacity);
365	if(start==nullptr) {
366	// getBuffer() already did str.setToBogus()
367	errorCode=U_MEMORY_ALLOCATION_ERROR;
368	return false;
369	}
370	reorderStart=start+reorderStartIndex;
371	limit=start+length;
372	remainingCapacity=str.getCapacity()-length;
373	return true;
374	}
375
376	void ReorderingBuffer::skipPrevious() {
377	codePointLimit=codePointStart;
378	char16_t c=*--codePointStart;
379	if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-`1`))) {
380	--codePointStart;
381	}
382	}
383
384	uint8_t ReorderingBuffer::previousCC() {
385	codePointLimit=codePointStart;
386	if(reorderStart>=codePointStart) {
387	return `0`;
388	}
389	UChar32 c=*--codePointStart;
390	char16_t c2;
391	if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-`1`))) {
392	--codePointStart;
393	c=U16_GET_SUPPLEMENTARY(c2, c);
394	}
395	return impl.getCCFromYesOrMaybeCP(c);
396	}
397
398	// Inserts c somewhere before the last character.
399	// Requires 0<cc<lastCC which implies reorderStart<limit.
400	void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
401	for(setIterator(), skipPrevious(); previousCC()>cc;) {}
402	// insert c at codePointLimit, after the character with prevCC<=cc
403	char16_t *q=limit;
404	char16_t *r=limit+=U16_LENGTH(c);
405	do {
406	--r=--q;
407	} while(codePointLimit!=q);
408	writeCodePoint(q, c);
409	if(cc<=`1`) {
410	reorderStart=r;
411	}
412	}
413
414	// Normalizer2Impl --------------------------------------------------------- ***
415
416	struct CanonIterData : public UMemory {
417	CanonIterData(UErrorCode &errorCode);
418	~CanonIterData();
419	void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
420	UMutableCPTrie *mutableTrie;
421	UCPTrie *trie;
422	UVector canonStartSets; // contains UnicodeSet *
423	};
424
425	Normalizer2Impl::~Normalizer2Impl() {
426	delete fCanonIterData;
427	}
428
429	void
430	Normalizer2Impl::init(const int32_t inIndexes, const* UCPTrie *inTrie,
431	const uint16_t inExtraData, const* uint8_t *inSmallFCD) {
432	minDecompNoCP = static_cast<char16_t>(inIndexes[IX_MIN_DECOMP_NO_CP]);
433	minCompNoMaybeCP = static_cast<char16_t>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);
434	minLcccCP = static_cast<char16_t>(inIndexes[IX_MIN_LCCC_CP]);
435
436	minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);
437	minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);
438	minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);
439	minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
440	minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
441	minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
442	limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
443	minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
444	U_ASSERT((minMaybeYes & `7`) == `0`); // 8-aligned for noNoDelta bit fields
445	centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - `1`;
446
447	normTrie=inTrie;
448
449	maybeYesCompositions=inExtraData;
450	extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
451
452	smallFCD=inSmallFCD;
453	}
454
455	U_CDECL_BEGIN
456
457	static uint32_t U_CALLCONV
458	segmentStarterMapper(const void * /context/, uint32_t value) {
459	return value&CANON_NOT_SEGMENT_STARTER;
460	}
461
462	U_CDECL_END
463
464	void
465	Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
466	UChar32 start = `0`, end;
467	uint32_t norm16;
468	while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
469	nullptr, nullptr, &norm16)) >= `0`) {
470	if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
471	norm16 != Normalizer2Impl::JAMO_VT) {
472	set.add(start, end);
473	} else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
474	uint16_t fcd16 = getFCD16(start);
475	if (fcd16 > `0xff`) { set.add(start, end); }
476	}
477	start = end + `1`;
478	}
479	}
480
481	void
482	Normalizer2Impl::addPropertyStarts(const USetAdder sa, UErrorCode & /errorCode/) const* {
483	// Add the start code point of each same-value range of the trie.
484	UChar32 start = `0`, end;
485	uint32_t value;
486	while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
487	nullptr, nullptr, &value)) >= `0`) {
488	sa->add(sa->set, start);
489	if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
490	(value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
491	// Range of code points with same-norm16-value algorithmic decompositions.
492	// They might have different non-zero FCD16 values.
493	uint16_t prevFCD16 = getFCD16(start);
494	while (++start <= end) {
495	uint16_t fcd16 = getFCD16(start);
496	if (fcd16 != prevFCD16) {
497	sa->add(sa->set, start);
498	prevFCD16 = fcd16;
499	}
500	}
501	}
502	start = end + `1`;
503	}
504
505	/ add Hangul LV syllables and LV+1 because of skippables /
506	for(char16_t c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
507	sa->add(sa->set, c);
508	sa->add(sa->set, c+`1`);
509	}
510	sa->add(sa->set, Hangul::HANGUL_LIMIT); / add Hangul+1 to continue with other properties /
511	}
512
513	void
514	Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder sa, UErrorCode &errorCode) const* {
515	// Add the start code point of each same-value range of the canonical iterator data trie.
516	if (!ensureCanonIterData(errorCode)) { return; }
517	// Currently only used for the SEGMENT_STARTER property.
518	UChar32 start = `0`, end;
519	uint32_t value;
520	while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, `0`,
521	segmentStarterMapper, nullptr, &value)) >= `0`) {
522	sa->add(sa->set, start);
523	start = end + `1`;
524	}
525	}
526
527	const char16_t *
528	Normalizer2Impl::copyLowPrefixFromNulTerminated(const char16_t *src,
529	UChar32 minNeedDataCP,
530	ReorderingBuffer *buffer,
531	UErrorCode &errorCode) const {
532	// Make some effort to support NUL-terminated strings reasonably.
533	// Take the part of the fast quick check loop that does not look up
534	// data and check the first part of the string.
535	// After this prefix, determine the string length to simplify the rest
536	// of the code.
537	const char16_t *prevSrc=src;
538	char16_t c;
539	while((c=*src++)<minNeedDataCP && c!=`0`) {}
540	// Back out the last character for full processing.
541	// Copy this prefix.
542	if(--src!=prevSrc) {
543	if(buffer!=nullptr) {
544	buffer->appendZeroCC(prevSrc, src, errorCode);
545	}
546	}
547	return src;
548	}
549
550	UnicodeString &
551	Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
552	UErrorCode &errorCode) const {
553	if(U_FAILURE(errorCode)) {
554	dest.setToBogus();
555	return dest;
556	}
557	const char16_t *sArray=src.getBuffer();
558	if(&dest==&src \|\| sArray==nullptr) {
559	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
560	dest.setToBogus();
561	return dest;
562	}
563	decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
564	return dest;
565	}
566
567	void
568	Normalizer2Impl::decompose(const char16_t src, const* char16_t *limit,
569	UnicodeString &dest,
570	int32_t destLengthEstimate,
571	UErrorCode &errorCode) const {
572	if(destLengthEstimate<`0` && limit!=nullptr) {
573	destLengthEstimate=(int32_t)(limit-src);
574	}
575	dest.remove();
576	ReorderingBuffer buffer(*this, dest);
577	if(buffer.init(destLengthEstimate, errorCode)) {
578	decompose(src, limit, &buffer, errorCode);
579	}
580	}
581
582	// Dual functionality:
583	// buffer!=nullptr: normalize
584	// buffer==nullptr: isNormalized/spanQuickCheckYes
585	const char16_t *
586	Normalizer2Impl::decompose(const char16_t src, const* char16_t *limit,
587	ReorderingBuffer *buffer,
588	UErrorCode &errorCode) const {
589	UChar32 minNoCP=minDecompNoCP;
590	if(limit==nullptr) {
591	src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
592	if(U_FAILURE(errorCode)) {
593	return src;
594	}
595	limit=u_strchr(src, `0`);
596	}
597
598	const char16_t *prevSrc;
599	UChar32 c=`0`;
600	uint16_t norm16=`0`;
601
602	// only for quick check
603	const char16_t *prevBoundary=src;
604	uint8_t prevCC=`0`;
605
606	for(;;) {
607	// count code units below the minimum or with irrelevant data for the quick check
608	for(prevSrc=src; src!=limit;) {
609	if( (c=*src)<minNoCP \|\|
610	isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
611	) {
612	++src;
613	} else if(!U16_IS_LEAD(c)) {
614	break;
615	} else {
616	char16_t c2;
617	if((src+`1`)!=limit && U16_IS_TRAIL(c2=src[`1`])) {
618	c=U16_GET_SUPPLEMENTARY(c, c2);
619	norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
620	if(isMostDecompYesAndZeroCC(norm16)) {
621	src+=`2`;
622	} else {
623	break;
624	}
625	} else {
626	++src; // unpaired lead surrogate: inert
627	}
628	}
629	}
630	// copy these code units all at once
631	if(src!=prevSrc) {
632	if(buffer!=nullptr) {
633	if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
634	break;
635	}
636	} else {
637	prevCC=`0`;
638	prevBoundary=src;
639	}
640	}
641	if(src==limit) {
642	break;
643	}
644
645	// Check one above-minimum, relevant code point.
646	src+=U16_LENGTH(c);
647	if(buffer!=nullptr) {
648	if(!decompose(c, norm16, *buffer, errorCode)) {
649	break;
650	}
651	} else {
652	if(isDecompYes(norm16)) {
653	uint8_t cc=getCCFromYesOrMaybe(norm16);
654	if(prevCC<=cc \|\| cc==`0`) {
655	prevCC=cc;
656	if(cc<=`1`) {
657	prevBoundary=src;
658	}
659	continue;
660	}
661	}
662	return prevBoundary; // "no" or cc out of order
663	}
664	}
665	return src;
666	}
667
668	// Decompose a short piece of text which is likely to contain characters that
669	// fail the quick check loop and/or where the quick check loop's overhead
670	// is unlikely to be amortized.
671	// Called by the compose() and makeFCD() implementations.
672	const char16_t *
673	Normalizer2Impl::decomposeShort(const char16_t src, const* char16_t *limit,
674	UBool stopAtCompBoundary, UBool onlyContiguous,
675	ReorderingBuffer &buffer, UErrorCode &errorCode) const {
676	if (U_FAILURE(errorCode)) {
677	return nullptr;
678	}
679	while(src<limit) {
680	if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
681	return src;
682	}
683	const char16_t *prevSrc = src;
684	UChar32 c;
685	uint16_t norm16;
686	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
687	if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
688	return prevSrc;
689	}
690	if(!decompose(c, norm16, buffer, errorCode)) {
691	return nullptr;
692	}
693	if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
694	return src;
695	}
696	}
697	return src;
698	}
699
700	UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
701	ReorderingBuffer &buffer,
702	UErrorCode &errorCode) const {
703	// get the decomposition and the lead and trail cc's
704	if (norm16 >= limitNoNo) {
705	if (isMaybeOrNonZeroCC(norm16)) {
706	return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
707	}
708	// Maps to an isCompYesAndZeroCC.
709	c=mapAlgorithmic(c, norm16);
710	norm16=getRawNorm16(c);
711	}
712	if (norm16 < minYesNo) {
713	// c does not decompose
714	return buffer.append(c, `0`, errorCode);
715	} else if(isHangulLV(norm16) \|\| isHangulLVT(norm16)) {
716	// Hangul syllable: decompose algorithmically
717	char16_t jamos[`3`];
718	return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
719	}
720	// c decomposes, get everything from the variable-length extra data
721	const uint16_t *mapping=getMapping(norm16);
722	uint16_t firstUnit=*mapping;
723	int32_t length=firstUnit&MAPPING_LENGTH_MASK;
724	uint8_t leadCC, trailCC;
725	trailCC=(uint8_t)(firstUnit>>`8`);
726	if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
727	leadCC=(uint8_t)(*(mapping-`1`)>>`8`);
728	} else {
729	leadCC=`0`;
730	}
731	return buffer.append((const char16_t )mapping+`1`, length, true*, leadCC, trailCC, errorCode);
732	}
733
734	// Dual functionality:
735	// sink != nullptr: normalize
736	// sink == nullptr: isNormalized/spanQuickCheckYes
737	const uint8_t *
738	Normalizer2Impl::decomposeUTF8(uint32_t options,
739	const uint8_t src, const* uint8_t *limit,
740	ByteSink sink, Edits edits, UErrorCode &errorCode) const {
741	U_ASSERT(limit != nullptr);
742	UnicodeString s16;
743	uint8_t minNoLead = leadByteForCP(minDecompNoCP);
744
745	const uint8_t *prevBoundary = src;
746	// only for quick check
747	uint8_t prevCC = `0`;
748
749	for (;;) {
750	// Fast path: Scan over a sequence of characters below the minimum "no" code point,
751	// or with (decompYes && ccc==0) properties.
752	const uint8_t *fastStart = src;
753	const uint8_t *prevSrc;
754	uint16_t norm16 = `0`;
755
756	for (;;) {
757	if (src == limit) {
758	if (prevBoundary != limit && sink != nullptr) {
759	ByteSinkUtil::appendUnchanged(prevBoundary, limit,
760	*sink, options, edits, errorCode);
761	}
762	return src;
763	}
764	if (*src < minNoLead) {
765	++src;
766	} else {
767	prevSrc = src;
768	UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
769	if (!isMostDecompYesAndZeroCC(norm16)) {
770	break;
771	}
772	}
773	}
774	// isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
775	// and the current character at [prevSrc..src[ is not a common case with cc=0
776	// (MIN_NORMAL_MAYBE_YES or JAMO_VT).
777	// It could still be a maybeYes with cc=0.
778	if (prevSrc != fastStart) {
779	// The fast path looped over yes/0 characters before the current one.
780	if (sink != nullptr &&
781	!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
782	*sink, options, edits, errorCode)) {
783	break;
784	}
785	prevBoundary = prevSrc;
786	prevCC = `0`;
787	}
788
789	// Medium-fast path: Quick check.
790	if (isMaybeOrNonZeroCC(norm16)) {
791	// Does not decompose.
792	uint8_t cc = getCCFromYesOrMaybe(norm16);
793	if (prevCC <= cc \|\| cc == `0`) {
794	prevCC = cc;
795	if (cc <= `1`) {
796	if (sink != nullptr &&
797	!ByteSinkUtil::appendUnchanged(prevBoundary, src,
798	*sink, options, edits, errorCode)) {
799	break;
800	}
801	prevBoundary = src;
802	}
803	continue;
804	}
805	}
806	if (sink == nullptr) {
807	return prevBoundary; // quick check: "no" or cc out of order
808	}
809
810	// Slow path
811	// Decompose up to and including the current character.
812	if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
813	if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
814	*sink, options, edits, errorCode)) {
815	break;
816	}
817	prevBoundary = prevSrc;
818	}
819	ReorderingBuffer buffer(*this, s16, errorCode);
820	if (U_FAILURE(errorCode)) {
821	break;
822	}
823	decomposeShort(prevBoundary, src, STOP_AT_LIMIT, false / onlyContiguous /,
824	buffer, errorCode);
825	// Decompose until the next boundary.
826	if (buffer.getLastCC() > `1`) {
827	src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, false / onlyContiguous /,
828	buffer, errorCode);
829	}
830	if (U_FAILURE(errorCode)) {
831	break;
832	}
833	if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals()
834	errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
835	break;
836	}
837	// We already know there was a change if the original character decomposed;
838	// otherwise compare.
839	if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
840	if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
841	*sink, options, edits, errorCode)) {
842	break;
843	}
844	} else {
845	if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
846	*sink, edits, errorCode)) {
847	break;
848	}
849	}
850	prevBoundary = src;
851	prevCC = `0`;
852	}
853	return src;
854	}
855
856	const uint8_t *
857	Normalizer2Impl::decomposeShort(const uint8_t src, const* uint8_t *limit,
858	StopAt stopAt, UBool onlyContiguous,
859	ReorderingBuffer &buffer, UErrorCode &errorCode) const {
860	if (U_FAILURE(errorCode)) {
861	return nullptr;
862	}
863	while (src < limit) {
864	const uint8_t *prevSrc = src;
865	uint16_t norm16;
866	UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
867	// Get the decomposition and the lead and trail cc's.
868	UChar32 c = U_SENTINEL;
869	if (norm16 >= limitNoNo) {
870	if (isMaybeOrNonZeroCC(norm16)) {
871	// No comp boundaries around this character.
872	uint8_t cc = getCCFromYesOrMaybe(norm16);
873	if (cc == `0` && stopAt == STOP_AT_DECOMP_BOUNDARY) {
874	return prevSrc;
875	}
876	c = codePointFromValidUTF8(prevSrc, src);
877	if (!buffer.append(c, cc, errorCode)) {
878	return nullptr;
879	}
880	if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= `1`) {
881	return src;
882	}
883	continue;
884	}
885	// Maps to an isCompYesAndZeroCC.
886	if (stopAt != STOP_AT_LIMIT) {
887	return prevSrc;
888	}
889	c = codePointFromValidUTF8(prevSrc, src);
890	c = mapAlgorithmic(c, norm16);
891	norm16 = getRawNorm16(c);
892	} else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
893	return prevSrc;
894	}
895	// norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
896	// We do not see invalid UTF-8 here because
897	// its norm16==INERT is normalization-inert,
898	// so it gets copied unchanged in the fast path,
899	// and we stop the slow path where invalid UTF-8 begins.
900	// c >= 0 is the result of an algorithmic mapping.
901	U_ASSERT(c >= `0` \|\| norm16 != INERT);
902	if (norm16 < minYesNo) {
903	if (c < `0`) {
904	c = codePointFromValidUTF8(prevSrc, src);
905	}
906	// does not decompose
907	if (!buffer.append(c, `0`, errorCode)) {
908	return nullptr;
909	}
910	} else if (isHangulLV(norm16) \|\| isHangulLVT(norm16)) {
911	// Hangul syllable: decompose algorithmically
912	if (c < `0`) {
913	c = codePointFromValidUTF8(prevSrc, src);
914	}
915	char16_t jamos[`3`];
916	if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
917	return nullptr;
918	}
919	} else {
920	// The character decomposes, get everything from the variable-length extra data.
921	const uint16_t *mapping = getMapping(norm16);
922	uint16_t firstUnit = *mapping;
923	int32_t length = firstUnit & MAPPING_LENGTH_MASK;
924	uint8_t trailCC = (uint8_t)(firstUnit >> `8`);
925	uint8_t leadCC;
926	if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
927	leadCC = (uint8_t)(*(mapping-`1`) >> `8`);
928	} else {
929	leadCC = `0`;
930	}
931	if (leadCC == `0` && stopAt == STOP_AT_DECOMP_BOUNDARY) {
932	return prevSrc;
933	}
934	if (!buffer.append((const char16_t )mapping+`1`, length, true*, leadCC, trailCC, errorCode)) {
935	return nullptr;
936	}
937	}
938	if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) \|\|
939	(stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= `1`)) {
940	return src;
941	}
942	}
943	return src;
944	}
945
946	const char16_t *
947	Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[`4`], int32_t &length) const {
948	uint16_t norm16;
949	if(c<minDecompNoCP \|\| isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
950	// c does not decompose
951	return nullptr;
952	}
953	const char16_t decomp = nullptr*;
954	if(isDecompNoAlgorithmic(norm16)) {
955	// Maps to an isCompYesAndZeroCC.
956	c=mapAlgorithmic(c, norm16);
957	decomp=buffer;
958	length=`0`;
959	U16_APPEND_UNSAFE(buffer, length, c);
960	// The mapping might decompose further.
961	norm16 = getRawNorm16(c);
962	}
963	if (norm16 < minYesNo) {
964	return decomp;
965	} else if(isHangulLV(norm16) \|\| isHangulLVT(norm16)) {
966	// Hangul syllable: decompose algorithmically
967	length=Hangul::decompose(c, buffer);
968	return buffer;
969	}
970	// c decomposes, get everything from the variable-length extra data
971	const uint16_t *mapping=getMapping(norm16);
972	length=*mapping&MAPPING_LENGTH_MASK;
973	return (const char16_t *)mapping+`1`;
974	}
975
976	// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
977	// so that a raw mapping fits that consists of one unit ("rm0")
978	// plus all but the first two code units of the normal mapping.
979	// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
980	const char16_t *
981	Normalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[`30`], int32_t &length) const {
982	uint16_t norm16;
983	if(c<minDecompNoCP \|\| isDecompYes(norm16=getNorm16(c))) {
984	// c does not decompose
985	return nullptr;
986	} else if(isHangulLV(norm16) \|\| isHangulLVT(norm16)) {
987	// Hangul syllable: decompose algorithmically
988	Hangul::getRawDecomposition(c, buffer);
989	length=`2`;
990	return buffer;
991	} else if(isDecompNoAlgorithmic(norm16)) {
992	c=mapAlgorithmic(c, norm16);
993	length=`0`;
994	U16_APPEND_UNSAFE(buffer, length, c);
995	return buffer;
996	}
997	// c decomposes, get everything from the variable-length extra data
998	const uint16_t *mapping=getMapping(norm16);
999	uint16_t firstUnit=*mapping;
1000	int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
1001	if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
1002	// Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
1003	// Bit 7=MAPPING_HAS_CCC_LCCC_WORD
1004	const uint16_t *rawMapping=mapping-((firstUnit>>`7`)&`1`)-`1`;
1005	uint16_t rm0=*rawMapping;
1006	if(rm0<=MAPPING_LENGTH_MASK) {
1007	length=rm0;
1008	return (const char16_t *)rawMapping-rm0;
1009	} else {
1010	// Copy the normal mapping and replace its first two code units with rm0.
1011	buffer[`0`]=(char16_t)rm0;
1012	u_memcpy(buffer+`1`, (const char16_t *)mapping+`1`+`2`, mLength-`2`);
1013	length=mLength-`1`;
1014	return buffer;
1015	}
1016	} else {
1017	length=mLength;
1018	return (const char16_t *)mapping+`1`;
1019	}
1020	}
1021
1022	void Normalizer2Impl::decomposeAndAppend(const char16_t src, const* char16_t *limit,
1023	UBool doDecompose,
1024	UnicodeString &safeMiddle,
1025	ReorderingBuffer &buffer,
1026	UErrorCode &errorCode) const {
1027	buffer.copyReorderableSuffixTo(safeMiddle);
1028	if(doDecompose) {
1029	decompose(src, limit, &buffer, errorCode);
1030	return;
1031	}
1032	// Just merge the strings at the boundary.
1033	bool isFirst = true;
1034	uint8_t firstCC = `0`, prevCC = `0`, cc;
1035	const char16_t *p = src;
1036	while (p != limit) {
1037	const char16_t *codePointStart = p;
1038	UChar32 c;
1039	uint16_t norm16;
1040	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
1041	if ((cc = getCC(norm16)) == `0`) {
1042	p = codePointStart;
1043	break;
1044	}
1045	if (isFirst) {
1046	firstCC = cc;
1047	isFirst = false;
1048	}
1049	prevCC = cc;
1050	}
1051	if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr
1052	limit=u_strchr(p, `0`);
1053	}
1054
1055	if (buffer.append(src, (int32_t)(p - src), false, firstCC, prevCC, errorCode)) {
1056	buffer.appendZeroCC(p, limit, errorCode);
1057	}
1058	}
1059
1060	UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
1061	return c < minLcccCP \|\| (c <= `0xffff` && !singleLeadMightHaveNonZeroFCD16(c)) \|\|
1062	norm16HasDecompBoundaryBefore(getNorm16(c));
1063	}
1064
1065	UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
1066	if (norm16 < minNoNoCompNoMaybeCC) {
1067	return true;
1068	}
1069	if (norm16 >= limitNoNo) {
1070	return norm16 <= MIN_NORMAL_MAYBE_YES \|\| norm16 == JAMO_VT;
1071	}
1072	// c decomposes, get everything from the variable-length extra data
1073	const uint16_t *mapping=getMapping(norm16);
1074	uint16_t firstUnit=*mapping;
1075	// true if leadCC==0 (hasFCDBoundaryBefore())
1076	return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==`0` \|\| (*(mapping-`1`)&`0xff00`)==`0`;
1077	}
1078
1079	UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
1080	if (c < minDecompNoCP) {
1081	return true;
1082	}
1083	if (c <= `0xffff` && !singleLeadMightHaveNonZeroFCD16(c)) {
1084	return true;
1085	}
1086	return norm16HasDecompBoundaryAfter(getNorm16(c));
1087	}
1088
1089	UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
1090	if(norm16 <= minYesNo \|\| isHangulLVT(norm16)) {
1091	return true;
1092	}
1093	if (norm16 >= limitNoNo) {
1094	if (isMaybeOrNonZeroCC(norm16)) {
1095	return norm16 <= MIN_NORMAL_MAYBE_YES \|\| norm16 == JAMO_VT;
1096	}
1097	// Maps to an isCompYesAndZeroCC.
1098	return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1099	}
1100	// c decomposes, get everything from the variable-length extra data
1101	const uint16_t *mapping=getMapping(norm16);
1102	uint16_t firstUnit=*mapping;
1103	// decomp after-boundary: same as hasFCDBoundaryAfter(),
1104	// fcd16<=1 \|\| trailCC==0
1105	if(firstUnit>`0x1ff`) {
1106	return false; // trailCC>1
1107	}
1108	if(firstUnit<=`0xff`) {
1109	return true; // trailCC==0
1110	}
1111	// if(trailCC==1) test leadCC==0, same as checking for before-boundary
1112	// true if leadCC==0 (hasFCDBoundaryBefore())
1113	return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==`0` \|\| (*(mapping-`1`)&`0xff00`)==`0`;
1114	}
1115
1116	/*
1117	* Finds the recomposition result for
1118	* a forward-combining "lead" character,
1119	* specified with a pointer to its compositions list,
1120	* and a backward-combining "trail" character.
1121	*
1122	* If the lead and trail characters combine, then this function returns
1123	* the following "compositeAndFwd" value:
1124	* Bits 21..1 composite character
1125	* Bit 0 set if the composite is a forward-combining starter
1126	* otherwise it returns -1.
1127	*
1128	* The compositions list has (trail, compositeAndFwd) pair entries,
1129	* encoded as either pairs or triples of 16-bit units.
1130	* The last entry has the high bit of its first unit set.
1131	*
1132	* The list is sorted by ascending trail characters (there are no duplicates).
1133	* A linear search is used.
1134	*
1135	* See normalizer2impl.h for a more detailed description
1136	* of the compositions list format.
1137	*/
1138	int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1139	uint16_t key1, firstUnit;
1140	if(trail<COMP_1_TRAIL_LIMIT) {
1141	// trail character is 0..33FF
1142	// result entry may have 2 or 3 units
1143	key1=(uint16_t)(trail<<`1`);
1144	while(key1>(firstUnit=*list)) {
1145	list+=`2`+(firstUnit&COMP_1_TRIPLE);
1146	}
1147	if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1148	if(firstUnit&COMP_1_TRIPLE) {
1149	return ((int32_t)list[`1`]<<`16`)\|list[`2`];
1150	} else {
1151	return list[`1`];
1152	}
1153	}
1154	} else {
1155	// trail character is 3400..10FFFF
1156	// result entry has 3 units
1157	key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
1158	(((trail>>COMP_1_TRAIL_SHIFT))&
1159	~COMP_1_TRIPLE));
1160	uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
1161	uint16_t secondUnit;
1162	for(;;) {
1163	if(key1>(firstUnit=*list)) {
1164	list+=`2`+(firstUnit&COMP_1_TRIPLE);
1165	} else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1166	if(key2>(secondUnit=list[`1`])) {
1167	if(firstUnit&COMP_1_LAST_TUPLE) {
1168	break;
1169	} else {
1170	list+=`3`;
1171	}
1172	} else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1173	return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<`16`)\|list[`2`];
1174	} else {
1175	break;
1176	}
1177	} else {
1178	break;
1179	}
1180	}
1181	}
1182	return -`1`;
1183	}
1184
1185	/**
1186	* @param list some character's compositions list
1187	* @param set recursively receives the composites from these compositions
1188	*/
1189	void Normalizer2Impl::addComposites(const uint16_t list, UnicodeSet &set) const* {
1190	uint16_t firstUnit;
1191	int32_t compositeAndFwd;
1192	do {
1193	firstUnit=*list;
1194	if((firstUnit&COMP_1_TRIPLE)==`0`) {
1195	compositeAndFwd=list[`1`];
1196	list+=`2`;
1197	} else {
1198	compositeAndFwd=(((int32_t)list[`1`]&~COMP_2_TRAIL_MASK)<<`16`)\|list[`2`];
1199	list+=`3`;
1200	}
1201	UChar32 composite=compositeAndFwd>>`1`;
1202	if((compositeAndFwd&`1`)!=`0`) {
1203	addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1204	}
1205	set.add(composite);
1206	} while((firstUnit&COMP_1_LAST_TUPLE)==`0`);
1207	}
1208
1209	/*
1210	* Recomposes the buffer text starting at recomposeStartIndex
1211	* (which is in NFD - decomposed and canonically ordered),
1212	* and truncates the buffer contents.
1213	*
1214	* Note that recomposition never lengthens the text:
1215	* Any character consists of either one or two code units;
1216	* a composition may contain at most one more code unit than the original starter,
1217	* while the combining mark that is removed has at least one code unit.
1218	*/
1219	void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
1220	UBool onlyContiguous) const {
1221	char16_t *p=buffer.getStart()+recomposeStartIndex;
1222	char16_t *limit=buffer.getLimit();
1223	if(p==limit) {
1224	return;
1225	}
1226
1227	char16_t starter, pRemove, q, r;
1228	const uint16_t *compositionsList;
1229	UChar32 c, compositeAndFwd;
1230	uint16_t norm16;
1231	uint8_t cc, prevCC;
1232	UBool starterIsSupplementary;
1233
1234	// Some of the following variables are not used until we have a forward-combining starter
1235	// and are only initialized now to avoid compiler warnings.
1236	compositionsList=nullptr; // used as indicator for whether we have a forward-combining starter
1237	starter=nullptr;
1238	starterIsSupplementary=false;
1239	prevCC=`0`;
1240
1241	for(;;) {
1242	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
1243	cc=getCCFromYesOrMaybe(norm16);
1244	if( // this character combines backward and
1245	isMaybe(norm16) &&
1246	// we have seen a starter that combines forward and
1247	compositionsList!=nullptr &&
1248	// the backward-combining character is not blocked
1249	(prevCC<cc \|\| prevCC==`0`)
1250	) {
1251	if(isJamoVT(norm16)) {
1252	// c is a Jamo V/T, see if we can compose it with the previous character.
1253	if(c<Hangul::JAMO_T_BASE) {
1254	// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1255	char16_t prev=(char16_t)(*starter-Hangul::JAMO_L_BASE);
1256	if(prev<Hangul::JAMO_L_COUNT) {
1257	pRemove=p-`1`;
1258	char16_t syllable=(char16_t)
1259	(Hangul::HANGUL_BASE+
1260	(prevHangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))
1261	Hangul::JAMO_T_COUNT);
1262	char16_t t;
1263	if(p!=limit && (t=(char16_t)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1264	++p;
1265	syllable+=t; // The next character was a Jamo T.
1266	}
1267	*starter=syllable;
1268	// remove the Jamo V/T
1269	q=pRemove;
1270	r=p;
1271	while(r<limit) {
1272	q++=r++;
1273	}
1274	limit=q;
1275	p=pRemove;
1276	}
1277	}
1278	/*
1279	* No "else" for Jamo T:
1280	* Since the input is in NFD, there are no Hangul LV syllables that
1281	* a Jamo T could combine with.
1282	* All Jamo Ts are combined above when handling Jamo Vs.
1283	*/
1284	if(p==limit) {
1285	break;
1286	}
1287	compositionsList=nullptr;
1288	continue;
1289	} else if((compositeAndFwd=combine(compositionsList, c))>=`0`) {
1290	// The starter and the combining mark (c) do combine.
1291	UChar32 composite=compositeAndFwd>>`1`;
1292
1293	// Replace the starter with the composite, remove the combining mark.
1294	pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
1295	if(starterIsSupplementary) {
1296	if(U_IS_SUPPLEMENTARY(composite)) {
1297	// both are supplementary
1298	starter[`0`]=U16_LEAD(composite);
1299	starter[`1`]=U16_TRAIL(composite);
1300	} else {
1301	starter=(char16_t*)composite;
1302	// The composite is shorter than the starter,
1303	// move the intermediate characters forward one.
1304	starterIsSupplementary=false;
1305	q=starter+`1`;
1306	r=q+`1`;
1307	while(r<pRemove) {
1308	q++=r++;
1309	}
1310	--pRemove;
1311	}
1312	} else if(U_IS_SUPPLEMENTARY(composite)) {
1313	// The composite is longer than the starter,
1314	// move the intermediate characters back one.
1315	starterIsSupplementary=true;
1316	++starter; // temporarily increment for the loop boundary
1317	q=pRemove;
1318	r=++pRemove;
1319	while(starter<q) {
1320	--r=--q;
1321	}
1322	*starter=U16_TRAIL(composite);
1323	--starter=U16_LEAD(composite); // undo the temporary increment*
1324	} else {
1325	// both are on the BMP
1326	starter=(char16_t*)composite;
1327	}
1328
1329	/ remove the combining mark by moving the following text over it /
1330	if(pRemove<p) {
1331	q=pRemove;
1332	r=p;
1333	while(r<limit) {
1334	q++=r++;
1335	}
1336	limit=q;
1337	p=pRemove;
1338	}
1339	// Keep prevCC because we removed the combining mark.
1340
1341	if(p==limit) {
1342	break;
1343	}
1344	// Is the composite a starter that combines forward?
1345	if(compositeAndFwd&`1`) {
1346	compositionsList=
1347	getCompositionsListForComposite(getRawNorm16(composite));
1348	} else {
1349	compositionsList=nullptr;
1350	}
1351
1352	// We combined; continue with looking for compositions.
1353	continue;
1354	}
1355	}
1356
1357	// no combination this time
1358	prevCC=cc;
1359	if(p==limit) {
1360	break;
1361	}
1362
1363	// If c did not combine, then check if it is a starter.
1364	if(cc==`0`) {
1365	// Found a new starter.
1366	if((compositionsList=getCompositionsListForDecompYes(norm16))!=nullptr) {
1367	// It may combine with something, prepare for it.
1368	if(U_IS_BMP(c)) {
1369	starterIsSupplementary=false;
1370	starter=p-`1`;
1371	} else {
1372	starterIsSupplementary=true;
1373	starter=p-`2`;
1374	}
1375	}
1376	} else if(onlyContiguous) {
1377	// FCC: no discontiguous compositions; any intervening character blocks.
1378	compositionsList=nullptr;
1379	}
1380	}
1381	buffer.setReorderingLimit(limit);
1382	}
1383
1384	UChar32
1385	Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1386	uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16
1387	const uint16_t *list;
1388	if(isInert(norm16)) {
1389	return U_SENTINEL;
1390	} else if(norm16<minYesNoMappingsOnly) {
1391	// a combines forward.
1392	if(isJamoL(norm16)) {
1393	b-=Hangul::JAMO_V_BASE;
1394	if(`0`<=b && b<Hangul::JAMO_V_COUNT) {
1395	return
1396	(Hangul::HANGUL_BASE+
1397	((a-Hangul::JAMO_L_BASE)Hangul::JAMO_V_COUNT+b)
1398	Hangul::JAMO_T_COUNT);
1399	} else {
1400	return U_SENTINEL;
1401	}
1402	} else if(isHangulLV(norm16)) {
1403	b-=Hangul::JAMO_T_BASE;
1404	if(`0`<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
1405	return a+b;
1406	} else {
1407	return U_SENTINEL;
1408	}
1409	} else {
1410	// 'a' has a compositions list in extraData
1411	list=getMapping(norm16);
1412	if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
1413	list+= // mapping pointer
1414	`1`+ // +1 to skip the first unit with the mapping length
1415	(list&MAPPING_LENGTH_MASK); // + mapping length*
1416	}
1417	}
1418	} else if(norm16<minMaybeYes \|\| MIN_NORMAL_MAYBE_YES<=norm16) {
1419	return U_SENTINEL;
1420	} else {
1421	list=getCompositionsListForMaybe(norm16);
1422	}
1423	if(b<`0` \|\| `0x10ffff`<b) { // combine(list, b) requires a valid code point b
1424	return U_SENTINEL;
1425	}
1426	#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1427	return combine(list, b)>>`1`;
1428	#else
1429	int32_t compositeAndFwd=combine(list, b);
1430	return compositeAndFwd>=`0` ? compositeAndFwd>>`1` : U_SENTINEL;
1431	#endif
1432	}
1433
1434	// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1435	// doCompose: normalize
1436	// !doCompose: isNormalized (buffer must be empty and initialized)
1437	UBool
1438	Normalizer2Impl::compose(const char16_t src, const* char16_t *limit,
1439	UBool onlyContiguous,
1440	UBool doCompose,
1441	ReorderingBuffer &buffer,
1442	UErrorCode &errorCode) const {
1443	const char16_t *prevBoundary=src;
1444	UChar32 minNoMaybeCP=minCompNoMaybeCP;
1445	if(limit==nullptr) {
1446	src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1447	doCompose ? &buffer : nullptr,
1448	errorCode);
1449	if(U_FAILURE(errorCode)) {
1450	return false;
1451	}
1452	limit=u_strchr(src, `0`);
1453	if (prevBoundary != src) {
1454	if (hasCompBoundaryAfter(*(src-`1`), onlyContiguous)) {
1455	prevBoundary = src;
1456	} else {
1457	buffer.removeSuffix(`1`);
1458	prevBoundary = --src;
1459	}
1460	}
1461	}
1462
1463	for (;;) {
1464	// Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1465	// or with (compYes && ccc==0) properties.
1466	const char16_t *prevSrc;
1467	UChar32 c = `0`;
1468	uint16_t norm16 = `0`;
1469	for (;;) {
1470	if (src == limit) {
1471	if (prevBoundary != limit && doCompose) {
1472	buffer.appendZeroCC(prevBoundary, limit, errorCode);
1473	}
1474	return true;
1475	}
1476	if( (c=*src)<minNoMaybeCP \|\|
1477	isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1478	) {
1479	++src;
1480	} else {
1481	prevSrc = src++;
1482	if(!U16_IS_LEAD(c)) {
1483	break;
1484	} else {
1485	char16_t c2;
1486	if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1487	++src;
1488	c=U16_GET_SUPPLEMENTARY(c, c2);
1489	norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1490	if(!isCompYesAndZeroCC(norm16)) {
1491	break;
1492	}
1493	}
1494	}
1495	}
1496	}
1497	// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1498	// The current character is either a "noNo" (has a mapping)
1499	// or a "maybeYes" (combines backward)
1500	// or a "yesYes" with ccc!=0.
1501	// It is not a Hangul syllable or Jamo L because those have "yes" properties.
1502
1503	// Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1504	if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes
1505	if (!doCompose) {
1506	return false;
1507	}
1508	// Fast path for mapping a character that is immediately surrounded by boundaries.
1509	// In this case, we need not decompose around the current character.
1510	if (isDecompNoAlgorithmic(norm16)) {
1511	// Maps to a single isCompYesAndZeroCC character
1512	// which also implies hasCompBoundaryBefore.
1513	if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) \|\|
1514	hasCompBoundaryBefore(src, limit)) {
1515	if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1516	break;
1517	}
1518	if(!buffer.append(mapAlgorithmic(c, norm16), `0`, errorCode)) {
1519	break;
1520	}
1521	prevBoundary = src;
1522	continue;
1523	}
1524	} else if (norm16 < minNoNoCompBoundaryBefore) {
1525	// The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1526	if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) \|\|
1527	hasCompBoundaryBefore(src, limit)) {
1528	if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1529	break;
1530	}
1531	const char16_t mapping = reinterpret_cast<const* char16_t *>(getMapping(norm16));
1532	int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1533	if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
1534	break;
1535	}
1536	prevBoundary = src;
1537	continue;
1538	}
1539	} else if (norm16 >= minNoNoEmpty) {
1540	// The current character maps to nothing.
1541	// Simply omit it from the output if there is a boundary before _or_ after it.
1542	// The character itself implies no boundaries.
1543	if (hasCompBoundaryBefore(src, limit) \|\|
1544	hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1545	if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1546	break;
1547	}
1548	prevBoundary = src;
1549	continue;
1550	}
1551	}
1552	// Other "noNo" type, or need to examine more text around this character:
1553	// Fall through to the slow path.
1554	} else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
1555	char16_t prev=*(prevSrc-`1`);
1556	if(c<Hangul::JAMO_T_BASE) {
1557	// The current character is a Jamo Vowel,
1558	// compose with previous Jamo L and following Jamo T.
1559	char16_t l = (char16_t)(prev-Hangul::JAMO_L_BASE);
1560	if(l<Hangul::JAMO_L_COUNT) {
1561	if (!doCompose) {
1562	return false;
1563	}
1564	int32_t t;
1565	if (src != limit &&
1566	`0` < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&
1567	t < Hangul::JAMO_T_COUNT) {
1568	// The next character is a Jamo T.
1569	++src;
1570	} else if (hasCompBoundaryBefore(src, limit)) {
1571	// No Jamo T follows, not even via decomposition.
1572	t = `0`;
1573	} else {
1574	t = -`1`;
1575	}
1576	if (t >= `0`) {
1577	UChar32 syllable = Hangul::HANGUL_BASE +
1578	(lHangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE))
1579	Hangul::JAMO_T_COUNT + t;
1580	--prevSrc; // Replace the Jamo L as well.
1581	if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1582	break;
1583	}
1584	if(!buffer.appendBMP((char16_t)syllable, `0`, errorCode)) {
1585	break;
1586	}
1587	prevBoundary = src;
1588	continue;
1589	}
1590	// If we see L+V+x where x!=T then we drop to the slow path,
1591	// decompose and recompose.
1592	// This is to deal with NFKC finding normal L and V but a
1593	// compatibility variant of a T.
1594	// We need to either fully compose that combination here
1595	// (which would complicate the code and may not work with strange custom data)
1596	// or use the slow path.
1597	}
1598	} else if (Hangul::isHangulLV(prev)) {
1599	// The current character is a Jamo Trailing consonant,
1600	// compose with previous Hangul LV that does not contain a Jamo T.
1601	if (!doCompose) {
1602	return false;
1603	}
1604	UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
1605	--prevSrc; // Replace the Hangul LV as well.
1606	if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1607	break;
1608	}
1609	if(!buffer.appendBMP((char16_t)syllable, `0`, errorCode)) {
1610	break;
1611	}
1612	prevBoundary = src;
1613	continue;
1614	}
1615	// No matching context, or may need to decompose surrounding text first:
1616	// Fall through to the slow path.
1617	} else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC
1618	// One or more combining marks that do not combine-back:
1619	// Check for canonical order, copy unchanged if ok and
1620	// if followed by a character with a boundary-before.
1621	uint8_t cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0
1622	if (onlyContiguous / FCC / && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1623	// Fails FCD test, need to decompose and contiguously recompose.
1624	if (!doCompose) {
1625	return false;
1626	}
1627	} else {
1628	// If !onlyContiguous (not FCC), then we ignore the tccc of
1629	// the previous character which passed the quick check "yes && ccc==0" test.
1630	const char16_t *nextSrc;
1631	uint16_t n16;
1632	for (;;) {
1633	if (src == limit) {
1634	if (doCompose) {
1635	buffer.appendZeroCC(prevBoundary, limit, errorCode);
1636	}
1637	return true;
1638	}
1639	uint8_t prevCC = cc;
1640	nextSrc = src;
1641	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
1642	if (n16 >= MIN_YES_YES_WITH_CC) {
1643	cc = getCCFromNormalYesOrMaybe(n16);
1644	if (prevCC > cc) {
1645	if (!doCompose) {
1646	return false;
1647	}
1648	break;
1649	}
1650	} else {
1651	break;
1652	}
1653	src = nextSrc;
1654	}
1655	// src is after the last in-order combining mark.
1656	// If there is a boundary here, then we continue with no change.
1657	if (norm16HasCompBoundaryBefore(n16)) {
1658	if (isCompYesAndZeroCC(n16)) {
1659	src = nextSrc;
1660	}
1661	continue;
1662	}
1663	// Use the slow path. There is no boundary in [prevSrc, src[.
1664	}
1665	}
1666
1667	// Slow path: Find the nearest boundaries around the current character,
1668	// decompose and recompose.
1669	if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1670	const char16_t *p = prevSrc;
1671	UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
1672	if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1673	prevSrc = p;
1674	}
1675	}
1676	if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1677	break;
1678	}
1679	int32_t recomposeStartIndex=buffer.length();
1680	// We know there is not a boundary here.
1681	decomposeShort(prevSrc, src, false / !stopAtCompBoundary /, onlyContiguous,
1682	buffer, errorCode);
1683	// Decompose until the next boundary.
1684	src = decomposeShort(src, limit, true / stopAtCompBoundary /, onlyContiguous,
1685	buffer, errorCode);
1686	if (U_FAILURE(errorCode)) {
1687	break;
1688	}
1689	if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals()
1690	errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1691	return true;
1692	}
1693	recompose(buffer, recomposeStartIndex, onlyContiguous);
1694	if(!doCompose) {
1695	if(!buffer.equals(prevSrc, src)) {
1696	return false;
1697	}
1698	buffer.remove();
1699	}
1700	prevBoundary=src;
1701	}
1702	return true;
1703	}
1704
1705	// Very similar to compose(): Make the same changes in both places if relevant.
1706	// pQCResult==nullptr: spanQuickCheckYes
1707	// pQCResult!=nullptr: quickCheck (pQCResult must be UNORM_YES)*
1708	const char16_t *
1709	Normalizer2Impl::composeQuickCheck(const char16_t src, const* char16_t *limit,
1710	UBool onlyContiguous,
1711	UNormalizationCheckResult pQCResult) const* {
1712	const char16_t *prevBoundary=src;
1713	UChar32 minNoMaybeCP=minCompNoMaybeCP;
1714	if(limit==nullptr) {
1715	UErrorCode errorCode=U_ZERO_ERROR;
1716	src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, nullptr, errorCode);
1717	limit=u_strchr(src, `0`);
1718	if (prevBoundary != src) {
1719	if (hasCompBoundaryAfter(*(src-`1`), onlyContiguous)) {
1720	prevBoundary = src;
1721	} else {
1722	prevBoundary = --src;
1723	}
1724	}
1725	}
1726
1727	for(;;) {
1728	// Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1729	// or with (compYes && ccc==0) properties.
1730	const char16_t *prevSrc;
1731	UChar32 c = `0`;
1732	uint16_t norm16 = `0`;
1733	for (;;) {
1734	if(src==limit) {
1735	return src;
1736	}
1737	if( (c=*src)<minNoMaybeCP \|\|
1738	isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1739	) {
1740	++src;
1741	} else {
1742	prevSrc = src++;
1743	if(!U16_IS_LEAD(c)) {
1744	break;
1745	} else {
1746	char16_t c2;
1747	if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1748	++src;
1749	c=U16_GET_SUPPLEMENTARY(c, c2);
1750	norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1751	if(!isCompYesAndZeroCC(norm16)) {
1752	break;
1753	}
1754	}
1755	}
1756	}
1757	}
1758	// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1759	// The current character is either a "noNo" (has a mapping)
1760	// or a "maybeYes" (combines backward)
1761	// or a "yesYes" with ccc!=0.
1762	// It is not a Hangul syllable or Jamo L because those have "yes" properties.
1763
1764	uint16_t prevNorm16 = INERT;
1765	if (prevBoundary != prevSrc) {
1766	if (norm16HasCompBoundaryBefore(norm16)) {
1767	prevBoundary = prevSrc;
1768	} else {
1769	const char16_t *p = prevSrc;
1770	uint16_t n16;
1771	UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);
1772	if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1773	prevBoundary = prevSrc;
1774	} else {
1775	prevBoundary = p;
1776	prevNorm16 = n16;
1777	}
1778	}
1779	}
1780
1781	if(isMaybeOrNonZeroCC(norm16)) {
1782	uint8_t cc=getCCFromYesOrMaybe(norm16);
1783	if (onlyContiguous / FCC / && cc != `0` &&
1784	getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1785	// The [prevBoundary..prevSrc[ character
1786	// passed the quick check "yes && ccc==0" test
1787	// but is out of canonical order with the current combining mark.
1788	} else {
1789	// If !onlyContiguous (not FCC), then we ignore the tccc of
1790	// the previous character which passed the quick check "yes && ccc==0" test.
1791	const char16_t *nextSrc;
1792	for (;;) {
1793	if (norm16 < MIN_YES_YES_WITH_CC) {
1794	if (pQCResult != nullptr) {
1795	*pQCResult = UNORM_MAYBE;
1796	} else {
1797	return prevBoundary;
1798	}
1799	}
1800	if (src == limit) {
1801	return src;
1802	}
1803	uint8_t prevCC = cc;
1804	nextSrc = src;
1805	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);
1806	if (isMaybeOrNonZeroCC(norm16)) {
1807	cc = getCCFromYesOrMaybe(norm16);
1808	if (!(prevCC <= cc \|\| cc == `0`)) {
1809	break;
1810	}
1811	} else {
1812	break;
1813	}
1814	src = nextSrc;
1815	}
1816	// src is after the last in-order combining mark.
1817	if (isCompYesAndZeroCC(norm16)) {
1818	prevBoundary = src;
1819	src = nextSrc;
1820	continue;
1821	}
1822	}
1823	}
1824	if(pQCResult!=nullptr) {
1825	*pQCResult=UNORM_NO;
1826	}
1827	return prevBoundary;
1828	}
1829	}
1830
1831	void Normalizer2Impl::composeAndAppend(const char16_t src, const* char16_t *limit,
1832	UBool doCompose,
1833	UBool onlyContiguous,
1834	UnicodeString &safeMiddle,
1835	ReorderingBuffer &buffer,
1836	UErrorCode &errorCode) const {
1837	if(!buffer.isEmpty()) {
1838	const char16_t *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1839	if(src!=firstStarterInSrc) {
1840	const char16_t *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1841	buffer.getLimit(), onlyContiguous);
1842	int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1843	UnicodeString middle(lastStarterInDest, destSuffixLength);
1844	buffer.removeSuffix(destSuffixLength);
1845	safeMiddle =middle;
1846	middle.append(src, (int32_t)(firstStarterInSrc-src));
1847	const char16_t *middleStart=middle.getBuffer();
1848	compose(middleStart, middleStart+middle.length(), onlyContiguous,
1849	true, buffer, errorCode);
1850	if(U_FAILURE(errorCode)) {
1851	return;
1852	}
1853	src=firstStarterInSrc;
1854	}
1855	}
1856	if(doCompose) {
1857	compose(src, limit, onlyContiguous, true, buffer, errorCode);
1858	} else {
1859	if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr
1860	limit=u_strchr(src, `0`);
1861	}
1862	buffer.appendZeroCC(src, limit, errorCode);
1863	}
1864	}
1865
1866	UBool
1867	Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
1868	const uint8_t src, const* uint8_t *limit,
1869	ByteSink sink, Edits edits, UErrorCode &errorCode) const {
1870	U_ASSERT(limit != nullptr);
1871	UnicodeString s16;
1872	uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
1873	const uint8_t *prevBoundary = src;
1874
1875	for (;;) {
1876	// Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1877	// or with (compYes && ccc==0) properties.
1878	const uint8_t *prevSrc;
1879	uint16_t norm16 = `0`;
1880	for (;;) {
1881	if (src == limit) {
1882	if (prevBoundary != limit && sink != nullptr) {
1883	ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1884	*sink, options, edits, errorCode);
1885	}
1886	return true;
1887	}
1888	if (*src < minNoMaybeLead) {
1889	++src;
1890	} else {
1891	prevSrc = src;
1892	UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
1893	if (!isCompYesAndZeroCC(norm16)) {
1894	break;
1895	}
1896	}
1897	}
1898	// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1899	// The current character is either a "noNo" (has a mapping)
1900	// or a "maybeYes" (combines backward)
1901	// or a "yesYes" with ccc!=0.
1902	// It is not a Hangul syllable or Jamo L because those have "yes" properties.
1903
1904	// Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1905	if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes
1906	if (sink == nullptr) {
1907	return false;
1908	}
1909	// Fast path for mapping a character that is immediately surrounded by boundaries.
1910	// In this case, we need not decompose around the current character.
1911	if (isDecompNoAlgorithmic(norm16)) {
1912	// Maps to a single isCompYesAndZeroCC character
1913	// which also implies hasCompBoundaryBefore.
1914	if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) \|\|
1915	hasCompBoundaryBefore(src, limit)) {
1916	if (prevBoundary != prevSrc &&
1917	!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1918	*sink, options, edits, errorCode)) {
1919	break;
1920	}
1921	appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
1922	prevBoundary = src;
1923	continue;
1924	}
1925	} else if (norm16 < minNoNoCompBoundaryBefore) {
1926	// The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1927	if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) \|\|
1928	hasCompBoundaryBefore(src, limit)) {
1929	if (prevBoundary != prevSrc &&
1930	!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1931	*sink, options, edits, errorCode)) {
1932	break;
1933	}
1934	const uint16_t *mapping = getMapping(norm16);
1935	int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1936	if (!ByteSinkUtil::appendChange(prevSrc, src, (const char16_t *)mapping, length,
1937	*sink, edits, errorCode)) {
1938	break;
1939	}
1940	prevBoundary = src;
1941	continue;
1942	}
1943	} else if (norm16 >= minNoNoEmpty) {
1944	// The current character maps to nothing.
1945	// Simply omit it from the output if there is a boundary before _or_ after it.
1946	// The character itself implies no boundaries.
1947	if (hasCompBoundaryBefore(src, limit) \|\|
1948	hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1949	if (prevBoundary != prevSrc &&
1950	!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1951	*sink, options, edits, errorCode)) {
1952	break;
1953	}
1954	if (edits != nullptr) {
1955	edits->addReplace((int32_t)(src - prevSrc), `0`);
1956	}
1957	prevBoundary = src;
1958	continue;
1959	}
1960	}
1961	// Other "noNo" type, or need to examine more text around this character:
1962	// Fall through to the slow path.
1963	} else if (isJamoVT(norm16)) {
1964	// Jamo L: E1 84 80..92
1965	// Jamo V: E1 85 A1..B5
1966	// Jamo T: E1 86 A8..E1 87 82
1967	U_ASSERT((src - prevSrc) == `3` && *prevSrc == `0xe1`);
1968	UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
1969	if (prevSrc[`1`] == `0x85`) {
1970	// The current character is a Jamo Vowel,
1971	// compose with previous Jamo L and following Jamo T.
1972	UChar32 l = prev - Hangul::JAMO_L_BASE;
1973	if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
1974	if (sink == nullptr) {
1975	return false;
1976	}
1977	int32_t t = getJamoTMinusBase(src, limit);
1978	if (t >= `0`) {
1979	// The next character is a Jamo T.
1980	src += `3`;
1981	} else if (hasCompBoundaryBefore(src, limit)) {
1982	// No Jamo T follows, not even via decomposition.
1983	t = `0`;
1984	}
1985	if (t >= `0`) {
1986	UChar32 syllable = Hangul::HANGUL_BASE +
1987	(lHangul::JAMO_V_COUNT + (prevSrc[`2`]-`0xa1`))
1988	Hangul::JAMO_T_COUNT + t;
1989	prevSrc -= `3`; // Replace the Jamo L as well.
1990	if (prevBoundary != prevSrc &&
1991	!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1992	*sink, options, edits, errorCode)) {
1993	break;
1994	}
1995	ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1996	prevBoundary = src;
1997	continue;
1998	}
1999	// If we see L+V+x where x!=T then we drop to the slow path,
2000	// decompose and recompose.
2001	// This is to deal with NFKC finding normal L and V but a
2002	// compatibility variant of a T.
2003	// We need to either fully compose that combination here
2004	// (which would complicate the code and may not work with strange custom data)
2005	// or use the slow path.
2006	}
2007	} else if (Hangul::isHangulLV(prev)) {
2008	// The current character is a Jamo Trailing consonant,
2009	// compose with previous Hangul LV that does not contain a Jamo T.
2010	if (sink == nullptr) {
2011	return false;
2012	}
2013	UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
2014	prevSrc -= `3`; // Replace the Hangul LV as well.
2015	if (prevBoundary != prevSrc &&
2016	!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2017	*sink, options, edits, errorCode)) {
2018	break;
2019	}
2020	ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
2021	prevBoundary = src;
2022	continue;
2023	}
2024	// No matching context, or may need to decompose surrounding text first:
2025	// Fall through to the slow path.
2026	} else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC
2027	// One or more combining marks that do not combine-back:
2028	// Check for canonical order, copy unchanged if ok and
2029	// if followed by a character with a boundary-before.
2030	uint8_t cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0
2031	if (onlyContiguous / FCC / && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
2032	// Fails FCD test, need to decompose and contiguously recompose.
2033	if (sink == nullptr) {
2034	return false;
2035	}
2036	} else {
2037	// If !onlyContiguous (not FCC), then we ignore the tccc of
2038	// the previous character which passed the quick check "yes && ccc==0" test.
2039	const uint8_t *nextSrc;
2040	uint16_t n16;
2041	for (;;) {
2042	if (src == limit) {
2043	if (sink != nullptr) {
2044	ByteSinkUtil::appendUnchanged(prevBoundary, limit,
2045	*sink, options, edits, errorCode);
2046	}
2047	return true;
2048	}
2049	uint8_t prevCC = cc;
2050	nextSrc = src;
2051	UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);
2052	if (n16 >= MIN_YES_YES_WITH_CC) {
2053	cc = getCCFromNormalYesOrMaybe(n16);
2054	if (prevCC > cc) {
2055	if (sink == nullptr) {
2056	return false;
2057	}
2058	break;
2059	}
2060	} else {
2061	break;
2062	}
2063	src = nextSrc;
2064	}
2065	// src is after the last in-order combining mark.
2066	// If there is a boundary here, then we continue with no change.
2067	if (norm16HasCompBoundaryBefore(n16)) {
2068	if (isCompYesAndZeroCC(n16)) {
2069	src = nextSrc;
2070	}
2071	continue;
2072	}
2073	// Use the slow path. There is no boundary in [prevSrc, src[.
2074	}
2075	}
2076
2077	// Slow path: Find the nearest boundaries around the current character,
2078	// decompose and recompose.
2079	if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
2080	const uint8_t *p = prevSrc;
2081	UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
2082	if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2083	prevSrc = p;
2084	}
2085	}
2086	ReorderingBuffer buffer(*this, s16, errorCode);
2087	if (U_FAILURE(errorCode)) {
2088	break;
2089	}
2090	// We know there is not a boundary here.
2091	decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
2092	buffer, errorCode);
2093	// Decompose until the next boundary.
2094	src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
2095	buffer, errorCode);
2096	if (U_FAILURE(errorCode)) {
2097	break;
2098	}
2099	if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals()
2100	errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
2101	return true;
2102	}
2103	recompose(buffer, `0`, onlyContiguous);
2104	if (!buffer.equals(prevSrc, src)) {
2105	if (sink == nullptr) {
2106	return false;
2107	}
2108	if (prevBoundary != prevSrc &&
2109	!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2110	*sink, options, edits, errorCode)) {
2111	break;
2112	}
2113	if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
2114	*sink, edits, errorCode)) {
2115	break;
2116	}
2117	prevBoundary = src;
2118	}
2119	}
2120	return true;
2121	}
2122
2123	UBool Normalizer2Impl::hasCompBoundaryBefore(const char16_t src, const* char16_t limit) const* {
2124	if (src == limit \|\| *src < minCompNoMaybeCP) {
2125	return true;
2126	}
2127	UChar32 c;
2128	uint16_t norm16;
2129	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
2130	return norm16HasCompBoundaryBefore(norm16);
2131	}
2132
2133	UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t src, const* uint8_t limit) const* {
2134	if (src == limit) {
2135	return true;
2136	}
2137	uint16_t norm16;
2138	UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
2139	return norm16HasCompBoundaryBefore(norm16);
2140	}
2141
2142	UBool Normalizer2Impl::hasCompBoundaryAfter(const char16_t start, const* char16_t *p,
2143	UBool onlyContiguous) const {
2144	if (start == p) {
2145	return true;
2146	}
2147	UChar32 c;
2148	uint16_t norm16;
2149	UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2150	return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2151	}
2152
2153	UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t start, const* uint8_t *p,
2154	UBool onlyContiguous) const {
2155	if (start == p) {
2156	return true;
2157	}
2158	uint16_t norm16;
2159	UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);
2160	return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2161	}
2162
2163	const char16_t Normalizer2Impl::findPreviousCompBoundary(const* char16_t start, const* char16_t *p,
2164	UBool onlyContiguous) const {
2165	while (p != start) {
2166	const char16_t *codePointLimit = p;
2167	UChar32 c;
2168	uint16_t norm16;
2169	UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2170	if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2171	return codePointLimit;
2172	}
2173	if (hasCompBoundaryBefore(c, norm16)) {
2174	return p;
2175	}
2176	}
2177	return p;
2178	}
2179
2180	const char16_t Normalizer2Impl::findNextCompBoundary(const* char16_t p, const* char16_t *limit,
2181	UBool onlyContiguous) const {
2182	while (p != limit) {
2183	const char16_t *codePointStart = p;
2184	UChar32 c;
2185	uint16_t norm16;
2186	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
2187	if (hasCompBoundaryBefore(c, norm16)) {
2188	return codePointStart;
2189	}
2190	if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2191	return p;
2192	}
2193	}
2194	return p;
2195	}
2196
2197	uint8_t Normalizer2Impl::getPreviousTrailCC(const char16_t start, const* char16_t p) const* {
2198	if (start == p) {
2199	return `0`;
2200	}
2201	int32_t i = (int32_t)(p - start);
2202	UChar32 c;
2203	U16_PREV(start, `0`, i, c);
2204	return (uint8_t)getFCD16(c);
2205	}
2206
2207	uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t start, const* uint8_t p) const* {
2208	if (start == p) {
2209	return `0`;
2210	}
2211	int32_t i = (int32_t)(p - start);
2212	UChar32 c;
2213	U8_PREV(start, `0`, i, c);
2214	return (uint8_t)getFCD16(c);
2215	}
2216
2217	// Note: normalizer2impl.cpp r30982 (2011-nov-27)
2218	// still had getFCDTrie() which built and cached an FCD trie.
2219	// That provided faster access to FCD data than getFCD16FromNormData()
2220	// but required synchronization and consumed some 10kB of heap memory
2221	// in any process that uses FCD (e.g., via collation).
2222	// minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2223	// at least for ASCII & CJK.
2224
2225	// Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
2226	// function on Windows ARM64. As a work-around, we disable optimizations for this function.
2227	// This work-around could/should be removed once the following versions of Visual Studio are no
2228	// longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
2229	#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2230	#pragma optimize( "", off )
2231	#endif
2232	// Gets the FCD value from the regular normalization data.
2233	uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2234	uint16_t norm16=getNorm16(c);
2235	if (norm16 >= limitNoNo) {
2236	if(norm16>=MIN_NORMAL_MAYBE_YES) {
2237	// combining mark
2238	norm16=getCCFromNormalYesOrMaybe(norm16);
2239	return norm16\|(norm16<<`8`);
2240	} else if(norm16>=minMaybeYes) {
2241	return `0`;
2242	} else { // isDecompNoAlgorithmic(norm16)
2243	uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2244	if (deltaTrailCC <= DELTA_TCCC_1) {
2245	return deltaTrailCC >> OFFSET_SHIFT;
2246	}
2247	// Maps to an isCompYesAndZeroCC.
2248	c=mapAlgorithmic(c, norm16);
2249	norm16=getRawNorm16(c);
2250	}
2251	}
2252	if(norm16<=minYesNo \|\| isHangulLVT(norm16)) {
2253	// no decomposition or Hangul syllable, all zeros
2254	return `0`;
2255	}
2256	// c decomposes, get everything from the variable-length extra data
2257	const uint16_t *mapping=getMapping(norm16);
2258	uint16_t firstUnit=*mapping;
2259	norm16=firstUnit>>`8`; // tccc
2260	if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2261	norm16\|=(mapping-`1`)&`0xff00`; // lccc*
2262	}
2263	return norm16;
2264	}
2265	#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2266	#pragma optimize( "", on )
2267	#endif
2268
2269	// Dual functionality:
2270	// buffer!=nullptr: normalize
2271	// buffer==nullptr: isNormalized/quickCheck/spanQuickCheckYes
2272	const char16_t *
2273	Normalizer2Impl::makeFCD(const char16_t src, const* char16_t *limit,
2274	ReorderingBuffer *buffer,
2275	UErrorCode &errorCode) const {
2276	// Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2277	// Similar to the prevBoundary in the compose() implementation.
2278	const char16_t *prevBoundary=src;
2279	int32_t prevFCD16=`0`;
2280	if(limit==nullptr) {
2281	src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
2282	if(U_FAILURE(errorCode)) {
2283	return src;
2284	}
2285	if(prevBoundary<src) {
2286	prevBoundary=src;
2287	// We know that the previous character's lccc==0.
2288	// Fetching the fcd16 value was deferred for this below-U+0300 code point.
2289	prevFCD16=getFCD16(*(src-`1`));
2290	if(prevFCD16>`1`) {
2291	--prevBoundary;
2292	}
2293	}
2294	limit=u_strchr(src, `0`);
2295	}
2296
2297	// Note: In this function we use buffer->appendZeroCC() because we track
2298	// the lead and trail combining classes here, rather than leaving it to
2299	// the ReorderingBuffer.
2300	// The exception is the call to decomposeShort() which uses the buffer
2301	// in the normal way.
2302
2303	const char16_t *prevSrc;
2304	UChar32 c=`0`;
2305	uint16_t fcd16=`0`;
2306
2307	for(;;) {
2308	// count code units with lccc==0
2309	for(prevSrc=src; src!=limit;) {
2310	if((c=*src)<minLcccCP) {
2311	prevFCD16=~c;
2312	++src;
2313	} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
2314	prevFCD16=`0`;
2315	++src;
2316	} else {
2317	if(U16_IS_LEAD(c)) {
2318	char16_t c2;
2319	if((src+`1`)!=limit && U16_IS_TRAIL(c2=src[`1`])) {
2320	c=U16_GET_SUPPLEMENTARY(c, c2);
2321	}
2322	}
2323	if((fcd16=getFCD16FromNormData(c))<=`0xff`) {
2324	prevFCD16=fcd16;
2325	src+=U16_LENGTH(c);
2326	} else {
2327	break;
2328	}
2329	}
2330	}
2331	// copy these code units all at once
2332	if(src!=prevSrc) {
2333	if(buffer!=nullptr && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
2334	break;
2335	}
2336	if(src==limit) {
2337	break;
2338	}
2339	prevBoundary=src;
2340	// We know that the previous character's lccc==0.
2341	if(prevFCD16<`0`) {
2342	// Fetching the fcd16 value was deferred for this below-minLcccCP code point.
2343	UChar32 prev=~prevFCD16;
2344	if(prev<minDecompNoCP) {
2345	prevFCD16=`0`;
2346	} else {
2347	prevFCD16=getFCD16FromNormData(prev);
2348	if(prevFCD16>`1`) {
2349	--prevBoundary;
2350	}
2351	}
2352	} else {
2353	const char16_t *p=src-`1`;
2354	if(U16_IS_TRAIL(p) && prevSrc<p && U16_IS_LEAD((p-`1`))) {
2355	--p;
2356	// Need to fetch the previous character's FCD value because
2357	// prevFCD16 was just for the trail surrogate code point.
2358	prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[`0`], p[`1`]));
2359	// Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2360	}
2361	if(prevFCD16>`1`) {
2362	prevBoundary=p;
2363	}
2364	}
2365	// The start of the current character (c).
2366	prevSrc=src;
2367	} else if(src==limit) {
2368	break;
2369	}
2370
2371	src+=U16_LENGTH(c);
2372	// The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2373	// Check for proper order, and decompose locally if necessary.
2374	if((prevFCD16&`0xff`)<=(fcd16>>`8`)) {
2375	// proper order: prev tccc <= current lccc
2376	if((fcd16&`0xff`)<=`1`) {
2377	prevBoundary=src;
2378	}
2379	if(buffer!=nullptr && !buffer->appendZeroCC(c, errorCode)) {
2380	break;
2381	}
2382	prevFCD16=fcd16;
2383	continue;
2384	} else if(buffer==nullptr) {
2385	return prevBoundary; // quick check "no"
2386	} else {
2387	/*
2388	* Back out the part of the source that we copied or appended
2389	* already but is now going to be decomposed.
2390	* prevSrc is set to after what was copied/appended.
2391	*/
2392	buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
2393	/*
2394	* Find the part of the source that needs to be decomposed,
2395	* up to the next safe boundary.
2396	*/
2397	src=findNextFCDBoundary(src, limit);
2398	/*
2399	* The source text does not fulfill the conditions for FCD.
2400	* Decompose and reorder a limited piece of the text.
2401	*/
2402	decomposeShort(prevBoundary, src, false, false, *buffer, errorCode);
2403	if (U_FAILURE(errorCode)) {
2404	break;
2405	}
2406	prevBoundary=src;
2407	prevFCD16=`0`;
2408	}
2409	}
2410	return src;
2411	}
2412
2413	void Normalizer2Impl::makeFCDAndAppend(const char16_t src, const* char16_t *limit,
2414	UBool doMakeFCD,
2415	UnicodeString &safeMiddle,
2416	ReorderingBuffer &buffer,
2417	UErrorCode &errorCode) const {
2418	if(!buffer.isEmpty()) {
2419	const char16_t *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2420	if(src!=firstBoundaryInSrc) {
2421	const char16_t *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2422	buffer.getLimit());
2423	int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
2424	UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2425	buffer.removeSuffix(destSuffixLength);
2426	safeMiddle =middle;
2427	middle.append(src, (int32_t)(firstBoundaryInSrc-src));
2428	const char16_t *middleStart=middle.getBuffer();
2429	makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2430	if(U_FAILURE(errorCode)) {
2431	return;
2432	}
2433	src=firstBoundaryInSrc;
2434	}
2435	}
2436	if(doMakeFCD) {
2437	makeFCD(src, limit, &buffer, errorCode);
2438	} else {
2439	if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr
2440	limit=u_strchr(src, `0`);
2441	}
2442	buffer.appendZeroCC(src, limit, errorCode);
2443	}
2444	}
2445
2446	const char16_t Normalizer2Impl::findPreviousFCDBoundary(const* char16_t start, const* char16_t p) const* {
2447	while(start<p) {
2448	const char16_t *codePointLimit = p;
2449	UChar32 c;
2450	uint16_t norm16;
2451	UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2452	if (c < minDecompNoCP \|\| norm16HasDecompBoundaryAfter(norm16)) {
2453	return codePointLimit;
2454	}
2455	if (norm16HasDecompBoundaryBefore(norm16)) {
2456	return p;
2457	}
2458	}
2459	return p;
2460	}
2461
2462	const char16_t Normalizer2Impl::findNextFCDBoundary(const* char16_t p, const* char16_t limit) const* {
2463	while(p<limit) {
2464	const char16_t *codePointStart=p;
2465	UChar32 c;
2466	uint16_t norm16;
2467	UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
2468	if (c < minLcccCP \|\| norm16HasDecompBoundaryBefore(norm16)) {
2469	return codePointStart;
2470	}
2471	if (norm16HasDecompBoundaryAfter(norm16)) {
2472	return p;
2473	}
2474	}
2475	return p;
2476	}
2477
2478	// CanonicalIterator data -------------------------------------------------- ***
2479
2480	CanonIterData::CanonIterData(UErrorCode &errorCode) :
2481	mutableTrie(umutablecptrie_open(`0`, `0`, &errorCode)), trie(nullptr),
2482	canonStartSets (uprv_deleteUObject, nullptr, errorCode) {}
2483
2484	CanonIterData::~CanonIterData() {
2485	umutablecptrie_close(mutableTrie);
2486	ucptrie_close(trie);
2487	}
2488
2489	void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2490	uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);
2491	if((canonValue&(CANON_HAS_SET\|CANON_VALUE_MASK))==`0` && origin!=`0`) {
2492	// origin is the first character whose decomposition starts with
2493	// the character for which we are setting the value.
2494	umutablecptrie_set(mutableTrie, decompLead, canonValue\|origin, &errorCode);
2495	} else {
2496	// origin is not the first character, or it is U+0000.
2497	UnicodeSet *set;
2498	if((canonValue&CANON_HAS_SET)==`0`) {
2499	LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode);
2500	set=lpSet.getAlias();
2501	if(U_FAILURE(errorCode)) {
2502	return;
2503	}
2504	UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
2505	canonValue=(canonValue&~CANON_VALUE_MASK)\|CANON_HAS_SET\|(uint32_t)canonStartSets.size();
2506	umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
2507	canonStartSets.adoptElement(lpSet.orphan(), errorCode);
2508	if (U_FAILURE(errorCode)) {
2509	return;
2510	}
2511	if(firstOrigin!=`0`) {
2512	set->add(firstOrigin);
2513	}
2514	} else {
2515	set=(UnicodeSet *)canonStartSets [(int32_t)(canonValue&CANON_VALUE_MASK)];
2516	}
2517	set->add(origin);
2518	}
2519	}
2520
2521	// C++ class for friend access to private Normalizer2Impl members.
2522	class InitCanonIterData {
2523	public:
2524	static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
2525	};
2526
2527	U_CDECL_BEGIN
2528
2529	// UInitOnce instantiation function for CanonIterData
2530	static void U_CALLCONV
2531	initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
2532	InitCanonIterData::doInit(impl, errorCode);
2533	}
2534
2535	U_CDECL_END
2536
2537	void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2538	U_ASSERT(impl->fCanonIterData == nullptr);
2539	impl->fCanonIterData = new CanonIterData (errorCode);
2540	if (impl->fCanonIterData == nullptr) {
2541	errorCode=U_MEMORY_ALLOCATION_ERROR;
2542	}
2543	if (U_SUCCESS(errorCode)) {
2544	UChar32 start = `0`, end;
2545	uint32_t value;
2546	while ((end = ucptrie_getRange(impl->normTrie, start,
2547	UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
2548	nullptr, nullptr, &value)) >= `0`) {
2549	// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2550	if (value != Normalizer2Impl::INERT) {
2551	impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2552	}
2553	start = end + `1`;
2554	}
2555	#ifdef UCPTRIE_DEBUG
2556	umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
2557	#endif
2558	impl->fCanonIterData->trie = umutablecptrie_buildImmutable(
2559	impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
2560	umutablecptrie_close(impl->fCanonIterData->mutableTrie);
2561	impl->fCanonIterData->mutableTrie = nullptr;
2562	}
2563	if (U_FAILURE(errorCode)) {
2564	delete impl->fCanonIterData;
2565	impl->fCanonIterData = nullptr;
2566	}
2567	}
2568
2569	void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
2570	CanonIterData &newData,
2571	UErrorCode &errorCode) const {
2572	if(isInert(norm16) \|\| (minYesNo<=norm16 && norm16<minNoNo)) {
2573	// Inert, or 2-way mapping (including Hangul syllable).
2574	// We do not write a canonStartSet for any yesNo character.
2575	// Composites from 2-way mappings are added at runtime from the
2576	// starter's compositions list, and the other characters in
2577	// 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2578	// "maybe" characters.
2579	return;
2580	}
2581	for(UChar32 c=start; c<=end; ++c) {
2582	uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c);
2583	uint32_t newValue=oldValue;
2584	if(isMaybeOrNonZeroCC(norm16)) {
2585	// not a segment starter if it occurs in a decomposition or has cc!=0
2586	newValue\|=CANON_NOT_SEGMENT_STARTER;
2587	if(norm16<MIN_NORMAL_MAYBE_YES) {
2588	newValue\|=CANON_HAS_COMPOSITIONS;
2589	}
2590	} else if(norm16<minYesNo) {
2591	newValue\|=CANON_HAS_COMPOSITIONS;
2592	} else {
2593	// c has a one-way decomposition
2594	UChar32 c2=c;
2595	// Do not modify the whole-range norm16 value.
2596	uint16_t norm16_2=norm16;
2597	if (isDecompNoAlgorithmic(norm16_2)) {
2598	// Maps to an isCompYesAndZeroCC.
2599	c2 = mapAlgorithmic(c2, norm16_2);
2600	norm16_2 = getRawNorm16(c2);
2601	// No compatibility mappings for the CanonicalIterator.
2602	U_ASSERT(!(isHangulLV(norm16_2) \|\| isHangulLVT(norm16_2)));
2603	}
2604	if (norm16_2 > minYesNo) {
2605	// c decomposes, get everything from the variable-length extra data
2606	const uint16_t *mapping=getMapping(norm16_2);
2607	uint16_t firstUnit=*mapping;
2608	int32_t length=firstUnit&MAPPING_LENGTH_MASK;
2609	if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=`0`) {
2610	if(c==c2 && (*(mapping-`1`)&`0xff`)!=`0`) {
2611	newValue\|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
2612	}
2613	}
2614	// Skip empty mappings (no characters in the decomposition).
2615	if(length!=`0`) {
2616	++mapping; // skip over the firstUnit
2617	// add c to first code point's start set
2618	int32_t i=`0`;
2619	U16_NEXT_UNSAFE(mapping, i, c2);
2620	newData.addToStartSet(c, c2, errorCode);
2621	// Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2622	// one-way mapping. A 2-way mapping is possible here after
2623	// intermediate algorithmic mapping.
2624	if(norm16_2>=minNoNo) {
2625	while(i<length) {
2626	U16_NEXT_UNSAFE(mapping, i, c2);
2627	uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2);
2628	if((c2Value&CANON_NOT_SEGMENT_STARTER)==`0`) {
2629	umutablecptrie_set(newData.mutableTrie, c2,
2630	c2Value\|CANON_NOT_SEGMENT_STARTER, &errorCode);
2631	}
2632	}
2633	}
2634	}
2635	} else {
2636	// c decomposed to c2 algorithmically; c has cc==0
2637	newData.addToStartSet(c, c2, errorCode);
2638	}
2639	}
2640	if(newValue!=oldValue) {
2641	umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode);
2642	}
2643	}
2644	}
2645
2646	UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2647	// Logically const: Synchronized instantiation.
2648	Normalizer2Impl me=const_cast<Normalizer2Impl >(this);
2649	umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2650	return U_SUCCESS(errorCode);
2651	}
2652
2653	int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2654	return (int32_t)ucptrie_get(fCanonIterData->trie, c);
2655	}
2656
2657	const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2658	return (const* UnicodeSet *)fCanonIterData->canonStartSets [n];
2659	}
2660
2661	UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2662	return getCanonValue(c)>=`0`;
2663	}
2664
2665	UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2666	int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
2667	if(canonValue==`0`) {
2668	return false;
2669	}
2670	set.clear();
2671	int32_t value=canonValue&CANON_VALUE_MASK;
2672	if((canonValue&CANON_HAS_SET)!=`0`) {
2673	set.addAll(getCanonStartSet(value));
2674	} else if(value!=`0`) {
2675	set.add(value);
2676	}
2677	if((canonValue&CANON_HAS_COMPOSITIONS)!=`0`) {
2678	uint16_t norm16=getRawNorm16(c);
2679	if(norm16==JAMO_L) {
2680	UChar32 syllable=
2681	(UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
2682	set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-`1`);
2683	} else {
2684	addComposites(getCompositionsList(norm16), set);
2685	}
2686	}
2687	return true;
2688	}
2689
2690	U_NAMESPACE_END
2691
2692	// Normalizer2 data swapping ----------------------------------------------- ***
2693
2694	U_NAMESPACE_USE
2695
2696	U_CAPI int32_t U_EXPORT2
2697	unorm2_swap(const UDataSwapper *ds,
2698	const void inData, int32_t length, void* *outData,
2699	UErrorCode *pErrorCode) {
2700	const UDataInfo *pInfo;
2701	int32_t headerSize;
2702
2703	const uint8_t *inBytes;
2704	uint8_t *outBytes;
2705
2706	const int32_t *inIndexes;
2707	int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+`1`];
2708
2709	int32_t i, offset, nextOffset, size;
2710
2711	/ udata_swapDataHeader checks the arguments /
2712	headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2713	if(pErrorCode==nullptr \|\| U_FAILURE(*pErrorCode)) {
2714	return `0`;
2715	}
2716
2717	/ check data format and format version /
2718	pInfo=(const UDataInfo )((const* char *)inData+`4`);
2719	uint8_t formatVersion0=pInfo->formatVersion[`0`];
2720	if(!(
2721	pInfo->dataFormat[`0`]==`0x4e` && / dataFormat="Nrm2" /
2722	pInfo->dataFormat[`1`]==`0x72` &&
2723	pInfo->dataFormat[`2`]==`0x6d` &&
2724	pInfo->dataFormat[`3`]==`0x32` &&
2725	(`1`<=formatVersion0 && formatVersion0<=`4`)
2726	)) {
2727	udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2728	pInfo->dataFormat[`0`], pInfo->dataFormat[`1`],
2729	pInfo->dataFormat[`2`], pInfo->dataFormat[`3`],
2730	pInfo->formatVersion[`0`]);
2731	*pErrorCode=U_UNSUPPORTED_ERROR;
2732	return `0`;
2733	}
2734
2735	inBytes=(const uint8_t *)inData+headerSize;
2736	outBytes=(outData == nullptr) ? nullptr : (uint8_t *)outData+headerSize;
2737
2738	inIndexes=(const int32_t *)inBytes;
2739	int32_t minIndexesLength;
2740	if(formatVersion0==`1`) {
2741	minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+`1`;
2742	} else if(formatVersion0==`2`) {
2743	minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+`1`;
2744	} else {
2745	minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+`1`;
2746	}
2747
2748	if(length>=`0`) {
2749	length-=headerSize;
2750	if(length<minIndexesLength*`4`) {
2751	udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2752	length);
2753	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2754	return `0`;
2755	}
2756	}
2757
2758	/ read the first few indexes /
2759	for(i=`0`; i<UPRV_LENGTHOF(indexes); ++i) {
2760	indexes[i]=udata_readInt32(ds, inIndexes[i]);
2761	}
2762
2763	/ get the total length of the data /
2764	size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2765
2766	if(length>=`0`) {
2767	if(length<size) {
2768	udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2769	length);
2770	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2771	return `0`;
2772	}
2773
2774	/ copy the data for inaccessible bytes /
2775	if(inBytes!=outBytes) {
2776	uprv_memcpy(outBytes, inBytes, size);
2777	}
2778
2779	offset=`0`;
2780
2781	/ swap the int32_t indexes[] /
2782	nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2783	ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2784	offset=nextOffset;
2785
2786	/ swap the trie /
2787	nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2788	utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2789	offset=nextOffset;
2790
2791	/ swap the uint16_t extraData[] /
2792	nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2793	ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2794	offset=nextOffset;
2795
2796	/ no need to swap the uint8_t smallFCD[] (new in formatVersion 2) /
2797	nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+`1`];
2798	offset=nextOffset;
2799
2800	U_ASSERT(offset==size);
2801	}
2802
2803	return headerSize+size;
2804	}
2805
2806	#endif // !UCONFIG_NO_NORMALIZATION
2807

Browse the source code of Godot/thirdparty/icu4c/common/normalizer2impl.cpp