utf8collationiterator.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/utf8collationiterator.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2012-2014, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* utf8collationiterator.cpp
9	*
10	* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
11	* created by: Markus W. Scherer
12	*/
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_COLLATION
17
18	#include "unicode/utf8.h"
19	#include "charstr.h"
20	#include "cmemory.h"
21	#include "collation.h"
22	#include "collationdata.h"
23	#include "collationfcd.h"
24	#include "collationiterator.h"
25	#include "normalizer2impl.h"
26	#include "uassert.h"
27	#include "utf8collationiterator.h"
28
29	U_NAMESPACE_BEGIN
30
31	UTF8CollationIterator::~UTF8CollationIterator() {}
32
33	void
34	UTF8CollationIterator::resetToOffset(int32_t newOffset) {
35	reset();
36	pos = newOffset;
37	}
38
39	int32_t
40	UTF8CollationIterator::getOffset() const {
41	return pos;
42	}
43
44	uint32_t
45	UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /errorCode/) {
46	if(pos == length) {
47	c = U_SENTINEL;
48	return Collation::FALLBACK_CE32;
49	}
50	// Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
51	c = u8[pos++];
52	if(U8_IS_SINGLE(c)) {
53	// ASCII 00..7F
54	return trie->data32[c];
55	}
56	uint8_t t1, t2;
57	if(`0xe0` <= c && c < `0xf0` &&
58	((pos + `1`) < length \|\| length < `0`) &&
59	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
60	(t2 = (u8[pos + `1`] - `0x80`)) <= `0x3f`) {
61	// U+0800..U+FFFF except surrogates
62	c = (((c & `0xf`) << `12`) \| ((t1 & `0x3f`) << `6`) \| t2);
63	pos += `2`;
64	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
65	} else if(c < `0xe0` && c >= `0xc2` && pos != length && (t1 = (u8[pos] - `0x80`)) <= `0x3f`) {
66	// U+0080..U+07FF
67	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - `0xc0`) + c] + t1];
68	c = ((c & `0x1f`) << `6`) \| t1;
69	++pos;
70	return ce32;
71	} else {
72	// Function call for supplementary code points and error cases.
73	// Illegal byte sequences yield U+FFFD.
74	c = utf8_nextCharSafeBody(u8, &pos, length, c, -`3`);
75	return data->getCE32(c);
76	}
77	}
78
79	UBool
80	UTF8CollationIterator::foundNULTerminator() {
81	if(length < `0`) {
82	length = --pos;
83	return TRUE;
84	} else {
85	return FALSE;
86	}
87	}
88
89	UBool
90	UTF8CollationIterator::forbidSurrogateCodePoints() const {
91	return TRUE;
92	}
93
94	UChar32
95	UTF8CollationIterator::nextCodePoint(UErrorCode & /errorCode/) {
96	if(pos == length) {
97	return U_SENTINEL;
98	}
99	if(u8[pos] == `0` && length < `0`) {
100	length = pos;
101	return U_SENTINEL;
102	}
103	UChar32 c;
104	U8_NEXT_OR_FFFD(u8, pos, length, c);
105	return c;
106	}
107
108	UChar32
109	UTF8CollationIterator::previousCodePoint(UErrorCode & /errorCode/) {
110	if(pos == `0`) {
111	return U_SENTINEL;
112	}
113	UChar32 c;
114	U8_PREV_OR_FFFD(u8, `0`, pos, c);
115	return c;
116	}
117
118	void
119	UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /errorCode/) {
120	U8_FWD_N(u8, pos, length, num);
121	}
122
123	void
124	UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /errorCode/) {
125	U8_BACK_N(u8, `0`, pos, num);
126	}
127
128	// FCDUTF8CollationIterator ------------------------------------------------ ***
129
130	FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
131
132	void
133	FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
134	reset();
135	start = pos = newOffset;
136	state = CHECK_FWD;
137	}
138
139	int32_t
140	FCDUTF8CollationIterator::getOffset() const {
141	if(state != IN_NORMALIZED) {
142	return pos;
143	} else if(pos == `0`) {
144	return start;
145	} else {
146	return limit;
147	}
148	}
149
150	uint32_t
151	FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
152	for(;;) {
153	if(state == CHECK_FWD) {
154	// Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
155	if(pos == length) {
156	c = U_SENTINEL;
157	return Collation::FALLBACK_CE32;
158	}
159	c = u8[pos++];
160	if(U8_IS_SINGLE(c)) {
161	// ASCII 00..7F
162	return trie->data32[c];
163	}
164	uint8_t t1, t2;
165	if(`0xe0` <= c && c < `0xf0` &&
166	((pos + `1`) < length \|\| length < `0`) &&
167	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
168	(t2 = (u8[pos + `1`] - `0x80`)) <= `0x3f`) {
169	// U+0800..U+FFFF except surrogates
170	c = (((c & `0xf`) << `12`) \| ((t1 & `0x3f`) << `6`) \| t2);
171	pos += `2`;
172	if(CollationFCD::hasTccc(c) &&
173	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
174	(pos != length && nextHasLccc()))) {
175	pos -= `3`;
176	} else {
177	break; // return CE32(BMP)
178	}
179	} else if(c < `0xe0` && c >= `0xc2` && pos != length && (t1 = (u8[pos] - `0x80`)) <= `0x3f`) {
180	// U+0080..U+07FF
181	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - `0xc0`) + c] + t1];
182	c = ((c & `0x1f`) << `6`) \| t1;
183	++pos;
184	if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
185	pos -= `2`;
186	} else {
187	return ce32;
188	}
189	} else {
190	// Function call for supplementary code points and error cases.
191	// Illegal byte sequences yield U+FFFD.
192	c = utf8_nextCharSafeBody(u8, &pos, length, c, -`3`);
193	if(c == `0xfffd`) {
194	return Collation::FFFD_CE32;
195	} else {
196	U_ASSERT(c > `0xffff`);
197	if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
198	pos -= `4`;
199	} else {
200	return data->getCE32FromSupplementary(c);
201	}
202	}
203	}
204	if(!nextSegment(errorCode)) {
205	c = U_SENTINEL;
206	return Collation::FALLBACK_CE32;
207	}
208	continue;
209	} else if(state == IN_FCD_SEGMENT && pos != limit) {
210	return UTF8CollationIterator::handleNextCE32(c, errorCode);
211	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
212	c = normalized [pos++];
213	break;
214	} else {
215	switchToForward();
216	}
217	}
218	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
219	}
220
221	UBool
222	FCDUTF8CollationIterator::nextHasLccc() const {
223	U_ASSERT(state == CHECK_FWD && pos != length);
224	// The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
225	// CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
226	UChar32 c = u8[pos];
227	if(c < `0xcc` \|\| (`0xe4` <= c && c <= `0xed` && c != `0xea`)) { return FALSE; }
228	int32_t i = pos;
229	U8_NEXT_OR_FFFD(u8, i, length, c);
230	if(c > `0xffff`) { c = U16_LEAD(c); }
231	return CollationFCD::hasLccc(c);
232	}
233
234	UBool
235	FCDUTF8CollationIterator::previousHasTccc() const {
236	U_ASSERT(state == CHECK_BWD && pos != `0`);
237	UChar32 c = u8[pos - `1`];
238	if(U8_IS_SINGLE(c)) { return FALSE; }
239	int32_t i = pos;
240	U8_PREV_OR_FFFD(u8, `0`, i, c);
241	if(c > `0xffff`) { c = U16_LEAD(c); }
242	return CollationFCD::hasTccc(c);
243	}
244
245	UChar
246	FCDUTF8CollationIterator::handleGetTrailSurrogate() {
247	if(state != IN_NORMALIZED) { return `0`; }
248	U_ASSERT(pos < normalized.length());
249	UChar trail;
250	if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
251	return trail;
252	}
253
254	UBool
255	FCDUTF8CollationIterator::foundNULTerminator() {
256	if(state == CHECK_FWD && length < `0`) {
257	length = --pos;
258	return TRUE;
259	} else {
260	return FALSE;
261	}
262	}
263
264	UChar32
265	FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
266	UChar32 c;
267	for(;;) {
268	if(state == CHECK_FWD) {
269	if(pos == length \|\| ((c = u8[pos]) == `0` && length < `0`)) {
270	return U_SENTINEL;
271	}
272	if(U8_IS_SINGLE(c)) {
273	++pos;
274	return c;
275	}
276	U8_NEXT_OR_FFFD(u8, pos, length, c);
277	if(CollationFCD::hasTccc(c <= `0xffff` ? c : U16_LEAD(c)) &&
278	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
279	(pos != length && nextHasLccc()))) {
280	// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
281	// and we can use U8_LENGTH() rather than a previous-position variable.
282	pos -= U8_LENGTH(c);
283	if(!nextSegment(errorCode)) {
284	return U_SENTINEL;
285	}
286	continue;
287	}
288	return c;
289	} else if(state == IN_FCD_SEGMENT && pos != limit) {
290	U8_NEXT_OR_FFFD(u8, pos, length, c);
291	return c;
292	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
293	c = normalized.char32At(pos);
294	pos += U16_LENGTH(c);
295	return c;
296	} else {
297	switchToForward();
298	}
299	}
300	}
301
302	UChar32
303	FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
304	UChar32 c;
305	for(;;) {
306	if(state == CHECK_BWD) {
307	if(pos == `0`) {
308	return U_SENTINEL;
309	}
310	if(U8_IS_SINGLE(c = u8[pos - `1`])) {
311	--pos;
312	return c;
313	}
314	U8_PREV_OR_FFFD(u8, `0`, pos, c);
315	if(CollationFCD::hasLccc(c <= `0xffff` ? c : U16_LEAD(c)) &&
316	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
317	(pos != `0` && previousHasTccc()))) {
318	// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
319	// and we can use U8_LENGTH() rather than a previous-position variable.
320	pos += U8_LENGTH(c);
321	if(!previousSegment(errorCode)) {
322	return U_SENTINEL;
323	}
324	continue;
325	}
326	return c;
327	} else if(state == IN_FCD_SEGMENT && pos != start) {
328	U8_PREV_OR_FFFD(u8, `0`, pos, c);
329	return c;
330	} else if(state >= IN_NORMALIZED && pos != `0`) {
331	c = normalized.char32At(pos - `1`);
332	pos -= U16_LENGTH(c);
333	return c;
334	} else {
335	switchToBackward();
336	}
337	}
338	}
339
340	void
341	FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
342	// Specify the class to avoid a virtual-function indirection.
343	// In Java, we would declare this class final.
344	while(num > `0` && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= `0`) {
345	--num;
346	}
347	}
348
349	void
350	FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
351	// Specify the class to avoid a virtual-function indirection.
352	// In Java, we would declare this class final.
353	while(num > `0` && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= `0`) {
354	--num;
355	}
356	}
357
358	void
359	FCDUTF8CollationIterator::switchToForward() {
360	U_ASSERT(state == CHECK_BWD \|\|
361	(state == IN_FCD_SEGMENT && pos == limit) \|\|
362	(state == IN_NORMALIZED && pos == normalized.length()));
363	if(state == CHECK_BWD) {
364	// Turn around from backward checking.
365	start = pos;
366	if(pos == limit) {
367	state = CHECK_FWD; // Check forward.
368	} else { // pos < limit
369	state = IN_FCD_SEGMENT; // Stay in FCD segment.
370	}
371	} else {
372	// Reached the end of the FCD segment.
373	if(state == IN_FCD_SEGMENT) {
374	// The input text segment is FCD, extend it forward.
375	} else {
376	// The input text segment needed to be normalized.
377	// Switch to checking forward from it.
378	start = pos = limit;
379	}
380	state = CHECK_FWD;
381	}
382	}
383
384	UBool
385	FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
386	if(U_FAILURE(errorCode)) { return FALSE; }
387	U_ASSERT(state == CHECK_FWD && pos != length);
388	// The input text [start..pos[ passes the FCD check.
389	int32_t segmentStart = pos;
390	// Collect the characters being checked, in case they need to be normalized.
391	UnicodeString s;
392	uint8_t prevCC = `0`;
393	for(;;) {
394	// Fetch the next character and its fcd16 value.
395	int32_t cpStart = pos;
396	UChar32 c;
397	U8_NEXT_OR_FFFD(u8, pos, length, c);
398	uint16_t fcd16 = nfcImpl.getFCD16(c);
399	uint8_t leadCC = (uint8_t)(fcd16 >> `8`);
400	if(leadCC == `0` && cpStart != segmentStart) {
401	// FCD boundary before this character.
402	pos = cpStart;
403	break;
404	}
405	s.append(c);
406	if(leadCC != `0` && (prevCC > leadCC \|\| CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
407	// Fails FCD check. Find the next FCD boundary and normalize.
408	while(pos != length) {
409	cpStart = pos;
410	U8_NEXT_OR_FFFD(u8, pos, length, c);
411	if(nfcImpl.getFCD16(c) <= `0xff`) {
412	pos = cpStart;
413	break;
414	}
415	s.append(c);
416	}
417	if(!normalize(s, errorCode)) { return FALSE; }
418	start = segmentStart;
419	limit = pos;
420	state = IN_NORMALIZED;
421	pos = `0`;
422	return TRUE;
423	}
424	prevCC = (uint8_t)fcd16;
425	if(pos == length \|\| prevCC == `0`) {
426	// FCD boundary after the last character.
427	break;
428	}
429	}
430	limit = pos;
431	pos = segmentStart;
432	U_ASSERT(pos != limit);
433	state = IN_FCD_SEGMENT;
434	return TRUE;
435	}
436
437	void
438	FCDUTF8CollationIterator::switchToBackward() {
439	U_ASSERT(state == CHECK_FWD \|\|
440	(state == IN_FCD_SEGMENT && pos == start) \|\|
441	(state >= IN_NORMALIZED && pos == `0`));
442	if(state == CHECK_FWD) {
443	// Turn around from forward checking.
444	limit = pos;
445	if(pos == start) {
446	state = CHECK_BWD; // Check backward.
447	} else { // pos > start
448	state = IN_FCD_SEGMENT; // Stay in FCD segment.
449	}
450	} else {
451	// Reached the start of the FCD segment.
452	if(state == IN_FCD_SEGMENT) {
453	// The input text segment is FCD, extend it backward.
454	} else {
455	// The input text segment needed to be normalized.
456	// Switch to checking backward from it.
457	limit = pos = start;
458	}
459	state = CHECK_BWD;
460	}
461	}
462
463	UBool
464	FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
465	if(U_FAILURE(errorCode)) { return FALSE; }
466	U_ASSERT(state == CHECK_BWD && pos != `0`);
467	// The input text [pos..limit[ passes the FCD check.
468	int32_t segmentLimit = pos;
469	// Collect the characters being checked, in case they need to be normalized.
470	UnicodeString s;
471	uint8_t nextCC = `0`;
472	for(;;) {
473	// Fetch the previous character and its fcd16 value.
474	int32_t cpLimit = pos;
475	UChar32 c;
476	U8_PREV_OR_FFFD(u8, `0`, pos, c);
477	uint16_t fcd16 = nfcImpl.getFCD16(c);
478	uint8_t trailCC = (uint8_t)fcd16;
479	if(trailCC == `0` && cpLimit != segmentLimit) {
480	// FCD boundary after this character.
481	pos = cpLimit;
482	break;
483	}
484	s.append(c);
485	if(trailCC != `0` && ((nextCC != `0` && trailCC > nextCC) \|\|
486	CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
487	// Fails FCD check. Find the previous FCD boundary and normalize.
488	while(fcd16 > `0xff` && pos != `0`) {
489	cpLimit = pos;
490	U8_PREV_OR_FFFD(u8, `0`, pos, c);
491	fcd16 = nfcImpl.getFCD16(c);
492	if(fcd16 == `0`) {
493	pos = cpLimit;
494	break;
495	}
496	s.append(c);
497	}
498	s.reverse();
499	if(!normalize(s, errorCode)) { return FALSE; }
500	limit = segmentLimit;
501	start = pos;
502	state = IN_NORMALIZED;
503	pos = normalized.length();
504	return TRUE;
505	}
506	nextCC = (uint8_t)(fcd16 >> `8`);
507	if(pos == `0` \|\| nextCC == `0`) {
508	// FCD boundary before the following character.
509	break;
510	}
511	}
512	start = pos;
513	pos = segmentLimit;
514	U_ASSERT(pos != start);
515	state = IN_FCD_SEGMENT;
516	return TRUE;
517	}
518
519	UBool
520	FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
521	// NFD without argument checking.
522	U_ASSERT(U_SUCCESS(errorCode));
523	nfcImpl.decompose(s, normalized, errorCode);
524	return U_SUCCESS(errorCode);
525	}
526
527	U_NAMESPACE_END
528
529	#endif // !UCONFIG_NO_COLLATION
530

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/utf8collationiterator.cpp