unormcmp.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/unormcmp.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2001-2014, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: unormcmp.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2004sep13
16	* created by: Markus W. Scherer
17	*
18	* unorm_compare() function moved here from unorm.cpp for better modularization.
19	* Depends on both normalization and case folding.
20	* Allows unorm.cpp to not depend on any character properties code.
21	*/
22
23	#include "unicode/utypes.h"
24
25	#if !UCONFIG_NO_NORMALIZATION
26
27	#include "unicode/unorm.h"
28	#include "unicode/ustring.h"
29	#include "cmemory.h"
30	#include "normalizer2impl.h"
31	#include "ucase.h"
32	#include "uprops.h"
33	#include "ustr_imp.h"
34
35	U_NAMESPACE_USE
36
37	/ compare canonically equivalent ------------------------------------------- /
38
39	/*
40	* Compare two strings for canonical equivalence.
41	* Further options include case-insensitive comparison and
42	* code point order (as opposed to code unit order).
43	*
44	* In this function, canonical equivalence is optional as well.
45	* If canonical equivalence is tested, then both strings must fulfill
46	* the FCD check.
47	*
48	* Semantically, this is equivalent to
49	* strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
50	* where code point order, NFD and foldCase are all optional.
51	*
52	* String comparisons almost always yield results before processing both strings
53	* completely.
54	* They are generally more efficient working incrementally instead of
55	* performing the sub-processing (strlen, normalization, case-folding)
56	* on the entire strings first.
57	*
58	* It is also unnecessary to not normalize identical characters.
59	*
60	* This function works in principle as follows:
61	*
62	* loop {
63	* get one code unit c1 from s1 (-1 if end of source)
64	* get one code unit c2 from s2 (-1 if end of source)
65	*
66	* if(either string finished) {
67	* return result;
68	* }
69	* if(c1==c2) {
70	* continue;
71	* }
72	*
73	* // c1!=c2
74	* try to decompose/case-fold c1/c2, and continue if one does;
75	*
76	* // still c1!=c2 and neither decomposes/case-folds, return result
77	* return c1-c2;
78	* }
79	*
80	* When a character decomposes, then the pointer for that source changes to
81	* the decomposition, pushing the previous pointer onto a stack.
82	* When the end of the decomposition is reached, then the code unit reader
83	* pops the previous source from the stack.
84	* (Same for case-folding.)
85	*
86	* This is complicated further by operating on variable-width UTF-16.
87	* The top part of the loop works on code units, while lookups for decomposition
88	* and case-folding need code points.
89	* Code points are assembled after the equality/end-of-source part.
90	* The source pointer is only advanced beyond all code units when the code point
91	* actually decomposes/case-folds.
92	*
93	* If we were on a trail surrogate unit when assembling a code point,
94	* and the code point decomposes/case-folds, then the decomposition/folding
95	* result must be compared with the part of the other string that corresponds to
96	* this string's lead surrogate.
97	* Since we only assemble a code point when hitting a trail unit when the
98	* preceding lead units were identical, we back up the other string by one unit
99	* in such a case.
100	*
101	* The optional code point order comparison at the end works with
102	* the same fix-up as the other code point order comparison functions.
103	* See ustring.c and the comment near the end of this function.
104	*
105	* Assumption: A decomposition or case-folding result string never contains
106	* a single surrogate. This is a safe assumption in the Unicode Standard.
107	* Therefore, we do not need to check for surrogate pairs across
108	* decomposition/case-folding boundaries.
109	*
110	* Further assumptions (see verifications tstnorm.cpp):
111	* The API function checks for FCD first, while the core function
112	* first case-folds and then decomposes. This requires that case-folding does not
113	* un-FCD any strings.
114	*
115	* The API function may also NFD the input and turn off decomposition.
116	* This requires that case-folding does not un-NFD strings either.
117	*
118	* TODO If any of the above two assumptions is violated,
119	* then this entire code must be re-thought.
120	* If this happens, then a simple solution is to case-fold both strings up front
121	* and to turn off UNORM_INPUT_IS_FCD.
122	* We already do this when not both strings are in FCD because makeFCD
123	* would be a partial NFD before the case folding, which does not work.
124	* Note that all of this is only a problem when case-folding _and_
125	* canonical equivalence come together.
126	* (Comments in unorm_compare() are more up to date than this TODO.)
127	*/
128
129	/ stack element for previous-level source/decomposition pointers /
130	struct CmpEquivLevel {
131	const UChar start, s, *limit;
132	};
133	typedef struct CmpEquivLevel CmpEquivLevel;
134
135	/**
136	* Internal option for unorm_cmpEquivFold() for decomposing.
137	* If not set, just do strcasecmp().
138	*/
139	#define _COMPARE_EQUIV 0x80000
140
141	/ internal function /
142	static int32_t
143	unorm_cmpEquivFold(const UChar *s1, int32_t length1,
144	const UChar *s2, int32_t length2,
145	uint32_t options,
146	UErrorCode *pErrorCode) {
147	const Normalizer2Impl *nfcImpl;
148
149	/ current-level start/limit - s1/s2 as current /
150	const UChar start1, start2, limit1, limit2;
151
152	/ decomposition and case folding variables /
153	const UChar *p;
154	int32_t length;
155
156	/ stacks of previous-level start/current/limit /
157	CmpEquivLevel stack1[`2`], stack2[`2`];
158
159	/ buffers for algorithmic decompositions /
160	UChar decomp1[`4`], decomp2[`4`];
161
162	/ case folding buffers, only use current-level start/limit /
163	UChar fold1[UCASE_MAX_STRING_LENGTH+`1`], fold2[UCASE_MAX_STRING_LENGTH+`1`];
164
165	/ track which is the current level per string /
166	int32_t level1, level2;
167
168	/ current code units, and code points for lookups /
169	UChar32 c1, c2, cp1, cp2;
170
171	/ no argument error checking because this itself is not an API /
172
173	/*
174	* assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
175	* otherwise this function must behave exactly as uprv_strCompare()
176	* not checking for that here makes testing this function easier
177	*/
178
179	/ normalization/properties data loaded? /
180	if((options&_COMPARE_EQUIV)!=`0`) {
181	nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
182	} else {
183	nfcImpl=NULL;
184	}
185	if(U_FAILURE(*pErrorCode)) {
186	return `0`;
187	}
188
189	/ initialize /
190	start1=s1;
191	if(length1==-`1`) {
192	limit1=NULL;
193	} else {
194	limit1=s1+length1;
195	}
196
197	start2=s2;
198	if(length2==-`1`) {
199	limit2=NULL;
200	} else {
201	limit2=s2+length2;
202	}
203
204	level1=level2=`0`;
205	c1=c2=-`1`;
206
207	/ comparison loop /
208	for(;;) {
209	/*
210	* here a code unit value of -1 means "get another code unit"
211	* below it will mean "this source is finished"
212	*/
213
214	if(c1<`0`) {
215	/ get next code unit from string 1, post-increment /
216	for(;;) {
217	if(s1==limit1 \|\| ((c1=*s1)==`0` && (limit1==NULL \|\| (options&_STRNCMP_STYLE)))) {
218	if(level1==`0`) {
219	c1=-`1`;
220	break;
221	}
222	} else {
223	++s1;
224	break;
225	}
226
227	/ reached end of level buffer, pop one level /
228	do {
229	--level1;
230	start1=stack1[level1].start; /Not uninitialized/
231	} while(start1==NULL);
232	s1=stack1[level1].s; /Not uninitialized/
233	limit1=stack1[level1].limit; /Not uninitialized/
234	}
235	}
236
237	if(c2<`0`) {
238	/ get next code unit from string 2, post-increment /
239	for(;;) {
240	if(s2==limit2 \|\| ((c2=*s2)==`0` && (limit2==NULL \|\| (options&_STRNCMP_STYLE)))) {
241	if(level2==`0`) {
242	c2=-`1`;
243	break;
244	}
245	} else {
246	++s2;
247	break;
248	}
249
250	/ reached end of level buffer, pop one level /
251	do {
252	--level2;
253	start2=stack2[level2].start; /Not uninitialized/
254	} while(start2==NULL);
255	s2=stack2[level2].s; /Not uninitialized/
256	limit2=stack2[level2].limit; /Not uninitialized/
257	}
258	}
259
260	/*
261	* compare c1 and c2
262	* either variable c1, c2 is -1 only if the corresponding string is finished
263	*/
264	if(c1==c2) {
265	if(c1<`0`) {
266	return `0`; / c1==c2==-1 indicating end of strings /
267	}
268	c1=c2=-`1`; / make us fetch new code units /
269	continue;
270	} else if(c1<`0`) {
271	return -`1`; / string 1 ends before string 2 /
272	} else if(c2<`0`) {
273	return `1`; / string 2 ends before string 1 /
274	}
275	/ c1!=c2 && c1>=0 && c2>=0 /
276
277	/ get complete code points for c1, c2 for lookups if either is a surrogate /
278	cp1=c1;
279	if(U_IS_SURROGATE(c1)) {
280	UChar c;
281
282	if(U_IS_SURROGATE_LEAD(c1)) {
283	if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
284	/ advance ++s1; only below if cp1 decomposes/case-folds /
285	cp1=U16_GET_SUPPLEMENTARY(c1, c);
286	}
287	} else / isTrail(c1) / {
288	if(start1<=(s1-`2`) && U16_IS_LEAD(c=*(s1-`2`))) {
289	cp1=U16_GET_SUPPLEMENTARY(c, c1);
290	}
291	}
292	}
293
294	cp2=c2;
295	if(U_IS_SURROGATE(c2)) {
296	UChar c;
297
298	if(U_IS_SURROGATE_LEAD(c2)) {
299	if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
300	/ advance ++s2; only below if cp2 decomposes/case-folds /
301	cp2=U16_GET_SUPPLEMENTARY(c2, c);
302	}
303	} else / isTrail(c2) / {
304	if(start2<=(s2-`2`) && U16_IS_LEAD(c=*(s2-`2`))) {
305	cp2=U16_GET_SUPPLEMENTARY(c, c2);
306	}
307	}
308	}
309
310	/*
311	* go down one level for each string
312	* continue with the main loop as soon as there is a real change
313	*/
314
315	if( level1==`0` && (options&U_COMPARE_IGNORE_CASE) &&
316	(length=ucase_toFullFolding((UChar32)cp1, &p, options))>=`0`
317	) {
318	/ cp1 case-folds to the code point "length" or to p[length] /
319	if(U_IS_SURROGATE(c1)) {
320	if(U_IS_SURROGATE_LEAD(c1)) {
321	/ advance beyond source surrogate pair if it case-folds /
322	++s1;
323	} else / isTrail(c1) / {
324	/*
325	* we got a supplementary code point when hitting its trail surrogate,
326	* therefore the lead surrogate must have been the same as in the other string;
327	* compare this decomposition with the lead surrogate in the other string
328	* remember that this simulates bulk text replacement:
329	* the decomposition would replace the entire code point
330	*/
331	--s2;
332	c2=*(s2-`1`);
333	}
334	}
335
336	/ push current level pointers /
337	stack1[`0`].start=start1;
338	stack1[`0`].s=s1;
339	stack1[`0`].limit=limit1;
340	++level1;
341
342	/ copy the folding result to fold1[] /
343	if(length<=UCASE_MAX_STRING_LENGTH) {
344	u_memcpy(fold1, p, length);
345	} else {
346	int32_t i=`0`;
347	U16_APPEND_UNSAFE(fold1, i, length);
348	length=i;
349	}
350
351	/ set next level pointers to case folding /
352	start1=s1=fold1;
353	limit1=fold1+length;
354
355	/ get ready to read from decomposition, continue with loop /
356	c1=-`1`;
357	continue;
358	}
359
360	if( level2==`0` && (options&U_COMPARE_IGNORE_CASE) &&
361	(length=ucase_toFullFolding((UChar32)cp2, &p, options))>=`0`
362	) {
363	/ cp2 case-folds to the code point "length" or to p[length] /
364	if(U_IS_SURROGATE(c2)) {
365	if(U_IS_SURROGATE_LEAD(c2)) {
366	/ advance beyond source surrogate pair if it case-folds /
367	++s2;
368	} else / isTrail(c2) / {
369	/*
370	* we got a supplementary code point when hitting its trail surrogate,
371	* therefore the lead surrogate must have been the same as in the other string;
372	* compare this decomposition with the lead surrogate in the other string
373	* remember that this simulates bulk text replacement:
374	* the decomposition would replace the entire code point
375	*/
376	--s1;
377	c1=*(s1-`1`);
378	}
379	}
380
381	/ push current level pointers /
382	stack2[`0`].start=start2;
383	stack2[`0`].s=s2;
384	stack2[`0`].limit=limit2;
385	++level2;
386
387	/ copy the folding result to fold2[] /
388	if(length<=UCASE_MAX_STRING_LENGTH) {
389	u_memcpy(fold2, p, length);
390	} else {
391	int32_t i=`0`;
392	U16_APPEND_UNSAFE(fold2, i, length);
393	length=i;
394	}
395
396	/ set next level pointers to case folding /
397	start2=s2=fold2;
398	limit2=fold2+length;
399
400	/ get ready to read from decomposition, continue with loop /
401	c2=-`1`;
402	continue;
403	}
404
405	if( level1<`2` && (options&_COMPARE_EQUIV) &&
406	`0`!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
407	) {
408	/ cp1 decomposes into p[length] /
409	if(U_IS_SURROGATE(c1)) {
410	if(U_IS_SURROGATE_LEAD(c1)) {
411	/ advance beyond source surrogate pair if it decomposes /
412	++s1;
413	} else / isTrail(c1) / {
414	/*
415	* we got a supplementary code point when hitting its trail surrogate,
416	* therefore the lead surrogate must have been the same as in the other string;
417	* compare this decomposition with the lead surrogate in the other string
418	* remember that this simulates bulk text replacement:
419	* the decomposition would replace the entire code point
420	*/
421	--s2;
422	c2=*(s2-`1`);
423	}
424	}
425
426	/ push current level pointers /
427	stack1[level1].start=start1;
428	stack1[level1].s=s1;
429	stack1[level1].limit=limit1;
430	++level1;
431
432	/ set empty intermediate level if skipped /
433	if(level1<`2`) {
434	stack1[level1++].start=NULL;
435	}
436
437	/ set next level pointers to decomposition /
438	start1=s1=p;
439	limit1=p+length;
440
441	/ get ready to read from decomposition, continue with loop /
442	c1=-`1`;
443	continue;
444	}
445
446	if( level2<`2` && (options&_COMPARE_EQUIV) &&
447	`0`!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
448	) {
449	/ cp2 decomposes into p[length] /
450	if(U_IS_SURROGATE(c2)) {
451	if(U_IS_SURROGATE_LEAD(c2)) {
452	/ advance beyond source surrogate pair if it decomposes /
453	++s2;
454	} else / isTrail(c2) / {
455	/*
456	* we got a supplementary code point when hitting its trail surrogate,
457	* therefore the lead surrogate must have been the same as in the other string;
458	* compare this decomposition with the lead surrogate in the other string
459	* remember that this simulates bulk text replacement:
460	* the decomposition would replace the entire code point
461	*/
462	--s1;
463	c1=*(s1-`1`);
464	}
465	}
466
467	/ push current level pointers /
468	stack2[level2].start=start2;
469	stack2[level2].s=s2;
470	stack2[level2].limit=limit2;
471	++level2;
472
473	/ set empty intermediate level if skipped /
474	if(level2<`2`) {
475	stack2[level2++].start=NULL;
476	}
477
478	/ set next level pointers to decomposition /
479	start2=s2=p;
480	limit2=p+length;
481
482	/ get ready to read from decomposition, continue with loop /
483	c2=-`1`;
484	continue;
485	}
486
487	/*
488	* no decomposition/case folding, max level for both sides:
489	* return difference result
490	*
491	* code point order comparison must not just return cp1-cp2
492	* because when single surrogates are present then the surrogate pairs
493	* that formed cp1 and cp2 may be from different string indexes
494	*
495	* example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
496	* c1=d800 cp1=10001 c2=dc00 cp2=10000
497	* cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
498	*
499	* therefore, use same fix-up as in ustring.c/uprv_strCompare()
500	* except: uprv_strCompare() fetches c=s while this functions fetches c=s++
501	* so we have slightly different pointer/start/limit comparisons here
502	*/
503
504	if(c1>=`0xd800` && c2>=`0xd800` && (options&U_COMPARE_CODE_POINT_ORDER)) {
505	/ subtract 0x2800 from BMP code points to make them smaller than supplementary ones /
506	if(
507	(c1<=`0xdbff` && s1!=limit1 && U16_IS_TRAIL(*s1)) \|\|
508	(U16_IS_TRAIL(c1) && start1!=(s1-`1`) && U16_IS_LEAD(*(s1-`2`)))
509	) {
510	/ part of a surrogate pair, leave >=d800 /
511	} else {
512	/ BMP code point - may be surrogate code point - make <d800 /
513	c1-=`0x2800`;
514	}
515
516	if(
517	(c2<=`0xdbff` && s2!=limit2 && U16_IS_TRAIL(*s2)) \|\|
518	(U16_IS_TRAIL(c2) && start2!=(s2-`1`) && U16_IS_LEAD(*(s2-`2`)))
519	) {
520	/ part of a surrogate pair, leave >=d800 /
521	} else {
522	/ BMP code point - may be surrogate code point - make <d800 /
523	c2-=`0x2800`;
524	}
525	}
526
527	return c1-c2;
528	}
529	}
530
531	static
532	UBool _normalize(const Normalizer2 n2, const* UChar *s, int32_t length,
533	UnicodeString &normalized, UErrorCode *pErrorCode) {
534	UnicodeString str(length<`0`, s, length);
535
536	// check if s fulfill the conditions
537	int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode);
538	if (U_FAILURE(*pErrorCode)) {
539	return FALSE;
540	}
541	/*
542	* ICU 2.4 had a further optimization:
543	* If both strings were not in FCD, then they were both NFD'ed,
544	* and the _COMPARE_EQUIV option was turned off.
545	* It is not entirely clear that this is valid with the current
546	* definition of the canonical caseless match.
547	* Therefore, ICU 2.6 removes that optimization.
548	*/
549	if(spanQCYes<str.length()) {
550	UnicodeString unnormalized=str.tempSubString(spanQCYes);
551	normalized.setTo(FALSE, str.getBuffer(), spanQCYes);
552	n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode);
553	if (U_SUCCESS(*pErrorCode)) {
554	return TRUE;
555	}
556	}
557	return FALSE;
558	}
559
560	U_CAPI int32_t U_EXPORT2
561	unorm_compare(const UChar *s1, int32_t length1,
562	const UChar *s2, int32_t length2,
563	uint32_t options,
564	UErrorCode *pErrorCode) {
565	/ argument checking /
566	if(U_FAILURE(*pErrorCode)) {
567	return `0`;
568	}
569	if(s1==`0` \|\| length1<-`1` \|\| s2==`0` \|\| length2<-`1`) {
570	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
571	return `0`;
572	}
573
574	UnicodeString fcd1, fcd2;
575	int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
576	options\|=_COMPARE_EQUIV;
577
578	/*
579	* UAX #21 Case Mappings, as fixed for Unicode version 4
580	* (see Jitterbug 2021), defines a canonical caseless match as
581	*
582	* A string X is a canonical caseless match
583	* for a string Y if and only if
584	* NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
585	*
586	* For better performance, we check for FCD (or let the caller tell us that
587	* both strings are in FCD) for the inner normalization.
588	* BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
589	* case-folding preserves the FCD-ness of a string.
590	* The outer normalization is then only performed by unorm_cmpEquivFold()
591	* when there is a difference.
592	*
593	* Exception: When using the Turkic case-folding option, we do perform
594	* full NFD first. This is because in the Turkic case precomposed characters
595	* with 0049 capital I or 0069 small i fold differently whether they
596	* are first decomposed or not, so an FCD check - a check only for
597	* canonical order - is not sufficient.
598	*/
599	if(!(options&UNORM_INPUT_IS_FCD) \|\| (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) {
600	const Normalizer2 *n2;
601	if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
602	n2=Normalizer2::getNFDInstance(*pErrorCode);
603	} else {
604	n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
605	}
606	if (U_FAILURE(*pErrorCode)) {
607	return `0`;
608	}
609
610	if(normOptions&UNORM_UNICODE_3_2) {
611	const UnicodeSet uni32=uniset_getUnicode32Instance(pErrorCode);
612	FilteredNormalizer2 fn2(n2, uni32);
613	if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) {
614	s1=fcd1.getBuffer();
615	length1=fcd1.length();
616	}
617	if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) {
618	s2=fcd2.getBuffer();
619	length2=fcd2.length();
620	}
621	} else {
622	if(_normalize(n2, s1, length1, fcd1, pErrorCode)) {
623	s1=fcd1.getBuffer();
624	length1=fcd1.length();
625	}
626	if(_normalize(n2, s2, length2, fcd2, pErrorCode)) {
627	s2=fcd2.getBuffer();
628	length2=fcd2.length();
629	}
630	}
631	}
632
633	if(U_SUCCESS(*pErrorCode)) {
634	return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
635	} else {
636	return `0`;
637	}
638	}
639
640	#endif /* #if !UCONFIG_NO_NORMALIZATION */
641

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/unormcmp.cpp