| 1 | // © 2016 and later: Unicode, Inc. and others. | 
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
| 3 | /* | 
| 4 | ******************************************************************************* | 
| 5 | * | 
| 6 | *   Copyright (C) 2001-2014, International Business Machines | 
| 7 | *   Corporation and others.  All Rights Reserved. | 
| 8 | * | 
| 9 | ******************************************************************************* | 
| 10 | *   file name:  unormcmp.cpp | 
| 11 | *   encoding:   UTF-8 | 
| 12 | *   tab size:   8 (not used) | 
| 13 | *   indentation:4 | 
| 14 | * | 
| 15 | *   created on: 2004sep13 | 
| 16 | *   created by: Markus W. Scherer | 
| 17 | * | 
| 18 | *   unorm_compare() function moved here from unorm.cpp for better modularization. | 
| 19 | *   Depends on both normalization and case folding. | 
| 20 | *   Allows unorm.cpp to not depend on any character properties code. | 
| 21 | */ | 
| 22 |  | 
| 23 | #include "unicode/utypes.h" | 
| 24 |  | 
| 25 | #if !UCONFIG_NO_NORMALIZATION | 
| 26 |  | 
| 27 | #include "unicode/unorm.h" | 
| 28 | #include "unicode/ustring.h" | 
| 29 | #include "cmemory.h" | 
| 30 | #include "normalizer2impl.h" | 
| 31 | #include "ucase.h" | 
| 32 | #include "uprops.h" | 
| 33 | #include "ustr_imp.h" | 
| 34 |  | 
| 35 | U_NAMESPACE_USE | 
| 36 |  | 
| 37 | /* compare canonically equivalent ------------------------------------------- */ | 
| 38 |  | 
| 39 | /* | 
| 40 |  * Compare two strings for canonical equivalence. | 
| 41 |  * Further options include case-insensitive comparison and | 
| 42 |  * code point order (as opposed to code unit order). | 
| 43 |  * | 
| 44 |  * In this function, canonical equivalence is optional as well. | 
| 45 |  * If canonical equivalence is tested, then both strings must fulfill | 
| 46 |  * the FCD check. | 
| 47 |  * | 
| 48 |  * Semantically, this is equivalent to | 
| 49 |  *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) | 
| 50 |  * where code point order, NFD and foldCase are all optional. | 
| 51 |  * | 
| 52 |  * String comparisons almost always yield results before processing both strings | 
| 53 |  * completely. | 
| 54 |  * They are generally more efficient working incrementally instead of | 
| 55 |  * performing the sub-processing (strlen, normalization, case-folding) | 
| 56 |  * on the entire strings first. | 
| 57 |  * | 
| 58 |  * It is also unnecessary to not normalize identical characters. | 
| 59 |  * | 
| 60 |  * This function works in principle as follows: | 
| 61 |  * | 
| 62 |  * loop { | 
| 63 |  *   get one code unit c1 from s1 (-1 if end of source) | 
| 64 |  *   get one code unit c2 from s2 (-1 if end of source) | 
| 65 |  * | 
| 66 |  *   if(either string finished) { | 
| 67 |  *     return result; | 
| 68 |  *   } | 
| 69 |  *   if(c1==c2) { | 
| 70 |  *     continue; | 
| 71 |  *   } | 
| 72 |  * | 
| 73 |  *   // c1!=c2 | 
| 74 |  *   try to decompose/case-fold c1/c2, and continue if one does; | 
| 75 |  * | 
| 76 |  *   // still c1!=c2 and neither decomposes/case-folds, return result | 
| 77 |  *   return c1-c2; | 
| 78 |  * } | 
| 79 |  * | 
| 80 |  * When a character decomposes, then the pointer for that source changes to | 
| 81 |  * the decomposition, pushing the previous pointer onto a stack. | 
| 82 |  * When the end of the decomposition is reached, then the code unit reader | 
| 83 |  * pops the previous source from the stack. | 
| 84 |  * (Same for case-folding.) | 
| 85 |  * | 
| 86 |  * This is complicated further by operating on variable-width UTF-16. | 
| 87 |  * The top part of the loop works on code units, while lookups for decomposition | 
| 88 |  * and case-folding need code points. | 
| 89 |  * Code points are assembled after the equality/end-of-source part. | 
| 90 |  * The source pointer is only advanced beyond all code units when the code point | 
| 91 |  * actually decomposes/case-folds. | 
| 92 |  * | 
| 93 |  * If we were on a trail surrogate unit when assembling a code point, | 
| 94 |  * and the code point decomposes/case-folds, then the decomposition/folding | 
| 95 |  * result must be compared with the part of the other string that corresponds to | 
| 96 |  * this string's lead surrogate. | 
| 97 |  * Since we only assemble a code point when hitting a trail unit when the | 
| 98 |  * preceding lead units were identical, we back up the other string by one unit | 
| 99 |  * in such a case. | 
| 100 |  * | 
| 101 |  * The optional code point order comparison at the end works with | 
| 102 |  * the same fix-up as the other code point order comparison functions. | 
| 103 |  * See ustring.c and the comment near the end of this function. | 
| 104 |  * | 
| 105 |  * Assumption: A decomposition or case-folding result string never contains | 
| 106 |  * a single surrogate. This is a safe assumption in the Unicode Standard. | 
| 107 |  * Therefore, we do not need to check for surrogate pairs across | 
| 108 |  * decomposition/case-folding boundaries. | 
| 109 |  * | 
| 110 |  * Further assumptions (see verifications tstnorm.cpp): | 
| 111 |  * The API function checks for FCD first, while the core function | 
| 112 |  * first case-folds and then decomposes. This requires that case-folding does not | 
| 113 |  * un-FCD any strings. | 
| 114 |  * | 
| 115 |  * The API function may also NFD the input and turn off decomposition. | 
| 116 |  * This requires that case-folding does not un-NFD strings either. | 
| 117 |  * | 
| 118 |  * TODO If any of the above two assumptions is violated, | 
| 119 |  * then this entire code must be re-thought. | 
| 120 |  * If this happens, then a simple solution is to case-fold both strings up front | 
| 121 |  * and to turn off UNORM_INPUT_IS_FCD. | 
| 122 |  * We already do this when not both strings are in FCD because makeFCD | 
| 123 |  * would be a partial NFD before the case folding, which does not work. | 
| 124 |  * Note that all of this is only a problem when case-folding _and_ | 
| 125 |  * canonical equivalence come together. | 
| 126 |  * (Comments in unorm_compare() are more up to date than this TODO.) | 
| 127 |  */ | 
| 128 |  | 
| 129 | /* stack element for previous-level source/decomposition pointers */ | 
| 130 | struct CmpEquivLevel { | 
| 131 |     const UChar *start, *s, *limit; | 
| 132 | }; | 
| 133 | typedef struct CmpEquivLevel CmpEquivLevel; | 
| 134 |  | 
| 135 | /** | 
| 136 |  * Internal option for unorm_cmpEquivFold() for decomposing. | 
| 137 |  * If not set, just do strcasecmp(). | 
| 138 |  */ | 
| 139 | #define _COMPARE_EQUIV 0x80000 | 
| 140 |  | 
| 141 | /* internal function */ | 
| 142 | static int32_t | 
| 143 | unorm_cmpEquivFold(const UChar *s1, int32_t length1, | 
| 144 |                    const UChar *s2, int32_t length2, | 
| 145 |                    uint32_t options, | 
| 146 |                    UErrorCode *pErrorCode) { | 
| 147 |     const Normalizer2Impl *nfcImpl; | 
| 148 |  | 
| 149 |     /* current-level start/limit - s1/s2 as current */ | 
| 150 |     const UChar *start1, *start2, *limit1, *limit2; | 
| 151 |  | 
| 152 |     /* decomposition and case folding variables */ | 
| 153 |     const UChar *p; | 
| 154 |     int32_t length; | 
| 155 |  | 
| 156 |     /* stacks of previous-level start/current/limit */ | 
| 157 |     CmpEquivLevel stack1[2], stack2[2]; | 
| 158 |  | 
| 159 |     /* buffers for algorithmic decompositions */ | 
| 160 |     UChar decomp1[4], decomp2[4]; | 
| 161 |  | 
| 162 |     /* case folding buffers, only use current-level start/limit */ | 
| 163 |     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; | 
| 164 |  | 
| 165 |     /* track which is the current level per string */ | 
| 166 |     int32_t level1, level2; | 
| 167 |  | 
| 168 |     /* current code units, and code points for lookups */ | 
| 169 |     UChar32 c1, c2, cp1, cp2; | 
| 170 |  | 
| 171 |     /* no argument error checking because this itself is not an API */ | 
| 172 |  | 
| 173 |     /* | 
| 174 |      * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set | 
| 175 |      * otherwise this function must behave exactly as uprv_strCompare() | 
| 176 |      * not checking for that here makes testing this function easier | 
| 177 |      */ | 
| 178 |  | 
| 179 |     /* normalization/properties data loaded? */ | 
| 180 |     if((options&_COMPARE_EQUIV)!=0) { | 
| 181 |         nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode); | 
| 182 |     } else { | 
| 183 |         nfcImpl=NULL; | 
| 184 |     } | 
| 185 |     if(U_FAILURE(*pErrorCode)) { | 
| 186 |         return 0; | 
| 187 |     } | 
| 188 |  | 
| 189 |     /* initialize */ | 
| 190 |     start1=s1; | 
| 191 |     if(length1==-1) { | 
| 192 |         limit1=NULL; | 
| 193 |     } else { | 
| 194 |         limit1=s1+length1; | 
| 195 |     } | 
| 196 |  | 
| 197 |     start2=s2; | 
| 198 |     if(length2==-1) { | 
| 199 |         limit2=NULL; | 
| 200 |     } else { | 
| 201 |         limit2=s2+length2; | 
| 202 |     } | 
| 203 |  | 
| 204 |     level1=level2=0; | 
| 205 |     c1=c2=-1; | 
| 206 |  | 
| 207 |     /* comparison loop */ | 
| 208 |     for(;;) { | 
| 209 |         /* | 
| 210 |          * here a code unit value of -1 means "get another code unit" | 
| 211 |          * below it will mean "this source is finished" | 
| 212 |          */ | 
| 213 |  | 
| 214 |         if(c1<0) { | 
| 215 |             /* get next code unit from string 1, post-increment */ | 
| 216 |             for(;;) { | 
| 217 |                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { | 
| 218 |                     if(level1==0) { | 
| 219 |                         c1=-1; | 
| 220 |                         break; | 
| 221 |                     } | 
| 222 |                 } else { | 
| 223 |                     ++s1; | 
| 224 |                     break; | 
| 225 |                 } | 
| 226 |  | 
| 227 |                 /* reached end of level buffer, pop one level */ | 
| 228 |                 do { | 
| 229 |                     --level1; | 
| 230 |                     start1=stack1[level1].start;    /*Not uninitialized*/ | 
| 231 |                 } while(start1==NULL); | 
| 232 |                 s1=stack1[level1].s;                /*Not uninitialized*/ | 
| 233 |                 limit1=stack1[level1].limit;        /*Not uninitialized*/ | 
| 234 |             } | 
| 235 |         } | 
| 236 |  | 
| 237 |         if(c2<0) { | 
| 238 |             /* get next code unit from string 2, post-increment */ | 
| 239 |             for(;;) { | 
| 240 |                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { | 
| 241 |                     if(level2==0) { | 
| 242 |                         c2=-1; | 
| 243 |                         break; | 
| 244 |                     } | 
| 245 |                 } else { | 
| 246 |                     ++s2; | 
| 247 |                     break; | 
| 248 |                 } | 
| 249 |  | 
| 250 |                 /* reached end of level buffer, pop one level */ | 
| 251 |                 do { | 
| 252 |                     --level2; | 
| 253 |                     start2=stack2[level2].start;    /*Not uninitialized*/ | 
| 254 |                 } while(start2==NULL); | 
| 255 |                 s2=stack2[level2].s;                /*Not uninitialized*/ | 
| 256 |                 limit2=stack2[level2].limit;        /*Not uninitialized*/ | 
| 257 |             } | 
| 258 |         } | 
| 259 |  | 
| 260 |         /* | 
| 261 |          * compare c1 and c2 | 
| 262 |          * either variable c1, c2 is -1 only if the corresponding string is finished | 
| 263 |          */ | 
| 264 |         if(c1==c2) { | 
| 265 |             if(c1<0) { | 
| 266 |                 return 0;   /* c1==c2==-1 indicating end of strings */ | 
| 267 |             } | 
| 268 |             c1=c2=-1;       /* make us fetch new code units */ | 
| 269 |             continue; | 
| 270 |         } else if(c1<0) { | 
| 271 |             return -1;      /* string 1 ends before string 2 */ | 
| 272 |         } else if(c2<0) { | 
| 273 |             return 1;       /* string 2 ends before string 1 */ | 
| 274 |         } | 
| 275 |         /* c1!=c2 && c1>=0 && c2>=0 */ | 
| 276 |  | 
| 277 |         /* get complete code points for c1, c2 for lookups if either is a surrogate */ | 
| 278 |         cp1=c1; | 
| 279 |         if(U_IS_SURROGATE(c1)) { | 
| 280 |             UChar c; | 
| 281 |  | 
| 282 |             if(U_IS_SURROGATE_LEAD(c1)) { | 
| 283 |                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { | 
| 284 |                     /* advance ++s1; only below if cp1 decomposes/case-folds */ | 
| 285 |                     cp1=U16_GET_SUPPLEMENTARY(c1, c); | 
| 286 |                 } | 
| 287 |             } else /* isTrail(c1) */ { | 
| 288 |                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { | 
| 289 |                     cp1=U16_GET_SUPPLEMENTARY(c, c1); | 
| 290 |                 } | 
| 291 |             } | 
| 292 |         } | 
| 293 |  | 
| 294 |         cp2=c2; | 
| 295 |         if(U_IS_SURROGATE(c2)) { | 
| 296 |             UChar c; | 
| 297 |  | 
| 298 |             if(U_IS_SURROGATE_LEAD(c2)) { | 
| 299 |                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { | 
| 300 |                     /* advance ++s2; only below if cp2 decomposes/case-folds */ | 
| 301 |                     cp2=U16_GET_SUPPLEMENTARY(c2, c); | 
| 302 |                 } | 
| 303 |             } else /* isTrail(c2) */ { | 
| 304 |                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { | 
| 305 |                     cp2=U16_GET_SUPPLEMENTARY(c, c2); | 
| 306 |                 } | 
| 307 |             } | 
| 308 |         } | 
| 309 |  | 
| 310 |         /* | 
| 311 |          * go down one level for each string | 
| 312 |          * continue with the main loop as soon as there is a real change | 
| 313 |          */ | 
| 314 |  | 
| 315 |         if( level1==0 && (options&U_COMPARE_IGNORE_CASE) && | 
| 316 |             (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0 | 
| 317 |         ) { | 
| 318 |             /* cp1 case-folds to the code point "length" or to p[length] */ | 
| 319 |             if(U_IS_SURROGATE(c1)) { | 
| 320 |                 if(U_IS_SURROGATE_LEAD(c1)) { | 
| 321 |                     /* advance beyond source surrogate pair if it case-folds */ | 
| 322 |                     ++s1; | 
| 323 |                 } else /* isTrail(c1) */ { | 
| 324 |                     /* | 
| 325 |                      * we got a supplementary code point when hitting its trail surrogate, | 
| 326 |                      * therefore the lead surrogate must have been the same as in the other string; | 
| 327 |                      * compare this decomposition with the lead surrogate in the other string | 
| 328 |                      * remember that this simulates bulk text replacement: | 
| 329 |                      * the decomposition would replace the entire code point | 
| 330 |                      */ | 
| 331 |                     --s2; | 
| 332 |                     c2=*(s2-1); | 
| 333 |                 } | 
| 334 |             } | 
| 335 |  | 
| 336 |             /* push current level pointers */ | 
| 337 |             stack1[0].start=start1; | 
| 338 |             stack1[0].s=s1; | 
| 339 |             stack1[0].limit=limit1; | 
| 340 |             ++level1; | 
| 341 |  | 
| 342 |             /* copy the folding result to fold1[] */ | 
| 343 |             if(length<=UCASE_MAX_STRING_LENGTH) { | 
| 344 |                 u_memcpy(fold1, p, length); | 
| 345 |             } else { | 
| 346 |                 int32_t i=0; | 
| 347 |                 U16_APPEND_UNSAFE(fold1, i, length); | 
| 348 |                 length=i; | 
| 349 |             } | 
| 350 |  | 
| 351 |             /* set next level pointers to case folding */ | 
| 352 |             start1=s1=fold1; | 
| 353 |             limit1=fold1+length; | 
| 354 |  | 
| 355 |             /* get ready to read from decomposition, continue with loop */ | 
| 356 |             c1=-1; | 
| 357 |             continue; | 
| 358 |         } | 
| 359 |  | 
| 360 |         if( level2==0 && (options&U_COMPARE_IGNORE_CASE) && | 
| 361 |             (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0 | 
| 362 |         ) { | 
| 363 |             /* cp2 case-folds to the code point "length" or to p[length] */ | 
| 364 |             if(U_IS_SURROGATE(c2)) { | 
| 365 |                 if(U_IS_SURROGATE_LEAD(c2)) { | 
| 366 |                     /* advance beyond source surrogate pair if it case-folds */ | 
| 367 |                     ++s2; | 
| 368 |                 } else /* isTrail(c2) */ { | 
| 369 |                     /* | 
| 370 |                      * we got a supplementary code point when hitting its trail surrogate, | 
| 371 |                      * therefore the lead surrogate must have been the same as in the other string; | 
| 372 |                      * compare this decomposition with the lead surrogate in the other string | 
| 373 |                      * remember that this simulates bulk text replacement: | 
| 374 |                      * the decomposition would replace the entire code point | 
| 375 |                      */ | 
| 376 |                     --s1; | 
| 377 |                     c1=*(s1-1); | 
| 378 |                 } | 
| 379 |             } | 
| 380 |  | 
| 381 |             /* push current level pointers */ | 
| 382 |             stack2[0].start=start2; | 
| 383 |             stack2[0].s=s2; | 
| 384 |             stack2[0].limit=limit2; | 
| 385 |             ++level2; | 
| 386 |  | 
| 387 |             /* copy the folding result to fold2[] */ | 
| 388 |             if(length<=UCASE_MAX_STRING_LENGTH) { | 
| 389 |                 u_memcpy(fold2, p, length); | 
| 390 |             } else { | 
| 391 |                 int32_t i=0; | 
| 392 |                 U16_APPEND_UNSAFE(fold2, i, length); | 
| 393 |                 length=i; | 
| 394 |             } | 
| 395 |  | 
| 396 |             /* set next level pointers to case folding */ | 
| 397 |             start2=s2=fold2; | 
| 398 |             limit2=fold2+length; | 
| 399 |  | 
| 400 |             /* get ready to read from decomposition, continue with loop */ | 
| 401 |             c2=-1; | 
| 402 |             continue; | 
| 403 |         } | 
| 404 |  | 
| 405 |         if( level1<2 && (options&_COMPARE_EQUIV) && | 
| 406 |             0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length)) | 
| 407 |         ) { | 
| 408 |             /* cp1 decomposes into p[length] */ | 
| 409 |             if(U_IS_SURROGATE(c1)) { | 
| 410 |                 if(U_IS_SURROGATE_LEAD(c1)) { | 
| 411 |                     /* advance beyond source surrogate pair if it decomposes */ | 
| 412 |                     ++s1; | 
| 413 |                 } else /* isTrail(c1) */ { | 
| 414 |                     /* | 
| 415 |                      * we got a supplementary code point when hitting its trail surrogate, | 
| 416 |                      * therefore the lead surrogate must have been the same as in the other string; | 
| 417 |                      * compare this decomposition with the lead surrogate in the other string | 
| 418 |                      * remember that this simulates bulk text replacement: | 
| 419 |                      * the decomposition would replace the entire code point | 
| 420 |                      */ | 
| 421 |                     --s2; | 
| 422 |                     c2=*(s2-1); | 
| 423 |                 } | 
| 424 |             } | 
| 425 |  | 
| 426 |             /* push current level pointers */ | 
| 427 |             stack1[level1].start=start1; | 
| 428 |             stack1[level1].s=s1; | 
| 429 |             stack1[level1].limit=limit1; | 
| 430 |             ++level1; | 
| 431 |  | 
| 432 |             /* set empty intermediate level if skipped */ | 
| 433 |             if(level1<2) { | 
| 434 |                 stack1[level1++].start=NULL; | 
| 435 |             } | 
| 436 |  | 
| 437 |             /* set next level pointers to decomposition */ | 
| 438 |             start1=s1=p; | 
| 439 |             limit1=p+length; | 
| 440 |  | 
| 441 |             /* get ready to read from decomposition, continue with loop */ | 
| 442 |             c1=-1; | 
| 443 |             continue; | 
| 444 |         } | 
| 445 |  | 
| 446 |         if( level2<2 && (options&_COMPARE_EQUIV) && | 
| 447 |             0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length)) | 
| 448 |         ) { | 
| 449 |             /* cp2 decomposes into p[length] */ | 
| 450 |             if(U_IS_SURROGATE(c2)) { | 
| 451 |                 if(U_IS_SURROGATE_LEAD(c2)) { | 
| 452 |                     /* advance beyond source surrogate pair if it decomposes */ | 
| 453 |                     ++s2; | 
| 454 |                 } else /* isTrail(c2) */ { | 
| 455 |                     /* | 
| 456 |                      * we got a supplementary code point when hitting its trail surrogate, | 
| 457 |                      * therefore the lead surrogate must have been the same as in the other string; | 
| 458 |                      * compare this decomposition with the lead surrogate in the other string | 
| 459 |                      * remember that this simulates bulk text replacement: | 
| 460 |                      * the decomposition would replace the entire code point | 
| 461 |                      */ | 
| 462 |                     --s1; | 
| 463 |                     c1=*(s1-1); | 
| 464 |                 } | 
| 465 |             } | 
| 466 |  | 
| 467 |             /* push current level pointers */ | 
| 468 |             stack2[level2].start=start2; | 
| 469 |             stack2[level2].s=s2; | 
| 470 |             stack2[level2].limit=limit2; | 
| 471 |             ++level2; | 
| 472 |  | 
| 473 |             /* set empty intermediate level if skipped */ | 
| 474 |             if(level2<2) { | 
| 475 |                 stack2[level2++].start=NULL; | 
| 476 |             } | 
| 477 |  | 
| 478 |             /* set next level pointers to decomposition */ | 
| 479 |             start2=s2=p; | 
| 480 |             limit2=p+length; | 
| 481 |  | 
| 482 |             /* get ready to read from decomposition, continue with loop */ | 
| 483 |             c2=-1; | 
| 484 |             continue; | 
| 485 |         } | 
| 486 |  | 
| 487 |         /* | 
| 488 |          * no decomposition/case folding, max level for both sides: | 
| 489 |          * return difference result | 
| 490 |          * | 
| 491 |          * code point order comparison must not just return cp1-cp2 | 
| 492 |          * because when single surrogates are present then the surrogate pairs | 
| 493 |          * that formed cp1 and cp2 may be from different string indexes | 
| 494 |          * | 
| 495 |          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units | 
| 496 |          * c1=d800 cp1=10001 c2=dc00 cp2=10000 | 
| 497 |          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } | 
| 498 |          * | 
| 499 |          * therefore, use same fix-up as in ustring.c/uprv_strCompare() | 
| 500 |          * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ | 
| 501 |          * so we have slightly different pointer/start/limit comparisons here | 
| 502 |          */ | 
| 503 |  | 
| 504 |         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { | 
| 505 |             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | 
| 506 |             if( | 
| 507 |                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || | 
| 508 |                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) | 
| 509 |             ) { | 
| 510 |                 /* part of a surrogate pair, leave >=d800 */ | 
| 511 |             } else { | 
| 512 |                 /* BMP code point - may be surrogate code point - make <d800 */ | 
| 513 |                 c1-=0x2800; | 
| 514 |             } | 
| 515 |  | 
| 516 |             if( | 
| 517 |                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || | 
| 518 |                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) | 
| 519 |             ) { | 
| 520 |                 /* part of a surrogate pair, leave >=d800 */ | 
| 521 |             } else { | 
| 522 |                 /* BMP code point - may be surrogate code point - make <d800 */ | 
| 523 |                 c2-=0x2800; | 
| 524 |             } | 
| 525 |         } | 
| 526 |  | 
| 527 |         return c1-c2; | 
| 528 |     } | 
| 529 | } | 
| 530 |  | 
| 531 | static | 
| 532 | UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length, | 
| 533 |                 UnicodeString &normalized, UErrorCode *pErrorCode) { | 
| 534 |     UnicodeString str(length<0, s, length); | 
| 535 |  | 
| 536 |     // check if s fulfill the conditions | 
| 537 |     int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode); | 
| 538 |     if (U_FAILURE(*pErrorCode)) { | 
| 539 |         return FALSE; | 
| 540 |     } | 
| 541 |     /* | 
| 542 |      * ICU 2.4 had a further optimization: | 
| 543 |      * If both strings were not in FCD, then they were both NFD'ed, | 
| 544 |      * and the _COMPARE_EQUIV option was turned off. | 
| 545 |      * It is not entirely clear that this is valid with the current | 
| 546 |      * definition of the canonical caseless match. | 
| 547 |      * Therefore, ICU 2.6 removes that optimization. | 
| 548 |      */ | 
| 549 |     if(spanQCYes<str.length()) { | 
| 550 |         UnicodeString unnormalized=str.tempSubString(spanQCYes); | 
| 551 |         normalized.setTo(FALSE, str.getBuffer(), spanQCYes); | 
| 552 |         n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode); | 
| 553 |         if (U_SUCCESS(*pErrorCode)) { | 
| 554 |             return TRUE; | 
| 555 |         } | 
| 556 |     } | 
| 557 |     return FALSE; | 
| 558 | } | 
| 559 |  | 
| 560 | U_CAPI int32_t U_EXPORT2 | 
| 561 | unorm_compare(const UChar *s1, int32_t length1, | 
| 562 |               const UChar *s2, int32_t length2, | 
| 563 |               uint32_t options, | 
| 564 |               UErrorCode *pErrorCode) { | 
| 565 |     /* argument checking */ | 
| 566 |     if(U_FAILURE(*pErrorCode)) { | 
| 567 |         return 0; | 
| 568 |     } | 
| 569 |     if(s1==0 || length1<-1 || s2==0 || length2<-1) { | 
| 570 |         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | 
| 571 |         return 0; | 
| 572 |     } | 
| 573 |  | 
| 574 |     UnicodeString fcd1, fcd2; | 
| 575 |     int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); | 
| 576 |     options|=_COMPARE_EQUIV; | 
| 577 |  | 
| 578 |     /* | 
| 579 |      * UAX #21 Case Mappings, as fixed for Unicode version 4 | 
| 580 |      * (see Jitterbug 2021), defines a canonical caseless match as | 
| 581 |      * | 
| 582 |      * A string X is a canonical caseless match | 
| 583 |      * for a string Y if and only if | 
| 584 |      * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) | 
| 585 |      * | 
| 586 |      * For better performance, we check for FCD (or let the caller tell us that | 
| 587 |      * both strings are in FCD) for the inner normalization. | 
| 588 |      * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that | 
| 589 |      * case-folding preserves the FCD-ness of a string. | 
| 590 |      * The outer normalization is then only performed by unorm_cmpEquivFold() | 
| 591 |      * when there is a difference. | 
| 592 |      * | 
| 593 |      * Exception: When using the Turkic case-folding option, we do perform | 
| 594 |      * full NFD first. This is because in the Turkic case precomposed characters | 
| 595 |      * with 0049 capital I or 0069 small i fold differently whether they | 
| 596 |      * are first decomposed or not, so an FCD check - a check only for | 
| 597 |      * canonical order - is not sufficient. | 
| 598 |      */ | 
| 599 |     if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) { | 
| 600 |         const Normalizer2 *n2; | 
| 601 |         if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { | 
| 602 |             n2=Normalizer2::getNFDInstance(*pErrorCode); | 
| 603 |         } else { | 
| 604 |             n2=Normalizer2Factory::getFCDInstance(*pErrorCode); | 
| 605 |         } | 
| 606 |         if (U_FAILURE(*pErrorCode)) { | 
| 607 |             return 0; | 
| 608 |         } | 
| 609 |  | 
| 610 |         if(normOptions&UNORM_UNICODE_3_2) { | 
| 611 |             const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode); | 
| 612 |             FilteredNormalizer2 fn2(*n2, *uni32); | 
| 613 |             if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) { | 
| 614 |                 s1=fcd1.getBuffer(); | 
| 615 |                 length1=fcd1.length(); | 
| 616 |             } | 
| 617 |             if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) { | 
| 618 |                 s2=fcd2.getBuffer(); | 
| 619 |                 length2=fcd2.length(); | 
| 620 |             } | 
| 621 |         } else { | 
| 622 |             if(_normalize(n2, s1, length1, fcd1, pErrorCode)) { | 
| 623 |                 s1=fcd1.getBuffer(); | 
| 624 |                 length1=fcd1.length(); | 
| 625 |             } | 
| 626 |             if(_normalize(n2, s2, length2, fcd2, pErrorCode)) { | 
| 627 |                 s2=fcd2.getBuffer(); | 
| 628 |                 length2=fcd2.length(); | 
| 629 |             } | 
| 630 |         } | 
| 631 |     } | 
| 632 |  | 
| 633 |     if(U_SUCCESS(*pErrorCode)) { | 
| 634 |         return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); | 
| 635 |     } else { | 
| 636 |         return 0; | 
| 637 |     } | 
| 638 | } | 
| 639 |  | 
| 640 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ | 
| 641 |  |