| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * Copyright (C) 2013-2015, International Business Machines |
| 6 | * Corporation and others. All Rights Reserved. |
| 7 | ******************************************************************************* |
| 8 | * collationfastlatin.cpp |
| 9 | * |
| 10 | * created on: 2013aug18 |
| 11 | * created by: Markus W. Scherer |
| 12 | */ |
| 13 | |
| 14 | #include "unicode/utypes.h" |
| 15 | |
| 16 | #if !UCONFIG_NO_COLLATION |
| 17 | |
| 18 | #include "unicode/ucol.h" |
| 19 | #include "collationdata.h" |
| 20 | #include "collationfastlatin.h" |
| 21 | #include "collationsettings.h" |
| 22 | #include "uassert.h" |
| 23 | |
| 24 | U_NAMESPACE_BEGIN |
| 25 | |
| 26 | int32_t |
| 27 | CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, |
| 28 | uint16_t *primaries, int32_t capacity) { |
| 29 | const uint16_t *table = data->fastLatinTable; |
| 30 | if(table == NULL) { return -1; } |
| 31 | U_ASSERT(capacity == LATIN_LIMIT); |
| 32 | if(capacity != LATIN_LIMIT) { return -1; } |
| 33 | |
| 34 | uint32_t miniVarTop; |
| 35 | if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { |
| 36 | // No mini primaries are variable, set a variableTop just below the |
| 37 | // lowest long mini primary. |
| 38 | miniVarTop = MIN_LONG - 1; |
| 39 | } else { |
| 40 | int32_t = *table & 0xff; |
| 41 | int32_t i = 1 + settings.getMaxVariable(); |
| 42 | if(i >= headerLength) { |
| 43 | return -1; // variableTop >= digits, should not occur |
| 44 | } |
| 45 | miniVarTop = table[i]; |
| 46 | } |
| 47 | |
| 48 | UBool digitsAreReordered = FALSE; |
| 49 | if(settings.hasReordering()) { |
| 50 | uint32_t prevStart = 0; |
| 51 | uint32_t beforeDigitStart = 0; |
| 52 | uint32_t digitStart = 0; |
| 53 | uint32_t afterDigitStart = 0; |
| 54 | for(int32_t group = UCOL_REORDER_CODE_FIRST; |
| 55 | group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; |
| 56 | ++group) { |
| 57 | uint32_t start = data->getFirstPrimaryForGroup(group); |
| 58 | start = settings.reorder(start); |
| 59 | if(group == UCOL_REORDER_CODE_DIGIT) { |
| 60 | beforeDigitStart = prevStart; |
| 61 | digitStart = start; |
| 62 | } else if(start != 0) { |
| 63 | if(start < prevStart) { |
| 64 | // The permutation affects the groups up to Latin. |
| 65 | return -1; |
| 66 | } |
| 67 | // In the future, there might be a special group between digits & Latin. |
| 68 | if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { |
| 69 | afterDigitStart = start; |
| 70 | } |
| 71 | prevStart = start; |
| 72 | } |
| 73 | } |
| 74 | uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); |
| 75 | latinStart = settings.reorder(latinStart); |
| 76 | if(latinStart < prevStart) { |
| 77 | return -1; |
| 78 | } |
| 79 | if(afterDigitStart == 0) { |
| 80 | afterDigitStart = latinStart; |
| 81 | } |
| 82 | if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { |
| 83 | digitsAreReordered = TRUE; |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | table += (table[0] & 0xff); // skip the header |
| 88 | for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { |
| 89 | uint32_t p = table[c]; |
| 90 | if(p >= MIN_SHORT) { |
| 91 | p &= SHORT_PRIMARY_MASK; |
| 92 | } else if(p > miniVarTop) { |
| 93 | p &= LONG_PRIMARY_MASK; |
| 94 | } else { |
| 95 | p = 0; |
| 96 | } |
| 97 | primaries[c] = (uint16_t)p; |
| 98 | } |
| 99 | if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { |
| 100 | // Bail out for digits. |
| 101 | for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } |
| 102 | } |
| 103 | |
| 104 | // Shift the miniVarTop above other options. |
| 105 | return ((int32_t)miniVarTop << 16) | settings.options; |
| 106 | } |
| 107 | |
| 108 | int32_t |
| 109 | CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, |
| 110 | const UChar *left, int32_t leftLength, |
| 111 | const UChar *right, int32_t rightLength) { |
| 112 | // This is a modified copy of CollationCompare::compareUpToQuaternary(), |
| 113 | // optimized for common Latin text. |
| 114 | // Keep them in sync! |
| 115 | // Keep compareUTF16() and compareUTF8() in sync very closely! |
| 116 | |
| 117 | U_ASSERT((table[0] >> 8) == VERSION); |
| 118 | table += (table[0] & 0xff); // skip the header |
| 119 | uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() |
| 120 | options &= 0xffff; // needed for CollationSettings::getStrength() to work |
| 121 | |
| 122 | // Check for supported characters, fetch mini CEs, and compare primaries. |
| 123 | int32_t leftIndex = 0, rightIndex = 0; |
| 124 | /** |
| 125 | * Single mini CE or a pair. |
| 126 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. |
| 127 | * If there is only one, then it is in the lower bits, and the upper bits are 0. |
| 128 | */ |
| 129 | uint32_t leftPair = 0, rightPair = 0; |
| 130 | for(;;) { |
| 131 | // We fetch CEs until we get a non-ignorable primary or reach the end. |
| 132 | while(leftPair == 0) { |
| 133 | if(leftIndex == leftLength) { |
| 134 | leftPair = EOS; |
| 135 | break; |
| 136 | } |
| 137 | UChar32 c = left[leftIndex++]; |
| 138 | if(c <= LATIN_MAX) { |
| 139 | leftPair = primaries[c]; |
| 140 | if(leftPair != 0) { break; } |
| 141 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
| 142 | return BAIL_OUT_RESULT; |
| 143 | } |
| 144 | leftPair = table[c]; |
| 145 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 146 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 147 | } else { |
| 148 | leftPair = lookup(table, c); |
| 149 | } |
| 150 | if(leftPair >= MIN_SHORT) { |
| 151 | leftPair &= SHORT_PRIMARY_MASK; |
| 152 | break; |
| 153 | } else if(leftPair > variableTop) { |
| 154 | leftPair &= LONG_PRIMARY_MASK; |
| 155 | break; |
| 156 | } else { |
| 157 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
| 158 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 159 | leftPair = getPrimaries(variableTop, leftPair); |
| 160 | } |
| 161 | } |
| 162 | |
| 163 | while(rightPair == 0) { |
| 164 | if(rightIndex == rightLength) { |
| 165 | rightPair = EOS; |
| 166 | break; |
| 167 | } |
| 168 | UChar32 c = right[rightIndex++]; |
| 169 | if(c <= LATIN_MAX) { |
| 170 | rightPair = primaries[c]; |
| 171 | if(rightPair != 0) { break; } |
| 172 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
| 173 | return BAIL_OUT_RESULT; |
| 174 | } |
| 175 | rightPair = table[c]; |
| 176 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 177 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 178 | } else { |
| 179 | rightPair = lookup(table, c); |
| 180 | } |
| 181 | if(rightPair >= MIN_SHORT) { |
| 182 | rightPair &= SHORT_PRIMARY_MASK; |
| 183 | break; |
| 184 | } else if(rightPair > variableTop) { |
| 185 | rightPair &= LONG_PRIMARY_MASK; |
| 186 | break; |
| 187 | } else { |
| 188 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
| 189 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 190 | rightPair = getPrimaries(variableTop, rightPair); |
| 191 | } |
| 192 | } |
| 193 | |
| 194 | if(leftPair == rightPair) { |
| 195 | if(leftPair == EOS) { break; } |
| 196 | leftPair = rightPair = 0; |
| 197 | continue; |
| 198 | } |
| 199 | uint32_t leftPrimary = leftPair & 0xffff; |
| 200 | uint32_t rightPrimary = rightPair & 0xffff; |
| 201 | if(leftPrimary != rightPrimary) { |
| 202 | // Return the primary difference. |
| 203 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
| 204 | } |
| 205 | if(leftPair == EOS) { break; } |
| 206 | leftPair >>= 16; |
| 207 | rightPair >>= 16; |
| 208 | } |
| 209 | // In the following, we need to re-fetch each character because we did not buffer the CEs, |
| 210 | // but we know that the string is well-formed and |
| 211 | // only contains supported characters and mappings. |
| 212 | |
| 213 | // We might skip the secondary level but continue with the case level |
| 214 | // which is turned on separately. |
| 215 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
| 216 | leftIndex = rightIndex = 0; |
| 217 | leftPair = rightPair = 0; |
| 218 | for(;;) { |
| 219 | while(leftPair == 0) { |
| 220 | if(leftIndex == leftLength) { |
| 221 | leftPair = EOS; |
| 222 | break; |
| 223 | } |
| 224 | UChar32 c = left[leftIndex++]; |
| 225 | if(c <= LATIN_MAX) { |
| 226 | leftPair = table[c]; |
| 227 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 228 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 229 | } else { |
| 230 | leftPair = lookup(table, c); |
| 231 | } |
| 232 | if(leftPair >= MIN_SHORT) { |
| 233 | leftPair = getSecondariesFromOneShortCE(leftPair); |
| 234 | break; |
| 235 | } else if(leftPair > variableTop) { |
| 236 | leftPair = COMMON_SEC_PLUS_OFFSET; |
| 237 | break; |
| 238 | } else { |
| 239 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
| 240 | leftPair = getSecondaries(variableTop, leftPair); |
| 241 | } |
| 242 | } |
| 243 | |
| 244 | while(rightPair == 0) { |
| 245 | if(rightIndex == rightLength) { |
| 246 | rightPair = EOS; |
| 247 | break; |
| 248 | } |
| 249 | UChar32 c = right[rightIndex++]; |
| 250 | if(c <= LATIN_MAX) { |
| 251 | rightPair = table[c]; |
| 252 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 253 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 254 | } else { |
| 255 | rightPair = lookup(table, c); |
| 256 | } |
| 257 | if(rightPair >= MIN_SHORT) { |
| 258 | rightPair = getSecondariesFromOneShortCE(rightPair); |
| 259 | break; |
| 260 | } else if(rightPair > variableTop) { |
| 261 | rightPair = COMMON_SEC_PLUS_OFFSET; |
| 262 | break; |
| 263 | } else { |
| 264 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
| 265 | rightPair = getSecondaries(variableTop, rightPair); |
| 266 | } |
| 267 | } |
| 268 | |
| 269 | if(leftPair == rightPair) { |
| 270 | if(leftPair == EOS) { break; } |
| 271 | leftPair = rightPair = 0; |
| 272 | continue; |
| 273 | } |
| 274 | uint32_t leftSecondary = leftPair & 0xffff; |
| 275 | uint32_t rightSecondary = rightPair & 0xffff; |
| 276 | if(leftSecondary != rightSecondary) { |
| 277 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { |
| 278 | // Full support for backwards secondary requires backwards contraction matching |
| 279 | // and moving backwards between merge separators. |
| 280 | return BAIL_OUT_RESULT; |
| 281 | } |
| 282 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
| 283 | } |
| 284 | if(leftPair == EOS) { break; } |
| 285 | leftPair >>= 16; |
| 286 | rightPair >>= 16; |
| 287 | } |
| 288 | } |
| 289 | |
| 290 | if((options & CollationSettings::CASE_LEVEL) != 0) { |
| 291 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; |
| 292 | leftIndex = rightIndex = 0; |
| 293 | leftPair = rightPair = 0; |
| 294 | for(;;) { |
| 295 | while(leftPair == 0) { |
| 296 | if(leftIndex == leftLength) { |
| 297 | leftPair = EOS; |
| 298 | break; |
| 299 | } |
| 300 | UChar32 c = left[leftIndex++]; |
| 301 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 302 | if(leftPair < MIN_LONG) { |
| 303 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
| 304 | } |
| 305 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); |
| 306 | } |
| 307 | |
| 308 | while(rightPair == 0) { |
| 309 | if(rightIndex == rightLength) { |
| 310 | rightPair = EOS; |
| 311 | break; |
| 312 | } |
| 313 | UChar32 c = right[rightIndex++]; |
| 314 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 315 | if(rightPair < MIN_LONG) { |
| 316 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
| 317 | } |
| 318 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); |
| 319 | } |
| 320 | |
| 321 | if(leftPair == rightPair) { |
| 322 | if(leftPair == EOS) { break; } |
| 323 | leftPair = rightPair = 0; |
| 324 | continue; |
| 325 | } |
| 326 | uint32_t leftCase = leftPair & 0xffff; |
| 327 | uint32_t rightCase = rightPair & 0xffff; |
| 328 | if(leftCase != rightCase) { |
| 329 | if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 330 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
| 331 | } else { |
| 332 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
| 333 | } |
| 334 | } |
| 335 | if(leftPair == EOS) { break; } |
| 336 | leftPair >>= 16; |
| 337 | rightPair >>= 16; |
| 338 | } |
| 339 | } |
| 340 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } |
| 341 | |
| 342 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. |
| 343 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); |
| 344 | |
| 345 | leftIndex = rightIndex = 0; |
| 346 | leftPair = rightPair = 0; |
| 347 | for(;;) { |
| 348 | while(leftPair == 0) { |
| 349 | if(leftIndex == leftLength) { |
| 350 | leftPair = EOS; |
| 351 | break; |
| 352 | } |
| 353 | UChar32 c = left[leftIndex++]; |
| 354 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 355 | if(leftPair < MIN_LONG) { |
| 356 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
| 357 | } |
| 358 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); |
| 359 | } |
| 360 | |
| 361 | while(rightPair == 0) { |
| 362 | if(rightIndex == rightLength) { |
| 363 | rightPair = EOS; |
| 364 | break; |
| 365 | } |
| 366 | UChar32 c = right[rightIndex++]; |
| 367 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 368 | if(rightPair < MIN_LONG) { |
| 369 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
| 370 | } |
| 371 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); |
| 372 | } |
| 373 | |
| 374 | if(leftPair == rightPair) { |
| 375 | if(leftPair == EOS) { break; } |
| 376 | leftPair = rightPair = 0; |
| 377 | continue; |
| 378 | } |
| 379 | uint32_t leftTertiary = leftPair & 0xffff; |
| 380 | uint32_t rightTertiary = rightPair & 0xffff; |
| 381 | if(leftTertiary != rightTertiary) { |
| 382 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
| 383 | // Pass through EOS and MERGE_WEIGHT |
| 384 | // and keep real tertiary weights larger than the MERGE_WEIGHT. |
| 385 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. |
| 386 | if(leftTertiary > MERGE_WEIGHT) { |
| 387 | leftTertiary ^= CASE_MASK; |
| 388 | } |
| 389 | if(rightTertiary > MERGE_WEIGHT) { |
| 390 | rightTertiary ^= CASE_MASK; |
| 391 | } |
| 392 | } |
| 393 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
| 394 | } |
| 395 | if(leftPair == EOS) { break; } |
| 396 | leftPair >>= 16; |
| 397 | rightPair >>= 16; |
| 398 | } |
| 399 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } |
| 400 | |
| 401 | leftIndex = rightIndex = 0; |
| 402 | leftPair = rightPair = 0; |
| 403 | for(;;) { |
| 404 | while(leftPair == 0) { |
| 405 | if(leftIndex == leftLength) { |
| 406 | leftPair = EOS; |
| 407 | break; |
| 408 | } |
| 409 | UChar32 c = left[leftIndex++]; |
| 410 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 411 | if(leftPair < MIN_LONG) { |
| 412 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
| 413 | } |
| 414 | leftPair = getQuaternaries(variableTop, leftPair); |
| 415 | } |
| 416 | |
| 417 | while(rightPair == 0) { |
| 418 | if(rightIndex == rightLength) { |
| 419 | rightPair = EOS; |
| 420 | break; |
| 421 | } |
| 422 | UChar32 c = right[rightIndex++]; |
| 423 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 424 | if(rightPair < MIN_LONG) { |
| 425 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
| 426 | } |
| 427 | rightPair = getQuaternaries(variableTop, rightPair); |
| 428 | } |
| 429 | |
| 430 | if(leftPair == rightPair) { |
| 431 | if(leftPair == EOS) { break; } |
| 432 | leftPair = rightPair = 0; |
| 433 | continue; |
| 434 | } |
| 435 | uint32_t leftQuaternary = leftPair & 0xffff; |
| 436 | uint32_t rightQuaternary = rightPair & 0xffff; |
| 437 | if(leftQuaternary != rightQuaternary) { |
| 438 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; |
| 439 | } |
| 440 | if(leftPair == EOS) { break; } |
| 441 | leftPair >>= 16; |
| 442 | rightPair >>= 16; |
| 443 | } |
| 444 | return UCOL_EQUAL; |
| 445 | } |
| 446 | |
| 447 | int32_t |
| 448 | CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, |
| 449 | const uint8_t *left, int32_t leftLength, |
| 450 | const uint8_t *right, int32_t rightLength) { |
| 451 | // Keep compareUTF16() and compareUTF8() in sync very closely! |
| 452 | |
| 453 | U_ASSERT((table[0] >> 8) == VERSION); |
| 454 | table += (table[0] & 0xff); // skip the header |
| 455 | uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() |
| 456 | options &= 0xffff; // needed for CollationSettings::getStrength() to work |
| 457 | |
| 458 | // Check for supported characters, fetch mini CEs, and compare primaries. |
| 459 | int32_t leftIndex = 0, rightIndex = 0; |
| 460 | /** |
| 461 | * Single mini CE or a pair. |
| 462 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. |
| 463 | * If there is only one, then it is in the lower bits, and the upper bits are 0. |
| 464 | */ |
| 465 | uint32_t leftPair = 0, rightPair = 0; |
| 466 | // Note: There is no need to assemble the code point. |
| 467 | // We only need to look up the table entry for the character, |
| 468 | // and nextPair() looks for whether c==0. |
| 469 | for(;;) { |
| 470 | // We fetch CEs until we get a non-ignorable primary or reach the end. |
| 471 | while(leftPair == 0) { |
| 472 | if(leftIndex == leftLength) { |
| 473 | leftPair = EOS; |
| 474 | break; |
| 475 | } |
| 476 | UChar32 c = left[leftIndex++]; |
| 477 | uint8_t t; |
| 478 | if(c <= 0x7f) { |
| 479 | leftPair = primaries[c]; |
| 480 | if(leftPair != 0) { break; } |
| 481 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
| 482 | return BAIL_OUT_RESULT; |
| 483 | } |
| 484 | leftPair = table[c]; |
| 485 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && |
| 486 | 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { |
| 487 | ++leftIndex; |
| 488 | c = ((c - 0xc2) << 6) + t; |
| 489 | leftPair = primaries[c]; |
| 490 | if(leftPair != 0) { break; } |
| 491 | leftPair = table[c]; |
| 492 | } else { |
| 493 | leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); |
| 494 | } |
| 495 | if(leftPair >= MIN_SHORT) { |
| 496 | leftPair &= SHORT_PRIMARY_MASK; |
| 497 | break; |
| 498 | } else if(leftPair > variableTop) { |
| 499 | leftPair &= LONG_PRIMARY_MASK; |
| 500 | break; |
| 501 | } else { |
| 502 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
| 503 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 504 | leftPair = getPrimaries(variableTop, leftPair); |
| 505 | } |
| 506 | } |
| 507 | |
| 508 | while(rightPair == 0) { |
| 509 | if(rightIndex == rightLength) { |
| 510 | rightPair = EOS; |
| 511 | break; |
| 512 | } |
| 513 | UChar32 c = right[rightIndex++]; |
| 514 | uint8_t t; |
| 515 | if(c <= 0x7f) { |
| 516 | rightPair = primaries[c]; |
| 517 | if(rightPair != 0) { break; } |
| 518 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
| 519 | return BAIL_OUT_RESULT; |
| 520 | } |
| 521 | rightPair = table[c]; |
| 522 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && |
| 523 | 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { |
| 524 | ++rightIndex; |
| 525 | c = ((c - 0xc2) << 6) + t; |
| 526 | rightPair = primaries[c]; |
| 527 | if(rightPair != 0) { break; } |
| 528 | rightPair = table[c]; |
| 529 | } else { |
| 530 | rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); |
| 531 | } |
| 532 | if(rightPair >= MIN_SHORT) { |
| 533 | rightPair &= SHORT_PRIMARY_MASK; |
| 534 | break; |
| 535 | } else if(rightPair > variableTop) { |
| 536 | rightPair &= LONG_PRIMARY_MASK; |
| 537 | break; |
| 538 | } else { |
| 539 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
| 540 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 541 | rightPair = getPrimaries(variableTop, rightPair); |
| 542 | } |
| 543 | } |
| 544 | |
| 545 | if(leftPair == rightPair) { |
| 546 | if(leftPair == EOS) { break; } |
| 547 | leftPair = rightPair = 0; |
| 548 | continue; |
| 549 | } |
| 550 | uint32_t leftPrimary = leftPair & 0xffff; |
| 551 | uint32_t rightPrimary = rightPair & 0xffff; |
| 552 | if(leftPrimary != rightPrimary) { |
| 553 | // Return the primary difference. |
| 554 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
| 555 | } |
| 556 | if(leftPair == EOS) { break; } |
| 557 | leftPair >>= 16; |
| 558 | rightPair >>= 16; |
| 559 | } |
| 560 | // In the following, we need to re-fetch each character because we did not buffer the CEs, |
| 561 | // but we know that the string is well-formed and |
| 562 | // only contains supported characters and mappings. |
| 563 | |
| 564 | // We might skip the secondary level but continue with the case level |
| 565 | // which is turned on separately. |
| 566 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
| 567 | leftIndex = rightIndex = 0; |
| 568 | leftPair = rightPair = 0; |
| 569 | for(;;) { |
| 570 | while(leftPair == 0) { |
| 571 | if(leftIndex == leftLength) { |
| 572 | leftPair = EOS; |
| 573 | break; |
| 574 | } |
| 575 | UChar32 c = left[leftIndex++]; |
| 576 | if(c <= 0x7f) { |
| 577 | leftPair = table[c]; |
| 578 | } else if(c <= LATIN_MAX_UTF8_LEAD) { |
| 579 | leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; |
| 580 | } else { |
| 581 | leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); |
| 582 | } |
| 583 | if(leftPair >= MIN_SHORT) { |
| 584 | leftPair = getSecondariesFromOneShortCE(leftPair); |
| 585 | break; |
| 586 | } else if(leftPair > variableTop) { |
| 587 | leftPair = COMMON_SEC_PLUS_OFFSET; |
| 588 | break; |
| 589 | } else { |
| 590 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
| 591 | leftPair = getSecondaries(variableTop, leftPair); |
| 592 | } |
| 593 | } |
| 594 | |
| 595 | while(rightPair == 0) { |
| 596 | if(rightIndex == rightLength) { |
| 597 | rightPair = EOS; |
| 598 | break; |
| 599 | } |
| 600 | UChar32 c = right[rightIndex++]; |
| 601 | if(c <= 0x7f) { |
| 602 | rightPair = table[c]; |
| 603 | } else if(c <= LATIN_MAX_UTF8_LEAD) { |
| 604 | rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; |
| 605 | } else { |
| 606 | rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); |
| 607 | } |
| 608 | if(rightPair >= MIN_SHORT) { |
| 609 | rightPair = getSecondariesFromOneShortCE(rightPair); |
| 610 | break; |
| 611 | } else if(rightPair > variableTop) { |
| 612 | rightPair = COMMON_SEC_PLUS_OFFSET; |
| 613 | break; |
| 614 | } else { |
| 615 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
| 616 | rightPair = getSecondaries(variableTop, rightPair); |
| 617 | } |
| 618 | } |
| 619 | |
| 620 | if(leftPair == rightPair) { |
| 621 | if(leftPair == EOS) { break; } |
| 622 | leftPair = rightPair = 0; |
| 623 | continue; |
| 624 | } |
| 625 | uint32_t leftSecondary = leftPair & 0xffff; |
| 626 | uint32_t rightSecondary = rightPair & 0xffff; |
| 627 | if(leftSecondary != rightSecondary) { |
| 628 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { |
| 629 | // Full support for backwards secondary requires backwards contraction matching |
| 630 | // and moving backwards between merge separators. |
| 631 | return BAIL_OUT_RESULT; |
| 632 | } |
| 633 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
| 634 | } |
| 635 | if(leftPair == EOS) { break; } |
| 636 | leftPair >>= 16; |
| 637 | rightPair >>= 16; |
| 638 | } |
| 639 | } |
| 640 | |
| 641 | if((options & CollationSettings::CASE_LEVEL) != 0) { |
| 642 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; |
| 643 | leftIndex = rightIndex = 0; |
| 644 | leftPair = rightPair = 0; |
| 645 | for(;;) { |
| 646 | while(leftPair == 0) { |
| 647 | if(leftIndex == leftLength) { |
| 648 | leftPair = EOS; |
| 649 | break; |
| 650 | } |
| 651 | UChar32 c = left[leftIndex++]; |
| 652 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); |
| 653 | if(leftPair < MIN_LONG) { |
| 654 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
| 655 | } |
| 656 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); |
| 657 | } |
| 658 | |
| 659 | while(rightPair == 0) { |
| 660 | if(rightIndex == rightLength) { |
| 661 | rightPair = EOS; |
| 662 | break; |
| 663 | } |
| 664 | UChar32 c = right[rightIndex++]; |
| 665 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); |
| 666 | if(rightPair < MIN_LONG) { |
| 667 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
| 668 | } |
| 669 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); |
| 670 | } |
| 671 | |
| 672 | if(leftPair == rightPair) { |
| 673 | if(leftPair == EOS) { break; } |
| 674 | leftPair = rightPair = 0; |
| 675 | continue; |
| 676 | } |
| 677 | uint32_t leftCase = leftPair & 0xffff; |
| 678 | uint32_t rightCase = rightPair & 0xffff; |
| 679 | if(leftCase != rightCase) { |
| 680 | if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 681 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
| 682 | } else { |
| 683 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
| 684 | } |
| 685 | } |
| 686 | if(leftPair == EOS) { break; } |
| 687 | leftPair >>= 16; |
| 688 | rightPair >>= 16; |
| 689 | } |
| 690 | } |
| 691 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } |
| 692 | |
| 693 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. |
| 694 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); |
| 695 | |
| 696 | leftIndex = rightIndex = 0; |
| 697 | leftPair = rightPair = 0; |
| 698 | for(;;) { |
| 699 | while(leftPair == 0) { |
| 700 | if(leftIndex == leftLength) { |
| 701 | leftPair = EOS; |
| 702 | break; |
| 703 | } |
| 704 | UChar32 c = left[leftIndex++]; |
| 705 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); |
| 706 | if(leftPair < MIN_LONG) { |
| 707 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
| 708 | } |
| 709 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); |
| 710 | } |
| 711 | |
| 712 | while(rightPair == 0) { |
| 713 | if(rightIndex == rightLength) { |
| 714 | rightPair = EOS; |
| 715 | break; |
| 716 | } |
| 717 | UChar32 c = right[rightIndex++]; |
| 718 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); |
| 719 | if(rightPair < MIN_LONG) { |
| 720 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
| 721 | } |
| 722 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); |
| 723 | } |
| 724 | |
| 725 | if(leftPair == rightPair) { |
| 726 | if(leftPair == EOS) { break; } |
| 727 | leftPair = rightPair = 0; |
| 728 | continue; |
| 729 | } |
| 730 | uint32_t leftTertiary = leftPair & 0xffff; |
| 731 | uint32_t rightTertiary = rightPair & 0xffff; |
| 732 | if(leftTertiary != rightTertiary) { |
| 733 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
| 734 | // Pass through EOS and MERGE_WEIGHT |
| 735 | // and keep real tertiary weights larger than the MERGE_WEIGHT. |
| 736 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. |
| 737 | if(leftTertiary > MERGE_WEIGHT) { |
| 738 | leftTertiary ^= CASE_MASK; |
| 739 | } |
| 740 | if(rightTertiary > MERGE_WEIGHT) { |
| 741 | rightTertiary ^= CASE_MASK; |
| 742 | } |
| 743 | } |
| 744 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
| 745 | } |
| 746 | if(leftPair == EOS) { break; } |
| 747 | leftPair >>= 16; |
| 748 | rightPair >>= 16; |
| 749 | } |
| 750 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } |
| 751 | |
| 752 | leftIndex = rightIndex = 0; |
| 753 | leftPair = rightPair = 0; |
| 754 | for(;;) { |
| 755 | while(leftPair == 0) { |
| 756 | if(leftIndex == leftLength) { |
| 757 | leftPair = EOS; |
| 758 | break; |
| 759 | } |
| 760 | UChar32 c = left[leftIndex++]; |
| 761 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); |
| 762 | if(leftPair < MIN_LONG) { |
| 763 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
| 764 | } |
| 765 | leftPair = getQuaternaries(variableTop, leftPair); |
| 766 | } |
| 767 | |
| 768 | while(rightPair == 0) { |
| 769 | if(rightIndex == rightLength) { |
| 770 | rightPair = EOS; |
| 771 | break; |
| 772 | } |
| 773 | UChar32 c = right[rightIndex++]; |
| 774 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); |
| 775 | if(rightPair < MIN_LONG) { |
| 776 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
| 777 | } |
| 778 | rightPair = getQuaternaries(variableTop, rightPair); |
| 779 | } |
| 780 | |
| 781 | if(leftPair == rightPair) { |
| 782 | if(leftPair == EOS) { break; } |
| 783 | leftPair = rightPair = 0; |
| 784 | continue; |
| 785 | } |
| 786 | uint32_t leftQuaternary = leftPair & 0xffff; |
| 787 | uint32_t rightQuaternary = rightPair & 0xffff; |
| 788 | if(leftQuaternary != rightQuaternary) { |
| 789 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; |
| 790 | } |
| 791 | if(leftPair == EOS) { break; } |
| 792 | leftPair >>= 16; |
| 793 | rightPair >>= 16; |
| 794 | } |
| 795 | return UCOL_EQUAL; |
| 796 | } |
| 797 | |
| 798 | uint32_t |
| 799 | CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { |
| 800 | U_ASSERT(c > LATIN_MAX); |
| 801 | if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 802 | return table[c - PUNCT_START + LATIN_LIMIT]; |
| 803 | } else if(c == 0xfffe) { |
| 804 | return MERGE_WEIGHT; |
| 805 | } else if(c == 0xffff) { |
| 806 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; |
| 807 | } else { |
| 808 | return BAIL_OUT; |
| 809 | } |
| 810 | } |
| 811 | |
| 812 | uint32_t |
| 813 | CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, |
| 814 | const uint8_t *s8, int32_t &sIndex, int32_t sLength) { |
| 815 | // The caller handled ASCII and valid/supported Latin. |
| 816 | U_ASSERT(c > 0x7f); |
| 817 | int32_t i2 = sIndex + 1; |
| 818 | if(i2 < sLength || sLength < 0) { |
| 819 | uint8_t t1 = s8[sIndex]; |
| 820 | uint8_t t2 = s8[i2]; |
| 821 | sIndex += 2; |
| 822 | if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { |
| 823 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF |
| 824 | } else if(c == 0xef && t1 == 0xbf) { |
| 825 | if(t2 == 0xbe) { |
| 826 | return MERGE_WEIGHT; // U+FFFE |
| 827 | } else if(t2 == 0xbf) { |
| 828 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF |
| 829 | } |
| 830 | } |
| 831 | } |
| 832 | return BAIL_OUT; |
| 833 | } |
| 834 | |
| 835 | uint32_t |
| 836 | CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, |
| 837 | const uint8_t *s8, int32_t &sIndex) { |
| 838 | // The caller handled ASCII. |
| 839 | // The string is well-formed and contains only supported characters. |
| 840 | U_ASSERT(c > 0x7f); |
| 841 | if(c <= LATIN_MAX_UTF8_LEAD) { |
| 842 | return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F |
| 843 | } |
| 844 | uint8_t t2 = s8[sIndex + 1]; |
| 845 | sIndex += 2; |
| 846 | if(c == 0xe2) { |
| 847 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF |
| 848 | } else if(t2 == 0xbe) { |
| 849 | return MERGE_WEIGHT; // U+FFFE |
| 850 | } else { |
| 851 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF |
| 852 | } |
| 853 | } |
| 854 | |
| 855 | uint32_t |
| 856 | CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, |
| 857 | const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { |
| 858 | if(ce >= MIN_LONG || ce < CONTRACTION) { |
| 859 | return ce; // simple or special mini CE |
| 860 | } else if(ce >= EXPANSION) { |
| 861 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); |
| 862 | return ((uint32_t)table[index + 1] << 16) | table[index]; |
| 863 | } else /* ce >= CONTRACTION */ { |
| 864 | if(c == 0 && sLength < 0) { |
| 865 | sLength = sIndex - 1; |
| 866 | return EOS; |
| 867 | } |
| 868 | // Contraction list: Default mapping followed by |
| 869 | // 0 or more single-character contraction suffix mappings. |
| 870 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); |
| 871 | if(sIndex != sLength) { |
| 872 | // Read the next character. |
| 873 | int32_t c2; |
| 874 | int32_t nextIndex = sIndex; |
| 875 | if(s16 != NULL) { |
| 876 | c2 = s16[nextIndex++]; |
| 877 | if(c2 > LATIN_MAX) { |
| 878 | if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { |
| 879 | c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF |
| 880 | } else if(c2 == 0xfffe || c2 == 0xffff) { |
| 881 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. |
| 882 | } else { |
| 883 | return BAIL_OUT; |
| 884 | } |
| 885 | } |
| 886 | } else { |
| 887 | c2 = s8[nextIndex++]; |
| 888 | if(c2 > 0x7f) { |
| 889 | uint8_t t; |
| 890 | if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && |
| 891 | 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { |
| 892 | c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F |
| 893 | ++nextIndex; |
| 894 | } else { |
| 895 | int32_t i2 = nextIndex + 1; |
| 896 | if(i2 < sLength || sLength < 0) { |
| 897 | if(c2 == 0xe2 && s8[nextIndex] == 0x80 && |
| 898 | 0x80 <= (t = s8[i2]) && t <= 0xbf) { |
| 899 | c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF |
| 900 | } else if(c2 == 0xef && s8[nextIndex] == 0xbf && |
| 901 | ((t = s8[i2]) == 0xbe || t == 0xbf)) { |
| 902 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. |
| 903 | } else { |
| 904 | return BAIL_OUT; |
| 905 | } |
| 906 | } else { |
| 907 | return BAIL_OUT; |
| 908 | } |
| 909 | nextIndex += 2; |
| 910 | } |
| 911 | } |
| 912 | } |
| 913 | if(c2 == 0 && sLength < 0) { |
| 914 | sLength = sIndex; |
| 915 | c2 = -1; |
| 916 | } |
| 917 | // Look for the next character in the contraction suffix list, |
| 918 | // which is in ascending order of single suffix characters. |
| 919 | int32_t i = index; |
| 920 | int32_t head = table[i]; // first skip the default mapping |
| 921 | int32_t x; |
| 922 | do { |
| 923 | i += head >> CONTR_LENGTH_SHIFT; |
| 924 | head = table[i]; |
| 925 | x = head & CONTR_CHAR_MASK; |
| 926 | } while(x < c2); |
| 927 | if(x == c2) { |
| 928 | index = i; |
| 929 | sIndex = nextIndex; |
| 930 | } |
| 931 | } |
| 932 | // Return the CE or CEs for the default or contraction mapping. |
| 933 | int32_t length = table[index] >> CONTR_LENGTH_SHIFT; |
| 934 | if(length == 1) { |
| 935 | return BAIL_OUT; |
| 936 | } |
| 937 | ce = table[index + 1]; |
| 938 | if(length == 2) { |
| 939 | return ce; |
| 940 | } else { |
| 941 | return ((uint32_t)table[index + 2] << 16) | ce; |
| 942 | } |
| 943 | } |
| 944 | } |
| 945 | |
| 946 | uint32_t |
| 947 | CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { |
| 948 | if(pair <= 0xffff) { |
| 949 | // one mini CE |
| 950 | if(pair >= MIN_SHORT) { |
| 951 | pair = getSecondariesFromOneShortCE(pair); |
| 952 | } else if(pair > variableTop) { |
| 953 | pair = COMMON_SEC_PLUS_OFFSET; |
| 954 | } else if(pair >= MIN_LONG) { |
| 955 | pair = 0; // variable |
| 956 | } |
| 957 | // else special mini CE |
| 958 | } else { |
| 959 | uint32_t ce = pair & 0xffff; |
| 960 | if(ce >= MIN_SHORT) { |
| 961 | pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; |
| 962 | } else if(ce > variableTop) { |
| 963 | pair = TWO_COMMON_SEC_PLUS_OFFSET; |
| 964 | } else { |
| 965 | U_ASSERT(ce >= MIN_LONG); |
| 966 | pair = 0; // variable |
| 967 | } |
| 968 | } |
| 969 | return pair; |
| 970 | } |
| 971 | |
| 972 | uint32_t |
| 973 | CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { |
| 974 | // Primary+caseLevel: Ignore case level weights of primary ignorables. |
| 975 | // Otherwise: Ignore case level weights of secondary ignorables. |
| 976 | // For details see the comments in the CollationCompare class. |
| 977 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. |
| 978 | if(pair <= 0xffff) { |
| 979 | // one mini CE |
| 980 | if(pair >= MIN_SHORT) { |
| 981 | // A high secondary weight means we really have two CEs, |
| 982 | // a primary CE and a secondary CE. |
| 983 | uint32_t ce = pair; |
| 984 | pair &= CASE_MASK; // explicit weight of primary CE |
| 985 | if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 986 | pair |= LOWER_CASE << 16; // implied weight of secondary CE |
| 987 | } |
| 988 | } else if(pair > variableTop) { |
| 989 | pair = LOWER_CASE; |
| 990 | } else if(pair >= MIN_LONG) { |
| 991 | pair = 0; // variable |
| 992 | } |
| 993 | // else special mini CE |
| 994 | } else { |
| 995 | // two mini CEs, same primary groups, neither expands like above |
| 996 | uint32_t ce = pair & 0xffff; |
| 997 | if(ce >= MIN_SHORT) { |
| 998 | if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { |
| 999 | pair &= CASE_MASK; |
| 1000 | } else { |
| 1001 | pair &= TWO_CASES_MASK; |
| 1002 | } |
| 1003 | } else if(ce > variableTop) { |
| 1004 | pair = TWO_LOWER_CASES; |
| 1005 | } else { |
| 1006 | U_ASSERT(ce >= MIN_LONG); |
| 1007 | pair = 0; // variable |
| 1008 | } |
| 1009 | } |
| 1010 | return pair; |
| 1011 | } |
| 1012 | |
| 1013 | uint32_t |
| 1014 | CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { |
| 1015 | if(pair <= 0xffff) { |
| 1016 | // one mini CE |
| 1017 | if(pair >= MIN_SHORT) { |
| 1018 | // A high secondary weight means we really have two CEs, |
| 1019 | // a primary CE and a secondary CE. |
| 1020 | uint32_t ce = pair; |
| 1021 | if(withCaseBits) { |
| 1022 | pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; |
| 1023 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 1024 | pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; |
| 1025 | } |
| 1026 | } else { |
| 1027 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; |
| 1028 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 1029 | pair |= COMMON_TER_PLUS_OFFSET << 16; |
| 1030 | } |
| 1031 | } |
| 1032 | } else if(pair > variableTop) { |
| 1033 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; |
| 1034 | if(withCaseBits) { |
| 1035 | pair |= LOWER_CASE; |
| 1036 | } |
| 1037 | } else if(pair >= MIN_LONG) { |
| 1038 | pair = 0; // variable |
| 1039 | } |
| 1040 | // else special mini CE |
| 1041 | } else { |
| 1042 | // two mini CEs, same primary groups, neither expands like above |
| 1043 | uint32_t ce = pair & 0xffff; |
| 1044 | if(ce >= MIN_SHORT) { |
| 1045 | if(withCaseBits) { |
| 1046 | pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; |
| 1047 | } else { |
| 1048 | pair &= TWO_TERTIARIES_MASK; |
| 1049 | } |
| 1050 | pair += TWO_TER_OFFSETS; |
| 1051 | } else if(ce > variableTop) { |
| 1052 | pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; |
| 1053 | if(withCaseBits) { |
| 1054 | pair |= TWO_LOWER_CASES; |
| 1055 | } |
| 1056 | } else { |
| 1057 | U_ASSERT(ce >= MIN_LONG); |
| 1058 | pair = 0; // variable |
| 1059 | } |
| 1060 | } |
| 1061 | return pair; |
| 1062 | } |
| 1063 | |
| 1064 | uint32_t |
| 1065 | CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { |
| 1066 | // Return the primary weight of a variable CE, |
| 1067 | // or the maximum primary weight for a non-variable, not-completely-ignorable CE. |
| 1068 | if(pair <= 0xffff) { |
| 1069 | // one mini CE |
| 1070 | if(pair >= MIN_SHORT) { |
| 1071 | // A high secondary weight means we really have two CEs, |
| 1072 | // a primary CE and a secondary CE. |
| 1073 | if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 1074 | pair = TWO_SHORT_PRIMARIES_MASK; |
| 1075 | } else { |
| 1076 | pair = SHORT_PRIMARY_MASK; |
| 1077 | } |
| 1078 | } else if(pair > variableTop) { |
| 1079 | pair = SHORT_PRIMARY_MASK; |
| 1080 | } else if(pair >= MIN_LONG) { |
| 1081 | pair &= LONG_PRIMARY_MASK; // variable |
| 1082 | } |
| 1083 | // else special mini CE |
| 1084 | } else { |
| 1085 | // two mini CEs, same primary groups, neither expands like above |
| 1086 | uint32_t ce = pair & 0xffff; |
| 1087 | if(ce > variableTop) { |
| 1088 | pair = TWO_SHORT_PRIMARIES_MASK; |
| 1089 | } else { |
| 1090 | U_ASSERT(ce >= MIN_LONG); |
| 1091 | pair &= TWO_LONG_PRIMARIES_MASK; // variable |
| 1092 | } |
| 1093 | } |
| 1094 | return pair; |
| 1095 | } |
| 1096 | |
| 1097 | U_NAMESPACE_END |
| 1098 | |
| 1099 | #endif // !UCONFIG_NO_COLLATION |
| 1100 | |