| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ******************************************************************************* | 
|---|
| 5 | * Copyright (C) 2013-2015, International Business Machines | 
|---|
| 6 | * Corporation and others.  All Rights Reserved. | 
|---|
| 7 | ******************************************************************************* | 
|---|
| 8 | * collationfastlatin.cpp | 
|---|
| 9 | * | 
|---|
| 10 | * created on: 2013aug18 | 
|---|
| 11 | * created by: Markus W. Scherer | 
|---|
| 12 | */ | 
|---|
| 13 |  | 
|---|
| 14 | #include "unicode/utypes.h" | 
|---|
| 15 |  | 
|---|
| 16 | #if !UCONFIG_NO_COLLATION | 
|---|
| 17 |  | 
|---|
| 18 | #include "unicode/ucol.h" | 
|---|
| 19 | #include "collationdata.h" | 
|---|
| 20 | #include "collationfastlatin.h" | 
|---|
| 21 | #include "collationsettings.h" | 
|---|
| 22 | #include "uassert.h" | 
|---|
| 23 |  | 
|---|
| 24 | U_NAMESPACE_BEGIN | 
|---|
| 25 |  | 
|---|
| 26 | int32_t | 
|---|
| 27 | CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, | 
|---|
| 28 | uint16_t *primaries, int32_t capacity) { | 
|---|
| 29 | const uint16_t *table = data->fastLatinTable; | 
|---|
| 30 | if(table == NULL) { return -1; } | 
|---|
| 31 | U_ASSERT(capacity == LATIN_LIMIT); | 
|---|
| 32 | if(capacity != LATIN_LIMIT) { return -1; } | 
|---|
| 33 |  | 
|---|
| 34 | uint32_t miniVarTop; | 
|---|
| 35 | if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { | 
|---|
| 36 | // No mini primaries are variable, set a variableTop just below the | 
|---|
| 37 | // lowest long mini primary. | 
|---|
| 38 | miniVarTop = MIN_LONG - 1; | 
|---|
| 39 | } else { | 
|---|
| 40 | int32_t  = *table & 0xff; | 
|---|
| 41 | int32_t i = 1 + settings.getMaxVariable(); | 
|---|
| 42 | if(i >= headerLength) { | 
|---|
| 43 | return -1;  // variableTop >= digits, should not occur | 
|---|
| 44 | } | 
|---|
| 45 | miniVarTop = table[i]; | 
|---|
| 46 | } | 
|---|
| 47 |  | 
|---|
| 48 | UBool digitsAreReordered = FALSE; | 
|---|
| 49 | if(settings.hasReordering()) { | 
|---|
| 50 | uint32_t prevStart = 0; | 
|---|
| 51 | uint32_t beforeDigitStart = 0; | 
|---|
| 52 | uint32_t digitStart = 0; | 
|---|
| 53 | uint32_t afterDigitStart = 0; | 
|---|
| 54 | for(int32_t group = UCOL_REORDER_CODE_FIRST; | 
|---|
| 55 | group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; | 
|---|
| 56 | ++group) { | 
|---|
| 57 | uint32_t start = data->getFirstPrimaryForGroup(group); | 
|---|
| 58 | start = settings.reorder(start); | 
|---|
| 59 | if(group == UCOL_REORDER_CODE_DIGIT) { | 
|---|
| 60 | beforeDigitStart = prevStart; | 
|---|
| 61 | digitStart = start; | 
|---|
| 62 | } else if(start != 0) { | 
|---|
| 63 | if(start < prevStart) { | 
|---|
| 64 | // The permutation affects the groups up to Latin. | 
|---|
| 65 | return -1; | 
|---|
| 66 | } | 
|---|
| 67 | // In the future, there might be a special group between digits & Latin. | 
|---|
| 68 | if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { | 
|---|
| 69 | afterDigitStart = start; | 
|---|
| 70 | } | 
|---|
| 71 | prevStart = start; | 
|---|
| 72 | } | 
|---|
| 73 | } | 
|---|
| 74 | uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); | 
|---|
| 75 | latinStart = settings.reorder(latinStart); | 
|---|
| 76 | if(latinStart < prevStart) { | 
|---|
| 77 | return -1; | 
|---|
| 78 | } | 
|---|
| 79 | if(afterDigitStart == 0) { | 
|---|
| 80 | afterDigitStart = latinStart; | 
|---|
| 81 | } | 
|---|
| 82 | if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { | 
|---|
| 83 | digitsAreReordered = TRUE; | 
|---|
| 84 | } | 
|---|
| 85 | } | 
|---|
| 86 |  | 
|---|
| 87 | table += (table[0] & 0xff);  // skip the header | 
|---|
| 88 | for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { | 
|---|
| 89 | uint32_t p = table[c]; | 
|---|
| 90 | if(p >= MIN_SHORT) { | 
|---|
| 91 | p &= SHORT_PRIMARY_MASK; | 
|---|
| 92 | } else if(p > miniVarTop) { | 
|---|
| 93 | p &= LONG_PRIMARY_MASK; | 
|---|
| 94 | } else { | 
|---|
| 95 | p = 0; | 
|---|
| 96 | } | 
|---|
| 97 | primaries[c] = (uint16_t)p; | 
|---|
| 98 | } | 
|---|
| 99 | if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { | 
|---|
| 100 | // Bail out for digits. | 
|---|
| 101 | for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } | 
|---|
| 102 | } | 
|---|
| 103 |  | 
|---|
| 104 | // Shift the miniVarTop above other options. | 
|---|
| 105 | return ((int32_t)miniVarTop << 16) | settings.options; | 
|---|
| 106 | } | 
|---|
| 107 |  | 
|---|
// Compares two UTF-16 strings with the fast-Latin mini-CE table, one collation
// level at a time: primaries first, then (as enabled by the strength and flags
// in `options`) secondaries, case bits, tertiaries and quaternaries.
//
// table      - fast-Latin table; table[0] carries the version in its high byte
//              and the header length in its low byte. The header is skipped below.
// primaries  - per-character mini primaries, indexed by code units <= LATIN_MAX;
//              a 0 entry makes the primary pass fall back to table[c].
// options    - variable top in the upper 16 bits, collation option bits in the
//              lower 16 bits (see getOptions()).
// left/right - the code units to compare; reading stops when the index equals
//              the corresponding length.
//              NOTE(review): the lengths are only tested with ==; presumably a
//              negative length means NUL-terminated input handled inside
//              nextPair() - confirm against the header.
//
// Returns UCOL_LESS or UCOL_GREATER at the first difference, UCOL_EQUAL if no
// difference is found up to the configured strength, or BAIL_OUT_RESULT when
// this fast path cannot decide (an ASCII digit with a zero primaries[] entry
// while NUMERIC is set, nextPair() reporting BAIL_OUT in the primary pass, or
// a secondary difference while BACKWARD_SECONDARY is set).
| 108 | int32_t | 
|---|
| 109 | CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, | 
|---|
| 110 | const UChar *left, int32_t leftLength, | 
|---|
| 111 | const UChar *right, int32_t rightLength) { | 
|---|
| 112 | // This is a modified copy of CollationCompare::compareUpToQuaternary(), | 
|---|
| 113 | // optimized for common Latin text. | 
|---|
| 114 | // Keep them in sync! | 
|---|
| 115 | // Keep compareUTF16() and compareUTF8() in sync very closely! | 
|---|
| 116 |  | 
|---|
| 117 | U_ASSERT((table[0] >> 8) == VERSION); | 
|---|
| 118 | table += (table[0] & 0xff);  // skip the header | 
|---|
| 119 | uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions() | 
|---|
| 120 | options &= 0xffff;  // needed for CollationSettings::getStrength() to work | 
|---|
| 121 |  | 
|---|
| 122 | // Check for supported characters, fetch mini CEs, and compare primaries. | 
|---|
| 123 | int32_t leftIndex = 0, rightIndex = 0; | 
|---|
| 124 | /** | 
|---|
| 125 | * Single mini CE or a pair. | 
|---|
| 126 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | 
|---|
| 127 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | 
|---|
| 128 | */ | 
|---|
| 129 | uint32_t leftPair = 0, rightPair = 0; | 
|---|
// ---- Primary level ----
| 130 | for(;;) { | 
|---|
| 131 | // We fetch CEs until we get a non-ignorable primary or reach the end. | 
|---|
| 132 | while(leftPair == 0) { | 
|---|
| 133 | if(leftIndex == leftLength) { | 
|---|
| 134 | leftPair = EOS; | 
|---|
| 135 | break; | 
|---|
| 136 | } | 
|---|
| 137 | UChar32 c = left[leftIndex++]; | 
|---|
| 138 | if(c <= LATIN_MAX) { | 
|---|
// Use the precomputed primary if there is one (non-zero entry).
| 139 | leftPair = primaries[c]; | 
|---|
| 140 | if(leftPair != 0) { break; } | 
|---|
// A zero entry for an ASCII digit while NUMERIC is set: give up on the fast path.
| 141 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | 
|---|
| 142 | return BAIL_OUT_RESULT; | 
|---|
| 143 | } | 
|---|
| 144 | leftPair = table[c]; | 
|---|
| 145 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | 
|---|
// The [PUNCT_START, PUNCT_LIMIT) entries are indexed right after the LATIN_LIMIT Latin entries.
| 146 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | 
|---|
| 147 | } else { | 
|---|
| 148 | leftPair = lookup(table, c); | 
|---|
| 149 | } | 
|---|
// Classify the mini CE: >= MIN_SHORT is masked as a short primary,
// > variableTop as a long primary; anything else goes through nextPair().
| 150 | if(leftPair >= MIN_SHORT) { | 
|---|
| 151 | leftPair &= SHORT_PRIMARY_MASK; | 
|---|
| 152 | break; | 
|---|
| 153 | } else if(leftPair > variableTop) { | 
|---|
| 154 | leftPair &= LONG_PRIMARY_MASK; | 
|---|
| 155 | break; | 
|---|
| 156 | } else { | 
|---|
| 157 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | 
|---|
| 158 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | 
|---|
| 159 | leftPair = getPrimaries(variableTop, leftPair); | 
|---|
| 160 | } | 
|---|
| 161 | } | 
|---|
| 162 |  | 
|---|
// Same fetch logic for the right string.
| 163 | while(rightPair == 0) { | 
|---|
| 164 | if(rightIndex == rightLength) { | 
|---|
| 165 | rightPair = EOS; | 
|---|
| 166 | break; | 
|---|
| 167 | } | 
|---|
| 168 | UChar32 c = right[rightIndex++]; | 
|---|
| 169 | if(c <= LATIN_MAX) { | 
|---|
| 170 | rightPair = primaries[c]; | 
|---|
| 171 | if(rightPair != 0) { break; } | 
|---|
| 172 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | 
|---|
| 173 | return BAIL_OUT_RESULT; | 
|---|
| 174 | } | 
|---|
| 175 | rightPair = table[c]; | 
|---|
| 176 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | 
|---|
| 177 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | 
|---|
| 178 | } else { | 
|---|
| 179 | rightPair = lookup(table, c); | 
|---|
| 180 | } | 
|---|
| 181 | if(rightPair >= MIN_SHORT) { | 
|---|
| 182 | rightPair &= SHORT_PRIMARY_MASK; | 
|---|
| 183 | break; | 
|---|
| 184 | } else if(rightPair > variableTop) { | 
|---|
| 185 | rightPair &= LONG_PRIMARY_MASK; | 
|---|
| 186 | break; | 
|---|
| 187 | } else { | 
|---|
| 188 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | 
|---|
| 189 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | 
|---|
| 190 | rightPair = getPrimaries(variableTop, rightPair); | 
|---|
| 191 | } | 
|---|
| 192 | } | 
|---|
| 193 |  | 
|---|
// Identical pairs: finished if both are EOS, otherwise fetch fresh pairs on both sides.
| 194 | if(leftPair == rightPair) { | 
|---|
| 195 | if(leftPair == EOS) { break; } | 
|---|
| 196 | leftPair = rightPair = 0; | 
|---|
| 197 | continue; | 
|---|
| 198 | } | 
|---|
// Compare only the current mini CE, which is in the lower 16 bits of each pair.
| 199 | uint32_t leftPrimary = leftPair & 0xffff; | 
|---|
| 200 | uint32_t rightPrimary = rightPair & 0xffff; | 
|---|
| 201 | if(leftPrimary != rightPrimary) { | 
|---|
| 202 | // Return the primary difference. | 
|---|
| 203 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 204 | } | 
|---|
| 205 | if(leftPair == EOS) { break; } | 
|---|
// Current weights are equal: move the second mini CE of each pair into the lower bits.
// A side that had only one mini CE becomes 0 and is refilled at the top of the loop.
| 206 | leftPair >>= 16; | 
|---|
| 207 | rightPair >>= 16; | 
|---|
| 208 | } | 
|---|
| 209 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | 
|---|
| 210 | // but we know that the string is well-formed and | 
|---|
| 211 | // only contains supported characters and mappings. | 
|---|
| 212 |  | 
|---|
| 213 | // We might skip the secondary level but continue with the case level | 
|---|
| 214 | // which is turned on separately. | 
|---|
// ---- Secondary level ----
| 215 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | 
|---|
| 216 | leftIndex = rightIndex = 0; | 
|---|
| 217 | leftPair = rightPair = 0; | 
|---|
| 218 | for(;;) { | 
|---|
| 219 | while(leftPair == 0) { | 
|---|
| 220 | if(leftIndex == leftLength) { | 
|---|
| 221 | leftPair = EOS; | 
|---|
| 222 | break; | 
|---|
| 223 | } | 
|---|
| 224 | UChar32 c = left[leftIndex++]; | 
|---|
| 225 | if(c <= LATIN_MAX) { | 
|---|
| 226 | leftPair = table[c]; | 
|---|
| 227 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | 
|---|
| 228 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; | 
|---|
| 229 | } else { | 
|---|
| 230 | leftPair = lookup(table, c); | 
|---|
| 231 | } | 
|---|
// Short CE: extract its secondary. Long CE above variableTop: use the fixed
// COMMON_SEC_PLUS_OFFSET. Otherwise resolve via nextPair(); its result is not
// checked for BAIL_OUT here, unlike in the primary pass.
| 232 | if(leftPair >= MIN_SHORT) { | 
|---|
| 233 | leftPair = getSecondariesFromOneShortCE(leftPair); | 
|---|
| 234 | break; | 
|---|
| 235 | } else if(leftPair > variableTop) { | 
|---|
| 236 | leftPair = COMMON_SEC_PLUS_OFFSET; | 
|---|
| 237 | break; | 
|---|
| 238 | } else { | 
|---|
| 239 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | 
|---|
| 240 | leftPair = getSecondaries(variableTop, leftPair); | 
|---|
| 241 | } | 
|---|
| 242 | } | 
|---|
| 243 |  | 
|---|
| 244 | while(rightPair == 0) { | 
|---|
| 245 | if(rightIndex == rightLength) { | 
|---|
| 246 | rightPair = EOS; | 
|---|
| 247 | break; | 
|---|
| 248 | } | 
|---|
| 249 | UChar32 c = right[rightIndex++]; | 
|---|
| 250 | if(c <= LATIN_MAX) { | 
|---|
| 251 | rightPair = table[c]; | 
|---|
| 252 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { | 
|---|
| 253 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; | 
|---|
| 254 | } else { | 
|---|
| 255 | rightPair = lookup(table, c); | 
|---|
| 256 | } | 
|---|
| 257 | if(rightPair >= MIN_SHORT) { | 
|---|
| 258 | rightPair = getSecondariesFromOneShortCE(rightPair); | 
|---|
| 259 | break; | 
|---|
| 260 | } else if(rightPair > variableTop) { | 
|---|
| 261 | rightPair = COMMON_SEC_PLUS_OFFSET; | 
|---|
| 262 | break; | 
|---|
| 263 | } else { | 
|---|
| 264 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | 
|---|
| 265 | rightPair = getSecondaries(variableTop, rightPair); | 
|---|
| 266 | } | 
|---|
| 267 | } | 
|---|
| 268 |  | 
|---|
| 269 | if(leftPair == rightPair) { | 
|---|
| 270 | if(leftPair == EOS) { break; } | 
|---|
| 271 | leftPair = rightPair = 0; | 
|---|
| 272 | continue; | 
|---|
| 273 | } | 
|---|
| 274 | uint32_t leftSecondary = leftPair & 0xffff; | 
|---|
| 275 | uint32_t rightSecondary = rightPair & 0xffff; | 
|---|
| 276 | if(leftSecondary != rightSecondary) { | 
|---|
| 277 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | 
|---|
| 278 | // Full support for backwards secondary requires backwards contraction matching | 
|---|
| 279 | // and moving backwards between merge separators. | 
|---|
| 280 | return BAIL_OUT_RESULT; | 
|---|
| 281 | } | 
|---|
| 282 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 283 | } | 
|---|
| 284 | if(leftPair == EOS) { break; } | 
|---|
| 285 | leftPair >>= 16; | 
|---|
| 286 | rightPair >>= 16; | 
|---|
| 287 | } | 
|---|
| 288 | } | 
|---|
| 289 |  | 
|---|
// ---- Case level (only when CASE_LEVEL is set) ----
| 290 | if((options & CollationSettings::CASE_LEVEL) != 0) { | 
|---|
| 291 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | 
|---|
| 292 | leftIndex = rightIndex = 0; | 
|---|
| 293 | leftPair = rightPair = 0; | 
|---|
| 294 | for(;;) { | 
|---|
| 295 | while(leftPair == 0) { | 
|---|
| 296 | if(leftIndex == leftLength) { | 
|---|
| 297 | leftPair = EOS; | 
|---|
| 298 | break; | 
|---|
| 299 | } | 
|---|
| 300 | UChar32 c = left[leftIndex++]; | 
|---|
// From this level on there is no separate punctuation branch:
// every code unit above LATIN_MAX goes through lookup().
| 301 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | 
|---|
// Values below MIN_LONG are not a single mini CE and are resolved by nextPair().
| 302 | if(leftPair < MIN_LONG) { | 
|---|
| 303 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | 
|---|
| 304 | } | 
|---|
| 305 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | 
|---|
| 306 | } | 
|---|
| 307 |  | 
|---|
| 308 | while(rightPair == 0) { | 
|---|
| 309 | if(rightIndex == rightLength) { | 
|---|
| 310 | rightPair = EOS; | 
|---|
| 311 | break; | 
|---|
| 312 | } | 
|---|
| 313 | UChar32 c = right[rightIndex++]; | 
|---|
| 314 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | 
|---|
| 315 | if(rightPair < MIN_LONG) { | 
|---|
| 316 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | 
|---|
| 317 | } | 
|---|
| 318 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | 
|---|
| 319 | } | 
|---|
| 320 |  | 
|---|
| 321 | if(leftPair == rightPair) { | 
|---|
| 322 | if(leftPair == EOS) { break; } | 
|---|
| 323 | leftPair = rightPair = 0; | 
|---|
| 324 | continue; | 
|---|
| 325 | } | 
|---|
| 326 | uint32_t leftCase = leftPair & 0xffff; | 
|---|
| 327 | uint32_t rightCase = rightPair & 0xffff; | 
|---|
| 328 | if(leftCase != rightCase) { | 
|---|
// UPPER_FIRST inverts the ordering of the case weights.
| 329 | if((options & CollationSettings::UPPER_FIRST) == 0) { | 
|---|
| 330 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 331 | } else { | 
|---|
| 332 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | 
|---|
| 333 | } | 
|---|
| 334 | } | 
|---|
| 335 | if(leftPair == EOS) { break; } | 
|---|
| 336 | leftPair >>= 16; | 
|---|
| 337 | rightPair >>= 16; | 
|---|
| 338 | } | 
|---|
| 339 | } | 
|---|
// Stop here unless the strength is above secondary.
| 340 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | 
|---|
| 341 |  | 
|---|
// ---- Tertiary level ----
| 342 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | 
|---|
| 343 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | 
|---|
| 344 |  | 
|---|
| 345 | leftIndex = rightIndex = 0; | 
|---|
| 346 | leftPair = rightPair = 0; | 
|---|
| 347 | for(;;) { | 
|---|
| 348 | while(leftPair == 0) { | 
|---|
| 349 | if(leftIndex == leftLength) { | 
|---|
| 350 | leftPair = EOS; | 
|---|
| 351 | break; | 
|---|
| 352 | } | 
|---|
| 353 | UChar32 c = left[leftIndex++]; | 
|---|
| 354 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | 
|---|
| 355 | if(leftPair < MIN_LONG) { | 
|---|
| 356 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | 
|---|
| 357 | } | 
|---|
| 358 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | 
|---|
| 359 | } | 
|---|
| 360 |  | 
|---|
| 361 | while(rightPair == 0) { | 
|---|
| 362 | if(rightIndex == rightLength) { | 
|---|
| 363 | rightPair = EOS; | 
|---|
| 364 | break; | 
|---|
| 365 | } | 
|---|
| 366 | UChar32 c = right[rightIndex++]; | 
|---|
| 367 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | 
|---|
| 368 | if(rightPair < MIN_LONG) { | 
|---|
| 369 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | 
|---|
| 370 | } | 
|---|
| 371 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | 
|---|
| 372 | } | 
|---|
| 373 |  | 
|---|
| 374 | if(leftPair == rightPair) { | 
|---|
| 375 | if(leftPair == EOS) { break; } | 
|---|
| 376 | leftPair = rightPair = 0; | 
|---|
| 377 | continue; | 
|---|
| 378 | } | 
|---|
| 379 | uint32_t leftTertiary = leftPair & 0xffff; | 
|---|
| 380 | uint32_t rightTertiary = rightPair & 0xffff; | 
|---|
| 381 | if(leftTertiary != rightTertiary) { | 
|---|
| 382 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | 
|---|
| 383 | // Pass through EOS and MERGE_WEIGHT | 
|---|
| 384 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | 
|---|
| 385 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | 
|---|
// Flipping the CASE_MASK bits reverses the case order of the two weights being compared.
| 386 | if(leftTertiary > MERGE_WEIGHT) { | 
|---|
| 387 | leftTertiary ^= CASE_MASK; | 
|---|
| 388 | } | 
|---|
| 389 | if(rightTertiary > MERGE_WEIGHT) { | 
|---|
| 390 | rightTertiary ^= CASE_MASK; | 
|---|
| 391 | } | 
|---|
| 392 | } | 
|---|
| 393 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 394 | } | 
|---|
| 395 | if(leftPair == EOS) { break; } | 
|---|
| 396 | leftPair >>= 16; | 
|---|
| 397 | rightPair >>= 16; | 
|---|
| 398 | } | 
|---|
// Stop here unless the strength is above tertiary.
| 399 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | 
|---|
| 400 |  | 
|---|
// ---- Quaternary level ----
| 401 | leftIndex = rightIndex = 0; | 
|---|
| 402 | leftPair = rightPair = 0; | 
|---|
| 403 | for(;;) { | 
|---|
| 404 | while(leftPair == 0) { | 
|---|
| 405 | if(leftIndex == leftLength) { | 
|---|
| 406 | leftPair = EOS; | 
|---|
| 407 | break; | 
|---|
| 408 | } | 
|---|
| 409 | UChar32 c = left[leftIndex++]; | 
|---|
| 410 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | 
|---|
| 411 | if(leftPair < MIN_LONG) { | 
|---|
| 412 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); | 
|---|
| 413 | } | 
|---|
| 414 | leftPair = getQuaternaries(variableTop, leftPair); | 
|---|
| 415 | } | 
|---|
| 416 |  | 
|---|
| 417 | while(rightPair == 0) { | 
|---|
| 418 | if(rightIndex == rightLength) { | 
|---|
| 419 | rightPair = EOS; | 
|---|
| 420 | break; | 
|---|
| 421 | } | 
|---|
| 422 | UChar32 c = right[rightIndex++]; | 
|---|
| 423 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); | 
|---|
| 424 | if(rightPair < MIN_LONG) { | 
|---|
| 425 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); | 
|---|
| 426 | } | 
|---|
| 427 | rightPair = getQuaternaries(variableTop, rightPair); | 
|---|
| 428 | } | 
|---|
| 429 |  | 
|---|
| 430 | if(leftPair == rightPair) { | 
|---|
| 431 | if(leftPair == EOS) { break; } | 
|---|
| 432 | leftPair = rightPair = 0; | 
|---|
| 433 | continue; | 
|---|
| 434 | } | 
|---|
| 435 | uint32_t leftQuaternary = leftPair & 0xffff; | 
|---|
| 436 | uint32_t rightQuaternary = rightPair & 0xffff; | 
|---|
| 437 | if(leftQuaternary != rightQuaternary) { | 
|---|
| 438 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 439 | } | 
|---|
| 440 | if(leftPair == EOS) { break; } | 
|---|
| 441 | leftPair >>= 16; | 
|---|
| 442 | rightPair >>= 16; | 
|---|
| 443 | } | 
|---|
// No difference found on any enabled level.
| 444 | return UCOL_EQUAL; | 
|---|
| 445 | } | 
|---|
| 446 |  | 
|---|
| 447 | int32_t | 
|---|
| 448 | CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, | 
|---|
| 449 | const uint8_t *left, int32_t leftLength, | 
|---|
| 450 | const uint8_t *right, int32_t rightLength) { | 
|---|
| 451 | // Keep compareUTF16() and compareUTF8() in sync very closely! | 
|---|
| 452 |  | 
|---|
| 453 | U_ASSERT((table[0] >> 8) == VERSION); | 
|---|
| 454 | table += (table[0] & 0xff);  // skip the header | 
|---|
| 455 | uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions() | 
|---|
| 456 | options &= 0xffff;  // needed for CollationSettings::getStrength() to work | 
|---|
| 457 |  | 
|---|
| 458 | // Check for supported characters, fetch mini CEs, and compare primaries. | 
|---|
| 459 | int32_t leftIndex = 0, rightIndex = 0; | 
|---|
| 460 | /** | 
|---|
| 461 | * Single mini CE or a pair. | 
|---|
| 462 | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. | 
|---|
| 463 | * If there is only one, then it is in the lower bits, and the upper bits are 0. | 
|---|
| 464 | */ | 
|---|
| 465 | uint32_t leftPair = 0, rightPair = 0; | 
|---|
| 466 | // Note: There is no need to assemble the code point. | 
|---|
| 467 | // We only need to look up the table entry for the character, | 
|---|
| 468 | // and nextPair() looks for whether c==0. | 
|---|
| 469 | for(;;) { | 
|---|
| 470 | // We fetch CEs until we get a non-ignorable primary or reach the end. | 
|---|
| 471 | while(leftPair == 0) { | 
|---|
| 472 | if(leftIndex == leftLength) { | 
|---|
| 473 | leftPair = EOS; | 
|---|
| 474 | break; | 
|---|
| 475 | } | 
|---|
| 476 | UChar32 c = left[leftIndex++]; | 
|---|
| 477 | uint8_t t; | 
|---|
| 478 | if(c <= 0x7f) { | 
|---|
| 479 | leftPair = primaries[c]; | 
|---|
| 480 | if(leftPair != 0) { break; } | 
|---|
| 481 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | 
|---|
| 482 | return BAIL_OUT_RESULT; | 
|---|
| 483 | } | 
|---|
| 484 | leftPair = table[c]; | 
|---|
| 485 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && | 
|---|
| 486 | 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { | 
|---|
| 487 | ++leftIndex; | 
|---|
| 488 | c = ((c - 0xc2) << 6) + t; | 
|---|
| 489 | leftPair = primaries[c]; | 
|---|
| 490 | if(leftPair != 0) { break; } | 
|---|
| 491 | leftPair = table[c]; | 
|---|
| 492 | } else { | 
|---|
| 493 | leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); | 
|---|
| 494 | } | 
|---|
| 495 | if(leftPair >= MIN_SHORT) { | 
|---|
| 496 | leftPair &= SHORT_PRIMARY_MASK; | 
|---|
| 497 | break; | 
|---|
| 498 | } else if(leftPair > variableTop) { | 
|---|
| 499 | leftPair &= LONG_PRIMARY_MASK; | 
|---|
| 500 | break; | 
|---|
| 501 | } else { | 
|---|
| 502 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | 
|---|
| 503 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | 
|---|
| 504 | leftPair = getPrimaries(variableTop, leftPair); | 
|---|
| 505 | } | 
|---|
| 506 | } | 
|---|
| 507 |  | 
|---|
| 508 | while(rightPair == 0) { | 
|---|
| 509 | if(rightIndex == rightLength) { | 
|---|
| 510 | rightPair = EOS; | 
|---|
| 511 | break; | 
|---|
| 512 | } | 
|---|
| 513 | UChar32 c = right[rightIndex++]; | 
|---|
| 514 | uint8_t t; | 
|---|
| 515 | if(c <= 0x7f) { | 
|---|
| 516 | rightPair = primaries[c]; | 
|---|
| 517 | if(rightPair != 0) { break; } | 
|---|
| 518 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { | 
|---|
| 519 | return BAIL_OUT_RESULT; | 
|---|
| 520 | } | 
|---|
| 521 | rightPair = table[c]; | 
|---|
| 522 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && | 
|---|
| 523 | 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { | 
|---|
| 524 | ++rightIndex; | 
|---|
| 525 | c = ((c - 0xc2) << 6) + t; | 
|---|
| 526 | rightPair = primaries[c]; | 
|---|
| 527 | if(rightPair != 0) { break; } | 
|---|
| 528 | rightPair = table[c]; | 
|---|
| 529 | } else { | 
|---|
| 530 | rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); | 
|---|
| 531 | } | 
|---|
| 532 | if(rightPair >= MIN_SHORT) { | 
|---|
| 533 | rightPair &= SHORT_PRIMARY_MASK; | 
|---|
| 534 | break; | 
|---|
| 535 | } else if(rightPair > variableTop) { | 
|---|
| 536 | rightPair &= LONG_PRIMARY_MASK; | 
|---|
| 537 | break; | 
|---|
| 538 | } else { | 
|---|
| 539 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | 
|---|
| 540 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } | 
|---|
| 541 | rightPair = getPrimaries(variableTop, rightPair); | 
|---|
| 542 | } | 
|---|
| 543 | } | 
|---|
| 544 |  | 
|---|
| 545 | if(leftPair == rightPair) { | 
|---|
| 546 | if(leftPair == EOS) { break; } | 
|---|
| 547 | leftPair = rightPair = 0; | 
|---|
| 548 | continue; | 
|---|
| 549 | } | 
|---|
| 550 | uint32_t leftPrimary = leftPair & 0xffff; | 
|---|
| 551 | uint32_t rightPrimary = rightPair & 0xffff; | 
|---|
| 552 | if(leftPrimary != rightPrimary) { | 
|---|
| 553 | // Return the primary difference. | 
|---|
| 554 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 555 | } | 
|---|
| 556 | if(leftPair == EOS) { break; } | 
|---|
| 557 | leftPair >>= 16; | 
|---|
| 558 | rightPair >>= 16; | 
|---|
| 559 | } | 
|---|
| 560 | // In the following, we need to re-fetch each character because we did not buffer the CEs, | 
|---|
| 561 | // but we know that the string is well-formed and | 
|---|
| 562 | // only contains supported characters and mappings. | 
|---|
| 563 |  | 
|---|
| 564 | // We might skip the secondary level but continue with the case level | 
|---|
| 565 | // which is turned on separately. | 
|---|
| 566 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { | 
|---|
| 567 | leftIndex = rightIndex = 0; | 
|---|
| 568 | leftPair = rightPair = 0; | 
|---|
| 569 | for(;;) { | 
|---|
| 570 | while(leftPair == 0) { | 
|---|
| 571 | if(leftIndex == leftLength) { | 
|---|
| 572 | leftPair = EOS; | 
|---|
| 573 | break; | 
|---|
| 574 | } | 
|---|
| 575 | UChar32 c = left[leftIndex++]; | 
|---|
| 576 | if(c <= 0x7f) { | 
|---|
| 577 | leftPair = table[c]; | 
|---|
| 578 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | 
|---|
| 579 | leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; | 
|---|
| 580 | } else { | 
|---|
| 581 | leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); | 
|---|
| 582 | } | 
|---|
| 583 | if(leftPair >= MIN_SHORT) { | 
|---|
| 584 | leftPair = getSecondariesFromOneShortCE(leftPair); | 
|---|
| 585 | break; | 
|---|
| 586 | } else if(leftPair > variableTop) { | 
|---|
| 587 | leftPair = COMMON_SEC_PLUS_OFFSET; | 
|---|
| 588 | break; | 
|---|
| 589 | } else { | 
|---|
| 590 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | 
|---|
| 591 | leftPair = getSecondaries(variableTop, leftPair); | 
|---|
| 592 | } | 
|---|
| 593 | } | 
|---|
| 594 |  | 
|---|
| 595 | while(rightPair == 0) { | 
|---|
| 596 | if(rightIndex == rightLength) { | 
|---|
| 597 | rightPair = EOS; | 
|---|
| 598 | break; | 
|---|
| 599 | } | 
|---|
| 600 | UChar32 c = right[rightIndex++]; | 
|---|
| 601 | if(c <= 0x7f) { | 
|---|
| 602 | rightPair = table[c]; | 
|---|
| 603 | } else if(c <= LATIN_MAX_UTF8_LEAD) { | 
|---|
| 604 | rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; | 
|---|
| 605 | } else { | 
|---|
| 606 | rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); | 
|---|
| 607 | } | 
|---|
| 608 | if(rightPair >= MIN_SHORT) { | 
|---|
| 609 | rightPair = getSecondariesFromOneShortCE(rightPair); | 
|---|
| 610 | break; | 
|---|
| 611 | } else if(rightPair > variableTop) { | 
|---|
| 612 | rightPair = COMMON_SEC_PLUS_OFFSET; | 
|---|
| 613 | break; | 
|---|
| 614 | } else { | 
|---|
| 615 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | 
|---|
| 616 | rightPair = getSecondaries(variableTop, rightPair); | 
|---|
| 617 | } | 
|---|
| 618 | } | 
|---|
| 619 |  | 
|---|
| 620 | if(leftPair == rightPair) { | 
|---|
| 621 | if(leftPair == EOS) { break; } | 
|---|
| 622 | leftPair = rightPair = 0; | 
|---|
| 623 | continue; | 
|---|
| 624 | } | 
|---|
| 625 | uint32_t leftSecondary = leftPair & 0xffff; | 
|---|
| 626 | uint32_t rightSecondary = rightPair & 0xffff; | 
|---|
| 627 | if(leftSecondary != rightSecondary) { | 
|---|
| 628 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | 
|---|
| 629 | // Full support for backwards secondary requires backwards contraction matching | 
|---|
| 630 | // and moving backwards between merge separators. | 
|---|
| 631 | return BAIL_OUT_RESULT; | 
|---|
| 632 | } | 
|---|
| 633 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 634 | } | 
|---|
| 635 | if(leftPair == EOS) { break; } | 
|---|
| 636 | leftPair >>= 16; | 
|---|
| 637 | rightPair >>= 16; | 
|---|
| 638 | } | 
|---|
| 639 | } | 
|---|
| 640 |  | 
|---|
| 641 | if((options & CollationSettings::CASE_LEVEL) != 0) { | 
|---|
| 642 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; | 
|---|
| 643 | leftIndex = rightIndex = 0; | 
|---|
| 644 | leftPair = rightPair = 0; | 
|---|
| 645 | for(;;) { | 
|---|
| 646 | while(leftPair == 0) { | 
|---|
| 647 | if(leftIndex == leftLength) { | 
|---|
| 648 | leftPair = EOS; | 
|---|
| 649 | break; | 
|---|
| 650 | } | 
|---|
| 651 | UChar32 c = left[leftIndex++]; | 
|---|
| 652 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | 
|---|
| 653 | if(leftPair < MIN_LONG) { | 
|---|
| 654 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | 
|---|
| 655 | } | 
|---|
| 656 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); | 
|---|
| 657 | } | 
|---|
| 658 |  | 
|---|
| 659 | while(rightPair == 0) { | 
|---|
| 660 | if(rightIndex == rightLength) { | 
|---|
| 661 | rightPair = EOS; | 
|---|
| 662 | break; | 
|---|
| 663 | } | 
|---|
| 664 | UChar32 c = right[rightIndex++]; | 
|---|
| 665 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | 
|---|
| 666 | if(rightPair < MIN_LONG) { | 
|---|
| 667 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | 
|---|
| 668 | } | 
|---|
| 669 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); | 
|---|
| 670 | } | 
|---|
| 671 |  | 
|---|
| 672 | if(leftPair == rightPair) { | 
|---|
| 673 | if(leftPair == EOS) { break; } | 
|---|
| 674 | leftPair = rightPair = 0; | 
|---|
| 675 | continue; | 
|---|
| 676 | } | 
|---|
| 677 | uint32_t leftCase = leftPair & 0xffff; | 
|---|
| 678 | uint32_t rightCase = rightPair & 0xffff; | 
|---|
| 679 | if(leftCase != rightCase) { | 
|---|
| 680 | if((options & CollationSettings::UPPER_FIRST) == 0) { | 
|---|
| 681 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 682 | } else { | 
|---|
| 683 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; | 
|---|
| 684 | } | 
|---|
| 685 | } | 
|---|
| 686 | if(leftPair == EOS) { break; } | 
|---|
| 687 | leftPair >>= 16; | 
|---|
| 688 | rightPair >>= 16; | 
|---|
| 689 | } | 
|---|
| 690 | } | 
|---|
| 691 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } | 
|---|
| 692 |  | 
|---|
| 693 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | 
|---|
| 694 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); | 
|---|
| 695 |  | 
|---|
| 696 | leftIndex = rightIndex = 0; | 
|---|
| 697 | leftPair = rightPair = 0; | 
|---|
| 698 | for(;;) { | 
|---|
| 699 | while(leftPair == 0) { | 
|---|
| 700 | if(leftIndex == leftLength) { | 
|---|
| 701 | leftPair = EOS; | 
|---|
| 702 | break; | 
|---|
| 703 | } | 
|---|
| 704 | UChar32 c = left[leftIndex++]; | 
|---|
| 705 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | 
|---|
| 706 | if(leftPair < MIN_LONG) { | 
|---|
| 707 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | 
|---|
| 708 | } | 
|---|
| 709 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); | 
|---|
| 710 | } | 
|---|
| 711 |  | 
|---|
| 712 | while(rightPair == 0) { | 
|---|
| 713 | if(rightIndex == rightLength) { | 
|---|
| 714 | rightPair = EOS; | 
|---|
| 715 | break; | 
|---|
| 716 | } | 
|---|
| 717 | UChar32 c = right[rightIndex++]; | 
|---|
| 718 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | 
|---|
| 719 | if(rightPair < MIN_LONG) { | 
|---|
| 720 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | 
|---|
| 721 | } | 
|---|
| 722 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); | 
|---|
| 723 | } | 
|---|
| 724 |  | 
|---|
| 725 | if(leftPair == rightPair) { | 
|---|
| 726 | if(leftPair == EOS) { break; } | 
|---|
| 727 | leftPair = rightPair = 0; | 
|---|
| 728 | continue; | 
|---|
| 729 | } | 
|---|
| 730 | uint32_t leftTertiary = leftPair & 0xffff; | 
|---|
| 731 | uint32_t rightTertiary = rightPair & 0xffff; | 
|---|
| 732 | if(leftTertiary != rightTertiary) { | 
|---|
| 733 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { | 
|---|
| 734 | // Pass through EOS and MERGE_WEIGHT | 
|---|
| 735 | // and keep real tertiary weights larger than the MERGE_WEIGHT. | 
|---|
| 736 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | 
|---|
| 737 | if(leftTertiary > MERGE_WEIGHT) { | 
|---|
| 738 | leftTertiary ^= CASE_MASK; | 
|---|
| 739 | } | 
|---|
| 740 | if(rightTertiary > MERGE_WEIGHT) { | 
|---|
| 741 | rightTertiary ^= CASE_MASK; | 
|---|
| 742 | } | 
|---|
| 743 | } | 
|---|
| 744 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 745 | } | 
|---|
| 746 | if(leftPair == EOS) { break; } | 
|---|
| 747 | leftPair >>= 16; | 
|---|
| 748 | rightPair >>= 16; | 
|---|
| 749 | } | 
|---|
| 750 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } | 
|---|
| 751 |  | 
|---|
| 752 | leftIndex = rightIndex = 0; | 
|---|
| 753 | leftPair = rightPair = 0; | 
|---|
| 754 | for(;;) { | 
|---|
| 755 | while(leftPair == 0) { | 
|---|
| 756 | if(leftIndex == leftLength) { | 
|---|
| 757 | leftPair = EOS; | 
|---|
| 758 | break; | 
|---|
| 759 | } | 
|---|
| 760 | UChar32 c = left[leftIndex++]; | 
|---|
| 761 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); | 
|---|
| 762 | if(leftPair < MIN_LONG) { | 
|---|
| 763 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); | 
|---|
| 764 | } | 
|---|
| 765 | leftPair = getQuaternaries(variableTop, leftPair); | 
|---|
| 766 | } | 
|---|
| 767 |  | 
|---|
| 768 | while(rightPair == 0) { | 
|---|
| 769 | if(rightIndex == rightLength) { | 
|---|
| 770 | rightPair = EOS; | 
|---|
| 771 | break; | 
|---|
| 772 | } | 
|---|
| 773 | UChar32 c = right[rightIndex++]; | 
|---|
| 774 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); | 
|---|
| 775 | if(rightPair < MIN_LONG) { | 
|---|
| 776 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); | 
|---|
| 777 | } | 
|---|
| 778 | rightPair = getQuaternaries(variableTop, rightPair); | 
|---|
| 779 | } | 
|---|
| 780 |  | 
|---|
| 781 | if(leftPair == rightPair) { | 
|---|
| 782 | if(leftPair == EOS) { break; } | 
|---|
| 783 | leftPair = rightPair = 0; | 
|---|
| 784 | continue; | 
|---|
| 785 | } | 
|---|
| 786 | uint32_t leftQuaternary = leftPair & 0xffff; | 
|---|
| 787 | uint32_t rightQuaternary = rightPair & 0xffff; | 
|---|
| 788 | if(leftQuaternary != rightQuaternary) { | 
|---|
| 789 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; | 
|---|
| 790 | } | 
|---|
| 791 | if(leftPair == EOS) { break; } | 
|---|
| 792 | leftPair >>= 16; | 
|---|
| 793 | rightPair >>= 16; | 
|---|
| 794 | } | 
|---|
| 795 | return UCOL_EQUAL; | 
|---|
| 796 | } | 
|---|
| 797 |  | 
|---|
| 798 | uint32_t | 
|---|
| 799 | CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { | 
|---|
| 800 | U_ASSERT(c > LATIN_MAX); | 
|---|
| 801 | if(PUNCT_START <= c && c < PUNCT_LIMIT) { | 
|---|
| 802 | return table[c - PUNCT_START + LATIN_LIMIT]; | 
|---|
| 803 | } else if(c == 0xfffe) { | 
|---|
| 804 | return MERGE_WEIGHT; | 
|---|
| 805 | } else if(c == 0xffff) { | 
|---|
| 806 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; | 
|---|
| 807 | } else { | 
|---|
| 808 | return BAIL_OUT; | 
|---|
| 809 | } | 
|---|
| 810 | } | 
|---|
| 811 |  | 
|---|
| 812 | uint32_t | 
|---|
| 813 | CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, | 
|---|
| 814 | const uint8_t *s8, int32_t &sIndex, int32_t sLength) { | 
|---|
| 815 | // The caller handled ASCII and valid/supported Latin. | 
|---|
| 816 | U_ASSERT(c > 0x7f); | 
|---|
| 817 | int32_t i2 = sIndex + 1; | 
|---|
| 818 | if(i2 < sLength || sLength < 0) { | 
|---|
| 819 | uint8_t t1 = s8[sIndex]; | 
|---|
| 820 | uint8_t t2 = s8[i2]; | 
|---|
| 821 | sIndex += 2; | 
|---|
| 822 | if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { | 
|---|
| 823 | return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF | 
|---|
| 824 | } else if(c == 0xef && t1 == 0xbf) { | 
|---|
| 825 | if(t2 == 0xbe) { | 
|---|
| 826 | return MERGE_WEIGHT;  // U+FFFE | 
|---|
| 827 | } else if(t2 == 0xbf) { | 
|---|
| 828 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF | 
|---|
| 829 | } | 
|---|
| 830 | } | 
|---|
| 831 | } | 
|---|
| 832 | return BAIL_OUT; | 
|---|
| 833 | } | 
|---|
| 834 |  | 
|---|
| 835 | uint32_t | 
|---|
| 836 | CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, | 
|---|
| 837 | const uint8_t *s8, int32_t &sIndex) { | 
|---|
| 838 | // The caller handled ASCII. | 
|---|
| 839 | // The string is well-formed and contains only supported characters. | 
|---|
| 840 | U_ASSERT(c > 0x7f); | 
|---|
| 841 | if(c <= LATIN_MAX_UTF8_LEAD) { | 
|---|
| 842 | return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F | 
|---|
| 843 | } | 
|---|
| 844 | uint8_t t2 = s8[sIndex + 1]; | 
|---|
| 845 | sIndex += 2; | 
|---|
| 846 | if(c == 0xe2) { | 
|---|
| 847 | return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF | 
|---|
| 848 | } else if(t2 == 0xbe) { | 
|---|
| 849 | return MERGE_WEIGHT;  // U+FFFE | 
|---|
| 850 | } else { | 
|---|
| 851 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF | 
|---|
| 852 | } | 
|---|
| 853 | } | 
|---|
| 854 |  | 
|---|
| 855 | uint32_t | 
|---|
| 856 | CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, | 
|---|
| 857 | const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { | 
|---|
| 858 | if(ce >= MIN_LONG || ce < CONTRACTION) { | 
|---|
| 859 | return ce;  // simple or special mini CE | 
|---|
| 860 | } else if(ce >= EXPANSION) { | 
|---|
| 861 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | 
|---|
| 862 | return ((uint32_t)table[index + 1] << 16) | table[index]; | 
|---|
| 863 | } else /* ce >= CONTRACTION */ { | 
|---|
| 864 | if(c == 0 && sLength < 0) { | 
|---|
| 865 | sLength = sIndex - 1; | 
|---|
| 866 | return EOS; | 
|---|
| 867 | } | 
|---|
| 868 | // Contraction list: Default mapping followed by | 
|---|
| 869 | // 0 or more single-character contraction suffix mappings. | 
|---|
| 870 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); | 
|---|
| 871 | if(sIndex != sLength) { | 
|---|
| 872 | // Read the next character. | 
|---|
| 873 | int32_t c2; | 
|---|
| 874 | int32_t nextIndex = sIndex; | 
|---|
| 875 | if(s16 != NULL) { | 
|---|
| 876 | c2 = s16[nextIndex++]; | 
|---|
| 877 | if(c2 > LATIN_MAX) { | 
|---|
| 878 | if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { | 
|---|
| 879 | c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF | 
|---|
| 880 | } else if(c2 == 0xfffe || c2 == 0xffff) { | 
|---|
| 881 | c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions. | 
|---|
| 882 | } else { | 
|---|
| 883 | return BAIL_OUT; | 
|---|
| 884 | } | 
|---|
| 885 | } | 
|---|
| 886 | } else { | 
|---|
| 887 | c2 = s8[nextIndex++]; | 
|---|
| 888 | if(c2 > 0x7f) { | 
|---|
| 889 | uint8_t t; | 
|---|
| 890 | if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && | 
|---|
| 891 | 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { | 
|---|
| 892 | c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F | 
|---|
| 893 | ++nextIndex; | 
|---|
| 894 | } else { | 
|---|
| 895 | int32_t i2 = nextIndex + 1; | 
|---|
| 896 | if(i2 < sLength || sLength < 0) { | 
|---|
| 897 | if(c2 == 0xe2 && s8[nextIndex] == 0x80 && | 
|---|
| 898 | 0x80 <= (t = s8[i2]) && t <= 0xbf) { | 
|---|
| 899 | c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF | 
|---|
| 900 | } else if(c2 == 0xef && s8[nextIndex] == 0xbf && | 
|---|
| 901 | ((t = s8[i2]) == 0xbe || t == 0xbf)) { | 
|---|
| 902 | c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions. | 
|---|
| 903 | } else { | 
|---|
| 904 | return BAIL_OUT; | 
|---|
| 905 | } | 
|---|
| 906 | } else { | 
|---|
| 907 | return BAIL_OUT; | 
|---|
| 908 | } | 
|---|
| 909 | nextIndex += 2; | 
|---|
| 910 | } | 
|---|
| 911 | } | 
|---|
| 912 | } | 
|---|
| 913 | if(c2 == 0 && sLength < 0) { | 
|---|
| 914 | sLength = sIndex; | 
|---|
| 915 | c2 = -1; | 
|---|
| 916 | } | 
|---|
| 917 | // Look for the next character in the contraction suffix list, | 
|---|
| 918 | // which is in ascending order of single suffix characters. | 
|---|
| 919 | int32_t i = index; | 
|---|
| 920 | int32_t head = table[i];  // first skip the default mapping | 
|---|
| 921 | int32_t x; | 
|---|
| 922 | do { | 
|---|
| 923 | i += head >> CONTR_LENGTH_SHIFT; | 
|---|
| 924 | head = table[i]; | 
|---|
| 925 | x = head & CONTR_CHAR_MASK; | 
|---|
| 926 | } while(x < c2); | 
|---|
| 927 | if(x == c2) { | 
|---|
| 928 | index = i; | 
|---|
| 929 | sIndex = nextIndex; | 
|---|
| 930 | } | 
|---|
| 931 | } | 
|---|
| 932 | // Return the CE or CEs for the default or contraction mapping. | 
|---|
| 933 | int32_t length = table[index] >> CONTR_LENGTH_SHIFT; | 
|---|
| 934 | if(length == 1) { | 
|---|
| 935 | return BAIL_OUT; | 
|---|
| 936 | } | 
|---|
| 937 | ce = table[index + 1]; | 
|---|
| 938 | if(length == 2) { | 
|---|
| 939 | return ce; | 
|---|
| 940 | } else { | 
|---|
| 941 | return ((uint32_t)table[index + 2] << 16) | ce; | 
|---|
| 942 | } | 
|---|
| 943 | } | 
|---|
| 944 | } | 
|---|
| 945 |  | 
|---|
| 946 | uint32_t | 
|---|
| 947 | CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { | 
|---|
| 948 | if(pair <= 0xffff) { | 
|---|
| 949 | // one mini CE | 
|---|
| 950 | if(pair >= MIN_SHORT) { | 
|---|
| 951 | pair = getSecondariesFromOneShortCE(pair); | 
|---|
| 952 | } else if(pair > variableTop) { | 
|---|
| 953 | pair = COMMON_SEC_PLUS_OFFSET; | 
|---|
| 954 | } else if(pair >= MIN_LONG) { | 
|---|
| 955 | pair = 0;  // variable | 
|---|
| 956 | } | 
|---|
| 957 | // else special mini CE | 
|---|
| 958 | } else { | 
|---|
| 959 | uint32_t ce = pair & 0xffff; | 
|---|
| 960 | if(ce >= MIN_SHORT) { | 
|---|
| 961 | pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; | 
|---|
| 962 | } else if(ce > variableTop) { | 
|---|
| 963 | pair = TWO_COMMON_SEC_PLUS_OFFSET; | 
|---|
| 964 | } else { | 
|---|
| 965 | U_ASSERT(ce >= MIN_LONG); | 
|---|
| 966 | pair = 0;  // variable | 
|---|
| 967 | } | 
|---|
| 968 | } | 
|---|
| 969 | return pair; | 
|---|
| 970 | } | 
|---|
| 971 |  | 
|---|
| 972 | uint32_t | 
|---|
| 973 | CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { | 
|---|
| 974 | // Primary+caseLevel: Ignore case level weights of primary ignorables. | 
|---|
| 975 | // Otherwise: Ignore case level weights of secondary ignorables. | 
|---|
| 976 | // For details see the comments in the CollationCompare class. | 
|---|
| 977 | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. | 
|---|
| 978 | if(pair <= 0xffff) { | 
|---|
| 979 | // one mini CE | 
|---|
| 980 | if(pair >= MIN_SHORT) { | 
|---|
| 981 | // A high secondary weight means we really have two CEs, | 
|---|
| 982 | // a primary CE and a secondary CE. | 
|---|
| 983 | uint32_t ce = pair; | 
|---|
| 984 | pair &= CASE_MASK;  // explicit weight of primary CE | 
|---|
| 985 | if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | 
|---|
| 986 | pair |= LOWER_CASE << 16;  // implied weight of secondary CE | 
|---|
| 987 | } | 
|---|
| 988 | } else if(pair > variableTop) { | 
|---|
| 989 | pair = LOWER_CASE; | 
|---|
| 990 | } else if(pair >= MIN_LONG) { | 
|---|
| 991 | pair = 0;  // variable | 
|---|
| 992 | } | 
|---|
| 993 | // else special mini CE | 
|---|
| 994 | } else { | 
|---|
| 995 | // two mini CEs, same primary groups, neither expands like above | 
|---|
| 996 | uint32_t ce = pair & 0xffff; | 
|---|
| 997 | if(ce >= MIN_SHORT) { | 
|---|
| 998 | if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { | 
|---|
| 999 | pair &= CASE_MASK; | 
|---|
| 1000 | } else { | 
|---|
| 1001 | pair &= TWO_CASES_MASK; | 
|---|
| 1002 | } | 
|---|
| 1003 | } else if(ce > variableTop) { | 
|---|
| 1004 | pair = TWO_LOWER_CASES; | 
|---|
| 1005 | } else { | 
|---|
| 1006 | U_ASSERT(ce >= MIN_LONG); | 
|---|
| 1007 | pair = 0;  // variable | 
|---|
| 1008 | } | 
|---|
| 1009 | } | 
|---|
| 1010 | return pair; | 
|---|
| 1011 | } | 
|---|
| 1012 |  | 
|---|
| 1013 | uint32_t | 
|---|
| 1014 | CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { | 
|---|
| 1015 | if(pair <= 0xffff) { | 
|---|
| 1016 | // one mini CE | 
|---|
| 1017 | if(pair >= MIN_SHORT) { | 
|---|
| 1018 | // A high secondary weight means we really have two CEs, | 
|---|
| 1019 | // a primary CE and a secondary CE. | 
|---|
| 1020 | uint32_t ce = pair; | 
|---|
| 1021 | if(withCaseBits) { | 
|---|
| 1022 | pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; | 
|---|
| 1023 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | 
|---|
| 1024 | pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; | 
|---|
| 1025 | } | 
|---|
| 1026 | } else { | 
|---|
| 1027 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | 
|---|
| 1028 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { | 
|---|
| 1029 | pair |= COMMON_TER_PLUS_OFFSET << 16; | 
|---|
| 1030 | } | 
|---|
| 1031 | } | 
|---|
| 1032 | } else if(pair > variableTop) { | 
|---|
| 1033 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; | 
|---|
| 1034 | if(withCaseBits) { | 
|---|
| 1035 | pair |= LOWER_CASE; | 
|---|
| 1036 | } | 
|---|
| 1037 | } else if(pair >= MIN_LONG) { | 
|---|
| 1038 | pair = 0;  // variable | 
|---|
| 1039 | } | 
|---|
| 1040 | // else special mini CE | 
|---|
| 1041 | } else { | 
|---|
| 1042 | // two mini CEs, same primary groups, neither expands like above | 
|---|
| 1043 | uint32_t ce = pair & 0xffff; | 
|---|
| 1044 | if(ce >= MIN_SHORT) { | 
|---|
| 1045 | if(withCaseBits) { | 
|---|
| 1046 | pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; | 
|---|
| 1047 | } else { | 
|---|
| 1048 | pair &= TWO_TERTIARIES_MASK; | 
|---|
| 1049 | } | 
|---|
| 1050 | pair += TWO_TER_OFFSETS; | 
|---|
| 1051 | } else if(ce > variableTop) { | 
|---|
| 1052 | pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; | 
|---|
| 1053 | if(withCaseBits) { | 
|---|
| 1054 | pair |= TWO_LOWER_CASES; | 
|---|
| 1055 | } | 
|---|
| 1056 | } else { | 
|---|
| 1057 | U_ASSERT(ce >= MIN_LONG); | 
|---|
| 1058 | pair = 0;  // variable | 
|---|
| 1059 | } | 
|---|
| 1060 | } | 
|---|
| 1061 | return pair; | 
|---|
| 1062 | } | 
|---|
| 1063 |  | 
|---|
| 1064 | uint32_t | 
|---|
| 1065 | CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { | 
|---|
| 1066 | // Return the primary weight of a variable CE, | 
|---|
| 1067 | // or the maximum primary weight for a non-variable, not-completely-ignorable CE. | 
|---|
| 1068 | if(pair <= 0xffff) { | 
|---|
| 1069 | // one mini CE | 
|---|
| 1070 | if(pair >= MIN_SHORT) { | 
|---|
| 1071 | // A high secondary weight means we really have two CEs, | 
|---|
| 1072 | // a primary CE and a secondary CE. | 
|---|
| 1073 | if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { | 
|---|
| 1074 | pair = TWO_SHORT_PRIMARIES_MASK; | 
|---|
| 1075 | } else { | 
|---|
| 1076 | pair = SHORT_PRIMARY_MASK; | 
|---|
| 1077 | } | 
|---|
| 1078 | } else if(pair > variableTop) { | 
|---|
| 1079 | pair = SHORT_PRIMARY_MASK; | 
|---|
| 1080 | } else if(pair >= MIN_LONG) { | 
|---|
| 1081 | pair &= LONG_PRIMARY_MASK;  // variable | 
|---|
| 1082 | } | 
|---|
| 1083 | // else special mini CE | 
|---|
| 1084 | } else { | 
|---|
| 1085 | // two mini CEs, same primary groups, neither expands like above | 
|---|
| 1086 | uint32_t ce = pair & 0xffff; | 
|---|
| 1087 | if(ce > variableTop) { | 
|---|
| 1088 | pair = TWO_SHORT_PRIMARIES_MASK; | 
|---|
| 1089 | } else { | 
|---|
| 1090 | U_ASSERT(ce >= MIN_LONG); | 
|---|
| 1091 | pair &= TWO_LONG_PRIMARIES_MASK;  // variable | 
|---|
| 1092 | } | 
|---|
| 1093 | } | 
|---|
| 1094 | return pair; | 
|---|
| 1095 | } | 
|---|
| 1096 |  | 
|---|
| 1097 | U_NAMESPACE_END | 
|---|
| 1098 |  | 
|---|
| 1099 | #endif  // !UCONFIG_NO_COLLATION | 
|---|
| 1100 |  | 
|---|