| 1 | // Licensed to the .NET Foundation under one or more agreements. |
| 2 | // The .NET Foundation licenses this file to you under the MIT license. |
| 3 | // See the LICENSE file in the project root for more information. |
| 4 | // |
| 5 | |
| 6 | #include <assert.h> |
| 7 | #include <pthread.h> |
| 8 | #include <stdint.h> |
| 9 | #include <vector> |
| 10 | #include <map> |
| 11 | |
| 12 | #include "icushim.h" |
| 13 | #include "locale.hpp" |
| 14 | #include "errors.h" |
| 15 | |
// Bit flags that callers combine into the `options` argument accepted by the
// collation functions in this file.
constexpr int32_t CompareOptionsIgnoreCase = 1 << 0;
constexpr int32_t CompareOptionsIgnoreNonSpace = 1 << 1;
constexpr int32_t CompareOptionsIgnoreSymbols = 1 << 2;
constexpr int32_t CompareOptionsIgnoreKanaType = 1 << 3;
constexpr int32_t CompareOptionsIgnoreWidth = 1 << 4;
// constexpr int32_t CompareOptionsStringSort = 0x20000000;
//
// StringSort is deliberately not handled here. ICU's default behaves like "StringSort":
// nonalphanumeric symbols sort before alphanumeric characters. When StringSort is not
// specified (.NET's default), Windows sorts nonalphanumeric symbols after alphanumeric
// characters while Unix sorts them before, so the two platforms order strings differently.
// Because locale-specific sort order can already change from one version of Windows to the
// next, there is no reason to guarantee an identical order between Windows and ICU. Changing
// ICU's default behavior is therefore not justified unless someone has a strong reason for
// !StringSort to behave differently.
| 30 | |
| 31 | typedef std::map<int32_t, UCollator*> TCollatorMap; |
| 32 | typedef std::pair<int32_t, UCollator*> TCollatorMapPair; |
| 33 | |
| 34 | /* |
| 35 | * For increased performance, we cache the UCollator objects for a locale and |
| 36 | * share them across threads. This is safe (and supported in ICU) if we ensure |
| 37 | * multiple threads are only ever dealing with const UCollators. |
| 38 | */ |
| 39 | typedef struct _sort_handle |
| 40 | { |
| 41 | UCollator* regular; |
| 42 | TCollatorMap collatorsPerOption; |
| 43 | pthread_mutex_t collatorsLockObject; |
| 44 | |
| 45 | _sort_handle() : regular(nullptr) |
| 46 | { |
| 47 | int result = pthread_mutex_init(&collatorsLockObject, NULL); |
| 48 | if (result != 0) |
| 49 | { |
| 50 | assert(false && "Unexpected pthread_mutex_init return value." ); |
| 51 | } |
| 52 | } |
| 53 | |
| 54 | } SortHandle; |
| 55 | |
| 56 | // Hiragana character range |
| 57 | const UChar hiraganaStart = 0x3041; |
| 58 | const UChar hiraganaEnd = 0x309e; |
| 59 | const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041; |
| 60 | |
| 61 | // Mapping between half- and fullwidth characters. |
| 62 | // LowerChars are the characters that should sort lower than HigherChars |
| 63 | const UChar g_HalfFullLowerChars[] = { |
| 64 | // halfwidth characters |
| 65 | 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, |
| 66 | 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, |
| 67 | 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, |
| 68 | 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005d, |
| 69 | 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, |
| 70 | 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, |
| 71 | 0x007c, 0x007d, 0x007e, 0x00a2, 0x00a3, 0x00ac, 0x00af, 0x00a6, 0x00a5, 0x20a9, |
| 72 | |
| 73 | // fullwidth characters |
| 74 | 0x3002, 0x300c, 0x300d, 0x3001, 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, 0x30e7, 0x30c3, |
| 75 | 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, |
| 76 | 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, |
| 77 | 0x30de, 0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef, 0x30f3, |
| 78 | 0x3164, 0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136, 0x3137, 0x3138, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d, 0x313e, |
| 79 | 0x313f, 0x3140, 0x3141, 0x3142, 0x3143, 0x3144, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d, |
| 80 | 0x314e, 0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154, 0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a, 0x315b, 0x315c, |
| 81 | 0x315d, 0x315e, 0x315f, 0x3160, 0x3161, 0x3162, 0x3163 |
| 82 | |
| 83 | }; |
| 84 | const UChar g_HalfFullHigherChars[] = { |
| 85 | // fullwidth characters |
| 86 | 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, |
| 87 | 0xff10, 0xff11, 0xff12, 0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0xff1a, 0xff1b, 0xff1c, 0xff1d, 0xff1e, |
| 88 | 0xff1f, 0xff20, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, |
| 89 | 0xff2e, 0xff2f, 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, 0xff3a, 0xff3b, 0xff3d, |
| 90 | 0xff3e, 0xff3f, 0xff40, 0xff41, 0xff42, 0xff43, 0xff44, 0xff45, 0xff46, 0xff47, 0xff48, 0xff49, 0xff4a, 0xff4b, 0xff4c, |
| 91 | 0xff4d, 0xff4e, 0xff4f, 0xff50, 0xff51, 0xff52, 0xff53, 0xff54, 0xff55, 0xff56, 0xff57, 0xff58, 0xff59, 0xff5a, 0xff5b, |
| 92 | 0xff5c, 0xff5d, 0xff5e, 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, |
| 93 | |
| 94 | // halfwidth characters |
| 95 | 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f, |
| 96 | 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f, |
| 97 | 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, |
| 98 | 0xff8f, 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, |
| 99 | 0xffa0, 0xffa1, 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, 0xffaa, 0xffab, 0xffac, 0xffad, 0xffae, |
| 100 | 0xffaf, 0xffb0, 0xffb1, 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, 0xffba, 0xffbb, 0xffbc, 0xffbd, |
| 101 | 0xffbe, 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce, 0xffcf, 0xffd2, 0xffd3, |
| 102 | 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffda, 0xffdb, 0xffdc |
| 103 | }; |
| 104 | const int32_t g_HalfFullCharsLength = (sizeof(g_HalfFullHigherChars) / sizeof(UChar)); |
| 105 | |
| 106 | /* |
| 107 | ICU collation rules reserve any punctuation and whitespace characters for use in the syntax. |
| 108 | Thus, to use these characters in a rule, they need to be escaped. |
| 109 | |
| 110 | This rule was taken from http://www.unicode.org/reports/tr35/tr35-collation.html#Rules. |
| 111 | */ |
| 112 | bool NeedsEscape(UChar character) |
| 113 | { |
| 114 | return ((0x21 <= character && character <= 0x2f) |
| 115 | || (0x3a <= character && character <= 0x40) |
| 116 | || (0x5b <= character && character <= 0x60) |
| 117 | || (0x7b <= character && character <= 0x7e)); |
| 118 | } |
| 119 | |
| 120 | /* |
| 121 | Gets a value indicating whether the HalfFullHigher character is considered a symbol character. |
| 122 | |
| 123 | The ranges specified here are only checking for characters in the g_HalfFullHigherChars list and needs |
| 124 | to be combined with NeedsEscape above with the g_HalfFullLowerChars for all the IgnoreSymbols characters. |
| 125 | This is done so we can use range checks instead of comparing individual characters. |
| 126 | |
| 127 | These ranges were obtained by running the above characters through .NET CompareInfo.Compare |
| 128 | with CompareOptions.IgnoreSymbols on Windows. |
| 129 | */ |
| 130 | bool IsHalfFullHigherSymbol(UChar character) |
| 131 | { |
| 132 | return (0xffe0 <= character && character <= 0xffe6) |
| 133 | || (0xff61 <= character && character <= 0xff65); |
| 134 | } |
| 135 | |
| 136 | /* |
| 137 | Gets a string of custom collation rules, if necessary. |
| 138 | |
| 139 | Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using |
| 140 | custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly. |
| 141 | */ |
| 142 | std::vector<UChar> GetCustomRules(int32_t options, UColAttributeValue strength, bool isIgnoreSymbols) |
| 143 | { |
| 144 | bool isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType; |
| 145 | bool isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth; |
| 146 | |
| 147 | // kana differs at the tertiary level |
| 148 | bool needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_TERTIARY; |
| 149 | bool needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_TERTIARY; |
| 150 | |
| 151 | // character width differs at the tertiary level |
| 152 | bool needsIgnoreWidthCustomRule = isIgnoreWidth && strength >= UCOL_TERTIARY; |
| 153 | bool needsNotIgnoreWidthCustomRule = !isIgnoreWidth && strength < UCOL_TERTIARY; |
| 154 | |
| 155 | std::vector<UChar> customRules; |
| 156 | if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule || needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) |
| 157 | { |
| 158 | // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long |
| 159 | // and the Width custom rule will be at least 215 halfwidth characters * 4 = 860 chars long. |
| 160 | // Use 512 as the starting size, so the customRules won't have to grow if we are just |
| 161 | // doing the KanaType custom rule. |
| 162 | customRules.reserve(512); |
| 163 | |
| 164 | if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) |
| 165 | { |
| 166 | UChar compareChar = needsIgnoreKanaTypeCustomRule ? '=' : '<'; |
| 167 | |
| 168 | for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) |
| 169 | { |
| 170 | // Hiragana is the range 3041 to 3096 & 309D & 309E |
| 171 | if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana |
| 172 | { |
| 173 | customRules.push_back('&'); |
| 174 | customRules.push_back(hiraganaChar); |
| 175 | customRules.push_back(compareChar); |
| 176 | customRules.push_back(hiraganaChar + hiraganaToKatakanaOffset); |
| 177 | } |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) |
| 182 | { |
| 183 | UChar compareChar = needsIgnoreWidthCustomRule ? '=' : '<'; |
| 184 | |
| 185 | UChar lowerChar; |
| 186 | UChar higherChar; |
| 187 | bool needsEscape; |
| 188 | for (int i = 0; i < g_HalfFullCharsLength; i++) |
| 189 | { |
| 190 | lowerChar = g_HalfFullLowerChars[i]; |
| 191 | higherChar = g_HalfFullHigherChars[i]; |
| 192 | // the lower chars need to be checked for escaping since they contain ASCII punctuation |
| 193 | needsEscape = NeedsEscape(lowerChar); |
| 194 | |
| 195 | // when isIgnoreSymbols is true and we are not ignoring width, check to see if |
| 196 | // this character is a symbol, and if so skip it |
| 197 | if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) |
| 198 | { |
| 199 | customRules.push_back('&'); |
| 200 | |
| 201 | if (needsEscape) |
| 202 | { |
| 203 | customRules.push_back('\\'); |
| 204 | } |
| 205 | customRules.push_back(lowerChar); |
| 206 | |
| 207 | customRules.push_back(compareChar); |
| 208 | customRules.push_back(higherChar); |
| 209 | } |
| 210 | } |
| 211 | } |
| 212 | } |
| 213 | |
| 214 | return customRules; |
| 215 | } |
| 216 | |
| 217 | /* |
| 218 | * The collator returned by this function is owned by the callee and must be |
| 219 | * closed when this method returns with a U_SUCCESS UErrorCode. |
| 220 | * |
| 221 | * On error, the return value is undefined. |
| 222 | */ |
| 223 | UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options, UErrorCode* pErr) |
| 224 | { |
| 225 | UColAttributeValue strength = ucol_getStrength(pCollator); |
| 226 | |
| 227 | bool isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase; |
| 228 | bool isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace; |
| 229 | bool isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols; |
| 230 | |
| 231 | if (isIgnoreCase) |
| 232 | { |
| 233 | strength = UCOL_SECONDARY; |
| 234 | } |
| 235 | |
| 236 | if (isIgnoreNonSpace) |
| 237 | { |
| 238 | strength = UCOL_PRIMARY; |
| 239 | } |
| 240 | |
| 241 | UCollator* pClonedCollator; |
| 242 | std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols); |
| 243 | if (customRules.empty()) |
| 244 | { |
| 245 | pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr); |
| 246 | } |
| 247 | else |
| 248 | { |
| 249 | int32_t customRuleLength = customRules.size(); |
| 250 | |
| 251 | int32_t localeRulesLength; |
| 252 | const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength); |
| 253 | |
| 254 | std::vector<UChar> completeRules(localeRulesLength + customRuleLength + 1, '\0'); |
| 255 | for (int i = 0; i < localeRulesLength; i++) |
| 256 | { |
| 257 | completeRules[i] = localeRules[i]; |
| 258 | } |
| 259 | for (int i = 0; i < customRuleLength; i++) |
| 260 | { |
| 261 | completeRules[localeRulesLength + i] = customRules[i]; |
| 262 | } |
| 263 | |
| 264 | pClonedCollator = ucol_openRules(completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, NULL, pErr); |
| 265 | } |
| 266 | |
| 267 | if (isIgnoreSymbols) |
| 268 | { |
| 269 | ucol_setAttribute(pClonedCollator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, pErr); |
| 270 | |
| 271 | // by default, ICU alternate shifted handling only ignores punctuation, but |
| 272 | // IgnoreSymbols needs symbols and currency as well, so change the "variable top" |
| 273 | // to include all symbols and currency |
| 274 | #if HAVE_SET_MAX_VARIABLE |
| 275 | ucol_setMaxVariable(pClonedCollator, UCOL_REORDER_CODE_CURRENCY, pErr); |
| 276 | #else |
| 277 | // 0xfdfc is the last currency character before the first digit character |
| 278 | // in http://source.icu-project.org/repos/icu/icu/tags/release-52-1/source/data/unidata/FractionalUCA.txt |
| 279 | const UChar ignoreSymbolsVariableTop[] = { 0xfdfc }; |
| 280 | ucol_setVariableTop(pClonedCollator, ignoreSymbolsVariableTop, 1, pErr); |
| 281 | #endif |
| 282 | } |
| 283 | |
| 284 | ucol_setAttribute(pClonedCollator, UCOL_STRENGTH, strength, pErr); |
| 285 | |
| 286 | // casing differs at the tertiary level. |
| 287 | // if strength is less than tertiary, but we are not ignoring case, then we need to flip CASE_LEVEL On |
| 288 | if (strength < UCOL_TERTIARY && !isIgnoreCase) |
| 289 | { |
| 290 | ucol_setAttribute(pClonedCollator, UCOL_CASE_LEVEL, UCOL_ON, pErr); |
| 291 | } |
| 292 | |
| 293 | return pClonedCollator; |
| 294 | } |
| 295 | |
| 296 | // Returns TRUE if all the collation elements in str are completely ignorable |
| 297 | bool CanIgnoreAllCollationElements(const UCollator* pColl, const UChar* lpStr, int32_t length) |
| 298 | { |
| 299 | bool result = false; |
| 300 | UErrorCode err = U_ZERO_ERROR; |
| 301 | UCollationElements* pCollElem = ucol_openElements(pColl, lpStr, length, &err); |
| 302 | |
| 303 | if (U_SUCCESS(err)) |
| 304 | { |
| 305 | int32_t curCollElem = UCOL_NULLORDER; |
| 306 | |
| 307 | result = true; |
| 308 | |
| 309 | while ((curCollElem = ucol_next(pCollElem, &err)) != UCOL_NULLORDER) |
| 310 | { |
| 311 | if (curCollElem != 0) |
| 312 | { |
| 313 | result = false; |
| 314 | break; |
| 315 | } |
| 316 | } |
| 317 | |
| 318 | if (U_FAILURE(err)) |
| 319 | { |
| 320 | result = false; |
| 321 | } |
| 322 | |
| 323 | ucol_closeElements(pCollElem); |
| 324 | } |
| 325 | |
| 326 | return result; |
| 327 | |
| 328 | } |
| 329 | |
| 330 | extern "C" ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandle** ppSortHandle) |
| 331 | { |
| 332 | assert(ppSortHandle != nullptr); |
| 333 | |
| 334 | *ppSortHandle = new (std::nothrow) SortHandle(); |
| 335 | if ((*ppSortHandle) == nullptr) |
| 336 | { |
| 337 | return GetResultCode(U_MEMORY_ALLOCATION_ERROR); |
| 338 | } |
| 339 | |
| 340 | UErrorCode err = U_ZERO_ERROR; |
| 341 | |
| 342 | (*ppSortHandle)->regular = ucol_open(lpLocaleName, &err); |
| 343 | |
| 344 | if (U_FAILURE(err)) |
| 345 | { |
| 346 | if ((*ppSortHandle)->regular != nullptr) |
| 347 | ucol_close((*ppSortHandle)->regular); |
| 348 | |
| 349 | delete (*ppSortHandle); |
| 350 | (*ppSortHandle) = nullptr; |
| 351 | } |
| 352 | |
| 353 | return GetResultCode(err); |
| 354 | } |
| 355 | |
| 356 | extern "C" void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) |
| 357 | { |
| 358 | ucol_close(pSortHandle->regular); |
| 359 | pSortHandle->regular = nullptr; |
| 360 | |
| 361 | TCollatorMap::iterator it; |
| 362 | for (it = pSortHandle->collatorsPerOption.begin(); it != pSortHandle->collatorsPerOption.end(); it++) |
| 363 | { |
| 364 | ucol_close(it->second); |
| 365 | } |
| 366 | |
| 367 | pthread_mutex_destroy(&pSortHandle->collatorsLockObject); |
| 368 | |
| 369 | delete pSortHandle; |
| 370 | } |
| 371 | |
| 372 | const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32_t options, UErrorCode* pErr) |
| 373 | { |
| 374 | UCollator* pCollator; |
| 375 | if (options == 0) |
| 376 | { |
| 377 | pCollator = pSortHandle->regular; |
| 378 | } |
| 379 | else |
| 380 | { |
| 381 | int lockResult = pthread_mutex_lock(&pSortHandle->collatorsLockObject); |
| 382 | if (lockResult != 0) |
| 383 | { |
| 384 | assert(false && "Unexpected pthread_mutex_lock return value." ); |
| 385 | } |
| 386 | |
| 387 | TCollatorMap::iterator entry = pSortHandle->collatorsPerOption.find(options); |
| 388 | if (entry == pSortHandle->collatorsPerOption.end()) |
| 389 | { |
| 390 | pCollator = CloneCollatorWithOptions(pSortHandle->regular, options, pErr); |
| 391 | pSortHandle->collatorsPerOption[options] = pCollator; |
| 392 | } |
| 393 | else |
| 394 | { |
| 395 | pCollator = entry->second; |
| 396 | } |
| 397 | |
| 398 | pthread_mutex_unlock(&pSortHandle->collatorsLockObject); |
| 399 | } |
| 400 | |
| 401 | return pCollator; |
| 402 | } |
| 403 | |
| 404 | extern "C" int32_t GlobalizationNative_GetSortVersion(SortHandle* pSortHandle) |
| 405 | { |
| 406 | UErrorCode err = U_ZERO_ERROR; |
| 407 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, 0, &err); |
| 408 | int32_t result = 0; |
| 409 | |
| 410 | if (U_SUCCESS(err)) |
| 411 | { |
| 412 | ucol_getVersion(pColl, (uint8_t *) &result); |
| 413 | } |
| 414 | else |
| 415 | { |
| 416 | assert(false && "Unexpected ucol_getVersion to fail." ); |
| 417 | |
| 418 | // we didn't use UCOL_TAILORINGS_VERSION because it is deprecated in ICU v5 |
| 419 | result = UCOL_RUNTIME_VERSION << 16 | UCOL_BUILDER_VERSION; |
| 420 | } |
| 421 | return result; |
| 422 | } |
| 423 | |
| 424 | /* |
| 425 | Function: |
| 426 | CompareString |
| 427 | */ |
| 428 | extern "C" int32_t GlobalizationNative_CompareString( |
| 429 | SortHandle* pSortHandle, const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length, int32_t options) |
| 430 | { |
| 431 | static_assert(UCOL_EQUAL == 0, "managed side requires 0 for equal strings" ); |
| 432 | static_assert(UCOL_LESS < 0, "managed side requires less than zero for a < b" ); |
| 433 | static_assert(UCOL_GREATER > 0, "managed side requires greater than zero for a > b" ); |
| 434 | |
| 435 | UCollationResult result = UCOL_EQUAL; |
| 436 | UErrorCode err = U_ZERO_ERROR; |
| 437 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
| 438 | |
| 439 | if (U_SUCCESS(err)) |
| 440 | { |
| 441 | result = ucol_strcoll(pColl, lpStr1, cwStr1Length, lpStr2, cwStr2Length); |
| 442 | } |
| 443 | |
| 444 | return result; |
| 445 | } |
| 446 | |
| 447 | /* |
| 448 | Function: |
| 449 | IndexOf |
| 450 | */ |
| 451 | extern "C" int32_t GlobalizationNative_IndexOf( |
| 452 | SortHandle* pSortHandle, |
| 453 | const UChar* lpTarget, |
| 454 | int32_t cwTargetLength, |
| 455 | const UChar* lpSource, |
| 456 | int32_t cwSourceLength, |
| 457 | int32_t options, |
| 458 | int32_t* pMatchedLength) |
| 459 | { |
| 460 | static_assert(USEARCH_DONE == -1, "managed side requires -1 for not found" ); |
| 461 | |
| 462 | int32_t result = USEARCH_DONE; |
| 463 | UErrorCode err = U_ZERO_ERROR; |
| 464 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
| 465 | |
| 466 | if (U_SUCCESS(err)) |
| 467 | { |
| 468 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
| 469 | |
| 470 | if (U_SUCCESS(err)) |
| 471 | { |
| 472 | result = usearch_first(pSearch, &err); |
| 473 | |
| 474 | // if the search was successful, |
| 475 | // we'll try to get the matched string length. |
| 476 | if(result != USEARCH_DONE && pMatchedLength != NULL) |
| 477 | { |
| 478 | *pMatchedLength = usearch_getMatchedLength(pSearch); |
| 479 | } |
| 480 | usearch_close(pSearch); |
| 481 | } |
| 482 | } |
| 483 | |
| 484 | return result; |
| 485 | } |
| 486 | |
| 487 | /* |
| 488 | Function: |
| 489 | LastIndexOf |
| 490 | */ |
| 491 | extern "C" int32_t GlobalizationNative_LastIndexOf( |
| 492 | SortHandle* pSortHandle, |
| 493 | const UChar* lpTarget, |
| 494 | int32_t cwTargetLength, |
| 495 | const UChar* lpSource, |
| 496 | int32_t cwSourceLength, |
| 497 | int32_t options) |
| 498 | { |
| 499 | static_assert(USEARCH_DONE == -1, "managed side requires -1 for not found" ); |
| 500 | |
| 501 | int32_t result = USEARCH_DONE; |
| 502 | UErrorCode err = U_ZERO_ERROR; |
| 503 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
| 504 | |
| 505 | if (U_SUCCESS(err)) |
| 506 | { |
| 507 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
| 508 | |
| 509 | if (U_SUCCESS(err)) |
| 510 | { |
| 511 | result = usearch_last(pSearch, &err); |
| 512 | usearch_close(pSearch); |
| 513 | } |
| 514 | } |
| 515 | |
| 516 | return result; |
| 517 | } |
| 518 | |
| 519 | /* |
| 520 | Static Function: |
| 521 | AreEqualOrdinalIgnoreCase |
| 522 | */ |
| 523 | static bool AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two) |
| 524 | { |
| 525 | // Return whether the two characters are identical or would be identical if they were upper-cased. |
| 526 | |
| 527 | if (one == two) |
| 528 | { |
| 529 | return true; |
| 530 | } |
| 531 | |
| 532 | if (one == 0x0131 || two == 0x0131) |
| 533 | { |
| 534 | // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131) |
| 535 | // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049). |
| 536 | // We special case it to match the Windows invariant behavior. |
| 537 | return false; |
| 538 | } |
| 539 | |
| 540 | return u_toupper(one) == u_toupper(two); |
| 541 | } |
| 542 | |
| 543 | /* |
| 544 | Function: |
| 545 | IndexOfOrdinalIgnoreCase |
| 546 | */ |
| 547 | extern "C" int32_t GlobalizationNative_IndexOfOrdinalIgnoreCase( |
| 548 | const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength, int32_t findLast) |
| 549 | { |
| 550 | int32_t result = -1; |
| 551 | |
| 552 | int32_t endIndex = cwSourceLength - cwTargetLength; |
| 553 | assert(endIndex >= 0); |
| 554 | |
| 555 | int32_t i = 0; |
| 556 | while (i <= endIndex) |
| 557 | { |
| 558 | int32_t srcIdx = i, trgIdx = 0; |
| 559 | const UChar *src = lpSource, *trg = lpTarget; |
| 560 | UChar32 srcCodepoint, trgCodepoint; |
| 561 | |
| 562 | bool match = true; |
| 563 | while (trgIdx < cwTargetLength) |
| 564 | { |
| 565 | U16_NEXT(src, srcIdx, cwSourceLength, srcCodepoint); |
| 566 | U16_NEXT(trg, trgIdx, cwTargetLength, trgCodepoint); |
| 567 | if (!AreEqualOrdinalIgnoreCase(srcCodepoint, trgCodepoint)) |
| 568 | { |
| 569 | match = false; |
| 570 | break; |
| 571 | } |
| 572 | } |
| 573 | |
| 574 | if (match) |
| 575 | { |
| 576 | result = i; |
| 577 | if (!findLast) |
| 578 | { |
| 579 | break; |
| 580 | } |
| 581 | } |
| 582 | |
| 583 | U16_FWD_1(lpSource, i, cwSourceLength); |
| 584 | } |
| 585 | |
| 586 | return result; |
| 587 | } |
| 588 | |
| 589 | /* |
| 590 | Return value is a "Win32 BOOL" (1 = true, 0 = false) |
| 591 | */ |
| 592 | extern "C" int32_t GlobalizationNative_StartsWith( |
| 593 | SortHandle* pSortHandle, |
| 594 | const UChar* lpTarget, |
| 595 | int32_t cwTargetLength, |
| 596 | const UChar* lpSource, |
| 597 | int32_t cwSourceLength, |
| 598 | int32_t options) |
| 599 | { |
| 600 | int32_t result = FALSE; |
| 601 | UErrorCode err = U_ZERO_ERROR; |
| 602 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
| 603 | |
| 604 | if (U_SUCCESS(err)) |
| 605 | { |
| 606 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
| 607 | int32_t idx = USEARCH_DONE; |
| 608 | |
| 609 | if (U_SUCCESS(err)) |
| 610 | { |
| 611 | idx = usearch_first(pSearch, &err); |
| 612 | if (idx != USEARCH_DONE) |
| 613 | { |
| 614 | if (idx == 0) |
| 615 | { |
| 616 | result = TRUE; |
| 617 | } |
| 618 | else |
| 619 | { |
| 620 | result = CanIgnoreAllCollationElements(pColl, lpSource, idx); |
| 621 | } |
| 622 | } |
| 623 | |
| 624 | usearch_close(pSearch); |
| 625 | } |
| 626 | } |
| 627 | |
| 628 | return result; |
| 629 | } |
| 630 | |
| 631 | /* |
| 632 | Return value is a "Win32 BOOL" (1 = true, 0 = false) |
| 633 | */ |
| 634 | extern "C" int32_t GlobalizationNative_EndsWith( |
| 635 | SortHandle* pSortHandle, |
| 636 | const UChar* lpTarget, |
| 637 | int32_t cwTargetLength, |
| 638 | const UChar* lpSource, |
| 639 | int32_t cwSourceLength, |
| 640 | int32_t options) |
| 641 | { |
| 642 | int32_t result = FALSE; |
| 643 | UErrorCode err = U_ZERO_ERROR; |
| 644 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
| 645 | |
| 646 | if (U_SUCCESS(err)) |
| 647 | { |
| 648 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
| 649 | int32_t idx = USEARCH_DONE; |
| 650 | |
| 651 | if (U_SUCCESS(err)) |
| 652 | { |
| 653 | idx = usearch_last(pSearch, &err); |
| 654 | |
| 655 | if (idx != USEARCH_DONE) |
| 656 | { |
| 657 | if ((idx + usearch_getMatchedLength(pSearch)) == cwSourceLength) |
| 658 | { |
| 659 | result = TRUE; |
| 660 | } |
| 661 | else |
| 662 | { |
| 663 | int32_t matchEnd = idx + usearch_getMatchedLength(pSearch); |
| 664 | int32_t remainingStringLength = cwSourceLength - matchEnd; |
| 665 | |
| 666 | result = CanIgnoreAllCollationElements(pColl, lpSource + matchEnd, remainingStringLength); |
| 667 | } |
| 668 | } |
| 669 | |
| 670 | usearch_close(pSearch); |
| 671 | } |
| 672 | } |
| 673 | |
| 674 | return result; |
| 675 | } |
| 676 | |
| 677 | extern "C" int32_t GlobalizationNative_GetSortKey( |
| 678 | SortHandle* pSortHandle, |
| 679 | const UChar* lpStr, |
| 680 | int32_t cwStrLength, |
| 681 | uint8_t* sortKey, |
| 682 | int32_t cbSortKeyLength, |
| 683 | int32_t options) |
| 684 | { |
| 685 | UErrorCode err = U_ZERO_ERROR; |
| 686 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
| 687 | int32_t result = 0; |
| 688 | |
| 689 | if (U_SUCCESS(err)) |
| 690 | { |
| 691 | result = ucol_getSortKey(pColl, lpStr, cwStrLength, sortKey, cbSortKeyLength); |
| 692 | } |
| 693 | |
| 694 | return result; |
| 695 | } |
| 696 | |
| 697 | extern "C" int32_t GlobalizationNative_CompareStringOrdinalIgnoreCase( |
| 698 | const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length) |
| 699 | { |
| 700 | assert(lpStr1 != nullptr); |
| 701 | assert(cwStr1Length >= 0); |
| 702 | assert(lpStr2 != nullptr); |
| 703 | assert(cwStr2Length >= 0); |
| 704 | |
| 705 | int32_t str1Idx = 0; |
| 706 | int32_t str2Idx = 0; |
| 707 | |
| 708 | while (str1Idx < cwStr1Length && str2Idx < cwStr2Length) |
| 709 | { |
| 710 | UChar32 str1Codepoint; |
| 711 | UChar32 str2Codepoint; |
| 712 | |
| 713 | U16_NEXT(lpStr1, str1Idx, cwStr1Length, str1Codepoint); |
| 714 | U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint); |
| 715 | |
| 716 | if (str1Codepoint != str2Codepoint && u_toupper(str1Codepoint) != u_toupper(str2Codepoint)) |
| 717 | { |
| 718 | return str1Codepoint < str2Codepoint ? -1 : 1; |
| 719 | } |
| 720 | } |
| 721 | |
| 722 | if (cwStr1Length < cwStr2Length) |
| 723 | { |
| 724 | return -1; |
| 725 | } |
| 726 | |
| 727 | if (cwStr2Length < cwStr1Length) |
| 728 | { |
| 729 | return 1; |
| 730 | } |
| 731 | |
| 732 | return 0; |
| 733 | } |
| 734 | |