1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | // |
5 | |
6 | #include <assert.h> |
7 | #include <pthread.h> |
8 | #include <stdint.h> |
9 | #include <vector> |
10 | #include <map> |
11 | |
12 | #include "icushim.h" |
13 | #include "locale.hpp" |
14 | #include "errors.h" |
15 | |
// Option bits accepted in the `options` argument of the collation functions in this file.
// Each flag occupies a single bit so that several can be OR-ed together.
const int32_t CompareOptionsIgnoreCase = 1 << 0;     // 0x1
const int32_t CompareOptionsIgnoreNonSpace = 1 << 1; // 0x2
const int32_t CompareOptionsIgnoreSymbols = 1 << 2;  // 0x4
const int32_t CompareOptionsIgnoreKanaType = 1 << 3; // 0x8
const int32_t CompareOptionsIgnoreWidth = 1 << 4;    // 0x10
21 | // const int32_t CompareOptionsStringSort = 0x20000000; |
22 | // ICU's default is to use "StringSort", i.e. nonalphanumeric symbols come before alphanumeric. |
23 | // When StringSort is not specified (.NET's default), the sort order will be different between |
24 | // Windows and Unix platforms. The nonalphanumeric symbols will come after alphanumeric |
25 | // characters on Windows, but before on Unix. |
// Since locale-specific string sort order can change from one version of Windows to the next,
27 | // there is no reason to guarantee string sort order between Windows and ICU. Thus trying to |
28 | // change ICU's default behavior here isn't really justified unless someone has a strong reason |
29 | // for !StringSort to behave differently. |
30 | |
31 | typedef std::map<int32_t, UCollator*> TCollatorMap; |
32 | typedef std::pair<int32_t, UCollator*> TCollatorMapPair; |
33 | |
34 | /* |
35 | * For increased performance, we cache the UCollator objects for a locale and |
36 | * share them across threads. This is safe (and supported in ICU) if we ensure |
37 | * multiple threads are only ever dealing with const UCollators. |
38 | */ |
39 | typedef struct _sort_handle |
40 | { |
41 | UCollator* regular; |
42 | TCollatorMap collatorsPerOption; |
43 | pthread_mutex_t collatorsLockObject; |
44 | |
45 | _sort_handle() : regular(nullptr) |
46 | { |
47 | int result = pthread_mutex_init(&collatorsLockObject, NULL); |
48 | if (result != 0) |
49 | { |
50 | assert(false && "Unexpected pthread_mutex_init return value." ); |
51 | } |
52 | } |
53 | |
54 | } SortHandle; |
55 | |
56 | // Hiragana character range |
57 | const UChar hiraganaStart = 0x3041; |
58 | const UChar hiraganaEnd = 0x309e; |
59 | const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041; |
60 | |
61 | // Mapping between half- and fullwidth characters. |
62 | // LowerChars are the characters that should sort lower than HigherChars |
63 | const UChar g_HalfFullLowerChars[] = { |
64 | // halfwidth characters |
65 | 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, |
66 | 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, |
67 | 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, |
68 | 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005d, |
69 | 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, |
70 | 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, |
71 | 0x007c, 0x007d, 0x007e, 0x00a2, 0x00a3, 0x00ac, 0x00af, 0x00a6, 0x00a5, 0x20a9, |
72 | |
73 | // fullwidth characters |
74 | 0x3002, 0x300c, 0x300d, 0x3001, 0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, 0x30e7, 0x30c3, |
75 | 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, |
76 | 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, |
77 | 0x30de, 0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef, 0x30f3, |
78 | 0x3164, 0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136, 0x3137, 0x3138, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d, 0x313e, |
79 | 0x313f, 0x3140, 0x3141, 0x3142, 0x3143, 0x3144, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d, |
80 | 0x314e, 0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154, 0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a, 0x315b, 0x315c, |
81 | 0x315d, 0x315e, 0x315f, 0x3160, 0x3161, 0x3162, 0x3163 |
82 | |
83 | }; |
84 | const UChar g_HalfFullHigherChars[] = { |
85 | // fullwidth characters |
86 | 0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, |
87 | 0xff10, 0xff11, 0xff12, 0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0xff1a, 0xff1b, 0xff1c, 0xff1d, 0xff1e, |
88 | 0xff1f, 0xff20, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, |
89 | 0xff2e, 0xff2f, 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, 0xff3a, 0xff3b, 0xff3d, |
90 | 0xff3e, 0xff3f, 0xff40, 0xff41, 0xff42, 0xff43, 0xff44, 0xff45, 0xff46, 0xff47, 0xff48, 0xff49, 0xff4a, 0xff4b, 0xff4c, |
91 | 0xff4d, 0xff4e, 0xff4f, 0xff50, 0xff51, 0xff52, 0xff53, 0xff54, 0xff55, 0xff56, 0xff57, 0xff58, 0xff59, 0xff5a, 0xff5b, |
92 | 0xff5c, 0xff5d, 0xff5e, 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, |
93 | |
94 | // halfwidth characters |
95 | 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f, |
96 | 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f, |
97 | 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, |
98 | 0xff8f, 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, |
99 | 0xffa0, 0xffa1, 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, 0xffaa, 0xffab, 0xffac, 0xffad, 0xffae, |
100 | 0xffaf, 0xffb0, 0xffb1, 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, 0xffba, 0xffbb, 0xffbc, 0xffbd, |
101 | 0xffbe, 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce, 0xffcf, 0xffd2, 0xffd3, |
102 | 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffda, 0xffdb, 0xffdc |
103 | }; |
104 | const int32_t g_HalfFullCharsLength = (sizeof(g_HalfFullHigherChars) / sizeof(UChar)); |
105 | |
106 | /* |
107 | ICU collation rules reserve any punctuation and whitespace characters for use in the syntax. |
108 | Thus, to use these characters in a rule, they need to be escaped. |
109 | |
110 | This rule was taken from http://www.unicode.org/reports/tr35/tr35-collation.html#Rules. |
111 | */ |
112 | bool NeedsEscape(UChar character) |
113 | { |
114 | return ((0x21 <= character && character <= 0x2f) |
115 | || (0x3a <= character && character <= 0x40) |
116 | || (0x5b <= character && character <= 0x60) |
117 | || (0x7b <= character && character <= 0x7e)); |
118 | } |
119 | |
120 | /* |
121 | Gets a value indicating whether the HalfFullHigher character is considered a symbol character. |
122 | |
123 | The ranges specified here are only checking for characters in the g_HalfFullHigherChars list and needs |
124 | to be combined with NeedsEscape above with the g_HalfFullLowerChars for all the IgnoreSymbols characters. |
125 | This is done so we can use range checks instead of comparing individual characters. |
126 | |
127 | These ranges were obtained by running the above characters through .NET CompareInfo.Compare |
128 | with CompareOptions.IgnoreSymbols on Windows. |
129 | */ |
130 | bool IsHalfFullHigherSymbol(UChar character) |
131 | { |
132 | return (0xffe0 <= character && character <= 0xffe6) |
133 | || (0xff61 <= character && character <= 0xff65); |
134 | } |
135 | |
136 | /* |
137 | Gets a string of custom collation rules, if necessary. |
138 | |
139 | Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using |
140 | custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly. |
141 | */ |
142 | std::vector<UChar> GetCustomRules(int32_t options, UColAttributeValue strength, bool isIgnoreSymbols) |
143 | { |
144 | bool isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType; |
145 | bool isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth; |
146 | |
147 | // kana differs at the tertiary level |
148 | bool needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_TERTIARY; |
149 | bool needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_TERTIARY; |
150 | |
151 | // character width differs at the tertiary level |
152 | bool needsIgnoreWidthCustomRule = isIgnoreWidth && strength >= UCOL_TERTIARY; |
153 | bool needsNotIgnoreWidthCustomRule = !isIgnoreWidth && strength < UCOL_TERTIARY; |
154 | |
155 | std::vector<UChar> customRules; |
156 | if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule || needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) |
157 | { |
158 | // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long |
159 | // and the Width custom rule will be at least 215 halfwidth characters * 4 = 860 chars long. |
160 | // Use 512 as the starting size, so the customRules won't have to grow if we are just |
161 | // doing the KanaType custom rule. |
162 | customRules.reserve(512); |
163 | |
164 | if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) |
165 | { |
166 | UChar compareChar = needsIgnoreKanaTypeCustomRule ? '=' : '<'; |
167 | |
168 | for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) |
169 | { |
170 | // Hiragana is the range 3041 to 3096 & 309D & 309E |
171 | if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana |
172 | { |
173 | customRules.push_back('&'); |
174 | customRules.push_back(hiraganaChar); |
175 | customRules.push_back(compareChar); |
176 | customRules.push_back(hiraganaChar + hiraganaToKatakanaOffset); |
177 | } |
178 | } |
179 | } |
180 | |
181 | if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) |
182 | { |
183 | UChar compareChar = needsIgnoreWidthCustomRule ? '=' : '<'; |
184 | |
185 | UChar lowerChar; |
186 | UChar higherChar; |
187 | bool needsEscape; |
188 | for (int i = 0; i < g_HalfFullCharsLength; i++) |
189 | { |
190 | lowerChar = g_HalfFullLowerChars[i]; |
191 | higherChar = g_HalfFullHigherChars[i]; |
192 | // the lower chars need to be checked for escaping since they contain ASCII punctuation |
193 | needsEscape = NeedsEscape(lowerChar); |
194 | |
195 | // when isIgnoreSymbols is true and we are not ignoring width, check to see if |
196 | // this character is a symbol, and if so skip it |
197 | if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) |
198 | { |
199 | customRules.push_back('&'); |
200 | |
201 | if (needsEscape) |
202 | { |
203 | customRules.push_back('\\'); |
204 | } |
205 | customRules.push_back(lowerChar); |
206 | |
207 | customRules.push_back(compareChar); |
208 | customRules.push_back(higherChar); |
209 | } |
210 | } |
211 | } |
212 | } |
213 | |
214 | return customRules; |
215 | } |
216 | |
217 | /* |
218 | * The collator returned by this function is owned by the callee and must be |
219 | * closed when this method returns with a U_SUCCESS UErrorCode. |
220 | * |
221 | * On error, the return value is undefined. |
222 | */ |
223 | UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options, UErrorCode* pErr) |
224 | { |
225 | UColAttributeValue strength = ucol_getStrength(pCollator); |
226 | |
227 | bool isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase; |
228 | bool isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace; |
229 | bool isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols; |
230 | |
231 | if (isIgnoreCase) |
232 | { |
233 | strength = UCOL_SECONDARY; |
234 | } |
235 | |
236 | if (isIgnoreNonSpace) |
237 | { |
238 | strength = UCOL_PRIMARY; |
239 | } |
240 | |
241 | UCollator* pClonedCollator; |
242 | std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols); |
243 | if (customRules.empty()) |
244 | { |
245 | pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr); |
246 | } |
247 | else |
248 | { |
249 | int32_t customRuleLength = customRules.size(); |
250 | |
251 | int32_t localeRulesLength; |
252 | const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength); |
253 | |
254 | std::vector<UChar> completeRules(localeRulesLength + customRuleLength + 1, '\0'); |
255 | for (int i = 0; i < localeRulesLength; i++) |
256 | { |
257 | completeRules[i] = localeRules[i]; |
258 | } |
259 | for (int i = 0; i < customRuleLength; i++) |
260 | { |
261 | completeRules[localeRulesLength + i] = customRules[i]; |
262 | } |
263 | |
264 | pClonedCollator = ucol_openRules(completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, NULL, pErr); |
265 | } |
266 | |
267 | if (isIgnoreSymbols) |
268 | { |
269 | ucol_setAttribute(pClonedCollator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, pErr); |
270 | |
271 | // by default, ICU alternate shifted handling only ignores punctuation, but |
272 | // IgnoreSymbols needs symbols and currency as well, so change the "variable top" |
273 | // to include all symbols and currency |
274 | #if HAVE_SET_MAX_VARIABLE |
275 | ucol_setMaxVariable(pClonedCollator, UCOL_REORDER_CODE_CURRENCY, pErr); |
276 | #else |
277 | // 0xfdfc is the last currency character before the first digit character |
278 | // in http://source.icu-project.org/repos/icu/icu/tags/release-52-1/source/data/unidata/FractionalUCA.txt |
279 | const UChar ignoreSymbolsVariableTop[] = { 0xfdfc }; |
280 | ucol_setVariableTop(pClonedCollator, ignoreSymbolsVariableTop, 1, pErr); |
281 | #endif |
282 | } |
283 | |
284 | ucol_setAttribute(pClonedCollator, UCOL_STRENGTH, strength, pErr); |
285 | |
286 | // casing differs at the tertiary level. |
287 | // if strength is less than tertiary, but we are not ignoring case, then we need to flip CASE_LEVEL On |
288 | if (strength < UCOL_TERTIARY && !isIgnoreCase) |
289 | { |
290 | ucol_setAttribute(pClonedCollator, UCOL_CASE_LEVEL, UCOL_ON, pErr); |
291 | } |
292 | |
293 | return pClonedCollator; |
294 | } |
295 | |
296 | // Returns TRUE if all the collation elements in str are completely ignorable |
297 | bool CanIgnoreAllCollationElements(const UCollator* pColl, const UChar* lpStr, int32_t length) |
298 | { |
299 | bool result = false; |
300 | UErrorCode err = U_ZERO_ERROR; |
301 | UCollationElements* pCollElem = ucol_openElements(pColl, lpStr, length, &err); |
302 | |
303 | if (U_SUCCESS(err)) |
304 | { |
305 | int32_t curCollElem = UCOL_NULLORDER; |
306 | |
307 | result = true; |
308 | |
309 | while ((curCollElem = ucol_next(pCollElem, &err)) != UCOL_NULLORDER) |
310 | { |
311 | if (curCollElem != 0) |
312 | { |
313 | result = false; |
314 | break; |
315 | } |
316 | } |
317 | |
318 | if (U_FAILURE(err)) |
319 | { |
320 | result = false; |
321 | } |
322 | |
323 | ucol_closeElements(pCollElem); |
324 | } |
325 | |
326 | return result; |
327 | |
328 | } |
329 | |
330 | extern "C" ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandle** ppSortHandle) |
331 | { |
332 | assert(ppSortHandle != nullptr); |
333 | |
334 | *ppSortHandle = new (std::nothrow) SortHandle(); |
335 | if ((*ppSortHandle) == nullptr) |
336 | { |
337 | return GetResultCode(U_MEMORY_ALLOCATION_ERROR); |
338 | } |
339 | |
340 | UErrorCode err = U_ZERO_ERROR; |
341 | |
342 | (*ppSortHandle)->regular = ucol_open(lpLocaleName, &err); |
343 | |
344 | if (U_FAILURE(err)) |
345 | { |
346 | if ((*ppSortHandle)->regular != nullptr) |
347 | ucol_close((*ppSortHandle)->regular); |
348 | |
349 | delete (*ppSortHandle); |
350 | (*ppSortHandle) = nullptr; |
351 | } |
352 | |
353 | return GetResultCode(err); |
354 | } |
355 | |
356 | extern "C" void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle) |
357 | { |
358 | ucol_close(pSortHandle->regular); |
359 | pSortHandle->regular = nullptr; |
360 | |
361 | TCollatorMap::iterator it; |
362 | for (it = pSortHandle->collatorsPerOption.begin(); it != pSortHandle->collatorsPerOption.end(); it++) |
363 | { |
364 | ucol_close(it->second); |
365 | } |
366 | |
367 | pthread_mutex_destroy(&pSortHandle->collatorsLockObject); |
368 | |
369 | delete pSortHandle; |
370 | } |
371 | |
372 | const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32_t options, UErrorCode* pErr) |
373 | { |
374 | UCollator* pCollator; |
375 | if (options == 0) |
376 | { |
377 | pCollator = pSortHandle->regular; |
378 | } |
379 | else |
380 | { |
381 | int lockResult = pthread_mutex_lock(&pSortHandle->collatorsLockObject); |
382 | if (lockResult != 0) |
383 | { |
384 | assert(false && "Unexpected pthread_mutex_lock return value." ); |
385 | } |
386 | |
387 | TCollatorMap::iterator entry = pSortHandle->collatorsPerOption.find(options); |
388 | if (entry == pSortHandle->collatorsPerOption.end()) |
389 | { |
390 | pCollator = CloneCollatorWithOptions(pSortHandle->regular, options, pErr); |
391 | pSortHandle->collatorsPerOption[options] = pCollator; |
392 | } |
393 | else |
394 | { |
395 | pCollator = entry->second; |
396 | } |
397 | |
398 | pthread_mutex_unlock(&pSortHandle->collatorsLockObject); |
399 | } |
400 | |
401 | return pCollator; |
402 | } |
403 | |
404 | extern "C" int32_t GlobalizationNative_GetSortVersion(SortHandle* pSortHandle) |
405 | { |
406 | UErrorCode err = U_ZERO_ERROR; |
407 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, 0, &err); |
408 | int32_t result = 0; |
409 | |
410 | if (U_SUCCESS(err)) |
411 | { |
412 | ucol_getVersion(pColl, (uint8_t *) &result); |
413 | } |
414 | else |
415 | { |
416 | assert(false && "Unexpected ucol_getVersion to fail." ); |
417 | |
418 | // we didn't use UCOL_TAILORINGS_VERSION because it is deprecated in ICU v5 |
419 | result = UCOL_RUNTIME_VERSION << 16 | UCOL_BUILDER_VERSION; |
420 | } |
421 | return result; |
422 | } |
423 | |
424 | /* |
425 | Function: |
426 | CompareString |
427 | */ |
428 | extern "C" int32_t GlobalizationNative_CompareString( |
429 | SortHandle* pSortHandle, const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length, int32_t options) |
430 | { |
431 | static_assert(UCOL_EQUAL == 0, "managed side requires 0 for equal strings" ); |
432 | static_assert(UCOL_LESS < 0, "managed side requires less than zero for a < b" ); |
433 | static_assert(UCOL_GREATER > 0, "managed side requires greater than zero for a > b" ); |
434 | |
435 | UCollationResult result = UCOL_EQUAL; |
436 | UErrorCode err = U_ZERO_ERROR; |
437 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
438 | |
439 | if (U_SUCCESS(err)) |
440 | { |
441 | result = ucol_strcoll(pColl, lpStr1, cwStr1Length, lpStr2, cwStr2Length); |
442 | } |
443 | |
444 | return result; |
445 | } |
446 | |
447 | /* |
448 | Function: |
449 | IndexOf |
450 | */ |
451 | extern "C" int32_t GlobalizationNative_IndexOf( |
452 | SortHandle* pSortHandle, |
453 | const UChar* lpTarget, |
454 | int32_t cwTargetLength, |
455 | const UChar* lpSource, |
456 | int32_t cwSourceLength, |
457 | int32_t options, |
458 | int32_t* pMatchedLength) |
459 | { |
460 | static_assert(USEARCH_DONE == -1, "managed side requires -1 for not found" ); |
461 | |
462 | int32_t result = USEARCH_DONE; |
463 | UErrorCode err = U_ZERO_ERROR; |
464 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
465 | |
466 | if (U_SUCCESS(err)) |
467 | { |
468 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
469 | |
470 | if (U_SUCCESS(err)) |
471 | { |
472 | result = usearch_first(pSearch, &err); |
473 | |
474 | // if the search was successful, |
475 | // we'll try to get the matched string length. |
476 | if(result != USEARCH_DONE && pMatchedLength != NULL) |
477 | { |
478 | *pMatchedLength = usearch_getMatchedLength(pSearch); |
479 | } |
480 | usearch_close(pSearch); |
481 | } |
482 | } |
483 | |
484 | return result; |
485 | } |
486 | |
487 | /* |
488 | Function: |
489 | LastIndexOf |
490 | */ |
491 | extern "C" int32_t GlobalizationNative_LastIndexOf( |
492 | SortHandle* pSortHandle, |
493 | const UChar* lpTarget, |
494 | int32_t cwTargetLength, |
495 | const UChar* lpSource, |
496 | int32_t cwSourceLength, |
497 | int32_t options) |
498 | { |
499 | static_assert(USEARCH_DONE == -1, "managed side requires -1 for not found" ); |
500 | |
501 | int32_t result = USEARCH_DONE; |
502 | UErrorCode err = U_ZERO_ERROR; |
503 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
504 | |
505 | if (U_SUCCESS(err)) |
506 | { |
507 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
508 | |
509 | if (U_SUCCESS(err)) |
510 | { |
511 | result = usearch_last(pSearch, &err); |
512 | usearch_close(pSearch); |
513 | } |
514 | } |
515 | |
516 | return result; |
517 | } |
518 | |
519 | /* |
520 | Static Function: |
521 | AreEqualOrdinalIgnoreCase |
522 | */ |
523 | static bool AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two) |
524 | { |
525 | // Return whether the two characters are identical or would be identical if they were upper-cased. |
526 | |
527 | if (one == two) |
528 | { |
529 | return true; |
530 | } |
531 | |
532 | if (one == 0x0131 || two == 0x0131) |
533 | { |
534 | // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131) |
535 | // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049). |
536 | // We special case it to match the Windows invariant behavior. |
537 | return false; |
538 | } |
539 | |
540 | return u_toupper(one) == u_toupper(two); |
541 | } |
542 | |
543 | /* |
544 | Function: |
545 | IndexOfOrdinalIgnoreCase |
546 | */ |
547 | extern "C" int32_t GlobalizationNative_IndexOfOrdinalIgnoreCase( |
548 | const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength, int32_t findLast) |
549 | { |
550 | int32_t result = -1; |
551 | |
552 | int32_t endIndex = cwSourceLength - cwTargetLength; |
553 | assert(endIndex >= 0); |
554 | |
555 | int32_t i = 0; |
556 | while (i <= endIndex) |
557 | { |
558 | int32_t srcIdx = i, trgIdx = 0; |
559 | const UChar *src = lpSource, *trg = lpTarget; |
560 | UChar32 srcCodepoint, trgCodepoint; |
561 | |
562 | bool match = true; |
563 | while (trgIdx < cwTargetLength) |
564 | { |
565 | U16_NEXT(src, srcIdx, cwSourceLength, srcCodepoint); |
566 | U16_NEXT(trg, trgIdx, cwTargetLength, trgCodepoint); |
567 | if (!AreEqualOrdinalIgnoreCase(srcCodepoint, trgCodepoint)) |
568 | { |
569 | match = false; |
570 | break; |
571 | } |
572 | } |
573 | |
574 | if (match) |
575 | { |
576 | result = i; |
577 | if (!findLast) |
578 | { |
579 | break; |
580 | } |
581 | } |
582 | |
583 | U16_FWD_1(lpSource, i, cwSourceLength); |
584 | } |
585 | |
586 | return result; |
587 | } |
588 | |
589 | /* |
590 | Return value is a "Win32 BOOL" (1 = true, 0 = false) |
591 | */ |
592 | extern "C" int32_t GlobalizationNative_StartsWith( |
593 | SortHandle* pSortHandle, |
594 | const UChar* lpTarget, |
595 | int32_t cwTargetLength, |
596 | const UChar* lpSource, |
597 | int32_t cwSourceLength, |
598 | int32_t options) |
599 | { |
600 | int32_t result = FALSE; |
601 | UErrorCode err = U_ZERO_ERROR; |
602 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
603 | |
604 | if (U_SUCCESS(err)) |
605 | { |
606 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
607 | int32_t idx = USEARCH_DONE; |
608 | |
609 | if (U_SUCCESS(err)) |
610 | { |
611 | idx = usearch_first(pSearch, &err); |
612 | if (idx != USEARCH_DONE) |
613 | { |
614 | if (idx == 0) |
615 | { |
616 | result = TRUE; |
617 | } |
618 | else |
619 | { |
620 | result = CanIgnoreAllCollationElements(pColl, lpSource, idx); |
621 | } |
622 | } |
623 | |
624 | usearch_close(pSearch); |
625 | } |
626 | } |
627 | |
628 | return result; |
629 | } |
630 | |
631 | /* |
632 | Return value is a "Win32 BOOL" (1 = true, 0 = false) |
633 | */ |
634 | extern "C" int32_t GlobalizationNative_EndsWith( |
635 | SortHandle* pSortHandle, |
636 | const UChar* lpTarget, |
637 | int32_t cwTargetLength, |
638 | const UChar* lpSource, |
639 | int32_t cwSourceLength, |
640 | int32_t options) |
641 | { |
642 | int32_t result = FALSE; |
643 | UErrorCode err = U_ZERO_ERROR; |
644 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
645 | |
646 | if (U_SUCCESS(err)) |
647 | { |
648 | UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err); |
649 | int32_t idx = USEARCH_DONE; |
650 | |
651 | if (U_SUCCESS(err)) |
652 | { |
653 | idx = usearch_last(pSearch, &err); |
654 | |
655 | if (idx != USEARCH_DONE) |
656 | { |
657 | if ((idx + usearch_getMatchedLength(pSearch)) == cwSourceLength) |
658 | { |
659 | result = TRUE; |
660 | } |
661 | else |
662 | { |
663 | int32_t matchEnd = idx + usearch_getMatchedLength(pSearch); |
664 | int32_t remainingStringLength = cwSourceLength - matchEnd; |
665 | |
666 | result = CanIgnoreAllCollationElements(pColl, lpSource + matchEnd, remainingStringLength); |
667 | } |
668 | } |
669 | |
670 | usearch_close(pSearch); |
671 | } |
672 | } |
673 | |
674 | return result; |
675 | } |
676 | |
677 | extern "C" int32_t GlobalizationNative_GetSortKey( |
678 | SortHandle* pSortHandle, |
679 | const UChar* lpStr, |
680 | int32_t cwStrLength, |
681 | uint8_t* sortKey, |
682 | int32_t cbSortKeyLength, |
683 | int32_t options) |
684 | { |
685 | UErrorCode err = U_ZERO_ERROR; |
686 | const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err); |
687 | int32_t result = 0; |
688 | |
689 | if (U_SUCCESS(err)) |
690 | { |
691 | result = ucol_getSortKey(pColl, lpStr, cwStrLength, sortKey, cbSortKeyLength); |
692 | } |
693 | |
694 | return result; |
695 | } |
696 | |
697 | extern "C" int32_t GlobalizationNative_CompareStringOrdinalIgnoreCase( |
698 | const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length) |
699 | { |
700 | assert(lpStr1 != nullptr); |
701 | assert(cwStr1Length >= 0); |
702 | assert(lpStr2 != nullptr); |
703 | assert(cwStr2Length >= 0); |
704 | |
705 | int32_t str1Idx = 0; |
706 | int32_t str2Idx = 0; |
707 | |
708 | while (str1Idx < cwStr1Length && str2Idx < cwStr2Length) |
709 | { |
710 | UChar32 str1Codepoint; |
711 | UChar32 str2Codepoint; |
712 | |
713 | U16_NEXT(lpStr1, str1Idx, cwStr1Length, str1Codepoint); |
714 | U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint); |
715 | |
716 | if (str1Codepoint != str2Codepoint && u_toupper(str1Codepoint) != u_toupper(str2Codepoint)) |
717 | { |
718 | return str1Codepoint < str2Codepoint ? -1 : 1; |
719 | } |
720 | } |
721 | |
722 | if (cwStr1Length < cwStr2Length) |
723 | { |
724 | return -1; |
725 | } |
726 | |
727 | if (cwStr2Length < cwStr1Length) |
728 | { |
729 | return 1; |
730 | } |
731 | |
732 | return 0; |
733 | } |
734 | |