// collation.cpp source code [CoreCLR/corefx/System.Globalization.Native/collation.cpp]

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
//
5
#include <assert.h>
#include <pthread.h>
#include <stdint.h>

#include <map>
#include <new>
#include <vector>

#include "icushim.h"
#include "locale.hpp"
#include "errors.h"
15
// Option bit flags tested by the functions in this file. The names follow
// .NET's CompareOptions.
const int32_t CompareOptionsIgnoreCase = 0x1;
const int32_t CompareOptionsIgnoreNonSpace = 0x2;
const int32_t CompareOptionsIgnoreSymbols = 0x4;
const int32_t CompareOptionsIgnoreKanaType = 0x8;
const int32_t CompareOptionsIgnoreWidth = 0x10;
// const int32_t CompareOptionsStringSort = 0x20000000;
// ICU's default is to use "StringSort", i.e. nonalphanumeric symbols come before alphanumeric.
// When StringSort is not specified (.NET's default), the sort order will be different between
// Windows and Unix platforms. The nonalphanumeric symbols will come after alphanumeric
// characters on Windows, but before on Unix.
// Since locale-specific string sort order can change from one version of Windows to the next,
// there is no reason to guarantee string sort order between Windows and ICU. Thus trying to
// change ICU's default behavior here isn't really justified unless someone has a strong reason
// for !StringSort to behave differently.
30
31	typedef std::map<int32_t, UCollator*> TCollatorMap;
32	typedef std::pair<int32_t, UCollator*> TCollatorMapPair;
33
34	/*
35	* For increased performance, we cache the UCollator objects for a locale and
36	* share them across threads. This is safe (and supported in ICU) if we ensure
37	* multiple threads are only ever dealing with const UCollators.
38	*/
39	typedef struct _sort_handle
40	{
41	UCollator* regular;
42	TCollatorMap collatorsPerOption;
43	pthread_mutex_t collatorsLockObject;
44
45	_sort_handle() : regular(nullptr)
46	{
47	int result = pthread_mutex_init(&collatorsLockObject, NULL);
48	if (result != `0`)
49	{
50	assert(false && "Unexpected pthread_mutex_init return value.");
51	}
52	}
53
54	} SortHandle;
55
56	// Hiragana character range
57	const UChar hiraganaStart = `0x3041`;
58	const UChar hiraganaEnd = `0x309e`;
59	const UChar hiraganaToKatakanaOffset = `0x30a1` - `0x3041`;
60
61	// Mapping between half- and fullwidth characters.
62	// LowerChars are the characters that should sort lower than HigherChars
63	const UChar g_HalfFullLowerChars[] = {
64	// halfwidth characters
65	`0x0021`, `0x0022`, `0x0023`, `0x0024`, `0x0025`, `0x0026`, `0x0027`, `0x0028`, `0x0029`, `0x002a`, `0x002b`, `0x002c`, `0x002d`, `0x002e`, `0x002f`,
66	`0x0030`, `0x0031`, `0x0032`, `0x0033`, `0x0034`, `0x0035`, `0x0036`, `0x0037`, `0x0038`, `0x0039`, `0x003a`, `0x003b`, `0x003c`, `0x003d`, `0x003e`,
67	`0x003f`, `0x0040`, `0x0041`, `0x0042`, `0x0043`, `0x0044`, `0x0045`, `0x0046`, `0x0047`, `0x0048`, `0x0049`, `0x004a`, `0x004b`, `0x004c`, `0x004d`,
68	`0x004e`, `0x004f`, `0x0050`, `0x0051`, `0x0052`, `0x0053`, `0x0054`, `0x0055`, `0x0056`, `0x0057`, `0x0058`, `0x0059`, `0x005a`, `0x005b`, `0x005d`,
69	`0x005e`, `0x005f`, `0x0060`, `0x0061`, `0x0062`, `0x0063`, `0x0064`, `0x0065`, `0x0066`, `0x0067`, `0x0068`, `0x0069`, `0x006a`, `0x006b`, `0x006c`,
70	`0x006d`, `0x006e`, `0x006f`, `0x0070`, `0x0071`, `0x0072`, `0x0073`, `0x0074`, `0x0075`, `0x0076`, `0x0077`, `0x0078`, `0x0079`, `0x007a`, `0x007b`,
71	`0x007c`, `0x007d`, `0x007e`, `0x00a2`, `0x00a3`, `0x00ac`, `0x00af`, `0x00a6`, `0x00a5`, `0x20a9`,
72
73	// fullwidth characters
74	`0x3002`, `0x300c`, `0x300d`, `0x3001`, `0x30fb`, `0x30f2`, `0x30a1`, `0x30a3`, `0x30a5`, `0x30a7`, `0x30a9`, `0x30e3`, `0x30e5`, `0x30e7`, `0x30c3`,
75	`0x30a2`, `0x30a4`, `0x30a6`, `0x30a8`, `0x30aa`, `0x30ab`, `0x30ad`, `0x30af`, `0x30b1`, `0x30b3`, `0x30b5`, `0x30b7`, `0x30b9`, `0x30bb`, `0x30bd`,
76	`0x30bf`, `0x30c1`, `0x30c4`, `0x30c6`, `0x30c8`, `0x30ca`, `0x30cb`, `0x30cc`, `0x30cd`, `0x30ce`, `0x30cf`, `0x30d2`, `0x30d5`, `0x30d8`, `0x30db`,
77	`0x30de`, `0x30df`, `0x30e0`, `0x30e1`, `0x30e2`, `0x30e4`, `0x30e6`, `0x30e8`, `0x30e9`, `0x30ea`, `0x30eb`, `0x30ec`, `0x30ed`, `0x30ef`, `0x30f3`,
78	`0x3164`, `0x3131`, `0x3132`, `0x3133`, `0x3134`, `0x3135`, `0x3136`, `0x3137`, `0x3138`, `0x3139`, `0x313a`, `0x313b`, `0x313c`, `0x313d`, `0x313e`,
79	`0x313f`, `0x3140`, `0x3141`, `0x3142`, `0x3143`, `0x3144`, `0x3145`, `0x3146`, `0x3147`, `0x3148`, `0x3149`, `0x314a`, `0x314b`, `0x314c`, `0x314d`,
80	`0x314e`, `0x314f`, `0x3150`, `0x3151`, `0x3152`, `0x3153`, `0x3154`, `0x3155`, `0x3156`, `0x3157`, `0x3158`, `0x3159`, `0x315a`, `0x315b`, `0x315c`,
81	`0x315d`, `0x315e`, `0x315f`, `0x3160`, `0x3161`, `0x3162`, `0x3163`
82
83	};
84	const UChar g_HalfFullHigherChars[] = {
85	// fullwidth characters
86	`0xff01`, `0xff02`, `0xff03`, `0xff04`, `0xff05`, `0xff06`, `0xff07`, `0xff08`, `0xff09`, `0xff0a`, `0xff0b`, `0xff0c`, `0xff0d`, `0xff0e`, `0xff0f`,
87	`0xff10`, `0xff11`, `0xff12`, `0xff13`, `0xff14`, `0xff15`, `0xff16`, `0xff17`, `0xff18`, `0xff19`, `0xff1a`, `0xff1b`, `0xff1c`, `0xff1d`, `0xff1e`,
88	`0xff1f`, `0xff20`, `0xff21`, `0xff22`, `0xff23`, `0xff24`, `0xff25`, `0xff26`, `0xff27`, `0xff28`, `0xff29`, `0xff2a`, `0xff2b`, `0xff2c`, `0xff2d`,
89	`0xff2e`, `0xff2f`, `0xff30`, `0xff31`, `0xff32`, `0xff33`, `0xff34`, `0xff35`, `0xff36`, `0xff37`, `0xff38`, `0xff39`, `0xff3a`, `0xff3b`, `0xff3d`,
90	`0xff3e`, `0xff3f`, `0xff40`, `0xff41`, `0xff42`, `0xff43`, `0xff44`, `0xff45`, `0xff46`, `0xff47`, `0xff48`, `0xff49`, `0xff4a`, `0xff4b`, `0xff4c`,
91	`0xff4d`, `0xff4e`, `0xff4f`, `0xff50`, `0xff51`, `0xff52`, `0xff53`, `0xff54`, `0xff55`, `0xff56`, `0xff57`, `0xff58`, `0xff59`, `0xff5a`, `0xff5b`,
92	`0xff5c`, `0xff5d`, `0xff5e`, `0xffe0`, `0xffe1`, `0xffe2`, `0xffe3`, `0xffe4`, `0xffe5`, `0xffe6`,
93
94	// halfwidth characters
95	`0xff61`, `0xff62`, `0xff63`, `0xff64`, `0xff65`, `0xff66`, `0xff67`, `0xff68`, `0xff69`, `0xff6a`, `0xff6b`, `0xff6c`, `0xff6d`, `0xff6e`, `0xff6f`,
96	`0xff71`, `0xff72`, `0xff73`, `0xff74`, `0xff75`, `0xff76`, `0xff77`, `0xff78`, `0xff79`, `0xff7a`, `0xff7b`, `0xff7c`, `0xff7d`, `0xff7e`, `0xff7f`,
97	`0xff80`, `0xff81`, `0xff82`, `0xff83`, `0xff84`, `0xff85`, `0xff86`, `0xff87`, `0xff88`, `0xff89`, `0xff8a`, `0xff8b`, `0xff8c`, `0xff8d`, `0xff8e`,
98	`0xff8f`, `0xff90`, `0xff91`, `0xff92`, `0xff93`, `0xff94`, `0xff95`, `0xff96`, `0xff97`, `0xff98`, `0xff99`, `0xff9a`, `0xff9b`, `0xff9c`, `0xff9d`,
99	`0xffa0`, `0xffa1`, `0xffa2`, `0xffa3`, `0xffa4`, `0xffa5`, `0xffa6`, `0xffa7`, `0xffa8`, `0xffa9`, `0xffaa`, `0xffab`, `0xffac`, `0xffad`, `0xffae`,
100	`0xffaf`, `0xffb0`, `0xffb1`, `0xffb2`, `0xffb3`, `0xffb4`, `0xffb5`, `0xffb6`, `0xffb7`, `0xffb8`, `0xffb9`, `0xffba`, `0xffbb`, `0xffbc`, `0xffbd`,
101	`0xffbe`, `0xffc2`, `0xffc3`, `0xffc4`, `0xffc5`, `0xffc6`, `0xffc7`, `0xffca`, `0xffcb`, `0xffcc`, `0xffcd`, `0xffce`, `0xffcf`, `0xffd2`, `0xffd3`,
102	`0xffd4`, `0xffd5`, `0xffd6`, `0xffd7`, `0xffda`, `0xffdb`, `0xffdc`
103	};
104	const int32_t g_HalfFullCharsLength = (sizeof(g_HalfFullHigherChars) / sizeof(UChar));
105
106	/*
107	ICU collation rules reserve any punctuation and whitespace characters for use in the syntax.
108	Thus, to use these characters in a rule, they need to be escaped.
109
110	This rule was taken from http://www.unicode.org/reports/tr35/tr35-collation.html#Rules.
111	*/
112	bool NeedsEscape(UChar character)
113	{
114	return ((`0x21` <= character && character <= `0x2f`)
115	\|\| (`0x3a` <= character && character <= `0x40`)
116	\|\| (`0x5b` <= character && character <= `0x60`)
117	\|\| (`0x7b` <= character && character <= `0x7e`));
118	}
119
120	/*
121	Gets a value indicating whether the HalfFullHigher character is considered a symbol character.
122
123	The ranges specified here are only checking for characters in the g_HalfFullHigherChars list and needs
124	to be combined with NeedsEscape above with the g_HalfFullLowerChars for all the IgnoreSymbols characters.
125	This is done so we can use range checks instead of comparing individual characters.
126
127	These ranges were obtained by running the above characters through .NET CompareInfo.Compare
128	with CompareOptions.IgnoreSymbols on Windows.
129	*/
130	bool IsHalfFullHigherSymbol(UChar character)
131	{
132	return (`0xffe0` <= character && character <= `0xffe6`)
133	\|\| (`0xff61` <= character && character <= `0xff65`);
134	}
135
136	/*
137	Gets a string of custom collation rules, if necessary.
138
139	Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using
140	custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly.
141	*/
142	std::vector<UChar> GetCustomRules(int32_t options, UColAttributeValue strength, bool isIgnoreSymbols)
143	{
144	bool isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType;
145	bool isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth;
146
147	// kana differs at the tertiary level
148	bool needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_TERTIARY;
149	bool needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_TERTIARY;
150
151	// character width differs at the tertiary level
152	bool needsIgnoreWidthCustomRule = isIgnoreWidth && strength >= UCOL_TERTIARY;
153	bool needsNotIgnoreWidthCustomRule = !isIgnoreWidth && strength < UCOL_TERTIARY;
154
155	std::vector<UChar> customRules;
156	if (needsIgnoreKanaTypeCustomRule \|\| needsNotIgnoreKanaTypeCustomRule \|\| needsIgnoreWidthCustomRule \|\| needsNotIgnoreWidthCustomRule)
157	{
158	// If we need to create customRules, the KanaType custom rule will be 88 kana characters 4 = 352 chars long*
159	// and the Width custom rule will be at least 215 halfwidth characters 4 = 860 chars long.*
160	// Use 512 as the starting size, so the customRules won't have to grow if we are just
161	// doing the KanaType custom rule.
162	customRules.reserve(`512`);
163
164	if (needsIgnoreKanaTypeCustomRule \|\| needsNotIgnoreKanaTypeCustomRule)
165	{
166	UChar compareChar = needsIgnoreKanaTypeCustomRule ? `'='` : `'<'`;
167
168	for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++)
169	{
170	// Hiragana is the range 3041 to 3096 & 309D & 309E
171	if (hiraganaChar <= `0x3096` \|\| hiraganaChar >= `0x309D`) // characters between 3096 and 309D are not mapped to katakana
172	{
173	customRules.push_back(`'&'`);
174	customRules.push_back(hiraganaChar);
175	customRules.push_back(compareChar);
176	customRules.push_back(hiraganaChar + hiraganaToKatakanaOffset);
177	}
178	}
179	}
180
181	if (needsIgnoreWidthCustomRule \|\| needsNotIgnoreWidthCustomRule)
182	{
183	UChar compareChar = needsIgnoreWidthCustomRule ? `'='` : `'<'`;
184
185	UChar lowerChar;
186	UChar higherChar;
187	bool needsEscape;
188	for (int i = `0`; i < g_HalfFullCharsLength; i++)
189	{
190	lowerChar = g_HalfFullLowerChars[i];
191	higherChar = g_HalfFullHigherChars[i];
192	// the lower chars need to be checked for escaping since they contain ASCII punctuation
193	needsEscape = NeedsEscape(lowerChar);
194
195	// when isIgnoreSymbols is true and we are not ignoring width, check to see if
196	// this character is a symbol, and if so skip it
197	if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape \|\| IsHalfFullHigherSymbol(higherChar))))
198	{
199	customRules.push_back(`'&'`);
200
201	if (needsEscape)
202	{
203	customRules.push_back(`'\\'`);
204	}
205	customRules.push_back(lowerChar);
206
207	customRules.push_back(compareChar);
208	customRules.push_back(higherChar);
209	}
210	}
211	}
212	}
213
214	return customRules;
215	}
216
217	/*
218	* The collator returned by this function is owned by the callee and must be
219	* closed when this method returns with a U_SUCCESS UErrorCode.
220	*
221	* On error, the return value is undefined.
222	*/
223	UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options, UErrorCode* pErr)
224	{
225	UColAttributeValue strength = ucol_getStrength(pCollator);
226
227	bool isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase;
228	bool isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace;
229	bool isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols;
230
231	if (isIgnoreCase)
232	{
233	strength = UCOL_SECONDARY;
234	}
235
236	if (isIgnoreNonSpace)
237	{
238	strength = UCOL_PRIMARY;
239	}
240
241	UCollator* pClonedCollator;
242	std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols);
243	if (customRules.empty())
244	{
245	pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);
246	}
247	else
248	{
249	int32_t customRuleLength = customRules.size();
250
251	int32_t localeRulesLength;
252	const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength);
253
254	std::vector<UChar> completeRules(localeRulesLength + customRuleLength + `1`, `'\0'`);
255	for (int i = `0`; i < localeRulesLength; i++)
256	{
257	completeRules [i] = localeRules[i];
258	}
259	for (int i = `0`; i < customRuleLength; i++)
260	{
261	completeRules [localeRulesLength + i] = customRules [i];
262	}
263
264	pClonedCollator = ucol_openRules(completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, NULL, pErr);
265	}
266
267	if (isIgnoreSymbols)
268	{
269	ucol_setAttribute(pClonedCollator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, pErr);
270
271	// by default, ICU alternate shifted handling only ignores punctuation, but
272	// IgnoreSymbols needs symbols and currency as well, so change the "variable top"
273	// to include all symbols and currency
274	#if HAVE_SET_MAX_VARIABLE
275	ucol_setMaxVariable(pClonedCollator, UCOL_REORDER_CODE_CURRENCY, pErr);
276	#else
277	// 0xfdfc is the last currency character before the first digit character
278	// in http://source.icu-project.org/repos/icu/icu/tags/release-52-1/source/data/unidata/FractionalUCA.txt
279	const UChar ignoreSymbolsVariableTop[] = { `0xfdfc` };
280	ucol_setVariableTop(pClonedCollator, ignoreSymbolsVariableTop, `1`, pErr);
281	#endif
282	}
283
284	ucol_setAttribute(pClonedCollator, UCOL_STRENGTH, strength, pErr);
285
286	// casing differs at the tertiary level.
287	// if strength is less than tertiary, but we are not ignoring case, then we need to flip CASE_LEVEL On
288	if (strength < UCOL_TERTIARY && !isIgnoreCase)
289	{
290	ucol_setAttribute(pClonedCollator, UCOL_CASE_LEVEL, UCOL_ON, pErr);
291	}
292
293	return pClonedCollator;
294	}
295
296	// Returns TRUE if all the collation elements in str are completely ignorable
297	bool CanIgnoreAllCollationElements(const UCollator* pColl, const UChar* lpStr, int32_t length)
298	{
299	bool result = false;
300	UErrorCode err = U_ZERO_ERROR;
301	UCollationElements* pCollElem = ucol_openElements(pColl, lpStr, length, &err);
302
303	if (U_SUCCESS(err))
304	{
305	int32_t curCollElem = UCOL_NULLORDER;
306
307	result = true;
308
309	while ((curCollElem = ucol_next(pCollElem, &err)) != UCOL_NULLORDER)
310	{
311	if (curCollElem != `0`)
312	{
313	result = false;
314	break;
315	}
316	}
317
318	if (U_FAILURE(err))
319	{
320	result = false;
321	}
322
323	ucol_closeElements(pCollElem);
324	}
325
326	return result;
327
328	}
329
330	extern "C" ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandle** ppSortHandle)
331	{
332	assert(ppSortHandle != nullptr);
333
334	ppSortHandle = new* (std::nothrow) SortHandle ();
335	if ((ppSortHandle) == nullptr*)
336	{
337	return GetResultCode(U_MEMORY_ALLOCATION_ERROR);
338	}
339
340	UErrorCode err = U_ZERO_ERROR;
341
342	(*ppSortHandle)->regular = ucol_open(lpLocaleName, &err);
343
344	if (U_FAILURE(err))
345	{
346	if ((ppSortHandle)->regular != nullptr*)
347	ucol_close((*ppSortHandle)->regular);
348
349	delete (*ppSortHandle);
350	(ppSortHandle) = nullptr*;
351	}
352
353	return GetResultCode(err);
354	}
355
356	extern "C" void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
357	{
358	ucol_close(pSortHandle->regular);
359	pSortHandle->regular = nullptr;
360
361	TCollatorMap::iterator it;
362	for (it = pSortHandle->collatorsPerOption.begin(); it != pSortHandle->collatorsPerOption.end(); it ++)
363	{
364	ucol_close(it ->second);
365	}
366
367	pthread_mutex_destroy(&pSortHandle->collatorsLockObject);
368
369	delete pSortHandle;
370	}
371
372	const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32_t options, UErrorCode* pErr)
373	{
374	UCollator* pCollator;
375	if (options == `0`)
376	{
377	pCollator = pSortHandle->regular;
378	}
379	else
380	{
381	int lockResult = pthread_mutex_lock(&pSortHandle->collatorsLockObject);
382	if (lockResult != `0`)
383	{
384	assert(false && "Unexpected pthread_mutex_lock return value.");
385	}
386
387	TCollatorMap::iterator entry = pSortHandle->collatorsPerOption.find(options);
388	if (entry == pSortHandle->collatorsPerOption.end())
389	{
390	pCollator = CloneCollatorWithOptions(pSortHandle->regular, options, pErr);
391	pSortHandle->collatorsPerOption [options] = pCollator;
392	}
393	else
394	{
395	pCollator = entry ->second;
396	}
397
398	pthread_mutex_unlock(&pSortHandle->collatorsLockObject);
399	}
400
401	return pCollator;
402	}
403
404	extern "C" int32_t GlobalizationNative_GetSortVersion(SortHandle* pSortHandle)
405	{
406	UErrorCode err = U_ZERO_ERROR;
407	const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, `0`, &err);
408	int32_t result = `0`;
409
410	if (U_SUCCESS(err))
411	{
412	ucol_getVersion(pColl, (uint8_t *) &result);
413	}
414	else
415	{
416	assert(false && "Unexpected ucol_getVersion to fail.");
417
418	// we didn't use UCOL_TAILORINGS_VERSION because it is deprecated in ICU v5
419	result = UCOL_RUNTIME_VERSION << `16` \| UCOL_BUILDER_VERSION;
420	}
421	return result;
422	}
423
424	/*
425	Function:
426	CompareString
427	*/
428	extern "C" int32_t GlobalizationNative_CompareString(
429	SortHandle* pSortHandle, const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length, int32_t options)
430	{
431	static_assert(UCOL_EQUAL == `0`, "managed side requires 0 for equal strings");
432	static_assert(UCOL_LESS < `0`, "managed side requires less than zero for a < b");
433	static_assert(UCOL_GREATER > `0`, "managed side requires greater than zero for a > b");
434
435	UCollationResult result = UCOL_EQUAL;
436	UErrorCode err = U_ZERO_ERROR;
437	const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);
438
439	if (U_SUCCESS(err))
440	{
441	result = ucol_strcoll(pColl, lpStr1, cwStr1Length, lpStr2, cwStr2Length);
442	}
443
444	return result;
445	}
446
447	/*
448	Function:
449	IndexOf
450	*/
451	extern "C" int32_t GlobalizationNative_IndexOf(
452	SortHandle* pSortHandle,
453	const UChar* lpTarget,
454	int32_t cwTargetLength,
455	const UChar* lpSource,
456	int32_t cwSourceLength,
457	int32_t options,
458	int32_t* pMatchedLength)
459	{
460	static_assert(USEARCH_DONE == -`1`, "managed side requires -1 for not found");
461
462	int32_t result = USEARCH_DONE;
463	UErrorCode err = U_ZERO_ERROR;
464	const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);
465
466	if (U_SUCCESS(err))
467	{
468	UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err);
469
470	if (U_SUCCESS(err))
471	{
472	result = usearch_first(pSearch, &err);
473
474	// if the search was successful,
475	// we'll try to get the matched string length.
476	if(result != USEARCH_DONE && pMatchedLength != NULL)
477	{
478	*pMatchedLength = usearch_getMatchedLength(pSearch);
479	}
480	usearch_close(pSearch);
481	}
482	}
483
484	return result;
485	}
486
487	/*
488	Function:
489	LastIndexOf
490	*/
491	extern "C" int32_t GlobalizationNative_LastIndexOf(
492	SortHandle* pSortHandle,
493	const UChar* lpTarget,
494	int32_t cwTargetLength,
495	const UChar* lpSource,
496	int32_t cwSourceLength,
497	int32_t options)
498	{
499	static_assert(USEARCH_DONE == -`1`, "managed side requires -1 for not found");
500
501	int32_t result = USEARCH_DONE;
502	UErrorCode err = U_ZERO_ERROR;
503	const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);
504
505	if (U_SUCCESS(err))
506	{
507	UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err);
508
509	if (U_SUCCESS(err))
510	{
511	result = usearch_last(pSearch, &err);
512	usearch_close(pSearch);
513	}
514	}
515
516	return result;
517	}
518
519	/*
520	Static Function:
521	AreEqualOrdinalIgnoreCase
522	*/
523	static bool AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two)
524	{
525	// Return whether the two characters are identical or would be identical if they were upper-cased.
526
527	if (one == two)
528	{
529	return true;
530	}
531
532	if (one == `0x0131` \|\| two == `0x0131`)
533	{
534	// On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
535	// capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
536	// We special case it to match the Windows invariant behavior.
537	return false;
538	}
539
540	return u_toupper(one) == u_toupper(two);
541	}
542
543	/*
544	Function:
545	IndexOfOrdinalIgnoreCase
546	*/
547	extern "C" int32_t GlobalizationNative_IndexOfOrdinalIgnoreCase(
548	const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength, int32_t findLast)
549	{
550	int32_t result = -`1`;
551
552	int32_t endIndex = cwSourceLength - cwTargetLength;
553	assert(endIndex >= `0`);
554
555	int32_t i = `0`;
556	while (i <= endIndex)
557	{
558	int32_t srcIdx = i, trgIdx = `0`;
559	const UChar src = lpSource, trg = lpTarget;
560	UChar32 srcCodepoint, trgCodepoint;
561
562	bool match = true;
563	while (trgIdx < cwTargetLength)
564	{
565	U16_NEXT(src, srcIdx, cwSourceLength, srcCodepoint);
566	U16_NEXT(trg, trgIdx, cwTargetLength, trgCodepoint);
567	if (!AreEqualOrdinalIgnoreCase(srcCodepoint, trgCodepoint))
568	{
569	match = false;
570	break;
571	}
572	}
573
574	if (match)
575	{
576	result = i;
577	if (!findLast)
578	{
579	break;
580	}
581	}
582
583	U16_FWD_1(lpSource, i, cwSourceLength);
584	}
585
586	return result;
587	}
588
589	/*
590	Return value is a "Win32 BOOL" (1 = true, 0 = false)
591	*/
592	extern "C" int32_t GlobalizationNative_StartsWith(
593	SortHandle* pSortHandle,
594	const UChar* lpTarget,
595	int32_t cwTargetLength,
596	const UChar* lpSource,
597	int32_t cwSourceLength,
598	int32_t options)
599	{
600	int32_t result = FALSE;
601	UErrorCode err = U_ZERO_ERROR;
602	const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);
603
604	if (U_SUCCESS(err))
605	{
606	UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err);
607	int32_t idx = USEARCH_DONE;
608
609	if (U_SUCCESS(err))
610	{
611	idx = usearch_first(pSearch, &err);
612	if (idx != USEARCH_DONE)
613	{
614	if (idx == `0`)
615	{
616	result = TRUE;
617	}
618	else
619	{
620	result = CanIgnoreAllCollationElements(pColl, lpSource, idx);
621	}
622	}
623
624	usearch_close(pSearch);
625	}
626	}
627
628	return result;
629	}
630
631	/*
632	Return value is a "Win32 BOOL" (1 = true, 0 = false)
633	*/
634	extern "C" int32_t GlobalizationNative_EndsWith(
635	SortHandle* pSortHandle,
636	const UChar* lpTarget,
637	int32_t cwTargetLength,
638	const UChar* lpSource,
639	int32_t cwSourceLength,
640	int32_t options)
641	{
642	int32_t result = FALSE;
643	UErrorCode err = U_ZERO_ERROR;
644	const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);
645
646	if (U_SUCCESS(err))
647	{
648	UStringSearch* pSearch = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, nullptr, &err);
649	int32_t idx = USEARCH_DONE;
650
651	if (U_SUCCESS(err))
652	{
653	idx = usearch_last(pSearch, &err);
654
655	if (idx != USEARCH_DONE)
656	{
657	if ((idx + usearch_getMatchedLength(pSearch)) == cwSourceLength)
658	{
659	result = TRUE;
660	}
661	else
662	{
663	int32_t matchEnd = idx + usearch_getMatchedLength(pSearch);
664	int32_t remainingStringLength = cwSourceLength - matchEnd;
665
666	result = CanIgnoreAllCollationElements(pColl, lpSource + matchEnd, remainingStringLength);
667	}
668	}
669
670	usearch_close(pSearch);
671	}
672	}
673
674	return result;
675	}
676
677	extern "C" int32_t GlobalizationNative_GetSortKey(
678	SortHandle* pSortHandle,
679	const UChar* lpStr,
680	int32_t cwStrLength,
681	uint8_t* sortKey,
682	int32_t cbSortKeyLength,
683	int32_t options)
684	{
685	UErrorCode err = U_ZERO_ERROR;
686	const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);
687	int32_t result = `0`;
688
689	if (U_SUCCESS(err))
690	{
691	result = ucol_getSortKey(pColl, lpStr, cwStrLength, sortKey, cbSortKeyLength);
692	}
693
694	return result;
695	}
696
697	extern "C" int32_t GlobalizationNative_CompareStringOrdinalIgnoreCase(
698	const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length)
699	{
700	assert(lpStr1 != nullptr);
701	assert(cwStr1Length >= `0`);
702	assert(lpStr2 != nullptr);
703	assert(cwStr2Length >= `0`);
704
705	int32_t str1Idx = `0`;
706	int32_t str2Idx = `0`;
707
708	while (str1Idx < cwStr1Length && str2Idx < cwStr2Length)
709	{
710	UChar32 str1Codepoint;
711	UChar32 str2Codepoint;
712
713	U16_NEXT(lpStr1, str1Idx, cwStr1Length, str1Codepoint);
714	U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint);
715
716	if (str1Codepoint != str2Codepoint && u_toupper(str1Codepoint) != u_toupper(str2Codepoint))
717	{
718	return str1Codepoint < str2Codepoint ? -`1` : `1`;
719	}
720	}
721
722	if (cwStr1Length < cwStr2Length)
723	{
724	return -`1`;
725	}
726
727	if (cwStr2Length < cwStr1Length)
728	{
729	return `1`;
730	}
731
732	return `0`;
733	}
734

// Browse the source code of CoreCLR/corefx/System.Globalization.Native/collation.cpp