unistr_case.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/unistr_case.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 1999-2014, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: unistr_case.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:2
14	*
15	* created on: 2004aug19
16	* created by: Markus W. Scherer
17	*
18	* Case-mapping functions moved here from unistr.cpp
19	*/
20
21	#include "unicode/utypes.h"
22	#include "unicode/brkiter.h"
23	#include "unicode/casemap.h"
24	#include "unicode/edits.h"
25	#include "unicode/putil.h"
26	#include "cstring.h"
27	#include "cmemory.h"
28	#include "unicode/ustring.h"
29	#include "unicode/unistr.h"
30	#include "unicode/uchar.h"
31	#include "uassert.h"
32	#include "ucasemap_imp.h"
33	#include "uelement.h"
34
35	U_NAMESPACE_BEGIN
36
37	//========================================
38	// Read-only implementation
39	//========================================
40
41	int8_t
42	UnicodeString::doCaseCompare(int32_t start,
43	int32_t length,
44	const UChar *srcChars,
45	int32_t srcStart,
46	int32_t srcLength,
47	uint32_t options) const
48	{
49	// compare illegal string values
50	// treat const UChar srcChars==NULL as an empty string*
51	if(isBogus()) {
52	return -`1`;
53	}
54
55	// pin indices to legal values
56	pinIndices(start, length);
57
58	if(srcChars == NULL) {
59	srcStart = srcLength = `0`;
60	}
61
62	// get the correct pointer
63	const UChar *chars = getArrayStart();
64
65	chars += start;
66	if(srcStart!=`0`) {
67	srcChars += srcStart;
68	}
69
70	if(chars != srcChars) {
71	UErrorCode errorCode=U_ZERO_ERROR;
72	int32_t result=u_strcmpFold(chars, length, srcChars, srcLength,
73	options\|U_COMPARE_IGNORE_CASE, &errorCode);
74	if(result!=`0`) {
75	return (int8_t)(result >> `24` \| `1`);
76	}
77	} else {
78	// get the srcLength if necessary
79	if(srcLength < `0`) {
80	srcLength = u_strlen(srcChars + srcStart);
81	}
82	if(length != srcLength) {
83	return (int8_t)((length - srcLength) >> `24` \| `1`);
84	}
85	}
86	return `0`;
87	}
88
89	//========================================
90	// Write implementation
91	//========================================
92
93	UnicodeString &
94	UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
95	UStringCaseMapper *stringCaseMapper) {
96	if(isEmpty() \|\| !isWritable()) {
97	// nothing to do
98	return *this;
99	}
100
101	UChar oldBuffer[`2` * US_STACKBUF_SIZE];
102	UChar *oldArray;
103	int32_t oldLength = length();
104	int32_t newLength;
105	UBool writable = isBufferWritable();
106	UErrorCode errorCode = U_ZERO_ERROR;
107
108	#if !UCONFIG_NO_BREAK_ITERATION
109	// Read-only alias to the original string contents for the titlecasing BreakIterator.
110	// We cannot set the iterator simply to this because this is being modified.
111	UnicodeString oldString;
112	#endif
113
114	// Try to avoid heap-allocating a new character array for this string.
115	if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
116	// Short string: Copy the contents into a temporary buffer and
117	// case-map back into the current array, or into the stack buffer.
118	UChar *buffer = getArrayStart();
119	int32_t capacity;
120	oldArray = oldBuffer;
121	u_memcpy(oldBuffer, buffer, oldLength);
122	if (writable) {
123	capacity = getCapacity();
124	} else {
125	// Switch from the read-only alias or shared heap buffer to the stack buffer.
126	if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, / doCopyArray= / FALSE)) {
127	return *this;
128	}
129	U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
130	buffer = fUnion.fStackFields.fBuffer;
131	capacity = US_STACKBUF_SIZE;
132	}
133	#if !UCONFIG_NO_BREAK_ITERATION
134	if (iter != nullptr) {
135	oldString.setTo(FALSE, oldArray, oldLength);
136	iter->setText(oldString);
137	}
138	#endif
139	newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
140	buffer, capacity,
141	oldArray, oldLength, NULL, errorCode);
142	if (U_SUCCESS(errorCode)) {
143	setLength(newLength);
144	return *this;
145	} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
146	// common overflow handling below
147	} else {
148	setToBogus();
149	return *this;
150	}
151	} else {
152	// Longer string or read-only buffer:
153	// Collect only changes and then apply them to this string.
154	// Case mapping often changes only small parts of a string,
155	// and often does not change its length.
156	oldArray = getArrayStart();
157	Edits edits;
158	UChar replacementChars[`200`];
159	#if !UCONFIG_NO_BREAK_ITERATION
160	if (iter != nullptr) {
161	oldString.setTo(FALSE, oldArray, oldLength);
162	iter->setText(oldString);
163	}
164	#endif
165	stringCaseMapper(caseLocale, options \| U_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR
166	replacementChars, UPRV_LENGTHOF(replacementChars),
167	oldArray, oldLength, &edits, errorCode);
168	if (U_SUCCESS(errorCode)) {
169	// Grow the buffer at most once, not for multiple doReplace() calls.
170	newLength = oldLength + edits.lengthDelta();
171	if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
172	return *this;
173	}
174	for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) {
175	doReplace(ei.destinationIndex(), ei.oldLength(),
176	replacementChars, ei.replacementIndex(), ei.newLength());
177	}
178	if (U_FAILURE(errorCode)) {
179	setToBogus();
180	}
181	return *this;
182	} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
183	// common overflow handling below
184	newLength = oldLength + edits.lengthDelta();
185	} else {
186	setToBogus();
187	return *this;
188	}
189	}
190
191	// Handle buffer overflow, newLength is known.
192	// We need to allocate a new buffer for the internal string case mapping function.
193	// This is very similar to how doReplace() keeps the old array pointer
194	// and deletes the old array itself after it is done.
195	// In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
196	int32_t *bufferToDelete = `0`;
197	if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
198	return *this;
199	}
200	errorCode = U_ZERO_ERROR;
201	// No need to iter->setText() again: The case mapper restarts via iter->first().
202	newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
203	getArrayStart(), getCapacity(),
204	oldArray, oldLength, NULL, errorCode);
205	if (bufferToDelete) {
206	uprv_free(bufferToDelete);
207	}
208	if (U_SUCCESS(errorCode)) {
209	setLength(newLength);
210	} else {
211	setToBogus();
212	}
213	return *this;
214	}
215
216	UnicodeString &
217	UnicodeString::foldCase(uint32_t options) {
218	return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold);
219	}
220
221	U_NAMESPACE_END
222
223	// Defined here to reduce dependencies on break iterator
224	U_CAPI int32_t U_EXPORT2
225	uhash_hashCaselessUnicodeString(const UElement key) {
226	U_NAMESPACE_USE
227	const UnicodeString str = (const* UnicodeString*) key.pointer;
228	if (str == NULL) {
229	return `0`;
230	}
231	// Inefficient; a better way would be to have a hash function in
232	// UnicodeString that does case folding on the fly.
233	UnicodeString copy(*str);
234	return copy.foldCase().hashCode();
235	}
236
237	// Defined here to reduce dependencies on break iterator
238	U_CAPI UBool U_EXPORT2
239	uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2) {
240	U_NAMESPACE_USE
241	const UnicodeString str1 = (const* UnicodeString*) key1.pointer;
242	const UnicodeString str2 = (const* UnicodeString*) key2.pointer;
243	if (str1 == str2) {
244	return TRUE;
245	}
246	if (str1 == NULL \|\| str2 == NULL) {
247	return FALSE;
248	}
249	return str1->caseCompare(*str2, U_FOLD_CASE_DEFAULT) == `0`;
250	}
251

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/unistr_case.cpp