unistr.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/unistr.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	* Copyright (C) 1999-2016, International Business Machines Corporation and
6	* others. All Rights Reserved.
7	******************************************************************************
8	*
9	* File unistr.cpp
10	*
11	* Modification History:
12	*
13	* Date Name Description
14	* 09/25/98 stephen Creation.
15	* 04/20/99 stephen Overhauled per 4/16 code review.
16	* 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17	* 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18	* Replaceable.
19	* 06/25/01 grhoten Removed the dependency on iostream
20	******************************************************************************
21	*/
22
23	#include "unicode/utypes.h"
24	#include "unicode/appendable.h"
25	#include "unicode/putil.h"
26	#include "cstring.h"
27	#include "cmemory.h"
28	#include "unicode/ustring.h"
29	#include "unicode/unistr.h"
30	#include "unicode/utf.h"
31	#include "unicode/utf16.h"
32	#include "uelement.h"
33	#include "ustr_imp.h"
34	#include "umutex.h"
35	#include "uassert.h"
36
37	#if 0
38
39	#include <iostream>
40	using namespace std;
41
42	//DEBUGGING
43	void
44	print(const UnicodeString& s,
45	const char *name)
46	{
47	UChar c;
48	cout << name << ":\|";
49	for(int i = `0`; i < s.length(); ++i) {
50	c = s[i];
51	if(c>= `0x007E` \|\| c < `0x0020`)
52	cout << "[0x" << hex << s[i] << "]";
53	else
54	cout << (char) s[i];
55	}
56	cout << `'\|'` << endl;
57	}
58
59	void
60	print(const UChar *s,
61	int32_t len,
62	const char *name)
63	{
64	UChar c;
65	cout << name << ":\|";
66	for(int i = `0`; i < len; ++i) {
67	c = s[i];
68	if(c>= `0x007E` \|\| c < `0x0020`)
69	cout << "[0x" << hex << s[i] << "]";
70	else
71	cout << (char) s[i];
72	}
73	cout << `'\|'` << endl;
74	}
75	// END DEBUGGING
76	#endif
77
78	// Local function definitions for now
79
80	// need to copy areas that may overlap
81	static
82	inline void
83	us_arrayCopy(const UChar *src, int32_t srcStart,
84	UChar *dst, int32_t dstStart, int32_t count)
85	{
86	if(count>`0`) {
87	uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88	}
89	}
90
91	// u_unescapeAt() callback to get a UChar from a UnicodeString
92	U_CDECL_BEGIN
93	static UChar U_CALLCONV
94	UnicodeString_charAt(int32_t offset, void *context) {
95	return ((icu::UnicodeString*) context)->charAt(offset);
96	}
97	U_CDECL_END
98
99	U_NAMESPACE_BEGIN
100
101	/ The Replaceable virtual destructor can't be defined in the header*
102	due to how AIX works with multiple definitions of virtual functions.
103	*/
104	Replaceable::~Replaceable() {}
105
// RTTI implementation for UnicodeString.
// NOTE(review): the macro is defined outside this file; presumably it emits
// the class-ID boilerplate declared in the class header - confirm there.
106	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107
108	UnicodeString U_EXPORT2
109	operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110	return
111	UnicodeString (s1.length()+s2.length()+`1`, (UChar32)`0`, `0`).
112	append(s1).
113	append(s2);
114	}
115
116	//========================================
117	// Reference Counting functions, put at top of file so that optimizing compilers
118	// have a chance to automatically inline.
119	//========================================
120
121	void
122	UnicodeString::addRef() {
123	umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - `1`);
124	}
125
126	int32_t
127	UnicodeString::removeRef() {
128	return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - `1`);
129	}
130
131	int32_t
132	UnicodeString::refCount() const {
133	return umtx_loadAcquire(((u_atomic_int32_t )fUnion.fFields.fArray - `1`));
134	}
135
136	void
137	UnicodeString::releaseArray() {
138	if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == `0`) {
139	uprv_free((int32_t *)fUnion.fFields.fArray - `1`);
140	}
141	}
142
143
144
145	//========================================
146	// Constructors
147	//========================================
148
149	// The default constructor is inline in unistr.h.
150
151	UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152	fUnion.fFields.fLengthAndFlags = `0`;
153	if(count <= `0` \|\| (uint32_t)c > `0x10ffff`) {
154	// just allocate and do not do anything else
155	allocate(capacity);
156	} else if(c <= `0xffff`) {
157	int32_t length = count;
158	if(capacity < length) {
159	capacity = length;
160	}
161	if(allocate(capacity)) {
162	UChar *array = getArrayStart();
163	UChar unit = (UChar)c;
164	for(int32_t i = `0`; i < length; ++i) {
165	array[i] = unit;
166	}
167	setLength(length);
168	}
169	} else { // supplementary code point, write surrogate pairs
170	if(count > (INT32_MAX / `2`)) {
171	// We would get more than 2G UChars.
172	allocate(capacity);
173	return;
174	}
175	int32_t length = count * `2`;
176	if(capacity < length) {
177	capacity = length;
178	}
179	if(allocate(capacity)) {
180	UChar *array = getArrayStart();
181	UChar lead = U16_LEAD(c);
182	UChar trail = U16_TRAIL(c);
183	for(int32_t i = `0`; i < length; i += `2`) {
184	array[i] = lead;
185	array[i + `1`] = trail;
186	}
187	setLength(length);
188	}
189	}
190	}
191
192	UnicodeString::UnicodeString(UChar ch) {
193	fUnion.fFields.fLengthAndFlags = kLength1 \| kShortString;
194	fUnion.fStackFields.fBuffer[`0`] = ch;
195	}
196
197	UnicodeString::UnicodeString(UChar32 ch) {
198	fUnion.fFields.fLengthAndFlags = kShortString;
199	int32_t i = `0`;
200	UBool isError = FALSE;
201	U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202	// We test isError so that the compiler does not complain that we don't.
203	// If isError then i==0 which is what we want anyway.
204	if(!isError) {
205	setShortLength(i);
206	}
207	}
208
209	UnicodeString::UnicodeString(const UChar *text) {
210	fUnion.fFields.fLengthAndFlags = kShortString;
211	doAppend(text, `0`, -`1`);
212	}
213
214	UnicodeString::UnicodeString(const UChar *text,
215	int32_t textLength) {
216	fUnion.fFields.fLengthAndFlags = kShortString;
217	doAppend(text, `0`, textLength);
218	}
219
220	UnicodeString::UnicodeString(UBool isTerminated,
221	ConstChar16Ptr textPtr,
222	int32_t textLength) {
223	fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224	const UChar *text = textPtr;
225	if(text == NULL) {
226	// treat as an empty string, do not alias
227	setToEmpty();
228	} else if(textLength < -`1` \|\|
229	(textLength == -`1` && !isTerminated) \|\|
230	(textLength >= `0` && isTerminated && text[textLength] != `0`)
231	) {
232	setToBogus();
233	} else {
234	if(textLength == -`1`) {
235	// text is terminated, or else it would have failed the above test
236	textLength = u_strlen(text);
237	}
238	setArray(const_cast<UChar *>(text), textLength,
239	isTerminated ? textLength + `1` : textLength);
240	}
241	}
242
243	UnicodeString::UnicodeString(UChar *buff,
244	int32_t buffLength,
245	int32_t buffCapacity) {
246	fUnion.fFields.fLengthAndFlags = kWritableAlias;
247	if(buff == NULL) {
248	// treat as an empty string, do not alias
249	setToEmpty();
250	} else if(buffLength < -`1` \|\| buffCapacity < `0` \|\| buffLength > buffCapacity) {
251	setToBogus();
252	} else {
253	if(buffLength == -`1`) {
254	// fLength = u_strlen(buff); but do not look beyond buffCapacity
255	const UChar p = buff, limit = buff + buffCapacity;
256	while(p != limit && *p != `0`) {
257	++p;
258	}
259	buffLength = (int32_t)(p - buff);
260	}
261	setArray(buff, buffLength, buffCapacity);
262	}
263	}
264
265	UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266	fUnion.fFields.fLengthAndFlags = kShortString;
267	if(src==NULL) {
268	// treat as an empty string
269	} else {
270	if(length<`0`) {
271	length=(int32_t)uprv_strlen(src);
272	}
273	if(cloneArrayIfNeeded(length, length, FALSE)) {
274	u_charsToUChars(src, getArrayStart(), length);
275	setLength(length);
276	} else {
277	setToBogus();
278	}
279	}
280	}
281
282	#if U_CHARSET_IS_UTF8
283
284	UnicodeString::UnicodeString(const char *codepageData) {
285	fUnion.fFields.fLengthAndFlags = kShortString;
286	if(codepageData != `0`) {
287	setToUTF8(codepageData);
288	}
289	}
290
291	UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292	fUnion.fFields.fLengthAndFlags = kShortString;
293	// if there's nothing to convert, do nothing
294	if(codepageData == `0` \|\| dataLength == `0` \|\| dataLength < -`1`) {
295	return;
296	}
297	if(dataLength == -`1`) {
298	dataLength = (int32_t)uprv_strlen(codepageData);
299	}
300	setToUTF8(StringPiece (codepageData, dataLength));
301	}
302
303	// else see unistr_cnv.cpp
304	#endif
305
306	UnicodeString::UnicodeString(const UnicodeString& that) {
307	fUnion.fFields.fLengthAndFlags = kShortString;
308	copyFrom(that);
309	}
310
311	UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
312	copyFieldsFrom(src, TRUE);
313	}
314
315	UnicodeString::UnicodeString(const UnicodeString& that,
316	int32_t srcStart) {
317	fUnion.fFields.fLengthAndFlags = kShortString;
318	setTo(that, srcStart);
319	}
320
321	UnicodeString::UnicodeString(const UnicodeString& that,
322	int32_t srcStart,
323	int32_t srcLength) {
324	fUnion.fFields.fLengthAndFlags = kShortString;
325	setTo(that, srcStart, srcLength);
326	}
327
328	// Replaceable base class clone() default implementation, does not clone
329	Replaceable *
330	Replaceable::clone() const {
331	return NULL;
332	}
333
334	// UnicodeString overrides clone() with a real implementation
335	UnicodeString *
336	UnicodeString::clone() const {
337	return new UnicodeString (*this);
338	}
339
340	//========================================
341	// array allocation
342	//========================================
343
namespace {

// Constant part of the extra room that getGrowCapacity() adds.
const int32_t kGrowSize = 128;

// The number of bytes for one int32_t reference counter and capacity UChars
// must fit into a 32-bit size_t (at least when on a 32-bit platform).
// We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(),
// and round up to a multiple of 16 bytes.
// This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
// (With more complicated checks we could go up to 0x7ffffffd without rounding up,
// but that does not seem worth it.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns the capacity to allocate for a string of newLength units:
// newLength plus a quarter of newLength plus kGrowSize, capped at
// kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t extra = (newLength >> 2) + kGrowSize;
  const int32_t room = kMaxCapacity - newLength;
  return extra <= room ? newLength + extra : kMaxCapacity;
}

}  // namespace
367
368	UBool
369	UnicodeString::allocate(int32_t capacity) {
370	if(capacity <= US_STACKBUF_SIZE) {
371	fUnion.fFields.fLengthAndFlags = kShortString;
372	return TRUE;
373	}
374	if(capacity <= kMaxCapacity) {
375	++capacity; // for the NUL
376	// Switch to size_t which is unsigned so that we can allocate up to 4GB.
377	// Reference counter + UChars.
378	size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
379	// Round up to a multiple of 16.
380	numBytes = (numBytes + `15`) & ~`15`;
381	int32_t array = (int32_t ) uprv_malloc(numBytes);
382	if(array != NULL) {
383	// set initial refCount and point behind the refCount
384	*array++ = `1`;
385	numBytes -= sizeof(int32_t);
386
387	// have fArray point to the first UChar
388	fUnion.fFields.fArray = (UChar *)array;
389	fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
390	fUnion.fFields.fLengthAndFlags = kLongString;
391	return TRUE;
392	}
393	}
394	fUnion.fFields.fLengthAndFlags = kIsBogus;
395	fUnion.fFields.fArray = `0`;
396	fUnion.fFields.fCapacity = `0`;
397	return FALSE;
398	}
399
400	//========================================
401	// Destructor
402	//========================================
403
404	#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
405	static u_atomic_int32_t finalLengthCounts[`0x400`]; // UnicodeString::kMaxShortLength+1
406	static u_atomic_int32_t beyondCount(`0`);
407
408	U_CAPI void unistr_printLengths() {
409	int32_t i;
410	for(i = `0`; i <= `59`; ++i) {
411	printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
412	}
413	int32_t beyond = beyondCount;
414	for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
415	beyond += finalLengthCounts[i];
416	}
417	printf(">59, %9d\n", beyond);
418	}
419	#endif
420
421	UnicodeString::~UnicodeString()
422	{
423	#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
424	// Count lengths of strings at the end of their lifetime.
425	// Useful for discussion of a desirable stack buffer size.
426	// Count the contents length, not the optional NUL terminator nor further capacity.
427	// Ignore open-buffer strings and strings which alias external storage.
428	if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer\|kReadonlyAlias\|kWritableAlias)) == `0`) {
429	if(hasShortLength()) {
430	umtx_atomic_inc(finalLengthCounts + getShortLength());
431	} else {
432	umtx_atomic_inc(&beyondCount);
433	}
434	}
435	#endif
436
437	releaseArray();
438	}
439
440	//========================================
441	// Factory methods
442	//========================================
443
444	UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
445	UnicodeString result;
446	result.setToUTF8(utf8);
447	return result;
448	}
449
450	UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
451	UnicodeString result;
452	int32_t capacity;
453	// Most UTF-32 strings will be BMP-only and result in a same-length
454	// UTF-16 string. We overestimate the capacity just slightly,
455	// just in case there are a few supplementary characters.
456	if(length <= US_STACKBUF_SIZE) {
457	capacity = US_STACKBUF_SIZE;
458	} else {
459	capacity = length + (length >> `4`) + `4`;
460	}
461	do {
462	UChar *utf16 = result.getBuffer(capacity);
463	int32_t length16;
464	UErrorCode errorCode = U_ZERO_ERROR;
465	u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
466	utf32, length,
467	`0xfffd`, // Substitution character.
468	NULL, // Don't care about number of substitutions.
469	&errorCode);
470	result.releaseBuffer(length16);
471	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
472	capacity = length16 + `1`; // +1 for the terminating NUL.
473	continue;
474	} else if(U_FAILURE(errorCode)) {
475	result.setToBogus();
476	}
477	break;
478	} while(TRUE);
479	return result;
480	}
481
482	//========================================
483	// Assignment
484	//========================================
485
486	UnicodeString &
487	UnicodeString::operator=(const UnicodeString &src) {
488	return copyFrom(src);
489	}
490
491	UnicodeString &
492	UnicodeString::fastCopyFrom(const UnicodeString &src) {
493	return copyFrom(src, TRUE);
494	}
495
496	UnicodeString &
497	UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
498	// if assigning to ourselves, do nothing
499	if(this == &src) {
500	return *this;
501	}
502
503	// is the right side bogus?
504	if(src.isBogus()) {
505	setToBogus();
506	return *this;
507	}
508
509	// delete the current contents
510	releaseArray();
511
512	if(src.isEmpty()) {
513	// empty string - use the stack buffer
514	setToEmpty();
515	return *this;
516	}
517
518	// fLength>0 and not an "open" src.getBuffer(minCapacity)
519	fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
520	switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
521	case kShortString:
522	// short string using the stack buffer, do the same
523	uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
524	getShortLength() * U_SIZEOF_UCHAR);
525	break;
526	case kLongString:
527	// src uses a refCounted string buffer, use that buffer with refCount
528	// src is const, use a cast - we don't actually change it
529	((UnicodeString &)src).addRef();
530	// copy all fields, share the reference-counted buffer
531	fUnion.fFields.fArray = src.fUnion.fFields.fArray;
532	fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
533	if(!hasShortLength()) {
534	fUnion.fFields.fLength = src.fUnion.fFields.fLength;
535	}
536	break;
537	case kReadonlyAlias:
538	if(fastCopy) {
539	// src is a readonly alias, do the same
540	// -> maintain the readonly alias as such
541	fUnion.fFields.fArray = src.fUnion.fFields.fArray;
542	fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
543	if(!hasShortLength()) {
544	fUnion.fFields.fLength = src.fUnion.fFields.fLength;
545	}
546	break;
547	}
548	// else if(!fastCopy) fall through to case kWritableAlias
549	// -> allocate a new buffer and copy the contents
550	U_FALLTHROUGH;
551	case kWritableAlias: {
552	// src is a writable alias; we make a copy of that instead
553	int32_t srcLength = src.length();
554	if(allocate(srcLength)) {
555	u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
556	setLength(srcLength);
557	break;
558	}
559	// if there is not enough memory, then fall through to setting to bogus
560	U_FALLTHROUGH;
561	}
562	default:
563	// if src is bogus, set ourselves to bogus
564	// do not call setToBogus() here because fArray and flags are not consistent here
565	fUnion.fFields.fLengthAndFlags = kIsBogus;
566	fUnion.fFields.fArray = `0`;
567	fUnion.fFields.fCapacity = `0`;
568	break;
569	}
570
571	return *this;
572	}
573
574	UnicodeString &UnicodeString::operator=(UnicodeString &&src) U_NOEXCEPT {
575	// No explicit check for self move assignment, consistent with standard library.
576	// Self move assignment causes no crash nor leak but might make the object bogus.
577	releaseArray();
578	copyFieldsFrom(src, TRUE);
579	return *this;
580	}
581
582	// Same as move assignment except without memory management.
583	void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
584	int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
585	if(lengthAndFlags & kUsingStackBuffer) {
586	// Short string using the stack buffer, copy the contents.
587	// Check for self assignment to prevent "overlap in memcpy" warnings,
588	// although it should be harmless to copy a buffer to itself exactly.
589	if(this != &src) {
590	uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
591	getShortLength() * U_SIZEOF_UCHAR);
592	}
593	} else {
594	// In all other cases, copy all fields.
595	fUnion.fFields.fArray = src.fUnion.fFields.fArray;
596	fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
597	if(!hasShortLength()) {
598	fUnion.fFields.fLength = src.fUnion.fFields.fLength;
599	}
600	if(setSrcToBogus) {
601	// Set src to bogus without releasing any memory.
602	src.fUnion.fFields.fLengthAndFlags = kIsBogus;
603	src.fUnion.fFields.fArray = NULL;
604	src.fUnion.fFields.fCapacity = `0`;
605	}
606	}
607	}
608
609	void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
610	UnicodeString temp; // Empty short string: Known not to need releaseArray().
611	// Copy fields without resetting source values in between.
612	temp.copyFieldsFrom(*this, FALSE);
613	this->copyFieldsFrom(other, FALSE);
614	other.copyFieldsFrom(temp, FALSE);
615	// Set temp to an empty string so that other's memory is not released twice.
616	temp.fUnion.fFields.fLengthAndFlags = kShortString;
617	}
618
619	//========================================
620	// Miscellaneous operations
621	//========================================
622
623	UnicodeString UnicodeString::unescape() const {
624	UnicodeString result(length(), (UChar32)`0`, (int32_t)`0`); // construct with capacity
625	if (result.isBogus()) {
626	return result;
627	}
628	const UChar *array = getBuffer();
629	int32_t len = length();
630	int32_t prev = `0`;
631	for (int32_t i=`0`;;) {
632	if (i == len) {
633	result.append(array, prev, len - prev);
634	break;
635	}
636	if (array[i++] == `0x5C` /'\\'/) {
637	result.append(array, prev, (i - `1`) - prev);
638	UChar32 c = unescapeAt(i); // advances i
639	if (c < `0`) {
640	result.remove(); // return empty string
641	break; // invalid escape sequence
642	}
643	result.append(c);
644	prev = i;
645	}
646	}
647	return result;
648	}
649
650	UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
651	return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void)this*);
652	}
653
654	//========================================
655	// Read-only implementation
656	//========================================
657	UBool
658	UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
659	// Requires: this & text not bogus and have same lengths.
660	// Byte-wise comparison works for equality regardless of endianness.
661	return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == `0`;
662	}
663
664	int8_t
665	UnicodeString::doCompare( int32_t start,
666	int32_t length,
667	const UChar *srcChars,
668	int32_t srcStart,
669	int32_t srcLength) const
670	{
671	// compare illegal string values
672	if(isBogus()) {
673	return -`1`;
674	}
675
676	// pin indices to legal values
677	pinIndices(start, length);
678
679	if(srcChars == NULL) {
680	// treat const UChar srcChars==NULL as an empty string*
681	return length == `0` ? `0` : `1`;
682	}
683
684	// get the correct pointer
685	const UChar *chars = getArrayStart();
686
687	chars += start;
688	srcChars += srcStart;
689
690	int32_t minLength;
691	int8_t lengthResult;
692
693	// get the srcLength if necessary
694	if(srcLength < `0`) {
695	srcLength = u_strlen(srcChars + srcStart);
696	}
697
698	// are we comparing different lengths?
699	if(length != srcLength) {
700	if(length < srcLength) {
701	minLength = length;
702	lengthResult = -`1`;
703	} else {
704	minLength = srcLength;
705	lengthResult = `1`;
706	}
707	} else {
708	minLength = length;
709	lengthResult = `0`;
710	}
711
712	/*
713	* note that uprv_memcmp() returns an int but we return an int8_t;
714	* we need to take care not to truncate the result -
715	* one way to do this is to right-shift the value to
716	* move the sign bit into the lower 8 bits and making sure that this
717	* does not become 0 itself
718	*/
719
720	if(minLength > `0` && chars != srcChars) {
721	int32_t result;
722
723	# if U_IS_BIG_ENDIAN
724	// big-endian: byte comparison works
725	result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
726	if(result != `0`) {
727	return (int8_t)(result >> `15` \| `1`);
728	}
729	# else
730	// little-endian: compare UChar units
731	do {
732	result = ((int32_t)(chars++) - (int32_t)(srcChars++));
733	if(result != `0`) {
734	return (int8_t)(result >> `15` \| `1`);
735	}
736	} while(--minLength > `0`);
737	# endif
738	}
739	return lengthResult;
740	}
741
742	/ String compare in code point order - doCompare() compares in code unit order. /
743	int8_t
744	UnicodeString::doCompareCodePointOrder(int32_t start,
745	int32_t length,
746	const UChar *srcChars,
747	int32_t srcStart,
748	int32_t srcLength) const
749	{
750	// compare illegal string values
751	// treat const UChar srcChars==NULL as an empty string*
752	if(isBogus()) {
753	return -`1`;
754	}
755
756	// pin indices to legal values
757	pinIndices(start, length);
758
759	if(srcChars == NULL) {
760	srcStart = srcLength = `0`;
761	}
762
763	int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
764	/ translate the 32-bit result into an 8-bit one /
765	if(diff!=`0`) {
766	return (int8_t)(diff >> `15` \| `1`);
767	} else {
768	return `0`;
769	}
770	}
771
772	int32_t
773	UnicodeString::getLength() const {
774	return length();
775	}
776
777	UChar
778	UnicodeString::getCharAt(int32_t offset) const {
779	return charAt(offset);
780	}
781
782	UChar32
783	UnicodeString::getChar32At(int32_t offset) const {
784	return char32At(offset);
785	}
786
787	UChar32
788	UnicodeString::char32At(int32_t offset) const
789	{
790	int32_t len = length();
791	if((uint32_t)offset < (uint32_t)len) {
792	const UChar *array = getArrayStart();
793	UChar32 c;
794	U16_GET(array, `0`, offset, len, c);
795	return c;
796	} else {
797	return kInvalidUChar;
798	}
799	}
800
801	int32_t
802	UnicodeString::getChar32Start(int32_t offset) const {
803	if((uint32_t)offset < (uint32_t)length()) {
804	const UChar *array = getArrayStart();
805	U16_SET_CP_START(array, `0`, offset);
806	return offset;
807	} else {
808	return `0`;
809	}
810	}
811
812	int32_t
813	UnicodeString::getChar32Limit(int32_t offset) const {
814	int32_t len = length();
815	if((uint32_t)offset < (uint32_t)len) {
816	const UChar *array = getArrayStart();
817	U16_SET_CP_LIMIT(array, `0`, offset, len);
818	return offset;
819	} else {
820	return len;
821	}
822	}
823
824	int32_t
825	UnicodeString::countChar32(int32_t start, int32_t length) const {
826	pinIndices(start, length);
827	// if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
828	return u_countChar32(getArrayStart()+start, length);
829	}
830
831	UBool
832	UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
833	pinIndices(start, length);
834	// if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
835	return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
836	}
837
838	int32_t
839	UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
840	// pin index
841	int32_t len = length();
842	if(index<`0`) {
843	index=`0`;
844	} else if(index>len) {
845	index=len;
846	}
847
848	const UChar *array = getArrayStart();
849	if(delta>`0`) {
850	U16_FWD_N(array, index, len, delta);
851	} else {
852	U16_BACK_N(array, `0`, index, -delta);
853	}
854
855	return index;
856	}
857
858	void
859	UnicodeString::doExtract(int32_t start,
860	int32_t length,
861	UChar *dst,
862	int32_t dstStart) const
863	{
864	// pin indices to legal values
865	pinIndices(start, length);
866
867	// do not copy anything if we alias dst itself
868	const UChar *array = getArrayStart();
869	if(array + start != dst + dstStart) {
870	us_arrayCopy(array, start, dst, dstStart, length);
871	}
872	}
873
874	int32_t
875	UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
876	UErrorCode &errorCode) const {
877	int32_t len = length();
878	if(U_SUCCESS(errorCode)) {
879	if(isBogus() \|\| destCapacity<`0` \|\| (destCapacity>`0` && dest==`0`)) {
880	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
881	} else {
882	const UChar *array = getArrayStart();
883	if(len>`0` && len<=destCapacity && array!=dest) {
884	u_memcpy(dest, array, len);
885	}
886	return u_terminateUChars(dest, destCapacity, len, &errorCode);
887	}
888	}
889
890	return len;
891	}
892
893	int32_t
894	UnicodeString::extract(int32_t start,
895	int32_t length,
896	char *target,
897	int32_t targetCapacity,
898	enum EInvariant) const
899	{
900	// if the arguments are illegal, then do nothing
901	if(targetCapacity < `0` \|\| (targetCapacity > `0` && target == NULL)) {
902	return `0`;
903	}
904
905	// pin the indices to legal values
906	pinIndices(start, length);
907
908	if(length <= targetCapacity) {
909	u_UCharsToChars(getArrayStart() + start, target, length);
910	}
911	UErrorCode status = U_ZERO_ERROR;
912	return u_terminateChars(target, targetCapacity, length, &status);
913	}
914
915	UnicodeString
916	UnicodeString::tempSubString(int32_t start, int32_t len) const {
917	pinIndices(start, len);
918	const UChar array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer*
919	if(array==NULL) {
920	array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
921	len=-`2`; // bogus result string
922	}
923	return UnicodeString (FALSE, array + start, len);
924	}
925
926	int32_t
927	UnicodeString::toUTF8(int32_t start, int32_t len,
928	char target, int32_t capacity) const* {
929	pinIndices(start, len);
930	int32_t length8;
931	UErrorCode errorCode = U_ZERO_ERROR;
932	u_strToUTF8WithSub(target, capacity, &length8,
933	getBuffer() + start, len,
934	`0xFFFD`, // Standard substitution character.
935	NULL, // Don't care about number of substitutions.
936	&errorCode);
937	return length8;
938	}
939
940	#if U_CHARSET_IS_UTF8
941
942	int32_t
943	UnicodeString::extract(int32_t start, int32_t len,
944	char target, uint32_t dstSize) const* {
945	// if the arguments are illegal, then do nothing
946	if(/dstSize < 0 \|\| /(dstSize > `0` && target == `0`)) {
947	return `0`;
948	}
949	return toUTF8(start, len, target, dstSize <= `0x7fffffff` ? (int32_t)dstSize : `0x7fffffff`);
950	}
951
952	// else see unistr_cnv.cpp
953	#endif
954
955	void
956	UnicodeString::extractBetween(int32_t start,
957	int32_t limit,
958	UnicodeString& target) const {
959	pinIndex(start);
960	pinIndex(limit);
961	doExtract(start, limit - start, target);
962	}
963
964	// When converting from UTF-16 to UTF-8, the result will have at most 3 times
965	// as many bytes as the source has UChars.
966	// The "worst cases" are writing systems like Indic, Thai and CJK with
967	// 3:1 bytes:UChars.
968	void
969	UnicodeString::toUTF8(ByteSink &sink) const {
970	int32_t length16 = length();
971	if(length16 != `0`) {
972	char stackBuffer[`1024`];
973	int32_t capacity = (int32_t)sizeof(stackBuffer);
974	UBool utf8IsOwned = FALSE;
975	char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
976	`3`*length16,
977	stackBuffer, capacity,
978	&capacity);
979	int32_t length8 = `0`;
980	UErrorCode errorCode = U_ZERO_ERROR;
981	u_strToUTF8WithSub(utf8, capacity, &length8,
982	getBuffer(), length16,
983	`0xFFFD`, // Standard substitution character.
984	NULL, // Don't care about number of substitutions.
985	&errorCode);
986	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
987	utf8 = (char *)uprv_malloc(length8);
988	if(utf8 != NULL) {
989	utf8IsOwned = TRUE;
990	errorCode = U_ZERO_ERROR;
991	u_strToUTF8WithSub(utf8, length8, &length8,
992	getBuffer(), length16,
993	`0xFFFD`, // Standard substitution character.
994	NULL, // Don't care about number of substitutions.
995	&errorCode);
996	} else {
997	errorCode = U_MEMORY_ALLOCATION_ERROR;
998	}
999	}
1000	if(U_SUCCESS(errorCode)) {
1001	sink.Append(utf8, length8);
1002	sink.Flush();
1003	}
1004	if(utf8IsOwned) {
1005	uprv_free(utf8);
1006	}
1007	}
1008	}
1009
1010	int32_t
1011	UnicodeString::toUTF32(UChar32 utf32, int32_t capacity, UErrorCode &errorCode) const* {
1012	int32_t length32=`0`;
1013	if(U_SUCCESS(errorCode)) {
1014	// getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1015	u_strToUTF32WithSub(utf32, capacity, &length32,
1016	getBuffer(), length(),
1017	`0xfffd`, // Substitution character.
1018	NULL, // Don't care about number of substitutions.
1019	&errorCode);
1020	}
1021	return length32;
1022	}
1023
1024	int32_t
1025	UnicodeString::indexOf(const UChar *srcChars,
1026	int32_t srcStart,
1027	int32_t srcLength,
1028	int32_t start,
1029	int32_t length) const
1030	{
1031	if(isBogus() \|\| srcChars == `0` \|\| srcStart < `0` \|\| srcLength == `0`) {
1032	return -`1`;
1033	}
1034
1035	// UnicodeString does not find empty substrings
1036	if(srcLength < `0` && srcChars[srcStart] == `0`) {
1037	return -`1`;
1038	}
1039
1040	// get the indices within bounds
1041	pinIndices(start, length);
1042
1043	// find the first occurrence of the substring
1044	const UChar *array = getArrayStart();
1045	const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1046	if(match == NULL) {
1047	return -`1`;
1048	} else {
1049	return (int32_t)(match - array);
1050	}
1051	}
1052
1053	int32_t
1054	UnicodeString::doIndexOf(UChar c,
1055	int32_t start,
1056	int32_t length) const
1057	{
1058	// pin indices
1059	pinIndices(start, length);
1060
1061	// find the first occurrence of c
1062	const UChar *array = getArrayStart();
1063	const UChar *match = u_memchr(array + start, c, length);
1064	if(match == NULL) {
1065	return -`1`;
1066	} else {
1067	return (int32_t)(match - array);
1068	}
1069	}
1070
1071	int32_t
1072	UnicodeString::doIndexOf(UChar32 c,
1073	int32_t start,
1074	int32_t length) const {
1075	// pin indices
1076	pinIndices(start, length);
1077
1078	// find the first occurrence of c
1079	const UChar *array = getArrayStart();
1080	const UChar *match = u_memchr32(array + start, c, length);
1081	if(match == NULL) {
1082	return -`1`;
1083	} else {
1084	return (int32_t)(match - array);
1085	}
1086	}
1087
1088	int32_t
1089	UnicodeString::lastIndexOf(const UChar *srcChars,
1090	int32_t srcStart,
1091	int32_t srcLength,
1092	int32_t start,
1093	int32_t length) const
1094	{
1095	if(isBogus() \|\| srcChars == `0` \|\| srcStart < `0` \|\| srcLength == `0`) {
1096	return -`1`;
1097	}
1098
1099	// UnicodeString does not find empty substrings
1100	if(srcLength < `0` && srcChars[srcStart] == `0`) {
1101	return -`1`;
1102	}
1103
1104	// get the indices within bounds
1105	pinIndices(start, length);
1106
1107	// find the last occurrence of the substring
1108	const UChar *array = getArrayStart();
1109	const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1110	if(match == NULL) {
1111	return -`1`;
1112	} else {
1113	return (int32_t)(match - array);
1114	}
1115	}
1116
1117	int32_t
1118	UnicodeString::doLastIndexOf(UChar c,
1119	int32_t start,
1120	int32_t length) const
1121	{
1122	if(isBogus()) {
1123	return -`1`;
1124	}
1125
1126	// pin indices
1127	pinIndices(start, length);
1128
1129	// find the last occurrence of c
1130	const UChar *array = getArrayStart();
1131	const UChar *match = u_memrchr(array + start, c, length);
1132	if(match == NULL) {
1133	return -`1`;
1134	} else {
1135	return (int32_t)(match - array);
1136	}
1137	}
1138
1139	int32_t
1140	UnicodeString::doLastIndexOf(UChar32 c,
1141	int32_t start,
1142	int32_t length) const {
1143	// pin indices
1144	pinIndices(start, length);
1145
1146	// find the last occurrence of c
1147	const UChar *array = getArrayStart();
1148	const UChar *match = u_memrchr32(array + start, c, length);
1149	if(match == NULL) {
1150	return -`1`;
1151	} else {
1152	return (int32_t)(match - array);
1153	}
1154	}
1155
1156	//========================================
1157	// Write implementation
1158	//========================================
1159
1160	UnicodeString&
1161	UnicodeString::findAndReplace(int32_t start,
1162	int32_t length,
1163	const UnicodeString& oldText,
1164	int32_t oldStart,
1165	int32_t oldLength,
1166	const UnicodeString& newText,
1167	int32_t newStart,
1168	int32_t newLength)
1169	{
1170	if(isBogus() \|\| oldText.isBogus() \|\| newText.isBogus()) {
1171	return *this;
1172	}
1173
1174	pinIndices(start, length);
1175	oldText.pinIndices(oldStart, oldLength);
1176	newText.pinIndices(newStart, newLength);
1177
1178	if(oldLength == `0`) {
1179	return *this;
1180	}
1181
1182	while(length > `0` && length >= oldLength) {
1183	int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1184	if(pos < `0`) {
1185	// no more oldText's here: done
1186	break;
1187	} else {
1188	// we found oldText, replace it by newText and go beyond it
1189	replace(pos, oldLength, newText, newStart, newLength);
1190	length -= pos + oldLength - start;
1191	start = pos + newLength;
1192	}
1193	}
1194
1195	return *this;
1196	}
1197
1198
1199	void
1200	UnicodeString::setToBogus()
1201	{
1202	releaseArray();
1203
1204	fUnion.fFields.fLengthAndFlags = kIsBogus;
1205	fUnion.fFields.fArray = `0`;
1206	fUnion.fFields.fCapacity = `0`;
1207	}
1208
1209	// turn a bogus string into an empty one
1210	void
1211	UnicodeString::unBogus() {
1212	if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1213	setToEmpty();
1214	}
1215	}
1216
1217	const char16_t *
1218	UnicodeString::getTerminatedBuffer() {
1219	if(!isWritable()) {
1220	return nullptr;
1221	}
1222	UChar *array = getArrayStart();
1223	int32_t len = length();
1224	if(len < getCapacity()) {
1225	if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1226	// If len<capacity on a read-only alias, then array[len] is
1227	// either the original NUL (if constructed with (TRUE, s, length))
1228	// or one of the original string contents characters (if later truncated),
1229	// therefore we can assume that array[len] is initialized memory.
1230	if(array[len] == `0`) {
1231	return array;
1232	}
1233	} else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == `0` \|\| refCount() == `1`)) {
1234	// kRefCounted: Do not write the NUL if the buffer is shared.
1235	// That is mostly safe, except when the length of one copy was modified
1236	// without copy-on-write, e.g., via truncate(newLength) or remove(void).
1237	// Then the NUL would be written into the middle of another copy's string.
1238
1239	// Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1240	// Do not test if there is a NUL already because it might be uninitialized memory.
1241	// (That would be safe, but tools like valgrind & Purify would complain.)
1242	array[len] = `0`;
1243	return array;
1244	}
1245	}
1246	if(len<INT32_MAX && cloneArrayIfNeeded(len+`1`)) {
1247	array = getArrayStart();
1248	array[len] = `0`;
1249	return array;
1250	} else {
1251	return nullptr;
1252	}
1253	}
1254
1255	// setTo() analogous to the readonly-aliasing constructor with the same signature
1256	UnicodeString &
1257	UnicodeString::setTo(UBool isTerminated,
1258	ConstChar16Ptr textPtr,
1259	int32_t textLength)
1260	{
1261	if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1262	// do not modify a string that has an "open" getBuffer(minCapacity)
1263	return *this;
1264	}
1265
1266	const UChar *text = textPtr;
1267	if(text == NULL) {
1268	// treat as an empty string, do not alias
1269	releaseArray();
1270	setToEmpty();
1271	return *this;
1272	}
1273
1274	if( textLength < -`1` \|\|
1275	(textLength == -`1` && !isTerminated) \|\|
1276	(textLength >= `0` && isTerminated && text[textLength] != `0`)
1277	) {
1278	setToBogus();
1279	return *this;
1280	}
1281
1282	releaseArray();
1283
1284	if(textLength == -`1`) {
1285	// text is terminated, or else it would have failed the above test
1286	textLength = u_strlen(text);
1287	}
1288	fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1289	setArray((UChar *)text, textLength, isTerminated ? textLength + `1` : textLength);
1290	return *this;
1291	}
1292
1293	// setTo() analogous to the writable-aliasing constructor with the same signature
1294	UnicodeString &
1295	UnicodeString::setTo(UChar *buffer,
1296	int32_t buffLength,
1297	int32_t buffCapacity) {
1298	if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1299	// do not modify a string that has an "open" getBuffer(minCapacity)
1300	return *this;
1301	}
1302
1303	if(buffer == NULL) {
1304	// treat as an empty string, do not alias
1305	releaseArray();
1306	setToEmpty();
1307	return *this;
1308	}
1309
1310	if(buffLength < -`1` \|\| buffCapacity < `0` \|\| buffLength > buffCapacity) {
1311	setToBogus();
1312	return *this;
1313	} else if(buffLength == -`1`) {
1314	// buffLength = u_strlen(buff); but do not look beyond buffCapacity
1315	const UChar p = buffer, limit = buffer + buffCapacity;
1316	while(p != limit && *p != `0`) {
1317	++p;
1318	}
1319	buffLength = (int32_t)(p - buffer);
1320	}
1321
1322	releaseArray();
1323
1324	fUnion.fFields.fLengthAndFlags = kWritableAlias;
1325	setArray(buffer, buffLength, buffCapacity);
1326	return *this;
1327	}
1328
1329	UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1330	unBogus();
1331	int32_t length = utf8.length();
1332	int32_t capacity;
1333	// The UTF-16 string will be at most as long as the UTF-8 string.
1334	if(length <= US_STACKBUF_SIZE) {
1335	capacity = US_STACKBUF_SIZE;
1336	} else {
1337	capacity = length + `1`; // +1 for the terminating NUL.
1338	}
1339	UChar *utf16 = getBuffer(capacity);
1340	int32_t length16;
1341	UErrorCode errorCode = U_ZERO_ERROR;
1342	u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1343	utf8.data(), length,
1344	`0xfffd`, // Substitution character.
1345	NULL, // Don't care about number of substitutions.
1346	&errorCode);
1347	releaseBuffer(length16);
1348	if(U_FAILURE(errorCode)) {
1349	setToBogus();
1350	}
1351	return *this;
1352	}
1353
1354	UnicodeString&
1355	UnicodeString::setCharAt(int32_t offset,
1356	UChar c)
1357	{
1358	int32_t len = length();
1359	if(cloneArrayIfNeeded() && len > `0`) {
1360	if(offset < `0`) {
1361	offset = `0`;
1362	} else if(offset >= len) {
1363	offset = len - `1`;
1364	}
1365
1366	getArrayStart()[offset] = c;
1367	}
1368	return *this;
1369	}
1370
1371	UnicodeString&
1372	UnicodeString::replace(int32_t start,
1373	int32_t _length,
1374	UChar32 srcChar) {
1375	UChar buffer[U16_MAX_LENGTH];
1376	int32_t count = `0`;
1377	UBool isError = FALSE;
1378	U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1379	// We test isError so that the compiler does not complain that we don't.
1380	// If isError (srcChar is not a valid code point) then count==0 which means
1381	// we remove the source segment rather than replacing it with srcChar.
1382	return doReplace(start, _length, buffer, `0`, isError ? `0` : count);
1383	}
1384
1385	UnicodeString&
1386	UnicodeString::append(UChar32 srcChar) {
1387	UChar buffer[U16_MAX_LENGTH];
1388	int32_t _length = `0`;
1389	UBool isError = FALSE;
1390	U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1391	// We test isError so that the compiler does not complain that we don't.
1392	// If isError then _length==0 which turns the doAppend() into a no-op anyway.
1393	return isError ? *this : doAppend(buffer, `0`, _length);
1394	}
1395
1396	UnicodeString&
1397	UnicodeString::doReplace( int32_t start,
1398	int32_t length,
1399	const UnicodeString& src,
1400	int32_t srcStart,
1401	int32_t srcLength)
1402	{
1403	// pin the indices to legal values
1404	src.pinIndices(srcStart, srcLength);
1405
1406	// get the characters from src
1407	// and replace the range in ourselves with them
1408	return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1409	}
1410
1411	UnicodeString&
1412	UnicodeString::doReplace(int32_t start,
1413	int32_t length,
1414	const UChar *srcChars,
1415	int32_t srcStart,
1416	int32_t srcLength)
1417	{
1418	if(!isWritable()) {
1419	return *this;
1420	}
1421
1422	int32_t oldLength = this->length();
1423
1424	// optimize (read-only alias).remove(0, start) and .remove(start, end)
1425	if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == `0`) {
1426	if(start == `0`) {
1427	// remove prefix by adjusting the array pointer
1428	pinIndex(length);
1429	fUnion.fFields.fArray += length;
1430	fUnion.fFields.fCapacity -= length;
1431	setLength(oldLength - length);
1432	return *this;
1433	} else {
1434	pinIndex(start);
1435	if(length >= (oldLength - start)) {
1436	// remove suffix by reducing the length (like truncate())
1437	setLength(start);
1438	fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1439	return *this;
1440	}
1441	}
1442	}
1443
1444	if(start == oldLength) {
1445	return doAppend(srcChars, srcStart, srcLength);
1446	}
1447
1448	if(srcChars == `0`) {
1449	srcLength = `0`;
1450	} else {
1451	// Perform all remaining operations relative to srcChars + srcStart.
1452	// From this point forward, do not use srcStart.
1453	srcChars += srcStart;
1454	if (srcLength < `0`) {
1455	// get the srcLength if necessary
1456	srcLength = u_strlen(srcChars);
1457	}
1458	}
1459
1460	// pin the indices to legal values
1461	pinIndices(start, length);
1462
1463	// Calculate the size of the string after the replace.
1464	// Avoid int32_t overflow.
1465	int32_t newLength = oldLength - length;
1466	if(srcLength > (INT32_MAX - newLength)) {
1467	setToBogus();
1468	return *this;
1469	}
1470	newLength += srcLength;
1471
1472	// Check for insertion into ourself
1473	const UChar *oldArray = getArrayStart();
1474	if (isBufferWritable() &&
1475	oldArray < srcChars + srcLength &&
1476	srcChars < oldArray + oldLength) {
1477	// Copy into a new UnicodeString and start over
1478	UnicodeString copy(srcChars, srcLength);
1479	if (copy.isBogus()) {
1480	setToBogus();
1481	return *this;
1482	}
1483	return doReplace(start, length, copy.getArrayStart(), `0`, srcLength);
1484	}
1485
1486	// cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1487	// therefore we need to keep the current fArray
1488	UChar oldStackBuffer[US_STACKBUF_SIZE];
1489	if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1490	// copy the stack buffer contents because it will be overwritten with
1491	// fUnion.fFields values
1492	u_memcpy(oldStackBuffer, oldArray, oldLength);
1493	oldArray = oldStackBuffer;
1494	}
1495
1496	// clone our array and allocate a bigger array if needed
1497	int32_t *bufferToDelete = `0`;
1498	if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
1499	FALSE, &bufferToDelete)
1500	) {
1501	return *this;
1502	}
1503
1504	// now do the replace
1505
1506	UChar *newArray = getArrayStart();
1507	if(newArray != oldArray) {
1508	// if fArray changed, then we need to copy everything except what will change
1509	us_arrayCopy(oldArray, `0`, newArray, `0`, start);
1510	us_arrayCopy(oldArray, start + length,
1511	newArray, start + srcLength,
1512	oldLength - (start + length));
1513	} else if(length != srcLength) {
1514	// fArray did not change; copy only the portion that isn't changing, leaving a hole
1515	us_arrayCopy(oldArray, start + length,
1516	newArray, start + srcLength,
1517	oldLength - (start + length));
1518	}
1519
1520	// now fill in the hole with the new string
1521	us_arrayCopy(srcChars, `0`, newArray, start, srcLength);
1522
1523	setLength(newLength);
1524
1525	// delayed delete in case srcChars == fArray when we started, and
1526	// to keep oldArray alive for the above operations
1527	if (bufferToDelete) {
1528	uprv_free(bufferToDelete);
1529	}
1530
1531	return *this;
1532	}
1533
1534	// Versions of doReplace() only for append() variants.
1535	// doReplace() and doAppend() optimize for different cases.
1536
1537	UnicodeString&
1538	UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1539	if(srcLength == `0`) {
1540	return *this;
1541	}
1542
1543	// pin the indices to legal values
1544	src.pinIndices(srcStart, srcLength);
1545	return doAppend(src.getArrayStart(), srcStart, srcLength);
1546	}
1547
1548	UnicodeString&
1549	UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1550	if(!isWritable() \|\| srcLength == `0` \|\| srcChars == NULL) {
1551	return *this;
1552	}
1553
1554	// Perform all remaining operations relative to srcChars + srcStart.
1555	// From this point forward, do not use srcStart.
1556	srcChars += srcStart;
1557
1558	if(srcLength < `0`) {
1559	// get the srcLength if necessary
1560	if((srcLength = u_strlen(srcChars)) == `0`) {
1561	return *this;
1562	}
1563	}
1564
1565	int32_t oldLength = length();
1566	int32_t newLength = oldLength + srcLength;
1567
1568	// Check for append onto ourself
1569	const UChar* oldArray = getArrayStart();
1570	if (isBufferWritable() &&
1571	oldArray < srcChars + srcLength &&
1572	srcChars < oldArray + oldLength) {
1573	// Copy into a new UnicodeString and start over
1574	UnicodeString copy(srcChars, srcLength);
1575	if (copy.isBogus()) {
1576	setToBogus();
1577	return *this;
1578	}
1579	return doAppend(copy.getArrayStart(), `0`, srcLength);
1580	}
1581
1582	// optimize append() onto a large-enough, owned string
1583	if((newLength <= getCapacity() && isBufferWritable()) \|\|
1584	cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1585	UChar *newArray = getArrayStart();
1586	// Do not copy characters when
1587	// UChar buffer=str.getAppendBuffer(...);*
1588	// is followed by
1589	// str.append(buffer, length);
1590	// or
1591	// str.appendString(buffer, length)
1592	// or similar.
1593	if(srcChars != newArray + oldLength) {
1594	us_arrayCopy(srcChars, `0`, newArray, oldLength, srcLength);
1595	}
1596	setLength(newLength);
1597	}
1598	return *this;
1599	}
1600
1601	/**
1602	* Replaceable API
1603	*/
1604	void
1605	UnicodeString::handleReplaceBetween(int32_t start,
1606	int32_t limit,
1607	const UnicodeString& text) {
1608	replaceBetween(start, limit, text);
1609	}
1610
1611	/**
1612	* Replaceable API
1613	*/
1614	void
1615	UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1616	if (limit <= start) {
1617	return; // Nothing to do; avoid bogus malloc call
1618	}
1619	UChar* text = (UChar) uprv_malloc( sizeof(UChar) (limit - start) );
1620	// Check to make sure text is not null.
1621	if (text != NULL) {
1622	extractBetween(start, limit, text, `0`);
1623	insert(dest, text, `0`, limit - start);
1624	uprv_free(text);
1625	}
1626	}
1627
1628	/**
1629	* Replaceable API
1630	*
1631	* NOTE: This is for the Replaceable class. There is no rep.cpp,
1632	* so we implement this function here.
1633	*/
1634	UBool Replaceable::hasMetaData() const {
1635	return TRUE;
1636	}
1637
1638	/**
1639	* Replaceable API
1640	*/
1641	UBool UnicodeString::hasMetaData() const {
1642	return FALSE;
1643	}
1644
1645	UnicodeString&
1646	UnicodeString::doReverse(int32_t start, int32_t length) {
1647	if(length <= `1` \|\| !cloneArrayIfNeeded()) {
1648	return *this;
1649	}
1650
1651	// pin the indices to legal values
1652	pinIndices(start, length);
1653	if(length <= `1`) { // pinIndices() might have shrunk the length
1654	return *this;
1655	}
1656
1657	UChar *left = getArrayStart() + start;
1658	UChar right = left + length - `1`; // -1 for inclusive boundary (length>=2)*
1659	UChar swap;
1660	UBool hasSupplementary = FALSE;
1661
1662	// Before the loop we know left<right because length>=2.
1663	do {
1664	hasSupplementary \|= (UBool)U16_IS_LEAD(swap = *left);
1665	hasSupplementary \|= (UBool)U16_IS_LEAD(left++ = right);
1666	*right-- = swap;
1667	} while(left < right);
1668	// Make sure to test the middle code unit of an odd-length string.
1669	// Redundant if the length is even.
1670	hasSupplementary \|= (UBool)U16_IS_LEAD(*left);
1671
1672	/ if there are supplementary code points in the reversed range, then re-swap their surrogates /
1673	if(hasSupplementary) {
1674	UChar swap2;
1675
1676	left = getArrayStart() + start;
1677	right = left + length - `1`; // -1 so that we can look at (left+1) if left<right*
1678	while(left < right) {
1679	if(U16_IS_TRAIL(swap = left) && U16_IS_LEAD(swap2 = (left + `1`))) {
1680	*left++ = swap2;
1681	*left++ = swap;
1682	} else {
1683	++left;
1684	}
1685	}
1686	}
1687
1688	return *this;
1689	}
1690
1691	UBool
1692	UnicodeString::padLeading(int32_t targetLength,
1693	UChar padChar)
1694	{
1695	int32_t oldLength = length();
1696	if(oldLength >= targetLength \|\| !cloneArrayIfNeeded(targetLength)) {
1697	return FALSE;
1698	} else {
1699	// move contents up by padding width
1700	UChar *array = getArrayStart();
1701	int32_t start = targetLength - oldLength;
1702	us_arrayCopy(array, `0`, array, start, oldLength);
1703
1704	// fill in padding character
1705	while(--start >= `0`) {
1706	array[start] = padChar;
1707	}
1708	setLength(targetLength);
1709	return TRUE;
1710	}
1711	}
1712
1713	UBool
1714	UnicodeString::padTrailing(int32_t targetLength,
1715	UChar padChar)
1716	{
1717	int32_t oldLength = length();
1718	if(oldLength >= targetLength \|\| !cloneArrayIfNeeded(targetLength)) {
1719	return FALSE;
1720	} else {
1721	// fill in padding character
1722	UChar *array = getArrayStart();
1723	int32_t length = targetLength;
1724	while(--length >= oldLength) {
1725	array[length] = padChar;
1726	}
1727	setLength(targetLength);
1728	return TRUE;
1729	}
1730	}
1731
1732	//========================================
1733	// Hashing
1734	//========================================
1735	int32_t
1736	UnicodeString::doHashCode() const
1737	{
1738	/ Delegate hash computation to uhash. This makes UnicodeString*
1739	* hashing consistent with UChar* hashing. */
1740	int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1741	if (hashCode == kInvalidHashCode) {
1742	hashCode = kEmptyHashCode;
1743	}
1744	return hashCode;
1745	}
1746
1747	//========================================
1748	// External Buffer
1749	//========================================
1750
1751	char16_t *
1752	UnicodeString::getBuffer(int32_t minCapacity) {
1753	if(minCapacity>=-`1` && cloneArrayIfNeeded(minCapacity)) {
1754	fUnion.fFields.fLengthAndFlags\|=kOpenGetBuffer;
1755	setZeroLength();
1756	return getArrayStart();
1757	} else {
1758	return nullptr;
1759	}
1760	}
1761
1762	void
1763	UnicodeString::releaseBuffer(int32_t newLength) {
1764	if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-`1`) {
1765	// set the new fLength
1766	int32_t capacity=getCapacity();
1767	if(newLength==-`1`) {
1768	// the new length is the string length, capped by fCapacity
1769	const UChar array=getArrayStart(), p=array, *limit=array+capacity;
1770	while(p<limit && *p!=`0`) {
1771	++p;
1772	}
1773	newLength=(int32_t)(p-array);
1774	} else if(newLength>capacity) {
1775	newLength=capacity;
1776	}
1777	setLength(newLength);
1778	fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1779	}
1780	}
1781
1782	//========================================
1783	// Miscellaneous
1784	//========================================
1785	UBool
1786	UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1787	int32_t growCapacity,
1788	UBool doCopyArray,
1789	int32_t **pBufferToDelete,
1790	UBool forceClone) {
1791	// default parameters need to be static, therefore
1792	// the defaults are -1 to have convenience defaults
1793	if(newCapacity == -`1`) {
1794	newCapacity = getCapacity();
1795	}
1796
1797	// while a getBuffer(minCapacity) is "open",
1798	// prevent any modifications of the string by returning FALSE here
1799	// if the string is bogus, then only an assignment or similar can revive it
1800	if(!isWritable()) {
1801	return FALSE;
1802	}
1803
1804	/*
1805	* We need to make a copy of the array if
1806	* the buffer is read-only, or
1807	* the buffer is refCounted (shared), and refCount>1, or
1808	* the buffer is too small.
1809	* Return FALSE if memory could not be allocated.
1810	*/
1811	if(forceClone \|\|
1812	fUnion.fFields.fLengthAndFlags & kBufferIsReadonly \|\|
1813	(fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > `1`) \|\|
1814	newCapacity > getCapacity()
1815	) {
1816	// check growCapacity for default value and use of the stack buffer
1817	if(growCapacity < `0`) {
1818	growCapacity = newCapacity;
1819	} else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1820	growCapacity = US_STACKBUF_SIZE;
1821	}
1822
1823	// save old values
1824	UChar oldStackBuffer[US_STACKBUF_SIZE];
1825	UChar *oldArray;
1826	int32_t oldLength = length();
1827	int16_t flags = fUnion.fFields.fLengthAndFlags;
1828
1829	if(flags&kUsingStackBuffer) {
1830	U_ASSERT(!(flags&kRefCounted)); / kRefCounted and kUsingStackBuffer are mutally exclusive /
1831	if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1832	// copy the stack buffer contents because it will be overwritten with
1833	// fUnion.fFields values
1834	us_arrayCopy(fUnion.fStackFields.fBuffer, `0`, oldStackBuffer, `0`, oldLength);
1835	oldArray = oldStackBuffer;
1836	} else {
1837	oldArray = NULL; // no need to copy from the stack buffer to itself
1838	}
1839	} else {
1840	oldArray = fUnion.fFields.fArray;
1841	U_ASSERT(oldArray!=NULL); / when stack buffer is not used, oldArray must have a non-NULL reference /
1842	}
1843
1844	// allocate a new array
1845	if(allocate(growCapacity) \|\|
1846	(newCapacity < growCapacity && allocate(newCapacity))
1847	) {
1848	if(doCopyArray) {
1849	// copy the contents
1850	// do not copy more than what fits - it may be smaller than before
1851	int32_t minLength = oldLength;
1852	newCapacity = getCapacity();
1853	if(newCapacity < minLength) {
1854	minLength = newCapacity;
1855	}
1856	if(oldArray != NULL) {
1857	us_arrayCopy(oldArray, `0`, getArrayStart(), `0`, minLength);
1858	}
1859	setLength(minLength);
1860	} else {
1861	setZeroLength();
1862	}
1863
1864	// release the old array
1865	if(flags & kRefCounted) {
1866	// the array is refCounted; decrement and release if 0
1867	u_atomic_int32_t pRefCount = ((u_atomic_int32_t )oldArray - `1`);
1868	if(umtx_atomic_dec(pRefCount) == `0`) {
1869	if(pBufferToDelete == `0`) {
1870	// Note: cast to (void ) is needed with MSVC, where u_atomic_int32_t*
1871	// is defined as volatile. (Volatile has useful non-standard behavior
1872	// with this compiler.)
1873	uprv_free((void *)pRefCount);
1874	} else {
1875	// the caller requested to delete it himself
1876	pBufferToDelete = (int32_t )pRefCount;
1877	}
1878	}
1879	}
1880	} else {
1881	// not enough memory for growCapacity and not even for the smaller newCapacity
1882	// reset the old values for setToBogus() to release the array
1883	if(!(flags&kUsingStackBuffer)) {
1884	fUnion.fFields.fArray = oldArray;
1885	}
1886	fUnion.fFields.fLengthAndFlags = flags;
1887	setToBogus();
1888	return FALSE;
1889	}
1890	}
1891	return TRUE;
1892	}
1893
1894	// UnicodeStringAppendable ------------------------------------------------- ***
1895
1896	UnicodeStringAppendable::~UnicodeStringAppendable() {}
1897
1898	UBool
1899	UnicodeStringAppendable::appendCodeUnit(UChar c) {
1900	return str.doAppend(&c, `0`, `1`).isWritable();
1901	}
1902
1903	UBool
1904	UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1905	UChar buffer[U16_MAX_LENGTH];
1906	int32_t cLength = `0`;
1907	UBool isError = FALSE;
1908	U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1909	return !isError && str.doAppend(buffer, `0`, cLength).isWritable();
1910	}
1911
1912	UBool
1913	UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1914	return str.doAppend(s, `0`, length).isWritable();
1915	}
1916
1917	UBool
1918	UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1919	return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1920	}
1921
1922	UChar *
1923	UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1924	int32_t desiredCapacityHint,
1925	UChar *scratch, int32_t scratchCapacity,
1926	int32_t *resultCapacity) {
1927	if(minCapacity < `1` \|\| scratchCapacity < minCapacity) {
1928	*resultCapacity = `0`;
1929	return NULL;
1930	}
1931	int32_t oldLength = str.length();
1932	if(minCapacity <= (kMaxCapacity - oldLength) &&
1933	desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1934	str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1935	*resultCapacity = str.getCapacity() - oldLength;
1936	return str.getArrayStart() + oldLength;
1937	}
1938	*resultCapacity = scratchCapacity;
1939	return scratch;
1940	}
1941
1942	U_NAMESPACE_END
1943
1944	U_NAMESPACE_USE
1945
1946	U_CAPI int32_t U_EXPORT2
1947	uhash_hashUnicodeString(const UElement key) {
1948	const UnicodeString str = (const* UnicodeString*) key.pointer;
1949	return (str == NULL) ? `0` : str->hashCode();
1950	}
1951
1952	// Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1953	// does not depend on hashtable code.
1954	U_CAPI UBool U_EXPORT2
1955	uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1956	const UnicodeString str1 = (const* UnicodeString*) key1.pointer;
1957	const UnicodeString str2 = (const* UnicodeString*) key2.pointer;
1958	if (str1 == str2) {
1959	return TRUE;
1960	}
1961	if (str1 == NULL \|\| str2 == NULL) {
1962	return FALSE;
1963	}
1964	return str1 == str2;
1965	}
1966
1967	#ifdef U_STATIC_IMPLEMENTATION
1968	/*
1969	This should never be called. It is defined here to make sure that the
1970	virtual vector deleting destructor is defined within unistr.cpp.
1971	The vector deleting destructor is already a part of UObject,
1972	but defining it here makes sure that it is included with this object file.
1973	This makes sure that static library dependencies are kept to a minimum.
1974	*/
1975	static void uprv_UnicodeStringDummy(void) {
1976	delete [] (new UnicodeString[`2`]);
1977	}
1978	#endif
1979

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/unistr.cpp