1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5* Copyright (C) 1999-2016, International Business Machines Corporation and
6* others. All Rights Reserved.
7******************************************************************************
8*
9* File unistr.cpp
10*
11* Modification History:
12*
13* Date Name Description
14* 09/25/98 stephen Creation.
15* 04/20/99 stephen Overhauled per 4/16 code review.
16* 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17* 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18* Replaceable.
19* 06/25/01 grhoten Removed the dependency on iostream
20******************************************************************************
21*/
22
23#include "unicode/utypes.h"
24#include "unicode/appendable.h"
25#include "unicode/putil.h"
26#include "cstring.h"
27#include "cmemory.h"
28#include "unicode/ustring.h"
29#include "unicode/unistr.h"
30#include "unicode/utf.h"
31#include "unicode/utf16.h"
32#include "uelement.h"
33#include "ustr_imp.h"
34#include "umutex.h"
35#include "uassert.h"
36
37#if 0
38
39#include <iostream>
40using namespace std;
41
42//DEBUGGING
43void
44print(const UnicodeString& s,
45 const char *name)
46{
47 char16_t c;
48 cout << name << ":|";
49 for(int i = 0; i < s.length(); ++i) {
50 c = s[i];
51 if(c>= 0x007E || c < 0x0020)
52 cout << "[0x" << hex << s[i] << "]";
53 else
54 cout << (char) s[i];
55 }
56 cout << '|' << endl;
57}
58
59void
60print(const char16_t *s,
61 int32_t len,
62 const char *name)
63{
64 char16_t c;
65 cout << name << ":|";
66 for(int i = 0; i < len; ++i) {
67 c = s[i];
68 if(c>= 0x007E || c < 0x0020)
69 cout << "[0x" << hex << s[i] << "]";
70 else
71 cout << (char) s[i];
72 }
73 cout << '|' << endl;
74}
75// END DEBUGGING
76#endif
77
78// Local function definitions for now
79
80// need to copy areas that may overlap
81static
82inline void
83us_arrayCopy(const char16_t *src, int32_t srcStart,
84 char16_t *dst, int32_t dstStart, int32_t count)
85{
86 if(count>0) {
87 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88 }
89}
90
91// u_unescapeAt() callback to get a char16_t from a UnicodeString
92U_CDECL_BEGIN
93static char16_t U_CALLCONV
94UnicodeString_charAt(int32_t offset, void *context) {
95 return ((icu::UnicodeString*) context)->charAt(offset);
96}
97U_CDECL_END
98
99U_NAMESPACE_BEGIN
100
101/* The Replaceable virtual destructor can't be defined in the header
102 due to how AIX works with multiple definitions of virtual functions.
103*/
104Replaceable::~Replaceable() {}
105
// Expands ICU's RTTI boilerplate for UnicodeString. The macro is defined
// outside this file; presumably it supplies the class-ID functions -- confirm
// against its definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107
108UnicodeString U_EXPORT2
109operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110 return
111 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
112 append(s1).
113 append(s2);
114}
115
116//========================================
117// Reference Counting functions, put at top of file so that optimizing compilers
118// have a chance to automatically inline.
119//========================================
120
121void
122UnicodeString::addRef() {
123 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124}
125
126int32_t
127UnicodeString::removeRef() {
128 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
129}
130
131int32_t
132UnicodeString::refCount() const {
133 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
134}
135
136void
137UnicodeString::releaseArray() {
138 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140 }
141}
142
143
144
145//========================================
146// Constructors
147//========================================
148
149// The default constructor is inline in unistr.h.
150
151UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152 fUnion.fFields.fLengthAndFlags = 0;
153 if(count <= 0 || (uint32_t)c > 0x10ffff) {
154 // just allocate and do not do anything else
155 allocate(capacity);
156 } else if(c <= 0xffff) {
157 int32_t length = count;
158 if(capacity < length) {
159 capacity = length;
160 }
161 if(allocate(capacity)) {
162 char16_t *array = getArrayStart();
163 char16_t unit = (char16_t)c;
164 for(int32_t i = 0; i < length; ++i) {
165 array[i] = unit;
166 }
167 setLength(length);
168 }
169 } else { // supplementary code point, write surrogate pairs
170 if(count > (INT32_MAX / 2)) {
171 // We would get more than 2G UChars.
172 allocate(capacity);
173 return;
174 }
175 int32_t length = count * 2;
176 if(capacity < length) {
177 capacity = length;
178 }
179 if(allocate(capacity)) {
180 char16_t *array = getArrayStart();
181 char16_t lead = U16_LEAD(c);
182 char16_t trail = U16_TRAIL(c);
183 for(int32_t i = 0; i < length; i += 2) {
184 array[i] = lead;
185 array[i + 1] = trail;
186 }
187 setLength(length);
188 }
189 }
190}
191
192UnicodeString::UnicodeString(char16_t ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195}
196
197UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = false;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207}
208
209UnicodeString::UnicodeString(const char16_t *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212}
213
214UnicodeString::UnicodeString(const char16_t *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218}
219
220UnicodeString::UnicodeString(UBool isTerminated,
221 ConstChar16Ptr textPtr,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 const char16_t *text = textPtr;
225 if(text == nullptr) {
226 // treat as an empty string, do not alias
227 setToEmpty();
228 } else if(textLength < -1 ||
229 (textLength == -1 && !isTerminated) ||
230 (textLength >= 0 && isTerminated && text[textLength] != 0)
231 ) {
232 setToBogus();
233 } else {
234 if(textLength == -1) {
235 // text is terminated, or else it would have failed the above test
236 textLength = u_strlen(text);
237 }
238 setArray(const_cast<char16_t *>(text), textLength,
239 isTerminated ? textLength + 1 : textLength);
240 }
241}
242
243UnicodeString::UnicodeString(char16_t *buff,
244 int32_t buffLength,
245 int32_t buffCapacity) {
246 fUnion.fFields.fLengthAndFlags = kWritableAlias;
247 if(buff == nullptr) {
248 // treat as an empty string, do not alias
249 setToEmpty();
250 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
251 setToBogus();
252 } else {
253 if(buffLength == -1) {
254 // fLength = u_strlen(buff); but do not look beyond buffCapacity
255 const char16_t *p = buff, *limit = buff + buffCapacity;
256 while(p != limit && *p != 0) {
257 ++p;
258 }
259 buffLength = (int32_t)(p - buff);
260 }
261 setArray(buff, buffLength, buffCapacity);
262 }
263}
264
265UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266 fUnion.fFields.fLengthAndFlags = kShortString;
267 if(src==nullptr) {
268 // treat as an empty string
269 } else {
270 if(length<0) {
271 length=(int32_t)uprv_strlen(src);
272 }
273 if(cloneArrayIfNeeded(length, length, false)) {
274 u_charsToUChars(src, getArrayStart(), length);
275 setLength(length);
276 } else {
277 setToBogus();
278 }
279 }
280}
281
282#if U_CHARSET_IS_UTF8
283
284UnicodeString::UnicodeString(const char *codepageData) {
285 fUnion.fFields.fLengthAndFlags = kShortString;
286 if(codepageData != 0) {
287 setToUTF8(codepageData);
288 }
289}
290
291UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292 fUnion.fFields.fLengthAndFlags = kShortString;
293 // if there's nothing to convert, do nothing
294 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
295 return;
296 }
297 if(dataLength == -1) {
298 dataLength = (int32_t)uprv_strlen(codepageData);
299 }
300 setToUTF8(StringPiece(codepageData, dataLength));
301}
302
303// else see unistr_cnv.cpp
304#endif
305
306UnicodeString::UnicodeString(const UnicodeString& that) {
307 fUnion.fFields.fLengthAndFlags = kShortString;
308 copyFrom(that);
309}
310
311UnicodeString::UnicodeString(UnicodeString &&src) noexcept {
312 copyFieldsFrom(src, true);
313}
314
315UnicodeString::UnicodeString(const UnicodeString& that,
316 int32_t srcStart) {
317 fUnion.fFields.fLengthAndFlags = kShortString;
318 setTo(that, srcStart);
319}
320
321UnicodeString::UnicodeString(const UnicodeString& that,
322 int32_t srcStart,
323 int32_t srcLength) {
324 fUnion.fFields.fLengthAndFlags = kShortString;
325 setTo(that, srcStart, srcLength);
326}
327
328// Replaceable base class clone() default implementation, does not clone
329Replaceable *
330Replaceable::clone() const {
331 return nullptr;
332}
333
334// UnicodeString overrides clone() with a real implementation
335UnicodeString *
336UnicodeString::clone() const {
337 LocalPointer<UnicodeString> clonedString(new UnicodeString(*this));
338 return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr;
339}
340
341//========================================
342// array allocation
343//========================================
344
345namespace {
346
347const int32_t kGrowSize = 128;
348
349// The number of bytes for one int32_t reference counter and capacity UChars
350// must fit into a 32-bit size_t (at least when on a 32-bit platform).
351// We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(),
352// and round up to a multiple of 16 bytes.
353// This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
354// (With more complicated checks we could go up to 0x7ffffffd without rounding up,
355// but that does not seem worth it.)
356const int32_t kMaxCapacity = 0x7ffffff5;
357
358int32_t getGrowCapacity(int32_t newLength) {
359 int32_t growSize = (newLength >> 2) + kGrowSize;
360 if(growSize <= (kMaxCapacity - newLength)) {
361 return newLength + growSize;
362 } else {
363 return kMaxCapacity;
364 }
365}
366
367} // namespace
368
369UBool
370UnicodeString::allocate(int32_t capacity) {
371 if(capacity <= US_STACKBUF_SIZE) {
372 fUnion.fFields.fLengthAndFlags = kShortString;
373 return true;
374 }
375 if(capacity <= kMaxCapacity) {
376 ++capacity; // for the NUL
377 // Switch to size_t which is unsigned so that we can allocate up to 4GB.
378 // Reference counter + UChars.
379 size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
380 // Round up to a multiple of 16.
381 numBytes = (numBytes + 15) & ~15;
382 int32_t *array = (int32_t *) uprv_malloc(numBytes);
383 if(array != nullptr) {
384 // set initial refCount and point behind the refCount
385 *array++ = 1;
386 numBytes -= sizeof(int32_t);
387
388 // have fArray point to the first char16_t
389 fUnion.fFields.fArray = (char16_t *)array;
390 fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
391 fUnion.fFields.fLengthAndFlags = kLongString;
392 return true;
393 }
394 }
395 fUnion.fFields.fLengthAndFlags = kIsBogus;
396 fUnion.fFields.fArray = 0;
397 fUnion.fFields.fCapacity = 0;
398 return false;
399}
400
401//========================================
402// Destructor
403//========================================
404
#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
// Histogram of final string lengths, filled in by ~UnicodeString().
static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
// Number of destroyed strings that had no short length.
static u_atomic_int32_t beyondCount(0);

// Prints one line per length 0..59 and a single ">59" line that sums all
// remaining counters plus beyondCount.
U_CAPI void unistr_printLengths() {
  const int32_t listedLimit = 60;
  int32_t len = 0;
  while(len < listedLimit) {
    printf("%2d, %9d\n", len, (int32_t)finalLengthCounts[len]);
    ++len;
  }
  int32_t beyond = beyondCount;
  while(len < UPRV_LENGTHOF(finalLengthCounts)) {
    beyond += finalLengthCounts[len];
    ++len;
  }
  printf(">59, %9d\n", beyond);
}
#endif
421
422UnicodeString::~UnicodeString()
423{
424#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
425 // Count lengths of strings at the end of their lifetime.
426 // Useful for discussion of a desirable stack buffer size.
427 // Count the contents length, not the optional NUL terminator nor further capacity.
428 // Ignore open-buffer strings and strings which alias external storage.
429 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
430 if(hasShortLength()) {
431 umtx_atomic_inc(finalLengthCounts + getShortLength());
432 } else {
433 umtx_atomic_inc(&beyondCount);
434 }
435 }
436#endif
437
438 releaseArray();
439}
440
441//========================================
442// Factory methods
443//========================================
444
445UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
446 UnicodeString result;
447 result.setToUTF8(utf8);
448 return result;
449}
450
451UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
452 UnicodeString result;
453 int32_t capacity;
454 // Most UTF-32 strings will be BMP-only and result in a same-length
455 // UTF-16 string. We overestimate the capacity just slightly,
456 // just in case there are a few supplementary characters.
457 if(length <= US_STACKBUF_SIZE) {
458 capacity = US_STACKBUF_SIZE;
459 } else {
460 capacity = length + (length >> 4) + 4;
461 }
462 do {
463 char16_t *utf16 = result.getBuffer(capacity);
464 int32_t length16;
465 UErrorCode errorCode = U_ZERO_ERROR;
466 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
467 utf32, length,
468 0xfffd, // Substitution character.
469 nullptr, // Don't care about number of substitutions.
470 &errorCode);
471 result.releaseBuffer(length16);
472 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
473 capacity = length16 + 1; // +1 for the terminating NUL.
474 continue;
475 } else if(U_FAILURE(errorCode)) {
476 result.setToBogus();
477 }
478 break;
479 } while(true);
480 return result;
481}
482
483//========================================
484// Assignment
485//========================================
486
487UnicodeString &
488UnicodeString::operator=(const UnicodeString &src) {
489 return copyFrom(src);
490}
491
492UnicodeString &
493UnicodeString::fastCopyFrom(const UnicodeString &src) {
494 return copyFrom(src, true);
495}
496
497UnicodeString &
498UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
499 // if assigning to ourselves, do nothing
500 if(this == &src) {
501 return *this;
502 }
503
504 // is the right side bogus?
505 if(src.isBogus()) {
506 setToBogus();
507 return *this;
508 }
509
510 // delete the current contents
511 releaseArray();
512
513 if(src.isEmpty()) {
514 // empty string - use the stack buffer
515 setToEmpty();
516 return *this;
517 }
518
519 // fLength>0 and not an "open" src.getBuffer(minCapacity)
520 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
521 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
522 case kShortString:
523 // short string using the stack buffer, do the same
524 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
525 getShortLength() * U_SIZEOF_UCHAR);
526 break;
527 case kLongString:
528 // src uses a refCounted string buffer, use that buffer with refCount
529 // src is const, use a cast - we don't actually change it
530 const_cast<UnicodeString &>(src).addRef();
531 // copy all fields, share the reference-counted buffer
532 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
533 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
534 if(!hasShortLength()) {
535 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
536 }
537 break;
538 case kReadonlyAlias:
539 if(fastCopy) {
540 // src is a readonly alias, do the same
541 // -> maintain the readonly alias as such
542 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
543 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
544 if(!hasShortLength()) {
545 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
546 }
547 break;
548 }
549 // else if(!fastCopy) fall through to case kWritableAlias
550 // -> allocate a new buffer and copy the contents
551 U_FALLTHROUGH;
552 case kWritableAlias: {
553 // src is a writable alias; we make a copy of that instead
554 int32_t srcLength = src.length();
555 if(allocate(srcLength)) {
556 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
557 setLength(srcLength);
558 break;
559 }
560 // if there is not enough memory, then fall through to setting to bogus
561 U_FALLTHROUGH;
562 }
563 default:
564 // if src is bogus, set ourselves to bogus
565 // do not call setToBogus() here because fArray and flags are not consistent here
566 fUnion.fFields.fLengthAndFlags = kIsBogus;
567 fUnion.fFields.fArray = 0;
568 fUnion.fFields.fCapacity = 0;
569 break;
570 }
571
572 return *this;
573}
574
575UnicodeString &UnicodeString::operator=(UnicodeString &&src) noexcept {
576 // No explicit check for self move assignment, consistent with standard library.
577 // Self move assignment causes no crash nor leak but might make the object bogus.
578 releaseArray();
579 copyFieldsFrom(src, true);
580 return *this;
581}
582
583// Same as move assignment except without memory management.
584void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) noexcept {
585 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
586 if(lengthAndFlags & kUsingStackBuffer) {
587 // Short string using the stack buffer, copy the contents.
588 // Check for self assignment to prevent "overlap in memcpy" warnings,
589 // although it should be harmless to copy a buffer to itself exactly.
590 if(this != &src) {
591 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
592 getShortLength() * U_SIZEOF_UCHAR);
593 }
594 } else {
595 // In all other cases, copy all fields.
596 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
597 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
598 if(!hasShortLength()) {
599 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
600 }
601 if(setSrcToBogus) {
602 // Set src to bogus without releasing any memory.
603 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
604 src.fUnion.fFields.fArray = nullptr;
605 src.fUnion.fFields.fCapacity = 0;
606 }
607 }
608}
609
610void UnicodeString::swap(UnicodeString &other) noexcept {
611 UnicodeString temp; // Empty short string: Known not to need releaseArray().
612 // Copy fields without resetting source values in between.
613 temp.copyFieldsFrom(*this, false);
614 this->copyFieldsFrom(other, false);
615 other.copyFieldsFrom(temp, false);
616 // Set temp to an empty string so that other's memory is not released twice.
617 temp.fUnion.fFields.fLengthAndFlags = kShortString;
618}
619
620//========================================
621// Miscellaneous operations
622//========================================
623
624UnicodeString UnicodeString::unescape() const {
625 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
626 if (result.isBogus()) {
627 return result;
628 }
629 const char16_t *array = getBuffer();
630 int32_t len = length();
631 int32_t prev = 0;
632 for (int32_t i=0;;) {
633 if (i == len) {
634 result.append(array, prev, len - prev);
635 break;
636 }
637 if (array[i++] == 0x5C /*'\\'*/) {
638 result.append(array, prev, (i - 1) - prev);
639 UChar32 c = unescapeAt(i); // advances i
640 if (c < 0) {
641 result.remove(); // return empty string
642 break; // invalid escape sequence
643 }
644 result.append(c);
645 prev = i;
646 }
647 }
648 return result;
649}
650
651UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
652 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
653}
654
655//========================================
656// Read-only implementation
657//========================================
658UBool
659UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
660 // Requires: this & text not bogus and have same lengths.
661 // Byte-wise comparison works for equality regardless of endianness.
662 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
663}
664
665UBool
666UnicodeString::doEqualsSubstring( int32_t start,
667 int32_t length,
668 const char16_t *srcChars,
669 int32_t srcStart,
670 int32_t srcLength) const
671{
672 // compare illegal string values
673 if(isBogus()) {
674 return false;
675 }
676
677 // pin indices to legal values
678 pinIndices(start, length);
679
680 if(srcChars == nullptr) {
681 // treat const char16_t *srcChars==nullptr as an empty string
682 return length == 0 ? true : false;
683 }
684
685 // get the correct pointer
686 const char16_t *chars = getArrayStart();
687
688 chars += start;
689 srcChars += srcStart;
690
691 // get the srcLength if necessary
692 if(srcLength < 0) {
693 srcLength = u_strlen(srcChars + srcStart);
694 }
695
696 if (length != srcLength) {
697 return false;
698 }
699
700 if(length == 0 || chars == srcChars) {
701 return true;
702 }
703
704 return u_memcmp(chars, srcChars, srcLength) == 0;
705}
706
707int8_t
708UnicodeString::doCompare( int32_t start,
709 int32_t length,
710 const char16_t *srcChars,
711 int32_t srcStart,
712 int32_t srcLength) const
713{
714 // compare illegal string values
715 if(isBogus()) {
716 return -1;
717 }
718
719 // pin indices to legal values
720 pinIndices(start, length);
721
722 if(srcChars == nullptr) {
723 // treat const char16_t *srcChars==nullptr as an empty string
724 return length == 0 ? 0 : 1;
725 }
726
727 // get the correct pointer
728 const char16_t *chars = getArrayStart();
729
730 chars += start;
731 srcChars += srcStart;
732
733 int32_t minLength;
734 int8_t lengthResult;
735
736 // get the srcLength if necessary
737 if(srcLength < 0) {
738 srcLength = u_strlen(srcChars + srcStart);
739 }
740
741 // are we comparing different lengths?
742 if(length != srcLength) {
743 if(length < srcLength) {
744 minLength = length;
745 lengthResult = -1;
746 } else {
747 minLength = srcLength;
748 lengthResult = 1;
749 }
750 } else {
751 minLength = length;
752 lengthResult = 0;
753 }
754
755 /*
756 * note that uprv_memcmp() returns an int but we return an int8_t;
757 * we need to take care not to truncate the result -
758 * one way to do this is to right-shift the value to
759 * move the sign bit into the lower 8 bits and making sure that this
760 * does not become 0 itself
761 */
762
763 if(minLength > 0 && chars != srcChars) {
764 int32_t result;
765
766# if U_IS_BIG_ENDIAN
767 // big-endian: byte comparison works
768 result = uprv_memcmp(chars, srcChars, minLength * sizeof(char16_t));
769 if(result != 0) {
770 return (int8_t)(result >> 15 | 1);
771 }
772# else
773 // little-endian: compare char16_t units
774 do {
775 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
776 if(result != 0) {
777 return (int8_t)(result >> 15 | 1);
778 }
779 } while(--minLength > 0);
780# endif
781 }
782 return lengthResult;
783}
784
785/* String compare in code point order - doCompare() compares in code unit order. */
786int8_t
787UnicodeString::doCompareCodePointOrder(int32_t start,
788 int32_t length,
789 const char16_t *srcChars,
790 int32_t srcStart,
791 int32_t srcLength) const
792{
793 // compare illegal string values
794 // treat const char16_t *srcChars==nullptr as an empty string
795 if(isBogus()) {
796 return -1;
797 }
798
799 // pin indices to legal values
800 pinIndices(start, length);
801
802 if(srcChars == nullptr) {
803 srcStart = srcLength = 0;
804 }
805
806 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=nullptr)?(srcChars + srcStart):nullptr, srcLength, false, true);
807 /* translate the 32-bit result into an 8-bit one */
808 if(diff!=0) {
809 return (int8_t)(diff >> 15 | 1);
810 } else {
811 return 0;
812 }
813}
814
815int32_t
816UnicodeString::getLength() const {
817 return length();
818}
819
820char16_t
821UnicodeString::getCharAt(int32_t offset) const {
822 return charAt(offset);
823}
824
825UChar32
826UnicodeString::getChar32At(int32_t offset) const {
827 return char32At(offset);
828}
829
830UChar32
831UnicodeString::char32At(int32_t offset) const
832{
833 int32_t len = length();
834 if((uint32_t)offset < (uint32_t)len) {
835 const char16_t *array = getArrayStart();
836 UChar32 c;
837 U16_GET(array, 0, offset, len, c);
838 return c;
839 } else {
840 return kInvalidUChar;
841 }
842}
843
844int32_t
845UnicodeString::getChar32Start(int32_t offset) const {
846 if((uint32_t)offset < (uint32_t)length()) {
847 const char16_t *array = getArrayStart();
848 U16_SET_CP_START(array, 0, offset);
849 return offset;
850 } else {
851 return 0;
852 }
853}
854
855int32_t
856UnicodeString::getChar32Limit(int32_t offset) const {
857 int32_t len = length();
858 if((uint32_t)offset < (uint32_t)len) {
859 const char16_t *array = getArrayStart();
860 U16_SET_CP_LIMIT(array, 0, offset, len);
861 return offset;
862 } else {
863 return len;
864 }
865}
866
867int32_t
868UnicodeString::countChar32(int32_t start, int32_t length) const {
869 pinIndices(start, length);
870 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for nullptr
871 return u_countChar32(getArrayStart()+start, length);
872}
873
874UBool
875UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
876 pinIndices(start, length);
877 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for nullptr
878 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
879}
880
881int32_t
882UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
883 // pin index
884 int32_t len = length();
885 if(index<0) {
886 index=0;
887 } else if(index>len) {
888 index=len;
889 }
890
891 const char16_t *array = getArrayStart();
892 if(delta>0) {
893 U16_FWD_N(array, index, len, delta);
894 } else {
895 U16_BACK_N(array, 0, index, -delta);
896 }
897
898 return index;
899}
900
901void
902UnicodeString::doExtract(int32_t start,
903 int32_t length,
904 char16_t *dst,
905 int32_t dstStart) const
906{
907 // pin indices to legal values
908 pinIndices(start, length);
909
910 // do not copy anything if we alias dst itself
911 const char16_t *array = getArrayStart();
912 if(array + start != dst + dstStart) {
913 us_arrayCopy(array, start, dst, dstStart, length);
914 }
915}
916
917int32_t
918UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
919 UErrorCode &errorCode) const {
920 int32_t len = length();
921 if(U_SUCCESS(errorCode)) {
922 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
923 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
924 } else {
925 const char16_t *array = getArrayStart();
926 if(len>0 && len<=destCapacity && array!=dest) {
927 u_memcpy(dest, array, len);
928 }
929 return u_terminateUChars(dest, destCapacity, len, &errorCode);
930 }
931 }
932
933 return len;
934}
935
936int32_t
937UnicodeString::extract(int32_t start,
938 int32_t length,
939 char *target,
940 int32_t targetCapacity,
941 enum EInvariant) const
942{
943 // if the arguments are illegal, then do nothing
944 if(targetCapacity < 0 || (targetCapacity > 0 && target == nullptr)) {
945 return 0;
946 }
947
948 // pin the indices to legal values
949 pinIndices(start, length);
950
951 if(length <= targetCapacity) {
952 u_UCharsToChars(getArrayStart() + start, target, length);
953 }
954 UErrorCode status = U_ZERO_ERROR;
955 return u_terminateChars(target, targetCapacity, length, &status);
956}
957
958UnicodeString
959UnicodeString::tempSubString(int32_t start, int32_t len) const {
960 pinIndices(start, len);
961 const char16_t *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
962 if(array==nullptr) {
963 array=fUnion.fStackFields.fBuffer; // anything not nullptr because that would make an empty string
964 len=-2; // bogus result string
965 }
966 return UnicodeString(false, array + start, len);
967}
968
969int32_t
970UnicodeString::toUTF8(int32_t start, int32_t len,
971 char *target, int32_t capacity) const {
972 pinIndices(start, len);
973 int32_t length8;
974 UErrorCode errorCode = U_ZERO_ERROR;
975 u_strToUTF8WithSub(target, capacity, &length8,
976 getBuffer() + start, len,
977 0xFFFD, // Standard substitution character.
978 nullptr, // Don't care about number of substitutions.
979 &errorCode);
980 return length8;
981}
982
983#if U_CHARSET_IS_UTF8
984
985int32_t
986UnicodeString::extract(int32_t start, int32_t len,
987 char *target, uint32_t dstSize) const {
988 // if the arguments are illegal, then do nothing
989 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
990 return 0;
991 }
992 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
993}
994
995// else see unistr_cnv.cpp
996#endif
997
998void
999UnicodeString::extractBetween(int32_t start,
1000 int32_t limit,
1001 UnicodeString& target) const {
1002 pinIndex(start);
1003 pinIndex(limit);
1004 doExtract(start, limit - start, target);
1005}
1006
1007// When converting from UTF-16 to UTF-8, the result will have at most 3 times
1008// as many bytes as the source has UChars.
1009// The "worst cases" are writing systems like Indic, Thai and CJK with
1010// 3:1 bytes:UChars.
1011void
1012UnicodeString::toUTF8(ByteSink &sink) const {
1013 int32_t length16 = length();
1014 if(length16 != 0) {
1015 char stackBuffer[1024];
1016 int32_t capacity = (int32_t)sizeof(stackBuffer);
1017 UBool utf8IsOwned = false;
1018 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
1019 3*length16,
1020 stackBuffer, capacity,
1021 &capacity);
1022 int32_t length8 = 0;
1023 UErrorCode errorCode = U_ZERO_ERROR;
1024 u_strToUTF8WithSub(utf8, capacity, &length8,
1025 getBuffer(), length16,
1026 0xFFFD, // Standard substitution character.
1027 nullptr, // Don't care about number of substitutions.
1028 &errorCode);
1029 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
1030 utf8 = (char *)uprv_malloc(length8);
1031 if(utf8 != nullptr) {
1032 utf8IsOwned = true;
1033 errorCode = U_ZERO_ERROR;
1034 u_strToUTF8WithSub(utf8, length8, &length8,
1035 getBuffer(), length16,
1036 0xFFFD, // Standard substitution character.
1037 nullptr, // Don't care about number of substitutions.
1038 &errorCode);
1039 } else {
1040 errorCode = U_MEMORY_ALLOCATION_ERROR;
1041 }
1042 }
1043 if(U_SUCCESS(errorCode)) {
1044 sink.Append(utf8, length8);
1045 sink.Flush();
1046 }
1047 if(utf8IsOwned) {
1048 uprv_free(utf8);
1049 }
1050 }
1051}
1052
1053int32_t
1054UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1055 int32_t length32=0;
1056 if(U_SUCCESS(errorCode)) {
1057 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1058 u_strToUTF32WithSub(utf32, capacity, &length32,
1059 getBuffer(), length(),
1060 0xfffd, // Substitution character.
1061 nullptr, // Don't care about number of substitutions.
1062 &errorCode);
1063 }
1064 return length32;
1065}
1066
1067int32_t
1068UnicodeString::indexOf(const char16_t *srcChars,
1069 int32_t srcStart,
1070 int32_t srcLength,
1071 int32_t start,
1072 int32_t length) const
1073{
1074 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1075 return -1;
1076 }
1077
1078 // UnicodeString does not find empty substrings
1079 if(srcLength < 0 && srcChars[srcStart] == 0) {
1080 return -1;
1081 }
1082
1083 // get the indices within bounds
1084 pinIndices(start, length);
1085
1086 // find the first occurrence of the substring
1087 const char16_t *array = getArrayStart();
1088 const char16_t *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1089 if(match == nullptr) {
1090 return -1;
1091 } else {
1092 return (int32_t)(match - array);
1093 }
1094}
1095
1096int32_t
1097UnicodeString::doIndexOf(char16_t c,
1098 int32_t start,
1099 int32_t length) const
1100{
1101 // pin indices
1102 pinIndices(start, length);
1103
1104 // find the first occurrence of c
1105 const char16_t *array = getArrayStart();
1106 const char16_t *match = u_memchr(array + start, c, length);
1107 if(match == nullptr) {
1108 return -1;
1109 } else {
1110 return (int32_t)(match - array);
1111 }
1112}
1113
1114int32_t
1115UnicodeString::doIndexOf(UChar32 c,
1116 int32_t start,
1117 int32_t length) const {
1118 // pin indices
1119 pinIndices(start, length);
1120
1121 // find the first occurrence of c
1122 const char16_t *array = getArrayStart();
1123 const char16_t *match = u_memchr32(array + start, c, length);
1124 if(match == nullptr) {
1125 return -1;
1126 } else {
1127 return (int32_t)(match - array);
1128 }
1129}
1130
1131int32_t
1132UnicodeString::lastIndexOf(const char16_t *srcChars,
1133 int32_t srcStart,
1134 int32_t srcLength,
1135 int32_t start,
1136 int32_t length) const
1137{
1138 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1139 return -1;
1140 }
1141
1142 // UnicodeString does not find empty substrings
1143 if(srcLength < 0 && srcChars[srcStart] == 0) {
1144 return -1;
1145 }
1146
1147 // get the indices within bounds
1148 pinIndices(start, length);
1149
1150 // find the last occurrence of the substring
1151 const char16_t *array = getArrayStart();
1152 const char16_t *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1153 if(match == nullptr) {
1154 return -1;
1155 } else {
1156 return (int32_t)(match - array);
1157 }
1158}
1159
1160int32_t
1161UnicodeString::doLastIndexOf(char16_t c,
1162 int32_t start,
1163 int32_t length) const
1164{
1165 if(isBogus()) {
1166 return -1;
1167 }
1168
1169 // pin indices
1170 pinIndices(start, length);
1171
1172 // find the last occurrence of c
1173 const char16_t *array = getArrayStart();
1174 const char16_t *match = u_memrchr(array + start, c, length);
1175 if(match == nullptr) {
1176 return -1;
1177 } else {
1178 return (int32_t)(match - array);
1179 }
1180}
1181
1182int32_t
1183UnicodeString::doLastIndexOf(UChar32 c,
1184 int32_t start,
1185 int32_t length) const {
1186 // pin indices
1187 pinIndices(start, length);
1188
1189 // find the last occurrence of c
1190 const char16_t *array = getArrayStart();
1191 const char16_t *match = u_memrchr32(array + start, c, length);
1192 if(match == nullptr) {
1193 return -1;
1194 } else {
1195 return (int32_t)(match - array);
1196 }
1197}
1198
1199//========================================
1200// Write implementation
1201//========================================
1202
1203UnicodeString&
1204UnicodeString::findAndReplace(int32_t start,
1205 int32_t length,
1206 const UnicodeString& oldText,
1207 int32_t oldStart,
1208 int32_t oldLength,
1209 const UnicodeString& newText,
1210 int32_t newStart,
1211 int32_t newLength)
1212{
1213 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1214 return *this;
1215 }
1216
1217 pinIndices(start, length);
1218 oldText.pinIndices(oldStart, oldLength);
1219 newText.pinIndices(newStart, newLength);
1220
1221 if(oldLength == 0) {
1222 return *this;
1223 }
1224
1225 while(length > 0 && length >= oldLength) {
1226 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1227 if(pos < 0) {
1228 // no more oldText's here: done
1229 break;
1230 } else {
1231 // we found oldText, replace it by newText and go beyond it
1232 replace(pos, oldLength, newText, newStart, newLength);
1233 length -= pos + oldLength - start;
1234 start = pos + newLength;
1235 }
1236 }
1237
1238 return *this;
1239}
1240
1241
1242void
1243UnicodeString::setToBogus()
1244{
1245 releaseArray();
1246
1247 fUnion.fFields.fLengthAndFlags = kIsBogus;
1248 fUnion.fFields.fArray = 0;
1249 fUnion.fFields.fCapacity = 0;
1250}
1251
1252// turn a bogus string into an empty one
1253void
1254UnicodeString::unBogus() {
1255 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1256 setToEmpty();
1257 }
1258}
1259
1260const char16_t *
1261UnicodeString::getTerminatedBuffer() {
1262 if(!isWritable()) {
1263 return nullptr;
1264 }
1265 char16_t *array = getArrayStart();
1266 int32_t len = length();
1267 if(len < getCapacity()) {
1268 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1269 // If len<capacity on a read-only alias, then array[len] is
1270 // either the original NUL (if constructed with (true, s, length))
1271 // or one of the original string contents characters (if later truncated),
1272 // therefore we can assume that array[len] is initialized memory.
1273 if(array[len] == 0) {
1274 return array;
1275 }
1276 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1277 // kRefCounted: Do not write the NUL if the buffer is shared.
1278 // That is mostly safe, except when the length of one copy was modified
1279 // without copy-on-write, e.g., via truncate(newLength) or remove().
1280 // Then the NUL would be written into the middle of another copy's string.
1281
1282 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1283 // Do not test if there is a NUL already because it might be uninitialized memory.
1284 // (That would be safe, but tools like valgrind & Purify would complain.)
1285 array[len] = 0;
1286 return array;
1287 }
1288 }
1289 if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1290 array = getArrayStart();
1291 array[len] = 0;
1292 return array;
1293 } else {
1294 return nullptr;
1295 }
1296}
1297
1298// setTo() analogous to the readonly-aliasing constructor with the same signature
1299UnicodeString &
1300UnicodeString::setTo(UBool isTerminated,
1301 ConstChar16Ptr textPtr,
1302 int32_t textLength)
1303{
1304 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1305 // do not modify a string that has an "open" getBuffer(minCapacity)
1306 return *this;
1307 }
1308
1309 const char16_t *text = textPtr;
1310 if(text == nullptr) {
1311 // treat as an empty string, do not alias
1312 releaseArray();
1313 setToEmpty();
1314 return *this;
1315 }
1316
1317 if( textLength < -1 ||
1318 (textLength == -1 && !isTerminated) ||
1319 (textLength >= 0 && isTerminated && text[textLength] != 0)
1320 ) {
1321 setToBogus();
1322 return *this;
1323 }
1324
1325 releaseArray();
1326
1327 if(textLength == -1) {
1328 // text is terminated, or else it would have failed the above test
1329 textLength = u_strlen(text);
1330 }
1331 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1332 setArray((char16_t *)text, textLength, isTerminated ? textLength + 1 : textLength);
1333 return *this;
1334}
1335
1336// setTo() analogous to the writable-aliasing constructor with the same signature
1337UnicodeString &
1338UnicodeString::setTo(char16_t *buffer,
1339 int32_t buffLength,
1340 int32_t buffCapacity) {
1341 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1342 // do not modify a string that has an "open" getBuffer(minCapacity)
1343 return *this;
1344 }
1345
1346 if(buffer == nullptr) {
1347 // treat as an empty string, do not alias
1348 releaseArray();
1349 setToEmpty();
1350 return *this;
1351 }
1352
1353 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1354 setToBogus();
1355 return *this;
1356 } else if(buffLength == -1) {
1357 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1358 const char16_t *p = buffer, *limit = buffer + buffCapacity;
1359 while(p != limit && *p != 0) {
1360 ++p;
1361 }
1362 buffLength = (int32_t)(p - buffer);
1363 }
1364
1365 releaseArray();
1366
1367 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1368 setArray(buffer, buffLength, buffCapacity);
1369 return *this;
1370}
1371
1372UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1373 unBogus();
1374 int32_t length = utf8.length();
1375 int32_t capacity;
1376 // The UTF-16 string will be at most as long as the UTF-8 string.
1377 if(length <= US_STACKBUF_SIZE) {
1378 capacity = US_STACKBUF_SIZE;
1379 } else {
1380 capacity = length + 1; // +1 for the terminating NUL.
1381 }
1382 char16_t *utf16 = getBuffer(capacity);
1383 int32_t length16;
1384 UErrorCode errorCode = U_ZERO_ERROR;
1385 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1386 utf8.data(), length,
1387 0xfffd, // Substitution character.
1388 nullptr, // Don't care about number of substitutions.
1389 &errorCode);
1390 releaseBuffer(length16);
1391 if(U_FAILURE(errorCode)) {
1392 setToBogus();
1393 }
1394 return *this;
1395}
1396
1397UnicodeString&
1398UnicodeString::setCharAt(int32_t offset,
1399 char16_t c)
1400{
1401 int32_t len = length();
1402 if(cloneArrayIfNeeded() && len > 0) {
1403 if(offset < 0) {
1404 offset = 0;
1405 } else if(offset >= len) {
1406 offset = len - 1;
1407 }
1408
1409 getArrayStart()[offset] = c;
1410 }
1411 return *this;
1412}
1413
1414UnicodeString&
1415UnicodeString::replace(int32_t start,
1416 int32_t _length,
1417 UChar32 srcChar) {
1418 char16_t buffer[U16_MAX_LENGTH];
1419 int32_t count = 0;
1420 UBool isError = false;
1421 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1422 // We test isError so that the compiler does not complain that we don't.
1423 // If isError (srcChar is not a valid code point) then count==0 which means
1424 // we remove the source segment rather than replacing it with srcChar.
1425 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1426}
1427
1428UnicodeString&
1429UnicodeString::append(UChar32 srcChar) {
1430 char16_t buffer[U16_MAX_LENGTH];
1431 int32_t _length = 0;
1432 UBool isError = false;
1433 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1434 // We test isError so that the compiler does not complain that we don't.
1435 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1436 return isError ? *this : doAppend(buffer, 0, _length);
1437}
1438
1439UnicodeString&
1440UnicodeString::doReplace( int32_t start,
1441 int32_t length,
1442 const UnicodeString& src,
1443 int32_t srcStart,
1444 int32_t srcLength)
1445{
1446 // pin the indices to legal values
1447 src.pinIndices(srcStart, srcLength);
1448
1449 // get the characters from src
1450 // and replace the range in ourselves with them
1451 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1452}
1453
1454UnicodeString&
1455UnicodeString::doReplace(int32_t start,
1456 int32_t length,
1457 const char16_t *srcChars,
1458 int32_t srcStart,
1459 int32_t srcLength)
1460{
1461 if(!isWritable()) {
1462 return *this;
1463 }
1464
1465 int32_t oldLength = this->length();
1466
1467 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1468 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1469 if(start == 0) {
1470 // remove prefix by adjusting the array pointer
1471 pinIndex(length);
1472 fUnion.fFields.fArray += length;
1473 fUnion.fFields.fCapacity -= length;
1474 setLength(oldLength - length);
1475 return *this;
1476 } else {
1477 pinIndex(start);
1478 if(length >= (oldLength - start)) {
1479 // remove suffix by reducing the length (like truncate())
1480 setLength(start);
1481 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1482 return *this;
1483 }
1484 }
1485 }
1486
1487 if(start == oldLength) {
1488 return doAppend(srcChars, srcStart, srcLength);
1489 }
1490
1491 if(srcChars == 0) {
1492 srcLength = 0;
1493 } else {
1494 // Perform all remaining operations relative to srcChars + srcStart.
1495 // From this point forward, do not use srcStart.
1496 srcChars += srcStart;
1497 if (srcLength < 0) {
1498 // get the srcLength if necessary
1499 srcLength = u_strlen(srcChars);
1500 }
1501 }
1502
1503 // pin the indices to legal values
1504 pinIndices(start, length);
1505
1506 // Calculate the size of the string after the replace.
1507 // Avoid int32_t overflow.
1508 int32_t newLength = oldLength - length;
1509 if(srcLength > (INT32_MAX - newLength)) {
1510 setToBogus();
1511 return *this;
1512 }
1513 newLength += srcLength;
1514
1515 // Check for insertion into ourself
1516 const char16_t *oldArray = getArrayStart();
1517 if (isBufferWritable() &&
1518 oldArray < srcChars + srcLength &&
1519 srcChars < oldArray + oldLength) {
1520 // Copy into a new UnicodeString and start over
1521 UnicodeString copy(srcChars, srcLength);
1522 if (copy.isBogus()) {
1523 setToBogus();
1524 return *this;
1525 }
1526 return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
1527 }
1528
1529 // cloneArrayIfNeeded(doCopyArray=false) may change fArray but will not copy the current contents;
1530 // therefore we need to keep the current fArray
1531 char16_t oldStackBuffer[US_STACKBUF_SIZE];
1532 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1533 // copy the stack buffer contents because it will be overwritten with
1534 // fUnion.fFields values
1535 u_memcpy(oldStackBuffer, oldArray, oldLength);
1536 oldArray = oldStackBuffer;
1537 }
1538
1539 // clone our array and allocate a bigger array if needed
1540 int32_t *bufferToDelete = 0;
1541 if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
1542 false, &bufferToDelete)
1543 ) {
1544 return *this;
1545 }
1546
1547 // now do the replace
1548
1549 char16_t *newArray = getArrayStart();
1550 if(newArray != oldArray) {
1551 // if fArray changed, then we need to copy everything except what will change
1552 us_arrayCopy(oldArray, 0, newArray, 0, start);
1553 us_arrayCopy(oldArray, start + length,
1554 newArray, start + srcLength,
1555 oldLength - (start + length));
1556 } else if(length != srcLength) {
1557 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1558 us_arrayCopy(oldArray, start + length,
1559 newArray, start + srcLength,
1560 oldLength - (start + length));
1561 }
1562
1563 // now fill in the hole with the new string
1564 us_arrayCopy(srcChars, 0, newArray, start, srcLength);
1565
1566 setLength(newLength);
1567
1568 // delayed delete in case srcChars == fArray when we started, and
1569 // to keep oldArray alive for the above operations
1570 if (bufferToDelete) {
1571 uprv_free(bufferToDelete);
1572 }
1573
1574 return *this;
1575}
1576
1577// Versions of doReplace() only for append() variants.
1578// doReplace() and doAppend() optimize for different cases.
1579
1580UnicodeString&
1581UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1582 if(srcLength == 0) {
1583 return *this;
1584 }
1585
1586 // pin the indices to legal values
1587 src.pinIndices(srcStart, srcLength);
1588 return doAppend(src.getArrayStart(), srcStart, srcLength);
1589}
1590
1591UnicodeString&
1592UnicodeString::doAppend(const char16_t *srcChars, int32_t srcStart, int32_t srcLength) {
1593 if(!isWritable() || srcLength == 0 || srcChars == nullptr) {
1594 return *this;
1595 }
1596
1597 // Perform all remaining operations relative to srcChars + srcStart.
1598 // From this point forward, do not use srcStart.
1599 srcChars += srcStart;
1600
1601 if(srcLength < 0) {
1602 // get the srcLength if necessary
1603 if((srcLength = u_strlen(srcChars)) == 0) {
1604 return *this;
1605 }
1606 }
1607
1608 int32_t oldLength = length();
1609 int32_t newLength;
1610 if (uprv_add32_overflow(oldLength, srcLength, &newLength)) {
1611 setToBogus();
1612 return *this;
1613 }
1614
1615 // Check for append onto ourself
1616 const char16_t* oldArray = getArrayStart();
1617 if (isBufferWritable() &&
1618 oldArray < srcChars + srcLength &&
1619 srcChars < oldArray + oldLength) {
1620 // Copy into a new UnicodeString and start over
1621 UnicodeString copy(srcChars, srcLength);
1622 if (copy.isBogus()) {
1623 setToBogus();
1624 return *this;
1625 }
1626 return doAppend(copy.getArrayStart(), 0, srcLength);
1627 }
1628
1629 // optimize append() onto a large-enough, owned string
1630 if((newLength <= getCapacity() && isBufferWritable()) ||
1631 cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1632 char16_t *newArray = getArrayStart();
1633 // Do not copy characters when
1634 // char16_t *buffer=str.getAppendBuffer(...);
1635 // is followed by
1636 // str.append(buffer, length);
1637 // or
1638 // str.appendString(buffer, length)
1639 // or similar.
1640 if(srcChars != newArray + oldLength) {
1641 us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
1642 }
1643 setLength(newLength);
1644 }
1645 return *this;
1646}
1647
1648/**
1649 * Replaceable API
1650 */
1651void
1652UnicodeString::handleReplaceBetween(int32_t start,
1653 int32_t limit,
1654 const UnicodeString& text) {
1655 replaceBetween(start, limit, text);
1656}
1657
1658/**
1659 * Replaceable API
1660 */
1661void
1662UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1663 if (limit <= start) {
1664 return; // Nothing to do; avoid bogus malloc call
1665 }
1666 char16_t* text = (char16_t*) uprv_malloc( sizeof(char16_t) * (limit - start) );
1667 // Check to make sure text is not null.
1668 if (text != nullptr) {
1669 extractBetween(start, limit, text, 0);
1670 insert(dest, text, 0, limit - start);
1671 uprv_free(text);
1672 }
1673}
1674
1675/**
1676 * Replaceable API
1677 *
1678 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1679 * so we implement this function here.
1680 */
1681UBool Replaceable::hasMetaData() const {
1682 return true;
1683}
1684
1685/**
1686 * Replaceable API
1687 */
1688UBool UnicodeString::hasMetaData() const {
1689 return false;
1690}
1691
1692UnicodeString&
1693UnicodeString::doReverse(int32_t start, int32_t length) {
1694 if(length <= 1 || !cloneArrayIfNeeded()) {
1695 return *this;
1696 }
1697
1698 // pin the indices to legal values
1699 pinIndices(start, length);
1700 if(length <= 1) { // pinIndices() might have shrunk the length
1701 return *this;
1702 }
1703
1704 char16_t *left = getArrayStart() + start;
1705 char16_t *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1706 char16_t swap;
1707 UBool hasSupplementary = false;
1708
1709 // Before the loop we know left<right because length>=2.
1710 do {
1711 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1712 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1713 *right-- = swap;
1714 } while(left < right);
1715 // Make sure to test the middle code unit of an odd-length string.
1716 // Redundant if the length is even.
1717 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1718
1719 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1720 if(hasSupplementary) {
1721 char16_t swap2;
1722
1723 left = getArrayStart() + start;
1724 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1725 while(left < right) {
1726 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1727 *left++ = swap2;
1728 *left++ = swap;
1729 } else {
1730 ++left;
1731 }
1732 }
1733 }
1734
1735 return *this;
1736}
1737
1738UBool
1739UnicodeString::padLeading(int32_t targetLength,
1740 char16_t padChar)
1741{
1742 int32_t oldLength = length();
1743 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1744 return false;
1745 } else {
1746 // move contents up by padding width
1747 char16_t *array = getArrayStart();
1748 int32_t start = targetLength - oldLength;
1749 us_arrayCopy(array, 0, array, start, oldLength);
1750
1751 // fill in padding character
1752 while(--start >= 0) {
1753 array[start] = padChar;
1754 }
1755 setLength(targetLength);
1756 return true;
1757 }
1758}
1759
1760UBool
1761UnicodeString::padTrailing(int32_t targetLength,
1762 char16_t padChar)
1763{
1764 int32_t oldLength = length();
1765 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1766 return false;
1767 } else {
1768 // fill in padding character
1769 char16_t *array = getArrayStart();
1770 int32_t length = targetLength;
1771 while(--length >= oldLength) {
1772 array[length] = padChar;
1773 }
1774 setLength(targetLength);
1775 return true;
1776 }
1777}
1778
1779//========================================
1780// Hashing
1781//========================================
1782int32_t
1783UnicodeString::doHashCode() const
1784{
1785 /* Delegate hash computation to uhash. This makes UnicodeString
1786 * hashing consistent with char16_t* hashing. */
1787 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1788 if (hashCode == kInvalidHashCode) {
1789 hashCode = kEmptyHashCode;
1790 }
1791 return hashCode;
1792}
1793
1794//========================================
1795// External Buffer
1796//========================================
1797
1798char16_t *
1799UnicodeString::getBuffer(int32_t minCapacity) {
1800 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1801 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1802 setZeroLength();
1803 return getArrayStart();
1804 } else {
1805 return nullptr;
1806 }
1807}
1808
1809void
1810UnicodeString::releaseBuffer(int32_t newLength) {
1811 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1812 // set the new fLength
1813 int32_t capacity=getCapacity();
1814 if(newLength==-1) {
1815 // the new length is the string length, capped by fCapacity
1816 const char16_t *array=getArrayStart(), *p=array, *limit=array+capacity;
1817 while(p<limit && *p!=0) {
1818 ++p;
1819 }
1820 newLength=(int32_t)(p-array);
1821 } else if(newLength>capacity) {
1822 newLength=capacity;
1823 }
1824 setLength(newLength);
1825 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1826 }
1827}
1828
1829//========================================
1830// Miscellaneous
1831//========================================
1832UBool
1833UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1834 int32_t growCapacity,
1835 UBool doCopyArray,
1836 int32_t **pBufferToDelete,
1837 UBool forceClone) {
1838 // default parameters need to be static, therefore
1839 // the defaults are -1 to have convenience defaults
1840 if(newCapacity == -1) {
1841 newCapacity = getCapacity();
1842 }
1843
1844 // while a getBuffer(minCapacity) is "open",
1845 // prevent any modifications of the string by returning false here
1846 // if the string is bogus, then only an assignment or similar can revive it
1847 if(!isWritable()) {
1848 return false;
1849 }
1850
1851 /*
1852 * We need to make a copy of the array if
1853 * the buffer is read-only, or
1854 * the buffer is refCounted (shared), and refCount>1, or
1855 * the buffer is too small.
1856 * Return false if memory could not be allocated.
1857 */
1858 if(forceClone ||
1859 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1860 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1861 newCapacity > getCapacity()
1862 ) {
1863 // check growCapacity for default value and use of the stack buffer
1864 if(growCapacity < 0) {
1865 growCapacity = newCapacity;
1866 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1867 growCapacity = US_STACKBUF_SIZE;
1868 }
1869
1870 // save old values
1871 char16_t oldStackBuffer[US_STACKBUF_SIZE];
1872 char16_t *oldArray;
1873 int32_t oldLength = length();
1874 int16_t flags = fUnion.fFields.fLengthAndFlags;
1875
1876 if(flags&kUsingStackBuffer) {
1877 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1878 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1879 // copy the stack buffer contents because it will be overwritten with
1880 // fUnion.fFields values
1881 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1882 oldArray = oldStackBuffer;
1883 } else {
1884 oldArray = nullptr; // no need to copy from the stack buffer to itself
1885 }
1886 } else {
1887 oldArray = fUnion.fFields.fArray;
1888 U_ASSERT(oldArray!=nullptr); /* when stack buffer is not used, oldArray must have a non-nullptr reference */
1889 }
1890
1891 // allocate a new array
1892 if(allocate(growCapacity) ||
1893 (newCapacity < growCapacity && allocate(newCapacity))
1894 ) {
1895 if(doCopyArray) {
1896 // copy the contents
1897 // do not copy more than what fits - it may be smaller than before
1898 int32_t minLength = oldLength;
1899 newCapacity = getCapacity();
1900 if(newCapacity < minLength) {
1901 minLength = newCapacity;
1902 }
1903 if(oldArray != nullptr) {
1904 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1905 }
1906 setLength(minLength);
1907 } else {
1908 setZeroLength();
1909 }
1910
1911 // release the old array
1912 if(flags & kRefCounted) {
1913 // the array is refCounted; decrement and release if 0
1914 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1915 if(umtx_atomic_dec(pRefCount) == 0) {
1916 if(pBufferToDelete == 0) {
1917 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1918 // is defined as volatile. (Volatile has useful non-standard behavior
1919 // with this compiler.)
1920 uprv_free((void *)pRefCount);
1921 } else {
1922 // the caller requested to delete it himself
1923 *pBufferToDelete = (int32_t *)pRefCount;
1924 }
1925 }
1926 }
1927 } else {
1928 // not enough memory for growCapacity and not even for the smaller newCapacity
1929 // reset the old values for setToBogus() to release the array
1930 if(!(flags&kUsingStackBuffer)) {
1931 fUnion.fFields.fArray = oldArray;
1932 }
1933 fUnion.fFields.fLengthAndFlags = flags;
1934 setToBogus();
1935 return false;
1936 }
1937 }
1938 return true;
1939}
1940
1941// UnicodeStringAppendable ------------------------------------------------- ***
1942
1943UnicodeStringAppendable::~UnicodeStringAppendable() {}
1944
1945UBool
1946UnicodeStringAppendable::appendCodeUnit(char16_t c) {
1947 return str.doAppend(&c, 0, 1).isWritable();
1948}
1949
1950UBool
1951UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1952 char16_t buffer[U16_MAX_LENGTH];
1953 int32_t cLength = 0;
1954 UBool isError = false;
1955 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1956 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1957}
1958
1959UBool
1960UnicodeStringAppendable::appendString(const char16_t *s, int32_t length) {
1961 return str.doAppend(s, 0, length).isWritable();
1962}
1963
1964UBool
1965UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1966 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1967}
1968
1969char16_t *
1970UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1971 int32_t desiredCapacityHint,
1972 char16_t *scratch, int32_t scratchCapacity,
1973 int32_t *resultCapacity) {
1974 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1975 *resultCapacity = 0;
1976 return nullptr;
1977 }
1978 int32_t oldLength = str.length();
1979 if(minCapacity <= (kMaxCapacity - oldLength) &&
1980 desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1981 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1982 *resultCapacity = str.getCapacity() - oldLength;
1983 return str.getArrayStart() + oldLength;
1984 }
1985 *resultCapacity = scratchCapacity;
1986 return scratch;
1987}
1988
1989U_NAMESPACE_END
1990
1991U_NAMESPACE_USE
1992
1993U_CAPI int32_t U_EXPORT2
1994uhash_hashUnicodeString(const UElement key) {
1995 const UnicodeString *str = (const UnicodeString*) key.pointer;
1996 return (str == nullptr) ? 0 : str->hashCode();
1997}
1998
1999// Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
2000// does not depend on hashtable code.
2001U_CAPI UBool U_EXPORT2
2002uhash_compareUnicodeString(const UElement key1, const UElement key2) {
2003 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
2004 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
2005 if (str1 == str2) {
2006 return true;
2007 }
2008 if (str1 == nullptr || str2 == nullptr) {
2009 return false;
2010 }
2011 return *str1 == *str2;
2012}
2013
#ifdef U_STATIC_IMPLEMENTATION
/*
This function must never be called. Its only purpose is to force the
virtual vector deleting destructor to be defined within unistr.cpp.
UObject already carries a vector deleting destructor, but defining one
here ensures that it is included with this object file, which keeps
static library dependencies to a minimum.
*/
#if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
static void uprv_UnicodeStringDummy() {
    UnicodeString *dummyArray = new UnicodeString[2];
    delete [] dummyArray;
}
#pragma GCC diagnostic pop
#endif
#endif
2031