1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5* Copyright (C) 1999-2016, International Business Machines Corporation and
6* others. All Rights Reserved.
7******************************************************************************
8*
9* File unistr.cpp
10*
11* Modification History:
12*
13* Date Name Description
14* 09/25/98 stephen Creation.
15* 04/20/99 stephen Overhauled per 4/16 code review.
16* 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17* 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18* Replaceable.
19* 06/25/01 grhoten Removed the dependency on iostream
20******************************************************************************
21*/
22
23#include "unicode/utypes.h"
24#include "unicode/appendable.h"
25#include "unicode/putil.h"
26#include "cstring.h"
27#include "cmemory.h"
28#include "unicode/ustring.h"
29#include "unicode/unistr.h"
30#include "unicode/utf.h"
31#include "unicode/utf16.h"
32#include "uelement.h"
33#include "ustr_imp.h"
34#include "umutex.h"
35#include "uassert.h"
36
37#if 0
38
39#include <iostream>
40using namespace std;
41
42//DEBUGGING
43void
44print(const UnicodeString& s,
45 const char *name)
46{
47 UChar c;
48 cout << name << ":|";
49 for(int i = 0; i < s.length(); ++i) {
50 c = s[i];
51 if(c>= 0x007E || c < 0x0020)
52 cout << "[0x" << hex << s[i] << "]";
53 else
54 cout << (char) s[i];
55 }
56 cout << '|' << endl;
57}
58
59void
60print(const UChar *s,
61 int32_t len,
62 const char *name)
63{
64 UChar c;
65 cout << name << ":|";
66 for(int i = 0; i < len; ++i) {
67 c = s[i];
68 if(c>= 0x007E || c < 0x0020)
69 cout << "[0x" << hex << s[i] << "]";
70 else
71 cout << (char) s[i];
72 }
73 cout << '|' << endl;
74}
75// END DEBUGGING
76#endif
77
78// Local function definitions for now
79
80// need to copy areas that may overlap
81static
82inline void
83us_arrayCopy(const UChar *src, int32_t srcStart,
84 UChar *dst, int32_t dstStart, int32_t count)
85{
86 if(count>0) {
87 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88 }
89}
90
91// u_unescapeAt() callback to get a UChar from a UnicodeString
92U_CDECL_BEGIN
93static UChar U_CALLCONV
94UnicodeString_charAt(int32_t offset, void *context) {
95 return ((icu::UnicodeString*) context)->charAt(offset);
96}
97U_CDECL_END
98
99U_NAMESPACE_BEGIN
100
101/* The Replaceable virtual destructor can't be defined in the header
102 due to how AIX works with multiple definitions of virtual functions.
103*/
104Replaceable::~Replaceable() {}
105
106UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107
108UnicodeString U_EXPORT2
109operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110 return
111 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
112 append(s1).
113 append(s2);
114}
115
116//========================================
117// Reference Counting functions, put at top of file so that optimizing compilers
118// have a chance to automatically inline.
119//========================================
120
121void
122UnicodeString::addRef() {
123 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124}
125
126int32_t
127UnicodeString::removeRef() {
128 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
129}
130
131int32_t
132UnicodeString::refCount() const {
133 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
134}
135
136void
137UnicodeString::releaseArray() {
138 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140 }
141}
142
143
144
145//========================================
146// Constructors
147//========================================
148
149// The default constructor is inline in unistr.h.
150
151UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152 fUnion.fFields.fLengthAndFlags = 0;
153 if(count <= 0 || (uint32_t)c > 0x10ffff) {
154 // just allocate and do not do anything else
155 allocate(capacity);
156 } else if(c <= 0xffff) {
157 int32_t length = count;
158 if(capacity < length) {
159 capacity = length;
160 }
161 if(allocate(capacity)) {
162 UChar *array = getArrayStart();
163 UChar unit = (UChar)c;
164 for(int32_t i = 0; i < length; ++i) {
165 array[i] = unit;
166 }
167 setLength(length);
168 }
169 } else { // supplementary code point, write surrogate pairs
170 if(count > (INT32_MAX / 2)) {
171 // We would get more than 2G UChars.
172 allocate(capacity);
173 return;
174 }
175 int32_t length = count * 2;
176 if(capacity < length) {
177 capacity = length;
178 }
179 if(allocate(capacity)) {
180 UChar *array = getArrayStart();
181 UChar lead = U16_LEAD(c);
182 UChar trail = U16_TRAIL(c);
183 for(int32_t i = 0; i < length; i += 2) {
184 array[i] = lead;
185 array[i + 1] = trail;
186 }
187 setLength(length);
188 }
189 }
190}
191
192UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195}
196
197UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207}
208
209UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212}
213
214UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218}
219
220UnicodeString::UnicodeString(UBool isTerminated,
221 ConstChar16Ptr textPtr,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 const UChar *text = textPtr;
225 if(text == NULL) {
226 // treat as an empty string, do not alias
227 setToEmpty();
228 } else if(textLength < -1 ||
229 (textLength == -1 && !isTerminated) ||
230 (textLength >= 0 && isTerminated && text[textLength] != 0)
231 ) {
232 setToBogus();
233 } else {
234 if(textLength == -1) {
235 // text is terminated, or else it would have failed the above test
236 textLength = u_strlen(text);
237 }
238 setArray(const_cast<UChar *>(text), textLength,
239 isTerminated ? textLength + 1 : textLength);
240 }
241}
242
243UnicodeString::UnicodeString(UChar *buff,
244 int32_t buffLength,
245 int32_t buffCapacity) {
246 fUnion.fFields.fLengthAndFlags = kWritableAlias;
247 if(buff == NULL) {
248 // treat as an empty string, do not alias
249 setToEmpty();
250 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
251 setToBogus();
252 } else {
253 if(buffLength == -1) {
254 // fLength = u_strlen(buff); but do not look beyond buffCapacity
255 const UChar *p = buff, *limit = buff + buffCapacity;
256 while(p != limit && *p != 0) {
257 ++p;
258 }
259 buffLength = (int32_t)(p - buff);
260 }
261 setArray(buff, buffLength, buffCapacity);
262 }
263}
264
265UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266 fUnion.fFields.fLengthAndFlags = kShortString;
267 if(src==NULL) {
268 // treat as an empty string
269 } else {
270 if(length<0) {
271 length=(int32_t)uprv_strlen(src);
272 }
273 if(cloneArrayIfNeeded(length, length, FALSE)) {
274 u_charsToUChars(src, getArrayStart(), length);
275 setLength(length);
276 } else {
277 setToBogus();
278 }
279 }
280}
281
282#if U_CHARSET_IS_UTF8
283
284UnicodeString::UnicodeString(const char *codepageData) {
285 fUnion.fFields.fLengthAndFlags = kShortString;
286 if(codepageData != 0) {
287 setToUTF8(codepageData);
288 }
289}
290
291UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292 fUnion.fFields.fLengthAndFlags = kShortString;
293 // if there's nothing to convert, do nothing
294 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
295 return;
296 }
297 if(dataLength == -1) {
298 dataLength = (int32_t)uprv_strlen(codepageData);
299 }
300 setToUTF8(StringPiece(codepageData, dataLength));
301}
302
303// else see unistr_cnv.cpp
304#endif
305
306UnicodeString::UnicodeString(const UnicodeString& that) {
307 fUnion.fFields.fLengthAndFlags = kShortString;
308 copyFrom(that);
309}
310
311UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
312 copyFieldsFrom(src, TRUE);
313}
314
315UnicodeString::UnicodeString(const UnicodeString& that,
316 int32_t srcStart) {
317 fUnion.fFields.fLengthAndFlags = kShortString;
318 setTo(that, srcStart);
319}
320
321UnicodeString::UnicodeString(const UnicodeString& that,
322 int32_t srcStart,
323 int32_t srcLength) {
324 fUnion.fFields.fLengthAndFlags = kShortString;
325 setTo(that, srcStart, srcLength);
326}
327
328// Replaceable base class clone() default implementation, does not clone
329Replaceable *
330Replaceable::clone() const {
331 return NULL;
332}
333
334// UnicodeString overrides clone() with a real implementation
335UnicodeString *
336UnicodeString::clone() const {
337 return new UnicodeString(*this);
338}
339
340//========================================
341// array allocation
342//========================================
343
namespace {

// Fixed part of the extra room added when a string grows.
const int32_t kGrowSize = 128;

// Upper limit for a string's capacity.
// One int32_t reference counter plus capacity UChars must fit into a
// 32-bit size_t (at least on a 32-bit platform). One more UChar is reserved
// for the NUL terminator, to avoid reallocation in getTerminatedBuffer(),
// and the byte count is rounded up to a multiple of 16.
// Hence capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
// (More complicated checks could allow up to 0x7ffffffd without rounding up,
// but that does not seem worth it.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns the capacity to grow to for a string of newLength units:
// newLength plus a quarter of it plus kGrowSize, limited to kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t extra = (newLength >> 2) + kGrowSize;
  const int32_t headroom = kMaxCapacity - newLength;
  return extra <= headroom ? newLength + extra : kMaxCapacity;
}

}  // namespace
367
368UBool
369UnicodeString::allocate(int32_t capacity) {
370 if(capacity <= US_STACKBUF_SIZE) {
371 fUnion.fFields.fLengthAndFlags = kShortString;
372 return TRUE;
373 }
374 if(capacity <= kMaxCapacity) {
375 ++capacity; // for the NUL
376 // Switch to size_t which is unsigned so that we can allocate up to 4GB.
377 // Reference counter + UChars.
378 size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
379 // Round up to a multiple of 16.
380 numBytes = (numBytes + 15) & ~15;
381 int32_t *array = (int32_t *) uprv_malloc(numBytes);
382 if(array != NULL) {
383 // set initial refCount and point behind the refCount
384 *array++ = 1;
385 numBytes -= sizeof(int32_t);
386
387 // have fArray point to the first UChar
388 fUnion.fFields.fArray = (UChar *)array;
389 fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
390 fUnion.fFields.fLengthAndFlags = kLongString;
391 return TRUE;
392 }
393 }
394 fUnion.fFields.fLengthAndFlags = kIsBogus;
395 fUnion.fFields.fArray = 0;
396 fUnion.fFields.fCapacity = 0;
397 return FALSE;
398}
399
400//========================================
401// Destructor
402//========================================
403
404#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
405static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
406static u_atomic_int32_t beyondCount(0);
407
408U_CAPI void unistr_printLengths() {
409 int32_t i;
410 for(i = 0; i <= 59; ++i) {
411 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
412 }
413 int32_t beyond = beyondCount;
414 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
415 beyond += finalLengthCounts[i];
416 }
417 printf(">59, %9d\n", beyond);
418}
419#endif
420
421UnicodeString::~UnicodeString()
422{
423#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
424 // Count lengths of strings at the end of their lifetime.
425 // Useful for discussion of a desirable stack buffer size.
426 // Count the contents length, not the optional NUL terminator nor further capacity.
427 // Ignore open-buffer strings and strings which alias external storage.
428 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
429 if(hasShortLength()) {
430 umtx_atomic_inc(finalLengthCounts + getShortLength());
431 } else {
432 umtx_atomic_inc(&beyondCount);
433 }
434 }
435#endif
436
437 releaseArray();
438}
439
440//========================================
441// Factory methods
442//========================================
443
444UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
445 UnicodeString result;
446 result.setToUTF8(utf8);
447 return result;
448}
449
450UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
451 UnicodeString result;
452 int32_t capacity;
453 // Most UTF-32 strings will be BMP-only and result in a same-length
454 // UTF-16 string. We overestimate the capacity just slightly,
455 // just in case there are a few supplementary characters.
456 if(length <= US_STACKBUF_SIZE) {
457 capacity = US_STACKBUF_SIZE;
458 } else {
459 capacity = length + (length >> 4) + 4;
460 }
461 do {
462 UChar *utf16 = result.getBuffer(capacity);
463 int32_t length16;
464 UErrorCode errorCode = U_ZERO_ERROR;
465 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
466 utf32, length,
467 0xfffd, // Substitution character.
468 NULL, // Don't care about number of substitutions.
469 &errorCode);
470 result.releaseBuffer(length16);
471 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
472 capacity = length16 + 1; // +1 for the terminating NUL.
473 continue;
474 } else if(U_FAILURE(errorCode)) {
475 result.setToBogus();
476 }
477 break;
478 } while(TRUE);
479 return result;
480}
481
482//========================================
483// Assignment
484//========================================
485
486UnicodeString &
487UnicodeString::operator=(const UnicodeString &src) {
488 return copyFrom(src);
489}
490
491UnicodeString &
492UnicodeString::fastCopyFrom(const UnicodeString &src) {
493 return copyFrom(src, TRUE);
494}
495
496UnicodeString &
497UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
498 // if assigning to ourselves, do nothing
499 if(this == &src) {
500 return *this;
501 }
502
503 // is the right side bogus?
504 if(src.isBogus()) {
505 setToBogus();
506 return *this;
507 }
508
509 // delete the current contents
510 releaseArray();
511
512 if(src.isEmpty()) {
513 // empty string - use the stack buffer
514 setToEmpty();
515 return *this;
516 }
517
518 // fLength>0 and not an "open" src.getBuffer(minCapacity)
519 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
520 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
521 case kShortString:
522 // short string using the stack buffer, do the same
523 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
524 getShortLength() * U_SIZEOF_UCHAR);
525 break;
526 case kLongString:
527 // src uses a refCounted string buffer, use that buffer with refCount
528 // src is const, use a cast - we don't actually change it
529 ((UnicodeString &)src).addRef();
530 // copy all fields, share the reference-counted buffer
531 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
532 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
533 if(!hasShortLength()) {
534 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
535 }
536 break;
537 case kReadonlyAlias:
538 if(fastCopy) {
539 // src is a readonly alias, do the same
540 // -> maintain the readonly alias as such
541 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
542 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
543 if(!hasShortLength()) {
544 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
545 }
546 break;
547 }
548 // else if(!fastCopy) fall through to case kWritableAlias
549 // -> allocate a new buffer and copy the contents
550 U_FALLTHROUGH;
551 case kWritableAlias: {
552 // src is a writable alias; we make a copy of that instead
553 int32_t srcLength = src.length();
554 if(allocate(srcLength)) {
555 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
556 setLength(srcLength);
557 break;
558 }
559 // if there is not enough memory, then fall through to setting to bogus
560 U_FALLTHROUGH;
561 }
562 default:
563 // if src is bogus, set ourselves to bogus
564 // do not call setToBogus() here because fArray and flags are not consistent here
565 fUnion.fFields.fLengthAndFlags = kIsBogus;
566 fUnion.fFields.fArray = 0;
567 fUnion.fFields.fCapacity = 0;
568 break;
569 }
570
571 return *this;
572}
573
574UnicodeString &UnicodeString::operator=(UnicodeString &&src) U_NOEXCEPT {
575 // No explicit check for self move assignment, consistent with standard library.
576 // Self move assignment causes no crash nor leak but might make the object bogus.
577 releaseArray();
578 copyFieldsFrom(src, TRUE);
579 return *this;
580}
581
582// Same as move assignment except without memory management.
583void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
584 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
585 if(lengthAndFlags & kUsingStackBuffer) {
586 // Short string using the stack buffer, copy the contents.
587 // Check for self assignment to prevent "overlap in memcpy" warnings,
588 // although it should be harmless to copy a buffer to itself exactly.
589 if(this != &src) {
590 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
591 getShortLength() * U_SIZEOF_UCHAR);
592 }
593 } else {
594 // In all other cases, copy all fields.
595 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
596 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
597 if(!hasShortLength()) {
598 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
599 }
600 if(setSrcToBogus) {
601 // Set src to bogus without releasing any memory.
602 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
603 src.fUnion.fFields.fArray = NULL;
604 src.fUnion.fFields.fCapacity = 0;
605 }
606 }
607}
608
609void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
610 UnicodeString temp; // Empty short string: Known not to need releaseArray().
611 // Copy fields without resetting source values in between.
612 temp.copyFieldsFrom(*this, FALSE);
613 this->copyFieldsFrom(other, FALSE);
614 other.copyFieldsFrom(temp, FALSE);
615 // Set temp to an empty string so that other's memory is not released twice.
616 temp.fUnion.fFields.fLengthAndFlags = kShortString;
617}
618
619//========================================
620// Miscellaneous operations
621//========================================
622
623UnicodeString UnicodeString::unescape() const {
624 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
625 if (result.isBogus()) {
626 return result;
627 }
628 const UChar *array = getBuffer();
629 int32_t len = length();
630 int32_t prev = 0;
631 for (int32_t i=0;;) {
632 if (i == len) {
633 result.append(array, prev, len - prev);
634 break;
635 }
636 if (array[i++] == 0x5C /*'\\'*/) {
637 result.append(array, prev, (i - 1) - prev);
638 UChar32 c = unescapeAt(i); // advances i
639 if (c < 0) {
640 result.remove(); // return empty string
641 break; // invalid escape sequence
642 }
643 result.append(c);
644 prev = i;
645 }
646 }
647 return result;
648}
649
650UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
651 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
652}
653
654//========================================
655// Read-only implementation
656//========================================
657UBool
658UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
659 // Requires: this & text not bogus and have same lengths.
660 // Byte-wise comparison works for equality regardless of endianness.
661 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
662}
663
664int8_t
665UnicodeString::doCompare( int32_t start,
666 int32_t length,
667 const UChar *srcChars,
668 int32_t srcStart,
669 int32_t srcLength) const
670{
671 // compare illegal string values
672 if(isBogus()) {
673 return -1;
674 }
675
676 // pin indices to legal values
677 pinIndices(start, length);
678
679 if(srcChars == NULL) {
680 // treat const UChar *srcChars==NULL as an empty string
681 return length == 0 ? 0 : 1;
682 }
683
684 // get the correct pointer
685 const UChar *chars = getArrayStart();
686
687 chars += start;
688 srcChars += srcStart;
689
690 int32_t minLength;
691 int8_t lengthResult;
692
693 // get the srcLength if necessary
694 if(srcLength < 0) {
695 srcLength = u_strlen(srcChars + srcStart);
696 }
697
698 // are we comparing different lengths?
699 if(length != srcLength) {
700 if(length < srcLength) {
701 minLength = length;
702 lengthResult = -1;
703 } else {
704 minLength = srcLength;
705 lengthResult = 1;
706 }
707 } else {
708 minLength = length;
709 lengthResult = 0;
710 }
711
712 /*
713 * note that uprv_memcmp() returns an int but we return an int8_t;
714 * we need to take care not to truncate the result -
715 * one way to do this is to right-shift the value to
716 * move the sign bit into the lower 8 bits and making sure that this
717 * does not become 0 itself
718 */
719
720 if(minLength > 0 && chars != srcChars) {
721 int32_t result;
722
723# if U_IS_BIG_ENDIAN
724 // big-endian: byte comparison works
725 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
726 if(result != 0) {
727 return (int8_t)(result >> 15 | 1);
728 }
729# else
730 // little-endian: compare UChar units
731 do {
732 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
733 if(result != 0) {
734 return (int8_t)(result >> 15 | 1);
735 }
736 } while(--minLength > 0);
737# endif
738 }
739 return lengthResult;
740}
741
742/* String compare in code point order - doCompare() compares in code unit order. */
743int8_t
744UnicodeString::doCompareCodePointOrder(int32_t start,
745 int32_t length,
746 const UChar *srcChars,
747 int32_t srcStart,
748 int32_t srcLength) const
749{
750 // compare illegal string values
751 // treat const UChar *srcChars==NULL as an empty string
752 if(isBogus()) {
753 return -1;
754 }
755
756 // pin indices to legal values
757 pinIndices(start, length);
758
759 if(srcChars == NULL) {
760 srcStart = srcLength = 0;
761 }
762
763 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
764 /* translate the 32-bit result into an 8-bit one */
765 if(diff!=0) {
766 return (int8_t)(diff >> 15 | 1);
767 } else {
768 return 0;
769 }
770}
771
772int32_t
773UnicodeString::getLength() const {
774 return length();
775}
776
777UChar
778UnicodeString::getCharAt(int32_t offset) const {
779 return charAt(offset);
780}
781
782UChar32
783UnicodeString::getChar32At(int32_t offset) const {
784 return char32At(offset);
785}
786
787UChar32
788UnicodeString::char32At(int32_t offset) const
789{
790 int32_t len = length();
791 if((uint32_t)offset < (uint32_t)len) {
792 const UChar *array = getArrayStart();
793 UChar32 c;
794 U16_GET(array, 0, offset, len, c);
795 return c;
796 } else {
797 return kInvalidUChar;
798 }
799}
800
801int32_t
802UnicodeString::getChar32Start(int32_t offset) const {
803 if((uint32_t)offset < (uint32_t)length()) {
804 const UChar *array = getArrayStart();
805 U16_SET_CP_START(array, 0, offset);
806 return offset;
807 } else {
808 return 0;
809 }
810}
811
812int32_t
813UnicodeString::getChar32Limit(int32_t offset) const {
814 int32_t len = length();
815 if((uint32_t)offset < (uint32_t)len) {
816 const UChar *array = getArrayStart();
817 U16_SET_CP_LIMIT(array, 0, offset, len);
818 return offset;
819 } else {
820 return len;
821 }
822}
823
824int32_t
825UnicodeString::countChar32(int32_t start, int32_t length) const {
826 pinIndices(start, length);
827 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
828 return u_countChar32(getArrayStart()+start, length);
829}
830
831UBool
832UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
833 pinIndices(start, length);
834 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
835 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
836}
837
838int32_t
839UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
840 // pin index
841 int32_t len = length();
842 if(index<0) {
843 index=0;
844 } else if(index>len) {
845 index=len;
846 }
847
848 const UChar *array = getArrayStart();
849 if(delta>0) {
850 U16_FWD_N(array, index, len, delta);
851 } else {
852 U16_BACK_N(array, 0, index, -delta);
853 }
854
855 return index;
856}
857
858void
859UnicodeString::doExtract(int32_t start,
860 int32_t length,
861 UChar *dst,
862 int32_t dstStart) const
863{
864 // pin indices to legal values
865 pinIndices(start, length);
866
867 // do not copy anything if we alias dst itself
868 const UChar *array = getArrayStart();
869 if(array + start != dst + dstStart) {
870 us_arrayCopy(array, start, dst, dstStart, length);
871 }
872}
873
874int32_t
875UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
876 UErrorCode &errorCode) const {
877 int32_t len = length();
878 if(U_SUCCESS(errorCode)) {
879 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
880 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
881 } else {
882 const UChar *array = getArrayStart();
883 if(len>0 && len<=destCapacity && array!=dest) {
884 u_memcpy(dest, array, len);
885 }
886 return u_terminateUChars(dest, destCapacity, len, &errorCode);
887 }
888 }
889
890 return len;
891}
892
893int32_t
894UnicodeString::extract(int32_t start,
895 int32_t length,
896 char *target,
897 int32_t targetCapacity,
898 enum EInvariant) const
899{
900 // if the arguments are illegal, then do nothing
901 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
902 return 0;
903 }
904
905 // pin the indices to legal values
906 pinIndices(start, length);
907
908 if(length <= targetCapacity) {
909 u_UCharsToChars(getArrayStart() + start, target, length);
910 }
911 UErrorCode status = U_ZERO_ERROR;
912 return u_terminateChars(target, targetCapacity, length, &status);
913}
914
915UnicodeString
916UnicodeString::tempSubString(int32_t start, int32_t len) const {
917 pinIndices(start, len);
918 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
919 if(array==NULL) {
920 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
921 len=-2; // bogus result string
922 }
923 return UnicodeString(FALSE, array + start, len);
924}
925
926int32_t
927UnicodeString::toUTF8(int32_t start, int32_t len,
928 char *target, int32_t capacity) const {
929 pinIndices(start, len);
930 int32_t length8;
931 UErrorCode errorCode = U_ZERO_ERROR;
932 u_strToUTF8WithSub(target, capacity, &length8,
933 getBuffer() + start, len,
934 0xFFFD, // Standard substitution character.
935 NULL, // Don't care about number of substitutions.
936 &errorCode);
937 return length8;
938}
939
940#if U_CHARSET_IS_UTF8
941
942int32_t
943UnicodeString::extract(int32_t start, int32_t len,
944 char *target, uint32_t dstSize) const {
945 // if the arguments are illegal, then do nothing
946 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
947 return 0;
948 }
949 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
950}
951
952// else see unistr_cnv.cpp
953#endif
954
955void
956UnicodeString::extractBetween(int32_t start,
957 int32_t limit,
958 UnicodeString& target) const {
959 pinIndex(start);
960 pinIndex(limit);
961 doExtract(start, limit - start, target);
962}
963
964// When converting from UTF-16 to UTF-8, the result will have at most 3 times
965// as many bytes as the source has UChars.
966// The "worst cases" are writing systems like Indic, Thai and CJK with
967// 3:1 bytes:UChars.
968void
969UnicodeString::toUTF8(ByteSink &sink) const {
970 int32_t length16 = length();
971 if(length16 != 0) {
972 char stackBuffer[1024];
973 int32_t capacity = (int32_t)sizeof(stackBuffer);
974 UBool utf8IsOwned = FALSE;
975 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
976 3*length16,
977 stackBuffer, capacity,
978 &capacity);
979 int32_t length8 = 0;
980 UErrorCode errorCode = U_ZERO_ERROR;
981 u_strToUTF8WithSub(utf8, capacity, &length8,
982 getBuffer(), length16,
983 0xFFFD, // Standard substitution character.
984 NULL, // Don't care about number of substitutions.
985 &errorCode);
986 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
987 utf8 = (char *)uprv_malloc(length8);
988 if(utf8 != NULL) {
989 utf8IsOwned = TRUE;
990 errorCode = U_ZERO_ERROR;
991 u_strToUTF8WithSub(utf8, length8, &length8,
992 getBuffer(), length16,
993 0xFFFD, // Standard substitution character.
994 NULL, // Don't care about number of substitutions.
995 &errorCode);
996 } else {
997 errorCode = U_MEMORY_ALLOCATION_ERROR;
998 }
999 }
1000 if(U_SUCCESS(errorCode)) {
1001 sink.Append(utf8, length8);
1002 sink.Flush();
1003 }
1004 if(utf8IsOwned) {
1005 uprv_free(utf8);
1006 }
1007 }
1008}
1009
1010int32_t
1011UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1012 int32_t length32=0;
1013 if(U_SUCCESS(errorCode)) {
1014 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1015 u_strToUTF32WithSub(utf32, capacity, &length32,
1016 getBuffer(), length(),
1017 0xfffd, // Substitution character.
1018 NULL, // Don't care about number of substitutions.
1019 &errorCode);
1020 }
1021 return length32;
1022}
1023
1024int32_t
1025UnicodeString::indexOf(const UChar *srcChars,
1026 int32_t srcStart,
1027 int32_t srcLength,
1028 int32_t start,
1029 int32_t length) const
1030{
1031 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1032 return -1;
1033 }
1034
1035 // UnicodeString does not find empty substrings
1036 if(srcLength < 0 && srcChars[srcStart] == 0) {
1037 return -1;
1038 }
1039
1040 // get the indices within bounds
1041 pinIndices(start, length);
1042
1043 // find the first occurrence of the substring
1044 const UChar *array = getArrayStart();
1045 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1046 if(match == NULL) {
1047 return -1;
1048 } else {
1049 return (int32_t)(match - array);
1050 }
1051}
1052
1053int32_t
1054UnicodeString::doIndexOf(UChar c,
1055 int32_t start,
1056 int32_t length) const
1057{
1058 // pin indices
1059 pinIndices(start, length);
1060
1061 // find the first occurrence of c
1062 const UChar *array = getArrayStart();
1063 const UChar *match = u_memchr(array + start, c, length);
1064 if(match == NULL) {
1065 return -1;
1066 } else {
1067 return (int32_t)(match - array);
1068 }
1069}
1070
1071int32_t
1072UnicodeString::doIndexOf(UChar32 c,
1073 int32_t start,
1074 int32_t length) const {
1075 // pin indices
1076 pinIndices(start, length);
1077
1078 // find the first occurrence of c
1079 const UChar *array = getArrayStart();
1080 const UChar *match = u_memchr32(array + start, c, length);
1081 if(match == NULL) {
1082 return -1;
1083 } else {
1084 return (int32_t)(match - array);
1085 }
1086}
1087
1088int32_t
1089UnicodeString::lastIndexOf(const UChar *srcChars,
1090 int32_t srcStart,
1091 int32_t srcLength,
1092 int32_t start,
1093 int32_t length) const
1094{
1095 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1096 return -1;
1097 }
1098
1099 // UnicodeString does not find empty substrings
1100 if(srcLength < 0 && srcChars[srcStart] == 0) {
1101 return -1;
1102 }
1103
1104 // get the indices within bounds
1105 pinIndices(start, length);
1106
1107 // find the last occurrence of the substring
1108 const UChar *array = getArrayStart();
1109 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1110 if(match == NULL) {
1111 return -1;
1112 } else {
1113 return (int32_t)(match - array);
1114 }
1115}
1116
1117int32_t
1118UnicodeString::doLastIndexOf(UChar c,
1119 int32_t start,
1120 int32_t length) const
1121{
1122 if(isBogus()) {
1123 return -1;
1124 }
1125
1126 // pin indices
1127 pinIndices(start, length);
1128
1129 // find the last occurrence of c
1130 const UChar *array = getArrayStart();
1131 const UChar *match = u_memrchr(array + start, c, length);
1132 if(match == NULL) {
1133 return -1;
1134 } else {
1135 return (int32_t)(match - array);
1136 }
1137}
1138
1139int32_t
1140UnicodeString::doLastIndexOf(UChar32 c,
1141 int32_t start,
1142 int32_t length) const {
1143 // pin indices
1144 pinIndices(start, length);
1145
1146 // find the last occurrence of c
1147 const UChar *array = getArrayStart();
1148 const UChar *match = u_memrchr32(array + start, c, length);
1149 if(match == NULL) {
1150 return -1;
1151 } else {
1152 return (int32_t)(match - array);
1153 }
1154}
1155
1156//========================================
1157// Write implementation
1158//========================================
1159
1160UnicodeString&
1161UnicodeString::findAndReplace(int32_t start,
1162 int32_t length,
1163 const UnicodeString& oldText,
1164 int32_t oldStart,
1165 int32_t oldLength,
1166 const UnicodeString& newText,
1167 int32_t newStart,
1168 int32_t newLength)
1169{
1170 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1171 return *this;
1172 }
1173
1174 pinIndices(start, length);
1175 oldText.pinIndices(oldStart, oldLength);
1176 newText.pinIndices(newStart, newLength);
1177
1178 if(oldLength == 0) {
1179 return *this;
1180 }
1181
1182 while(length > 0 && length >= oldLength) {
1183 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1184 if(pos < 0) {
1185 // no more oldText's here: done
1186 break;
1187 } else {
1188 // we found oldText, replace it by newText and go beyond it
1189 replace(pos, oldLength, newText, newStart, newLength);
1190 length -= pos + oldLength - start;
1191 start = pos + newLength;
1192 }
1193 }
1194
1195 return *this;
1196}
1197
1198
1199void
1200UnicodeString::setToBogus()
1201{
1202 releaseArray();
1203
1204 fUnion.fFields.fLengthAndFlags = kIsBogus;
1205 fUnion.fFields.fArray = 0;
1206 fUnion.fFields.fCapacity = 0;
1207}
1208
1209// turn a bogus string into an empty one
1210void
1211UnicodeString::unBogus() {
1212 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1213 setToEmpty();
1214 }
1215}
1216
1217const char16_t *
1218UnicodeString::getTerminatedBuffer() {
1219 if(!isWritable()) {
1220 return nullptr;
1221 }
1222 UChar *array = getArrayStart();
1223 int32_t len = length();
1224 if(len < getCapacity()) {
1225 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1226 // If len<capacity on a read-only alias, then array[len] is
1227 // either the original NUL (if constructed with (TRUE, s, length))
1228 // or one of the original string contents characters (if later truncated),
1229 // therefore we can assume that array[len] is initialized memory.
1230 if(array[len] == 0) {
1231 return array;
1232 }
1233 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1234 // kRefCounted: Do not write the NUL if the buffer is shared.
1235 // That is mostly safe, except when the length of one copy was modified
1236 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1237 // Then the NUL would be written into the middle of another copy's string.
1238
1239 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1240 // Do not test if there is a NUL already because it might be uninitialized memory.
1241 // (That would be safe, but tools like valgrind & Purify would complain.)
1242 array[len] = 0;
1243 return array;
1244 }
1245 }
1246 if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1247 array = getArrayStart();
1248 array[len] = 0;
1249 return array;
1250 } else {
1251 return nullptr;
1252 }
1253}
1254
1255// setTo() analogous to the readonly-aliasing constructor with the same signature
1256UnicodeString &
1257UnicodeString::setTo(UBool isTerminated,
1258 ConstChar16Ptr textPtr,
1259 int32_t textLength)
1260{
1261 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1262 // do not modify a string that has an "open" getBuffer(minCapacity)
1263 return *this;
1264 }
1265
1266 const UChar *text = textPtr;
1267 if(text == NULL) {
1268 // treat as an empty string, do not alias
1269 releaseArray();
1270 setToEmpty();
1271 return *this;
1272 }
1273
1274 if( textLength < -1 ||
1275 (textLength == -1 && !isTerminated) ||
1276 (textLength >= 0 && isTerminated && text[textLength] != 0)
1277 ) {
1278 setToBogus();
1279 return *this;
1280 }
1281
1282 releaseArray();
1283
1284 if(textLength == -1) {
1285 // text is terminated, or else it would have failed the above test
1286 textLength = u_strlen(text);
1287 }
1288 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1289 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1290 return *this;
1291}
1292
1293// setTo() analogous to the writable-aliasing constructor with the same signature
1294UnicodeString &
1295UnicodeString::setTo(UChar *buffer,
1296 int32_t buffLength,
1297 int32_t buffCapacity) {
1298 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1299 // do not modify a string that has an "open" getBuffer(minCapacity)
1300 return *this;
1301 }
1302
1303 if(buffer == NULL) {
1304 // treat as an empty string, do not alias
1305 releaseArray();
1306 setToEmpty();
1307 return *this;
1308 }
1309
1310 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1311 setToBogus();
1312 return *this;
1313 } else if(buffLength == -1) {
1314 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1315 const UChar *p = buffer, *limit = buffer + buffCapacity;
1316 while(p != limit && *p != 0) {
1317 ++p;
1318 }
1319 buffLength = (int32_t)(p - buffer);
1320 }
1321
1322 releaseArray();
1323
1324 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1325 setArray(buffer, buffLength, buffCapacity);
1326 return *this;
1327}
1328
1329UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1330 unBogus();
1331 int32_t length = utf8.length();
1332 int32_t capacity;
1333 // The UTF-16 string will be at most as long as the UTF-8 string.
1334 if(length <= US_STACKBUF_SIZE) {
1335 capacity = US_STACKBUF_SIZE;
1336 } else {
1337 capacity = length + 1; // +1 for the terminating NUL.
1338 }
1339 UChar *utf16 = getBuffer(capacity);
1340 int32_t length16;
1341 UErrorCode errorCode = U_ZERO_ERROR;
1342 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1343 utf8.data(), length,
1344 0xfffd, // Substitution character.
1345 NULL, // Don't care about number of substitutions.
1346 &errorCode);
1347 releaseBuffer(length16);
1348 if(U_FAILURE(errorCode)) {
1349 setToBogus();
1350 }
1351 return *this;
1352}
1353
1354UnicodeString&
1355UnicodeString::setCharAt(int32_t offset,
1356 UChar c)
1357{
1358 int32_t len = length();
1359 if(cloneArrayIfNeeded() && len > 0) {
1360 if(offset < 0) {
1361 offset = 0;
1362 } else if(offset >= len) {
1363 offset = len - 1;
1364 }
1365
1366 getArrayStart()[offset] = c;
1367 }
1368 return *this;
1369}
1370
1371UnicodeString&
1372UnicodeString::replace(int32_t start,
1373 int32_t _length,
1374 UChar32 srcChar) {
1375 UChar buffer[U16_MAX_LENGTH];
1376 int32_t count = 0;
1377 UBool isError = FALSE;
1378 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1379 // We test isError so that the compiler does not complain that we don't.
1380 // If isError (srcChar is not a valid code point) then count==0 which means
1381 // we remove the source segment rather than replacing it with srcChar.
1382 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1383}
1384
1385UnicodeString&
1386UnicodeString::append(UChar32 srcChar) {
1387 UChar buffer[U16_MAX_LENGTH];
1388 int32_t _length = 0;
1389 UBool isError = FALSE;
1390 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1391 // We test isError so that the compiler does not complain that we don't.
1392 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1393 return isError ? *this : doAppend(buffer, 0, _length);
1394}
1395
1396UnicodeString&
1397UnicodeString::doReplace( int32_t start,
1398 int32_t length,
1399 const UnicodeString& src,
1400 int32_t srcStart,
1401 int32_t srcLength)
1402{
1403 // pin the indices to legal values
1404 src.pinIndices(srcStart, srcLength);
1405
1406 // get the characters from src
1407 // and replace the range in ourselves with them
1408 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1409}
1410
// Replaces the range start/length of this string with srcLength code units
// read from srcChars + srcStart. A negative srcLength means that the source
// length is determined with u_strlen(); a null srcChars is treated as an
// empty source.
//
// Special cases handled before the general path:
// - A non-writable string is returned unchanged.
// - Removing a prefix or suffix from a read-only alias only adjusts the
//   array pointer/capacity or the length, without copying.
// - start == old length is forwarded to doAppend().
// - A source that overlaps this string's writable buffer is first copied
//   into a temporary UnicodeString, then the operation restarts.
//
// The string is set to bogus if the resulting length would overflow int32_t
// or if the temporary copy is bogus. If cloneArrayIfNeeded() fails, *this is
// returned as that function left it.
// Returns *this.
UnicodeString&
UnicodeString::doReplace(int32_t start,
                         int32_t length,
                         const UChar *srcChars,
                         int32_t srcStart,
                         int32_t srcLength)
{
  if(!isWritable()) {
    return *this;
  }

  int32_t oldLength = this->length();

  // optimize (read-only alias).remove(0, start) and .remove(start, end)
  if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
    if(start == 0) {
      // remove prefix by adjusting the array pointer
      pinIndex(length);
      fUnion.fFields.fArray += length;
      fUnion.fFields.fCapacity -= length;
      setLength(oldLength - length);
      return *this;
    } else {
      pinIndex(start);
      if(length >= (oldLength - start)) {
        // remove suffix by reducing the length (like truncate())
        setLength(start);
        fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
        return *this;
      }
      // otherwise the removed range is in the middle: fall through to the general path
    }
  }

  // replacing at the very end is an append; doAppend() is optimized for that
  if(start == oldLength) {
    return doAppend(srcChars, srcStart, srcLength);
  }

  if(srcChars == 0) {
    srcLength = 0;
  } else {
    // Perform all remaining operations relative to srcChars + srcStart.
    // From this point forward, do not use srcStart.
    srcChars += srcStart;
    if (srcLength < 0) {
      // get the srcLength if necessary
      srcLength = u_strlen(srcChars);
    }
  }

  // pin the indices to legal values
  pinIndices(start, length);

  // Calculate the size of the string after the replace.
  // Avoid int32_t overflow.
  int32_t newLength = oldLength - length;
  if(srcLength > (INT32_MAX - newLength)) {
    setToBogus();
    return *this;
  }
  newLength += srcLength;

  // Check for insertion into ourself
  const UChar *oldArray = getArrayStart();
  if (isBufferWritable() &&
      oldArray < srcChars + srcLength &&
      srcChars < oldArray + oldLength) {
    // Copy into a new UnicodeString and start over
    UnicodeString copy(srcChars, srcLength);
    if (copy.isBogus()) {
      setToBogus();
      return *this;
    }
    return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
  }

  // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
  // therefore we need to keep the current fArray
  UChar oldStackBuffer[US_STACKBUF_SIZE];
  if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
    // copy the stack buffer contents because it will be overwritten with
    // fUnion.fFields values
    u_memcpy(oldStackBuffer, oldArray, oldLength);
    oldArray = oldStackBuffer;
  }

  // clone our array and allocate a bigger array if needed
  // (bufferToDelete receives a released old heap buffer so that we can free it at the end)
  int32_t *bufferToDelete = 0;
  if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
                         FALSE, &bufferToDelete)
  ) {
    return *this;
  }

  // now do the replace

  UChar *newArray = getArrayStart();
  if(newArray != oldArray) {
    // if fArray changed, then we need to copy everything except what will change
    us_arrayCopy(oldArray, 0, newArray, 0, start);
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  } else if(length != srcLength) {
    // fArray did not change; copy only the portion that isn't changing, leaving a hole
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  }

  // now fill in the hole with the new string
  us_arrayCopy(srcChars, 0, newArray, start, srcLength);

  setLength(newLength);

  // delayed delete in case srcChars == fArray when we started, and
  // to keep oldArray alive for the above operations
  if (bufferToDelete) {
    uprv_free(bufferToDelete);
  }

  return *this;
}
1533
1534// Versions of doReplace() only for append() variants.
1535// doReplace() and doAppend() optimize for different cases.
1536
1537UnicodeString&
1538UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1539 if(srcLength == 0) {
1540 return *this;
1541 }
1542
1543 // pin the indices to legal values
1544 src.pinIndices(srcStart, srcLength);
1545 return doAppend(src.getArrayStart(), srcStart, srcLength);
1546}
1547
1548UnicodeString&
1549UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1550 if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1551 return *this;
1552 }
1553
1554 // Perform all remaining operations relative to srcChars + srcStart.
1555 // From this point forward, do not use srcStart.
1556 srcChars += srcStart;
1557
1558 if(srcLength < 0) {
1559 // get the srcLength if necessary
1560 if((srcLength = u_strlen(srcChars)) == 0) {
1561 return *this;
1562 }
1563 }
1564
1565 int32_t oldLength = length();
1566 int32_t newLength;
1567 if (uprv_add32_overflow(oldLength, srcLength, &newLength)) {
1568 setToBogus();
1569 return *this;
1570 }
1571
1572 // Check for append onto ourself
1573 const UChar* oldArray = getArrayStart();
1574 if (isBufferWritable() &&
1575 oldArray < srcChars + srcLength &&
1576 srcChars < oldArray + oldLength) {
1577 // Copy into a new UnicodeString and start over
1578 UnicodeString copy(srcChars, srcLength);
1579 if (copy.isBogus()) {
1580 setToBogus();
1581 return *this;
1582 }
1583 return doAppend(copy.getArrayStart(), 0, srcLength);
1584 }
1585
1586 // optimize append() onto a large-enough, owned string
1587 if((newLength <= getCapacity() && isBufferWritable()) ||
1588 cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1589 UChar *newArray = getArrayStart();
1590 // Do not copy characters when
1591 // UChar *buffer=str.getAppendBuffer(...);
1592 // is followed by
1593 // str.append(buffer, length);
1594 // or
1595 // str.appendString(buffer, length)
1596 // or similar.
1597 if(srcChars != newArray + oldLength) {
1598 us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
1599 }
1600 setLength(newLength);
1601 }
1602 return *this;
1603}
1604
1605/**
1606 * Replaceable API
1607 */
1608void
1609UnicodeString::handleReplaceBetween(int32_t start,
1610 int32_t limit,
1611 const UnicodeString& text) {
1612 replaceBetween(start, limit, text);
1613}
1614
1615/**
1616 * Replaceable API
1617 */
1618void
1619UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1620 if (limit <= start) {
1621 return; // Nothing to do; avoid bogus malloc call
1622 }
1623 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1624 // Check to make sure text is not null.
1625 if (text != NULL) {
1626 extractBetween(start, limit, text, 0);
1627 insert(dest, text, 0, limit - start);
1628 uprv_free(text);
1629 }
1630}
1631
1632/**
1633 * Replaceable API
1634 *
1635 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1636 * so we implement this function here.
1637 */
1638UBool Replaceable::hasMetaData() const {
1639 return TRUE;
1640}
1641
1642/**
1643 * Replaceable API
1644 */
1645UBool UnicodeString::hasMetaData() const {
1646 return FALSE;
1647}
1648
1649UnicodeString&
1650UnicodeString::doReverse(int32_t start, int32_t length) {
1651 if(length <= 1 || !cloneArrayIfNeeded()) {
1652 return *this;
1653 }
1654
1655 // pin the indices to legal values
1656 pinIndices(start, length);
1657 if(length <= 1) { // pinIndices() might have shrunk the length
1658 return *this;
1659 }
1660
1661 UChar *left = getArrayStart() + start;
1662 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1663 UChar swap;
1664 UBool hasSupplementary = FALSE;
1665
1666 // Before the loop we know left<right because length>=2.
1667 do {
1668 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1669 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1670 *right-- = swap;
1671 } while(left < right);
1672 // Make sure to test the middle code unit of an odd-length string.
1673 // Redundant if the length is even.
1674 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1675
1676 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1677 if(hasSupplementary) {
1678 UChar swap2;
1679
1680 left = getArrayStart() + start;
1681 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1682 while(left < right) {
1683 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1684 *left++ = swap2;
1685 *left++ = swap;
1686 } else {
1687 ++left;
1688 }
1689 }
1690 }
1691
1692 return *this;
1693}
1694
1695UBool
1696UnicodeString::padLeading(int32_t targetLength,
1697 UChar padChar)
1698{
1699 int32_t oldLength = length();
1700 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1701 return FALSE;
1702 } else {
1703 // move contents up by padding width
1704 UChar *array = getArrayStart();
1705 int32_t start = targetLength - oldLength;
1706 us_arrayCopy(array, 0, array, start, oldLength);
1707
1708 // fill in padding character
1709 while(--start >= 0) {
1710 array[start] = padChar;
1711 }
1712 setLength(targetLength);
1713 return TRUE;
1714 }
1715}
1716
1717UBool
1718UnicodeString::padTrailing(int32_t targetLength,
1719 UChar padChar)
1720{
1721 int32_t oldLength = length();
1722 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1723 return FALSE;
1724 } else {
1725 // fill in padding character
1726 UChar *array = getArrayStart();
1727 int32_t length = targetLength;
1728 while(--length >= oldLength) {
1729 array[length] = padChar;
1730 }
1731 setLength(targetLength);
1732 return TRUE;
1733 }
1734}
1735
1736//========================================
1737// Hashing
1738//========================================
1739int32_t
1740UnicodeString::doHashCode() const
1741{
1742 /* Delegate hash computation to uhash. This makes UnicodeString
1743 * hashing consistent with UChar* hashing. */
1744 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1745 if (hashCode == kInvalidHashCode) {
1746 hashCode = kEmptyHashCode;
1747 }
1748 return hashCode;
1749}
1750
1751//========================================
1752// External Buffer
1753//========================================
1754
1755char16_t *
1756UnicodeString::getBuffer(int32_t minCapacity) {
1757 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1758 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1759 setZeroLength();
1760 return getArrayStart();
1761 } else {
1762 return nullptr;
1763 }
1764}
1765
1766void
1767UnicodeString::releaseBuffer(int32_t newLength) {
1768 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1769 // set the new fLength
1770 int32_t capacity=getCapacity();
1771 if(newLength==-1) {
1772 // the new length is the string length, capped by fCapacity
1773 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1774 while(p<limit && *p!=0) {
1775 ++p;
1776 }
1777 newLength=(int32_t)(p-array);
1778 } else if(newLength>capacity) {
1779 newLength=capacity;
1780 }
1781 setLength(newLength);
1782 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1783 }
1784}
1785
1786//========================================
1787// Miscellaneous
1788//========================================
// Makes sure that this string has a buffer that it may write to and that
// holds at least newCapacity units, allocating a new array if necessary.
//
// Parameters:
//   newCapacity     - required capacity; -1 means the current capacity.
//   growCapacity    - capacity to try to allocate first; a negative value
//                     means newCapacity. It is reduced to US_STACKBUF_SIZE
//                     when newCapacity itself fits into the stack buffer.
//   doCopyArray     - if TRUE, the old contents are copied into a newly
//                     allocated array; if FALSE, the length is reset to zero
//                     after a new allocation.
//   pBufferToDelete - if not null and the old refcounted array's count drops
//                     to zero, the old allocation is stored here for the
//                     caller to free instead of being freed right away.
//   forceClone      - if TRUE, a new array is allocated unconditionally.
//
// Returns TRUE on success. Returns FALSE if the string is not writable, or
// if allocation fails; in the latter case the string is set to bogus.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
     (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    int32_t oldLength = length();
    int16_t flags = fUnion.fFields.fLengthAndFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = NULL; // no need to copy from the stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the larger growCapacity first, then fall back to the required newCapacity)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = oldLength;
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
        }
        if(oldArray != NULL) {
          us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
        }
        setLength(minLength);
      } else {
        setZeroLength();
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is stored immediately before the array)
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
            // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
            // is defined as volatile. (Volatile has useful non-standard behavior
            // with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fUnion.fFields.fLengthAndFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1897
1898// UnicodeStringAppendable ------------------------------------------------- ***
1899
1900UnicodeStringAppendable::~UnicodeStringAppendable() {}
1901
1902UBool
1903UnicodeStringAppendable::appendCodeUnit(UChar c) {
1904 return str.doAppend(&c, 0, 1).isWritable();
1905}
1906
1907UBool
1908UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1909 UChar buffer[U16_MAX_LENGTH];
1910 int32_t cLength = 0;
1911 UBool isError = FALSE;
1912 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1913 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1914}
1915
1916UBool
1917UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1918 return str.doAppend(s, 0, length).isWritable();
1919}
1920
1921UBool
1922UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1923 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1924}
1925
1926UChar *
1927UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1928 int32_t desiredCapacityHint,
1929 UChar *scratch, int32_t scratchCapacity,
1930 int32_t *resultCapacity) {
1931 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1932 *resultCapacity = 0;
1933 return NULL;
1934 }
1935 int32_t oldLength = str.length();
1936 if(minCapacity <= (kMaxCapacity - oldLength) &&
1937 desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1938 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1939 *resultCapacity = str.getCapacity() - oldLength;
1940 return str.getArrayStart() + oldLength;
1941 }
1942 *resultCapacity = scratchCapacity;
1943 return scratch;
1944}
1945
1946U_NAMESPACE_END
1947
1948U_NAMESPACE_USE
1949
1950U_CAPI int32_t U_EXPORT2
1951uhash_hashUnicodeString(const UElement key) {
1952 const UnicodeString *str = (const UnicodeString*) key.pointer;
1953 return (str == NULL) ? 0 : str->hashCode();
1954}
1955
1956// Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1957// does not depend on hashtable code.
1958U_CAPI UBool U_EXPORT2
1959uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1960 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1961 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1962 if (str1 == str2) {
1963 return TRUE;
1964 }
1965 if (str1 == NULL || str2 == NULL) {
1966 return FALSE;
1967 }
1968 return *str1 == *str2;
1969}
1970
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. It exists only so that the
virtual vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file.
This keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif
1983