1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6* Copyright (C) 1998-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10*
11* File ustring.cpp
12*
13* Modification History:
14*
15* Date Name Description
16* 12/07/98 bertrand Creation.
17******************************************************************************
18*/
19
20#include "unicode/utypes.h"
21#include "unicode/putil.h"
22#include "unicode/uchar.h"
23#include "unicode/ustring.h"
24#include "unicode/utf16.h"
25#include "cstring.h"
26#include "cwchar.h"
27#include "cmemory.h"
28#include "ustr_imp.h"
29
30/* ANSI string.h - style functions ------------------------------------------ */
31
/*
 * U+ffff is the highest BMP code point, the highest one that fits into a 16-bit char16_t.
 * The *32 search functions in this file use it to decide whether a code point
 * can be searched for as a single code unit.
 */
#define U_BMP_MAX 0xffff
34
35/* Forward binary string search functions ----------------------------------- */
36
37/*
38 * Test if a substring match inside a string is at code point boundaries.
39 * All pointers refer to the same buffer.
40 * The limit pointer may be nullptr, all others must be real pointers.
41 */
42static inline UBool
43isMatchAtCPBoundary(const char16_t *start, const char16_t *match, const char16_t *matchLimit, const char16_t *limit) {
44 if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
45 /* the leading edge of the match is in the middle of a surrogate pair */
46 return false;
47 }
48 if(U16_IS_LEAD(*(matchLimit-1)) && matchLimit!=limit && U16_IS_TRAIL(*matchLimit)) {
49 /* the trailing edge of the match is in the middle of a surrogate pair */
50 return false;
51 }
52 return true;
53}
54
55U_CAPI char16_t * U_EXPORT2
56u_strFindFirst(const char16_t *s, int32_t length,
57 const char16_t *sub, int32_t subLength) {
58 const char16_t *start, *p, *q, *subLimit;
59 char16_t c, cs, cq;
60
61 if(sub==nullptr || subLength<-1) {
62 return (char16_t *)s;
63 }
64 if(s==nullptr || length<-1) {
65 return nullptr;
66 }
67
68 start=s;
69
70 if(length<0 && subLength<0) {
71 /* both strings are NUL-terminated */
72 if((cs=*sub++)==0) {
73 return (char16_t *)s;
74 }
75 if(*sub==0 && !U16_IS_SURROGATE(cs)) {
76 /* the substring consists of a single, non-surrogate BMP code point */
77 return u_strchr(s, cs);
78 }
79
80 while((c=*s++)!=0) {
81 if(c==cs) {
82 /* found first substring char16_t, compare rest */
83 p=s;
84 q=sub;
85 for(;;) {
86 if((cq=*q)==0) {
87 if(isMatchAtCPBoundary(start, s-1, p, nullptr)) {
88 return (char16_t *)(s-1); /* well-formed match */
89 } else {
90 break; /* no match because surrogate pair is split */
91 }
92 }
93 if((c=*p)==0) {
94 return nullptr; /* no match, and none possible after s */
95 }
96 if(c!=cq) {
97 break; /* no match */
98 }
99 ++p;
100 ++q;
101 }
102 }
103 }
104
105 /* not found */
106 return nullptr;
107 }
108
109 if(subLength<0) {
110 subLength=u_strlen(sub);
111 }
112 if(subLength==0) {
113 return (char16_t *)s;
114 }
115
116 /* get sub[0] to search for it fast */
117 cs=*sub++;
118 --subLength;
119 subLimit=sub+subLength;
120
121 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
122 /* the substring consists of a single, non-surrogate BMP code point */
123 return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
124 }
125
126 if(length<0) {
127 /* s is NUL-terminated */
128 while((c=*s++)!=0) {
129 if(c==cs) {
130 /* found first substring char16_t, compare rest */
131 p=s;
132 q=sub;
133 for(;;) {
134 if(q==subLimit) {
135 if(isMatchAtCPBoundary(start, s-1, p, nullptr)) {
136 return (char16_t *)(s-1); /* well-formed match */
137 } else {
138 break; /* no match because surrogate pair is split */
139 }
140 }
141 if((c=*p)==0) {
142 return nullptr; /* no match, and none possible after s */
143 }
144 if(c!=*q) {
145 break; /* no match */
146 }
147 ++p;
148 ++q;
149 }
150 }
151 }
152 } else {
153 const char16_t *limit, *preLimit;
154
155 /* subLength was decremented above */
156 if(length<=subLength) {
157 return nullptr; /* s is shorter than sub */
158 }
159
160 limit=s+length;
161
162 /* the substring must start before preLimit */
163 preLimit=limit-subLength;
164
165 while(s!=preLimit) {
166 c=*s++;
167 if(c==cs) {
168 /* found first substring char16_t, compare rest */
169 p=s;
170 q=sub;
171 for(;;) {
172 if(q==subLimit) {
173 if(isMatchAtCPBoundary(start, s-1, p, limit)) {
174 return (char16_t *)(s-1); /* well-formed match */
175 } else {
176 break; /* no match because surrogate pair is split */
177 }
178 }
179 if(*p!=*q) {
180 break; /* no match */
181 }
182 ++p;
183 ++q;
184 }
185 }
186 }
187 }
188
189 /* not found */
190 return nullptr;
191}
192
193U_CAPI char16_t * U_EXPORT2
194u_strstr(const char16_t *s, const char16_t *substring) {
195 return u_strFindFirst(s, -1, substring, -1);
196}
197
198U_CAPI char16_t * U_EXPORT2
199u_strchr(const char16_t *s, char16_t c) {
200 if(U16_IS_SURROGATE(c)) {
201 /* make sure to not find half of a surrogate pair */
202 return u_strFindFirst(s, -1, &c, 1);
203 } else {
204 char16_t cs;
205
206 /* trivial search for a BMP code point */
207 for(;;) {
208 if((cs=*s)==c) {
209 return (char16_t *)s;
210 }
211 if(cs==0) {
212 return nullptr;
213 }
214 ++s;
215 }
216 }
217}
218
219U_CAPI char16_t * U_EXPORT2
220u_strchr32(const char16_t *s, UChar32 c) {
221 if((uint32_t)c<=U_BMP_MAX) {
222 /* find BMP code point */
223 return u_strchr(s, (char16_t)c);
224 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
225 /* find supplementary code point as surrogate pair */
226 char16_t cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
227
228 while((cs=*s++)!=0) {
229 if(cs==lead && *s==trail) {
230 return (char16_t *)(s-1);
231 }
232 }
233 return nullptr;
234 } else {
235 /* not a Unicode code point, not findable */
236 return nullptr;
237 }
238}
239
240U_CAPI char16_t * U_EXPORT2
241u_memchr(const char16_t *s, char16_t c, int32_t count) {
242 if(count<=0) {
243 return nullptr; /* no string */
244 } else if(U16_IS_SURROGATE(c)) {
245 /* make sure to not find half of a surrogate pair */
246 return u_strFindFirst(s, count, &c, 1);
247 } else {
248 /* trivial search for a BMP code point */
249 const char16_t *limit=s+count;
250 do {
251 if(*s==c) {
252 return (char16_t *)s;
253 }
254 } while(++s!=limit);
255 return nullptr;
256 }
257}
258
259U_CAPI char16_t * U_EXPORT2
260u_memchr32(const char16_t *s, UChar32 c, int32_t count) {
261 if((uint32_t)c<=U_BMP_MAX) {
262 /* find BMP code point */
263 return u_memchr(s, (char16_t)c, count);
264 } else if(count<2) {
265 /* too short for a surrogate pair */
266 return nullptr;
267 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
268 /* find supplementary code point as surrogate pair */
269 const char16_t *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
270 char16_t lead=U16_LEAD(c), trail=U16_TRAIL(c);
271
272 do {
273 if(*s==lead && *(s+1)==trail) {
274 return (char16_t *)s;
275 }
276 } while(++s!=limit);
277 return nullptr;
278 } else {
279 /* not a Unicode code point, not findable */
280 return nullptr;
281 }
282}
283
284/* Backward binary string search functions ---------------------------------- */
285
286U_CAPI char16_t * U_EXPORT2
287u_strFindLast(const char16_t *s, int32_t length,
288 const char16_t *sub, int32_t subLength) {
289 const char16_t *start, *limit, *p, *q, *subLimit;
290 char16_t c, cs;
291
292 if(sub==nullptr || subLength<-1) {
293 return (char16_t *)s;
294 }
295 if(s==nullptr || length<-1) {
296 return nullptr;
297 }
298
299 /*
300 * This implementation is more lazy than the one for u_strFindFirst():
301 * There is no special search code for NUL-terminated strings.
302 * It does not seem to be worth it for searching substrings to
303 * search forward and find all matches like in u_strrchr() and similar.
304 * Therefore, we simply get both string lengths and search backward.
305 *
306 * markus 2002oct23
307 */
308
309 if(subLength<0) {
310 subLength=u_strlen(sub);
311 }
312 if(subLength==0) {
313 return (char16_t *)s;
314 }
315
316 /* get sub[subLength-1] to search for it fast */
317 subLimit=sub+subLength;
318 cs=*(--subLimit);
319 --subLength;
320
321 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
322 /* the substring consists of a single, non-surrogate BMP code point */
323 return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
324 }
325
326 if(length<0) {
327 length=u_strlen(s);
328 }
329
330 /* subLength was decremented above */
331 if(length<=subLength) {
332 return nullptr; /* s is shorter than sub */
333 }
334
335 start=s;
336 limit=s+length;
337
338 /* the substring must start no later than s+subLength */
339 s+=subLength;
340
341 while(s!=limit) {
342 c=*(--limit);
343 if(c==cs) {
344 /* found last substring char16_t, compare rest */
345 p=limit;
346 q=subLimit;
347 for(;;) {
348 if(q==sub) {
349 if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
350 return (char16_t *)p; /* well-formed match */
351 } else {
352 break; /* no match because surrogate pair is split */
353 }
354 }
355 if(*(--p)!=*(--q)) {
356 break; /* no match */
357 }
358 }
359 }
360 }
361
362 /* not found */
363 return nullptr;
364}
365
366U_CAPI char16_t * U_EXPORT2
367u_strrstr(const char16_t *s, const char16_t *substring) {
368 return u_strFindLast(s, -1, substring, -1);
369}
370
371U_CAPI char16_t * U_EXPORT2
372u_strrchr(const char16_t *s, char16_t c) {
373 if(U16_IS_SURROGATE(c)) {
374 /* make sure to not find half of a surrogate pair */
375 return u_strFindLast(s, -1, &c, 1);
376 } else {
377 const char16_t *result=nullptr;
378 char16_t cs;
379
380 /* trivial search for a BMP code point */
381 for(;;) {
382 if((cs=*s)==c) {
383 result=s;
384 }
385 if(cs==0) {
386 return (char16_t *)result;
387 }
388 ++s;
389 }
390 }
391}
392
393U_CAPI char16_t * U_EXPORT2
394u_strrchr32(const char16_t *s, UChar32 c) {
395 if((uint32_t)c<=U_BMP_MAX) {
396 /* find BMP code point */
397 return u_strrchr(s, (char16_t)c);
398 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
399 /* find supplementary code point as surrogate pair */
400 const char16_t *result=nullptr;
401 char16_t cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
402
403 while((cs=*s++)!=0) {
404 if(cs==lead && *s==trail) {
405 result=s-1;
406 }
407 }
408 return (char16_t *)result;
409 } else {
410 /* not a Unicode code point, not findable */
411 return nullptr;
412 }
413}
414
415U_CAPI char16_t * U_EXPORT2
416u_memrchr(const char16_t *s, char16_t c, int32_t count) {
417 if(count<=0) {
418 return nullptr; /* no string */
419 } else if(U16_IS_SURROGATE(c)) {
420 /* make sure to not find half of a surrogate pair */
421 return u_strFindLast(s, count, &c, 1);
422 } else {
423 /* trivial search for a BMP code point */
424 const char16_t *limit=s+count;
425 do {
426 if(*(--limit)==c) {
427 return (char16_t *)limit;
428 }
429 } while(s!=limit);
430 return nullptr;
431 }
432}
433
434U_CAPI char16_t * U_EXPORT2
435u_memrchr32(const char16_t *s, UChar32 c, int32_t count) {
436 if((uint32_t)c<=U_BMP_MAX) {
437 /* find BMP code point */
438 return u_memrchr(s, (char16_t)c, count);
439 } else if(count<2) {
440 /* too short for a surrogate pair */
441 return nullptr;
442 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
443 /* find supplementary code point as surrogate pair */
444 const char16_t *limit=s+count-1;
445 char16_t lead=U16_LEAD(c), trail=U16_TRAIL(c);
446
447 do {
448 if(*limit==trail && *(limit-1)==lead) {
449 return (char16_t *)(limit-1);
450 }
451 } while(s!=--limit);
452 return nullptr;
453 } else {
454 /* not a Unicode code point, not findable */
455 return nullptr;
456 }
457}
458
459/* Tokenization functions --------------------------------------------------- */
460
461/*
462 * Match each code point in a string against each code point in the matchSet.
463 * Return the index of the first string code point that
464 * is (polarity==true) or is not (false) contained in the matchSet.
465 * Return -(string length)-1 if there is no such code point.
466 */
467static int32_t
468_matchFromSet(const char16_t *string, const char16_t *matchSet, UBool polarity) {
469 int32_t matchLen, matchBMPLen, strItr, matchItr;
470 UChar32 stringCh, matchCh;
471 char16_t c, c2;
472
473 /* first part of matchSet contains only BMP code points */
474 matchBMPLen = 0;
475 while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
476 ++matchBMPLen;
477 }
478
479 /* second part of matchSet contains BMP and supplementary code points */
480 matchLen = matchBMPLen;
481 while(matchSet[matchLen] != 0) {
482 ++matchLen;
483 }
484
485 for(strItr = 0; (c = string[strItr]) != 0;) {
486 ++strItr;
487 if(U16_IS_SINGLE(c)) {
488 if(polarity) {
489 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
490 if(c == matchSet[matchItr]) {
491 return strItr - 1; /* one matches */
492 }
493 }
494 } else {
495 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
496 if(c == matchSet[matchItr]) {
497 goto endloop;
498 }
499 }
500 return strItr - 1; /* none matches */
501 }
502 } else {
503 /*
504 * No need to check for string length before U16_IS_TRAIL
505 * because c2 could at worst be the terminating NUL.
506 */
507 if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
508 ++strItr;
509 stringCh = U16_GET_SUPPLEMENTARY(c, c2);
510 } else {
511 stringCh = c; /* unpaired trail surrogate */
512 }
513
514 if(polarity) {
515 for(matchItr = matchBMPLen; matchItr < matchLen;) {
516 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
517 if(stringCh == matchCh) {
518 return strItr - U16_LENGTH(stringCh); /* one matches */
519 }
520 }
521 } else {
522 for(matchItr = matchBMPLen; matchItr < matchLen;) {
523 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
524 if(stringCh == matchCh) {
525 goto endloop;
526 }
527 }
528 return strItr - U16_LENGTH(stringCh); /* none matches */
529 }
530 }
531endloop:
532 /* wish C had continue with labels like Java... */;
533 }
534
535 /* Didn't find it. */
536 return -strItr-1;
537}
538
539/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
540U_CAPI char16_t * U_EXPORT2
541u_strpbrk(const char16_t *string, const char16_t *matchSet)
542{
543 int32_t idx = _matchFromSet(string, matchSet, true);
544 if(idx >= 0) {
545 return (char16_t *)string + idx;
546 } else {
547 return nullptr;
548 }
549}
550
551/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
552U_CAPI int32_t U_EXPORT2
553u_strcspn(const char16_t *string, const char16_t *matchSet)
554{
555 int32_t idx = _matchFromSet(string, matchSet, true);
556 if(idx >= 0) {
557 return idx;
558 } else {
559 return -idx - 1; /* == u_strlen(string) */
560 }
561}
562
563/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
564U_CAPI int32_t U_EXPORT2
565u_strspn(const char16_t *string, const char16_t *matchSet)
566{
567 int32_t idx = _matchFromSet(string, matchSet, false);
568 if(idx >= 0) {
569 return idx;
570 } else {
571 return -idx - 1; /* == u_strlen(string) */
572 }
573}
574
575/* ----- Text manipulation functions --- */
576
577U_CAPI char16_t* U_EXPORT2
578u_strtok_r(char16_t *src,
579 const char16_t *delim,
580 char16_t **saveState)
581{
582 char16_t *tokSource;
583 char16_t *nextToken;
584 uint32_t nonDelimIdx;
585
586 /* If saveState is nullptr, the user messed up. */
587 if (src != nullptr) {
588 tokSource = src;
589 *saveState = src; /* Set to "src" in case there are no delimiters */
590 }
591 else if (*saveState) {
592 tokSource = *saveState;
593 }
594 else {
595 /* src == nullptr && *saveState == nullptr */
596 /* This shouldn't happen. We already finished tokenizing. */
597 return nullptr;
598 }
599
600 /* Skip initial delimiters */
601 nonDelimIdx = u_strspn(tokSource, delim);
602 tokSource = &tokSource[nonDelimIdx];
603
604 if (*tokSource) {
605 nextToken = u_strpbrk(tokSource, delim);
606 if (nextToken != nullptr) {
607 /* Create a token */
608 *(nextToken++) = 0;
609 *saveState = nextToken;
610 return tokSource;
611 }
612 else if (*saveState) {
613 /* Return the last token */
614 *saveState = nullptr;
615 return tokSource;
616 }
617 }
618 else {
619 /* No tokens were found. Only delimiters were left. */
620 *saveState = nullptr;
621 }
622 return nullptr;
623}
624
625/* Miscellaneous functions -------------------------------------------------- */
626
627U_CAPI char16_t* U_EXPORT2
628u_strcat(char16_t *dst,
629 const char16_t *src)
630{
631 char16_t *anchor = dst; /* save a pointer to start of dst */
632
633 while(*dst != 0) { /* To end of first string */
634 ++dst;
635 }
636 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
637 }
638
639 return anchor;
640}
641
642U_CAPI char16_t* U_EXPORT2
643u_strncat(char16_t *dst,
644 const char16_t *src,
645 int32_t n )
646{
647 if(n > 0) {
648 char16_t *anchor = dst; /* save a pointer to start of dst */
649
650 while(*dst != 0) { /* To end of first string */
651 ++dst;
652 }
653 while((*dst = *src) != 0) { /* copy string 2 over */
654 ++dst;
655 if(--n == 0) {
656 *dst = 0;
657 break;
658 }
659 ++src;
660 }
661
662 return anchor;
663 } else {
664 return dst;
665 }
666}
667
668/* ----- Text property functions --- */
669
670U_CAPI int32_t U_EXPORT2
671u_strcmp(const char16_t *s1,
672 const char16_t *s2)
673{
674 char16_t c1, c2;
675
676 for(;;) {
677 c1=*s1++;
678 c2=*s2++;
679 if (c1 != c2 || c1 == 0) {
680 break;
681 }
682 }
683 return (int32_t)c1 - (int32_t)c2;
684}
685
686U_CFUNC int32_t U_EXPORT2
687uprv_strCompare(const char16_t *s1, int32_t length1,
688 const char16_t *s2, int32_t length2,
689 UBool strncmpStyle, UBool codePointOrder) {
690 const char16_t *start1, *start2, *limit1, *limit2;
691 char16_t c1, c2;
692
693 /* setup for fix-up */
694 start1=s1;
695 start2=s2;
696
697 /* compare identical prefixes - they do not need to be fixed up */
698 if(length1<0 && length2<0) {
699 /* strcmp style, both NUL-terminated */
700 if(s1==s2) {
701 return 0;
702 }
703
704 for(;;) {
705 c1=*s1;
706 c2=*s2;
707 if(c1!=c2) {
708 break;
709 }
710 if(c1==0) {
711 return 0;
712 }
713 ++s1;
714 ++s2;
715 }
716
717 /* setup for fix-up */
718 limit1=limit2=nullptr;
719 } else if(strncmpStyle) {
720 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
721 if(s1==s2) {
722 return 0;
723 }
724
725 limit1=start1+length1;
726
727 for(;;) {
728 /* both lengths are same, check only one limit */
729 if(s1==limit1) {
730 return 0;
731 }
732
733 c1=*s1;
734 c2=*s2;
735 if(c1!=c2) {
736 break;
737 }
738 if(c1==0) {
739 return 0;
740 }
741 ++s1;
742 ++s2;
743 }
744
745 /* setup for fix-up */
746 limit2=start2+length1; /* use length1 here, too, to enforce assumption */
747 } else {
748 /* memcmp/UnicodeString style, both length-specified */
749 int32_t lengthResult;
750
751 if(length1<0) {
752 length1=u_strlen(s1);
753 }
754 if(length2<0) {
755 length2=u_strlen(s2);
756 }
757
758 /* limit1=start1+min(length1, length2) */
759 if(length1<length2) {
760 lengthResult=-1;
761 limit1=start1+length1;
762 } else if(length1==length2) {
763 lengthResult=0;
764 limit1=start1+length1;
765 } else /* length1>length2 */ {
766 lengthResult=1;
767 limit1=start1+length2;
768 }
769
770 if(s1==s2) {
771 return lengthResult;
772 }
773
774 for(;;) {
775 /* check pseudo-limit */
776 if(s1==limit1) {
777 return lengthResult;
778 }
779
780 c1=*s1;
781 c2=*s2;
782 if(c1!=c2) {
783 break;
784 }
785 ++s1;
786 ++s2;
787 }
788
789 /* setup for fix-up */
790 limit1=start1+length1;
791 limit2=start2+length2;
792 }
793
794 /* if both values are in or above the surrogate range, fix them up */
795 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
796 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
797 if(
798 (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
799 (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
800 ) {
801 /* part of a surrogate pair, leave >=d800 */
802 } else {
803 /* BMP code point - may be surrogate code point - make <d800 */
804 c1-=0x2800;
805 }
806
807 if(
808 (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
809 (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
810 ) {
811 /* part of a surrogate pair, leave >=d800 */
812 } else {
813 /* BMP code point - may be surrogate code point - make <d800 */
814 c2-=0x2800;
815 }
816 }
817
818 /* now c1 and c2 are in the requested (code unit or code point) order */
819 return (int32_t)c1-(int32_t)c2;
820}
821
822/*
823 * Compare two strings as presented by UCharIterators.
824 * Use code unit or code point order.
825 * When the function returns, it is undefined where the iterators
826 * have stopped.
827 */
828U_CAPI int32_t U_EXPORT2
829u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
830 UChar32 c1, c2;
831
832 /* argument checking */
833 if(iter1==nullptr || iter2==nullptr) {
834 return 0; /* bad arguments */
835 }
836 if(iter1==iter2) {
837 return 0; /* identical iterators */
838 }
839
840 /* reset iterators to start? */
841 iter1->move(iter1, 0, UITER_START);
842 iter2->move(iter2, 0, UITER_START);
843
844 /* compare identical prefixes - they do not need to be fixed up */
845 for(;;) {
846 c1=iter1->next(iter1);
847 c2=iter2->next(iter2);
848 if(c1!=c2) {
849 break;
850 }
851 if(c1==-1) {
852 return 0;
853 }
854 }
855
856 /* if both values are in or above the surrogate range, fix them up */
857 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
858 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
859 if(
860 (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
861 (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
862 ) {
863 /* part of a surrogate pair, leave >=d800 */
864 } else {
865 /* BMP code point - may be surrogate code point - make <d800 */
866 c1-=0x2800;
867 }
868
869 if(
870 (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
871 (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
872 ) {
873 /* part of a surrogate pair, leave >=d800 */
874 } else {
875 /* BMP code point - may be surrogate code point - make <d800 */
876 c2-=0x2800;
877 }
878 }
879
880 /* now c1 and c2 are in the requested (code unit or code point) order */
881 return (int32_t)c1-(int32_t)c2;
882}
883
884#if 0
885/*
886 * u_strCompareIter() does not leave the iterators _on_ the different units.
887 * This is possible but would cost a few extra indirect function calls to back
888 * up if the last unit (c1 or c2 respectively) was >=0.
889 *
890 * Consistently leaving them _behind_ the different units is not an option
891 * because the current "unit" is the end of the string if that is reached,
892 * and in such a case the iterator does not move.
893 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
894 * of their strings. Calling previous() on each does not move them to where
895 * the comparison fails.
896 *
897 * So the simplest semantics is to not define where the iterators end up.
898 *
899 * The following fragment is part of what would need to be done for backing up.
900 */
901void fragment {
902 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
903 if(c1<=0xdbff) {
904 if(!U16_IS_TRAIL(iter1->current(iter1))) {
905 /* lead surrogate code point - make <d800 */
906 c1-=0x2800;
907 }
908 } else if(c1<=0xdfff) {
909 int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
910 iter1->previous(iter1); /* ==c1 */
911 if(!U16_IS_LEAD(iter1->previous(iter1))) {
912 /* trail surrogate code point - make <d800 */
913 c1-=0x2800;
914 }
915 /* go back to behind where the difference is */
916 iter1->move(iter1, idx, UITER_ZERO);
917 } else /* 0xe000<=c1<=0xffff */ {
918 /* BMP code point - make <d800 */
919 c1-=0x2800;
920 }
921}
922#endif
923
924U_CAPI int32_t U_EXPORT2
925u_strCompare(const char16_t *s1, int32_t length1,
926 const char16_t *s2, int32_t length2,
927 UBool codePointOrder) {
928 /* argument checking */
929 if(s1==nullptr || length1<-1 || s2==nullptr || length2<-1) {
930 return 0;
931 }
932 return uprv_strCompare(s1, length1, s2, length2, false, codePointOrder);
933}
934
935/* String compare in code point order - u_strcmp() compares in code unit order. */
936U_CAPI int32_t U_EXPORT2
937u_strcmpCodePointOrder(const char16_t *s1, const char16_t *s2) {
938 return uprv_strCompare(s1, -1, s2, -1, false, true);
939}
940
941U_CAPI int32_t U_EXPORT2
942u_strncmp(const char16_t *s1,
943 const char16_t *s2,
944 int32_t n)
945{
946 if(n > 0) {
947 int32_t rc;
948 for(;;) {
949 rc = (int32_t)*s1 - (int32_t)*s2;
950 if(rc != 0 || *s1 == 0 || --n == 0) {
951 return rc;
952 }
953 ++s1;
954 ++s2;
955 }
956 } else {
957 return 0;
958 }
959}
960
961U_CAPI int32_t U_EXPORT2
962u_strncmpCodePointOrder(const char16_t *s1, const char16_t *s2, int32_t n) {
963 return uprv_strCompare(s1, n, s2, n, true, true);
964}
965
966U_CAPI char16_t* U_EXPORT2
967u_strcpy(char16_t *dst,
968 const char16_t *src)
969{
970 char16_t *anchor = dst; /* save a pointer to start of dst */
971
972 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
973 }
974
975 return anchor;
976}
977
978U_CAPI char16_t* U_EXPORT2
979u_strncpy(char16_t *dst,
980 const char16_t *src,
981 int32_t n)
982{
983 char16_t *anchor = dst; /* save a pointer to start of dst */
984
985 /* copy string 2 over */
986 while(n > 0 && (*(dst++) = *(src++)) != 0) {
987 --n;
988 }
989
990 return anchor;
991}
992
993U_CAPI int32_t U_EXPORT2
994u_strlen(const char16_t *s)
995{
996#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
997 return (int32_t)uprv_wcslen((const wchar_t *)s);
998#else
999 const char16_t *t = s;
1000 while(*t != 0) {
1001 ++t;
1002 }
1003 return t - s;
1004#endif
1005}
1006
1007U_CAPI int32_t U_EXPORT2
1008u_countChar32(const char16_t *s, int32_t length) {
1009 int32_t count;
1010
1011 if(s==nullptr || length<-1) {
1012 return 0;
1013 }
1014
1015 count=0;
1016 if(length>=0) {
1017 while(length>0) {
1018 ++count;
1019 if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1020 s+=2;
1021 length-=2;
1022 } else {
1023 ++s;
1024 --length;
1025 }
1026 }
1027 } else /* length==-1 */ {
1028 char16_t c;
1029
1030 for(;;) {
1031 if((c=*s++)==0) {
1032 break;
1033 }
1034 ++count;
1035
1036 /*
1037 * sufficient to look ahead one because of UTF-16;
1038 * safe to look ahead one because at worst that would be the terminating NUL
1039 */
1040 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1041 ++s;
1042 }
1043 }
1044 }
1045 return count;
1046}
1047
1048U_CAPI UBool U_EXPORT2
1049u_strHasMoreChar32Than(const char16_t *s, int32_t length, int32_t number) {
1050
1051 if(number<0) {
1052 return true;
1053 }
1054 if(s==nullptr || length<-1) {
1055 return false;
1056 }
1057
1058 if(length==-1) {
1059 /* s is NUL-terminated */
1060 char16_t c;
1061
1062 /* count code points until they exceed */
1063 for(;;) {
1064 if((c=*s++)==0) {
1065 return false;
1066 }
1067 if(number==0) {
1068 return true;
1069 }
1070 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1071 ++s;
1072 }
1073 --number;
1074 }
1075 } else {
1076 /* length>=0 known */
1077 const char16_t *limit;
1078 int32_t maxSupplementary;
1079
1080 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1081 if(((length+1)/2)>number) {
1082 return true;
1083 }
1084
1085 /* check if s does not even contain enough UChars */
1086 maxSupplementary=length-number;
1087 if(maxSupplementary<=0) {
1088 return false;
1089 }
1090 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1091
1092 /*
1093 * count code points until they exceed and also check that there are
1094 * no more than maxSupplementary supplementary code points (char16_t pairs)
1095 */
1096 limit=s+length;
1097 for(;;) {
1098 if(s==limit) {
1099 return false;
1100 }
1101 if(number==0) {
1102 return true;
1103 }
1104 if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1105 ++s;
1106 if(--maxSupplementary<=0) {
1107 /* too many pairs - too few code points */
1108 return false;
1109 }
1110 }
1111 --number;
1112 }
1113 }
1114}
1115
1116U_CAPI char16_t * U_EXPORT2
1117u_memcpy(char16_t *dest, const char16_t *src, int32_t count) {
1118 if(count > 0) {
1119 uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1120 }
1121 return dest;
1122}
1123
1124U_CAPI char16_t * U_EXPORT2
1125u_memmove(char16_t *dest, const char16_t *src, int32_t count) {
1126 if(count > 0) {
1127 uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1128 }
1129 return dest;
1130}
1131
1132U_CAPI char16_t * U_EXPORT2
1133u_memset(char16_t *dest, char16_t c, int32_t count) {
1134 if(count > 0) {
1135 char16_t *ptr = dest;
1136 char16_t *limit = dest + count;
1137
1138 while (ptr < limit) {
1139 *(ptr++) = c;
1140 }
1141 }
1142 return dest;
1143}
1144
1145U_CAPI int32_t U_EXPORT2
1146u_memcmp(const char16_t *buf1, const char16_t *buf2, int32_t count) {
1147 if(count > 0) {
1148 const char16_t *limit = buf1 + count;
1149 int32_t result;
1150
1151 while (buf1 < limit) {
1152 result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1153 if (result != 0) {
1154 return result;
1155 }
1156 buf1++;
1157 buf2++;
1158 }
1159 }
1160 return 0;
1161}
1162
1163U_CAPI int32_t U_EXPORT2
1164u_memcmpCodePointOrder(const char16_t *s1, const char16_t *s2, int32_t count) {
1165 return uprv_strCompare(s1, count, s2, count, false, true);
1166}
1167
1168/* u_unescape & support fns ------------------------------------------------- */
1169
/*
 * Table of single-letter C-style escapes, stored flat as pairs:
 * the escape letter followed by the code unit it stands for.
 *
 * This map must be in ASCENDING ORDER OF THE ESCAPE CODE: the lookup loop in
 * u_unescapeAt() gives up as soon as it reaches a key greater than the
 * character it is looking for.
 *
 * The first four rows are comments only, so they are not part of the table.
 */
static const char16_t UNESCAPE_MAP[] = {
 /*" 0x22, 0x22 */
 /*' 0x27, 0x27 */
 /*? 0x3F, 0x3F */
 /*\ 0x5C, 0x5C */
 /*a*/ 0x61, 0x07,
 /*b*/ 0x62, 0x08,
 /*e*/ 0x65, 0x1b,
 /*f*/ 0x66, 0x0c,
 /*n*/ 0x6E, 0x0a,
 /*r*/ 0x72, 0x0d,
 /*t*/ 0x74, 0x09,
 /*v*/ 0x76, 0x0b
};
/* Size of UNESCAPE_MAP as computed by UPRV_LENGTHOF (presumably the element count, i.e. two per escape); the lookup loop steps through it by 2. */
enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
1186
/* Return the value 0..7 of the octal digit c, or -1 if c is not an octal digit. */
static int32_t _digit8(char16_t c) {
    const int isOctalDigit = (c >= u'0' && c <= u'7');
    return isOctalDigit ? c - u'0' : -1;
}
1194
/*
 * Return the value 0..15 of the hexadecimal digit c (0-9, A-F or a-f),
 * or -1 if c is not a hexadecimal digit.
 */
static int32_t _digit16(char16_t c) {
    int32_t value = -1;

    if (c >= u'0' && c <= u'9') {
        value = c - u'0';
    } else if (c >= u'A' && c <= u'F') {
        value = 10 + (c - u'A');
    } else if (c >= u'a' && c <= u'f') {
        value = 10 + (c - u'a');
    }
    return value;
}
1208
1209/* Parse a single escape sequence. Although this method deals in
1210 * UChars, it does not use C++ or UnicodeString. This allows it to
1211 * be used from C contexts. */
1212U_CAPI UChar32 U_EXPORT2
1213u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1214 int32_t *offset,
1215 int32_t length,
1216 void *context) {
1217
1218 int32_t start = *offset;
1219 UChar32 c;
1220 UChar32 result = 0;
1221 int8_t n = 0;
1222 int8_t minDig = 0;
1223 int8_t maxDig = 0;
1224 int8_t bitsPerDigit = 4;
1225 int32_t dig;
1226 UBool braces = false;
1227
1228 /* Check that offset is in range */
1229 if (*offset < 0 || *offset >= length) {
1230 goto err;
1231 }
1232
1233 /* Fetch first char16_t after '\\' */
1234 c = charAt((*offset)++, context);
1235
1236 /* Convert hexadecimal and octal escapes */
1237 switch (c) {
1238 case u'u':
1239 minDig = maxDig = 4;
1240 break;
1241 case u'U':
1242 minDig = maxDig = 8;
1243 break;
1244 case u'x':
1245 minDig = 1;
1246 if (*offset < length && charAt(*offset, context) == u'{') {
1247 ++(*offset);
1248 braces = true;
1249 maxDig = 8;
1250 } else {
1251 maxDig = 2;
1252 }
1253 break;
1254 default:
1255 dig = _digit8(c);
1256 if (dig >= 0) {
1257 minDig = 1;
1258 maxDig = 3;
1259 n = 1; /* Already have first octal digit */
1260 bitsPerDigit = 3;
1261 result = dig;
1262 }
1263 break;
1264 }
1265 if (minDig != 0) {
1266 while (*offset < length && n < maxDig) {
1267 c = charAt(*offset, context);
1268 dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
1269 if (dig < 0) {
1270 break;
1271 }
1272 result = (result << bitsPerDigit) | dig;
1273 ++(*offset);
1274 ++n;
1275 }
1276 if (n < minDig) {
1277 goto err;
1278 }
1279 if (braces) {
1280 if (c != u'}') {
1281 goto err;
1282 }
1283 ++(*offset);
1284 }
1285 if (result < 0 || result >= 0x110000) {
1286 goto err;
1287 }
1288 /* If an escape sequence specifies a lead surrogate, see if
1289 * there is a trail surrogate after it, either as an escape or
1290 * as a literal. If so, join them up into a supplementary.
1291 */
1292 if (*offset < length && U16_IS_LEAD(result)) {
1293 int32_t ahead = *offset + 1;
1294 c = charAt(*offset, context);
1295 if (c == u'\\' && ahead < length) {
1296 // Calling ourselves recursively may cause a stack overflow if
1297 // we have repeated escaped lead surrogates.
1298 // Limit the length to 11 ("x{0000DFFF}") after ahead.
1299 int32_t tailLimit = ahead + 11;
1300 if (tailLimit > length) {
1301 tailLimit = length;
1302 }
1303 c = u_unescapeAt(charAt, &ahead, tailLimit, context);
1304 }
1305 if (U16_IS_TRAIL(c)) {
1306 *offset = ahead;
1307 result = U16_GET_SUPPLEMENTARY(result, c);
1308 }
1309 }
1310 return result;
1311 }
1312
1313 /* Convert C-style escapes in table */
1314 for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1315 if (c == UNESCAPE_MAP[i]) {
1316 return UNESCAPE_MAP[i+1];
1317 } else if (c < UNESCAPE_MAP[i]) {
1318 break;
1319 }
1320 }
1321
1322 /* Map \cX to control-X: X & 0x1F */
1323 if (c == u'c' && *offset < length) {
1324 c = charAt((*offset)++, context);
1325 if (U16_IS_LEAD(c) && *offset < length) {
1326 char16_t c2 = charAt(*offset, context);
1327 if (U16_IS_TRAIL(c2)) {
1328 ++(*offset);
1329 c = U16_GET_SUPPLEMENTARY(c, c2);
1330 }
1331 }
1332 return 0x1F & c;
1333 }
1334
1335 /* If no special forms are recognized, then consider
1336 * the backslash to generically escape the next character.
1337 * Deal with surrogate pairs. */
1338 if (U16_IS_LEAD(c) && *offset < length) {
1339 char16_t c2 = charAt(*offset, context);
1340 if (U16_IS_TRAIL(c2)) {
1341 ++(*offset);
1342 return U16_GET_SUPPLEMENTARY(c, c2);
1343 }
1344 }
1345 return c;
1346
1347 err:
1348 /* Invalid escape sequence */
1349 *offset = start; /* Reset to initial value */
1350 return (UChar32)0xFFFFFFFF;
1351}
1352
1353/* u_unescapeAt() callback to return a char16_t from a char* */
1354static char16_t U_CALLCONV
1355_charPtr_charAt(int32_t offset, void *context) {
1356 char16_t c16;
1357 /* It would be more efficient to access the invariant tables
1358 * directly but there is no API for that. */
1359 u_charsToUChars(((char*) context) + offset, &c16, 1);
1360 return c16;
1361}
1362
1363/* Append an escape-free segment of the text; used by u_unescape() */
1364static void _appendUChars(char16_t *dest, int32_t destCapacity,
1365 const char *src, int32_t srcLen) {
1366 if (destCapacity < 0) {
1367 destCapacity = 0;
1368 }
1369 if (srcLen > destCapacity) {
1370 srcLen = destCapacity;
1371 }
1372 u_charsToUChars(src, dest, srcLen);
1373}
1374
1375/* Do an invariant conversion of char* -> char16_t*, with escape parsing */
1376U_CAPI int32_t U_EXPORT2
1377u_unescape(const char *src, char16_t *dest, int32_t destCapacity) {
1378 const char *segment = src;
1379 int32_t i = 0;
1380 char c;
1381
1382 while ((c=*src) != 0) {
1383 /* '\\' intentionally written as compiler-specific
1384 * character constant to correspond to compiler-specific
1385 * char* constants. */
1386 if (c == '\\') {
1387 int32_t lenParsed = 0;
1388 UChar32 c32;
1389 if (src != segment) {
1390 if (dest != nullptr) {
1391 _appendUChars(dest + i, destCapacity - i,
1392 segment, (int32_t)(src - segment));
1393 }
1394 i += (int32_t)(src - segment);
1395 }
1396 ++src; /* advance past '\\' */
1397 c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1398 if (lenParsed == 0) {
1399 goto err;
1400 }
1401 src += lenParsed; /* advance past escape seq. */
1402 if (dest != nullptr && U16_LENGTH(c32) <= (destCapacity - i)) {
1403 U16_APPEND_UNSAFE(dest, i, c32);
1404 } else {
1405 i += U16_LENGTH(c32);
1406 }
1407 segment = src;
1408 } else {
1409 ++src;
1410 }
1411 }
1412 if (src != segment) {
1413 if (dest != nullptr) {
1414 _appendUChars(dest + i, destCapacity - i,
1415 segment, (int32_t)(src - segment));
1416 }
1417 i += (int32_t)(src - segment);
1418 }
1419 if (dest != nullptr && i < destCapacity) {
1420 dest[i] = 0;
1421 }
1422 return i;
1423
1424 err:
1425 if (dest != nullptr && destCapacity > 0) {
1426 *dest = 0;
1427 }
1428 return 0;
1429}
1430
1431/* NUL-termination of strings ----------------------------------------------- */
1432
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing when pErrorCode is nullptr, when *pErrorCode already
 * indicates a failure, or when length is negative.  Otherwise:
 *   length <  destCapacity: writes the NUL at dest[length] and clears
 *                           a pending U_STRING_NOT_TERMINATED_WARNING
 *   length == destCapacity: sets U_STRING_NOT_TERMINATED_WARNING
 *   length >  destCapacity: sets U_BUFFER_OVERFLOW_ERROR
 *
 * Every parameter is evaluated more than once, so arguments must be
 * free of side effects.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) UPRV_BLOCK_MACRO_BEGIN { \
    if((pErrorCode)!=nullptr && U_SUCCESS(*(pErrorCode))) { \
        /* not a public function, so no complete argument checking */ \
        if((length)<0) { \
            /* assume that the caller handles this */ \
        } else if((length)<(destCapacity)) { \
            /* NUL-terminate the string, the NUL fits */ \
            (dest)[(length)]=0; \
            /* unset the not-terminated warning but leave all others */ \
            if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) { \
                *(pErrorCode)=U_ZERO_ERROR; \
            } \
        } else if((length)==(destCapacity)) { \
            /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
            *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING; \
        } else /* length>destCapacity */ { \
            /* even the string itself did not fit - set an error code */ \
            *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
1459
1460U_CAPI char16_t U_EXPORT2
1461u_asciiToUpper(char16_t c) {
1462 if (u'a' <= c && c <= u'z') {
1463 c = c + u'A' - u'a';
1464 }
1465 return c;
1466}
1467
1468U_CAPI int32_t U_EXPORT2
1469u_terminateUChars(char16_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1470 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1471 return length;
1472}
1473
1474U_CAPI int32_t U_EXPORT2
1475u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1476 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1477 return length;
1478}
1479
1480U_CAPI int32_t U_EXPORT2
1481u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1482 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1483 return length;
1484}
1485
1486U_CAPI int32_t U_EXPORT2
1487u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1488 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1489 return length;
1490}
1491
1492// Compute the hash code for a string -------------------------------------- ***
1493
1494// Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1495// on UHashtable code.
1496
/*
  Compute the hash by iterating sparsely over about 32 (up to 63)
  characters spaced evenly through the string.  For each character,
  multiply the previous hash value by a prime number and add the new
  character in, like a linear congruential random number generator,
  producing a pseudorandom deterministic value well distributed over
  the output range. [LIU]

  The macro expands to a complete function body, including the return
  statement.  TYPE is the element type, STR the string (a null pointer
  hashes to 0), STRLEN its length in elements, and DEREF an expression
  that yields the current element's value from the local pointer p.

  The loop counts the elements that remain rather than comparing p
  against an end pointer, so p is never advanced beyond the end of the
  string by the final stride.
*/

#define STRING_HASH(TYPE, STR, STRLEN, DEREF) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t hash = 0;                                  \
    const TYPE *p = (const TYPE*) (STR);                \
    if (p != nullptr) {                                 \
        int32_t remaining = (int32_t)(STRLEN);          \
        int32_t inc = ((remaining - 32) / 32) + 1;      \
        while (remaining > 0) {                         \
            hash = (hash * 37) + (DEREF);               \
            if (remaining <= inc) {                     \
                /* the next stride would reach or pass the end */ \
                break;                                  \
            }                                           \
            p += inc;                                   \
            remaining -= inc;                           \
        }                                               \
    }                                                   \
    return static_cast<int32_t>(hash);                  \
} UPRV_BLOCK_MACRO_END
1520
1521/* Used by UnicodeString to compute its hashcode - Not public API. */
1522U_CAPI int32_t U_EXPORT2
1523ustr_hashUCharsN(const char16_t *str, int32_t length) {
1524 STRING_HASH(char16_t, str, length, *p);
1525}
1526
1527U_CAPI int32_t U_EXPORT2
1528ustr_hashCharsN(const char *str, int32_t length) {
1529 STRING_HASH(uint8_t, str, length, *p);
1530}
1531
1532U_CAPI int32_t U_EXPORT2
1533ustr_hashICharsN(const char *str, int32_t length) {
1534 STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
1535}
1536