1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6* Copyright (C) 1998-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10*
11* File ustring.cpp
12*
13* Modification History:
14*
15* Date Name Description
16* 12/07/98 bertrand Creation.
17******************************************************************************
18*/
19
20#include "unicode/utypes.h"
21#include "unicode/putil.h"
22#include "unicode/uchar.h"
23#include "unicode/ustring.h"
24#include "unicode/utf16.h"
25#include "cstring.h"
26#include "cwchar.h"
27#include "cmemory.h"
28#include "ustr_imp.h"
29
30/* ANSI string.h - style functions ------------------------------------------ */
31
32/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
33#define U_BMP_MAX 0xffff
34
35/* Forward binary string search functions ----------------------------------- */
36
37/*
38 * Test if a substring match inside a string is at code point boundaries.
39 * All pointers refer to the same buffer.
40 * The limit pointer may be NULL, all others must be real pointers.
41 */
42static inline UBool
43isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
44 if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
45 /* the leading edge of the match is in the middle of a surrogate pair */
46 return FALSE;
47 }
48 if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
49 /* the trailing edge of the match is in the middle of a surrogate pair */
50 return FALSE;
51 }
52 return TRUE;
53}
54
55U_CAPI UChar * U_EXPORT2
56u_strFindFirst(const UChar *s, int32_t length,
57 const UChar *sub, int32_t subLength) {
58 const UChar *start, *p, *q, *subLimit;
59 UChar c, cs, cq;
60
61 if(sub==NULL || subLength<-1) {
62 return (UChar *)s;
63 }
64 if(s==NULL || length<-1) {
65 return NULL;
66 }
67
68 start=s;
69
70 if(length<0 && subLength<0) {
71 /* both strings are NUL-terminated */
72 if((cs=*sub++)==0) {
73 return (UChar *)s;
74 }
75 if(*sub==0 && !U16_IS_SURROGATE(cs)) {
76 /* the substring consists of a single, non-surrogate BMP code point */
77 return u_strchr(s, cs);
78 }
79
80 while((c=*s++)!=0) {
81 if(c==cs) {
82 /* found first substring UChar, compare rest */
83 p=s;
84 q=sub;
85 for(;;) {
86 if((cq=*q)==0) {
87 if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
88 return (UChar *)(s-1); /* well-formed match */
89 } else {
90 break; /* no match because surrogate pair is split */
91 }
92 }
93 if((c=*p)==0) {
94 return NULL; /* no match, and none possible after s */
95 }
96 if(c!=cq) {
97 break; /* no match */
98 }
99 ++p;
100 ++q;
101 }
102 }
103 }
104
105 /* not found */
106 return NULL;
107 }
108
109 if(subLength<0) {
110 subLength=u_strlen(sub);
111 }
112 if(subLength==0) {
113 return (UChar *)s;
114 }
115
116 /* get sub[0] to search for it fast */
117 cs=*sub++;
118 --subLength;
119 subLimit=sub+subLength;
120
121 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
122 /* the substring consists of a single, non-surrogate BMP code point */
123 return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
124 }
125
126 if(length<0) {
127 /* s is NUL-terminated */
128 while((c=*s++)!=0) {
129 if(c==cs) {
130 /* found first substring UChar, compare rest */
131 p=s;
132 q=sub;
133 for(;;) {
134 if(q==subLimit) {
135 if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
136 return (UChar *)(s-1); /* well-formed match */
137 } else {
138 break; /* no match because surrogate pair is split */
139 }
140 }
141 if((c=*p)==0) {
142 return NULL; /* no match, and none possible after s */
143 }
144 if(c!=*q) {
145 break; /* no match */
146 }
147 ++p;
148 ++q;
149 }
150 }
151 }
152 } else {
153 const UChar *limit, *preLimit;
154
155 /* subLength was decremented above */
156 if(length<=subLength) {
157 return NULL; /* s is shorter than sub */
158 }
159
160 limit=s+length;
161
162 /* the substring must start before preLimit */
163 preLimit=limit-subLength;
164
165 while(s!=preLimit) {
166 c=*s++;
167 if(c==cs) {
168 /* found first substring UChar, compare rest */
169 p=s;
170 q=sub;
171 for(;;) {
172 if(q==subLimit) {
173 if(isMatchAtCPBoundary(start, s-1, p, limit)) {
174 return (UChar *)(s-1); /* well-formed match */
175 } else {
176 break; /* no match because surrogate pair is split */
177 }
178 }
179 if(*p!=*q) {
180 break; /* no match */
181 }
182 ++p;
183 ++q;
184 }
185 }
186 }
187 }
188
189 /* not found */
190 return NULL;
191}
192
193U_CAPI UChar * U_EXPORT2
194u_strstr(const UChar *s, const UChar *substring) {
195 return u_strFindFirst(s, -1, substring, -1);
196}
197
198U_CAPI UChar * U_EXPORT2
199u_strchr(const UChar *s, UChar c) {
200 if(U16_IS_SURROGATE(c)) {
201 /* make sure to not find half of a surrogate pair */
202 return u_strFindFirst(s, -1, &c, 1);
203 } else {
204 UChar cs;
205
206 /* trivial search for a BMP code point */
207 for(;;) {
208 if((cs=*s)==c) {
209 return (UChar *)s;
210 }
211 if(cs==0) {
212 return NULL;
213 }
214 ++s;
215 }
216 }
217}
218
219U_CAPI UChar * U_EXPORT2
220u_strchr32(const UChar *s, UChar32 c) {
221 if((uint32_t)c<=U_BMP_MAX) {
222 /* find BMP code point */
223 return u_strchr(s, (UChar)c);
224 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
225 /* find supplementary code point as surrogate pair */
226 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
227
228 while((cs=*s++)!=0) {
229 if(cs==lead && *s==trail) {
230 return (UChar *)(s-1);
231 }
232 }
233 return NULL;
234 } else {
235 /* not a Unicode code point, not findable */
236 return NULL;
237 }
238}
239
240U_CAPI UChar * U_EXPORT2
241u_memchr(const UChar *s, UChar c, int32_t count) {
242 if(count<=0) {
243 return NULL; /* no string */
244 } else if(U16_IS_SURROGATE(c)) {
245 /* make sure to not find half of a surrogate pair */
246 return u_strFindFirst(s, count, &c, 1);
247 } else {
248 /* trivial search for a BMP code point */
249 const UChar *limit=s+count;
250 do {
251 if(*s==c) {
252 return (UChar *)s;
253 }
254 } while(++s!=limit);
255 return NULL;
256 }
257}
258
259U_CAPI UChar * U_EXPORT2
260u_memchr32(const UChar *s, UChar32 c, int32_t count) {
261 if((uint32_t)c<=U_BMP_MAX) {
262 /* find BMP code point */
263 return u_memchr(s, (UChar)c, count);
264 } else if(count<2) {
265 /* too short for a surrogate pair */
266 return NULL;
267 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
268 /* find supplementary code point as surrogate pair */
269 const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
270 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
271
272 do {
273 if(*s==lead && *(s+1)==trail) {
274 return (UChar *)s;
275 }
276 } while(++s!=limit);
277 return NULL;
278 } else {
279 /* not a Unicode code point, not findable */
280 return NULL;
281 }
282}
283
284/* Backward binary string search functions ---------------------------------- */
285
286U_CAPI UChar * U_EXPORT2
287u_strFindLast(const UChar *s, int32_t length,
288 const UChar *sub, int32_t subLength) {
289 const UChar *start, *limit, *p, *q, *subLimit;
290 UChar c, cs;
291
292 if(sub==NULL || subLength<-1) {
293 return (UChar *)s;
294 }
295 if(s==NULL || length<-1) {
296 return NULL;
297 }
298
299 /*
300 * This implementation is more lazy than the one for u_strFindFirst():
301 * There is no special search code for NUL-terminated strings.
302 * It does not seem to be worth it for searching substrings to
303 * search forward and find all matches like in u_strrchr() and similar.
304 * Therefore, we simply get both string lengths and search backward.
305 *
306 * markus 2002oct23
307 */
308
309 if(subLength<0) {
310 subLength=u_strlen(sub);
311 }
312 if(subLength==0) {
313 return (UChar *)s;
314 }
315
316 /* get sub[subLength-1] to search for it fast */
317 subLimit=sub+subLength;
318 cs=*(--subLimit);
319 --subLength;
320
321 if(subLength==0 && !U16_IS_SURROGATE(cs)) {
322 /* the substring consists of a single, non-surrogate BMP code point */
323 return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
324 }
325
326 if(length<0) {
327 length=u_strlen(s);
328 }
329
330 /* subLength was decremented above */
331 if(length<=subLength) {
332 return NULL; /* s is shorter than sub */
333 }
334
335 start=s;
336 limit=s+length;
337
338 /* the substring must start no later than s+subLength */
339 s+=subLength;
340
341 while(s!=limit) {
342 c=*(--limit);
343 if(c==cs) {
344 /* found last substring UChar, compare rest */
345 p=limit;
346 q=subLimit;
347 for(;;) {
348 if(q==sub) {
349 if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
350 return (UChar *)p; /* well-formed match */
351 } else {
352 break; /* no match because surrogate pair is split */
353 }
354 }
355 if(*(--p)!=*(--q)) {
356 break; /* no match */
357 }
358 }
359 }
360 }
361
362 /* not found */
363 return NULL;
364}
365
366U_CAPI UChar * U_EXPORT2
367u_strrstr(const UChar *s, const UChar *substring) {
368 return u_strFindLast(s, -1, substring, -1);
369}
370
371U_CAPI UChar * U_EXPORT2
372u_strrchr(const UChar *s, UChar c) {
373 if(U16_IS_SURROGATE(c)) {
374 /* make sure to not find half of a surrogate pair */
375 return u_strFindLast(s, -1, &c, 1);
376 } else {
377 const UChar *result=NULL;
378 UChar cs;
379
380 /* trivial search for a BMP code point */
381 for(;;) {
382 if((cs=*s)==c) {
383 result=s;
384 }
385 if(cs==0) {
386 return (UChar *)result;
387 }
388 ++s;
389 }
390 }
391}
392
393U_CAPI UChar * U_EXPORT2
394u_strrchr32(const UChar *s, UChar32 c) {
395 if((uint32_t)c<=U_BMP_MAX) {
396 /* find BMP code point */
397 return u_strrchr(s, (UChar)c);
398 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
399 /* find supplementary code point as surrogate pair */
400 const UChar *result=NULL;
401 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
402
403 while((cs=*s++)!=0) {
404 if(cs==lead && *s==trail) {
405 result=s-1;
406 }
407 }
408 return (UChar *)result;
409 } else {
410 /* not a Unicode code point, not findable */
411 return NULL;
412 }
413}
414
415U_CAPI UChar * U_EXPORT2
416u_memrchr(const UChar *s, UChar c, int32_t count) {
417 if(count<=0) {
418 return NULL; /* no string */
419 } else if(U16_IS_SURROGATE(c)) {
420 /* make sure to not find half of a surrogate pair */
421 return u_strFindLast(s, count, &c, 1);
422 } else {
423 /* trivial search for a BMP code point */
424 const UChar *limit=s+count;
425 do {
426 if(*(--limit)==c) {
427 return (UChar *)limit;
428 }
429 } while(s!=limit);
430 return NULL;
431 }
432}
433
434U_CAPI UChar * U_EXPORT2
435u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
436 if((uint32_t)c<=U_BMP_MAX) {
437 /* find BMP code point */
438 return u_memrchr(s, (UChar)c, count);
439 } else if(count<2) {
440 /* too short for a surrogate pair */
441 return NULL;
442 } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
443 /* find supplementary code point as surrogate pair */
444 const UChar *limit=s+count-1;
445 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
446
447 do {
448 if(*limit==trail && *(limit-1)==lead) {
449 return (UChar *)(limit-1);
450 }
451 } while(s!=--limit);
452 return NULL;
453 } else {
454 /* not a Unicode code point, not findable */
455 return NULL;
456 }
457}
458
459/* Tokenization functions --------------------------------------------------- */
460
461/*
462 * Match each code point in a string against each code point in the matchSet.
463 * Return the index of the first string code point that
464 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
465 * Return -(string length)-1 if there is no such code point.
466 */
467static int32_t
468_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
469 int32_t matchLen, matchBMPLen, strItr, matchItr;
470 UChar32 stringCh, matchCh;
471 UChar c, c2;
472
473 /* first part of matchSet contains only BMP code points */
474 matchBMPLen = 0;
475 while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
476 ++matchBMPLen;
477 }
478
479 /* second part of matchSet contains BMP and supplementary code points */
480 matchLen = matchBMPLen;
481 while(matchSet[matchLen] != 0) {
482 ++matchLen;
483 }
484
485 for(strItr = 0; (c = string[strItr]) != 0;) {
486 ++strItr;
487 if(U16_IS_SINGLE(c)) {
488 if(polarity) {
489 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
490 if(c == matchSet[matchItr]) {
491 return strItr - 1; /* one matches */
492 }
493 }
494 } else {
495 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
496 if(c == matchSet[matchItr]) {
497 goto endloop;
498 }
499 }
500 return strItr - 1; /* none matches */
501 }
502 } else {
503 /*
504 * No need to check for string length before U16_IS_TRAIL
505 * because c2 could at worst be the terminating NUL.
506 */
507 if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
508 ++strItr;
509 stringCh = U16_GET_SUPPLEMENTARY(c, c2);
510 } else {
511 stringCh = c; /* unpaired trail surrogate */
512 }
513
514 if(polarity) {
515 for(matchItr = matchBMPLen; matchItr < matchLen;) {
516 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
517 if(stringCh == matchCh) {
518 return strItr - U16_LENGTH(stringCh); /* one matches */
519 }
520 }
521 } else {
522 for(matchItr = matchBMPLen; matchItr < matchLen;) {
523 U16_NEXT(matchSet, matchItr, matchLen, matchCh);
524 if(stringCh == matchCh) {
525 goto endloop;
526 }
527 }
528 return strItr - U16_LENGTH(stringCh); /* none matches */
529 }
530 }
531endloop:
532 /* wish C had continue with labels like Java... */;
533 }
534
535 /* Didn't find it. */
536 return -strItr-1;
537}
538
539/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
540U_CAPI UChar * U_EXPORT2
541u_strpbrk(const UChar *string, const UChar *matchSet)
542{
543 int32_t idx = _matchFromSet(string, matchSet, TRUE);
544 if(idx >= 0) {
545 return (UChar *)string + idx;
546 } else {
547 return NULL;
548 }
549}
550
551/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
552U_CAPI int32_t U_EXPORT2
553u_strcspn(const UChar *string, const UChar *matchSet)
554{
555 int32_t idx = _matchFromSet(string, matchSet, TRUE);
556 if(idx >= 0) {
557 return idx;
558 } else {
559 return -idx - 1; /* == u_strlen(string) */
560 }
561}
562
563/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
564U_CAPI int32_t U_EXPORT2
565u_strspn(const UChar *string, const UChar *matchSet)
566{
567 int32_t idx = _matchFromSet(string, matchSet, FALSE);
568 if(idx >= 0) {
569 return idx;
570 } else {
571 return -idx - 1; /* == u_strlen(string) */
572 }
573}
574
575/* ----- Text manipulation functions --- */
576
577U_CAPI UChar* U_EXPORT2
578u_strtok_r(UChar *src,
579 const UChar *delim,
580 UChar **saveState)
581{
582 UChar *tokSource;
583 UChar *nextToken;
584 uint32_t nonDelimIdx;
585
586 /* If saveState is NULL, the user messed up. */
587 if (src != NULL) {
588 tokSource = src;
589 *saveState = src; /* Set to "src" in case there are no delimiters */
590 }
591 else if (*saveState) {
592 tokSource = *saveState;
593 }
594 else {
595 /* src == NULL && *saveState == NULL */
596 /* This shouldn't happen. We already finished tokenizing. */
597 return NULL;
598 }
599
600 /* Skip initial delimiters */
601 nonDelimIdx = u_strspn(tokSource, delim);
602 tokSource = &tokSource[nonDelimIdx];
603
604 if (*tokSource) {
605 nextToken = u_strpbrk(tokSource, delim);
606 if (nextToken != NULL) {
607 /* Create a token */
608 *(nextToken++) = 0;
609 *saveState = nextToken;
610 return tokSource;
611 }
612 else if (*saveState) {
613 /* Return the last token */
614 *saveState = NULL;
615 return tokSource;
616 }
617 }
618 else {
619 /* No tokens were found. Only delimiters were left. */
620 *saveState = NULL;
621 }
622 return NULL;
623}
624
625/* Miscellaneous functions -------------------------------------------------- */
626
627U_CAPI UChar* U_EXPORT2
628u_strcat(UChar *dst,
629 const UChar *src)
630{
631 UChar *anchor = dst; /* save a pointer to start of dst */
632
633 while(*dst != 0) { /* To end of first string */
634 ++dst;
635 }
636 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
637 }
638
639 return anchor;
640}
641
642U_CAPI UChar* U_EXPORT2
643u_strncat(UChar *dst,
644 const UChar *src,
645 int32_t n )
646{
647 if(n > 0) {
648 UChar *anchor = dst; /* save a pointer to start of dst */
649
650 while(*dst != 0) { /* To end of first string */
651 ++dst;
652 }
653 while((*dst = *src) != 0) { /* copy string 2 over */
654 ++dst;
655 if(--n == 0) {
656 *dst = 0;
657 break;
658 }
659 ++src;
660 }
661
662 return anchor;
663 } else {
664 return dst;
665 }
666}
667
668/* ----- Text property functions --- */
669
670U_CAPI int32_t U_EXPORT2
671u_strcmp(const UChar *s1,
672 const UChar *s2)
673{
674 UChar c1, c2;
675
676 for(;;) {
677 c1=*s1++;
678 c2=*s2++;
679 if (c1 != c2 || c1 == 0) {
680 break;
681 }
682 }
683 return (int32_t)c1 - (int32_t)c2;
684}
685
686U_CFUNC int32_t U_EXPORT2
687uprv_strCompare(const UChar *s1, int32_t length1,
688 const UChar *s2, int32_t length2,
689 UBool strncmpStyle, UBool codePointOrder) {
690 const UChar *start1, *start2, *limit1, *limit2;
691 UChar c1, c2;
692
693 /* setup for fix-up */
694 start1=s1;
695 start2=s2;
696
697 /* compare identical prefixes - they do not need to be fixed up */
698 if(length1<0 && length2<0) {
699 /* strcmp style, both NUL-terminated */
700 if(s1==s2) {
701 return 0;
702 }
703
704 for(;;) {
705 c1=*s1;
706 c2=*s2;
707 if(c1!=c2) {
708 break;
709 }
710 if(c1==0) {
711 return 0;
712 }
713 ++s1;
714 ++s2;
715 }
716
717 /* setup for fix-up */
718 limit1=limit2=NULL;
719 } else if(strncmpStyle) {
720 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
721 if(s1==s2) {
722 return 0;
723 }
724
725 limit1=start1+length1;
726
727 for(;;) {
728 /* both lengths are same, check only one limit */
729 if(s1==limit1) {
730 return 0;
731 }
732
733 c1=*s1;
734 c2=*s2;
735 if(c1!=c2) {
736 break;
737 }
738 if(c1==0) {
739 return 0;
740 }
741 ++s1;
742 ++s2;
743 }
744
745 /* setup for fix-up */
746 limit2=start2+length1; /* use length1 here, too, to enforce assumption */
747 } else {
748 /* memcmp/UnicodeString style, both length-specified */
749 int32_t lengthResult;
750
751 if(length1<0) {
752 length1=u_strlen(s1);
753 }
754 if(length2<0) {
755 length2=u_strlen(s2);
756 }
757
758 /* limit1=start1+min(lenght1, length2) */
759 if(length1<length2) {
760 lengthResult=-1;
761 limit1=start1+length1;
762 } else if(length1==length2) {
763 lengthResult=0;
764 limit1=start1+length1;
765 } else /* length1>length2 */ {
766 lengthResult=1;
767 limit1=start1+length2;
768 }
769
770 if(s1==s2) {
771 return lengthResult;
772 }
773
774 for(;;) {
775 /* check pseudo-limit */
776 if(s1==limit1) {
777 return lengthResult;
778 }
779
780 c1=*s1;
781 c2=*s2;
782 if(c1!=c2) {
783 break;
784 }
785 ++s1;
786 ++s2;
787 }
788
789 /* setup for fix-up */
790 limit1=start1+length1;
791 limit2=start2+length2;
792 }
793
794 /* if both values are in or above the surrogate range, fix them up */
795 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
796 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
797 if(
798 (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
799 (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
800 ) {
801 /* part of a surrogate pair, leave >=d800 */
802 } else {
803 /* BMP code point - may be surrogate code point - make <d800 */
804 c1-=0x2800;
805 }
806
807 if(
808 (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
809 (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
810 ) {
811 /* part of a surrogate pair, leave >=d800 */
812 } else {
813 /* BMP code point - may be surrogate code point - make <d800 */
814 c2-=0x2800;
815 }
816 }
817
818 /* now c1 and c2 are in the requested (code unit or code point) order */
819 return (int32_t)c1-(int32_t)c2;
820}
821
822/*
823 * Compare two strings as presented by UCharIterators.
824 * Use code unit or code point order.
825 * When the function returns, it is undefined where the iterators
826 * have stopped.
827 */
828U_CAPI int32_t U_EXPORT2
829u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
830 UChar32 c1, c2;
831
832 /* argument checking */
833 if(iter1==NULL || iter2==NULL) {
834 return 0; /* bad arguments */
835 }
836 if(iter1==iter2) {
837 return 0; /* identical iterators */
838 }
839
840 /* reset iterators to start? */
841 iter1->move(iter1, 0, UITER_START);
842 iter2->move(iter2, 0, UITER_START);
843
844 /* compare identical prefixes - they do not need to be fixed up */
845 for(;;) {
846 c1=iter1->next(iter1);
847 c2=iter2->next(iter2);
848 if(c1!=c2) {
849 break;
850 }
851 if(c1==-1) {
852 return 0;
853 }
854 }
855
856 /* if both values are in or above the surrogate range, fix them up */
857 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
858 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
859 if(
860 (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
861 (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
862 ) {
863 /* part of a surrogate pair, leave >=d800 */
864 } else {
865 /* BMP code point - may be surrogate code point - make <d800 */
866 c1-=0x2800;
867 }
868
869 if(
870 (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
871 (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
872 ) {
873 /* part of a surrogate pair, leave >=d800 */
874 } else {
875 /* BMP code point - may be surrogate code point - make <d800 */
876 c2-=0x2800;
877 }
878 }
879
880 /* now c1 and c2 are in the requested (code unit or code point) order */
881 return (int32_t)c1-(int32_t)c2;
882}
883
884#if 0
885/*
886 * u_strCompareIter() does not leave the iterators _on_ the different units.
887 * This is possible but would cost a few extra indirect function calls to back
888 * up if the last unit (c1 or c2 respectively) was >=0.
889 *
890 * Consistently leaving them _behind_ the different units is not an option
891 * because the current "unit" is the end of the string if that is reached,
892 * and in such a case the iterator does not move.
893 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
894 * of their strings. Calling previous() on each does not move them to where
895 * the comparison fails.
896 *
897 * So the simplest semantics is to not define where the iterators end up.
898 *
899 * The following fragment is part of what would need to be done for backing up.
900 */
901void fragment {
902 /* iff a surrogate is part of a surrogate pair, leave >=d800 */
903 if(c1<=0xdbff) {
904 if(!U16_IS_TRAIL(iter1->current(iter1))) {
905 /* lead surrogate code point - make <d800 */
906 c1-=0x2800;
907 }
908 } else if(c1<=0xdfff) {
909 int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
910 iter1->previous(iter1); /* ==c1 */
911 if(!U16_IS_LEAD(iter1->previous(iter1))) {
912 /* trail surrogate code point - make <d800 */
913 c1-=0x2800;
914 }
915 /* go back to behind where the difference is */
916 iter1->move(iter1, idx, UITER_ZERO);
917 } else /* 0xe000<=c1<=0xffff */ {
918 /* BMP code point - make <d800 */
919 c1-=0x2800;
920 }
921}
922#endif
923
924U_CAPI int32_t U_EXPORT2
925u_strCompare(const UChar *s1, int32_t length1,
926 const UChar *s2, int32_t length2,
927 UBool codePointOrder) {
928 /* argument checking */
929 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
930 return 0;
931 }
932 return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
933}
934
935/* String compare in code point order - u_strcmp() compares in code unit order. */
936U_CAPI int32_t U_EXPORT2
937u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
938 return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
939}
940
941U_CAPI int32_t U_EXPORT2
942u_strncmp(const UChar *s1,
943 const UChar *s2,
944 int32_t n)
945{
946 if(n > 0) {
947 int32_t rc;
948 for(;;) {
949 rc = (int32_t)*s1 - (int32_t)*s2;
950 if(rc != 0 || *s1 == 0 || --n == 0) {
951 return rc;
952 }
953 ++s1;
954 ++s2;
955 }
956 } else {
957 return 0;
958 }
959}
960
961U_CAPI int32_t U_EXPORT2
962u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
963 return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
964}
965
966U_CAPI UChar* U_EXPORT2
967u_strcpy(UChar *dst,
968 const UChar *src)
969{
970 UChar *anchor = dst; /* save a pointer to start of dst */
971
972 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
973 }
974
975 return anchor;
976}
977
978U_CAPI UChar* U_EXPORT2
979u_strncpy(UChar *dst,
980 const UChar *src,
981 int32_t n)
982{
983 UChar *anchor = dst; /* save a pointer to start of dst */
984
985 /* copy string 2 over */
986 while(n > 0 && (*(dst++) = *(src++)) != 0) {
987 --n;
988 }
989
990 return anchor;
991}
992
993U_CAPI int32_t U_EXPORT2
994u_strlen(const UChar *s)
995{
996#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
997 return (int32_t)uprv_wcslen((const wchar_t *)s);
998#else
999 const UChar *t = s;
1000 while(*t != 0) {
1001 ++t;
1002 }
1003 return t - s;
1004#endif
1005}
1006
1007U_CAPI int32_t U_EXPORT2
1008u_countChar32(const UChar *s, int32_t length) {
1009 int32_t count;
1010
1011 if(s==NULL || length<-1) {
1012 return 0;
1013 }
1014
1015 count=0;
1016 if(length>=0) {
1017 while(length>0) {
1018 ++count;
1019 if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1020 s+=2;
1021 length-=2;
1022 } else {
1023 ++s;
1024 --length;
1025 }
1026 }
1027 } else /* length==-1 */ {
1028 UChar c;
1029
1030 for(;;) {
1031 if((c=*s++)==0) {
1032 break;
1033 }
1034 ++count;
1035
1036 /*
1037 * sufficient to look ahead one because of UTF-16;
1038 * safe to look ahead one because at worst that would be the terminating NUL
1039 */
1040 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1041 ++s;
1042 }
1043 }
1044 }
1045 return count;
1046}
1047
1048U_CAPI UBool U_EXPORT2
1049u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1050
1051 if(number<0) {
1052 return TRUE;
1053 }
1054 if(s==NULL || length<-1) {
1055 return FALSE;
1056 }
1057
1058 if(length==-1) {
1059 /* s is NUL-terminated */
1060 UChar c;
1061
1062 /* count code points until they exceed */
1063 for(;;) {
1064 if((c=*s++)==0) {
1065 return FALSE;
1066 }
1067 if(number==0) {
1068 return TRUE;
1069 }
1070 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1071 ++s;
1072 }
1073 --number;
1074 }
1075 } else {
1076 /* length>=0 known */
1077 const UChar *limit;
1078 int32_t maxSupplementary;
1079
1080 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1081 if(((length+1)/2)>number) {
1082 return TRUE;
1083 }
1084
1085 /* check if s does not even contain enough UChars */
1086 maxSupplementary=length-number;
1087 if(maxSupplementary<=0) {
1088 return FALSE;
1089 }
1090 /* there are maxSupplementary=length-number more UChars than asked-for code points */
1091
1092 /*
1093 * count code points until they exceed and also check that there are
1094 * no more than maxSupplementary supplementary code points (UChar pairs)
1095 */
1096 limit=s+length;
1097 for(;;) {
1098 if(s==limit) {
1099 return FALSE;
1100 }
1101 if(number==0) {
1102 return TRUE;
1103 }
1104 if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1105 ++s;
1106 if(--maxSupplementary<=0) {
1107 /* too many pairs - too few code points */
1108 return FALSE;
1109 }
1110 }
1111 --number;
1112 }
1113 }
1114}
1115
1116U_CAPI UChar * U_EXPORT2
1117u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1118 if(count > 0) {
1119 uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1120 }
1121 return dest;
1122}
1123
1124U_CAPI UChar * U_EXPORT2
1125u_memmove(UChar *dest, const UChar *src, int32_t count) {
1126 if(count > 0) {
1127 uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1128 }
1129 return dest;
1130}
1131
1132U_CAPI UChar * U_EXPORT2
1133u_memset(UChar *dest, UChar c, int32_t count) {
1134 if(count > 0) {
1135 UChar *ptr = dest;
1136 UChar *limit = dest + count;
1137
1138 while (ptr < limit) {
1139 *(ptr++) = c;
1140 }
1141 }
1142 return dest;
1143}
1144
1145U_CAPI int32_t U_EXPORT2
1146u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1147 if(count > 0) {
1148 const UChar *limit = buf1 + count;
1149 int32_t result;
1150
1151 while (buf1 < limit) {
1152 result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1153 if (result != 0) {
1154 return result;
1155 }
1156 buf1++;
1157 buf2++;
1158 }
1159 }
1160 return 0;
1161}
1162
1163U_CAPI int32_t U_EXPORT2
1164u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
1165 return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
1166}
1167
1168/* u_unescape & support fns ------------------------------------------------- */
1169
1170/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1171static const UChar UNESCAPE_MAP[] = {
1172 /*" 0x22, 0x22 */
1173 /*' 0x27, 0x27 */
1174 /*? 0x3F, 0x3F */
1175 /*\ 0x5C, 0x5C */
1176 /*a*/ 0x61, 0x07,
1177 /*b*/ 0x62, 0x08,
1178 /*e*/ 0x65, 0x1b,
1179 /*f*/ 0x66, 0x0c,
1180 /*n*/ 0x6E, 0x0a,
1181 /*r*/ 0x72, 0x0d,
1182 /*t*/ 0x74, 0x09,
1183 /*v*/ 0x76, 0x0b
1184};
1185enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
1186
1187/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1188static int8_t _digit8(UChar c) {
1189 if (c >= 0x0030 && c <= 0x0037) {
1190 return (int8_t)(c - 0x0030);
1191 }
1192 return -1;
1193}
1194
1195/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1196static int8_t _digit16(UChar c) {
1197 if (c >= 0x0030 && c <= 0x0039) {
1198 return (int8_t)(c - 0x0030);
1199 }
1200 if (c >= 0x0041 && c <= 0x0046) {
1201 return (int8_t)(c - (0x0041 - 10));
1202 }
1203 if (c >= 0x0061 && c <= 0x0066) {
1204 return (int8_t)(c - (0x0061 - 10));
1205 }
1206 return -1;
1207}
1208
1209/* Parse a single escape sequence. Although this method deals in
1210 * UChars, it does not use C++ or UnicodeString. This allows it to
1211 * be used from C contexts. */
1212U_CAPI UChar32 U_EXPORT2
1213u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1214 int32_t *offset,
1215 int32_t length,
1216 void *context) {
1217
1218 int32_t start = *offset;
1219 UChar c;
1220 UChar32 result = 0;
1221 int8_t n = 0;
1222 int8_t minDig = 0;
1223 int8_t maxDig = 0;
1224 int8_t bitsPerDigit = 4;
1225 int8_t dig;
1226 int32_t i;
1227 UBool braces = FALSE;
1228
1229 /* Check that offset is in range */
1230 if (*offset < 0 || *offset >= length) {
1231 goto err;
1232 }
1233
1234 /* Fetch first UChar after '\\' */
1235 c = charAt((*offset)++, context);
1236
1237 /* Convert hexadecimal and octal escapes */
1238 switch (c) {
1239 case 0x0075 /*'u'*/:
1240 minDig = maxDig = 4;
1241 break;
1242 case 0x0055 /*'U'*/:
1243 minDig = maxDig = 8;
1244 break;
1245 case 0x0078 /*'x'*/:
1246 minDig = 1;
1247 if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1248 ++(*offset);
1249 braces = TRUE;
1250 maxDig = 8;
1251 } else {
1252 maxDig = 2;
1253 }
1254 break;
1255 default:
1256 dig = _digit8(c);
1257 if (dig >= 0) {
1258 minDig = 1;
1259 maxDig = 3;
1260 n = 1; /* Already have first octal digit */
1261 bitsPerDigit = 3;
1262 result = dig;
1263 }
1264 break;
1265 }
1266 if (minDig != 0) {
1267 while (*offset < length && n < maxDig) {
1268 c = charAt(*offset, context);
1269 dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1270 if (dig < 0) {
1271 break;
1272 }
1273 result = (result << bitsPerDigit) | dig;
1274 ++(*offset);
1275 ++n;
1276 }
1277 if (n < minDig) {
1278 goto err;
1279 }
1280 if (braces) {
1281 if (c != 0x7D /*}*/) {
1282 goto err;
1283 }
1284 ++(*offset);
1285 }
1286 if (result < 0 || result >= 0x110000) {
1287 goto err;
1288 }
1289 /* If an escape sequence specifies a lead surrogate, see if
1290 * there is a trail surrogate after it, either as an escape or
1291 * as a literal. If so, join them up into a supplementary.
1292 */
1293 if (*offset < length && U16_IS_LEAD(result)) {
1294 int32_t ahead = *offset + 1;
1295 c = charAt(*offset, context);
1296 if (c == 0x5C /*'\\'*/ && ahead < length) {
1297 c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1298 }
1299 if (U16_IS_TRAIL(c)) {
1300 *offset = ahead;
1301 result = U16_GET_SUPPLEMENTARY(result, c);
1302 }
1303 }
1304 return result;
1305 }
1306
1307 /* Convert C-style escapes in table */
1308 for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1309 if (c == UNESCAPE_MAP[i]) {
1310 return UNESCAPE_MAP[i+1];
1311 } else if (c < UNESCAPE_MAP[i]) {
1312 break;
1313 }
1314 }
1315
1316 /* Map \cX to control-X: X & 0x1F */
1317 if (c == 0x0063 /*'c'*/ && *offset < length) {
1318 c = charAt((*offset)++, context);
1319 if (U16_IS_LEAD(c) && *offset < length) {
1320 UChar c2 = charAt(*offset, context);
1321 if (U16_IS_TRAIL(c2)) {
1322 ++(*offset);
1323 c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
1324 }
1325 }
1326 return 0x1F & c;
1327 }
1328
1329 /* If no special forms are recognized, then consider
1330 * the backslash to generically escape the next character.
1331 * Deal with surrogate pairs. */
1332 if (U16_IS_LEAD(c) && *offset < length) {
1333 UChar c2 = charAt(*offset, context);
1334 if (U16_IS_TRAIL(c2)) {
1335 ++(*offset);
1336 return U16_GET_SUPPLEMENTARY(c, c2);
1337 }
1338 }
1339 return c;
1340
1341 err:
1342 /* Invalid escape sequence */
1343 *offset = start; /* Reset to initial value */
1344 return (UChar32)0xFFFFFFFF;
1345}
1346
1347/* u_unescapeAt() callback to return a UChar from a char* */
1348static UChar U_CALLCONV
1349_charPtr_charAt(int32_t offset, void *context) {
1350 UChar c16;
1351 /* It would be more efficient to access the invariant tables
1352 * directly but there is no API for that. */
1353 u_charsToUChars(((char*) context) + offset, &c16, 1);
1354 return c16;
1355}
1356
1357/* Append an escape-free segment of the text; used by u_unescape() */
1358static void _appendUChars(UChar *dest, int32_t destCapacity,
1359 const char *src, int32_t srcLen) {
1360 if (destCapacity < 0) {
1361 destCapacity = 0;
1362 }
1363 if (srcLen > destCapacity) {
1364 srcLen = destCapacity;
1365 }
1366 u_charsToUChars(src, dest, srcLen);
1367}
1368
1369/* Do an invariant conversion of char* -> UChar*, with escape parsing */
1370U_CAPI int32_t U_EXPORT2
1371u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1372 const char *segment = src;
1373 int32_t i = 0;
1374 char c;
1375
1376 while ((c=*src) != 0) {
1377 /* '\\' intentionally written as compiler-specific
1378 * character constant to correspond to compiler-specific
1379 * char* constants. */
1380 if (c == '\\') {
1381 int32_t lenParsed = 0;
1382 UChar32 c32;
1383 if (src != segment) {
1384 if (dest != NULL) {
1385 _appendUChars(dest + i, destCapacity - i,
1386 segment, (int32_t)(src - segment));
1387 }
1388 i += (int32_t)(src - segment);
1389 }
1390 ++src; /* advance past '\\' */
1391 c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1392 if (lenParsed == 0) {
1393 goto err;
1394 }
1395 src += lenParsed; /* advance past escape seq. */
1396 if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
1397 U16_APPEND_UNSAFE(dest, i, c32);
1398 } else {
1399 i += U16_LENGTH(c32);
1400 }
1401 segment = src;
1402 } else {
1403 ++src;
1404 }
1405 }
1406 if (src != segment) {
1407 if (dest != NULL) {
1408 _appendUChars(dest + i, destCapacity - i,
1409 segment, (int32_t)(src - segment));
1410 }
1411 i += (int32_t)(src - segment);
1412 }
1413 if (dest != NULL && i < destCapacity) {
1414 dest[i] = 0;
1415 }
1416 return i;
1417
1418 err:
1419 if (dest != NULL && destCapacity > 0) {
1420 *dest = 0;
1421 }
1422 return 0;
1423}
1424
1425/* NUL-termination of strings ----------------------------------------------- */
1426
1427/**
1428 * NUL-terminate a string no matter what its type.
1429 * Set warning and error codes accordingly.
1430 */
1431#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) UPRV_BLOCK_MACRO_BEGIN { \
1432 if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \
1433 /* not a public function, so no complete argument checking */ \
1434 \
1435 if(length<0) { \
1436 /* assume that the caller handles this */ \
1437 } else if(length<destCapacity) { \
1438 /* NUL-terminate the string, the NUL fits */ \
1439 dest[length]=0; \
1440 /* unset the not-terminated warning but leave all others */ \
1441 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
1442 *pErrorCode=U_ZERO_ERROR; \
1443 } \
1444 } else if(length==destCapacity) { \
1445 /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1446 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
1447 } else /* length>destCapacity */ { \
1448 /* even the string itself did not fit - set an error code */ \
1449 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
1450 } \
1451 } \
1452} UPRV_BLOCK_MACRO_END
1453
1454U_CAPI UChar U_EXPORT2
1455u_asciiToUpper(UChar c) {
1456 if (u'a' <= c && c <= u'z') {
1457 c = c + u'A' - u'a';
1458 }
1459 return c;
1460}
1461
1462U_CAPI int32_t U_EXPORT2
1463u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1464 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1465 return length;
1466}
1467
1468U_CAPI int32_t U_EXPORT2
1469u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1470 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1471 return length;
1472}
1473
1474U_CAPI int32_t U_EXPORT2
1475u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1476 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1477 return length;
1478}
1479
1480U_CAPI int32_t U_EXPORT2
1481u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1482 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1483 return length;
1484}
1485
1486// Compute the hash code for a string -------------------------------------- ***
1487
1488// Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1489// on UHashtable code.
1490
1491/*
1492 Compute the hash by iterating sparsely over about 32 (up to 63)
1493 characters spaced evenly through the string. For each character,
1494 multiply the previous hash value by a prime number and add the new
1495 character in, like a linear congruential random number generator,
1496 producing a pseudorandom deterministic value well distributed over
1497 the output range. [LIU]
1498*/
1499
1500#define STRING_HASH(TYPE, STR, STRLEN, DEREF) UPRV_BLOCK_MACRO_BEGIN { \
1501 uint32_t hash = 0; \
1502 const TYPE *p = (const TYPE*) STR; \
1503 if (p != NULL) { \
1504 int32_t len = (int32_t)(STRLEN); \
1505 int32_t inc = ((len - 32) / 32) + 1; \
1506 const TYPE *limit = p + len; \
1507 while (p<limit) { \
1508 hash = (hash * 37) + DEREF; \
1509 p += inc; \
1510 } \
1511 } \
1512 return static_cast<int32_t>(hash); \
1513} UPRV_BLOCK_MACRO_END
1514
1515/* Used by UnicodeString to compute its hashcode - Not public API. */
1516U_CAPI int32_t U_EXPORT2
1517ustr_hashUCharsN(const UChar *str, int32_t length) {
1518 STRING_HASH(UChar, str, length, *p);
1519}
1520
1521U_CAPI int32_t U_EXPORT2
1522ustr_hashCharsN(const char *str, int32_t length) {
1523 STRING_HASH(uint8_t, str, length, *p);
1524}
1525
1526U_CAPI int32_t U_EXPORT2
1527ustr_hashICharsN(const char *str, int32_t length) {
1528 STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
1529}
1530