1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2002-2012, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uiter.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2002jan18
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20#include "unicode/ustring.h"
21#include "unicode/chariter.h"
22#include "unicode/rep.h"
23#include "unicode/uiter.h"
24#include "unicode/utf.h"
25#include "unicode/utf8.h"
26#include "unicode/utf16.h"
27#include "cstring.h"
28
29U_NAMESPACE_USE
30
/* True if the integer n has its lowest bit clear, i.e., is divisible by 2. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * True if the address p is even (2-byte aligned).
 * The argument is parenthesized before the cast so that a pointer expression
 * such as (s+1) is converted as a whole rather than only its first operand.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
33
34U_CDECL_BEGIN
35
36/* No-Op UCharIterator implementation for illegal input --------------------- */
37
38static int32_t U_CALLCONV
39noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
40 return 0;
41}
42
43static int32_t U_CALLCONV
44noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
45 return 0;
46}
47
48static UBool U_CALLCONV
49noopHasNext(UCharIterator * /*iter*/) {
50 return false;
51}
52
53static UChar32 U_CALLCONV
54noopCurrent(UCharIterator * /*iter*/) {
55 return U_SENTINEL;
56}
57
58static uint32_t U_CALLCONV
59noopGetState(const UCharIterator * /*iter*/) {
60 return UITER_NO_STATE;
61}
62
63static void U_CALLCONV
64noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
65 *pErrorCode=U_UNSUPPORTED_ERROR;
66}
67
68static const UCharIterator noopIterator={
69 0, 0, 0, 0, 0, 0,
70 noopGetIndex,
71 noopMove,
72 noopHasNext,
73 noopHasNext,
74 noopCurrent,
75 noopCurrent,
76 noopCurrent,
77 nullptr,
78 noopGetState,
79 noopSetState
80};
81
82/* UCharIterator implementation for simple strings -------------------------- */
83
84/*
85 * This is an implementation of a code unit (char16_t) iterator
86 * for char16_t * strings.
87 *
88 * The UCharIterator.context field holds a pointer to the string.
89 */
90
91static int32_t U_CALLCONV
92stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
93 switch(origin) {
94 case UITER_ZERO:
95 return 0;
96 case UITER_START:
97 return iter->start;
98 case UITER_CURRENT:
99 return iter->index;
100 case UITER_LIMIT:
101 return iter->limit;
102 case UITER_LENGTH:
103 return iter->length;
104 default:
105 /* not a valid origin */
106 /* Should never get here! */
107 return -1;
108 }
109}
110
111static int32_t U_CALLCONV
112stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
113 int32_t pos;
114
115 switch(origin) {
116 case UITER_ZERO:
117 pos=delta;
118 break;
119 case UITER_START:
120 pos=iter->start+delta;
121 break;
122 case UITER_CURRENT:
123 pos=iter->index+delta;
124 break;
125 case UITER_LIMIT:
126 pos=iter->limit+delta;
127 break;
128 case UITER_LENGTH:
129 pos=iter->length+delta;
130 break;
131 default:
132 return -1; /* Error */
133 }
134
135 if(pos<iter->start) {
136 pos=iter->start;
137 } else if(pos>iter->limit) {
138 pos=iter->limit;
139 }
140
141 return iter->index=pos;
142}
143
144static UBool U_CALLCONV
145stringIteratorHasNext(UCharIterator *iter) {
146 return iter->index<iter->limit;
147}
148
149static UBool U_CALLCONV
150stringIteratorHasPrevious(UCharIterator *iter) {
151 return iter->index>iter->start;
152}
153
154static UChar32 U_CALLCONV
155stringIteratorCurrent(UCharIterator *iter) {
156 if(iter->index<iter->limit) {
157 return ((const char16_t *)(iter->context))[iter->index];
158 } else {
159 return U_SENTINEL;
160 }
161}
162
163static UChar32 U_CALLCONV
164stringIteratorNext(UCharIterator *iter) {
165 if(iter->index<iter->limit) {
166 return ((const char16_t *)(iter->context))[iter->index++];
167 } else {
168 return U_SENTINEL;
169 }
170}
171
172static UChar32 U_CALLCONV
173stringIteratorPrevious(UCharIterator *iter) {
174 if(iter->index>iter->start) {
175 return ((const char16_t *)(iter->context))[--iter->index];
176 } else {
177 return U_SENTINEL;
178 }
179}
180
181static uint32_t U_CALLCONV
182stringIteratorGetState(const UCharIterator *iter) {
183 return (uint32_t)iter->index;
184}
185
186static void U_CALLCONV
187stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
188 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
189 /* do nothing */
190 } else if(iter==nullptr) {
191 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
192 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
193 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
194 } else {
195 iter->index=(int32_t)state;
196 }
197}
198
199static const UCharIterator stringIterator={
200 0, 0, 0, 0, 0, 0,
201 stringIteratorGetIndex,
202 stringIteratorMove,
203 stringIteratorHasNext,
204 stringIteratorHasPrevious,
205 stringIteratorCurrent,
206 stringIteratorNext,
207 stringIteratorPrevious,
208 nullptr,
209 stringIteratorGetState,
210 stringIteratorSetState
211};
212
213U_CAPI void U_EXPORT2
214uiter_setString(UCharIterator *iter, const char16_t *s, int32_t length) {
215 if(iter!=0) {
216 if(s!=0 && length>=-1) {
217 *iter=stringIterator;
218 iter->context=s;
219 if(length>=0) {
220 iter->length=length;
221 } else {
222 iter->length=u_strlen(s);
223 }
224 iter->limit=iter->length;
225 } else {
226 *iter=noopIterator;
227 }
228 }
229}
230
231/* UCharIterator implementation for UTF-16BE strings ------------------------ */
232
233/*
234 * This is an implementation of a code unit (char16_t) iterator
235 * for UTF-16BE strings, i.e., strings in byte-vectors where
236 * each char16_t is stored as a big-endian pair of bytes.
237 *
238 * The UCharIterator.context field holds a pointer to the string.
239 * Everything works just like with a normal char16_t iterator (uiter_setString),
240 * except that UChars are assembled from byte pairs.
241 */
242
243/* internal helper function */
244static inline UChar32
245utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
246 const uint8_t *p=(const uint8_t *)iter->context;
247 return ((char16_t)p[2*index]<<8)|(char16_t)p[2*index+1];
248}
249
250static UChar32 U_CALLCONV
251utf16BEIteratorCurrent(UCharIterator *iter) {
252 int32_t index;
253
254 if((index=iter->index)<iter->limit) {
255 return utf16BEIteratorGet(iter, index);
256 } else {
257 return U_SENTINEL;
258 }
259}
260
261static UChar32 U_CALLCONV
262utf16BEIteratorNext(UCharIterator *iter) {
263 int32_t index;
264
265 if((index=iter->index)<iter->limit) {
266 iter->index=index+1;
267 return utf16BEIteratorGet(iter, index);
268 } else {
269 return U_SENTINEL;
270 }
271}
272
273static UChar32 U_CALLCONV
274utf16BEIteratorPrevious(UCharIterator *iter) {
275 int32_t index;
276
277 if((index=iter->index)>iter->start) {
278 iter->index=--index;
279 return utf16BEIteratorGet(iter, index);
280 } else {
281 return U_SENTINEL;
282 }
283}
284
285static const UCharIterator utf16BEIterator={
286 0, 0, 0, 0, 0, 0,
287 stringIteratorGetIndex,
288 stringIteratorMove,
289 stringIteratorHasNext,
290 stringIteratorHasPrevious,
291 utf16BEIteratorCurrent,
292 utf16BEIteratorNext,
293 utf16BEIteratorPrevious,
294 nullptr,
295 stringIteratorGetState,
296 stringIteratorSetState
297};
298
299/*
300 * Count the number of UChars in a UTF-16BE string before a terminating char16_t NUL,
301 * i.e., before a pair of 0 bytes where the first 0 byte is at an even
302 * offset from s.
303 */
304static int32_t
305utf16BE_strlen(const char *s) {
306 if(IS_POINTER_EVEN(s)) {
307 /*
308 * even-aligned, call u_strlen(s)
309 * we are probably on a little-endian machine, but searching for char16_t NUL
310 * does not care about endianness
311 */
312 return u_strlen((const char16_t *)s);
313 } else {
314 /* odd-aligned, search for pair of 0 bytes */
315 const char *p=s;
316
317 while(!(*p==0 && p[1]==0)) {
318 p+=2;
319 }
320 return (int32_t)((p-s)/2);
321 }
322}
323
324U_CAPI void U_EXPORT2
325uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
326 if(iter!=nullptr) {
327 /* allow only even-length strings (the input length counts bytes) */
328 if(s!=nullptr && (length==-1 || (length>=0 && IS_EVEN(length)))) {
329 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
330 length>>=1;
331
332 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
333 /* big-endian machine and 2-aligned UTF-16BE string: use normal char16_t iterator */
334 uiter_setString(iter, (const char16_t *)s, length);
335 return;
336 }
337
338 *iter=utf16BEIterator;
339 iter->context=s;
340 if(length>=0) {
341 iter->length=length;
342 } else {
343 iter->length=utf16BE_strlen(s);
344 }
345 iter->limit=iter->length;
346 } else {
347 *iter=noopIterator;
348 }
349 }
350}
351
352/* UCharIterator wrapper around CharacterIterator --------------------------- */
353
354/*
355 * This is wrapper code around a C++ CharacterIterator to
356 * look like a C UCharIterator.
357 *
358 * The UCharIterator.context field holds a pointer to the CharacterIterator.
359 */
360
361static int32_t U_CALLCONV
362characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
363 switch(origin) {
364 case UITER_ZERO:
365 return 0;
366 case UITER_START:
367 return ((CharacterIterator *)(iter->context))->startIndex();
368 case UITER_CURRENT:
369 return ((CharacterIterator *)(iter->context))->getIndex();
370 case UITER_LIMIT:
371 return ((CharacterIterator *)(iter->context))->endIndex();
372 case UITER_LENGTH:
373 return ((CharacterIterator *)(iter->context))->getLength();
374 default:
375 /* not a valid origin */
376 /* Should never get here! */
377 return -1;
378 }
379}
380
381static int32_t U_CALLCONV
382characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
383 switch(origin) {
384 case UITER_ZERO:
385 ((CharacterIterator *)(iter->context))->setIndex(delta);
386 return ((CharacterIterator *)(iter->context))->getIndex();
387 case UITER_START:
388 case UITER_CURRENT:
389 case UITER_LIMIT:
390 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
391 case UITER_LENGTH:
392 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
393 return ((CharacterIterator *)(iter->context))->getIndex();
394 default:
395 /* not a valid origin */
396 /* Should never get here! */
397 return -1;
398 }
399}
400
401static UBool U_CALLCONV
402characterIteratorHasNext(UCharIterator *iter) {
403 return ((CharacterIterator *)(iter->context))->hasNext();
404}
405
406static UBool U_CALLCONV
407characterIteratorHasPrevious(UCharIterator *iter) {
408 return ((CharacterIterator *)(iter->context))->hasPrevious();
409}
410
411static UChar32 U_CALLCONV
412characterIteratorCurrent(UCharIterator *iter) {
413 UChar32 c;
414
415 c=((CharacterIterator *)(iter->context))->current();
416 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
417 return c;
418 } else {
419 return U_SENTINEL;
420 }
421}
422
423static UChar32 U_CALLCONV
424characterIteratorNext(UCharIterator *iter) {
425 if(((CharacterIterator *)(iter->context))->hasNext()) {
426 return ((CharacterIterator *)(iter->context))->nextPostInc();
427 } else {
428 return U_SENTINEL;
429 }
430}
431
432static UChar32 U_CALLCONV
433characterIteratorPrevious(UCharIterator *iter) {
434 if(((CharacterIterator *)(iter->context))->hasPrevious()) {
435 return ((CharacterIterator *)(iter->context))->previous();
436 } else {
437 return U_SENTINEL;
438 }
439}
440
441static uint32_t U_CALLCONV
442characterIteratorGetState(const UCharIterator *iter) {
443 return ((CharacterIterator *)(iter->context))->getIndex();
444}
445
446static void U_CALLCONV
447characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
448 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
449 /* do nothing */
450 } else if(iter==nullptr || iter->context==nullptr) {
451 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
452 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
453 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
454 } else {
455 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
456 }
457}
458
459static const UCharIterator characterIteratorWrapper={
460 0, 0, 0, 0, 0, 0,
461 characterIteratorGetIndex,
462 characterIteratorMove,
463 characterIteratorHasNext,
464 characterIteratorHasPrevious,
465 characterIteratorCurrent,
466 characterIteratorNext,
467 characterIteratorPrevious,
468 nullptr,
469 characterIteratorGetState,
470 characterIteratorSetState
471};
472
473U_CAPI void U_EXPORT2
474uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
475 if(iter!=0) {
476 if(charIter!=0) {
477 *iter=characterIteratorWrapper;
478 iter->context=charIter;
479 } else {
480 *iter=noopIterator;
481 }
482 }
483}
484
485/* UCharIterator wrapper around Replaceable --------------------------------- */
486
487/*
488 * This is an implementation of a code unit (char16_t) iterator
489 * based on a Replaceable object.
490 *
491 * The UCharIterator.context field holds a pointer to the Replaceable.
492 * UCharIterator.length and UCharIterator.index hold Replaceable.length()
493 * and the iteration index.
494 */
495
496static UChar32 U_CALLCONV
497replaceableIteratorCurrent(UCharIterator *iter) {
498 if(iter->index<iter->limit) {
499 return ((Replaceable *)(iter->context))->charAt(iter->index);
500 } else {
501 return U_SENTINEL;
502 }
503}
504
505static UChar32 U_CALLCONV
506replaceableIteratorNext(UCharIterator *iter) {
507 if(iter->index<iter->limit) {
508 return ((Replaceable *)(iter->context))->charAt(iter->index++);
509 } else {
510 return U_SENTINEL;
511 }
512}
513
514static UChar32 U_CALLCONV
515replaceableIteratorPrevious(UCharIterator *iter) {
516 if(iter->index>iter->start) {
517 return ((Replaceable *)(iter->context))->charAt(--iter->index);
518 } else {
519 return U_SENTINEL;
520 }
521}
522
523static const UCharIterator replaceableIterator={
524 0, 0, 0, 0, 0, 0,
525 stringIteratorGetIndex,
526 stringIteratorMove,
527 stringIteratorHasNext,
528 stringIteratorHasPrevious,
529 replaceableIteratorCurrent,
530 replaceableIteratorNext,
531 replaceableIteratorPrevious,
532 nullptr,
533 stringIteratorGetState,
534 stringIteratorSetState
535};
536
537U_CAPI void U_EXPORT2
538uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
539 if(iter!=0) {
540 if(rep!=0) {
541 *iter=replaceableIterator;
542 iter->context=rep;
543 iter->limit=iter->length=rep->length();
544 } else {
545 *iter=noopIterator;
546 }
547 }
548}
549
550/* UCharIterator implementation for UTF-8 strings --------------------------- */
551
/*
 * Possible, but probably necessary only for an implementation for arbitrary
 * converters:
 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
 */
559
/*
 * Capacity constant (16). Not referenced by any code in this file as shown.
 * NOTE(review): presumably intended for the buffered, converter-based
 * implementation sketched in the comment above -- confirm before removing.
 */
#define UITER_CNV_CAPACITY 16
561
562/*
563 * Minimal implementation:
564 * Maintain a single-char16_t buffer for an additional surrogate.
565 * The caller must not modify start and limit because they are used internally.
566 *
567 * Use UCharIterator fields as follows:
568 * context pointer to UTF-8 string
569 * length UTF-16 length of the string; -1 until lazy evaluation
570 * start current UTF-8 index
571 * index current UTF-16 index; may be -1="unknown" after setState()
572 * limit UTF-8 length of the string
573 * reservedField supplementary code point
574 *
575 * Since UCharIterator delivers 16-bit code units, the iteration can be
576 * currently in the middle of the byte sequence for a supplementary code point.
577 * In this case, reservedField will contain that code point and start will
578 * point to after the corresponding byte sequence. The UTF-16 index will be
579 * one less than what it would otherwise be corresponding to the UTF-8 index.
580 * Otherwise, reservedField will be 0.
581 */
582
583/*
584 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
585 * Add implementations that do not call strlen() for iteration but check for NUL.
586 */
587
588static int32_t U_CALLCONV
589utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
590 switch(origin) {
591 case UITER_ZERO:
592 case UITER_START:
593 return 0;
594 case UITER_CURRENT:
595 if(iter->index<0) {
596 /* the current UTF-16 index is unknown after setState(), count from the beginning */
597 const uint8_t *s;
598 UChar32 c;
599 int32_t i, limit, index;
600
601 s=(const uint8_t *)iter->context;
602 i=index=0;
603 limit=iter->start; /* count up to the UTF-8 index */
604 while(i<limit) {
605 U8_NEXT_OR_FFFD(s, i, limit, c);
606 index+=U16_LENGTH(c);
607 }
608
609 iter->start=i; /* just in case setState() did not get us to a code point boundary */
610 if(i==iter->limit) {
611 iter->length=index; /* in case it was <0 or wrong */
612 }
613 if(iter->reservedField!=0) {
614 --index; /* we are in the middle of a supplementary code point */
615 }
616 iter->index=index;
617 }
618 return iter->index;
619 case UITER_LIMIT:
620 case UITER_LENGTH:
621 if(iter->length<0) {
622 const uint8_t *s;
623 UChar32 c;
624 int32_t i, limit, length;
625
626 s=(const uint8_t *)iter->context;
627 if(iter->index<0) {
628 /*
629 * the current UTF-16 index is unknown after setState(),
630 * we must first count from the beginning to here
631 */
632 i=length=0;
633 limit=iter->start;
634
635 /* count from the beginning to the current index */
636 while(i<limit) {
637 U8_NEXT_OR_FFFD(s, i, limit, c);
638 length+=U16_LENGTH(c);
639 }
640
641 /* assume i==limit==iter->start, set the UTF-16 index */
642 iter->start=i; /* just in case setState() did not get us to a code point boundary */
643 iter->index= iter->reservedField!=0 ? length-1 : length;
644 } else {
645 i=iter->start;
646 length=iter->index;
647 if(iter->reservedField!=0) {
648 ++length;
649 }
650 }
651
652 /* count from the current index to the end */
653 limit=iter->limit;
654 while(i<limit) {
655 U8_NEXT_OR_FFFD(s, i, limit, c);
656 length+=U16_LENGTH(c);
657 }
658 iter->length=length;
659 }
660 return iter->length;
661 default:
662 /* not a valid origin */
663 /* Should never get here! */
664 return -1;
665 }
666}
667
668static int32_t U_CALLCONV
669utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
670 const uint8_t *s;
671 UChar32 c;
672 int32_t pos; /* requested UTF-16 index */
673 int32_t i; /* UTF-8 index */
674 UBool havePos;
675
676 /* calculate the requested UTF-16 index */
677 switch(origin) {
678 case UITER_ZERO:
679 case UITER_START:
680 pos=delta;
681 havePos=true;
682 /* iter->index<0 (unknown) is possible */
683 break;
684 case UITER_CURRENT:
685 if(iter->index>=0) {
686 pos=iter->index+delta;
687 havePos=true;
688 } else {
689 /* the current UTF-16 index is unknown after setState(), use only delta */
690 pos=0;
691 havePos=false;
692 }
693 break;
694 case UITER_LIMIT:
695 case UITER_LENGTH:
696 if(iter->length>=0) {
697 pos=iter->length+delta;
698 havePos=true;
699 } else {
700 /* pin to the end, avoid counting the length */
701 iter->index=-1;
702 iter->start=iter->limit;
703 iter->reservedField=0;
704 if(delta>=0) {
705 return UITER_UNKNOWN_INDEX;
706 } else {
707 /* the current UTF-16 index is unknown, use only delta */
708 pos=0;
709 havePos=false;
710 }
711 }
712 break;
713 default:
714 return -1; /* Error */
715 }
716
717 if(havePos) {
718 /* shortcuts: pinning to the edges of the string */
719 if(pos<=0) {
720 iter->index=iter->start=iter->reservedField=0;
721 return 0;
722 } else if(iter->length>=0 && pos>=iter->length) {
723 iter->index=iter->length;
724 iter->start=iter->limit;
725 iter->reservedField=0;
726 return iter->index;
727 }
728
729 /* minimize the number of U8_NEXT/PREV operations */
730 if(iter->index<0 || pos<iter->index/2) {
731 /* go forward from the start instead of backward from the current index */
732 iter->index=iter->start=iter->reservedField=0;
733 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
734 /*
735 * if we have the UTF-16 index and length and the new position is
736 * closer to the end than the current index,
737 * then go backward from the end instead of forward from the current index
738 */
739 iter->index=iter->length;
740 iter->start=iter->limit;
741 iter->reservedField=0;
742 }
743
744 delta=pos-iter->index;
745 if(delta==0) {
746 return iter->index; /* nothing to do */
747 }
748 } else {
749 /* move relative to unknown UTF-16 index */
750 if(delta==0) {
751 return UITER_UNKNOWN_INDEX; /* nothing to do */
752 } else if(-delta>=iter->start) {
753 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
754 iter->index=iter->start=iter->reservedField=0;
755 return 0;
756 } else if(delta>=(iter->limit-iter->start)) {
757 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
758 iter->index=iter->length; /* may or may not be <0 (unknown) */
759 iter->start=iter->limit;
760 iter->reservedField=0;
761 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
762 }
763 }
764
765 /* delta!=0 */
766
767 /* move towards the requested position, pin to the edges of the string */
768 s=(const uint8_t *)iter->context;
769 pos=iter->index; /* could be <0 (unknown) */
770 i=iter->start;
771 if(delta>0) {
772 /* go forward */
773 int32_t limit=iter->limit;
774 if(iter->reservedField!=0) {
775 iter->reservedField=0;
776 ++pos;
777 --delta;
778 }
779 while(delta>0 && i<limit) {
780 U8_NEXT_OR_FFFD(s, i, limit, c);
781 if(c<=0xffff) {
782 ++pos;
783 --delta;
784 } else if(delta>=2) {
785 pos+=2;
786 delta-=2;
787 } else /* delta==1 */ {
788 /* stop in the middle of a supplementary code point */
789 iter->reservedField=c;
790 ++pos;
791 break; /* delta=0; */
792 }
793 }
794 if(i==limit) {
795 if(iter->length<0 && iter->index>=0) {
796 iter->length= iter->reservedField==0 ? pos : pos+1;
797 } else if(iter->index<0 && iter->length>=0) {
798 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
799 }
800 }
801 } else /* delta<0 */ {
802 /* go backward */
803 if(iter->reservedField!=0) {
804 iter->reservedField=0;
805 i-=4; /* we stayed behind the supplementary code point; go before it now */
806 --pos;
807 ++delta;
808 }
809 while(delta<0 && i>0) {
810 U8_PREV_OR_FFFD(s, 0, i, c);
811 if(c<=0xffff) {
812 --pos;
813 ++delta;
814 } else if(delta<=-2) {
815 pos-=2;
816 delta+=2;
817 } else /* delta==-1 */ {
818 /* stop in the middle of a supplementary code point */
819 i+=4; /* back to behind this supplementary code point for consistent state */
820 iter->reservedField=c;
821 --pos;
822 break; /* delta=0; */
823 }
824 }
825 }
826
827 iter->start=i;
828 if(iter->index>=0) {
829 return iter->index=pos;
830 } else {
831 /* we started with index<0 (unknown) so pos is bogus */
832 if(i<=1) {
833 return iter->index=i; /* reached the beginning */
834 } else {
835 /* we still don't know the UTF-16 index */
836 return UITER_UNKNOWN_INDEX;
837 }
838 }
839}
840
841static UBool U_CALLCONV
842utf8IteratorHasNext(UCharIterator *iter) {
843 return iter->start<iter->limit || iter->reservedField!=0;
844}
845
846static UBool U_CALLCONV
847utf8IteratorHasPrevious(UCharIterator *iter) {
848 return iter->start>0;
849}
850
851static UChar32 U_CALLCONV
852utf8IteratorCurrent(UCharIterator *iter) {
853 if(iter->reservedField!=0) {
854 return U16_TRAIL(iter->reservedField);
855 } else if(iter->start<iter->limit) {
856 const uint8_t *s=(const uint8_t *)iter->context;
857 UChar32 c;
858 int32_t i=iter->start;
859
860 U8_NEXT_OR_FFFD(s, i, iter->limit, c);
861 if(c<=0xffff) {
862 return c;
863 } else {
864 return U16_LEAD(c);
865 }
866 } else {
867 return U_SENTINEL;
868 }
869}
870
871static UChar32 U_CALLCONV
872utf8IteratorNext(UCharIterator *iter) {
873 int32_t index;
874
875 if(iter->reservedField!=0) {
876 char16_t trail=U16_TRAIL(iter->reservedField);
877 iter->reservedField=0;
878 if((index=iter->index)>=0) {
879 iter->index=index+1;
880 }
881 return trail;
882 } else if(iter->start<iter->limit) {
883 const uint8_t *s=(const uint8_t *)iter->context;
884 UChar32 c;
885
886 U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
887 if((index=iter->index)>=0) {
888 iter->index=++index;
889 if(iter->length<0 && iter->start==iter->limit) {
890 iter->length= c<=0xffff ? index : index+1;
891 }
892 } else if(iter->start==iter->limit && iter->length>=0) {
893 iter->index= c<=0xffff ? iter->length : iter->length-1;
894 }
895 if(c<=0xffff) {
896 return c;
897 } else {
898 iter->reservedField=c;
899 return U16_LEAD(c);
900 }
901 } else {
902 return U_SENTINEL;
903 }
904}
905
906static UChar32 U_CALLCONV
907utf8IteratorPrevious(UCharIterator *iter) {
908 int32_t index;
909
910 if(iter->reservedField!=0) {
911 char16_t lead=U16_LEAD(iter->reservedField);
912 iter->reservedField=0;
913 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
914 if((index=iter->index)>0) {
915 iter->index=index-1;
916 }
917 return lead;
918 } else if(iter->start>0) {
919 const uint8_t *s=(const uint8_t *)iter->context;
920 UChar32 c;
921
922 U8_PREV_OR_FFFD(s, 0, iter->start, c);
923 if((index=iter->index)>0) {
924 iter->index=index-1;
925 } else if(iter->start<=1) {
926 iter->index= c<=0xffff ? iter->start : iter->start+1;
927 }
928 if(c<=0xffff) {
929 return c;
930 } else {
931 iter->start+=4; /* back to behind this supplementary code point for consistent state */
932 iter->reservedField=c;
933 return U16_TRAIL(c);
934 }
935 } else {
936 return U_SENTINEL;
937 }
938}
939
940static uint32_t U_CALLCONV
941utf8IteratorGetState(const UCharIterator *iter) {
942 uint32_t state=(uint32_t)(iter->start<<1);
943 if(iter->reservedField!=0) {
944 state|=1;
945 }
946 return state;
947}
948
949static void U_CALLCONV
950utf8IteratorSetState(UCharIterator *iter,
951 uint32_t state,
952 UErrorCode *pErrorCode)
953{
954 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
955 /* do nothing */
956 } else if(iter==nullptr) {
957 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
958 } else if(state==utf8IteratorGetState(iter)) {
959 /* setting to the current state: no-op */
960 } else {
961 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
962 state&=1; /* 1 if in surrogate pair, must be index>=4 */
963
964 if((state==0 ? index<0 : index<4) || iter->limit<index) {
965 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
966 } else {
967 iter->start=index; /* restore UTF-8 byte index */
968 if(index<=1) {
969 iter->index=index;
970 } else {
971 iter->index=-1; /* unknown UTF-16 index */
972 }
973 if(state==0) {
974 iter->reservedField=0;
975 } else {
976 /* verified index>=4 above */
977 UChar32 c;
978 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
979 if(c<=0xffff) {
980 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
981 } else {
982 iter->reservedField=c;
983 }
984 }
985 }
986 }
987}
988
989static const UCharIterator utf8Iterator={
990 0, 0, 0, 0, 0, 0,
991 utf8IteratorGetIndex,
992 utf8IteratorMove,
993 utf8IteratorHasNext,
994 utf8IteratorHasPrevious,
995 utf8IteratorCurrent,
996 utf8IteratorNext,
997 utf8IteratorPrevious,
998 nullptr,
999 utf8IteratorGetState,
1000 utf8IteratorSetState
1001};
1002
1003U_CAPI void U_EXPORT2
1004uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1005 if(iter!=0) {
1006 if(s!=0 && length>=-1) {
1007 *iter=utf8Iterator;
1008 iter->context=s;
1009 if(length>=0) {
1010 iter->limit=length;
1011 } else {
1012 iter->limit=(int32_t)uprv_strlen(s);
1013 }
1014 iter->length= iter->limit<=1 ? iter->limit : -1;
1015 } else {
1016 *iter=noopIterator;
1017 }
1018 }
1019}
1020
1021/* Helper functions --------------------------------------------------------- */
1022
1023U_CAPI UChar32 U_EXPORT2
1024uiter_current32(UCharIterator *iter) {
1025 UChar32 c, c2;
1026
1027 c=iter->current(iter);
1028 if(U16_IS_SURROGATE(c)) {
1029 if(U16_IS_SURROGATE_LEAD(c)) {
1030 /*
1031 * go to the next code unit
1032 * we know that we are not at the limit because c!=U_SENTINEL
1033 */
1034 iter->move(iter, 1, UITER_CURRENT);
1035 if(U16_IS_TRAIL(c2=iter->current(iter))) {
1036 c=U16_GET_SUPPLEMENTARY(c, c2);
1037 }
1038
1039 /* undo index movement */
1040 iter->move(iter, -1, UITER_CURRENT);
1041 } else {
1042 if(U16_IS_LEAD(c2=iter->previous(iter))) {
1043 c=U16_GET_SUPPLEMENTARY(c2, c);
1044 }
1045 if(c2>=0) {
1046 /* undo index movement */
1047 iter->move(iter, 1, UITER_CURRENT);
1048 }
1049 }
1050 }
1051 return c;
1052}
1053
1054U_CAPI UChar32 U_EXPORT2
1055uiter_next32(UCharIterator *iter) {
1056 UChar32 c, c2;
1057
1058 c=iter->next(iter);
1059 if(U16_IS_LEAD(c)) {
1060 if(U16_IS_TRAIL(c2=iter->next(iter))) {
1061 c=U16_GET_SUPPLEMENTARY(c, c2);
1062 } else if(c2>=0) {
1063 /* unmatched first surrogate, undo index movement */
1064 iter->move(iter, -1, UITER_CURRENT);
1065 }
1066 }
1067 return c;
1068}
1069
1070U_CAPI UChar32 U_EXPORT2
1071uiter_previous32(UCharIterator *iter) {
1072 UChar32 c, c2;
1073
1074 c=iter->previous(iter);
1075 if(U16_IS_TRAIL(c)) {
1076 if(U16_IS_LEAD(c2=iter->previous(iter))) {
1077 c=U16_GET_SUPPLEMENTARY(c2, c);
1078 } else if(c2>=0) {
1079 /* unmatched second surrogate, undo index movement */
1080 iter->move(iter, 1, UITER_CURRENT);
1081 }
1082 }
1083 return c;
1084}
1085
1086U_CAPI uint32_t U_EXPORT2
1087uiter_getState(const UCharIterator *iter) {
1088 if(iter==nullptr || iter->getState==nullptr) {
1089 return UITER_NO_STATE;
1090 } else {
1091 return iter->getState(iter);
1092 }
1093}
1094
1095U_CAPI void U_EXPORT2
1096uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1097 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
1098 /* do nothing */
1099 } else if(iter==nullptr) {
1100 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1101 } else if(iter->setState==nullptr) {
1102 *pErrorCode=U_UNSUPPORTED_ERROR;
1103 } else {
1104 iter->setState(iter, state, pErrorCode);
1105 }
1106}
1107
1108U_CDECL_END
1109