1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4********************************************************************
5*
6* Copyright (C) 1997-2011, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9********************************************************************
10*/
11
12#ifndef CHARITER_H
13#define CHARITER_H
14
15#include "unicode/utypes.h"
16
17#if U_SHOW_CPLUSPLUS_API
18
19#include "unicode/uobject.h"
20#include "unicode/unistr.h"
21/**
22 * \file
23 * \brief C++ API: Character Iterator
24 */
25
26U_NAMESPACE_BEGIN
27/**
28 * Abstract class that defines an API for forward-only iteration
29 * on text objects.
30 * This is a minimal interface for iteration without random access
31 * or backwards iteration. It is especially useful for wrapping
32 * streams with converters into an object for collation or
33 * normalization.
34 *
35 * <p>Characters can be accessed in two ways: as code units or as
36 * code points.
37 * Unicode code points are 21-bit integers and are the scalar values
38 * of Unicode characters. ICU uses the type UChar32 for them.
39 * Unicode code units are the storage units of a given
40 * Unicode/UCS Transformation Format (a character encoding scheme).
41 * With UTF-16, all code points can be represented with either one
42 * or two code units ("surrogates").
43 * String storage is typically based on code units, while properties
44 * of characters are typically determined using code point values.
45 * Some processes may be designed to work with sequences of code units,
46 * or it may be known that all characters that are important to an
47 * algorithm can be represented with single code units.
48 * Other processes will need to use the code point access functions.</p>
49 *
50 * <p>ForwardCharacterIterator provides nextPostInc() to access
51 * a code unit and advance an internal position into the text object,
52 * similar to a <code>return text[position++]</code>.<br>
53 * It provides next32PostInc() to access a code point and advance an internal
54 * position.</p>
55 *
56 * <p>next32PostInc() assumes that the current position is that of
57 * the beginning of a code point, i.e., of its first code unit.
58 * After next32PostInc(), this will be true again.
59 * In general, access to code units and code points in the same
60 * iteration loop should not be mixed. In UTF-16, if the current position
61 * is on a second code unit (Low Surrogate), then only that code unit
62 * is returned even by next32PostInc().</p>
63 *
64 * <p>For iteration with either function, there are two ways to
65 * check for the end of the iteration. When there are no more
66 * characters in the text object:
67 * <ul>
68 * <li>The hasNext() function returns false.</li>
69 * <li>nextPostInc() and next32PostInc() return DONE
70 * when one attempts to read beyond the end of the text object.</li>
71 * </ul>
72 *
73 * Example:
74 * \code
75 * void function1(ForwardCharacterIterator &it) {
76 * UChar32 c;
77 * while(it.hasNext()) {
78 * c=it.next32PostInc();
79 * // use c
80 * }
81 * }
82 *
 * void function2(ForwardCharacterIterator &it) {
84 * char16_t c;
85 * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
86 * // use c
87 * }
88 * }
89 * \endcode
90 * </p>
91 *
92 * @stable ICU 2.0
93 */
94class U_COMMON_API ForwardCharacterIterator : public UObject {
95public:
96 /**
97 * Value returned by most of ForwardCharacterIterator's functions
98 * when the iterator has reached the limits of its iteration.
99 * @stable ICU 2.0
100 */
101 enum { DONE = 0xffff };
102
103 /**
104 * Destructor.
105 * @stable ICU 2.0
106 */
107 virtual ~ForwardCharacterIterator();
108
109 /**
110 * Returns true when both iterators refer to the same
111 * character in the same character-storage object.
112 * @param that The ForwardCharacterIterator to be compared for equality
113 * @return true when both iterators refer to the same
114 * character in the same character-storage object
115 * @stable ICU 2.0
116 */
117 virtual bool operator==(const ForwardCharacterIterator& that) const = 0;
118
119 /**
120 * Returns true when the iterators refer to different
121 * text-storage objects, or to different characters in the
122 * same text-storage object.
123 * @param that The ForwardCharacterIterator to be compared for inequality
124 * @return true when the iterators refer to different
125 * text-storage objects, or to different characters in the
126 * same text-storage object
127 * @stable ICU 2.0
128 */
129 inline bool operator!=(const ForwardCharacterIterator& that) const;
130
131 /**
132 * Generates a hash code for this iterator.
133 * @return the hash code.
134 * @stable ICU 2.0
135 */
136 virtual int32_t hashCode(void) const = 0;
137
138 /**
139 * Returns a UClassID for this ForwardCharacterIterator ("poor man's
140 * RTTI").<P> Despite the fact that this function is public,
141 * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
142 * @return a UClassID for this ForwardCharacterIterator
143 * @stable ICU 2.0
144 */
145 virtual UClassID getDynamicClassID(void) const override = 0;
146
147 /**
148 * Gets the current code unit for returning and advances to the next code unit
149 * in the iteration range
150 * (toward endIndex()). If there are
151 * no more code units to return, returns DONE.
152 * @return the current code unit.
153 * @stable ICU 2.0
154 */
155 virtual char16_t nextPostInc(void) = 0;
156
157 /**
158 * Gets the current code point for returning and advances to the next code point
159 * in the iteration range
160 * (toward endIndex()). If there are
161 * no more code points to return, returns DONE.
162 * @return the current code point.
163 * @stable ICU 2.0
164 */
165 virtual UChar32 next32PostInc(void) = 0;
166
167 /**
168 * Returns false if there are no more code units or code points
169 * at or after the current position in the iteration range.
170 * This is used with nextPostInc() or next32PostInc() in forward
171 * iteration.
172 * @returns false if there are no more code units or code points
173 * at or after the current position in the iteration range.
174 * @stable ICU 2.0
175 */
176 virtual UBool hasNext() = 0;
177
178protected:
179 /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
180 ForwardCharacterIterator();
181
182 /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
183 ForwardCharacterIterator(const ForwardCharacterIterator &other);
184
185 /**
186 * Assignment operator to be overridden in the implementing class.
187 * @stable ICU 2.0
188 */
189 ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
190};
191
192/**
193 * Abstract class that defines an API for iteration
194 * on text objects.
195 * This is an interface for forward and backward iteration
196 * and random access into a text object.
197 *
198 * <p>The API provides backward compatibility to the Java and older ICU
199 * CharacterIterator classes but extends them significantly:
200 * <ol>
201 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
202 * <li>While the old API functions provided forward iteration with
203 * "pre-increment" semantics, the new one also provides functions
204 * with "post-increment" semantics. They are more efficient and should
205 * be the preferred iterator functions for new implementations.
206 * The backward iteration always had "pre-decrement" semantics, which
207 * are efficient.</li>
208 * <li>Just like ForwardCharacterIterator, it provides access to
209 * both code units and code points. Code point access versions are available
210 * for the old and the new iteration semantics.</li>
211 * <li>There are new functions for setting and moving the current position
212 * without returning a character, for efficiency.</li>
213 * </ol>
214 *
215 * See ForwardCharacterIterator for examples for using the new forward iteration
216 * functions. For backward iteration, there is also a hasPrevious() function
217 * that can be used analogously to hasNext().
218 * The old functions work as before and are shown below.</p>
219 *
220 * <p>Examples for some of the new functions:</p>
221 *
222 * Forward iteration with hasNext():
223 * \code
224 * void forward1(CharacterIterator &it) {
225 * UChar32 c;
226 * for(it.setToStart(); it.hasNext();) {
227 * c=it.next32PostInc();
228 * // use c
229 * }
230 * }
231 * \endcode
232 * Forward iteration more similar to loops with the old forward iteration,
233 * showing a way to convert simple for() loops:
234 * \code
235 * void forward2(CharacterIterator &it) {
236 * char16_t c;
237 * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
238 * // use c
239 * }
240 * }
241 * \endcode
242 * Backward iteration with setToEnd() and hasPrevious():
243 * \code
244 * void backward1(CharacterIterator &it) {
245 * UChar32 c;
246 * for(it.setToEnd(); it.hasPrevious();) {
247 * c=it.previous32();
248 * // use c
249 * }
250 * }
251 * \endcode
252 * Backward iteration with a more traditional for() loop:
253 * \code
254 * void backward2(CharacterIterator &it) {
255 * char16_t c;
256 * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
257 * // use c
258 * }
259 * }
260 * \endcode
261 *
262 * Example for random access:
263 * \code
264 * void random(CharacterIterator &it) {
265 * // set to the third code point from the beginning
266 * it.move32(3, CharacterIterator::kStart);
267 * // get a code point from here without moving the position
268 * UChar32 c=it.current32();
269 * // get the position
270 * int32_t pos=it.getIndex();
271 * // get the previous code unit
272 * char16_t u=it.previous();
273 * // move back one more code unit
274 * it.move(-1, CharacterIterator::kCurrent);
275 * // set the position back to where it was
276 * // and read the same code point c and move beyond it
277 * it.setIndex(pos);
278 * if(c!=it.next32PostInc()) {
279 * exit(1); // CharacterIterator inconsistent
280 * }
281 * }
282 * \endcode
283 *
284 * <p>Examples, especially for the old API:</p>
285 *
286 * Function processing characters, in this example simple output
287 * <pre>
288 * \code
289 * void processChar( char16_t c )
290 * {
291 * cout << " " << c;
292 * }
293 * \endcode
294 * </pre>
295 * Traverse the text from start to finish
296 * <pre>
297 * \code
298 * void traverseForward(CharacterIterator& iter)
299 * {
300 * for(char16_t c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
301 * processChar(c);
302 * }
303 * }
304 * \endcode
305 * </pre>
306 * Traverse the text backwards, from end to start
307 * <pre>
308 * \code
309 * void traverseBackward(CharacterIterator& iter)
310 * {
311 * for(char16_t c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
312 * processChar(c);
313 * }
314 * }
315 * \endcode
316 * </pre>
317 * Traverse both forward and backward from a given position in the text.
 * The calls to Unicode::isLetter() and Unicode::isDigit() in this example represent some additional stopping criteria.
319 * <pre>
320 * \code
321 * void traverseOut(CharacterIterator& iter, int32_t pos)
322 * {
323 * char16_t c;
324 * for (c = iter.setIndex(pos);
325 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
326 * c = iter.next()) {}
327 * int32_t end = iter.getIndex();
328 * for (c = iter.setIndex(pos);
329 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
330 * c = iter.previous()) {}
331 * int32_t start = iter.getIndex() + 1;
332 *
333 * cout << "start: " << start << " end: " << end << endl;
334 * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
335 * processChar(c);
336 * }
337 * }
338 * \endcode
339 * </pre>
340 * Creating a StringCharacterIterator and calling the test functions
341 * <pre>
342 * \code
343 * void CharacterIterator_Example( void )
344 * {
345 * cout << endl << "===== CharacterIterator_Example: =====" << endl;
346 * UnicodeString text("Ein kleiner Satz.");
347 * StringCharacterIterator iterator(text);
348 * cout << "----- traverseForward: -----------" << endl;
349 * traverseForward( iterator );
350 * cout << endl << endl << "----- traverseBackward: ----------" << endl;
351 * traverseBackward( iterator );
352 * cout << endl << endl << "----- traverseOut: ---------------" << endl;
353 * traverseOut( iterator, 7 );
354 * cout << endl << endl << "-----" << endl;
355 * }
356 * \endcode
357 * </pre>
358 *
359 * @stable ICU 2.0
360 */
361class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
362public:
363 /**
364 * Origin enumeration for the move() and move32() functions.
365 * @stable ICU 2.0
366 */
367 enum EOrigin { kStart, kCurrent, kEnd };
368
369 /**
370 * Destructor.
371 * @stable ICU 2.0
372 */
373 virtual ~CharacterIterator();
374
375 /**
376 * Returns a pointer to a new CharacterIterator of the same
377 * concrete class as this one, and referring to the same
378 * character in the same text-storage object as this one. The
379 * caller is responsible for deleting the new clone.
380 * @return a pointer to a new CharacterIterator
381 * @stable ICU 2.0
382 */
383 virtual CharacterIterator* clone() const = 0;
384
385 /**
386 * Sets the iterator to refer to the first code unit in its
387 * iteration range, and returns that code unit.
388 * This can be used to begin an iteration with next().
389 * @return the first code unit in its iteration range.
390 * @stable ICU 2.0
391 */
392 virtual char16_t first(void) = 0;
393
394 /**
395 * Sets the iterator to refer to the first code unit in its
396 * iteration range, returns that code unit, and moves the position
397 * to the second code unit. This is an alternative to setToStart()
398 * for forward iteration with nextPostInc().
399 * @return the first code unit in its iteration range.
400 * @stable ICU 2.0
401 */
402 virtual char16_t firstPostInc(void);
403
404 /**
405 * Sets the iterator to refer to the first code point in its
406 * iteration range, and returns that code unit,
407 * This can be used to begin an iteration with next32().
408 * Note that an iteration with next32PostInc(), beginning with,
409 * e.g., setToStart() or firstPostInc(), is more efficient.
410 * @return the first code point in its iteration range.
411 * @stable ICU 2.0
412 */
413 virtual UChar32 first32(void) = 0;
414
415 /**
416 * Sets the iterator to refer to the first code point in its
417 * iteration range, returns that code point, and moves the position
418 * to the second code point. This is an alternative to setToStart()
419 * for forward iteration with next32PostInc().
420 * @return the first code point in its iteration range.
421 * @stable ICU 2.0
422 */
423 virtual UChar32 first32PostInc(void);
424
425 /**
426 * Sets the iterator to refer to the first code unit or code point in its
427 * iteration range. This can be used to begin a forward
428 * iteration with nextPostInc() or next32PostInc().
429 * @return the start position of the iteration range
430 * @stable ICU 2.0
431 */
432 inline int32_t setToStart();
433
434 /**
435 * Sets the iterator to refer to the last code unit in its
436 * iteration range, and returns that code unit.
437 * This can be used to begin an iteration with previous().
438 * @return the last code unit.
439 * @stable ICU 2.0
440 */
441 virtual char16_t last(void) = 0;
442
443 /**
444 * Sets the iterator to refer to the last code point in its
445 * iteration range, and returns that code unit.
446 * This can be used to begin an iteration with previous32().
447 * @return the last code point.
448 * @stable ICU 2.0
449 */
450 virtual UChar32 last32(void) = 0;
451
452 /**
453 * Sets the iterator to the end of its iteration range, just behind
454 * the last code unit or code point. This can be used to begin a backward
455 * iteration with previous() or previous32().
456 * @return the end position of the iteration range
457 * @stable ICU 2.0
458 */
459 inline int32_t setToEnd();
460
461 /**
462 * Sets the iterator to refer to the "position"-th code unit
463 * in the text-storage object the iterator refers to, and
464 * returns that code unit.
465 * @param position the "position"-th code unit in the text-storage object
466 * @return the "position"-th code unit.
467 * @stable ICU 2.0
468 */
469 virtual char16_t setIndex(int32_t position) = 0;
470
471 /**
472 * Sets the iterator to refer to the beginning of the code point
473 * that contains the "position"-th code unit
474 * in the text-storage object the iterator refers to, and
475 * returns that code point.
476 * The current position is adjusted to the beginning of the code point
477 * (its first code unit).
478 * @param position the "position"-th code unit in the text-storage object
479 * @return the "position"-th code point.
480 * @stable ICU 2.0
481 */
482 virtual UChar32 setIndex32(int32_t position) = 0;
483
484 /**
485 * Returns the code unit the iterator currently refers to.
486 * @return the current code unit.
487 * @stable ICU 2.0
488 */
489 virtual char16_t current(void) const = 0;
490
491 /**
492 * Returns the code point the iterator currently refers to.
493 * @return the current code point.
494 * @stable ICU 2.0
495 */
496 virtual UChar32 current32(void) const = 0;
497
498 /**
499 * Advances to the next code unit in the iteration range
500 * (toward endIndex()), and returns that code unit. If there are
501 * no more code units to return, returns DONE.
502 * @return the next code unit.
503 * @stable ICU 2.0
504 */
505 virtual char16_t next(void) = 0;
506
507 /**
508 * Advances to the next code point in the iteration range
509 * (toward endIndex()), and returns that code point. If there are
510 * no more code points to return, returns DONE.
511 * Note that iteration with "pre-increment" semantics is less
512 * efficient than iteration with "post-increment" semantics
513 * that is provided by next32PostInc().
514 * @return the next code point.
515 * @stable ICU 2.0
516 */
517 virtual UChar32 next32(void) = 0;
518
519 /**
520 * Advances to the previous code unit in the iteration range
521 * (toward startIndex()), and returns that code unit. If there are
522 * no more code units to return, returns DONE.
523 * @return the previous code unit.
524 * @stable ICU 2.0
525 */
526 virtual char16_t previous(void) = 0;
527
528 /**
529 * Advances to the previous code point in the iteration range
530 * (toward startIndex()), and returns that code point. If there are
531 * no more code points to return, returns DONE.
532 * @return the previous code point.
533 * @stable ICU 2.0
534 */
535 virtual UChar32 previous32(void) = 0;
536
537 /**
538 * Returns false if there are no more code units or code points
539 * before the current position in the iteration range.
540 * This is used with previous() or previous32() in backward
541 * iteration.
542 * @return false if there are no more code units or code points
543 * before the current position in the iteration range, return true otherwise.
544 * @stable ICU 2.0
545 */
546 virtual UBool hasPrevious() = 0;
547
548 /**
549 * Returns the numeric index in the underlying text-storage
550 * object of the character returned by first(). Since it's
551 * possible to create an iterator that iterates across only
552 * part of a text-storage object, this number isn't
553 * necessarily 0.
554 * @returns the numeric index in the underlying text-storage
555 * object of the character returned by first().
556 * @stable ICU 2.0
557 */
558 inline int32_t startIndex(void) const;
559
560 /**
561 * Returns the numeric index in the underlying text-storage
562 * object of the position immediately BEYOND the character
563 * returned by last().
564 * @return the numeric index in the underlying text-storage
565 * object of the position immediately BEYOND the character
566 * returned by last().
567 * @stable ICU 2.0
568 */
569 inline int32_t endIndex(void) const;
570
571 /**
572 * Returns the numeric index in the underlying text-storage
573 * object of the character the iterator currently refers to
574 * (i.e., the character returned by current()).
575 * @return the numeric index in the text-storage object of
576 * the character the iterator currently refers to
577 * @stable ICU 2.0
578 */
579 inline int32_t getIndex(void) const;
580
581 /**
582 * Returns the length of the entire text in the underlying
583 * text-storage object.
584 * @return the length of the entire text in the text-storage object
585 * @stable ICU 2.0
586 */
587 inline int32_t getLength() const;
588
589 /**
590 * Moves the current position relative to the start or end of the
591 * iteration range, or relative to the current position itself.
592 * The movement is expressed in numbers of code units forward
593 * or backward by specifying a positive or negative delta.
594 * @param delta the position relative to origin. A positive delta means forward;
595 * a negative delta means backward.
596 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
597 * @return the new position
598 * @stable ICU 2.0
599 */
600 virtual int32_t move(int32_t delta, EOrigin origin) = 0;
601
602 /**
603 * Moves the current position relative to the start or end of the
604 * iteration range, or relative to the current position itself.
605 * The movement is expressed in numbers of code points forward
606 * or backward by specifying a positive or negative delta.
607 * @param delta the position relative to origin. A positive delta means forward;
608 * a negative delta means backward.
609 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
610 * @return the new position
611 * @stable ICU 2.0
612 */
613#ifdef move32
614 // One of the system headers right now is sometimes defining a conflicting macro we don't use
615#undef move32
616#endif
617 virtual int32_t move32(int32_t delta, EOrigin origin) = 0;
618
619 /**
620 * Copies the text under iteration into the UnicodeString
621 * referred to by "result".
622 * @param result Receives a copy of the text under iteration.
623 * @stable ICU 2.0
624 */
625 virtual void getText(UnicodeString& result) = 0;
626
627protected:
628 /**
629 * Empty constructor.
630 * @stable ICU 2.0
631 */
632 CharacterIterator();
633
634 /**
635 * Constructor, just setting the length field in this base class.
636 * @stable ICU 2.0
637 */
638 CharacterIterator(int32_t length);
639
640 /**
641 * Constructor, just setting the length and position fields in this base class.
642 * @stable ICU 2.0
643 */
644 CharacterIterator(int32_t length, int32_t position);
645
646 /**
647 * Constructor, just setting the length, start, end, and position fields in this base class.
648 * @stable ICU 2.0
649 */
650 CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
651
652 /**
653 * Copy constructor.
654 *
655 * @param that The CharacterIterator to be copied
656 * @stable ICU 2.0
657 */
658 CharacterIterator(const CharacterIterator &that);
659
660 /**
661 * Assignment operator. Sets this CharacterIterator to have the same behavior,
662 * as the one passed in.
663 * @param that The CharacterIterator passed in.
664 * @return the newly set CharacterIterator.
665 * @stable ICU 2.0
666 */
667 CharacterIterator &operator=(const CharacterIterator &that);
668
669 /**
670 * Base class text length field.
671 * Necessary this for correct getText() and hashCode().
672 * @stable ICU 2.0
673 */
674 int32_t textLength;
675
676 /**
677 * Base class field for the current position.
678 * @stable ICU 2.0
679 */
680 int32_t pos;
681
682 /**
683 * Base class field for the start of the iteration range.
684 * @stable ICU 2.0
685 */
686 int32_t begin;
687
688 /**
689 * Base class field for the end of the iteration range.
690 * @stable ICU 2.0
691 */
692 int32_t end;
693};
694
695inline bool
696ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
697 return !operator==(that);
698}
699
700inline int32_t
701CharacterIterator::setToStart() {
702 return move(0, kStart);
703}
704
705inline int32_t
706CharacterIterator::setToEnd() {
707 return move(0, kEnd);
708}
709
710inline int32_t
711CharacterIterator::startIndex(void) const {
712 return begin;
713}
714
715inline int32_t
716CharacterIterator::endIndex(void) const {
717 return end;
718}
719
720inline int32_t
721CharacterIterator::getIndex(void) const {
722 return pos;
723}
724
725inline int32_t
726CharacterIterator::getLength(void) const {
727 return textLength;
728}
729
730U_NAMESPACE_END
731
732#endif /* U_SHOW_CPLUSPLUS_API */
733
734#endif
735