1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* utf8collationiterator.h
9*
10* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
11* created by: Markus W. Scherer
12*/
13
14#ifndef __UTF8COLLATIONITERATOR_H__
15#define __UTF8COLLATIONITERATOR_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "cmemory.h"
22#include "collation.h"
23#include "collationdata.h"
24#include "collationiterator.h"
25#include "normalizer2impl.h"
26
27U_NAMESPACE_BEGIN
28
29/**
30 * UTF-8 collation element and character iterator.
31 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
32 * Unnormalized text is handled by a subclass.
33 */
34class U_I18N_API UTF8CollationIterator : public CollationIterator {
35public:
36 UTF8CollationIterator(const CollationData *d, UBool numeric,
37 const uint8_t *s, int32_t p, int32_t len)
38 : CollationIterator(d, numeric),
39 u8(s), pos(p), length(len) {}
40
41 virtual ~UTF8CollationIterator();
42
43 virtual void resetToOffset(int32_t newOffset);
44
45 virtual int32_t getOffset() const;
46
47 virtual UChar32 nextCodePoint(UErrorCode &errorCode);
48
49 virtual UChar32 previousCodePoint(UErrorCode &errorCode);
50
51protected:
52 /**
53 * For byte sequences that are illegal in UTF-8, an error value may be returned
54 * together with a bogus code point. The caller will ignore that code point.
55 *
56 * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
58 *
59 * Valid lead surrogates are returned from inside a normalized text segment,
60 * where handleGetTrailSurrogate() will return the matching trail surrogate.
61 */
62 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
63
64 virtual UBool foundNULTerminator();
65
66 virtual UBool forbidSurrogateCodePoints() const;
67
68 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
69
70 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
71
72 const uint8_t *u8;
73 int32_t pos;
74 int32_t length; // <0 for NUL-terminated strings
75};
76
77/**
78 * Incrementally checks the input text for FCD and normalizes where necessary.
79 */
80class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
81public:
82 FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
83 const uint8_t *s, int32_t p, int32_t len)
84 : UTF8CollationIterator(data, numeric, s, p, len),
85 state(CHECK_FWD), start(p),
86 nfcImpl(data->nfcImpl) {}
87
88 virtual ~FCDUTF8CollationIterator();
89
90 virtual void resetToOffset(int32_t newOffset);
91
92 virtual int32_t getOffset() const;
93
94 virtual UChar32 nextCodePoint(UErrorCode &errorCode);
95
96 virtual UChar32 previousCodePoint(UErrorCode &errorCode);
97
98protected:
99 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
100
101 virtual UChar handleGetTrailSurrogate();
102
103 virtual UBool foundNULTerminator();
104
105 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
106
107 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
108
109private:
110 UBool nextHasLccc() const;
111 UBool previousHasTccc() const;
112
113 /**
114 * Switches to forward checking if possible.
115 */
116 void switchToForward();
117
118 /**
119 * Extends the FCD text segment forward or normalizes around pos.
120 * @return TRUE if success
121 */
122 UBool nextSegment(UErrorCode &errorCode);
123
124 /**
125 * Switches to backward checking.
126 */
127 void switchToBackward();
128
129 /**
130 * Extends the FCD text segment backward or normalizes around pos.
131 * @return TRUE if success
132 */
133 UBool previousSegment(UErrorCode &errorCode);
134
135 UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
136
137 enum State {
138 /**
139 * The input text [start..pos[ passes the FCD check.
140 * Moving forward checks incrementally.
141 * limit is undefined.
142 */
143 CHECK_FWD,
144 /**
145 * The input text [pos..limit[ passes the FCD check.
146 * Moving backward checks incrementally.
147 * start is undefined.
148 */
149 CHECK_BWD,
150 /**
151 * The input text [start..limit[ passes the FCD check.
152 * pos tracks the current text index.
153 */
154 IN_FCD_SEGMENT,
155 /**
156 * The input text [start..limit[ failed the FCD check and was normalized.
157 * pos tracks the current index in the normalized string.
158 */
159 IN_NORMALIZED
160 };
161
162 State state;
163
164 int32_t start;
165 int32_t limit;
166
167 const Normalizer2Impl &nfcImpl;
168 UnicodeString normalized;
169};
170
171U_NAMESPACE_END
172
173#endif // !UCONFIG_NO_COLLATION
174#endif // __UTF8COLLATIONITERATOR_H__
175