utf8collationiterator.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/utf8collationiterator.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	* Copyright (C) 2012-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	*******************************************************************************
8	* utf8collationiterator.h
9	*
10	* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
11	* created by: Markus W. Scherer
12	*/
13
14	#ifndef __UTF8COLLATIONITERATOR_H__
15	#define __UTF8COLLATIONITERATOR_H__
16
17	#include "unicode/utypes.h"
18
19	#if !UCONFIG_NO_COLLATION
20
21	#include "cmemory.h"
22	#include "collation.h"
23	#include "collationdata.h"
24	#include "collationiterator.h"
25	#include "normalizer2impl.h"
26
27	U_NAMESPACE_BEGIN
28
29	/**
30	* UTF-8 collation element and character iterator.
31	* Handles normalized UTF-8 text inline, with length or NUL-terminated.
32	* Unnormalized text is handled by a subclass.
33	*/
34	class U_I18N_API UTF8CollationIterator : public CollationIterator {
35	public:
	/**
	* Stores the text pointer, the initial index and the length, and forwards
	* d and numeric to the CollationIterator base class.
	* The constructor body is empty: the text is neither copied nor inspected here.
	* NOTE(review): since only the pointer is kept, the text presumably has to
	* outlive the iterator - confirm against callers.
	*
	* @param d       collation data, passed on to CollationIterator
	* @param numeric passed on to CollationIterator
	* @param s       UTF-8 text, stored in u8
	* @param p       initial value of pos
	* @param len     stored in length; <0 for NUL-terminated strings
	*/
36	UTF8CollationIterator(const CollationData *d, UBool numeric,
37	const uint8_t *s, int32_t p, int32_t len)
38	: CollationIterator (d, numeric),
39	u8(s), pos(p), length(len) {}
40
	/** Virtual destructor; only declared here, the definition is not in this header. */
41	virtual ~UTF8CollationIterator();
42
	/**
	* Only declared here.
	* NOTE(review): presumably repositions the iterator at index newOffset
	* of the text - confirm in the implementation.
	*/
43	virtual void resetToOffset(int32_t newOffset);
44
	/**
	* Only declared here.
	* NOTE(review): presumably reports the current text index (pos) - confirm.
	*/
45	virtual int32_t getOffset() const;
46
	/**
	* Only declared here.
	* NOTE(review): presumably returns the code point at the current index and
	* moves past it; the end-of-text return value is not visible from this header.
	* @param errorCode in/out error code, taken by reference
	*/
47	virtual UChar32 nextCodePoint(UErrorCode &errorCode);
48
	/**
	* Only declared here.
	* NOTE(review): presumably returns the code point before the current index and
	* moves back over it; the start-of-text return value is not visible from this header.
	* @param errorCode in/out error code, taken by reference
	*/
49	virtual UChar32 previousCodePoint(UErrorCode &errorCode);
50
51	protected:
52	/**
53	* For byte sequences that are illegal in UTF-8, an error value may be returned
54	* together with a bogus code point. The caller will ignore that code point.
55	*
56	* Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57	* but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
58	*
59	* Valid lead surrogates are returned from inside a normalized text segment,
60	* where handleGetTrailSurrogate() will return the matching trail surrogate.
61	*/
62	virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
63
	/**
	* Only declared here.
	* NOTE(review): presumably invoked when a NUL byte is read while length<0
	* (NUL-terminated input), and the return value tells the caller whether that
	* NUL ends the text - confirm in the implementation.
	*/
64	virtual UBool foundNULTerminator();
65
	/**
	* Only declared here. According to the handleNextCE32() comment above, this
	* returns TRUE, which makes the caller treat surrogate code points like U+FFFD.
	*/
66	virtual UBool forbidSurrogateCodePoints() const;
67
	/**
	* Only declared here.
	* NOTE(review): presumably moves the current index forward by num code points - confirm.
	* @param num       number of code points
	* @param errorCode in/out error code, taken by reference
	*/
68	virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
69
	/**
	* Only declared here.
	* NOTE(review): presumably moves the current index backward by num code points - confirm.
	* @param num       number of code points
	* @param errorCode in/out error code, taken by reference
	*/
70	virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
71
72	const uint8_t *u8; // the UTF-8 text pointer given to the constructor (not copied there)
73	int32_t pos; // index into the text; starts at the constructor argument p
74	int32_t length; // <0 for NUL-terminated strings
75	};
76
77	/**
78	* Incrementally checks the input text for FCD and normalizes where necessary.
79	*/
80	class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
81	public:
	/**
	* Forwards all arguments to UTF8CollationIterator, then starts in state
	* CHECK_FWD with start set to p and nfcImpl bound to data->nfcImpl.
	* limit is deliberately left uninitialized: in CHECK_FWD it is documented
	* as undefined (see State). normalized is default-constructed.
	* data is dereferenced here, so it must not be NULL.
	*
	* @param data    collation data; also the source of the Normalizer2Impl reference
	* @param numeric passed on to UTF8CollationIterator
	* @param s       UTF-8 text, passed on to UTF8CollationIterator
	* @param p       initial text index; also the initial value of start
	* @param len     text length, passed on to UTF8CollationIterator
	*/
82	FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
83	const uint8_t *s, int32_t p, int32_t len)
84	: UTF8CollationIterator (data, numeric, s, p, len),
85	state(CHECK_FWD), start(p),
86	nfcImpl(data->nfcImpl) {}
87
	/** Virtual destructor; only declared here, the definition is not in this header. */
88	virtual ~FCDUTF8CollationIterator();
89
	/**
	* Redeclares UTF8CollationIterator::resetToOffset() with the same signature.
	* NOTE(review): presumably also resets the FCD-checking state (state/start/limit) - confirm.
	*/
90	virtual void resetToOffset(int32_t newOffset);
91
	/**
	* Redeclares UTF8CollationIterator::getOffset() with the same signature.
	* NOTE(review): in state IN_NORMALIZED pos indexes the normalized string (see State),
	* so the result is presumably mapped back to an index in the input text - confirm.
	*/
92	virtual int32_t getOffset() const;
93
	/**
	* Redeclares UTF8CollationIterator::nextCodePoint() with the same signature.
	* NOTE(review): presumably applies the incremental FCD check while moving forward - confirm.
	*/
94	virtual UChar32 nextCodePoint(UErrorCode &errorCode);
95
	/**
	* Redeclares UTF8CollationIterator::previousCodePoint() with the same signature.
	* NOTE(review): presumably applies the incremental FCD check while moving backward - confirm.
	*/
96	virtual UChar32 previousCodePoint(UErrorCode &errorCode);
97
98	protected:
	/**
	* Redeclares UTF8CollationIterator::handleNextCE32() with the same signature.
	* The base-class comment notes that valid lead surrogates are returned from
	* inside a normalized text segment; handleGetTrailSurrogate() then supplies
	* the matching trail surrogate.
	*/
99	virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
100
	/**
	* Per the UTF8CollationIterator::handleNextCE32() comment, returns the trail
	* surrogate matching a lead surrogate that was returned from inside a
	* normalized text segment. Only declared here.
	*/
101	virtual UChar handleGetTrailSurrogate();
102
	/** Redeclares UTF8CollationIterator::foundNULTerminator(); only declared here. */
103	virtual UBool foundNULTerminator();
104
	/** Redeclares UTF8CollationIterator::forwardNumCodePoints(); only declared here. */
105	virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
106
	/** Redeclares UTF8CollationIterator::backwardNumCodePoints(); only declared here. */
107	virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
108
109	private:
	// NOTE(review): "lccc"/"tccc" presumably stand for lead/trail canonical combining
	// class, i.e. these test the text at/before pos as part of the FCD check - confirm.
110	UBool nextHasLccc() const;
111	UBool previousHasTccc() const;
112
113	/**
114	* Switches to forward checking if possible.
115	*/
116	void switchToForward();
117
118	/**
119	* Extends the FCD text segment forward or normalizes around pos.
120	* @return TRUE if success
121	*/
122	UBool nextSegment(UErrorCode &errorCode);
123
124	/**
125	* Switches to backward checking.
126	*/
127	void switchToBackward();
128
129	/**
130	* Extends the FCD text segment backward or normalizes around pos.
131	* @return TRUE if success
132	*/
133	UBool previousSegment(UErrorCode &errorCode);
134
	/**
	* Only declared here.
	* NOTE(review): presumably normalizes s into the member string "normalized"
	* and returns TRUE on success, like nextSegment()/previousSegment() - confirm.
	*/
135	UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
136
	/**
	* Checking mode of the iterator. It determines which of start/limit are
	* meaningful and what pos refers to. The constructor starts in CHECK_FWD.
	* In the comments below, "[a..b[" denotes the range from a up to but excluding b.
	*/
137	enum State {
138	/**
139	* The input text [start..pos[ passes the FCD check.
140	* Moving forward checks incrementally.
141	* limit is undefined.
142	*/
143	CHECK_FWD,
144	/**
145	* The input text [pos..limit[ passes the FCD check.
146	* Moving backward checks incrementally.
147	* start is undefined.
148	*/
149	CHECK_BWD,
150	/**
151	* The input text [start..limit[ passes the FCD check.
152	* pos tracks the current text index.
153	*/
154	IN_FCD_SEGMENT,
155	/**
156	* The input text [start..limit[ failed the FCD check and was normalized.
157	* pos tracks the current index in the normalized string.
158	*/
159	IN_NORMALIZED
160	};
161
	// Current checking mode; see State.
162	State state;
163
	// Boundaries of the current text segment, as indexes into the input text.
	// Which of the two is defined depends on state; see State.
164	int32_t start;
165	int32_t limit;
166
	// Bound to data->nfcImpl in the constructor.
167	const Normalizer2Impl &nfcImpl;
	// NOTE(review): presumably holds the normalized form of [start..limit[ while
	// state is IN_NORMALIZED (pos then indexes this string, per State) - confirm.
168	UnicodeString normalized;
169	};
170
171	U_NAMESPACE_END
172
173	#endif // !UCONFIG_NO_COLLATION
174	#endif // __UTF8COLLATIONITERATOR_H__
175

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/utf8collationiterator.h