usrchimp.h source code [engine/third_party/icu/source/i18n/usrchimp.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2001-2015 IBM and others. All rights reserved.
6	**********************************************************************
7	* Date Name Description
8	* 08/13/2001 synwee Creation.
9	**********************************************************************
10	*/
11	#ifndef USRCHIMP_H
12	#define USRCHIMP_H
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_COLLATION
17
18	#include "unicode/normalizer2.h"
19	#include "unicode/ucol.h"
20	#include "unicode/ucoleitr.h"
21	#include "unicode/ubrk.h"
22
/*
 * Bit layout helpers for a 32-bit collation element (CE), as defined by the
 * masks below: bits 31..16 hold the primary weight, bits 15..8 the secondary
 * weight and bits 7..0 the tertiary weight.
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

#define UCOL_IGNORABLE 0

/* get weights from a CE */
/* The primary weight is shifted first and masked afterwards, so the result is
 * confined to 16 bits regardless of how the shift treats the top bit. */
#define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

#define UCOL_CONTINUATION_MARKER 0xC0

/* Non-zero only when every bit of UCOL_CONTINUATION_MARKER is set in CE. */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
50
51	U_NAMESPACE_BEGIN
52
53	class CollationElementIterator;
54	class Collator;
55
/**
 * One entry of a PCEBuffer: a 64-bit collation element plus two 32-bit
 * indices.
 * NOTE(review): low/high presumably hold the ixLow/ixHigh values handed to
 * PCEBuffer::put() -- confirm against the implementation file.
 */
struct PCEI
{
    uint64_t ce;
    int32_t  low;
    int32_t  high;
};
62
63	struct PCEBuffer
64	{
65	PCEI defaultBuffer[`16`];
66	PCEI *buffer;
67	int32_t bufferIndex;
68	int32_t bufferSize;
69
70	PCEBuffer();
71	~PCEBuffer();
72
73	void reset();
74	UBool isEmpty() const;
75	void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
76	const PCEI *get();
77	};
78
79	class UCollationPCE : public UMemory {
80	private:
81	PCEBuffer pceBuffer;
82	CollationElementIterator *cei;
83	UCollationStrength strength;
84	UBool toShift;
85	UBool isShifted;
86	uint32_t variableTop;
87
88	public:
89	UCollationPCE(UCollationElements *elems);
90	UCollationPCE(CollationElementIterator *iter);
91	~UCollationPCE();
92
93	void init(UCollationElements *elems);
94	void init(CollationElementIterator *iter);
95
96	/**
97	* Get the processed ordering priority of the next collation element in the text.
98	* A single character may contain more than one collation element.
99	*
100	* @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
101	* @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
102	* @param status A pointer to an UErrorCode to receive any errors.
103	* @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
104	* if an error has occured or if the end of string has been reached
105	*/
106	int64_t nextProcessed(int32_t ixLow, int32_t ixHigh, UErrorCode *status);
107	/**
108	* Get the processed ordering priority of the previous collation element in the text.
109	* A single character may contain more than one collation element.
110	*
111	* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
112	* @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
113	* @param status A pointer to an UErrorCode to receive any errors. Noteably
114	* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
115	* buffer has been exhausted.
116	* @return The previous collation elements ordering, otherwise returns
117	* UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
118	* string has been reached.
119	*/
120	int64_t previousProcessed(int32_t ixLow, int32_t ixHigh, UErrorCode *status);
121
122	private:
123	void init(const Collator &coll);
124	uint64_t processCE(uint32_t ce);
125	};
126
127	U_NAMESPACE_END
128
/* Element count of the inline ces/pces buffers in UPattern and of the
 * canonical prefix/suffix accent buffers in UStringSearch. */
#define INITIAL_ARRAY_SIZE_ 256
/* Element count of the shift and backShift tables in UPattern. */
#define MAX_TABLE_SIZE_ 257
131
132	struct USearch {
133	// required since collation element iterator does not have a getText API
134	const UChar *text;
135	int32_t textLength; // exact length
136	UBool isOverlap;
137	UBool isCanonicalMatch;
138	int16_t elementComparisonType;
139	UBreakIterator internalBreakIter; //internal character breakiterator*
140	UBreakIterator *breakIter;
141	// value USEARCH_DONE is the default value
142	// if we are not at the start of the text or the end of the text,
143	// depending on the iteration direction and matchedIndex is USEARCH_DONE
144	// it means that we can't find any more matches in that particular direction
145	int32_t matchedIndex;
146	int32_t matchedLength;
147	UBool isForwardSearching;
148	UBool reset;
149	};
150
151	struct UPattern {
152	const UChar *text;
153	int32_t textLength; // exact length
154	// length required for backwards ce comparison
155	int32_t cesLength;
156	int32_t *ces;
157	int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
158	int32_t pcesLength;
159	int64_t *pces;
160	int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
161	UBool hasPrefixAccents;
162	UBool hasSuffixAccents;
163	int16_t defaultShiftSize;
164	int16_t shift[MAX_TABLE_SIZE_];
165	int16_t backShift[MAX_TABLE_SIZE_];
166	};
167
168	struct UStringSearch {
169	struct USearch *search;
170	struct UPattern pattern;
171	const UCollator *collator;
172	const icu::Normalizer2 *nfd;
173	// positions within the collation element iterator is used to determine
174	// if we are at the start of the text.
175	UCollationElements *textIter;
176	icu::UCollationPCE *textProcessedIter;
177	// utility collation element, used throughout program for temporary
178	// iteration.
179	UCollationElements *utilIter;
180	UBool ownCollator;
181	UCollationStrength strength;
182	uint32_t ceMask;
183	uint32_t variableTop;
184	UBool toShift;
185	UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
186	UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
187	};
188
189	/**
190	* Exact matches without checking for the ends for extra accents.
191	* The match after the position within the collation element iterator is to be
192	* found.
193	* After a match is found the offset in the collation element iterator will be
194	* shifted to the start of the match.
195	* Implementation note:
196	* For tertiary we can't use the collator->tertiaryMask, that is a
197	* preprocessed mask that takes into account case options. since we are only
198	* concerned with exact matches, we don't need that.
199	* Alternate handling - since only the 16 most significant digits is only used,
200	* we can safely do a compare without masking if the ce is a variable, we mask
201	* and get only the primary values no shifting to quartenary is required since
202	* all primary values less than variabletop will need to be masked off anyway.
203	* If the end character is composite and the pattern ce does not match the text
204	* ce, we skip it until we find a match in the end composite character or when
205	* it has passed the character. This is so that we can match pattern "a" with
206	* the text "\u00e6"
207	* @param strsrch string search data
208	* @param status error status if any
209	* @return TRUE if an exact match is found, FALSE otherwise
210	*/
211	U_CFUNC
212	UBool usearch_handleNextExact(UStringSearch strsrch, UErrorCode status);
213
214	/**
215	* Canonical matches.
216	* According to the definition, matches found here will include the whole span
217	* of beginning and ending accents if it overlaps that region.
218	* @param strsrch string search data
219	* @param status error status if any
220	* @return TRUE if a canonical match is found, FALSE otherwise
221	*/
222	U_CFUNC
223	UBool usearch_handleNextCanonical(UStringSearch strsrch, UErrorCode status);
224
225	/**
226	* Gets the previous match.
227	* Comments follows from handleNextExact
228	* @param strsrch string search data
229	* @param status error status if any
230	* @return True if a exact math is found, FALSE otherwise.
231	*/
232	U_CFUNC
233	UBool usearch_handlePreviousExact(UStringSearch strsrch, UErrorCode status);
234
235	/**
236	* Canonical matches.
237	* According to the definition, matches found here will include the whole span
238	* of beginning and ending accents if it overlaps that region.
239	* @param strsrch string search data
240	* @param status error status if any
241	* @return TRUE if a canonical match is found, FALSE otherwise
242	*/
243	U_CFUNC
244	UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
245	UErrorCode *status);
246
247	#endif /* #if !UCONFIG_NO_COLLATION */
248
249	#endif
250

Browse the source code of engine/third_party/icu/source/i18n/usrchimp.h