usearch.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/usearch.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2001-2015 IBM and others. All rights reserved.
6	**********************************************************************
7	* Date Name Description
8	* 07/02/2001 synwee Creation.
9	**********************************************************************
10	*/
11
12	#include "unicode/utypes.h"
13
14	#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15
16	#include "unicode/usearch.h"
17	#include "unicode/ustring.h"
18	#include "unicode/uchar.h"
19	#include "unicode/utf16.h"
20	#include "normalizer2impl.h"
21	#include "usrchimp.h"
22	#include "cmemory.h"
23	#include "ucln_in.h"
24	#include "uassert.h"
25	#include "ustr_imp.h"
26
27	U_NAMESPACE_USE
28
29	// don't use Boyer-Moore
30	// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
31	#define BOYER_MOORE 0
32
33	// internal definition ---------------------------------------------------
34
35	#define LAST_BYTE_MASK_ 0xFF
36	#define SECOND_LAST_BYTE_SHIFT_ 8
37	#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
38
39	static const Normalizer2Impl *g_nfcImpl = NULL;
40
41	// internal methods -------------------------------------------------
42
43	/**
44	* Fast collation element iterator setOffset.
45	* This function does not check for bounds.
46	* @param coleiter collation element iterator
47	* @param offset to set
48	*/
49	static
50	inline void setColEIterOffset(UCollationElements *elems,
51	int32_t offset)
52	{
53	// Note: Not "fast" any more after the 2013 collation rewrite.
54	// We do not want to expose more internals than necessary.
55	UErrorCode status = U_ZERO_ERROR;
56	ucol_setOffset(elems, offset, &status);
57	}
58
59	/**
60	* Getting the mask for collation strength
61	* @param strength collation strength
62	* @return collation element mask
63	*/
64	static
65	inline uint32_t getMask(UCollationStrength strength)
66	{
67	switch (strength)
68	{
69	case UCOL_PRIMARY:
70	return UCOL_PRIMARYORDERMASK;
71	case UCOL_SECONDARY:
72	return UCOL_SECONDARYORDERMASK \| UCOL_PRIMARYORDERMASK;
73	default:
74	return UCOL_TERTIARYORDERMASK \| UCOL_SECONDARYORDERMASK \|
75	UCOL_PRIMARYORDERMASK;
76	}
77	}
78
79	/**
80	* @param ce 32-bit collation element
81	* @return hash code
82	*/
83	static
84	inline int hashFromCE32(uint32_t ce)
85	{
86	int hc = (int)(
87	((((((ce >> `24`) * `37`) +
88	(ce >> `16`)) * `37`) +
89	(ce >> `8`)) * `37`) +
90	ce);
91	hc %= MAX_TABLE_SIZE_;
92	if (hc < `0`) {
93	hc += MAX_TABLE_SIZE_;
94	}
95	return hc;
96	}
97
98	U_CDECL_BEGIN
99	static UBool U_CALLCONV
100	usearch_cleanup(void) {
101	g_nfcImpl = NULL;
102	return TRUE;
103	}
104	U_CDECL_END
105
106	/**
107	* Initializing the fcd tables.
108	* Internal method, status assumed to be a success.
109	* @param status output error if any, caller to check status before calling
110	* method, status assumed to be success when passed in.
111	*/
112	static
113	inline void initializeFCD(UErrorCode *status)
114	{
115	if (g_nfcImpl == NULL) {
116	g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
117	ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
118	}
119	}
120
121	/**
122	* Gets the fcd value for a character at the argument index.
123	* This method takes into accounts of the supplementary characters.
124	* @param str UTF16 string where character for fcd retrieval resides
125	* @param offset position of the character whose fcd is to be retrieved, to be
126	* overwritten with the next character position, taking
127	* surrogate characters into consideration.
128	* @param strlength length of the argument string
129	* @return fcd value
130	*/
131	static
132	uint16_t getFCD(const UChar str, int32_t offset,
133	int32_t strlength)
134	{
135	const UChar temp = str + offset;
136	uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
137	*offset = (int32_t)(temp - str);
138	return result;
139	}
140
141	/**
142	* Getting the modified collation elements taking into account the collation
143	* attributes
144	* @param strsrch string search data
145	* @param sourcece
146	* @return the modified collation element
147	*/
148	static
149	inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
150	{
151	// note for tertiary we can't use the collator->tertiaryMask, that
152	// is a preprocessed mask that takes into account case options. since
153	// we are only concerned with exact matches, we don't need that.
154	sourcece &= strsrch->ceMask;
155
156	if (strsrch->toShift) {
157	// alternate handling here, since only the 16 most significant digits
158	// is only used, we can safely do a compare without masking
159	// if the ce is a variable, we mask and get only the primary values
160	// no shifting to quartenary is required since all primary values
161	// less than variabletop will need to be masked off anyway.
162	if (strsrch->variableTop > sourcece) {
163	if (strsrch->strength >= UCOL_QUATERNARY) {
164	sourcece &= UCOL_PRIMARYORDERMASK;
165	}
166	else {
167	sourcece = UCOL_IGNORABLE;
168	}
169	}
170	} else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
171	sourcece = `0xFFFF`;
172	}
173
174	return sourcece;
175	}
176
177	/**
178	* Allocate a memory and returns NULL if it failed.
179	* Internal method, status assumed to be a success.
180	* @param size to allocate
181	* @param status output error if any, caller to check status before calling
182	* method, status assumed to be success when passed in.
183	* @return newly allocated array, NULL otherwise
184	*/
185	static
186	inline void * allocateMemory(uint32_t size, UErrorCode *status)
187	{
188	uint32_t result = (uint32_t )uprv_malloc(size);
189	if (result == NULL) {
190	*status = U_MEMORY_ALLOCATION_ERROR;
191	}
192	return result;
193	}
194
195	/**
196	* Adds a uint32_t value to a destination array.
197	* Creates a new array if we run out of space. The caller will have to
198	* manually deallocate the newly allocated array.
199	* Internal method, status assumed to be success, caller has to check status
200	* before calling this method. destination not to be NULL and has at least
201	* size destinationlength.
202	* @param destination target array
203	* @param offset destination offset to add value
204	* @param destinationlength target array size, return value for the new size
205	* @param value to be added
206	* @param increments incremental size expected
207	* @param status output error if any, caller to check status before calling
208	* method, status assumed to be success when passed in.
209	* @return new destination array, destination if there was no new allocation
210	*/
211	static
212	inline int32_t * addTouint32_tArray(int32_t *destination,
213	uint32_t offset,
214	uint32_t *destinationlength,
215	uint32_t value,
216	uint32_t increments,
217	UErrorCode *status)
218	{
219	uint32_t newlength = *destinationlength;
220	if (offset + `1` == newlength) {
221	newlength += increments;
222	int32_t temp = (int32_t )allocateMemory(
223	sizeof(int32_t) * newlength, status);
224	if (U_FAILURE(*status)) {
225	return NULL;
226	}
227	uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
228	*destinationlength = newlength;
229	destination = temp;
230	}
231	destination[offset] = value;
232	return destination;
233	}
234
235	/**
236	* Adds a uint64_t value to a destination array.
237	* Creates a new array if we run out of space. The caller will have to
238	* manually deallocate the newly allocated array.
239	* Internal method, status assumed to be success, caller has to check status
240	* before calling this method. destination not to be NULL and has at least
241	* size destinationlength.
242	* @param destination target array
243	* @param offset destination offset to add value
244	* @param destinationlength target array size, return value for the new size
245	* @param value to be added
246	* @param increments incremental size expected
247	* @param status output error if any, caller to check status before calling
248	* method, status assumed to be success when passed in.
249	* @return new destination array, destination if there was no new allocation
250	*/
251	static
252	inline int64_t * addTouint64_tArray(int64_t *destination,
253	uint32_t offset,
254	uint32_t *destinationlength,
255	uint64_t value,
256	uint32_t increments,
257	UErrorCode *status)
258	{
259	uint32_t newlength = *destinationlength;
260	if (offset + `1` == newlength) {
261	newlength += increments;
262	int64_t temp = (int64_t )allocateMemory(
263	sizeof(int64_t) * newlength, status);
264
265	if (U_FAILURE(*status)) {
266	return NULL;
267	}
268
269	uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
270	*destinationlength = newlength;
271	destination = temp;
272	}
273
274	destination[offset] = value;
275
276	return destination;
277	}
278
279	/**
280	* Initializing the ce table for a pattern.
281	* Stores non-ignorable collation keys.
282	* Table size will be estimated by the size of the pattern text. Table
283	* expansion will be perform as we go along. Adding 1 to ensure that the table
284	* size definitely increases.
285	* Internal method, status assumed to be a success.
286	* @param strsrch string search data
287	* @param status output error if any, caller to check status before calling
288	* method, status assumed to be success when passed in.
289	* @return total number of expansions
290	*/
291	static
292	inline uint16_t initializePatternCETable(UStringSearch *strsrch,
293	UErrorCode *status)
294	{
295	UPattern *pattern = &(strsrch->pattern);
296	uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
297	int32_t *cetable = pattern->cesBuffer;
298	uint32_t patternlength = pattern->textLength;
299	UCollationElements *coleiter = strsrch->utilIter;
300
301	if (coleiter == NULL) {
302	coleiter = ucol_openElements(strsrch->collator, pattern->text,
303	patternlength, status);
304	// status will be checked in ucol_next(..) later and if it is an
305	// error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
306	// returned.
307	strsrch->utilIter = coleiter;
308	}
309	else {
310	ucol_setText(coleiter, pattern->text, pattern->textLength, status);
311	}
312	if(U_FAILURE(*status)) {
313	return `0`;
314	}
315
316	if (pattern->ces != cetable && pattern->ces) {
317	uprv_free(pattern->ces);
318	}
319
320	uint32_t offset = `0`;
321	uint16_t result = `0`;
322	int32_t ce;
323
324	while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
325	U_SUCCESS(*status)) {
326	uint32_t newce = getCE(strsrch, ce);
327	if (newce) {
328	int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
329	newce,
330	patternlength - ucol_getOffset(coleiter) + `1`,
331	status);
332	if (U_FAILURE(*status)) {
333	return `0`;
334	}
335	offset ++;
336	if (cetable != temp && cetable != pattern->cesBuffer) {
337	uprv_free(cetable);
338	}
339	cetable = temp;
340	}
341	result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - `1`);
342	}
343
344	cetable[offset] = `0`;
345	pattern->ces = cetable;
346	pattern->cesLength = offset;
347
348	return result;
349	}
350
351	/**
352	* Initializing the pce table for a pattern.
353	* Stores non-ignorable collation keys.
354	* Table size will be estimated by the size of the pattern text. Table
355	* expansion will be perform as we go along. Adding 1 to ensure that the table
356	* size definitely increases.
357	* Internal method, status assumed to be a success.
358	* @param strsrch string search data
359	* @param status output error if any, caller to check status before calling
360	* method, status assumed to be success when passed in.
361	* @return total number of expansions
362	*/
363	static
364	inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
365	UErrorCode *status)
366	{
367	UPattern *pattern = &(strsrch->pattern);
368	uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
369	int64_t *pcetable = pattern->pcesBuffer;
370	uint32_t patternlength = pattern->textLength;
371	UCollationElements *coleiter = strsrch->utilIter;
372
373	if (coleiter == NULL) {
374	coleiter = ucol_openElements(strsrch->collator, pattern->text,
375	patternlength, status);
376	// status will be checked in ucol_next(..) later and if it is an
377	// error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
378	// returned.
379	strsrch->utilIter = coleiter;
380	} else {
381	ucol_setText(coleiter, pattern->text, pattern->textLength, status);
382	}
383	if(U_FAILURE(*status)) {
384	return `0`;
385	}
386
387	if (pattern->pces != pcetable && pattern->pces != NULL) {
388	uprv_free(pattern->pces);
389	}
390
391	uint32_t offset = `0`;
392	uint16_t result = `0`;
393	int64_t pce;
394
395	icu::UCollationPCE iter(coleiter);
396
397	// * Should processed CEs be signed or unsigned?*
398	// * (the rest of the code in this file seems to play fast-and-loose with*
399	// * whether a CE is signed or unsigned. For example, look at routine above this one.)*
400	while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401	U_SUCCESS(*status)) {
402	int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
403	pce,
404	patternlength - ucol_getOffset(coleiter) + `1`,
405	status);
406
407	if (U_FAILURE(*status)) {
408	return `0`;
409	}
410
411	offset += `1`;
412
413	if (pcetable != temp && pcetable != pattern->pcesBuffer) {
414	uprv_free(pcetable);
415	}
416
417	pcetable = temp;
418	//result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419	}
420
421	pcetable[offset] = `0`;
422	pattern->pces = pcetable;
423	pattern->pcesLength = offset;
424
425	return result;
426	}
427
428	/**
429	* Initializes the pattern struct.
430	* Internal method, status assumed to be success.
431	* @param strsrch UStringSearch data storage
432	* @param status output error if any, caller to check status before calling
433	* method, status assumed to be success when passed in.
434	* @return expansionsize the total expansion size of the pattern
435	*/
436	static
437	inline int16_t initializePattern(UStringSearch strsrch, UErrorCode status)
438	{
439	if (U_FAILURE(status)) { return* `0`; }
440	UPattern *pattern = &(strsrch->pattern);
441	const UChar *patterntext = pattern->text;
442	int32_t length = pattern->textLength;
443	int32_t index = `0`;
444
445	// Since the strength is primary, accents are ignored in the pattern.
446	if (strsrch->strength == UCOL_PRIMARY) {
447	pattern->hasPrefixAccents = `0`;
448	pattern->hasSuffixAccents = `0`;
449	} else {
450	pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
451	SECOND_LAST_BYTE_SHIFT_;
452	index = length;
453	U16_BACK_1(patterntext, `0`, index);
454	pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
455	LAST_BYTE_MASK_;
456	}
457
458	// HACK
459	if (strsrch->pattern.pces != NULL) {
460	if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
461	uprv_free(strsrch->pattern.pces);
462	}
463
464	strsrch->pattern.pces = NULL;
465	}
466
467	// since intializePattern is an internal method status is a success.
468	return initializePatternCETable(strsrch, status);
469	}
470
471	/**
472	* Initializing shift tables, with the default values.
473	* If a corresponding default value is 0, the shift table is not set.
474	* @param shift table for forwards shift
475	* @param backshift table for backwards shift
476	* @param cetable table containing pattern ce
477	* @param cesize size of the pattern ces
478	* @param expansionsize total size of the expansions
479	* @param defaultforward the default forward value
480	* @param defaultbackward the default backward value
481	*/
482	static
483	inline void setShiftTable(int16_t shift[], int16_t backshift[],
484	int32_t *cetable, int32_t cesize,
485	int16_t expansionsize,
486	int16_t defaultforward,
487	int16_t defaultbackward)
488	{
489	// estimate the value to shift. to do that we estimate the smallest
490	// number of characters to give the relevant ces, ie approximately
491	// the number of ces minus their expansion, since expansions can come
492	// from a character.
493	int32_t count;
494	for (count = `0`; count < MAX_TABLE_SIZE_; count ++) {
495	shift[count] = defaultforward;
496	}
497	cesize --; // down to the last index
498	for (count = `0`; count < cesize; count ++) {
499	// number of ces from right of array to the count
500	int temp = defaultforward - count - `1`;
501	shift[hashFromCE32(cetable[count])] = temp > `1` ? static_cast<int16_t>(temp) : `1`;
502	}
503	shift[hashFromCE32(cetable[cesize])] = `1`;
504	// for ignorables we just shift by one. see test examples.
505	shift[hashFromCE32(`0`)] = `1`;
506
507	for (count = `0`; count < MAX_TABLE_SIZE_; count ++) {
508	backshift[count] = defaultbackward;
509	}
510	for (count = cesize; count > `0`; count --) {
511	// the original value count does not seem to work
512	backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
513	(int16_t)(count - expansionsize) : `1`;
514	}
515	backshift[hashFromCE32(cetable[`0`])] = `1`;
516	backshift[hashFromCE32(`0`)] = `1`;
517	}
518
519	/**
520	* Building of the pattern collation element list and the boyer moore strsrch
521	* table.
522	* The canonical match will only be performed after the default match fails.
523	* For both cases we need to remember the size of the composed and decomposed
524	* versions of the string. Since the Boyer-Moore shift calculations shifts by
525	* a number of characters in the text and tries to match the pattern from that
526	* offset, the shift value can not be too large in case we miss some
527	* characters. To choose a right shift size, we estimate the NFC form of the
528	* and use its size as a shift guide. The NFC form should be the small
529	* possible representation of the pattern. Anyways, we'll err on the smaller
530	* shift size. Hence the calculation for minlength.
531	* Canonical match will be performed slightly differently. We'll split the
532	* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
533	* the first and last base character (MS), the ending accents (EA). Matches
534	* will be done on MS first, and only when we match MS then some processing
535	* will be required for the prefix and end accents in order to determine if
536	* they match PA and EA. Hence the default shift values
537	* for the canonical match will take the size of either end's accent into
538	* consideration. Forwards search will take the end accents into consideration
539	* for the default shift values and the backwards search will take the prefix
540	* accents into consideration.
541	* If pattern has no non-ignorable ce, we return a illegal argument error.
542	* Internal method, status assumed to be success.
543	* @param strsrch UStringSearch data storage
544	* @param status for output errors if it occurs, status is assumed to be a
545	* success when it is passed in.
546	*/
547	static
548	inline void initialize(UStringSearch strsrch, UErrorCode status)
549	{
550	int16_t expandlength = initializePattern(strsrch, status);
551	if (U_SUCCESS(*status) && strsrch->pattern.cesLength > `0`) {
552	UPattern *pattern = &strsrch->pattern;
553	int32_t cesize = pattern->cesLength;
554
555	int16_t minlength = cesize > expandlength
556	? (int16_t)cesize - expandlength : `1`;
557	pattern->defaultShiftSize = minlength;
558	setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
559	cesize, expandlength, minlength, minlength);
560	return;
561	}
562	strsrch->pattern.defaultShiftSize = `0`;
563	}
564
565	#if BOYER_MOORE
566	/**
567	* Check to make sure that the match length is at the end of the character by
568	* using the breakiterator.
569	* @param strsrch string search data
570	* @param start target text start offset
571	* @param end target text end offset
572	*/
573	static
574	void checkBreakBoundary(const UStringSearch strsrch, int32_t /start/,
575	int32_t *end)
576	{
577	#if !UCONFIG_NO_BREAK_ITERATION
578	UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
579	if (breakiterator) {
580	int32_t matchend = *end;
581	//int32_t matchstart = start;*
582
583	if (!ubrk_isBoundary(breakiterator, matchend)) {
584	*end = ubrk_following(breakiterator, matchend);
585	}
586
587	/ Check the start of the matched text to make sure it doesn't have any accents*
588	* before it. This code may not be necessary and so it is commented out */
589	/if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {*
590	*start = ubrk_preceding(breakiterator, matchstart);
591	}/*
592	}
593	#endif
594	}
595
596	/**
597	* Determine whether the target text in UStringSearch bounded by the offset
598	* start and end is one or more whole units of text as
599	* determined by the breakiterator in UStringSearch.
600	* @param strsrch string search data
601	* @param start target text start offset
602	* @param end target text end offset
603	*/
604	static
605	UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
606	int32_t end)
607	{
608	#if !UCONFIG_NO_BREAK_ITERATION
609	UBreakIterator *breakiterator = strsrch->search->breakIter;
610	//TODO: Add here.
611	if (breakiterator) {
612	int32_t startindex = ubrk_first(breakiterator);
613	int32_t endindex = ubrk_last(breakiterator);
614
615	// out-of-range indexes are never boundary positions
616	if (start < startindex \|\| start > endindex \|\|
617	end < startindex \|\| end > endindex) {
618	return FALSE;
619	}
620	// otherwise, we can use following() on the position before the
621	// specified one and return true of the position we get back is the
622	// one the user specified
623	UBool result = (start == startindex \|\|
624	ubrk_following(breakiterator, start - `1`) == start) &&
625	(end == endindex \|\|
626	ubrk_following(breakiterator, end - `1`) == end);
627	if (result) {
628	// iterates the individual ces
629	UCollationElements *coleiter = strsrch->utilIter;
630	const UChar *text = strsrch->search->text +
631	start;
632	UErrorCode status = U_ZERO_ERROR;
633	ucol_setText(coleiter, text, end - start, &status);
634	for (int32_t count = `0`; count < strsrch->pattern.cesLength;
635	count ++) {
636	int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
637	if (ce == UCOL_IGNORABLE) {
638	count --;
639	continue;
640	}
641	if (U_FAILURE(status) \|\| ce != strsrch->pattern.ces[count]) {
642	return FALSE;
643	}
644	}
645	int32_t nextce = ucol_next(coleiter, &status);
646	while (ucol_getOffset(coleiter) == (end - start)
647	&& getCE(strsrch, nextce) == UCOL_IGNORABLE) {
648	nextce = ucol_next(coleiter, &status);
649	}
650	if (ucol_getOffset(coleiter) == (end - start)
651	&& nextce != UCOL_NULLORDER) {
652	// extra collation elements at the end of the match
653	return FALSE;
654	}
655	}
656	return result;
657	}
658	#endif
659	return TRUE;
660	}
661
662	/**
663	* Getting the next base character offset if current offset is an accent,
664	* or the current offset if the current character contains a base character.
665	* accents the following base character will be returned
666	* @param text string
667	* @param textoffset current offset
668	* @param textlength length of text string
669	* @return the next base character or the current offset
670	* if the current character is contains a base character.
671	*/
672	static
673	inline int32_t getNextBaseOffset(const UChar *text,
674	int32_t textoffset,
675	int32_t textlength)
676	{
677	if (textoffset < textlength) {
678	int32_t temp = textoffset;
679	if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
680	while (temp < textlength) {
681	int32_t result = temp;
682	if ((getFCD(text, &temp, textlength) >>
683	SECOND_LAST_BYTE_SHIFT_) == `0`) {
684	return result;
685	}
686	}
687	return textlength;
688	}
689	}
690	return textoffset;
691	}
692
693	/**
694	* Gets the next base character offset depending on the string search pattern
695	* data
696	* @param strsrch string search data
697	* @param textoffset current offset, one offset away from the last character
698	* to search for.
699	* @return start index of the next base character or the current offset
700	* if the current character is contains a base character.
701	*/
702	static
703	inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
704	int32_t textoffset)
705	{
706	int32_t textlength = strsrch->search->textLength;
707	if (strsrch->pattern.hasSuffixAccents &&
708	textoffset < textlength) {
709	int32_t temp = textoffset;
710	const UChar *text = strsrch->search->text;
711	U16_BACK_1(text, `0`, temp);
712	if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
713	return getNextBaseOffset(text, textoffset, textlength);
714	}
715	}
716	return textoffset;
717	}
718
719	/**
720	* Shifting the collation element iterator position forward to prepare for
721	* a following match. If the last character is a unsafe character, we'll only
722	* shift by 1 to capture contractions, normalization etc.
723	* Internal method, status assumed to be success.
724	* @param text strsrch string search data
725	* @param textoffset start text position to do search
726	* @param ce the text ce which failed the match.
727	* @param patternceindex index of the ce within the pattern ce buffer which
728	* failed the match
729	* @return final offset
730	*/
731	static
732	inline int32_t shiftForward(UStringSearch *strsrch,
733	int32_t textoffset,
734	int32_t ce,
735	int32_t patternceindex)
736	{
737	UPattern *pattern = &(strsrch->pattern);
738	if (ce != UCOL_NULLORDER) {
739	int32_t shift = pattern->shift[hashFromCE32(ce)];
740	// this is to adjust for characters in the middle of the
741	// substring for matching that failed.
742	int32_t adjust = pattern->cesLength - patternceindex;
743	if (adjust > `1` && shift >= adjust) {
744	shift -= adjust - `1`;
745	}
746	textoffset += shift;
747	}
748	else {
749	textoffset += pattern->defaultShiftSize;
750	}
751
752	textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
753	// check for unsafe characters
754	// if it is the start or middle of a contraction: to be done after*
755	// a initial match is found
756	// thai or lao base consonant character: similar to contraction*
757	// high surrogate character: similar to contraction*
758	// next character is a accent: shift to the next base character*
759	return textoffset;
760	}
761	#endif // #if BOYER_MOORE
762
763	/**
764	* sets match not found
765	* @param strsrch string search data
766	*/
767	static
768	inline void setMatchNotFound(UStringSearch *strsrch)
769	{
770	// this method resets the match result regardless of the error status.
771	strsrch->search->matchedIndex = USEARCH_DONE;
772	strsrch->search->matchedLength = `0`;
773	if (strsrch->search->isForwardSearching) {
774	setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
775	}
776	else {
777	setColEIterOffset(strsrch->textIter, `0`);
778	}
779	}
780
781	#if BOYER_MOORE
782	/**
783	* Gets the offset to the next safe point in text.
784	* ie. not the middle of a contraction, swappable characters or supplementary
785	* characters.
786	* @param collator collation sata
787	* @param text string to work with
788	* @param textoffset offset in string
789	* @param textlength length of text string
790	* @return offset to the next safe character
791	*/
792	static
793	inline int32_t getNextSafeOffset(const UCollator *collator,
794	const UChar *text,
795	int32_t textoffset,
796	int32_t textlength)
797	{
798	int32_t result = textoffset; // first contraction character
799	while (result != textlength && ucol_unsafeCP(text[result], collator)) {
800	result ++;
801	}
802	return result;
803	}
804
805	/**
806	* This checks for accents in the potential match started with a .
807	* composite character.
808	* This is really painful... we have to check that composite character do not
809	* have any extra accents. We have to normalize the potential match and find
810	* the immediate decomposed character before the match.
811	* The first composite character would have been taken care of by the fcd
812	* checks in checkForwardExactMatch.
813	* This is the slow path after the fcd of the first character and
814	* the last character has been checked by checkForwardExactMatch and we
815	* determine that the potential match has extra non-ignorable preceding
816	* ces.
817	* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
818	* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
819	* Note here that accents checking are slow and cautioned in the API docs.
820	* Internal method, status assumed to be a success, caller should check status
821	* before calling this method
822	* @param strsrch string search data
823	* @param start index of the potential unfriendly composite character
824	* @param end index of the potential unfriendly composite character
825	* @param status output error status if any.
826	* @return TRUE if there is non-ignorable accents before at the beginning
827	* of the match, FALSE otherwise.
828	*/
829
830	static
831	UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
832	int32_t end,
833	UErrorCode *status)
834	{
835	UBool result = FALSE;
836	if (strsrch->pattern.hasPrefixAccents) {
837	int32_t length = end - start;
838	int32_t offset = `0`;
839	const UChar *text = strsrch->search->text + start;
840
841	U16_FWD_1(text, offset, length);
842	// we are only concerned with the first composite character
843	if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
844	int32_t safeoffset = getNextSafeOffset(strsrch->collator,
845	text, `0`, length);
846	if (safeoffset != length) {
847	safeoffset ++;
848	}
849	UChar *norm = NULL;
850	UChar buffer[INITIAL_ARRAY_SIZE_];
851	int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, `0`,
852	buffer, INITIAL_ARRAY_SIZE_,
853	status);
854	if (U_FAILURE(*status)) {
855	return FALSE;
856	}
857	if (size >= INITIAL_ARRAY_SIZE_) {
858	norm = (UChar )allocateMemory((size + `1`) sizeof(UChar),
859	status);
860	// if allocation failed, status will be set to
861	// U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
862	// checks for it.
863	size = unorm_normalize(text, safeoffset, UNORM_NFD, `0`, norm,
864	size, status);
865	if (U_FAILURE(*status) && norm != NULL) {
866	uprv_free(norm);
867	return FALSE;
868	}
869	}
870	else {
871	norm = buffer;
872	}
873
874	UCollationElements *coleiter = strsrch->utilIter;
875	ucol_setText(coleiter, norm, size, status);
876	uint32_t firstce = strsrch->pattern.ces[`0`];
877	UBool ignorable = TRUE;
878	uint32_t ce = UCOL_IGNORABLE;
879	while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
880	offset = ucol_getOffset(coleiter);
881	if (ce != firstce && ce != UCOL_IGNORABLE) {
882	ignorable = FALSE;
883	}
884	ce = ucol_next(coleiter, status);
885	}
886	UChar32 codepoint;
887	U16_PREV(norm, `0`, offset, codepoint);
888	result = !ignorable && (u_getCombiningClass(codepoint) != `0`);
889
890	if (norm != buffer) {
891	uprv_free(norm);
892	}
893	}
894	}
895
896	return result;
897	}
898
899	/**
900	* Used by exact matches, checks if there are accents before the match.
901	* This is really painful... we have to check that composite characters at
902	* the start of the matches have to not have any extra accents.
903	* We check the FCD of the character first, if it starts with an accent and
904	* the first pattern ce does not match the first ce of the character, we bail.
905	* Otherwise we try normalizing the first composite
906	* character and find the immediate decomposed character before the match to
907	* see if it is an non-ignorable accent.
908	* Now normalizing the first composite character is enough because we ensure
909	* that when the match is passed in here with extra beginning ces, the
910	* first or last ce that match has to occur within the first character.
911	* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
912	* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
913	* Note here that accents checking are slow and cautioned in the API docs.
914	* @param strsrch string search data
915	* @param start offset
916	* @param end offset
917	* @return TRUE if there are accents on either side of the match,
918	* FALSE otherwise
919	*/
920	static
921	UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
922	int32_t end)
923	{
924	if (strsrch->pattern.hasPrefixAccents) {
925	UCollationElements *coleiter = strsrch->textIter;
926	UErrorCode status = U_ZERO_ERROR;
927	// we have been iterating forwards previously
928	uint32_t ignorable = TRUE;
929	int32_t firstce = strsrch->pattern.ces[`0`];
930
931	setColEIterOffset(coleiter, start);
932	int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
933	if (U_FAILURE(status)) {
934	return TRUE;
935	}
936	while (ce != firstce) {
937	if (ce != UCOL_IGNORABLE) {
938	ignorable = FALSE;
939	}
940	ce = getCE(strsrch, ucol_next(coleiter, &status));
941	if (U_FAILURE(status) \|\| ce == UCOL_NULLORDER) {
942	return TRUE;
943	}
944	}
945	if (!ignorable && inNormBuf(coleiter)) {
946	// within normalization buffer, discontiguous handled here
947	return TRUE;
948	}
949
950	// within text
951	int32_t temp = start;
952	// original code
953	// accent = (getFCD(strsrch->search->text, &temp,
954	// strsrch->search->textLength)
955	// >> SECOND_LAST_BYTE_SHIFT_);
956	// however this code does not work well with VC7 .net in release mode.
957	// maybe the inlines for getFCD combined with shifting has bugs in
958	// VC7. anyways this is a work around.
959	UBool accent = getFCD(strsrch->search->text, &temp,
960	strsrch->search->textLength) > `0xFF`;
961	if (!accent) {
962	return checkExtraMatchAccents(strsrch, start, end, &status);
963	}
964	if (!ignorable) {
965	return TRUE;
966	}
967	if (start > `0`) {
968	temp = start;
969	U16_BACK_1(strsrch->search->text, `0`, temp);
970	if (getFCD(strsrch->search->text, &temp,
971	strsrch->search->textLength) & LAST_BYTE_MASK_) {
972	setColEIterOffset(coleiter, start);
973	ce = ucol_previous(coleiter, &status);
974	if (U_FAILURE(status) \|\|
975	(ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
976	return TRUE;
977	}
978	}
979	}
980	}
981
982	return FALSE;
983	}
984
985	/**
986	* Used by exact matches, checks if there are accents bounding the match.
987	* Note this is the initial boundary check. If the potential match
988	* starts or ends with composite characters, the accents in those
989	* characters will be determined later.
990	* Not doing backwards iteration here, since discontiguos contraction for
991	* backwards collation element iterator, use up too many characters.
992	* E.g. looking for \u030A ring in \u01FA A ring above and acute,
993	* should fail since there is a acute at the end of \u01FA
994	* Note here that accents checking are slow and cautioned in the API docs.
995	* @param strsrch string search data
996	* @param start offset of match
997	* @param end end offset of the match
998	* @return TRUE if there are accents on either side of the match,
999	* FALSE otherwise
1000	*/
1001	static
1002	UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1003	int32_t end)
1004	{
1005	if (strsrch->pattern.hasSuffixAccents) {
1006	const UChar *text = strsrch->search->text;
1007	int32_t temp = end;
1008	int32_t textlength = strsrch->search->textLength;
1009	U16_BACK_1(text, `0`, temp);
1010	if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1011	int32_t firstce = strsrch->pattern.ces[`0`];
1012	UCollationElements *coleiter = strsrch->textIter;
1013	UErrorCode status = U_ZERO_ERROR;
1014	int32_t ce;
1015	setColEIterOffset(coleiter, start);
1016	while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1017	if (U_FAILURE(status) \|\| ce == UCOL_NULLORDER) {
1018	return TRUE;
1019	}
1020	}
1021	int32_t count = `1`;
1022	while (count < strsrch->pattern.cesLength) {
1023	if (getCE(strsrch, ucol_next(coleiter, &status))
1024	== UCOL_IGNORABLE) {
1025	// Thai can give an ignorable here.
1026	count --;
1027	}
1028	if (U_FAILURE(status)) {
1029	return TRUE;
1030	}
1031	count ++;
1032	}
1033
1034	ce = ucol_next(coleiter, &status);
1035	if (U_FAILURE(status)) {
1036	return TRUE;
1037	}
1038	if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1039	ce = getCE(strsrch, ce);
1040	}
1041	if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1042	if (ucol_getOffset(coleiter) <= end) {
1043	return TRUE;
1044	}
1045	if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1046	return TRUE;
1047	}
1048	}
1049	}
1050	}
1051	return FALSE;
1052	}
1053	#endif // #if BOYER_MOORE
1054
1055	/**
1056	* Checks if the offset runs out of the text string
1057	* @param offset
1058	* @param textlength of the text string
1059	* @return TRUE if offset is out of bounds, FALSE otherwise
1060	*/
1061	static
1062	inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1063	{
1064	return offset < `0` \|\| offset > textlength;
1065	}
1066
1067	/**
1068	* Checks for identical match
1069	* @param strsrch string search data
1070	* @param start offset of possible match
1071	* @param end offset of possible match
1072	* @return TRUE if identical match is found
1073	*/
1074	static
1075	inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1076	int32_t end)
1077	{
1078	if (strsrch->strength != UCOL_IDENTICAL) {
1079	return TRUE;
1080	}
1081
1082	// Note: We could use Normalizer::compare() or similar, but for short strings
1083	// which may not be in FCD it might be faster to just NFD them.
1084	UErrorCode status = U_ZERO_ERROR;
1085	UnicodeString t2, p2;
1086	strsrch->nfd->normalize(
1087	UnicodeString (FALSE, strsrch->search->text + start, end - start), t2, status);
1088	strsrch->nfd->normalize(
1089	UnicodeString (FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1090	// return FALSE if NFD failed
1091	return U_SUCCESS(status) && t2 == p2;
1092	}
1093
1094	#if BOYER_MOORE
1095	/**
1096	* Checks to see if the match is repeated
1097	* @param strsrch string search data
1098	* @param start new match start index
1099	* @param end new match end index
1100	* @return TRUE if the the match is repeated, FALSE otherwise
1101	*/
1102	static
1103	inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1104	int32_t start,
1105	int32_t end)
1106	{
1107	int32_t lastmatchindex = strsrch->search->matchedIndex;
1108	UBool result;
1109	if (lastmatchindex == USEARCH_DONE) {
1110	return FALSE;
1111	}
1112	if (strsrch->search->isForwardSearching) {
1113	result = start <= lastmatchindex;
1114	}
1115	else {
1116	result = start >= lastmatchindex;
1117	}
1118	if (!result && !strsrch->search->isOverlap) {
1119	if (strsrch->search->isForwardSearching) {
1120	result = start < lastmatchindex + strsrch->search->matchedLength;
1121	}
1122	else {
1123	result = end > lastmatchindex;
1124	}
1125	}
1126	return result;
1127	}
1128
1129	/**
1130	* Gets the collation element iterator's current offset.
1131	* @param coleiter collation element iterator
1132	* @param forwards flag TRUE if we are moving in th forwards direction
1133	* @return current offset
1134	*/
1135	static
1136	inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1137	UBool forwards)
1138	{
1139	int32_t result = ucol_getOffset(coleiter);
1140	// intricacies of the the backwards collation element iterator
1141	if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1142	result ++;
1143	}
1144	return result;
1145	}
1146
1147	/**
1148	* Checks match for contraction.
1149	* If the match ends with a partial contraction we fail.
1150	* If the match starts too far off (because of backwards iteration) we try to
1151	* chip off the extra characters depending on whether a breakiterator has
1152	* been used.
1153	* Internal method, error assumed to be success, caller has to check status
1154	* before calling this method.
1155	* @param strsrch string search data
1156	* @param start offset of potential match, to be modified if necessary
1157	* @param end offset of potential match, to be modified if necessary
1158	* @param status output error status if any
1159	* @return TRUE if match passes the contraction test, FALSE otherwise
1160	*/
1161
1162	static
1163	UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1164	int32_t *start,
1165	int32_t end, UErrorCode status)
1166	{
1167	UCollationElements *coleiter = strsrch->textIter;
1168	int32_t textlength = strsrch->search->textLength;
1169	int32_t temp = *start;
1170	const UCollator *collator = strsrch->collator;
1171	const UChar *text = strsrch->search->text;
1172	// This part checks if either ends of the match contains potential
1173	// contraction. If so we'll have to iterate through them
1174	// The start contraction needs to be checked since ucol_previous dumps
1175	// all characters till the first safe character into the buffer.
1176	// start + 1 is used to test for the unsafe characters instead of start
1177	// because ucol_prev takes all unsafe characters till the first safe
1178	// character ie start. so by testing start + 1, we can estimate if
1179	// excess prefix characters has been included in the potential search
1180	// results.
1181	if ((end < textlength && ucol_unsafeCP(text[end], collator)) \|\|
1182	(*start + `1` < textlength
1183	&& ucol_unsafeCP(text[*start + `1`], collator))) {
1184	int32_t expansion = getExpansionPrefix(coleiter);
1185	UBool expandflag = expansion > `0`;
1186	setColEIterOffset(coleiter, *start);
1187	while (expansion > `0`) {
1188	// getting rid of the redundant ce, caused by setOffset.
1189	// since backward contraction/expansion may have extra ces if we
1190	// are in the normalization buffer, hasAccentsBeforeMatch would
1191	// have taken care of it.
1192	// E.g. the character \u01FA will have an expansion of 3, but if
1193	// we are only looking for acute and ring \u030A and \u0301, we'll
1194	// have to skip the first ce in the expansion buffer.
1195	ucol_next(coleiter, status);
1196	if (U_FAILURE(*status)) {
1197	return FALSE;
1198	}
1199	if (ucol_getOffset(coleiter) != temp) {
1200	*start = temp;
1201	temp = ucol_getOffset(coleiter);
1202	}
1203	expansion --;
1204	}
1205
1206	int32_t *patternce = strsrch->pattern.ces;
1207	int32_t patterncelength = strsrch->pattern.cesLength;
1208	int32_t count = `0`;
1209	while (count < patterncelength) {
1210	int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1211	if (ce == UCOL_IGNORABLE) {
1212	continue;
1213	}
1214	if (expandflag && count == `0` && ucol_getOffset(coleiter) != temp) {
1215	*start = temp;
1216	temp = ucol_getOffset(coleiter);
1217	}
1218	if (U_FAILURE(*status) \|\| ce != patternce[count]) {
1219	(*end) ++;
1220	end = getNextUStringSearchBaseOffset(strsrch, end);
1221	return FALSE;
1222	}
1223	count ++;
1224	}
1225	}
1226	return TRUE;
1227	}
1228
1229	/**
1230	* Checks and sets the match information if found.
1231	* Checks
1232	* <ul>
1233	* <li> the potential match does not repeat the previous match
1234	* <li> boundaries are correct
1235	* <li> exact matches has no extra accents
1236	* <li> identical matchesb
1237	* <li> potential match does not end in the middle of a contraction
1238	* <\ul>
1239	* Otherwise the offset will be shifted to the next character.
1240	* Internal method, status assumed to be success, caller has to check status
1241	* before calling this method.
1242	* @param strsrch string search data
1243	* @param textoffset offset in the collation element text. the returned value
1244	* will be the truncated end offset of the match or the new start
1245	* search offset.
1246	* @param status output error status if any
1247	* @return TRUE if the match is valid, FALSE otherwise
1248	*/
1249	static
1250	inline UBool checkNextExactMatch(UStringSearch *strsrch,
1251	int32_t textoffset, UErrorCode status)
1252	{
1253	UCollationElements *coleiter = strsrch->textIter;
1254	int32_t start = getColElemIterOffset(coleiter, FALSE);
1255
1256	if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1257	return FALSE;
1258	}
1259
1260	// this totally matches, however we need to check if it is repeating
1261	if (!isBreakUnit(strsrch, start, *textoffset) \|\|
1262	checkRepeatedMatch(strsrch, start, *textoffset) \|\|
1263	hasAccentsBeforeMatch(strsrch, start, *textoffset) \|\|
1264	!checkIdentical(strsrch, start, *textoffset) \|\|
1265	hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1266
1267	(*textoffset) ++;
1268	textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
1269	return FALSE;
1270	}
1271
1272	//Add breakiterator boundary check for primary strength search.
1273	if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1274	checkBreakBoundary(strsrch, &start, textoffset);
1275	}
1276
1277	// totally match, we will get rid of the ending ignorables.
1278	strsrch->search->matchedIndex = start;
1279	strsrch->search->matchedLength = *textoffset - start;
1280	return TRUE;
1281	}
1282
1283	/**
1284	* Getting the previous base character offset, or the current offset if the
1285	* current character is a base character
1286	* @param text string
1287	* @param textoffset one offset after the current character
1288	* @return the offset of the next character after the base character or the first
1289	* composed character with accents
1290	*/
1291	static
1292	inline int32_t getPreviousBaseOffset(const UChar *text,
1293	int32_t textoffset)
1294	{
1295	if (textoffset > `0`) {
1296	for (;;) {
1297	int32_t result = textoffset;
1298	U16_BACK_1(text, `0`, textoffset);
1299	int32_t temp = textoffset;
1300	uint16_t fcd = getFCD(text, &temp, result);
1301	if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == `0`) {
1302	if (fcd & LAST_BYTE_MASK_) {
1303	return textoffset;
1304	}
1305	return result;
1306	}
1307	if (textoffset == `0`) {
1308	return `0`;
1309	}
1310	}
1311	}
1312	return textoffset;
1313	}
1314
1315	/**
1316	* Getting the indexes of the accents that are not blocked in the argument
1317	* accent array
1318	* @param accents array of accents in nfd terminated by a 0.
1319	* @param accentsindex array of indexes of the accents that are not blocked
1320	*/
1321	static
1322	inline int getUnblockedAccentIndex(UChar accents, int32_t accentsindex)
1323	{
1324	int32_t index = `0`;
1325	int32_t length = u_strlen(accents);
1326	UChar32 codepoint = `0`;
1327	int cclass = `0`;
1328	int result = `0`;
1329	int32_t temp;
1330	while (index < length) {
1331	temp = index;
1332	U16_NEXT(accents, index, length, codepoint);
1333	if (u_getCombiningClass(codepoint) != cclass) {
1334	cclass = u_getCombiningClass(codepoint);
1335	accentsindex[result] = temp;
1336	result ++;
1337	}
1338	}
1339	accentsindex[result] = length;
1340	return result;
1341	}
1342
1343	/**
1344	* Appends 3 UChar arrays to a destination array.
1345	* Creates a new array if we run out of space. The caller will have to
1346	* manually deallocate the newly allocated array.
1347	* Internal method, status assumed to be success, caller has to check status
1348	* before calling this method. destination not to be NULL and has at least
1349	* size destinationlength.
1350	* @param destination target array
1351	* @param destinationlength target array size, returning the appended length
1352	* @param source1 null-terminated first array
1353	* @param source2 second array
1354	* @param source2length length of second array
1355	* @param source3 null-terminated third array
1356	* @param status error status if any
1357	* @return new destination array, destination if there was no new allocation
1358	*/
1359	static
1360	inline UChar * addToUCharArray( UChar *destination,
1361	int32_t *destinationlength,
1362	const UChar *source1,
1363	const UChar *source2,
1364	int32_t source2length,
1365	const UChar *source3,
1366	UErrorCode *status)
1367	{
1368	int32_t source1length = source1 ? u_strlen(source1) : `0`;
1369	int32_t source3length = source3 ? u_strlen(source3) : `0`;
1370	if (*destinationlength < source1length + source2length + source3length +
1371	`1`)
1372	{
1373	destination = (UChar *)allocateMemory(
1374	(source1length + source2length + source3length + `1`) * sizeof(UChar),
1375	status);
1376	// if error allocating memory, status will be
1377	// U_MEMORY_ALLOCATION_ERROR
1378	if (U_FAILURE(*status)) {
1379	*destinationlength = `0`;
1380	return NULL;
1381	}
1382	}
1383	if (source1length != `0`) {
1384	u_memcpy(destination, source1, source1length);
1385	}
1386	if (source2length != `0`) {
1387	uprv_memcpy(destination + source1length, source2,
1388	sizeof(UChar) * source2length);
1389	}
1390	if (source3length != `0`) {
1391	uprv_memcpy(destination + source1length + source2length, source3,
1392	sizeof(UChar) * source3length);
1393	}
1394	*destinationlength = source1length + source2length + source3length;
1395	return destination;
1396	}
1397
1398	/**
1399	* Running through a collation element iterator to see if the contents matches
1400	* pattern in string search data
1401	* @param strsrch string search data
1402	* @param coleiter collation element iterator
1403	* @return TRUE if a match if found, FALSE otherwise
1404	*/
1405	static
1406	inline UBool checkCollationMatch(const UStringSearch *strsrch,
1407	UCollationElements *coleiter)
1408	{
1409	int patternceindex = strsrch->pattern.cesLength;
1410	int32_t *patternce = strsrch->pattern.ces;
1411	UErrorCode status = U_ZERO_ERROR;
1412	while (patternceindex > `0`) {
1413	int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1414	if (ce == UCOL_IGNORABLE) {
1415	continue;
1416	}
1417	if (U_FAILURE(status) \|\| ce != *patternce) {
1418	return FALSE;
1419	}
1420	patternce ++;
1421	patternceindex --;
1422	}
1423	return TRUE;
1424	}
1425
1426	/**
1427	* Rearranges the front accents to try matching.
1428	* Prefix accents in the text will be grouped according to their combining
1429	* class and the groups will be mixed and matched to try find the perfect
1430	* match with the pattern.
1431	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1434	* "\u0301\u0325".
1435	* step 2: check if any of the generated substrings matches the pattern.
1436	* Internal method, status is assumed to be success, caller has to check status
1437	* before calling this method.
1438	* @param strsrch string search match
1439	* @param start first offset of the accents to start searching
1440	* @param end start of the last accent set
1441	* @param status output error status if any
1442	* @return USEARCH_DONE if a match is not found, otherwise return the starting
1443	* offset of the match. Note this start includes all preceding accents.
1444	*/
1445	static
1446	int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1447	int32_t start,
1448	int32_t end,
1449	UErrorCode *status)
1450	{
1451	const UChar *text = strsrch->search->text;
1452	int32_t textlength = strsrch->search->textLength;
1453	int32_t tempstart = start;
1454
1455	if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == `0`) {
1456	// die... failed at a base character
1457	return USEARCH_DONE;
1458	}
1459
1460	int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1461	start = getPreviousBaseOffset(text, tempstart);
1462
1463	UChar accents[INITIAL_ARRAY_SIZE_];
1464	// normalizing the offensive string
1465	unorm_normalize(text + start, offset - start, UNORM_NFD, `0`, accents,
1466	INITIAL_ARRAY_SIZE_, status);
1467	if (U_FAILURE(*status)) {
1468	return USEARCH_DONE;
1469	}
1470
1471	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1472	int32_t accentsize = getUnblockedAccentIndex(accents,
1473	accentsindex);
1474	int32_t count = (`2` << (accentsize - `1`)) - `1`;
1475	UChar buffer[INITIAL_ARRAY_SIZE_];
1476	UCollationElements *coleiter = strsrch->utilIter;
1477	while (U_SUCCESS(*status) && count > `0`) {
1478	UChar *rearrange = strsrch->canonicalPrefixAccents;
1479	// copy the base characters
1480	for (int k = `0`; k < accentsindex[`0`]; k ++) {
1481	*rearrange ++ = accents[k];
1482	}
1483	// forming all possible canonical rearrangement by dropping
1484	// sets of accents
1485	for (int i = `0`; i <= accentsize - `1`; i ++) {
1486	int32_t mask = `1` << (accentsize - i - `1`);
1487	if (count & mask) {
1488	for (int j = accentsindex[i]; j < accentsindex[i + `1`]; j ++) {
1489	*rearrange ++ = accents[j];
1490	}
1491	}
1492	}
1493	*rearrange = `0`;
1494	int32_t matchsize = INITIAL_ARRAY_SIZE_;
1495	UChar *match = addToUCharArray(buffer, &matchsize,
1496	strsrch->canonicalPrefixAccents,
1497	strsrch->search->text + offset,
1498	end - offset,
1499	strsrch->canonicalSuffixAccents,
1500	status);
1501
1502	// if status is a failure, ucol_setText does nothing.
1503	// run the collator iterator through this match
1504	ucol_setText(coleiter, match, matchsize, status);
1505	if (U_SUCCESS(*status)) {
1506	if (checkCollationMatch(strsrch, coleiter)) {
1507	if (match != buffer) {
1508	uprv_free(match);
1509	}
1510	return start;
1511	}
1512	}
1513	count --;
1514	}
1515	return USEARCH_DONE;
1516	}
1517
1518	/**
1519	* Gets the offset to the safe point in text before textoffset.
1520	* ie. not the middle of a contraction, swappable characters or supplementary
1521	* characters.
1522	* @param collator collation sata
1523	* @param text string to work with
1524	* @param textoffset offset in string
1525	* @param textlength length of text string
1526	* @return offset to the previous safe character
1527	*/
1528	static
1529	inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1530	const UChar *text,
1531	int32_t textoffset)
1532	{
1533	int32_t result = textoffset; // first contraction character
1534	while (result != `0` && ucol_unsafeCP(text[result - `1`], collator)) {
1535	result --;
1536	}
1537	if (result != `0`) {
1538	// the first contraction character is consider unsafe here
1539	result --;
1540	}
1541	return result;
1542	}
1543
1544	/**
1545	* Cleaning up after we passed the safe zone
1546	* @param strsrch string search data
1547	* @param safetext safe text array
1548	* @param safebuffer safe text buffer
1549	* @param coleiter collation element iterator for safe text
1550	*/
1551	static
1552	inline void cleanUpSafeText(const UStringSearch strsrch, UChar safetext,
1553	UChar *safebuffer)
1554	{
1555	if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1556	{
1557	uprv_free(safetext);
1558	}
1559	}
1560
1561	/**
1562	* Take the rearranged end accents and tries matching. If match failed at
1563	* a separate preceding set of accents (separated from the rearranged on by
1564	* at least a base character) then we rearrange the preceding accents and
1565	* tries matching again.
1566	* We allow skipping of the ends of the accent set if the ces do not match.
1567	* However if the failure is found before the accent set, it fails.
1568	* Internal method, status assumed to be success, caller has to check status
1569	* before calling this method.
1570	* @param strsrch string search data
1571	* @param textoffset of the start of the rearranged accent
1572	* @param status output error status if any
1573	* @return USEARCH_DONE if a match is not found, otherwise return the starting
1574	* offset of the match. Note this start includes all preceding accents.
1575	*/
1576	static
1577	int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1578	int32_t textoffset,
1579	UErrorCode *status)
1580	{
1581	const UChar *text = strsrch->search->text;
1582	const UCollator *collator = strsrch->collator;
1583	int32_t safelength = `0`;
1584	UChar *safetext;
1585	int32_t safetextlength;
1586	UChar safebuffer[INITIAL_ARRAY_SIZE_];
1587	UCollationElements *coleiter = strsrch->utilIter;
1588	int32_t safeoffset = textoffset;
1589
1590	if (textoffset != `0` && ucol_unsafeCP(strsrch->canonicalSuffixAccents[`0`],
1591	collator)) {
1592	safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1593	safelength = textoffset - safeoffset;
1594	safetextlength = INITIAL_ARRAY_SIZE_;
1595	safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1596	text + safeoffset, safelength,
1597	strsrch->canonicalSuffixAccents,
1598	status);
1599	}
1600	else {
1601	safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1602	safetext = strsrch->canonicalSuffixAccents;
1603	}
1604
1605	// if status is a failure, ucol_setText does nothing
1606	ucol_setText(coleiter, safetext, safetextlength, status);
1607	// status checked in loop below
1608
1609	int32_t *ce = strsrch->pattern.ces;
1610	int32_t celength = strsrch->pattern.cesLength;
1611	int ceindex = celength - `1`;
1612	UBool isSafe = TRUE; // indication flag for position in safe zone
1613
1614	while (ceindex >= `0`) {
1615	int32_t textce = ucol_previous(coleiter, status);
1616	if (U_FAILURE(*status)) {
1617	if (isSafe) {
1618	cleanUpSafeText(strsrch, safetext, safebuffer);
1619	}
1620	return USEARCH_DONE;
1621	}
1622	if (textce == UCOL_NULLORDER) {
1623	// check if we have passed the safe buffer
1624	if (coleiter == strsrch->textIter) {
1625	cleanUpSafeText(strsrch, safetext, safebuffer);
1626	return USEARCH_DONE;
1627	}
1628	cleanUpSafeText(strsrch, safetext, safebuffer);
1629	safetext = safebuffer;
1630	coleiter = strsrch->textIter;
1631	setColEIterOffset(coleiter, safeoffset);
1632	// status checked at the start of the loop
1633	isSafe = FALSE;
1634	continue;
1635	}
1636	textce = getCE(strsrch, textce);
1637	if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1638	// do the beginning stuff
1639	int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1640	if (isSafe && failedoffset >= safelength) {
1641	// alas... no hope. failed at rearranged accent set
1642	cleanUpSafeText(strsrch, safetext, safebuffer);
1643	return USEARCH_DONE;
1644	}
1645	else {
1646	if (isSafe) {
1647	failedoffset += safeoffset;
1648	cleanUpSafeText(strsrch, safetext, safebuffer);
1649	}
1650
1651	// try rearranging the front accents
1652	int32_t result = doNextCanonicalPrefixMatch(strsrch,
1653	failedoffset, textoffset, status);
1654	if (result != USEARCH_DONE) {
1655	// if status is a failure, ucol_setOffset does nothing
1656	setColEIterOffset(strsrch->textIter, result);
1657	}
1658	if (U_FAILURE(*status)) {
1659	return USEARCH_DONE;
1660	}
1661	return result;
1662	}
1663	}
1664	if (textce == ce[ceindex]) {
1665	ceindex --;
1666	}
1667	}
1668	// set offset here
1669	if (isSafe) {
1670	int32_t result = getColElemIterOffset(coleiter, FALSE);
1671	// sets the text iterator here with the correct expansion and offset
1672	int32_t leftoverces = getExpansionPrefix(coleiter);
1673	cleanUpSafeText(strsrch, safetext, safebuffer);
1674	if (result >= safelength) {
1675	result = textoffset;
1676	}
1677	else {
1678	result += safeoffset;
1679	}
1680	setColEIterOffset(strsrch->textIter, result);
1681	strsrch->textIter->iteratordata_.toReturn =
1682	setExpansionPrefix(strsrch->textIter, leftoverces);
1683	return result;
1684	}
1685
1686	return ucol_getOffset(coleiter);
1687	}
1688
1689	/**
1690	* Trying out the substring and sees if it can be a canonical match.
1691	* This will try normalizing the end accents and arranging them into canonical
1692	* equivalents and check their corresponding ces with the pattern ce.
1693	* Suffix accents in the text will be grouped according to their combining
1694	* class and the groups will be mixed and matched to try find the perfect
1695	* match with the pattern.
1696	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1699	* "\u0301\u0325".
1700	* step 2: check if any of the generated substrings matches the pattern.
1701	* Internal method, status assumed to be success, caller has to check status
1702	* before calling this method.
1703	* @param strsrch string search data
1704	* @param textoffset end offset in the collation element text that ends with
1705	* the accents to be rearranged
1706	* @param status error status if any
1707	* @return TRUE if the match is valid, FALSE otherwise
1708	*/
1709	static
1710	UBool doNextCanonicalMatch(UStringSearch *strsrch,
1711	int32_t textoffset,
1712	UErrorCode *status)
1713	{
1714	const UChar *text = strsrch->search->text;
1715	int32_t temp = textoffset;
1716	U16_BACK_1(text, `0`, temp);
1717	if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == `0`) {
1718	UCollationElements *coleiter = strsrch->textIter;
1719	int32_t offset = getColElemIterOffset(coleiter, FALSE);
1720	if (strsrch->pattern.hasPrefixAccents) {
1721	offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1722	status);
1723	if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1724	setColEIterOffset(coleiter, offset);
1725	return TRUE;
1726	}
1727	}
1728	return FALSE;
1729	}
1730
1731	if (!strsrch->pattern.hasSuffixAccents) {
1732	return FALSE;
1733	}
1734
1735	UChar accents[INITIAL_ARRAY_SIZE_];
1736	// offset to the last base character in substring to search
1737	int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1738	// normalizing the offensive string
1739	unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1740	`0`, accents, INITIAL_ARRAY_SIZE_, status);
1741	// status checked in loop below
1742
1743	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1744	int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1745
1746	// 2 power n - 1 plus the full set of accents
1747	int32_t count = (`2` << (size - `1`)) - `1`;
1748	while (U_SUCCESS(*status) && count > `0`) {
1749	UChar *rearrange = strsrch->canonicalSuffixAccents;
1750	// copy the base characters
1751	for (int k = `0`; k < accentsindex[`0`]; k ++) {
1752	*rearrange ++ = accents[k];
1753	}
1754	// forming all possible canonical rearrangement by dropping
1755	// sets of accents
1756	for (int i = `0`; i <= size - `1`; i ++) {
1757	int32_t mask = `1` << (size - i - `1`);
1758	if (count & mask) {
1759	for (int j = accentsindex[i]; j < accentsindex[i + `1`]; j ++) {
1760	*rearrange ++ = accents[j];
1761	}
1762	}
1763	}
1764	*rearrange = `0`;
1765	int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1766	status);
1767	if (offset != USEARCH_DONE) {
1768	return TRUE; // match found
1769	}
1770	count --;
1771	}
1772	return FALSE;
1773	}
1774
1775	/**
1776	* Gets the previous base character offset depending on the string search
1777	* pattern data
1778	* @param strsrch string search data
1779	* @param textoffset current offset, current character
1780	* @return the offset of the next character after this base character or itself
1781	* if it is a composed character with accents
1782	*/
1783	static
1784	inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1785	int32_t textoffset)
1786	{
1787	if (strsrch->pattern.hasPrefixAccents && textoffset > `0`) {
1788	const UChar *text = strsrch->search->text;
1789	int32_t offset = textoffset;
1790	if (getFCD(text, &offset, strsrch->search->textLength) >>
1791	SECOND_LAST_BYTE_SHIFT_) {
1792	return getPreviousBaseOffset(text, textoffset);
1793	}
1794	}
1795	return textoffset;
1796	}
1797
1798	/**
1799	* Checks match for contraction.
1800	* If the match ends with a partial contraction we fail.
1801	* If the match starts too far off (because of backwards iteration) we try to
1802	* chip off the extra characters
1803	* Internal method, status assumed to be success, caller has to check status
1804	* before calling this method.
1805	* @param strsrch string search data
1806	* @param start offset of potential match, to be modified if necessary
1807	* @param end offset of potential match, to be modified if necessary
1808	* @param status output error status if any
1809	* @return TRUE if match passes the contraction test, FALSE otherwise
1810	*/
1811	static
1812	UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1813	int32_t *start,
1814	int32_t *end,
1815	UErrorCode *status)
1816	{
1817	UCollationElements *coleiter = strsrch->textIter;
1818	int32_t textlength = strsrch->search->textLength;
1819	int32_t temp = *start;
1820	const UCollator *collator = strsrch->collator;
1821	const UChar *text = strsrch->search->text;
1822	// This part checks if either ends of the match contains potential
1823	// contraction. If so we'll have to iterate through them
1824	if ((end < textlength && ucol_unsafeCP(text[end], collator)) \|\|
1825	(*start + `1` < textlength
1826	&& ucol_unsafeCP(text[*start + `1`], collator))) {
1827	int32_t expansion = getExpansionPrefix(coleiter);
1828	UBool expandflag = expansion > `0`;
1829	setColEIterOffset(coleiter, *start);
1830	while (expansion > `0`) {
1831	// getting rid of the redundant ce, caused by setOffset.
1832	// since backward contraction/expansion may have extra ces if we
1833	// are in the normalization buffer, hasAccentsBeforeMatch would
1834	// have taken care of it.
1835	// E.g. the character \u01FA will have an expansion of 3, but if
1836	// we are only looking for acute and ring \u030A and \u0301, we'll
1837	// have to skip the first ce in the expansion buffer.
1838	ucol_next(coleiter, status);
1839	if (U_FAILURE(*status)) {
1840	return FALSE;
1841	}
1842	if (ucol_getOffset(coleiter) != temp) {
1843	*start = temp;
1844	temp = ucol_getOffset(coleiter);
1845	}
1846	expansion --;
1847	}
1848
1849	int32_t *patternce = strsrch->pattern.ces;
1850	int32_t patterncelength = strsrch->pattern.cesLength;
1851	int32_t count = `0`;
1852	int32_t textlength = strsrch->search->textLength;
1853	while (count < patterncelength) {
1854	int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1855	// status checked below, note that if status is a failure
1856	// ucol_next returns UCOL_NULLORDER
1857	if (ce == UCOL_IGNORABLE) {
1858	continue;
1859	}
1860	if (expandflag && count == `0` && ucol_getOffset(coleiter) != temp) {
1861	*start = temp;
1862	temp = ucol_getOffset(coleiter);
1863	}
1864
1865	if (count == `0` && ce != patternce[`0`]) {
1866	// accents may have extra starting ces, this occurs when a
1867	// pure accent pattern is matched without rearrangement
1868	// text \u0325\u0300 and looking for \u0300
1869	int32_t expected = patternce[`0`];
1870	if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1871	ce = getCE(strsrch, ucol_next(coleiter, status));
1872	while (U_SUCCESS(*status) && ce != expected &&
1873	ce != UCOL_NULLORDER &&
1874	ucol_getOffset(coleiter) <= *end) {
1875	ce = getCE(strsrch, ucol_next(coleiter, status));
1876	}
1877	}
1878	}
1879	if (U_FAILURE(*status) \|\| ce != patternce[count]) {
1880	(*end) ++;
1881	end = getNextUStringSearchBaseOffset(strsrch, end);
1882	return FALSE;
1883	}
1884	count ++;
1885	}
1886	}
1887	return TRUE;
1888	}
1889
1890	/**
1891	* Checks and sets the match information if found.
1892	* Checks
1893	* <ul>
1894	* <li> the potential match does not repeat the previous match
1895	* <li> boundaries are correct
1896	* <li> potential match does not end in the middle of a contraction
1897	* <li> identical matches
1898	* <\ul>
1899	* Otherwise the offset will be shifted to the next character.
1900	* Internal method, status assumed to be success, caller has to check the
1901	* status before calling this method.
1902	* @param strsrch string search data
1903	* @param textoffset offset in the collation element text. the returned value
1904	* will be the truncated end offset of the match or the new start
1905	* search offset.
1906	* @param status output error status if any
1907	* @return TRUE if the match is valid, FALSE otherwise
1908	*/
1909	static
1910	inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1911	int32_t *textoffset,
1912	UErrorCode *status)
1913	{
1914	// to ensure that the start and ends are not composite characters
1915	UCollationElements *coleiter = strsrch->textIter;
1916	// if we have a canonical accent match
1917	if ((strsrch->pattern.hasSuffixAccents &&
1918	strsrch->canonicalSuffixAccents[`0`]) \|\|
1919	(strsrch->pattern.hasPrefixAccents &&
1920	strsrch->canonicalPrefixAccents[`0`])) {
1921	strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1922	strsrch,
1923	ucol_getOffset(coleiter));
1924	strsrch->search->matchedLength = *textoffset -
1925	strsrch->search->matchedIndex;
1926	return TRUE;
1927	}
1928
1929	int32_t start = getColElemIterOffset(coleiter, FALSE);
1930	if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1931	status) \|\| U_FAILURE(*status)) {
1932	return FALSE;
1933	}
1934
1935	start = getPreviousUStringSearchBaseOffset(strsrch, start);
1936	// this totally matches, however we need to check if it is repeating
1937	if (checkRepeatedMatch(strsrch, start, *textoffset) \|\|
1938	!isBreakUnit(strsrch, start, *textoffset) \|\|
1939	!checkIdentical(strsrch, start, *textoffset)) {
1940	(*textoffset) ++;
1941	textoffset = getNextBaseOffset(strsrch->search->text, textoffset,
1942	strsrch->search->textLength);
1943	return FALSE;
1944	}
1945
1946	strsrch->search->matchedIndex = start;
1947	strsrch->search->matchedLength = *textoffset - start;
1948	return TRUE;
1949	}
1950
1951	/**
1952	* Shifting the collation element iterator position forward to prepare for
1953	* a preceding match. If the first character is a unsafe character, we'll only
1954	* shift by 1 to capture contractions, normalization etc.
1955	* Internal method, status assumed to be success, caller has to check status
1956	* before calling this method.
1957	* @param text strsrch string search data
1958	* @param textoffset start text position to do search
1959	* @param ce the text ce which failed the match.
1960	* @param patternceindex index of the ce within the pattern ce buffer which
1961	* failed the match
1962	* @return final offset
1963	*/
1964	static
1965	inline int32_t reverseShift(UStringSearch *strsrch,
1966	int32_t textoffset,
1967	int32_t ce,
1968	int32_t patternceindex)
1969	{
1970	if (strsrch->search->isOverlap) {
1971	if (textoffset != strsrch->search->textLength) {
1972	textoffset --;
1973	}
1974	else {
1975	textoffset -= strsrch->pattern.defaultShiftSize;
1976	}
1977	}
1978	else {
1979	if (ce != UCOL_NULLORDER) {
1980	int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1981
1982	// this is to adjust for characters in the middle of the substring
1983	// for matching that failed.
1984	int32_t adjust = patternceindex;
1985	if (adjust > `1` && shift > adjust) {
1986	shift -= adjust - `1`;
1987	}
1988	textoffset -= shift;
1989	}
1990	else {
1991	textoffset -= strsrch->pattern.defaultShiftSize;
1992	}
1993	}
1994	textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1995	return textoffset;
1996	}
1997
1998	/**
1999	* Checks match for contraction.
2000	* If the match starts with a partial contraction we fail.
2001	* Internal method, status assumed to be success, caller has to check status
2002	* before calling this method.
2003	* @param strsrch string search data
2004	* @param start offset of potential match, to be modified if necessary
2005	* @param end offset of potential match, to be modified if necessary
2006	* @param status output error status if any
2007	* @return TRUE if match passes the contraction test, FALSE otherwise
2008	*/
2009	static
2010	UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2011	int32_t *start,
2012	int32_t end, UErrorCode status)
2013	{
2014	UCollationElements *coleiter = strsrch->textIter;
2015	int32_t textlength = strsrch->search->textLength;
2016	int32_t temp = *end;
2017	const UCollator *collator = strsrch->collator;
2018	const UChar *text = strsrch->search->text;
2019	// This part checks if either if the start of the match contains potential
2020	// contraction. If so we'll have to iterate through them
2021	// Since we used ucol_next while previously looking for the potential
2022	// match, this guarantees that our end will not be a partial contraction,
2023	// or a partial supplementary character.
2024	if (start < textlength && ucol_unsafeCP(text[start], collator)) {
2025	int32_t expansion = getExpansionSuffix(coleiter);
2026	UBool expandflag = expansion > `0`;
2027	setColEIterOffset(coleiter, *end);
2028	while (U_SUCCESS(*status) && expansion > `0`) {
2029	// getting rid of the redundant ce
2030	// since forward contraction/expansion may have extra ces
2031	// if we are in the normalization buffer, hasAccentsBeforeMatch
2032	// would have taken care of it.
2033	// E.g. the character \u01FA will have an expansion of 3, but if
2034	// we are only looking for A ring A\u030A, we'll have to skip the
2035	// last ce in the expansion buffer
2036	ucol_previous(coleiter, status);
2037	if (U_FAILURE(*status)) {
2038	return FALSE;
2039	}
2040	if (ucol_getOffset(coleiter) != temp) {
2041	*end = temp;
2042	temp = ucol_getOffset(coleiter);
2043	}
2044	expansion --;
2045	}
2046
2047	int32_t *patternce = strsrch->pattern.ces;
2048	int32_t patterncelength = strsrch->pattern.cesLength;
2049	int32_t count = patterncelength;
2050	while (count > `0`) {
2051	int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2052	// status checked below, note that if status is a failure
2053	// ucol_previous returns UCOL_NULLORDER
2054	if (ce == UCOL_IGNORABLE) {
2055	continue;
2056	}
2057	if (expandflag && count == `0` &&
2058	getColElemIterOffset(coleiter, FALSE) != temp) {
2059	*end = temp;
2060	temp = ucol_getOffset(coleiter);
2061	}
2062	if (U_FAILURE(*status) \|\| ce != patternce[count - `1`]) {
2063	(*start) --;
2064	start = getPreviousBaseOffset(text, start);
2065	return FALSE;
2066	}
2067	count --;
2068	}
2069	}
2070	return TRUE;
2071	}
2072
2073	/**
2074	* Checks and sets the match information if found.
2075	* Checks
2076	* <ul>
2077	* <li> the current match does not repeat the last match
2078	* <li> boundaries are correct
2079	* <li> exact matches has no extra accents
2080	* <li> identical matches
2081	* <\ul>
2082	* Otherwise the offset will be shifted to the preceding character.
2083	* Internal method, status assumed to be success, caller has to check status
2084	* before calling this method.
2085	* @param strsrch string search data
2086	* @param collator
2087	* @param coleiter collation element iterator
2088	* @param text string
2089	* @param textoffset offset in the collation element text. the returned value
2090	* will be the truncated start offset of the match or the new start
2091	* search offset.
2092	* @param status output error status if any
2093	* @return TRUE if the match is valid, FALSE otherwise
2094	*/
2095	static
2096	inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2097	int32_t *textoffset,
2098	UErrorCode *status)
2099	{
2100	// to ensure that the start and ends are not composite characters
2101	int32_t end = ucol_getOffset(strsrch->textIter);
2102	if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2103	\|\| U_FAILURE(*status)) {
2104	return FALSE;
2105	}
2106
2107	// this totally matches, however we need to check if it is repeating
2108	// the old match
2109	if (checkRepeatedMatch(strsrch, *textoffset, end) \|\|
2110	!isBreakUnit(strsrch, *textoffset, end) \|\|
2111	hasAccentsBeforeMatch(strsrch, *textoffset, end) \|\|
2112	!checkIdentical(strsrch, *textoffset, end) \|\|
2113	hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2114	(*textoffset) --;
2115	*textoffset = getPreviousBaseOffset(strsrch->search->text,
2116	*textoffset);
2117	return FALSE;
2118	}
2119
2120	//Add breakiterator boundary check for primary strength search.
2121	if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2122	checkBreakBoundary(strsrch, textoffset, &end);
2123	}
2124
2125	strsrch->search->matchedIndex = *textoffset;
2126	strsrch->search->matchedLength = end - *textoffset;
2127	return TRUE;
2128	}
2129
2130	/**
2131	* Rearranges the end accents to try matching.
2132	* Suffix accents in the text will be grouped according to their combining
2133	* class and the groups will be mixed and matched to try find the perfect
2134	* match with the pattern.
2135	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2138	* "\u0301\u0325".
2139	* step 2: check if any of the generated substrings matches the pattern.
2140	* Internal method, status assumed to be success, user has to check status
2141	* before calling this method.
2142	* @param strsrch string search match
2143	* @param start offset of the first base character
2144	* @param end start of the last accent set
2145	* @param status only error status if any
2146	* @return USEARCH_DONE if a match is not found, otherwise return the ending
2147	* offset of the match. Note this start includes all following accents.
2148	*/
2149	static
2150	int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2151	int32_t start,
2152	int32_t end,
2153	UErrorCode *status)
2154	{
2155	const UChar *text = strsrch->search->text;
2156	int32_t tempend = end;
2157
2158	U16_BACK_1(text, `0`, tempend);
2159	if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2160	LAST_BYTE_MASK_)) {
2161	// die... failed at a base character
2162	return USEARCH_DONE;
2163	}
2164	end = getNextBaseOffset(text, end, strsrch->search->textLength);
2165
2166	if (U_SUCCESS(*status)) {
2167	UChar accents[INITIAL_ARRAY_SIZE_];
2168	int32_t offset = getPreviousBaseOffset(text, end);
2169	// normalizing the offensive string
2170	unorm_normalize(text + offset, end - offset, UNORM_NFD, `0`, accents,
2171	INITIAL_ARRAY_SIZE_, status);
2172
2173	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2174	int32_t accentsize = getUnblockedAccentIndex(accents,
2175	accentsindex);
2176	int32_t count = (`2` << (accentsize - `1`)) - `1`;
2177	UChar buffer[INITIAL_ARRAY_SIZE_];
2178	UCollationElements *coleiter = strsrch->utilIter;
2179	while (U_SUCCESS(*status) && count > `0`) {
2180	UChar *rearrange = strsrch->canonicalSuffixAccents;
2181	// copy the base characters
2182	for (int k = `0`; k < accentsindex[`0`]; k ++) {
2183	*rearrange ++ = accents[k];
2184	}
2185	// forming all possible canonical rearrangement by dropping
2186	// sets of accents
2187	for (int i = `0`; i <= accentsize - `1`; i ++) {
2188	int32_t mask = `1` << (accentsize - i - `1`);
2189	if (count & mask) {
2190	for (int j = accentsindex[i]; j < accentsindex[i + `1`]; j ++) {
2191	*rearrange ++ = accents[j];
2192	}
2193	}
2194	}
2195	*rearrange = `0`;
2196	int32_t matchsize = INITIAL_ARRAY_SIZE_;
2197	UChar *match = addToUCharArray(buffer, &matchsize,
2198	strsrch->canonicalPrefixAccents,
2199	strsrch->search->text + start,
2200	offset - start,
2201	strsrch->canonicalSuffixAccents,
2202	status);
2203
2204	// run the collator iterator through this match
2205	// if status is a failure ucol_setText does nothing
2206	ucol_setText(coleiter, match, matchsize, status);
2207	if (U_SUCCESS(*status)) {
2208	if (checkCollationMatch(strsrch, coleiter)) {
2209	if (match != buffer) {
2210	uprv_free(match);
2211	}
2212	return end;
2213	}
2214	}
2215	count --;
2216	}
2217	}
2218	return USEARCH_DONE;
2219	}
2220
2221	/**
2222	* Take the rearranged start accents and tries matching. If match failed at
2223	* a separate following set of accents (separated from the rearranged on by
2224	* at least a base character) then we rearrange the preceding accents and
2225	* tries matching again.
2226	* We allow skipping of the ends of the accent set if the ces do not match.
2227	* However if the failure is found before the accent set, it fails.
2228	* Internal method, status assumed to be success, caller has to check status
2229	* before calling this method.
2230	* @param strsrch string search data
2231	* @param textoffset of the ends of the rearranged accent
2232	* @param status output error status if any
2233	* @return USEARCH_DONE if a match is not found, otherwise return the ending
2234	* offset of the match. Note this start includes all following accents.
2235	*/
2236	static
2237	int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2238	int32_t textoffset,
2239	UErrorCode *status)
2240	{
2241	const UChar *text = strsrch->search->text;
2242	const UCollator *collator = strsrch->collator;
2243	int32_t safelength = `0`;
2244	UChar *safetext;
2245	int32_t safetextlength;
2246	UChar safebuffer[INITIAL_ARRAY_SIZE_];
2247	int32_t safeoffset = textoffset;
2248
2249	if (textoffset &&
2250	ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2251	u_strlen(strsrch->canonicalPrefixAccents) - `1`
2252	], collator)) {
2253	safeoffset = getNextSafeOffset(collator, text, textoffset,
2254	strsrch->search->textLength);
2255	safelength = safeoffset - textoffset;
2256	safetextlength = INITIAL_ARRAY_SIZE_;
2257	safetext = addToUCharArray(safebuffer, &safetextlength,
2258	strsrch->canonicalPrefixAccents,
2259	text + textoffset, safelength,
2260	NULL, status);
2261	}
2262	else {
2263	safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2264	safetext = strsrch->canonicalPrefixAccents;
2265	}
2266
2267	UCollationElements *coleiter = strsrch->utilIter;
2268	// if status is a failure, ucol_setText does nothing
2269	ucol_setText(coleiter, safetext, safetextlength, status);
2270	// status checked in loop below
2271
2272	int32_t *ce = strsrch->pattern.ces;
2273	int32_t celength = strsrch->pattern.cesLength;
2274	int ceindex = `0`;
2275	UBool isSafe = TRUE; // safe zone indication flag for position
2276	int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2277
2278	while (ceindex < celength) {
2279	int32_t textce = ucol_next(coleiter, status);
2280	if (U_FAILURE(*status)) {
2281	if (isSafe) {
2282	cleanUpSafeText(strsrch, safetext, safebuffer);
2283	}
2284	return USEARCH_DONE;
2285	}
2286	if (textce == UCOL_NULLORDER) {
2287	// check if we have passed the safe buffer
2288	if (coleiter == strsrch->textIter) {
2289	cleanUpSafeText(strsrch, safetext, safebuffer);
2290	return USEARCH_DONE;
2291	}
2292	cleanUpSafeText(strsrch, safetext, safebuffer);
2293	safetext = safebuffer;
2294	coleiter = strsrch->textIter;
2295	setColEIterOffset(coleiter, safeoffset);
2296	// status checked at the start of the loop
2297	isSafe = FALSE;
2298	continue;
2299	}
2300	textce = getCE(strsrch, textce);
2301	if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2302	// do the beginning stuff
2303	int32_t failedoffset = ucol_getOffset(coleiter);
2304	if (isSafe && failedoffset <= prefixlength) {
2305	// alas... no hope. failed at rearranged accent set
2306	cleanUpSafeText(strsrch, safetext, safebuffer);
2307	return USEARCH_DONE;
2308	}
2309	else {
2310	if (isSafe) {
2311	failedoffset = safeoffset - failedoffset;
2312	cleanUpSafeText(strsrch, safetext, safebuffer);
2313	}
2314
2315	// try rearranging the end accents
2316	int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2317	textoffset, failedoffset, status);
2318	if (result != USEARCH_DONE) {
2319	// if status is a failure, ucol_setOffset does nothing
2320	setColEIterOffset(strsrch->textIter, result);
2321	}
2322	if (U_FAILURE(*status)) {
2323	return USEARCH_DONE;
2324	}
2325	return result;
2326	}
2327	}
2328	if (textce == ce[ceindex]) {
2329	ceindex ++;
2330	}
2331	}
2332	// set offset here
2333	if (isSafe) {
2334	int32_t result = ucol_getOffset(coleiter);
2335	// sets the text iterator here with the correct expansion and offset
2336	int32_t leftoverces = getExpansionSuffix(coleiter);
2337	cleanUpSafeText(strsrch, safetext, safebuffer);
2338	if (result <= prefixlength) {
2339	result = textoffset;
2340	}
2341	else {
2342	result = textoffset + (safeoffset - result);
2343	}
2344	setColEIterOffset(strsrch->textIter, result);
2345	setExpansionSuffix(strsrch->textIter, leftoverces);
2346	return result;
2347	}
2348
2349	return ucol_getOffset(coleiter);
2350	}
2351
2352	/**
2353	* Trying out the substring and sees if it can be a canonical match.
2354	* This will try normalizing the starting accents and arranging them into
2355	* canonical equivalents and check their corresponding ces with the pattern ce.
2356	* Prefix accents in the text will be grouped according to their combining
2357	* class and the groups will be mixed and matched to try find the perfect
2358	* match with the pattern.
2359	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2362	* "\u0301\u0325".
2363	* step 2: check if any of the generated substrings matches the pattern.
2364	* Internal method, status assumed to be success, caller has to check status
2365	* before calling this method.
2366	* @param strsrch string search data
2367	* @param textoffset start offset in the collation element text that starts
2368	* with the accents to be rearranged
2369	* @param status output error status if any
2370	* @return TRUE if the match is valid, FALSE otherwise
2371	*/
2372	static
2373	UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2374	int32_t textoffset,
2375	UErrorCode *status)
2376	{
2377	const UChar *text = strsrch->search->text;
2378	int32_t temp = textoffset;
2379	int32_t textlength = strsrch->search->textLength;
2380	if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == `0`) {
2381	UCollationElements *coleiter = strsrch->textIter;
2382	int32_t offset = ucol_getOffset(coleiter);
2383	if (strsrch->pattern.hasSuffixAccents) {
2384	offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2385	offset, status);
2386	if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2387	setColEIterOffset(coleiter, offset);
2388	return TRUE;
2389	}
2390	}
2391	return FALSE;
2392	}
2393
2394	if (!strsrch->pattern.hasPrefixAccents) {
2395	return FALSE;
2396	}
2397
2398	UChar accents[INITIAL_ARRAY_SIZE_];
2399	// offset to the last base character in substring to search
2400	int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2401	// normalizing the offensive string
2402	unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2403	`0`, accents, INITIAL_ARRAY_SIZE_, status);
2404	// status checked in loop
2405
2406	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2407	int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2408
2409	// 2 power n - 1 plus the full set of accents
2410	int32_t count = (`2` << (size - `1`)) - `1`;
2411	while (U_SUCCESS(*status) && count > `0`) {
2412	UChar *rearrange = strsrch->canonicalPrefixAccents;
2413	// copy the base characters
2414	for (int k = `0`; k < accentsindex[`0`]; k ++) {
2415	*rearrange ++ = accents[k];
2416	}
2417	// forming all possible canonical rearrangement by dropping
2418	// sets of accents
2419	for (int i = `0`; i <= size - `1`; i ++) {
2420	int32_t mask = `1` << (size - i - `1`);
2421	if (count & mask) {
2422	for (int j = accentsindex[i]; j < accentsindex[i + `1`]; j ++) {
2423	*rearrange ++ = accents[j];
2424	}
2425	}
2426	}
2427	*rearrange = `0`;
2428	int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2429	baseoffset, status);
2430	if (offset != USEARCH_DONE) {
2431	return TRUE; // match found
2432	}
2433	count --;
2434	}
2435	return FALSE;
2436	}
2437
2438	/**
2439	* Checks match for contraction.
2440	* If the match starts with a partial contraction we fail.
2441	* Internal method, status assumed to be success, caller has to check status
2442	* before calling this method.
2443	* @param strsrch string search data
2444	* @param start offset of potential match, to be modified if necessary
2445	* @param end offset of potential match, to be modified if necessary
2446	* @param status only error status if any
2447	* @return TRUE if match passes the contraction test, FALSE otherwise
2448	*/
2449	static
2450	UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2451	int32_t *start,
2452	int32_t end, UErrorCode status)
2453	{
2454	UCollationElements *coleiter = strsrch->textIter;
2455	int32_t textlength = strsrch->search->textLength;
2456	int32_t temp = *end;
2457	const UCollator *collator = strsrch->collator;
2458	const UChar *text = strsrch->search->text;
2459	// This part checks if either if the start of the match contains potential
2460	// contraction. If so we'll have to iterate through them
2461	// Since we used ucol_next while previously looking for the potential
2462	// match, this guarantees that our end will not be a partial contraction,
2463	// or a partial supplementary character.
2464	if (start < textlength && ucol_unsafeCP(text[start], collator)) {
2465	int32_t expansion = getExpansionSuffix(coleiter);
2466	UBool expandflag = expansion > `0`;
2467	setColEIterOffset(coleiter, *end);
2468	while (expansion > `0`) {
2469	// getting rid of the redundant ce
2470	// since forward contraction/expansion may have extra ces
2471	// if we are in the normalization buffer, hasAccentsBeforeMatch
2472	// would have taken care of it.
2473	// E.g. the character \u01FA will have an expansion of 3, but if
2474	// we are only looking for A ring A\u030A, we'll have to skip the
2475	// last ce in the expansion buffer
2476	ucol_previous(coleiter, status);
2477	if (U_FAILURE(*status)) {
2478	return FALSE;
2479	}
2480	if (ucol_getOffset(coleiter) != temp) {
2481	*end = temp;
2482	temp = ucol_getOffset(coleiter);
2483	}
2484	expansion --;
2485	}
2486
2487	int32_t *patternce = strsrch->pattern.ces;
2488	int32_t patterncelength = strsrch->pattern.cesLength;
2489	int32_t count = patterncelength;
2490	while (count > `0`) {
2491	int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2492	// status checked below, note that if status is a failure
2493	// ucol_previous returns UCOL_NULLORDER
2494	if (ce == UCOL_IGNORABLE) {
2495	continue;
2496	}
2497	if (expandflag && count == `0` &&
2498	getColElemIterOffset(coleiter, FALSE) != temp) {
2499	*end = temp;
2500	temp = ucol_getOffset(coleiter);
2501	}
2502	if (count == patterncelength &&
2503	ce != patternce[patterncelength - `1`]) {
2504	// accents may have extra starting ces, this occurs when a
2505	// pure accent pattern is matched without rearrangement
2506	int32_t expected = patternce[patterncelength - `1`];
2507	U16_BACK_1(text, `0`, *end);
2508	if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2509	ce = getCE(strsrch, ucol_previous(coleiter, status));
2510	while (U_SUCCESS(*status) && ce != expected &&
2511	ce != UCOL_NULLORDER &&
2512	ucol_getOffset(coleiter) <= *start) {
2513	ce = getCE(strsrch, ucol_previous(coleiter, status));
2514	}
2515	}
2516	}
2517	if (U_FAILURE(*status) \|\| ce != patternce[count - `1`]) {
2518	(*start) --;
2519	start = getPreviousBaseOffset(text, start);
2520	return FALSE;
2521	}
2522	count --;
2523	}
2524	}
2525	return TRUE;
2526	}
2527
2528	/**
2529	* Checks and sets the match information if found.
2530	* Checks
2531	* <ul>
2532	* <li> the potential match does not repeat the previous match
2533	* <li> boundaries are correct
2534	* <li> potential match does not end in the middle of a contraction
2535	* <li> identical matches
2536	* <\ul>
2537	* Otherwise the offset will be shifted to the next character.
2538	* Internal method, status assumed to be success, caller has to check status
2539	* before calling this method.
2540	* @param strsrch string search data
2541	* @param textoffset offset in the collation element text. the returned value
2542	* will be the truncated start offset of the match or the new start
2543	* search offset.
2544	* @param status only error status if any
2545	* @return TRUE if the match is valid, FALSE otherwise
2546	*/
2547	static
2548	inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2549	int32_t *textoffset,
2550	UErrorCode *status)
2551	{
2552	// to ensure that the start and ends are not composite characters
2553	UCollationElements *coleiter = strsrch->textIter;
2554	// if we have a canonical accent match
2555	if ((strsrch->pattern.hasSuffixAccents &&
2556	strsrch->canonicalSuffixAccents[`0`]) \|\|
2557	(strsrch->pattern.hasPrefixAccents &&
2558	strsrch->canonicalPrefixAccents[`0`])) {
2559	strsrch->search->matchedIndex = *textoffset;
2560	strsrch->search->matchedLength =
2561	getNextUStringSearchBaseOffset(strsrch,
2562	getColElemIterOffset(coleiter, FALSE))
2563	- *textoffset;
2564	return TRUE;
2565	}
2566
2567	int32_t end = ucol_getOffset(coleiter);
2568	if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2569	status) \|\|
2570	U_FAILURE(*status)) {
2571	return FALSE;
2572	}
2573
2574	end = getNextUStringSearchBaseOffset(strsrch, end);
2575	// this totally matches, however we need to check if it is repeating
2576	if (checkRepeatedMatch(strsrch, *textoffset, end) \|\|
2577	!isBreakUnit(strsrch, *textoffset, end) \|\|
2578	!checkIdentical(strsrch, *textoffset, end)) {
2579	(*textoffset) --;
2580	*textoffset = getPreviousBaseOffset(strsrch->search->text,
2581	*textoffset);
2582	return FALSE;
2583	}
2584
2585	strsrch->search->matchedIndex = *textoffset;
2586	strsrch->search->matchedLength = end - *textoffset;
2587	return TRUE;
2588	}
2589	#endif // #if BOYER_MOORE
2590
2591	// constructors and destructor -------------------------------------------
2592
2593	U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2594	int32_t patternlength,
2595	const UChar *text,
2596	int32_t textlength,
2597	const char *locale,
2598	UBreakIterator *breakiter,
2599	UErrorCode *status)
2600	{
2601	if (U_FAILURE(*status)) {
2602	return NULL;
2603	}
2604	#if UCONFIG_NO_BREAK_ITERATION
2605	if (breakiter != NULL) {
2606	*status = U_UNSUPPORTED_ERROR;
2607	return NULL;
2608	}
2609	#endif
2610	if (locale) {
2611	// ucol_open internally checks for status
2612	UCollator *collator = ucol_open(locale, status);
2613	// pattern, text checks are done in usearch_openFromCollator
2614	UStringSearch *result = usearch_openFromCollator(pattern,
2615	patternlength, text, textlength,
2616	collator, breakiter, status);
2617
2618	if (result == NULL \|\| U_FAILURE(*status)) {
2619	if (collator) {
2620	ucol_close(collator);
2621	}
2622	return NULL;
2623	}
2624	else {
2625	result->ownCollator = TRUE;
2626	}
2627	return result;
2628	}
2629	*status = U_ILLEGAL_ARGUMENT_ERROR;
2630	return NULL;
2631	}
2632
2633	U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2634	const UChar *pattern,
2635	int32_t patternlength,
2636	const UChar *text,
2637	int32_t textlength,
2638	const UCollator *collator,
2639	UBreakIterator *breakiter,
2640	UErrorCode *status)
2641	{
2642	if (U_FAILURE(*status)) {
2643	return NULL;
2644	}
2645	#if UCONFIG_NO_BREAK_ITERATION
2646	if (breakiter != NULL) {
2647	*status = U_UNSUPPORTED_ERROR;
2648	return NULL;
2649	}
2650	#endif
2651	if (pattern == NULL \|\| text == NULL \|\| collator == NULL) {
2652	*status = U_ILLEGAL_ARGUMENT_ERROR;
2653	return NULL;
2654	}
2655
2656	// string search does not really work when numeric collation is turned on
2657	if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2658	*status = U_UNSUPPORTED_ERROR;
2659	return NULL;
2660	}
2661
2662	if (U_SUCCESS(*status)) {
2663	initializeFCD(status);
2664	if (U_FAILURE(*status)) {
2665	return NULL;
2666	}
2667
2668	UStringSearch *result;
2669	if (textlength == -`1`) {
2670	textlength = u_strlen(text);
2671	}
2672	if (patternlength == -`1`) {
2673	patternlength = u_strlen(pattern);
2674	}
2675	if (textlength <= `0` \|\| patternlength <= `0`) {
2676	*status = U_ILLEGAL_ARGUMENT_ERROR;
2677	return NULL;
2678	}
2679
2680	result = (UStringSearch )uprv_malloc(sizeof*(UStringSearch));
2681	if (result == NULL) {
2682	*status = U_MEMORY_ALLOCATION_ERROR;
2683	return NULL;
2684	}
2685
2686	result->collator = collator;
2687	result->strength = ucol_getStrength(collator);
2688	result->ceMask = getMask(result->strength);
2689	result->toShift =
2690	ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2691	UCOL_SHIFTED;
2692	result->variableTop = ucol_getVariableTop(collator, status);
2693
2694	result->nfd = Normalizer2::getNFDInstance(*status);
2695
2696	if (U_FAILURE(*status)) {
2697	uprv_free(result);
2698	return NULL;
2699	}
2700
2701	result->search = (USearch )uprv_malloc(sizeof*(USearch));
2702	if (result->search == NULL) {
2703	*status = U_MEMORY_ALLOCATION_ERROR;
2704	uprv_free(result);
2705	return NULL;
2706	}
2707
2708	result->search->text = text;
2709	result->search->textLength = textlength;
2710
2711	result->pattern.text = pattern;
2712	result->pattern.textLength = patternlength;
2713	result->pattern.ces = NULL;
2714	result->pattern.pces = NULL;
2715
2716	result->search->breakIter = breakiter;
2717	#if !UCONFIG_NO_BREAK_ITERATION
2718	result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2719	if (breakiter) {
2720	ubrk_setText(breakiter, text, textlength, status);
2721	}
2722	#endif
2723
2724	result->ownCollator = FALSE;
2725	result->search->matchedLength = `0`;
2726	result->search->matchedIndex = USEARCH_DONE;
2727	result->utilIter = NULL;
2728	result->textIter = ucol_openElements(collator, text,
2729	textlength, status);
2730	result->textProcessedIter = NULL;
2731	if (U_FAILURE(*status)) {
2732	usearch_close(result);
2733	return NULL;
2734	}
2735
2736	result->search->isOverlap = FALSE;
2737	result->search->isCanonicalMatch = FALSE;
2738	result->search->elementComparisonType = `0`;
2739	result->search->isForwardSearching = TRUE;
2740	result->search->reset = TRUE;
2741
2742	initialize(result, status);
2743
2744	if (U_FAILURE(*status)) {
2745	usearch_close(result);
2746	return NULL;
2747	}
2748
2749	return result;
2750	}
2751	return NULL;
2752	}
2753
2754	U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2755	{
2756	if (strsrch) {
2757	if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2758	strsrch->pattern.ces) {
2759	uprv_free(strsrch->pattern.ces);
2760	}
2761
2762	if (strsrch->pattern.pces != NULL &&
2763	strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2764	uprv_free(strsrch->pattern.pces);
2765	}
2766
2767	delete strsrch->textProcessedIter;
2768	ucol_closeElements(strsrch->textIter);
2769	ucol_closeElements(strsrch->utilIter);
2770
2771	if (strsrch->ownCollator && strsrch->collator) {
2772	ucol_close((UCollator *)strsrch->collator);
2773	}
2774
2775	#if !UCONFIG_NO_BREAK_ITERATION
2776	if (strsrch->search->internalBreakIter) {
2777	ubrk_close(strsrch->search->internalBreakIter);
2778	}
2779	#endif
2780
2781	uprv_free(strsrch->search);
2782	uprv_free(strsrch);
2783	}
2784	}
2785
2786	namespace {
2787
2788	UBool initTextProcessedIter(UStringSearch strsrch, UErrorCode status) {
2789	if (U_FAILURE(status)) { return* FALSE; }
2790	if (strsrch->textProcessedIter == NULL) {
2791	strsrch->textProcessedIter = new icu::UCollationPCE (strsrch->textIter);
2792	if (strsrch->textProcessedIter == NULL) {
2793	*status = U_MEMORY_ALLOCATION_ERROR;
2794	return FALSE;
2795	}
2796	} else {
2797	strsrch->textProcessedIter->init(strsrch->textIter);
2798	}
2799	return TRUE;
2800	}
2801
2802	}
2803
2804	// set and get methods --------------------------------------------------
2805
2806	U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2807	int32_t position,
2808	UErrorCode *status)
2809	{
2810	if (U_SUCCESS(*status) && strsrch) {
2811	if (isOutOfBounds(strsrch->search->textLength, position)) {
2812	*status = U_INDEX_OUTOFBOUNDS_ERROR;
2813	}
2814	else {
2815	setColEIterOffset(strsrch->textIter, position);
2816	}
2817	strsrch->search->matchedIndex = USEARCH_DONE;
2818	strsrch->search->matchedLength = `0`;
2819	strsrch->search->reset = FALSE;
2820	}
2821	}
2822
2823	U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2824	{
2825	if (strsrch) {
2826	int32_t result = ucol_getOffset(strsrch->textIter);
2827	if (isOutOfBounds(strsrch->search->textLength, result)) {
2828	return USEARCH_DONE;
2829	}
2830	return result;
2831	}
2832	return USEARCH_DONE;
2833	}
2834
2835	U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2836	USearchAttribute attribute,
2837	USearchAttributeValue value,
2838	UErrorCode *status)
2839	{
2840	if (U_SUCCESS(*status) && strsrch) {
2841	switch (attribute)
2842	{
2843	case USEARCH_OVERLAP :
2844	strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2845	break;
2846	case USEARCH_CANONICAL_MATCH :
2847	strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2848	FALSE);
2849	break;
2850	case USEARCH_ELEMENT_COMPARISON :
2851	if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD \|\| value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2852	strsrch->search->elementComparisonType = (int16_t)value;
2853	} else {
2854	strsrch->search->elementComparisonType = `0`;
2855	}
2856	break;
2857	case USEARCH_ATTRIBUTE_COUNT :
2858	default:
2859	*status = U_ILLEGAL_ARGUMENT_ERROR;
2860	}
2861	}
2862	if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2863	*status = U_ILLEGAL_ARGUMENT_ERROR;
2864	}
2865	}
2866
2867	U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2868	const UStringSearch *strsrch,
2869	USearchAttribute attribute)
2870	{
2871	if (strsrch) {
2872	switch (attribute) {
2873	case USEARCH_OVERLAP :
2874	return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2875	USEARCH_OFF);
2876	case USEARCH_CANONICAL_MATCH :
2877	return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2878	USEARCH_OFF);
2879	case USEARCH_ELEMENT_COMPARISON :
2880	{
2881	int16_t value = strsrch->search->elementComparisonType;
2882	if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD \|\| value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2883	return (USearchAttributeValue)value;
2884	} else {
2885	return USEARCH_STANDARD_ELEMENT_COMPARISON;
2886	}
2887	}
2888	case USEARCH_ATTRIBUTE_COUNT :
2889	return USEARCH_DEFAULT;
2890	}
2891	}
2892	return USEARCH_DEFAULT;
2893	}
2894
2895	U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2896	const UStringSearch *strsrch)
2897	{
2898	if (strsrch == NULL) {
2899	return USEARCH_DONE;
2900	}
2901	return strsrch->search->matchedIndex;
2902	}
2903
2904
2905	U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2906	UChar *result,
2907	int32_t resultCapacity,
2908	UErrorCode *status)
2909	{
2910	if (U_FAILURE(*status)) {
2911	return USEARCH_DONE;
2912	}
2913	if (strsrch == NULL \|\| resultCapacity < `0` \|\| (resultCapacity > `0` &&
2914	result == NULL)) {
2915	*status = U_ILLEGAL_ARGUMENT_ERROR;
2916	return USEARCH_DONE;
2917	}
2918
2919	int32_t copylength = strsrch->search->matchedLength;
2920	int32_t copyindex = strsrch->search->matchedIndex;
2921	if (copyindex == USEARCH_DONE) {
2922	u_terminateUChars(result, resultCapacity, `0`, status);
2923	return USEARCH_DONE;
2924	}
2925
2926	if (resultCapacity < copylength) {
2927	copylength = resultCapacity;
2928	}
2929	if (copylength > `0`) {
2930	uprv_memcpy(result, strsrch->search->text + copyindex,
2931	copylength * sizeof(UChar));
2932	}
2933	return u_terminateUChars(result, resultCapacity,
2934	strsrch->search->matchedLength, status);
2935	}
2936
2937	U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2938	const UStringSearch *strsrch)
2939	{
2940	if (strsrch) {
2941	return strsrch->search->matchedLength;
2942	}
2943	return USEARCH_DONE;
2944	}
2945
2946	#if !UCONFIG_NO_BREAK_ITERATION
2947
2948	U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2949	UBreakIterator *breakiter,
2950	UErrorCode *status)
2951	{
2952	if (U_SUCCESS(*status) && strsrch) {
2953	strsrch->search->breakIter = breakiter;
2954	if (breakiter) {
2955	ubrk_setText(breakiter, strsrch->search->text,
2956	strsrch->search->textLength, status);
2957	}
2958	}
2959	}
2960
2961	U_CAPI const UBreakIterator* U_EXPORT2
2962	usearch_getBreakIterator(const UStringSearch *strsrch)
2963	{
2964	if (strsrch) {
2965	return strsrch->search->breakIter;
2966	}
2967	return NULL;
2968	}
2969
2970	#endif
2971
2972	U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2973	const UChar *text,
2974	int32_t textlength,
2975	UErrorCode *status)
2976	{
2977	if (U_SUCCESS(*status)) {
2978	if (strsrch == NULL \|\| text == NULL \|\| textlength < -`1` \|\|
2979	textlength == `0`) {
2980	*status = U_ILLEGAL_ARGUMENT_ERROR;
2981	}
2982	else {
2983	if (textlength == -`1`) {
2984	textlength = u_strlen(text);
2985	}
2986	strsrch->search->text = text;
2987	strsrch->search->textLength = textlength;
2988	ucol_setText(strsrch->textIter, text, textlength, status);
2989	strsrch->search->matchedIndex = USEARCH_DONE;
2990	strsrch->search->matchedLength = `0`;
2991	strsrch->search->reset = TRUE;
2992	#if !UCONFIG_NO_BREAK_ITERATION
2993	if (strsrch->search->breakIter != NULL) {
2994	ubrk_setText(strsrch->search->breakIter, text,
2995	textlength, status);
2996	}
2997	ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2998	#endif
2999	}
3000	}
3001	}
3002
3003	U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3004	int32_t *length)
3005	{
3006	if (strsrch) {
3007	*length = strsrch->search->textLength;
3008	return strsrch->search->text;
3009	}
3010	return NULL;
3011	}
3012
3013	U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3014	const UCollator *collator,
3015	UErrorCode *status)
3016	{
3017	if (U_SUCCESS(*status)) {
3018	if (collator == NULL) {
3019	*status = U_ILLEGAL_ARGUMENT_ERROR;
3020	return;
3021	}
3022
3023	if (strsrch) {
3024	delete strsrch->textProcessedIter;
3025	strsrch->textProcessedIter = NULL;
3026	ucol_closeElements(strsrch->textIter);
3027	ucol_closeElements(strsrch->utilIter);
3028	strsrch->textIter = strsrch->utilIter = NULL;
3029	if (strsrch->ownCollator && (strsrch->collator != collator)) {
3030	ucol_close((UCollator *)strsrch->collator);
3031	strsrch->ownCollator = FALSE;
3032	}
3033	strsrch->collator = collator;
3034	strsrch->strength = ucol_getStrength(collator);
3035	strsrch->ceMask = getMask(strsrch->strength);
3036	#if !UCONFIG_NO_BREAK_ITERATION
3037	ubrk_close(strsrch->search->internalBreakIter);
3038	strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3039	strsrch->search->text, strsrch->search->textLength, status);
3040	#endif
3041	// if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3042	strsrch->toShift =
3043	ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3044	UCOL_SHIFTED;
3045	// if status is a failure, ucol_getVariableTop returns 0
3046	strsrch->variableTop = ucol_getVariableTop(collator, status);
3047	strsrch->textIter = ucol_openElements(collator,
3048	strsrch->search->text,
3049	strsrch->search->textLength,
3050	status);
3051	strsrch->utilIter = ucol_openElements(
3052	collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3053	// initialize() _after_ setting the iterators for the new collator.
3054	initialize(strsrch, status);
3055	}
3056
3057	// *** are these calls needed?*
3058	// *** we call uprv_init_pce in initializePatternPCETable*
3059	// *** and the CEIBuffer constructor...*
3060	#if 0
3061	uprv_init_pce(strsrch->textIter);
3062	uprv_init_pce(strsrch->utilIter);
3063	#endif
3064	}
3065	}
3066
3067	U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3068	{
3069	if (strsrch) {
3070	return (UCollator *)strsrch->collator;
3071	}
3072	return NULL;
3073	}
3074
3075	U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3076	const UChar *pattern,
3077	int32_t patternlength,
3078	UErrorCode *status)
3079	{
3080	if (U_SUCCESS(*status)) {
3081	if (strsrch == NULL \|\| pattern == NULL) {
3082	*status = U_ILLEGAL_ARGUMENT_ERROR;
3083	}
3084	else {
3085	if (patternlength == -`1`) {
3086	patternlength = u_strlen(pattern);
3087	}
3088	if (patternlength == `0`) {
3089	*status = U_ILLEGAL_ARGUMENT_ERROR;
3090	return;
3091	}
3092	strsrch->pattern.text = pattern;
3093	strsrch->pattern.textLength = patternlength;
3094	initialize(strsrch, status);
3095	}
3096	}
3097	}
3098
3099	U_CAPI const UChar* U_EXPORT2
3100	usearch_getPattern(const UStringSearch *strsrch,
3101	int32_t *length)
3102	{
3103	if (strsrch) {
3104	*length = strsrch->pattern.textLength;
3105	return strsrch->pattern.text;
3106	}
3107	return NULL;
3108	}
3109
3110	// miscellanous methods --------------------------------------------------
3111
3112	U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3113	UErrorCode *status)
3114	{
3115	if (strsrch && U_SUCCESS(*status)) {
3116	strsrch->search->isForwardSearching = TRUE;
3117	usearch_setOffset(strsrch, `0`, status);
3118	if (U_SUCCESS(*status)) {
3119	return usearch_next(strsrch, status);
3120	}
3121	}
3122	return USEARCH_DONE;
3123	}
3124
3125	U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3126	int32_t position,
3127	UErrorCode *status)
3128	{
3129	if (strsrch && U_SUCCESS(*status)) {
3130	strsrch->search->isForwardSearching = TRUE;
3131	// position checked in usearch_setOffset
3132	usearch_setOffset(strsrch, position, status);
3133	if (U_SUCCESS(*status)) {
3134	return usearch_next(strsrch, status);
3135	}
3136	}
3137	return USEARCH_DONE;
3138	}
3139
3140	U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3141	UErrorCode *status)
3142	{
3143	if (strsrch && U_SUCCESS(*status)) {
3144	strsrch->search->isForwardSearching = FALSE;
3145	usearch_setOffset(strsrch, strsrch->search->textLength, status);
3146	if (U_SUCCESS(*status)) {
3147	return usearch_previous(strsrch, status);
3148	}
3149	}
3150	return USEARCH_DONE;
3151	}
3152
3153	U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3154	int32_t position,
3155	UErrorCode *status)
3156	{
3157	if (strsrch && U_SUCCESS(*status)) {
3158	strsrch->search->isForwardSearching = FALSE;
3159	// position checked in usearch_setOffset
3160	usearch_setOffset(strsrch, position, status);
3161	if (U_SUCCESS(*status)) {
3162	return usearch_previous(strsrch, status);
3163	}
3164	}
3165	return USEARCH_DONE;
3166	}
3167
3168	/**
3169	* If a direction switch is required, we'll count the number of ces till the
3170	* beginning of the collation element iterator and iterate forwards that
3171	* number of times. This is so that we get to the correct point within the
3172	* string to continue the search in. Imagine when we are in the middle of the
3173	* normalization buffer when the change in direction is request. arrrgghh....
3174	* After searching the offset within the collation element iterator will be
3175	* shifted to the start of the match. If a match is not found, the offset would
3176	* have been set to the end of the text string in the collation element
3177	* iterator.
3178	* Okay, here's my take on normalization buffer. The only time when there can
3179	* be 2 matches within the same normalization is when the pattern is consists
3180	* of all accents. But since the offset returned is from the text string, we
3181	* should not confuse the caller by returning the second match within the
3182	* same normalization buffer. If we do, the 2 results will have the same match
3183	* offsets, and that'll be confusing. I'll return the next match that doesn't
3184	* fall within the same normalization buffer. Note this does not affect the
3185	* results of matches spanning the text and the normalization buffer.
3186	* The position to start searching is taken from the collation element
3187	* iterator. Callers of this API would have to set the offset in the collation
3188	* element iterator before using this method.
3189	*/
3190	U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3191	UErrorCode *status)
3192	{
3193	if (U_SUCCESS(*status) && strsrch) {
3194	// note offset is either equivalent to the start of the previous match
3195	// or is set by the user
3196	int32_t offset = usearch_getOffset(strsrch);
3197	USearch *search = strsrch->search;
3198	search->reset = FALSE;
3199	int32_t textlength = search->textLength;
3200	if (search->isForwardSearching) {
3201	#if BOYER_MOORE
3202	if (offset == textlength
3203	\|\| (!search->isOverlap &&
3204	(offset + strsrch->pattern.defaultShiftSize > textlength \|\|
3205	(search->matchedIndex != USEARCH_DONE &&
3206	offset + search->matchedLength >= textlength)))) {
3207	// not enough characters to match
3208	setMatchNotFound(strsrch);
3209	return USEARCH_DONE;
3210	}
3211	#else
3212	if (offset == textlength \|\|
3213	(! search->isOverlap &&
3214	(search->matchedIndex != USEARCH_DONE &&
3215	offset + search->matchedLength > textlength))) {
3216	// not enough characters to match
3217	setMatchNotFound(strsrch);
3218	return USEARCH_DONE;
3219	}
3220	#endif
3221	}
3222	else {
3223	// switching direction.
3224	// if matchedIndex == USEARCH_DONE, it means that either a
3225	// setOffset has been called or that previous ran off the text
3226	// string. the iterator would have been set to offset 0 if a
3227	// match is not found.
3228	search->isForwardSearching = TRUE;
3229	if (search->matchedIndex != USEARCH_DONE) {
3230	// there's no need to set the collation element iterator
3231	// the next call to next will set the offset.
3232	return search->matchedIndex;
3233	}
3234	}
3235
3236	if (U_SUCCESS(*status)) {
3237	if (strsrch->pattern.cesLength == `0`) {
3238	if (search->matchedIndex == USEARCH_DONE) {
3239	search->matchedIndex = offset;
3240	}
3241	else { // moves by codepoints
3242	U16_FWD_1(search->text, search->matchedIndex, textlength);
3243	}
3244
3245	search->matchedLength = `0`;
3246	setColEIterOffset(strsrch->textIter, search->matchedIndex);
3247	// status checked below
3248	if (search->matchedIndex == textlength) {
3249	search->matchedIndex = USEARCH_DONE;
3250	}
3251	}
3252	else {
3253	if (search->matchedLength > `0`) {
3254	// if matchlength is 0 we are at the start of the iteration
3255	if (search->isOverlap) {
3256	ucol_setOffset(strsrch->textIter, offset + `1`, status);
3257	}
3258	else {
3259	ucol_setOffset(strsrch->textIter,
3260	offset + search->matchedLength, status);
3261	}
3262	}
3263	else {
3264	// for boundary check purposes. this will ensure that the
3265	// next match will not preceed the current offset
3266	// note search->matchedIndex will always be set to something
3267	// in the code
3268	search->matchedIndex = offset - `1`;
3269	}
3270
3271	if (search->isCanonicalMatch) {
3272	// can't use exact here since extra accents are allowed.
3273	usearch_handleNextCanonical(strsrch, status);
3274	}
3275	else {
3276	usearch_handleNextExact(strsrch, status);
3277	}
3278	}
3279
3280	if (U_FAILURE(*status)) {
3281	return USEARCH_DONE;
3282	}
3283
3284	#if !BOYER_MOORE
3285	if (search->matchedIndex == USEARCH_DONE) {
3286	ucol_setOffset(strsrch->textIter, search->textLength, status);
3287	} else {
3288	ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3289	}
3290	#endif
3291
3292	return search->matchedIndex;
3293	}
3294	}
3295	return USEARCH_DONE;
3296	}
3297
3298	U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3299	UErrorCode *status)
3300	{
3301	if (U_SUCCESS(*status) && strsrch) {
3302	int32_t offset;
3303	USearch *search = strsrch->search;
3304	if (search->reset) {
3305	offset = search->textLength;
3306	search->isForwardSearching = FALSE;
3307	search->reset = FALSE;
3308	setColEIterOffset(strsrch->textIter, offset);
3309	}
3310	else {
3311	offset = usearch_getOffset(strsrch);
3312	}
3313
3314	int32_t matchedindex = search->matchedIndex;
3315	if (search->isForwardSearching == TRUE) {
3316	// switching direction.
3317	// if matchedIndex == USEARCH_DONE, it means that either a
3318	// setOffset has been called or that next ran off the text
3319	// string. the iterator would have been set to offset textLength if
3320	// a match is not found.
3321	search->isForwardSearching = FALSE;
3322	if (matchedindex != USEARCH_DONE) {
3323	return matchedindex;
3324	}
3325	}
3326	else {
3327	#if BOYER_MOORE
3328	if (offset == `0` \|\| matchedindex == `0` \|\|
3329	(!search->isOverlap &&
3330	(offset < strsrch->pattern.defaultShiftSize \|\|
3331	(matchedindex != USEARCH_DONE &&
3332	matchedindex < strsrch->pattern.defaultShiftSize)))) {
3333	// not enough characters to match
3334	setMatchNotFound(strsrch);
3335	return USEARCH_DONE;
3336	}
3337	#else
3338	// Could check pattern length, but the
3339	// linear search will do the right thing
3340	if (offset == `0` \|\| matchedindex == `0`) {
3341	setMatchNotFound(strsrch);
3342	return USEARCH_DONE;
3343	}
3344	#endif
3345	}
3346
3347	if (U_SUCCESS(*status)) {
3348	if (strsrch->pattern.cesLength == `0`) {
3349	search->matchedIndex =
3350	(matchedindex == USEARCH_DONE ? offset : matchedindex);
3351	if (search->matchedIndex == `0`) {
3352	setMatchNotFound(strsrch);
3353	// status checked below
3354	}
3355	else { // move by codepoints
3356	U16_BACK_1(search->text, `0`, search->matchedIndex);
3357	setColEIterOffset(strsrch->textIter, search->matchedIndex);
3358	// status checked below
3359	search->matchedLength = `0`;
3360	}
3361	}
3362	else {
3363	if (strsrch->search->isCanonicalMatch) {
3364	// can't use exact here since extra accents are allowed.
3365	usearch_handlePreviousCanonical(strsrch, status);
3366	// status checked below
3367	}
3368	else {
3369	usearch_handlePreviousExact(strsrch, status);
3370	// status checked below
3371	}
3372	}
3373
3374	if (U_FAILURE(*status)) {
3375	return USEARCH_DONE;
3376	}
3377
3378	return search->matchedIndex;
3379	}
3380	}
3381	return USEARCH_DONE;
3382	}
3383
3384
3385
3386	U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387	{
3388	/*
3389	reset is setting the attributes that are already in
3390	string search, hence all attributes in the collator should
3391	be retrieved without any problems
3392	*/
3393	if (strsrch) {
3394	UErrorCode status = U_ZERO_ERROR;
3395	UBool sameCollAttribute = TRUE;
3396	uint32_t ceMask;
3397	UBool shift;
3398	uint32_t varTop;
3399
3400	// ** hack to deal w/ how processed CEs encode quaternary **
3401	UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402	if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) \|\|
3403	(strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404	sameCollAttribute = FALSE;
3405	}
3406
3407	strsrch->strength = ucol_getStrength(strsrch->collator);
3408	ceMask = getMask(strsrch->strength);
3409	if (strsrch->ceMask != ceMask) {
3410	strsrch->ceMask = ceMask;
3411	sameCollAttribute = FALSE;
3412	}
3413
3414	// if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415	shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416	&status) == UCOL_SHIFTED;
3417	if (strsrch->toShift != shift) {
3418	strsrch->toShift = shift;
3419	sameCollAttribute = FALSE;
3420	}
3421
3422	// if status is a failure, ucol_getVariableTop returns 0
3423	varTop = ucol_getVariableTop(strsrch->collator, &status);
3424	if (strsrch->variableTop != varTop) {
3425	strsrch->variableTop = varTop;
3426	sameCollAttribute = FALSE;
3427	}
3428	if (!sameCollAttribute) {
3429	initialize(strsrch, &status);
3430	}
3431	ucol_setText(strsrch->textIter, strsrch->search->text,
3432	strsrch->search->textLength,
3433	&status);
3434	strsrch->search->matchedLength = `0`;
3435	strsrch->search->matchedIndex = USEARCH_DONE;
3436	strsrch->search->isOverlap = FALSE;
3437	strsrch->search->isCanonicalMatch = FALSE;
3438	strsrch->search->elementComparisonType = `0`;
3439	strsrch->search->isForwardSearching = TRUE;
3440	strsrch->search->reset = TRUE;
3441	}
3442	}
3443
3444	//
3445	// CEI Collation Element + source text index.
3446	// These structs are kept in the circular buffer.
3447	//
3448	struct CEI {
3449	int64_t ce;
3450	int32_t lowIndex;
3451	int32_t highIndex;
3452	};
3453
3454	U_NAMESPACE_BEGIN
3455
3456	namespace {
3457	//
3458	// CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3459	//
3460	#define DEFAULT_CEBUFFER_SIZE 96
3461	#define CEBUFFER_EXTRA 32
3462	// Some typical max values to make buffer size more reasonable for asymmetric search.
3463	// #8694 is for a better long-term solution to allocation of this buffer.
3464	#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465	#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466	#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) \|\| (c >= 0x3131 && c <= 0x314E) \|\| (c >= 0x3165 && c <= 0x3186))
3467	struct CEIBuffer {
3468	CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3469	CEI *buf;
3470	int32_t bufSize;
3471	int32_t firstIx;
3472	int32_t limitIx;
3473	UCollationElements *ceIter;
3474	UStringSearch *strSearch;
3475
3476
3477
3478	CEIBuffer(UStringSearch ss, UErrorCode status);
3479	~CEIBuffer();
3480	const CEI *get(int32_t index);
3481	const CEI *getPrevious(int32_t index);
3482	};
3483
3484
3485	CEIBuffer::CEIBuffer(UStringSearch ss, UErrorCode status) {
3486	buf = defBuf;
3487	strSearch = ss;
3488	bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3489	if (ss->search->elementComparisonType != `0`) {
3490	const UChar * patText = ss->pattern.text;
3491	if (patText) {
3492	const UChar * patTextLimit = patText + ss->pattern.textLength;
3493	while ( patText < patTextLimit ) {
3494	UChar c = *patText++;
3495	if (MIGHT_BE_JAMO_L(c)) {
3496	bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3497	} else {
3498	// No check for surrogates, we might allocate slightly more buffer than necessary.
3499	bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3500	}
3501	}
3502	}
3503	}
3504	ceIter = ss->textIter;
3505	firstIx = `0`;
3506	limitIx = `0`;
3507
3508	if (!initTextProcessedIter(ss, status)) { return; }
3509
3510	if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3511	buf = (CEI )uprv_malloc(bufSize sizeof(CEI));
3512	if (buf == NULL) {
3513	*status = U_MEMORY_ALLOCATION_ERROR;
3514	}
3515	}
3516	}
3517
3518	// TODO: add a reset or init function so that allocated
3519	// buffers can be retained & reused.
3520
3521	CEIBuffer::~CEIBuffer() {
3522	if (buf != defBuf) {
3523	uprv_free(buf);
3524	}
3525	}
3526
3527
3528	// Get the CE with the specified index.
3529	// Index must be in the range
3530	// n-history_size < index < n+1
3531	// where n is the largest index to have been fetched by some previous call to this function.
3532	// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3533	//
3534	const CEI *CEIBuffer::get(int32_t index) {
3535	int i = index % bufSize;
3536
3537	if (index>=firstIx && index<limitIx) {
3538	// The request was for an entry already in our buffer.
3539	// Just return it.
3540	return &buf[i];
3541	}
3542
3543	// Caller is requesting a new, never accessed before, CE.
3544	// Verify that it is the next one in sequence, which is all
3545	// that is allowed.
3546	if (index != limitIx) {
3547	U_ASSERT(FALSE);
3548	// TODO: In ICU 64 the above assert was changed to use UPRV_UNREACHABLE instead
3549	// which unconditionally calls abort(). However, there were cases where this was
3550	// being hit. This change is reverted for now, restoring the existing behavior.
3551	// ICU-20792 tracks the follow-up work/further investigation on this.
3552	return NULL;
3553	}
3554
3555	// Manage the circular CE buffer indexing
3556	limitIx++;
3557
3558	if (limitIx - firstIx >= bufSize) {
3559	// The buffer is full, knock out the lowest-indexed entry.
3560	firstIx++;
3561	}
3562
3563	UErrorCode status = U_ZERO_ERROR;
3564
3565	buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3566
3567	return &buf[i];
3568	}
3569
3570	// Get the CE with the specified index.
3571	// Index must be in the range
3572	// n-history_size < index < n+1
3573	// where n is the largest index to have been fetched by some previous call to this function.
3574	// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3575	//
3576	const CEI *CEIBuffer::getPrevious(int32_t index) {
3577	int i = index % bufSize;
3578
3579	if (index>=firstIx && index<limitIx) {
3580	// The request was for an entry already in our buffer.
3581	// Just return it.
3582	return &buf[i];
3583	}
3584
3585	// Caller is requesting a new, never accessed before, CE.
3586	// Verify that it is the next one in sequence, which is all
3587	// that is allowed.
3588	if (index != limitIx) {
3589	U_ASSERT(FALSE);
3590	// TODO: In ICU 64 the above assert was changed to use UPRV_UNREACHABLE instead
3591	// which unconditionally calls abort(). However, there were cases where this was
3592	// being hit. This change is reverted for now, restoring the existing behavior.
3593	// ICU-20792 tracks the follow-up work/further investigation on this.
3594	return NULL;
3595	}
3596
3597	// Manage the circular CE buffer indexing
3598	limitIx++;
3599
3600	if (limitIx - firstIx >= bufSize) {
3601	// The buffer is full, knock out the lowest-indexed entry.
3602	firstIx++;
3603	}
3604
3605	UErrorCode status = U_ZERO_ERROR;
3606
3607	buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3608
3609	return &buf[i];
3610	}
3611
3612	}
3613
3614	U_NAMESPACE_END
3615
3616
3617	// #define USEARCH_DEBUG
3618
3619	#ifdef USEARCH_DEBUG
3620	#include <stdio.h>
3621	#include <stdlib.h>
3622	#endif
3623
3624	/*
3625	* Find the next break boundary after startIndex. If the UStringSearch object
3626	* has an external break iterator, use that. Otherwise use the internal character
3627	* break iterator.
3628	*/
3629	static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3630	#if 0
3631	const UChar *text = strsrch->search->text;
3632	int32_t textLen = strsrch->search->textLength;
3633
3634	U_ASSERT(startIndex>=`0`);
3635	U_ASSERT(startIndex<=textLen);
3636
3637	if (startIndex >= textLen) {
3638	return startIndex;
3639	}
3640
3641	UChar32 c;
3642	int32_t i = startIndex;
3643	U16_NEXT(text, i, textLen, c);
3644
3645	// If we are on a control character, stop without looking for combining marks.
3646	// Control characters do not combine.
3647	int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3648	if (gcProperty==U_GCB_CONTROL \|\| gcProperty==U_GCB_LF \|\| gcProperty==U_GCB_CR) {
3649	return i;
3650	}
3651
3652	// The initial character was not a control, and can thus accept trailing
3653	// combining characters. Advance over however many of them there are.
3654	int32_t indexOfLastCharChecked;
3655	for (;;) {
3656	indexOfLastCharChecked = i;
3657	if (i>=textLen) {
3658	break;
3659	}
3660	U16_NEXT(text, i, textLen, c);
3661	gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3662	if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3663	break;
3664	}
3665	}
3666	return indexOfLastCharChecked;
3667	#elif !UCONFIG_NO_BREAK_ITERATION
3668	UBreakIterator *breakiterator = strsrch->search->breakIter;
3669
3670	if (breakiterator == NULL) {
3671	breakiterator = strsrch->search->internalBreakIter;
3672	}
3673
3674	if (breakiterator != NULL) {
3675	return ubrk_following(breakiterator, startIndex);
3676	}
3677
3678	return startIndex;
3679	#else
3680	// ** or should we use the original code? **
3681	return startIndex;
3682	#endif
3683
3684	}
3685
3686	/*
3687	* Returns TRUE if index is on a break boundary. If the UStringSearch
3688	* has an external break iterator, test using that, otherwise test
3689	* using the internal character break iterator.
3690	*/
3691	static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3692	#if 0
3693	const UChar *text = strsrch->search->text;
3694	int32_t textLen = strsrch->search->textLength;
3695
3696	U_ASSERT(index>=`0`);
3697	U_ASSERT(index<=textLen);
3698
3699	if (index>=textLen \|\| index<=`0`) {
3700	return TRUE;
3701	}
3702
3703	// If the character at the current index is not a GRAPHEME_EXTEND
3704	// then we can not be within a combining sequence.
3705	UChar32 c;
3706	U16_GET(text, `0`, index, textLen, c);
3707	int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3708	if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3709	return TRUE;
3710	}
3711
3712	// We are at a combining mark. If the preceding character is anything
3713	// except a CONTROL, CR or LF, we are in a combining sequence.
3714	U16_PREV(text, `0`, index, c);
3715	gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3716	UBool combining = !(gcProperty==U_GCB_CONTROL \|\| gcProperty==U_GCB_LF \|\| gcProperty==U_GCB_CR);
3717	return !combining;
3718	#elif !UCONFIG_NO_BREAK_ITERATION
3719	UBreakIterator *breakiterator = strsrch->search->breakIter;
3720
3721	if (breakiterator == NULL) {
3722	breakiterator = strsrch->search->internalBreakIter;
3723	}
3724
3725	return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3726	#else
3727	// ** or use the original code? **
3728	return TRUE;
3729	#endif
3730	}
3731
3732	#if 0
3733	static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3734	{
3735	#if !UCONFIG_NO_BREAK_ITERATION
3736	UBreakIterator *breakiterator = strsrch->search->breakIter;
3737
3738	if (breakiterator != NULL) {
3739	int32_t startindex = ubrk_first(breakiterator);
3740	int32_t endindex = ubrk_last(breakiterator);
3741
3742	// out-of-range indexes are never boundary positions
3743	if (start < startindex \|\| start > endindex \|\|
3744	end < startindex \|\| end > endindex) {
3745	return FALSE;
3746	}
3747
3748	return ubrk_isBoundary(breakiterator, start) &&
3749	ubrk_isBoundary(breakiterator, end);
3750	}
3751	#endif
3752
3753	return TRUE;
3754	}
3755	#endif
3756
3757	typedef enum {
3758	U_CE_MATCH = -`1`,
3759	U_CE_NO_MATCH = `0`,
3760	U_CE_SKIP_TARG,
3761	U_CE_SKIP_PATN
3762	} UCompareCEsResult;
3763	#define U_CE_LEVEL2_BASE 0x00000005
3764	#define U_CE_LEVEL3_BASE 0x00050000
3765
3766	static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3767	if (targCE == patCE) {
3768	return U_CE_MATCH;
3769	}
3770	if (compareType == `0`) {
3771	return U_CE_NO_MATCH;
3772	}
3773
3774	int64_t targCEshifted = targCE >> `32`;
3775	int64_t patCEshifted = patCE >> `32`;
3776	int64_t mask;
3777
3778	mask = `0xFFFF0000`;
3779	int32_t targLev1 = (int32_t)(targCEshifted & mask);
3780	int32_t patLev1 = (int32_t)(patCEshifted & mask);
3781	if ( targLev1 != patLev1 ) {
3782	if ( targLev1 == `0` ) {
3783	return U_CE_SKIP_TARG;
3784	}
3785	if ( patLev1 == `0` && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3786	return U_CE_SKIP_PATN;
3787	}
3788	return U_CE_NO_MATCH;
3789	}
3790
3791	mask = `0x0000FFFF`;
3792	int32_t targLev2 = (int32_t)(targCEshifted & mask);
3793	int32_t patLev2 = (int32_t)(patCEshifted & mask);
3794	if ( targLev2 != patLev2 ) {
3795	if ( targLev2 == `0` ) {
3796	return U_CE_SKIP_TARG;
3797	}
3798	if ( patLev2 == `0` && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3799	return U_CE_SKIP_PATN;
3800	}
3801	return (patLev2 == U_CE_LEVEL2_BASE \|\| (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3802	U_CE_MATCH: U_CE_NO_MATCH;
3803	}
3804
3805	mask = `0xFFFF0000`;
3806	int32_t targLev3 = (int32_t)(targCE & mask);
3807	int32_t patLev3 = (int32_t)(patCE & mask);
3808	if ( targLev3 != patLev3 ) {
3809	return (patLev3 == U_CE_LEVEL3_BASE \|\| (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3810	U_CE_MATCH: U_CE_NO_MATCH;
3811	}
3812
3813	return U_CE_MATCH;
3814	}
3815
3816	#if BOYER_MOORE
3817	// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3818	#endif
3819
3820	namespace {
3821
3822	UChar32 codePointAt(const USearch &search, int32_t index) {
3823	if (index < search.textLength) {
3824	UChar32 c;
3825	U16_NEXT(search.text, index, search.textLength, c);
3826	return c;
3827	}
3828	return U_SENTINEL;
3829	}
3830
3831	UChar32 codePointBefore(const USearch &search, int32_t index) {
3832	if (`0` < index) {
3833	UChar32 c;
3834	U16_PREV(search.text, `0`, index, c);
3835	return c;
3836	}
3837	return U_SENTINEL;
3838	}
3839
3840	} // namespace
3841
3842	U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3843	int32_t startIdx,
3844	int32_t *matchStart,
3845	int32_t *matchLimit,
3846	UErrorCode *status)
3847	{
3848	if (U_FAILURE(*status)) {
3849	return FALSE;
3850	}
3851
3852	// TODO: reject search patterns beginning with a combining char.
3853
3854	#ifdef USEARCH_DEBUG
3855	if (getenv("USEARCH_DEBUG") != NULL) {
3856	printf("Pattern CEs\n");
3857	for (int ii=`0`; ii<strsrch->pattern.cesLength; ii++) {
3858	printf(" %8x", strsrch->pattern.ces[ii]);
3859	}
3860	printf("\n");
3861	}
3862
3863	#endif
3864	// Input parameter sanity check.
3865	// TODO: should input indices clip to the text length
3866	// in the same way that UText does.
3867	if(strsrch->pattern.cesLength == `0` \|\|
3868	startIdx < `0` \|\|
3869	startIdx > strsrch->search->textLength \|\|
3870	strsrch->pattern.ces == NULL) {
3871	*status = U_ILLEGAL_ARGUMENT_ERROR;
3872	return FALSE;
3873	}
3874
3875	if (strsrch->pattern.pces == NULL) {
3876	initializePatternPCETable(strsrch, status);
3877	}
3878
3879	ucol_setOffset(strsrch->textIter, startIdx, status);
3880	CEIBuffer ceb(strsrch, status);
3881
3882
3883	int32_t targetIx = `0`;
3884	const CEI *targetCEI = NULL;
3885	int32_t patIx;
3886	UBool found;
3887
3888	int32_t mStart = -`1`;
3889	int32_t mLimit = -`1`;
3890	int32_t minLimit;
3891	int32_t maxLimit;
3892
3893
3894
3895	// Outer loop moves over match starting positions in the
3896	// target CE space.
3897	// Here we see the target as a sequence of collation elements, resulting from the following:
3898	// 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3899	// (for example, digraphs such as IJ may be broken into two characters).
3900	// 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3901	// 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3902	// fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3903	// CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3904	// then the CE is deleted, so the following code sees only CEs that are relevant.
3905	// For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3906	// If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3907	// characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3908	//
3909	for(targetIx=`0`; ; targetIx++)
3910	{
3911	found = TRUE;
3912	// Inner loop checks for a match beginning at each
3913	// position from the outer loop.
3914	int32_t targetIxOffset = `0`;
3915	int64_t patCE = `0`;
3916	// For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3917	// (compared to the last CE fetched for the previous targetIx value) as we need to go
3918	// for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3919	const CEI *firstCEI = ceb.get(targetIx);
3920	if (firstCEI == NULL) {
3921	*status = U_INTERNAL_PROGRAM_ERROR;
3922	found = FALSE;
3923	break;
3924	}
3925
3926	for (patIx=`0`; patIx<strsrch->pattern.pcesLength; patIx++) {
3927	patCE = strsrch->pattern.pces[patIx];
3928	targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3929	// Compare CE from target string with CE from the pattern.
3930	// Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3931	// which will fail the compare, below.
3932	UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3933	if ( ceMatch == U_CE_NO_MATCH ) {
3934	found = FALSE;
3935	break;
3936	} else if ( ceMatch > U_CE_NO_MATCH ) {
3937	if ( ceMatch == U_CE_SKIP_TARG ) {
3938	// redo with same patCE, next targCE
3939	patIx--;
3940	targetIxOffset++;
3941	} else { // ceMatch == U_CE_SKIP_PATN
3942	// redo with same targCE, next patCE
3943	targetIxOffset--;
3944	}
3945	}
3946	}
3947	targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3948
3949	if (!found && ((targetCEI == NULL) \|\| (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3950	// No match at this targetIx. Try again at the next.
3951	continue;
3952	}
3953
3954	if (!found) {
3955	// No match at all, we have run off the end of the target text.
3956	break;
3957	}
3958
3959
3960	// We have found a match in CE space.
3961	// Now determine the bounds in string index space.
3962	// There still is a chance of match failure if the CE range not correspond to
3963	// an acceptable character range.
3964	//
3965	const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - `1`);
3966
3967	mStart = firstCEI->lowIndex;
3968	minLimit = lastCEI->lowIndex;
3969
3970	// Look at the CE following the match. If it is UCOL_NULLORDER the match
3971	// extended to the end of input, and the match is good.
3972
3973	// Look at the high and low indices of the CE following the match. If
3974	// they are the same it means one of two things:
3975	// 1. The match extended to the last CE from the target text, which is OK, or
3976	// 2. The last CE that was part of the match is in an expansion that extends
3977	// to the first CE after the match. In this case, we reject the match.
3978	const CEI *nextCEI = `0`;
3979	if (strsrch->search->elementComparisonType == `0`) {
3980	nextCEI = ceb.get(targetIx + targetIxOffset);
3981	maxLimit = nextCEI->lowIndex;
3982	if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3983	found = FALSE;
3984	}
3985	} else {
3986	for ( ; ; ++targetIxOffset ) {
3987	nextCEI = ceb.get(targetIx + targetIxOffset);
3988	maxLimit = nextCEI->lowIndex;
3989	// If we are at the end of the target too, match succeeds
3990	if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3991	break;
3992	}
3993	// As long as the next CE has primary weight of 0,
3994	// it is part of the last target element matched by the pattern;
3995	// make sure it can be part of a match with the last patCE
3996	if ( (((nextCEI->ce) >> `32`) & `0xFFFF0000UL`) == `0` ) {
3997	UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3998	if ( ceMatch == U_CE_NO_MATCH \|\| ceMatch == U_CE_SKIP_PATN ) {
3999	found = FALSE;
4000	break;
4001	}
4002	// If lowIndex == highIndex, this target CE is part of an expansion of the last matched
4003	// target element, but it has non-zero primary weight => match fails
4004	} else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4005	found = false;
4006	break;
4007	// Else the target CE is not part of an expansion of the last matched element, match succeeds
4008	} else {
4009	break;
4010	}
4011	}
4012	}
4013
4014
4015	// Check for the start of the match being within a combining sequence.
4016	// This can happen if the pattern itself begins with a combining char, and
4017	// the match found combining marks in the target text that were attached
4018	// to something else.
4019	// This type of match should be rejected for not completely consuming a
4020	// combining sequence.
4021	if (!isBreakBoundary(strsrch, mStart)) {
4022	found = FALSE;
4023	}
4024
4025	// Check for the start of the match being within an Collation Element Expansion,
4026	// meaning that the first char of the match is only partially matched.
4027	// With expansions, the first CE will report the index of the source
4028	// character, and all subsequent (expansions) CEs will report the source index of the
4029	// _following_ character.
4030	int32_t secondIx = firstCEI->highIndex;
4031	if (mStart == secondIx) {
4032	found = FALSE;
4033	}
4034
4035	// Allow matches to end in the middle of a grapheme cluster if the following
4036	// conditions are met; this is needed to make prefix search work properly in
4037	// Indic, see #11750
4038	// the default breakIter is being used*
4039	// the next collation element after this combining sequence*
4040	// - has non-zero primary weight
4041	// - corresponds to a separate character following the one at end of the current match
4042	// (the second of these conditions, and perhaps both, may be redundant given the
4043	// subsequent check for normalization boundary; however they are likely much faster
4044	// tests in any case)
4045	// the match limit is a normalization boundary*
4046	UBool allowMidclusterMatch = FALSE;
4047	if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4048	allowMidclusterMatch =
4049	strsrch->search->breakIter == NULL &&
4050	nextCEI != NULL && (((nextCEI->ce) >> `32`) & `0xFFFF0000UL`) != `0` &&
4051	maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4052	(strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) \|\|
4053	strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4054	}
4055	// If those conditions are met, then:
4056	// do NOT advance the candidate match limit (mLimit) to a break boundary; however*
4057	// the match limit may be backed off to a previous break boundary. This handles
4058	// cases in which mLimit includes target characters that are ignorable with current
4059	// settings (such as space) and which extend beyond the pattern match.
4060	// do NOT require that end of the combining sequence not extend beyond the match in CE space*
4061	// do NOT require that match limit be on a breakIter boundary*
4062
4063	// Advance the match end position to the first acceptable match boundary.
4064	// This advances the index over any combining charcters.
4065	mLimit = maxLimit;
4066	if (minLimit < maxLimit) {
4067	// When the last CE's low index is same with its high index, the CE is likely
4068	// a part of expansion. In this case, the index is located just after the
4069	// character corresponding to the CEs compared above. If the index is right
4070	// at the break boundary, move the position to the next boundary will result
4071	// incorrect match length when there are ignorable characters exist between
4072	// the position and the next character produces CE(s). See ticket#8482.
4073	if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4074	mLimit = minLimit;
4075	} else {
4076	int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4077	// Note that we can have nba < maxLimit && nba >= minLImit, in which
4078	// case we want to set mLimit to nba regardless of allowMidclusterMatch
4079	// (i.e. we back off mLimit to the previous breakIterator boundary).
4080	if (nba >= lastCEI->highIndex && (!allowMidclusterMatch \|\| nba < maxLimit)) {
4081	mLimit = nba;
4082	}
4083	}
4084	}
4085
4086	#ifdef USEARCH_DEBUG
4087	if (getenv("USEARCH_DEBUG") != NULL) {
4088	printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4089	}
4090	#endif
4091
4092	if (!allowMidclusterMatch) {
4093	// If advancing to the end of a combining sequence in character indexing space
4094	// advanced us beyond the end of the match in CE space, reject this match.
4095	if (mLimit > maxLimit) {
4096	found = FALSE;
4097	}
4098
4099	if (!isBreakBoundary(strsrch, mLimit)) {
4100	found = FALSE;
4101	}
4102	}
4103
4104	if (! checkIdentical(strsrch, mStart, mLimit)) {
4105	found = FALSE;
4106	}
4107
4108	if (found) {
4109	break;
4110	}
4111	}
4112
4113	#ifdef USEARCH_DEBUG
4114	if (getenv("USEARCH_DEBUG") != NULL) {
4115	printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4116	int32_t lastToPrint = ceb.limitIx+`2`;
4117	for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4118	printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4119	}
4120	printf("\n%s\n", found? "match found" : "no match");
4121	}
4122	#endif
4123
4124	// All Done. Store back the match bounds to the caller.
4125	//
4126	if (found==FALSE) {
4127	mLimit = -`1`;
4128	mStart = -`1`;
4129	}
4130
4131	if (matchStart != NULL) {
4132	*matchStart= mStart;
4133	}
4134
4135	if (matchLimit != NULL) {
4136	*matchLimit = mLimit;
4137	}
4138
4139	return found;
4140	}
4141
4142	U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4143	int32_t startIdx,
4144	int32_t *matchStart,
4145	int32_t *matchLimit,
4146	UErrorCode *status)
4147	{
4148	if (U_FAILURE(*status)) {
4149	return FALSE;
4150	}
4151
4152	// TODO: reject search patterns beginning with a combining char.
4153
4154	#ifdef USEARCH_DEBUG
4155	if (getenv("USEARCH_DEBUG") != NULL) {
4156	printf("Pattern CEs\n");
4157	for (int ii=`0`; ii<strsrch->pattern.cesLength; ii++) {
4158	printf(" %8x", strsrch->pattern.ces[ii]);
4159	}
4160	printf("\n");
4161	}
4162
4163	#endif
4164	// Input parameter sanity check.
4165	// TODO: should input indicies clip to the text length
4166	// in the same way that UText does.
4167	if(strsrch->pattern.cesLength == `0` \|\|
4168	startIdx < `0` \|\|
4169	startIdx > strsrch->search->textLength \|\|
4170	strsrch->pattern.ces == NULL) {
4171	*status = U_ILLEGAL_ARGUMENT_ERROR;
4172	return FALSE;
4173	}
4174
4175	if (strsrch->pattern.pces == NULL) {
4176	initializePatternPCETable(strsrch, status);
4177	}
4178
4179	CEIBuffer ceb(strsrch, status);
4180	int32_t targetIx = `0`;
4181
4182	/*
4183	* Pre-load the buffer with the CE's for the grapheme
4184	* after our starting position so that we're sure that
4185	* we can look at the CE following the match when we
4186	* check the match boundaries.
4187	*
4188	* This will also pre-fetch the first CE that we'll
4189	* consider for the match.
4190	*/
4191	if (startIdx < strsrch->search->textLength) {
4192	UBreakIterator *bi = strsrch->search->internalBreakIter;
4193	int32_t next = ubrk_following(bi, startIdx);
4194
4195	ucol_setOffset(strsrch->textIter, next, status);
4196
4197	for (targetIx = `0`; ; targetIx += `1`) {
4198	if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4199	break;
4200	}
4201	}
4202	} else {
4203	ucol_setOffset(strsrch->textIter, startIdx, status);
4204	}
4205
4206
4207	const CEI *targetCEI = NULL;
4208	int32_t patIx;
4209	UBool found;
4210
4211	int32_t limitIx = targetIx;
4212	int32_t mStart = -`1`;
4213	int32_t mLimit = -`1`;
4214	int32_t minLimit;
4215	int32_t maxLimit;
4216
4217
4218
4219	// Outer loop moves over match starting positions in the
4220	// target CE space.
4221	// Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4222	// But patIx is 0 at the beginning of the pattern and increases toward the end.
4223	// So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4224	// and the beginning of the base text.
4225	for(targetIx = limitIx; ; targetIx += `1`)
4226	{
4227	found = TRUE;
4228	// For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4229	// (compared to the last CE fetched for the previous targetIx value) as we need to go
4230	// for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4231	const CEI *lastCEI = ceb.getPrevious(targetIx);
4232	if (lastCEI == NULL) {
4233	*status = U_INTERNAL_PROGRAM_ERROR;
4234	found = FALSE;
4235	break;
4236	}
4237	// Inner loop checks for a match beginning at each
4238	// position from the outer loop.
4239	int32_t targetIxOffset = `0`;
4240	for (patIx = strsrch->pattern.pcesLength - `1`; patIx >= `0`; patIx -= `1`) {
4241	int64_t patCE = strsrch->pattern.pces[patIx];
4242
4243	targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - `1` - patIx + targetIxOffset);
4244	// Compare CE from target string with CE from the pattern.
4245	// Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4246	// which will fail the compare, below.
4247	UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4248	if ( ceMatch == U_CE_NO_MATCH ) {
4249	found = FALSE;
4250	break;
4251	} else if ( ceMatch > U_CE_NO_MATCH ) {
4252	if ( ceMatch == U_CE_SKIP_TARG ) {
4253	// redo with same patCE, next targCE
4254	patIx++;
4255	targetIxOffset++;
4256	} else { // ceMatch == U_CE_SKIP_PATN
4257	// redo with same targCE, next patCE
4258	targetIxOffset--;
4259	}
4260	}
4261	}
4262
4263	if (!found && ((targetCEI == NULL) \|\| (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4264	// No match at this targetIx. Try again at the next.
4265	continue;
4266	}
4267
4268	if (!found) {
4269	// No match at all, we have run off the end of the target text.
4270	break;
4271	}
4272
4273
4274	// We have found a match in CE space.
4275	// Now determine the bounds in string index space.
4276	// There still is a chance of match failure if the CE range not correspond to
4277	// an acceptable character range.
4278	//
4279	const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - `1` + targetIxOffset);
4280	mStart = firstCEI->lowIndex;
4281
4282	// Check for the start of the match being within a combining sequence.
4283	// This can happen if the pattern itself begins with a combining char, and
4284	// the match found combining marks in the target text that were attached
4285	// to something else.
4286	// This type of match should be rejected for not completely consuming a
4287	// combining sequence.
4288	if (!isBreakBoundary(strsrch, mStart)) {
4289	found = FALSE;
4290	}
4291
4292	// Look at the high index of the first CE in the match. If it's the same as the
4293	// low index, the first CE in the match is in the middle of an expansion.
4294	if (mStart == firstCEI->highIndex) {
4295	found = FALSE;
4296	}
4297
4298
4299	minLimit = lastCEI->lowIndex;
4300
4301	if (targetIx > `0`) {
4302	// Look at the CE following the match. If it is UCOL_NULLORDER the match
4303	// extended to the end of input, and the match is good.
4304
4305	// Look at the high and low indices of the CE following the match. If
4306	// they are the same it means one of two things:
4307	// 1. The match extended to the last CE from the target text, which is OK, or
4308	// 2. The last CE that was part of the match is in an expansion that extends
4309	// to the first CE after the match. In this case, we reject the match.
4310	const CEI *nextCEI = ceb.getPrevious(targetIx - `1`);
4311
4312	if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4313	found = FALSE;
4314	}
4315
4316	mLimit = maxLimit = nextCEI->lowIndex;
4317
4318	// Allow matches to end in the middle of a grapheme cluster if the following
4319	// conditions are met; this is needed to make prefix search work properly in
4320	// Indic, see #11750
4321	// the default breakIter is being used*
4322	// the next collation element after this combining sequence*
4323	// - has non-zero primary weight
4324	// - corresponds to a separate character following the one at end of the current match
4325	// (the second of these conditions, and perhaps both, may be redundant given the
4326	// subsequent check for normalization boundary; however they are likely much faster
4327	// tests in any case)
4328	// the match limit is a normalization boundary*
4329	UBool allowMidclusterMatch = FALSE;
4330	if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4331	allowMidclusterMatch =
4332	strsrch->search->breakIter == NULL &&
4333	nextCEI != NULL && (((nextCEI->ce) >> `32`) & `0xFFFF0000UL`) != `0` &&
4334	maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4335	(strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) \|\|
4336	strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4337	}
4338	// If those conditions are met, then:
4339	// do NOT advance the candidate match limit (mLimit) to a break boundary; however*
4340	// the match limit may be backed off to a previous break boundary. This handles
4341	// cases in which mLimit includes target characters that are ignorable with current
4342	// settings (such as space) and which extend beyond the pattern match.
4343	// do NOT require that end of the combining sequence not extend beyond the match in CE space*
4344	// do NOT require that match limit be on a breakIter boundary*
4345
4346	// Advance the match end position to the first acceptable match boundary.
4347	// This advances the index over any combining characters.
4348	if (minLimit < maxLimit) {
4349	int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4350	// Note that we can have nba < maxLimit && nba >= minLImit, in which
4351	// case we want to set mLimit to nba regardless of allowMidclusterMatch
4352	// (i.e. we back off mLimit to the previous breakIterator boundary).
4353	if (nba >= lastCEI->highIndex && (!allowMidclusterMatch \|\| nba < maxLimit)) {
4354	mLimit = nba;
4355	}
4356	}
4357
4358	if (!allowMidclusterMatch) {
4359	// If advancing to the end of a combining sequence in character indexing space
4360	// advanced us beyond the end of the match in CE space, reject this match.
4361	if (mLimit > maxLimit) {
4362	found = FALSE;
4363	}
4364
4365	// Make sure the end of the match is on a break boundary
4366	if (!isBreakBoundary(strsrch, mLimit)) {
4367	found = FALSE;
4368	}
4369	}
4370
4371	} else {
4372	// No non-ignorable CEs after this point.
4373	// The maximum position is detected by boundary after
4374	// the last non-ignorable CE. Combining sequence
4375	// across the start index will be truncated.
4376	int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4377	mLimit = maxLimit = (nba > `0`) && (startIdx > nba) ? nba : startIdx;
4378	}
4379
4380	#ifdef USEARCH_DEBUG
4381	if (getenv("USEARCH_DEBUG") != NULL) {
4382	printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4383	}
4384	#endif
4385
4386
4387	if (! checkIdentical(strsrch, mStart, mLimit)) {
4388	found = FALSE;
4389	}
4390
4391	if (found) {
4392	break;
4393	}
4394	}
4395
4396	#ifdef USEARCH_DEBUG
4397	if (getenv("USEARCH_DEBUG") != NULL) {
4398	printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4399	int32_t lastToPrint = ceb.limitIx+`2`;
4400	for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4401	printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4402	}
4403	printf("\n%s\n", found? "match found" : "no match");
4404	}
4405	#endif
4406
4407	// All Done. Store back the match bounds to the caller.
4408	//
4409	if (found==FALSE) {
4410	mLimit = -`1`;
4411	mStart = -`1`;
4412	}
4413
4414	if (matchStart != NULL) {
4415	*matchStart= mStart;
4416	}
4417
4418	if (matchLimit != NULL) {
4419	*matchLimit = mLimit;
4420	}
4421
4422	return found;
4423	}
4424
4425	// internal use methods declared in usrchimp.h -----------------------------
4426
4427	UBool usearch_handleNextExact(UStringSearch strsrch, UErrorCode status)
4428	{
4429	if (U_FAILURE(*status)) {
4430	setMatchNotFound(strsrch);
4431	return FALSE;
4432	}
4433
4434	#if BOYER_MOORE
4435	UCollationElements *coleiter = strsrch->textIter;
4436	int32_t textlength = strsrch->search->textLength;
4437	int32_t *patternce = strsrch->pattern.ces;
4438	int32_t patterncelength = strsrch->pattern.cesLength;
4439	int32_t textoffset = ucol_getOffset(coleiter);
4440
4441	// status used in setting coleiter offset, since offset is checked in
4442	// shiftForward before setting the coleiter offset, status never
4443	// a failure
4444	textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4445	patterncelength);
4446	while (textoffset <= textlength)
4447	{
4448	uint32_t patternceindex = patterncelength - `1`;
4449	int32_t targetce;
4450	UBool found = FALSE;
4451	int32_t lastce = UCOL_NULLORDER;
4452
4453	setColEIterOffset(coleiter, textoffset);
4454
4455	for (;;) {
4456	// finding the last pattern ce match, imagine composite characters
4457	// for example: search for pattern A in text \u00C0
4458	// we'll have to skip \u0300 the grave first before we get to A
4459	targetce = ucol_previous(coleiter, status);
4460	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4461	found = FALSE;
4462	break;
4463	}
4464	targetce = getCE(strsrch, targetce);
4465	if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4466	// this is for the text \u0315\u0300 that requires
4467	// normalization and pattern \u0300, where \u0315 is ignorable
4468	continue;
4469	}
4470	if (lastce == UCOL_NULLORDER \|\| lastce == UCOL_IGNORABLE) {
4471	lastce = targetce;
4472	}
4473	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4474	if (targetce == patternce[patternceindex]) {
4475	// the first ce can be a contraction
4476	found = TRUE;
4477	break;
4478	}
4479	if (!hasExpansion(coleiter)) {
4480	found = FALSE;
4481	break;
4482	}
4483	}
4484
4485	//targetce = lastce;
4486
4487	while (found && patternceindex > `0`) {
4488	lastce = targetce;
4489	targetce = ucol_previous(coleiter, status);
4490	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4491	found = FALSE;
4492	break;
4493	}
4494	targetce = getCE(strsrch, targetce);
4495	if (targetce == UCOL_IGNORABLE) {
4496	continue;
4497	}
4498
4499	patternceindex --;
4500	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4501	found = found && targetce == patternce[patternceindex];
4502	}
4503
4504	targetce = lastce;
4505
4506	if (!found) {
4507	if (U_FAILURE(*status)) {
4508	break;
4509	}
4510	textoffset = shiftForward(strsrch, textoffset, lastce,
4511	patternceindex);
4512	// status checked at loop.
4513	patternceindex = patterncelength;
4514	continue;
4515	}
4516
4517	if (checkNextExactMatch(strsrch, &textoffset, status)) {
4518	// status checked in ucol_setOffset
4519	setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4520	return TRUE;
4521	}
4522	}
4523	setMatchNotFound(strsrch);
4524	return FALSE;
4525	#else
4526	int32_t textOffset = ucol_getOffset(strsrch->textIter);
4527	int32_t start = -`1`;
4528	int32_t end = -`1`;
4529
4530	if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4531	strsrch->search->matchedIndex = start;
4532	strsrch->search->matchedLength = end - start;
4533	return TRUE;
4534	} else {
4535	setMatchNotFound(strsrch);
4536	return FALSE;
4537	}
4538	#endif
4539	}
4540
4541	UBool usearch_handleNextCanonical(UStringSearch strsrch, UErrorCode status)
4542	{
4543	if (U_FAILURE(*status)) {
4544	setMatchNotFound(strsrch);
4545	return FALSE;
4546	}
4547
4548	#if BOYER_MOORE
4549	UCollationElements *coleiter = strsrch->textIter;
4550	int32_t textlength = strsrch->search->textLength;
4551	int32_t *patternce = strsrch->pattern.ces;
4552	int32_t patterncelength = strsrch->pattern.cesLength;
4553	int32_t textoffset = ucol_getOffset(coleiter);
4554	UBool hasPatternAccents =
4555	strsrch->pattern.hasSuffixAccents \|\| strsrch->pattern.hasPrefixAccents;
4556
4557	textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4558	patterncelength);
4559	strsrch->canonicalPrefixAccents[`0`] = `0`;
4560	strsrch->canonicalSuffixAccents[`0`] = `0`;
4561
4562	while (textoffset <= textlength)
4563	{
4564	int32_t patternceindex = patterncelength - `1`;
4565	int32_t targetce;
4566	UBool found = FALSE;
4567	int32_t lastce = UCOL_NULLORDER;
4568
4569	setColEIterOffset(coleiter, textoffset);
4570
4571	for (;;) {
4572	// finding the last pattern ce match, imagine composite characters
4573	// for example: search for pattern A in text \u00C0
4574	// we'll have to skip \u0300 the grave first before we get to A
4575	targetce = ucol_previous(coleiter, status);
4576	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4577	found = FALSE;
4578	break;
4579	}
4580	targetce = getCE(strsrch, targetce);
4581	if (lastce == UCOL_NULLORDER \|\| lastce == UCOL_IGNORABLE) {
4582	lastce = targetce;
4583	}
4584	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4585	if (targetce == patternce[patternceindex]) {
4586	// the first ce can be a contraction
4587	found = TRUE;
4588	break;
4589	}
4590	if (!hasExpansion(coleiter)) {
4591	found = FALSE;
4592	break;
4593	}
4594	}
4595
4596	while (found && patternceindex > `0`) {
4597	targetce = ucol_previous(coleiter, status);
4598	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4599	found = FALSE;
4600	break;
4601	}
4602	targetce = getCE(strsrch, targetce);
4603	if (targetce == UCOL_IGNORABLE) {
4604	continue;
4605	}
4606
4607	patternceindex --;
4608	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4609	found = found && targetce == patternce[patternceindex];
4610	}
4611
4612	// initializing the rearranged accent array
4613	if (hasPatternAccents && !found) {
4614	strsrch->canonicalPrefixAccents[`0`] = `0`;
4615	strsrch->canonicalSuffixAccents[`0`] = `0`;
4616	if (U_FAILURE(*status)) {
4617	break;
4618	}
4619	found = doNextCanonicalMatch(strsrch, textoffset, status);
4620	}
4621
4622	if (!found) {
4623	if (U_FAILURE(*status)) {
4624	break;
4625	}
4626	textoffset = shiftForward(strsrch, textoffset, lastce,
4627	patternceindex);
4628	// status checked at loop
4629	patternceindex = patterncelength;
4630	continue;
4631	}
4632
4633	if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4634	setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4635	return TRUE;
4636	}
4637	}
4638	setMatchNotFound(strsrch);
4639	return FALSE;
4640	#else
4641	int32_t textOffset = ucol_getOffset(strsrch->textIter);
4642	int32_t start = -`1`;
4643	int32_t end = -`1`;
4644
4645	if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4646	strsrch->search->matchedIndex = start;
4647	strsrch->search->matchedLength = end - start;
4648	return TRUE;
4649	} else {
4650	setMatchNotFound(strsrch);
4651	return FALSE;
4652	}
4653	#endif
4654	}
4655
4656	UBool usearch_handlePreviousExact(UStringSearch strsrch, UErrorCode status)
4657	{
4658	if (U_FAILURE(*status)) {
4659	setMatchNotFound(strsrch);
4660	return FALSE;
4661	}
4662
4663	#if BOYER_MOORE
4664	UCollationElements *coleiter = strsrch->textIter;
4665	int32_t *patternce = strsrch->pattern.ces;
4666	int32_t patterncelength = strsrch->pattern.cesLength;
4667	int32_t textoffset = ucol_getOffset(coleiter);
4668
4669	// shifting it check for setting offset
4670	// if setOffset is called previously or there was no previous match, we
4671	// leave the offset as it is.
4672	if (strsrch->search->matchedIndex != USEARCH_DONE) {
4673	textoffset = strsrch->search->matchedIndex;
4674	}
4675
4676	textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4677	patterncelength);
4678
4679	while (textoffset >= `0`)
4680	{
4681	int32_t patternceindex = `1`;
4682	int32_t targetce;
4683	UBool found = FALSE;
4684	int32_t firstce = UCOL_NULLORDER;
4685
4686	// if status is a failure, ucol_setOffset does nothing
4687	setColEIterOffset(coleiter, textoffset);
4688
4689	for (;;) {
4690	// finding the first pattern ce match, imagine composite
4691	// characters. for example: search for pattern \u0300 in text
4692	// \u00C0, we'll have to skip A first before we get to
4693	// \u0300 the grave accent
4694	targetce = ucol_next(coleiter, status);
4695	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4696	found = FALSE;
4697	break;
4698	}
4699	targetce = getCE(strsrch, targetce);
4700	if (firstce == UCOL_NULLORDER \|\| firstce == UCOL_IGNORABLE) {
4701	firstce = targetce;
4702	}
4703	if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4704	continue;
4705	}
4706	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4707	if (targetce == patternce[`0`]) {
4708	found = TRUE;
4709	break;
4710	}
4711	if (!hasExpansion(coleiter)) {
4712	// checking for accents in composite character
4713	found = FALSE;
4714	break;
4715	}
4716	}
4717
4718	//targetce = firstce;
4719
4720	while (found && (patternceindex < patterncelength)) {
4721	firstce = targetce;
4722	targetce = ucol_next(coleiter, status);
4723	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4724	found = FALSE;
4725	break;
4726	}
4727	targetce = getCE(strsrch, targetce);
4728	if (targetce == UCOL_IGNORABLE) {
4729	continue;
4730	}
4731
4732	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4733	found = found && targetce == patternce[patternceindex];
4734	patternceindex ++;
4735	}
4736
4737	targetce = firstce;
4738
4739	if (!found) {
4740	if (U_FAILURE(*status)) {
4741	break;
4742	}
4743
4744	textoffset = reverseShift(strsrch, textoffset, targetce,
4745	patternceindex);
4746	patternceindex = `0`;
4747	continue;
4748	}
4749
4750	if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4751	setColEIterOffset(coleiter, textoffset);
4752	return TRUE;
4753	}
4754	}
4755	setMatchNotFound(strsrch);
4756	return FALSE;
4757	#else
4758	int32_t textOffset;
4759
4760	if (strsrch->search->isOverlap) {
4761	if (strsrch->search->matchedIndex != USEARCH_DONE) {
4762	textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - `1`;
4763	} else {
4764	// move the start position at the end of possible match
4765	initializePatternPCETable(strsrch, status);
4766	if (!initTextProcessedIter(strsrch, status)) {
4767	setMatchNotFound(strsrch);
4768	return FALSE;
4769	}
4770	for (int32_t nPCEs = `0`; nPCEs < strsrch->pattern.pcesLength - `1`; nPCEs++) {
4771	int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4772	if (pce == UCOL_PROCESSED_NULLORDER) {
4773	// at the end of the text
4774	break;
4775	}
4776	}
4777	if (U_FAILURE(*status)) {
4778	setMatchNotFound(strsrch);
4779	return FALSE;
4780	}
4781	textOffset = ucol_getOffset(strsrch->textIter);
4782	}
4783	} else {
4784	textOffset = ucol_getOffset(strsrch->textIter);
4785	}
4786
4787	int32_t start = -`1`;
4788	int32_t end = -`1`;
4789
4790	if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4791	strsrch->search->matchedIndex = start;
4792	strsrch->search->matchedLength = end - start;
4793	return TRUE;
4794	} else {
4795	setMatchNotFound(strsrch);
4796	return FALSE;
4797	}
4798	#endif
4799	}
4800
4801	UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4802	UErrorCode *status)
4803	{
4804	if (U_FAILURE(*status)) {
4805	setMatchNotFound(strsrch);
4806	return FALSE;
4807	}
4808
4809	#if BOYER_MOORE
4810	UCollationElements *coleiter = strsrch->textIter;
4811	int32_t *patternce = strsrch->pattern.ces;
4812	int32_t patterncelength = strsrch->pattern.cesLength;
4813	int32_t textoffset = ucol_getOffset(coleiter);
4814	UBool hasPatternAccents =
4815	strsrch->pattern.hasSuffixAccents \|\| strsrch->pattern.hasPrefixAccents;
4816
4817	// shifting it check for setting offset
4818	// if setOffset is called previously or there was no previous match, we
4819	// leave the offset as it is.
4820	if (strsrch->search->matchedIndex != USEARCH_DONE) {
4821	textoffset = strsrch->search->matchedIndex;
4822	}
4823
4824	textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4825	patterncelength);
4826	strsrch->canonicalPrefixAccents[`0`] = `0`;
4827	strsrch->canonicalSuffixAccents[`0`] = `0`;
4828
4829	while (textoffset >= `0`)
4830	{
4831	int32_t patternceindex = `1`;
4832	int32_t targetce;
4833	UBool found = FALSE;
4834	int32_t firstce = UCOL_NULLORDER;
4835
4836	setColEIterOffset(coleiter, textoffset);
4837	for (;;) {
4838	// finding the first pattern ce match, imagine composite
4839	// characters. for example: search for pattern \u0300 in text
4840	// \u00C0, we'll have to skip A first before we get to
4841	// \u0300 the grave accent
4842	targetce = ucol_next(coleiter, status);
4843	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4844	found = FALSE;
4845	break;
4846	}
4847	targetce = getCE(strsrch, targetce);
4848	if (firstce == UCOL_NULLORDER \|\| firstce == UCOL_IGNORABLE) {
4849	firstce = targetce;
4850	}
4851
4852	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4853	if (targetce == patternce[`0`]) {
4854	// the first ce can be a contraction
4855	found = TRUE;
4856	break;
4857	}
4858	if (!hasExpansion(coleiter)) {
4859	// checking for accents in composite character
4860	found = FALSE;
4861	break;
4862	}
4863	}
4864
4865	targetce = firstce;
4866
4867	while (found && patternceindex < patterncelength) {
4868	targetce = ucol_next(coleiter, status);
4869	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
4870	found = FALSE;
4871	break;
4872	}
4873	targetce = getCE(strsrch, targetce);
4874	if (targetce == UCOL_IGNORABLE) {
4875	continue;
4876	}
4877
4878	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4879	found = found && targetce == patternce[patternceindex];
4880	patternceindex ++;
4881	}
4882
4883	// initializing the rearranged accent array
4884	if (hasPatternAccents && !found) {
4885	strsrch->canonicalPrefixAccents[`0`] = `0`;
4886	strsrch->canonicalSuffixAccents[`0`] = `0`;
4887	if (U_FAILURE(*status)) {
4888	break;
4889	}
4890	found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4891	}
4892
4893	if (!found) {
4894	if (U_FAILURE(*status)) {
4895	break;
4896	}
4897	textoffset = reverseShift(strsrch, textoffset, targetce,
4898	patternceindex);
4899	patternceindex = `0`;
4900	continue;
4901	}
4902
4903	if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4904	setColEIterOffset(coleiter, textoffset);
4905	return TRUE;
4906	}
4907	}
4908	setMatchNotFound(strsrch);
4909	return FALSE;
4910	#else
4911	int32_t textOffset;
4912
4913	if (strsrch->search->isOverlap) {
4914	if (strsrch->search->matchedIndex != USEARCH_DONE) {
4915	textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - `1`;
4916	} else {
4917	// move the start position at the end of possible match
4918	initializePatternPCETable(strsrch, status);
4919	if (!initTextProcessedIter(strsrch, status)) {
4920	setMatchNotFound(strsrch);
4921	return FALSE;
4922	}
4923	for (int32_t nPCEs = `0`; nPCEs < strsrch->pattern.pcesLength - `1`; nPCEs++) {
4924	int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4925	if (pce == UCOL_PROCESSED_NULLORDER) {
4926	// at the end of the text
4927	break;
4928	}
4929	}
4930	if (U_FAILURE(*status)) {
4931	setMatchNotFound(strsrch);
4932	return FALSE;
4933	}
4934	textOffset = ucol_getOffset(strsrch->textIter);
4935	}
4936	} else {
4937	textOffset = ucol_getOffset(strsrch->textIter);
4938	}
4939
4940	int32_t start = -`1`;
4941	int32_t end = -`1`;
4942
4943	if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4944	strsrch->search->matchedIndex = start;
4945	strsrch->search->matchedLength = end - start;
4946	return TRUE;
4947	} else {
4948	setMatchNotFound(strsrch);
4949	return FALSE;
4950	}
4951	#endif
4952	}
4953
4954	#endif /* #if !UCONFIG_NO_COLLATION */
4955

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/usearch.cpp