1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 2001-2015 IBM and others. All rights reserved.
6**********************************************************************
7* Date Name Description
8* 07/02/2001 synwee Creation.
9**********************************************************************
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/usearch.h"
17#include "unicode/ustring.h"
18#include "unicode/uchar.h"
19#include "unicode/utf16.h"
20#include "normalizer2impl.h"
21#include "usrchimp.h"
22#include "cmemory.h"
23#include "ucln_in.h"
24#include "uassert.h"
25#include "ustr_imp.h"
26
27U_NAMESPACE_USE
28
29// don't use Boyer-Moore
30// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
31#define BOYER_MOORE 0
32
33// internal definition ---------------------------------------------------
34
35#define LAST_BYTE_MASK_ 0xFF
36#define SECOND_LAST_BYTE_SHIFT_ 8
37#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
38
39static const Normalizer2Impl *g_nfcImpl = NULL;
40
41// internal methods -------------------------------------------------
42
43/**
44* Fast collation element iterator setOffset.
45* This function does not check for bounds.
46* @param coleiter collation element iterator
47* @param offset to set
48*/
49static
50inline void setColEIterOffset(UCollationElements *elems,
51 int32_t offset)
52{
53 // Note: Not "fast" any more after the 2013 collation rewrite.
54 // We do not want to expose more internals than necessary.
55 UErrorCode status = U_ZERO_ERROR;
56 ucol_setOffset(elems, offset, &status);
57}
58
59/**
60* Getting the mask for collation strength
61* @param strength collation strength
62* @return collation element mask
63*/
64static
65inline uint32_t getMask(UCollationStrength strength)
66{
67 switch (strength)
68 {
69 case UCOL_PRIMARY:
70 return UCOL_PRIMARYORDERMASK;
71 case UCOL_SECONDARY:
72 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
73 default:
74 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
75 UCOL_PRIMARYORDERMASK;
76 }
77}
78
79/**
80* @param ce 32-bit collation element
81* @return hash code
82*/
83static
84inline int hashFromCE32(uint32_t ce)
85{
86 int hc = (int)(
87 ((((((ce >> 24) * 37) +
88 (ce >> 16)) * 37) +
89 (ce >> 8)) * 37) +
90 ce);
91 hc %= MAX_TABLE_SIZE_;
92 if (hc < 0) {
93 hc += MAX_TABLE_SIZE_;
94 }
95 return hc;
96}
97
98U_CDECL_BEGIN
99static UBool U_CALLCONV
100usearch_cleanup(void) {
101 g_nfcImpl = NULL;
102 return TRUE;
103}
104U_CDECL_END
105
106/**
107* Initializing the fcd tables.
108* Internal method, status assumed to be a success.
109* @param status output error if any, caller to check status before calling
110* method, status assumed to be success when passed in.
111*/
112static
113inline void initializeFCD(UErrorCode *status)
114{
115 if (g_nfcImpl == NULL) {
116 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
117 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
118 }
119}
120
121/**
122* Gets the fcd value for a character at the argument index.
123* This method takes into accounts of the supplementary characters.
124* @param str UTF16 string where character for fcd retrieval resides
125* @param offset position of the character whose fcd is to be retrieved, to be
126* overwritten with the next character position, taking
127* surrogate characters into consideration.
128* @param strlength length of the argument string
129* @return fcd value
130*/
131static
132uint16_t getFCD(const UChar *str, int32_t *offset,
133 int32_t strlength)
134{
135 const UChar *temp = str + *offset;
136 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
137 *offset = (int32_t)(temp - str);
138 return result;
139}
140
141/**
142* Getting the modified collation elements taking into account the collation
143* attributes
144* @param strsrch string search data
145* @param sourcece
146* @return the modified collation element
147*/
148static
149inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
150{
151 // note for tertiary we can't use the collator->tertiaryMask, that
152 // is a preprocessed mask that takes into account case options. since
153 // we are only concerned with exact matches, we don't need that.
154 sourcece &= strsrch->ceMask;
155
156 if (strsrch->toShift) {
157 // alternate handling here, since only the 16 most significant digits
158 // is only used, we can safely do a compare without masking
159 // if the ce is a variable, we mask and get only the primary values
160 // no shifting to quartenary is required since all primary values
161 // less than variabletop will need to be masked off anyway.
162 if (strsrch->variableTop > sourcece) {
163 if (strsrch->strength >= UCOL_QUATERNARY) {
164 sourcece &= UCOL_PRIMARYORDERMASK;
165 }
166 else {
167 sourcece = UCOL_IGNORABLE;
168 }
169 }
170 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
171 sourcece = 0xFFFF;
172 }
173
174 return sourcece;
175}
176
177/**
178* Allocate a memory and returns NULL if it failed.
179* Internal method, status assumed to be a success.
180* @param size to allocate
181* @param status output error if any, caller to check status before calling
182* method, status assumed to be success when passed in.
183* @return newly allocated array, NULL otherwise
184*/
185static
186inline void * allocateMemory(uint32_t size, UErrorCode *status)
187{
188 uint32_t *result = (uint32_t *)uprv_malloc(size);
189 if (result == NULL) {
190 *status = U_MEMORY_ALLOCATION_ERROR;
191 }
192 return result;
193}
194
195/**
196* Adds a uint32_t value to a destination array.
197* Creates a new array if we run out of space. The caller will have to
198* manually deallocate the newly allocated array.
199* Internal method, status assumed to be success, caller has to check status
200* before calling this method. destination not to be NULL and has at least
201* size destinationlength.
202* @param destination target array
203* @param offset destination offset to add value
204* @param destinationlength target array size, return value for the new size
205* @param value to be added
206* @param increments incremental size expected
207* @param status output error if any, caller to check status before calling
208* method, status assumed to be success when passed in.
209* @return new destination array, destination if there was no new allocation
210*/
211static
212inline int32_t * addTouint32_tArray(int32_t *destination,
213 uint32_t offset,
214 uint32_t *destinationlength,
215 uint32_t value,
216 uint32_t increments,
217 UErrorCode *status)
218{
219 uint32_t newlength = *destinationlength;
220 if (offset + 1 == newlength) {
221 newlength += increments;
222 int32_t *temp = (int32_t *)allocateMemory(
223 sizeof(int32_t) * newlength, status);
224 if (U_FAILURE(*status)) {
225 return NULL;
226 }
227 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
228 *destinationlength = newlength;
229 destination = temp;
230 }
231 destination[offset] = value;
232 return destination;
233}
234
235/**
236* Adds a uint64_t value to a destination array.
237* Creates a new array if we run out of space. The caller will have to
238* manually deallocate the newly allocated array.
239* Internal method, status assumed to be success, caller has to check status
240* before calling this method. destination not to be NULL and has at least
241* size destinationlength.
242* @param destination target array
243* @param offset destination offset to add value
244* @param destinationlength target array size, return value for the new size
245* @param value to be added
246* @param increments incremental size expected
247* @param status output error if any, caller to check status before calling
248* method, status assumed to be success when passed in.
249* @return new destination array, destination if there was no new allocation
250*/
251static
252inline int64_t * addTouint64_tArray(int64_t *destination,
253 uint32_t offset,
254 uint32_t *destinationlength,
255 uint64_t value,
256 uint32_t increments,
257 UErrorCode *status)
258{
259 uint32_t newlength = *destinationlength;
260 if (offset + 1 == newlength) {
261 newlength += increments;
262 int64_t *temp = (int64_t *)allocateMemory(
263 sizeof(int64_t) * newlength, status);
264
265 if (U_FAILURE(*status)) {
266 return NULL;
267 }
268
269 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
270 *destinationlength = newlength;
271 destination = temp;
272 }
273
274 destination[offset] = value;
275
276 return destination;
277}
278
279/**
280* Initializing the ce table for a pattern.
281* Stores non-ignorable collation keys.
282* Table size will be estimated by the size of the pattern text. Table
283* expansion will be perform as we go along. Adding 1 to ensure that the table
284* size definitely increases.
285* Internal method, status assumed to be a success.
286* @param strsrch string search data
287* @param status output error if any, caller to check status before calling
288* method, status assumed to be success when passed in.
289* @return total number of expansions
290*/
291static
292inline uint16_t initializePatternCETable(UStringSearch *strsrch,
293 UErrorCode *status)
294{
295 UPattern *pattern = &(strsrch->pattern);
296 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
297 int32_t *cetable = pattern->cesBuffer;
298 uint32_t patternlength = pattern->textLength;
299 UCollationElements *coleiter = strsrch->utilIter;
300
301 if (coleiter == NULL) {
302 coleiter = ucol_openElements(strsrch->collator, pattern->text,
303 patternlength, status);
304 // status will be checked in ucol_next(..) later and if it is an
305 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
306 // returned.
307 strsrch->utilIter = coleiter;
308 }
309 else {
310 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
311 }
312 if(U_FAILURE(*status)) {
313 return 0;
314 }
315
316 if (pattern->ces != cetable && pattern->ces) {
317 uprv_free(pattern->ces);
318 }
319
320 uint32_t offset = 0;
321 uint16_t result = 0;
322 int32_t ce;
323
324 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
325 U_SUCCESS(*status)) {
326 uint32_t newce = getCE(strsrch, ce);
327 if (newce) {
328 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
329 newce,
330 patternlength - ucol_getOffset(coleiter) + 1,
331 status);
332 if (U_FAILURE(*status)) {
333 return 0;
334 }
335 offset ++;
336 if (cetable != temp && cetable != pattern->cesBuffer) {
337 uprv_free(cetable);
338 }
339 cetable = temp;
340 }
341 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
342 }
343
344 cetable[offset] = 0;
345 pattern->ces = cetable;
346 pattern->cesLength = offset;
347
348 return result;
349}
350
351/**
352* Initializing the pce table for a pattern.
353* Stores non-ignorable collation keys.
354* Table size will be estimated by the size of the pattern text. Table
355* expansion will be perform as we go along. Adding 1 to ensure that the table
356* size definitely increases.
357* Internal method, status assumed to be a success.
358* @param strsrch string search data
359* @param status output error if any, caller to check status before calling
360* method, status assumed to be success when passed in.
361* @return total number of expansions
362*/
363static
364inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
365 UErrorCode *status)
366{
367 UPattern *pattern = &(strsrch->pattern);
368 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
369 int64_t *pcetable = pattern->pcesBuffer;
370 uint32_t patternlength = pattern->textLength;
371 UCollationElements *coleiter = strsrch->utilIter;
372
373 if (coleiter == NULL) {
374 coleiter = ucol_openElements(strsrch->collator, pattern->text,
375 patternlength, status);
376 // status will be checked in ucol_next(..) later and if it is an
377 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
378 // returned.
379 strsrch->utilIter = coleiter;
380 } else {
381 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
382 }
383 if(U_FAILURE(*status)) {
384 return 0;
385 }
386
387 if (pattern->pces != pcetable && pattern->pces != NULL) {
388 uprv_free(pattern->pces);
389 }
390
391 uint32_t offset = 0;
392 uint16_t result = 0;
393 int64_t pce;
394
395 icu::UCollationPCE iter(coleiter);
396
397 // ** Should processed CEs be signed or unsigned?
398 // ** (the rest of the code in this file seems to play fast-and-loose with
399 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
400 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401 U_SUCCESS(*status)) {
402 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
403 pce,
404 patternlength - ucol_getOffset(coleiter) + 1,
405 status);
406
407 if (U_FAILURE(*status)) {
408 return 0;
409 }
410
411 offset += 1;
412
413 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
414 uprv_free(pcetable);
415 }
416
417 pcetable = temp;
418 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419 }
420
421 pcetable[offset] = 0;
422 pattern->pces = pcetable;
423 pattern->pcesLength = offset;
424
425 return result;
426}
427
428/**
429* Initializes the pattern struct.
430* Internal method, status assumed to be success.
431* @param strsrch UStringSearch data storage
432* @param status output error if any, caller to check status before calling
433* method, status assumed to be success when passed in.
434* @return expansionsize the total expansion size of the pattern
435*/
436static
437inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
438{
439 if (U_FAILURE(*status)) { return 0; }
440 UPattern *pattern = &(strsrch->pattern);
441 const UChar *patterntext = pattern->text;
442 int32_t length = pattern->textLength;
443 int32_t index = 0;
444
445 // Since the strength is primary, accents are ignored in the pattern.
446 if (strsrch->strength == UCOL_PRIMARY) {
447 pattern->hasPrefixAccents = 0;
448 pattern->hasSuffixAccents = 0;
449 } else {
450 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
451 SECOND_LAST_BYTE_SHIFT_;
452 index = length;
453 U16_BACK_1(patterntext, 0, index);
454 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
455 LAST_BYTE_MASK_;
456 }
457
458 // ** HACK **
459 if (strsrch->pattern.pces != NULL) {
460 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
461 uprv_free(strsrch->pattern.pces);
462 }
463
464 strsrch->pattern.pces = NULL;
465 }
466
467 // since intializePattern is an internal method status is a success.
468 return initializePatternCETable(strsrch, status);
469}
470
471/**
472* Initializing shift tables, with the default values.
473* If a corresponding default value is 0, the shift table is not set.
474* @param shift table for forwards shift
475* @param backshift table for backwards shift
476* @param cetable table containing pattern ce
477* @param cesize size of the pattern ces
478* @param expansionsize total size of the expansions
479* @param defaultforward the default forward value
480* @param defaultbackward the default backward value
481*/
482static
483inline void setShiftTable(int16_t shift[], int16_t backshift[],
484 int32_t *cetable, int32_t cesize,
485 int16_t expansionsize,
486 int16_t defaultforward,
487 int16_t defaultbackward)
488{
489 // estimate the value to shift. to do that we estimate the smallest
490 // number of characters to give the relevant ces, ie approximately
491 // the number of ces minus their expansion, since expansions can come
492 // from a character.
493 int32_t count;
494 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
495 shift[count] = defaultforward;
496 }
497 cesize --; // down to the last index
498 for (count = 0; count < cesize; count ++) {
499 // number of ces from right of array to the count
500 int temp = defaultforward - count - 1;
501 shift[hashFromCE32(cetable[count])] = temp > 1 ? static_cast<int16_t>(temp) : 1;
502 }
503 shift[hashFromCE32(cetable[cesize])] = 1;
504 // for ignorables we just shift by one. see test examples.
505 shift[hashFromCE32(0)] = 1;
506
507 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
508 backshift[count] = defaultbackward;
509 }
510 for (count = cesize; count > 0; count --) {
511 // the original value count does not seem to work
512 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
513 (int16_t)(count - expansionsize) : 1;
514 }
515 backshift[hashFromCE32(cetable[0])] = 1;
516 backshift[hashFromCE32(0)] = 1;
517}
518
519/**
520* Building of the pattern collation element list and the boyer moore strsrch
521* table.
522* The canonical match will only be performed after the default match fails.
523* For both cases we need to remember the size of the composed and decomposed
524* versions of the string. Since the Boyer-Moore shift calculations shifts by
525* a number of characters in the text and tries to match the pattern from that
526* offset, the shift value can not be too large in case we miss some
527* characters. To choose a right shift size, we estimate the NFC form of the
528* and use its size as a shift guide. The NFC form should be the small
529* possible representation of the pattern. Anyways, we'll err on the smaller
530* shift size. Hence the calculation for minlength.
531* Canonical match will be performed slightly differently. We'll split the
532* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
533* the first and last base character (MS), the ending accents (EA). Matches
534* will be done on MS first, and only when we match MS then some processing
535* will be required for the prefix and end accents in order to determine if
536* they match PA and EA. Hence the default shift values
537* for the canonical match will take the size of either end's accent into
538* consideration. Forwards search will take the end accents into consideration
539* for the default shift values and the backwards search will take the prefix
540* accents into consideration.
541* If pattern has no non-ignorable ce, we return a illegal argument error.
542* Internal method, status assumed to be success.
543* @param strsrch UStringSearch data storage
544* @param status for output errors if it occurs, status is assumed to be a
545* success when it is passed in.
546*/
547static
548inline void initialize(UStringSearch *strsrch, UErrorCode *status)
549{
550 int16_t expandlength = initializePattern(strsrch, status);
551 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
552 UPattern *pattern = &strsrch->pattern;
553 int32_t cesize = pattern->cesLength;
554
555 int16_t minlength = cesize > expandlength
556 ? (int16_t)cesize - expandlength : 1;
557 pattern->defaultShiftSize = minlength;
558 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
559 cesize, expandlength, minlength, minlength);
560 return;
561 }
562 strsrch->pattern.defaultShiftSize = 0;
563}
564
565#if BOYER_MOORE
566/**
567* Check to make sure that the match length is at the end of the character by
568* using the breakiterator.
569* @param strsrch string search data
570* @param start target text start offset
571* @param end target text end offset
572*/
573static
574void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
575 int32_t *end)
576{
577#if !UCONFIG_NO_BREAK_ITERATION
578 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
579 if (breakiterator) {
580 int32_t matchend = *end;
581 //int32_t matchstart = *start;
582
583 if (!ubrk_isBoundary(breakiterator, matchend)) {
584 *end = ubrk_following(breakiterator, matchend);
585 }
586
587 /* Check the start of the matched text to make sure it doesn't have any accents
588 * before it. This code may not be necessary and so it is commented out */
589 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
590 *start = ubrk_preceding(breakiterator, matchstart);
591 }*/
592 }
593#endif
594}
595
596/**
597* Determine whether the target text in UStringSearch bounded by the offset
598* start and end is one or more whole units of text as
599* determined by the breakiterator in UStringSearch.
600* @param strsrch string search data
601* @param start target text start offset
602* @param end target text end offset
603*/
604static
605UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
606 int32_t end)
607{
608#if !UCONFIG_NO_BREAK_ITERATION
609 UBreakIterator *breakiterator = strsrch->search->breakIter;
610 //TODO: Add here.
611 if (breakiterator) {
612 int32_t startindex = ubrk_first(breakiterator);
613 int32_t endindex = ubrk_last(breakiterator);
614
615 // out-of-range indexes are never boundary positions
616 if (start < startindex || start > endindex ||
617 end < startindex || end > endindex) {
618 return FALSE;
619 }
620 // otherwise, we can use following() on the position before the
621 // specified one and return true of the position we get back is the
622 // one the user specified
623 UBool result = (start == startindex ||
624 ubrk_following(breakiterator, start - 1) == start) &&
625 (end == endindex ||
626 ubrk_following(breakiterator, end - 1) == end);
627 if (result) {
628 // iterates the individual ces
629 UCollationElements *coleiter = strsrch->utilIter;
630 const UChar *text = strsrch->search->text +
631 start;
632 UErrorCode status = U_ZERO_ERROR;
633 ucol_setText(coleiter, text, end - start, &status);
634 for (int32_t count = 0; count < strsrch->pattern.cesLength;
635 count ++) {
636 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
637 if (ce == UCOL_IGNORABLE) {
638 count --;
639 continue;
640 }
641 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
642 return FALSE;
643 }
644 }
645 int32_t nextce = ucol_next(coleiter, &status);
646 while (ucol_getOffset(coleiter) == (end - start)
647 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
648 nextce = ucol_next(coleiter, &status);
649 }
650 if (ucol_getOffset(coleiter) == (end - start)
651 && nextce != UCOL_NULLORDER) {
652 // extra collation elements at the end of the match
653 return FALSE;
654 }
655 }
656 return result;
657 }
658#endif
659 return TRUE;
660}
661
662/**
663* Getting the next base character offset if current offset is an accent,
664* or the current offset if the current character contains a base character.
665* accents the following base character will be returned
666* @param text string
667* @param textoffset current offset
668* @param textlength length of text string
669* @return the next base character or the current offset
670* if the current character is contains a base character.
671*/
672static
673inline int32_t getNextBaseOffset(const UChar *text,
674 int32_t textoffset,
675 int32_t textlength)
676{
677 if (textoffset < textlength) {
678 int32_t temp = textoffset;
679 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
680 while (temp < textlength) {
681 int32_t result = temp;
682 if ((getFCD(text, &temp, textlength) >>
683 SECOND_LAST_BYTE_SHIFT_) == 0) {
684 return result;
685 }
686 }
687 return textlength;
688 }
689 }
690 return textoffset;
691}
692
693/**
694* Gets the next base character offset depending on the string search pattern
695* data
696* @param strsrch string search data
697* @param textoffset current offset, one offset away from the last character
698* to search for.
699* @return start index of the next base character or the current offset
700* if the current character is contains a base character.
701*/
702static
703inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
704 int32_t textoffset)
705{
706 int32_t textlength = strsrch->search->textLength;
707 if (strsrch->pattern.hasSuffixAccents &&
708 textoffset < textlength) {
709 int32_t temp = textoffset;
710 const UChar *text = strsrch->search->text;
711 U16_BACK_1(text, 0, temp);
712 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
713 return getNextBaseOffset(text, textoffset, textlength);
714 }
715 }
716 return textoffset;
717}
718
719/**
720* Shifting the collation element iterator position forward to prepare for
721* a following match. If the last character is a unsafe character, we'll only
722* shift by 1 to capture contractions, normalization etc.
723* Internal method, status assumed to be success.
724* @param text strsrch string search data
725* @param textoffset start text position to do search
726* @param ce the text ce which failed the match.
727* @param patternceindex index of the ce within the pattern ce buffer which
728* failed the match
729* @return final offset
730*/
731static
732inline int32_t shiftForward(UStringSearch *strsrch,
733 int32_t textoffset,
734 int32_t ce,
735 int32_t patternceindex)
736{
737 UPattern *pattern = &(strsrch->pattern);
738 if (ce != UCOL_NULLORDER) {
739 int32_t shift = pattern->shift[hashFromCE32(ce)];
740 // this is to adjust for characters in the middle of the
741 // substring for matching that failed.
742 int32_t adjust = pattern->cesLength - patternceindex;
743 if (adjust > 1 && shift >= adjust) {
744 shift -= adjust - 1;
745 }
746 textoffset += shift;
747 }
748 else {
749 textoffset += pattern->defaultShiftSize;
750 }
751
752 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
753 // check for unsafe characters
754 // * if it is the start or middle of a contraction: to be done after
755 // a initial match is found
756 // * thai or lao base consonant character: similar to contraction
757 // * high surrogate character: similar to contraction
758 // * next character is a accent: shift to the next base character
759 return textoffset;
760}
761#endif // #if BOYER_MOORE
762
763/**
764* sets match not found
765* @param strsrch string search data
766*/
767static
768inline void setMatchNotFound(UStringSearch *strsrch)
769{
770 // this method resets the match result regardless of the error status.
771 strsrch->search->matchedIndex = USEARCH_DONE;
772 strsrch->search->matchedLength = 0;
773 if (strsrch->search->isForwardSearching) {
774 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
775 }
776 else {
777 setColEIterOffset(strsrch->textIter, 0);
778 }
779}
780
781#if BOYER_MOORE
782/**
783* Gets the offset to the next safe point in text.
784* ie. not the middle of a contraction, swappable characters or supplementary
785* characters.
786* @param collator collation sata
787* @param text string to work with
788* @param textoffset offset in string
789* @param textlength length of text string
790* @return offset to the next safe character
791*/
792static
793inline int32_t getNextSafeOffset(const UCollator *collator,
794 const UChar *text,
795 int32_t textoffset,
796 int32_t textlength)
797{
798 int32_t result = textoffset; // first contraction character
799 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
800 result ++;
801 }
802 return result;
803}
804
805/**
806* This checks for accents in the potential match started with a .
807* composite character.
808* This is really painful... we have to check that composite character do not
809* have any extra accents. We have to normalize the potential match and find
810* the immediate decomposed character before the match.
811* The first composite character would have been taken care of by the fcd
812* checks in checkForwardExactMatch.
813* This is the slow path after the fcd of the first character and
814* the last character has been checked by checkForwardExactMatch and we
815* determine that the potential match has extra non-ignorable preceding
816* ces.
817* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
818* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
819* Note here that accents checking are slow and cautioned in the API docs.
820* Internal method, status assumed to be a success, caller should check status
821* before calling this method
822* @param strsrch string search data
823* @param start index of the potential unfriendly composite character
824* @param end index of the potential unfriendly composite character
825* @param status output error status if any.
826* @return TRUE if there is non-ignorable accents before at the beginning
827* of the match, FALSE otherwise.
828*/
829
830static
831UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
832 int32_t end,
833 UErrorCode *status)
834{
835 UBool result = FALSE;
836 if (strsrch->pattern.hasPrefixAccents) {
837 int32_t length = end - start;
838 int32_t offset = 0;
839 const UChar *text = strsrch->search->text + start;
840
841 U16_FWD_1(text, offset, length);
842 // we are only concerned with the first composite character
843 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
844 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
845 text, 0, length);
846 if (safeoffset != length) {
847 safeoffset ++;
848 }
849 UChar *norm = NULL;
850 UChar buffer[INITIAL_ARRAY_SIZE_];
851 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
852 buffer, INITIAL_ARRAY_SIZE_,
853 status);
854 if (U_FAILURE(*status)) {
855 return FALSE;
856 }
857 if (size >= INITIAL_ARRAY_SIZE_) {
858 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
859 status);
860 // if allocation failed, status will be set to
861 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
862 // checks for it.
863 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
864 size, status);
865 if (U_FAILURE(*status) && norm != NULL) {
866 uprv_free(norm);
867 return FALSE;
868 }
869 }
870 else {
871 norm = buffer;
872 }
873
874 UCollationElements *coleiter = strsrch->utilIter;
875 ucol_setText(coleiter, norm, size, status);
876 uint32_t firstce = strsrch->pattern.ces[0];
877 UBool ignorable = TRUE;
878 uint32_t ce = UCOL_IGNORABLE;
879 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
880 offset = ucol_getOffset(coleiter);
881 if (ce != firstce && ce != UCOL_IGNORABLE) {
882 ignorable = FALSE;
883 }
884 ce = ucol_next(coleiter, status);
885 }
886 UChar32 codepoint;
887 U16_PREV(norm, 0, offset, codepoint);
888 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
889
890 if (norm != buffer) {
891 uprv_free(norm);
892 }
893 }
894 }
895
896 return result;
897}
898
899/**
900* Used by exact matches, checks if there are accents before the match.
901* This is really painful... we have to check that composite characters at
902* the start of the matches have to not have any extra accents.
903* We check the FCD of the character first, if it starts with an accent and
904* the first pattern ce does not match the first ce of the character, we bail.
905* Otherwise we try normalizing the first composite
906* character and find the immediate decomposed character before the match to
907* see if it is an non-ignorable accent.
908* Now normalizing the first composite character is enough because we ensure
909* that when the match is passed in here with extra beginning ces, the
910* first or last ce that match has to occur within the first character.
911* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
912* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
913* Note here that accents checking are slow and cautioned in the API docs.
914* @param strsrch string search data
915* @param start offset
916* @param end offset
917* @return TRUE if there are accents on either side of the match,
918* FALSE otherwise
919*/
920static
921UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
922 int32_t end)
923{
924 if (strsrch->pattern.hasPrefixAccents) {
925 UCollationElements *coleiter = strsrch->textIter;
926 UErrorCode status = U_ZERO_ERROR;
927 // we have been iterating forwards previously
928 uint32_t ignorable = TRUE;
929 int32_t firstce = strsrch->pattern.ces[0];
930
931 setColEIterOffset(coleiter, start);
932 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
933 if (U_FAILURE(status)) {
934 return TRUE;
935 }
936 while (ce != firstce) {
937 if (ce != UCOL_IGNORABLE) {
938 ignorable = FALSE;
939 }
940 ce = getCE(strsrch, ucol_next(coleiter, &status));
941 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
942 return TRUE;
943 }
944 }
945 if (!ignorable && inNormBuf(coleiter)) {
946 // within normalization buffer, discontiguous handled here
947 return TRUE;
948 }
949
950 // within text
951 int32_t temp = start;
952 // original code
953 // accent = (getFCD(strsrch->search->text, &temp,
954 // strsrch->search->textLength)
955 // >> SECOND_LAST_BYTE_SHIFT_);
956 // however this code does not work well with VC7 .net in release mode.
957 // maybe the inlines for getFCD combined with shifting has bugs in
958 // VC7. anyways this is a work around.
959 UBool accent = getFCD(strsrch->search->text, &temp,
960 strsrch->search->textLength) > 0xFF;
961 if (!accent) {
962 return checkExtraMatchAccents(strsrch, start, end, &status);
963 }
964 if (!ignorable) {
965 return TRUE;
966 }
967 if (start > 0) {
968 temp = start;
969 U16_BACK_1(strsrch->search->text, 0, temp);
970 if (getFCD(strsrch->search->text, &temp,
971 strsrch->search->textLength) & LAST_BYTE_MASK_) {
972 setColEIterOffset(coleiter, start);
973 ce = ucol_previous(coleiter, &status);
974 if (U_FAILURE(status) ||
975 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
976 return TRUE;
977 }
978 }
979 }
980 }
981
982 return FALSE;
983}
984
985/**
986* Used by exact matches, checks if there are accents bounding the match.
987* Note this is the initial boundary check. If the potential match
988* starts or ends with composite characters, the accents in those
989* characters will be determined later.
990* Not doing backwards iteration here, since discontiguos contraction for
991* backwards collation element iterator, use up too many characters.
992* E.g. looking for \u030A ring in \u01FA A ring above and acute,
993* should fail since there is a acute at the end of \u01FA
994* Note here that accents checking are slow and cautioned in the API docs.
995* @param strsrch string search data
996* @param start offset of match
997* @param end end offset of the match
998* @return TRUE if there are accents on either side of the match,
999* FALSE otherwise
1000*/
1001static
1002UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1003 int32_t end)
1004{
1005 if (strsrch->pattern.hasSuffixAccents) {
1006 const UChar *text = strsrch->search->text;
1007 int32_t temp = end;
1008 int32_t textlength = strsrch->search->textLength;
1009 U16_BACK_1(text, 0, temp);
1010 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1011 int32_t firstce = strsrch->pattern.ces[0];
1012 UCollationElements *coleiter = strsrch->textIter;
1013 UErrorCode status = U_ZERO_ERROR;
1014 int32_t ce;
1015 setColEIterOffset(coleiter, start);
1016 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1017 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1018 return TRUE;
1019 }
1020 }
1021 int32_t count = 1;
1022 while (count < strsrch->pattern.cesLength) {
1023 if (getCE(strsrch, ucol_next(coleiter, &status))
1024 == UCOL_IGNORABLE) {
1025 // Thai can give an ignorable here.
1026 count --;
1027 }
1028 if (U_FAILURE(status)) {
1029 return TRUE;
1030 }
1031 count ++;
1032 }
1033
1034 ce = ucol_next(coleiter, &status);
1035 if (U_FAILURE(status)) {
1036 return TRUE;
1037 }
1038 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1039 ce = getCE(strsrch, ce);
1040 }
1041 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1042 if (ucol_getOffset(coleiter) <= end) {
1043 return TRUE;
1044 }
1045 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1046 return TRUE;
1047 }
1048 }
1049 }
1050 }
1051 return FALSE;
1052}
1053#endif // #if BOYER_MOORE
1054
1055/**
1056* Checks if the offset runs out of the text string
1057* @param offset
1058* @param textlength of the text string
1059* @return TRUE if offset is out of bounds, FALSE otherwise
1060*/
1061static
1062inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1063{
1064 return offset < 0 || offset > textlength;
1065}
1066
1067/**
1068* Checks for identical match
1069* @param strsrch string search data
1070* @param start offset of possible match
1071* @param end offset of possible match
1072* @return TRUE if identical match is found
1073*/
1074static
1075inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1076 int32_t end)
1077{
1078 if (strsrch->strength != UCOL_IDENTICAL) {
1079 return TRUE;
1080 }
1081
1082 // Note: We could use Normalizer::compare() or similar, but for short strings
1083 // which may not be in FCD it might be faster to just NFD them.
1084 UErrorCode status = U_ZERO_ERROR;
1085 UnicodeString t2, p2;
1086 strsrch->nfd->normalize(
1087 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1088 strsrch->nfd->normalize(
1089 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1090 // return FALSE if NFD failed
1091 return U_SUCCESS(status) && t2 == p2;
1092}
1093
1094#if BOYER_MOORE
1095/**
1096* Checks to see if the match is repeated
1097* @param strsrch string search data
1098* @param start new match start index
1099* @param end new match end index
1100* @return TRUE if the the match is repeated, FALSE otherwise
1101*/
1102static
1103inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1104 int32_t start,
1105 int32_t end)
1106{
1107 int32_t lastmatchindex = strsrch->search->matchedIndex;
1108 UBool result;
1109 if (lastmatchindex == USEARCH_DONE) {
1110 return FALSE;
1111 }
1112 if (strsrch->search->isForwardSearching) {
1113 result = start <= lastmatchindex;
1114 }
1115 else {
1116 result = start >= lastmatchindex;
1117 }
1118 if (!result && !strsrch->search->isOverlap) {
1119 if (strsrch->search->isForwardSearching) {
1120 result = start < lastmatchindex + strsrch->search->matchedLength;
1121 }
1122 else {
1123 result = end > lastmatchindex;
1124 }
1125 }
1126 return result;
1127}
1128
1129/**
1130* Gets the collation element iterator's current offset.
1131* @param coleiter collation element iterator
1132* @param forwards flag TRUE if we are moving in th forwards direction
1133* @return current offset
1134*/
1135static
1136inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1137 UBool forwards)
1138{
1139 int32_t result = ucol_getOffset(coleiter);
1140 // intricacies of the the backwards collation element iterator
1141 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1142 result ++;
1143 }
1144 return result;
1145}
1146
1147/**
1148* Checks match for contraction.
1149* If the match ends with a partial contraction we fail.
1150* If the match starts too far off (because of backwards iteration) we try to
1151* chip off the extra characters depending on whether a breakiterator has
1152* been used.
1153* Internal method, error assumed to be success, caller has to check status
1154* before calling this method.
1155* @param strsrch string search data
1156* @param start offset of potential match, to be modified if necessary
1157* @param end offset of potential match, to be modified if necessary
1158* @param status output error status if any
1159* @return TRUE if match passes the contraction test, FALSE otherwise
1160*/
1161
1162static
1163UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1164 int32_t *start,
1165 int32_t *end, UErrorCode *status)
1166{
1167 UCollationElements *coleiter = strsrch->textIter;
1168 int32_t textlength = strsrch->search->textLength;
1169 int32_t temp = *start;
1170 const UCollator *collator = strsrch->collator;
1171 const UChar *text = strsrch->search->text;
1172 // This part checks if either ends of the match contains potential
1173 // contraction. If so we'll have to iterate through them
1174 // The start contraction needs to be checked since ucol_previous dumps
1175 // all characters till the first safe character into the buffer.
1176 // *start + 1 is used to test for the unsafe characters instead of *start
1177 // because ucol_prev takes all unsafe characters till the first safe
1178 // character ie *start. so by testing *start + 1, we can estimate if
1179 // excess prefix characters has been included in the potential search
1180 // results.
1181 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1182 (*start + 1 < textlength
1183 && ucol_unsafeCP(text[*start + 1], collator))) {
1184 int32_t expansion = getExpansionPrefix(coleiter);
1185 UBool expandflag = expansion > 0;
1186 setColEIterOffset(coleiter, *start);
1187 while (expansion > 0) {
1188 // getting rid of the redundant ce, caused by setOffset.
1189 // since backward contraction/expansion may have extra ces if we
1190 // are in the normalization buffer, hasAccentsBeforeMatch would
1191 // have taken care of it.
1192 // E.g. the character \u01FA will have an expansion of 3, but if
1193 // we are only looking for acute and ring \u030A and \u0301, we'll
1194 // have to skip the first ce in the expansion buffer.
1195 ucol_next(coleiter, status);
1196 if (U_FAILURE(*status)) {
1197 return FALSE;
1198 }
1199 if (ucol_getOffset(coleiter) != temp) {
1200 *start = temp;
1201 temp = ucol_getOffset(coleiter);
1202 }
1203 expansion --;
1204 }
1205
1206 int32_t *patternce = strsrch->pattern.ces;
1207 int32_t patterncelength = strsrch->pattern.cesLength;
1208 int32_t count = 0;
1209 while (count < patterncelength) {
1210 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1211 if (ce == UCOL_IGNORABLE) {
1212 continue;
1213 }
1214 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1215 *start = temp;
1216 temp = ucol_getOffset(coleiter);
1217 }
1218 if (U_FAILURE(*status) || ce != patternce[count]) {
1219 (*end) ++;
1220 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1221 return FALSE;
1222 }
1223 count ++;
1224 }
1225 }
1226 return TRUE;
1227}
1228
1229/**
1230* Checks and sets the match information if found.
1231* Checks
1232* <ul>
1233* <li> the potential match does not repeat the previous match
1234* <li> boundaries are correct
1235* <li> exact matches has no extra accents
1236* <li> identical matchesb
1237* <li> potential match does not end in the middle of a contraction
1238* <\ul>
1239* Otherwise the offset will be shifted to the next character.
1240* Internal method, status assumed to be success, caller has to check status
1241* before calling this method.
1242* @param strsrch string search data
1243* @param textoffset offset in the collation element text. the returned value
1244* will be the truncated end offset of the match or the new start
1245* search offset.
1246* @param status output error status if any
1247* @return TRUE if the match is valid, FALSE otherwise
1248*/
1249static
1250inline UBool checkNextExactMatch(UStringSearch *strsrch,
1251 int32_t *textoffset, UErrorCode *status)
1252{
1253 UCollationElements *coleiter = strsrch->textIter;
1254 int32_t start = getColElemIterOffset(coleiter, FALSE);
1255
1256 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1257 return FALSE;
1258 }
1259
1260 // this totally matches, however we need to check if it is repeating
1261 if (!isBreakUnit(strsrch, start, *textoffset) ||
1262 checkRepeatedMatch(strsrch, start, *textoffset) ||
1263 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1264 !checkIdentical(strsrch, start, *textoffset) ||
1265 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1266
1267 (*textoffset) ++;
1268 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1269 return FALSE;
1270 }
1271
1272 //Add breakiterator boundary check for primary strength search.
1273 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1274 checkBreakBoundary(strsrch, &start, textoffset);
1275 }
1276
1277 // totally match, we will get rid of the ending ignorables.
1278 strsrch->search->matchedIndex = start;
1279 strsrch->search->matchedLength = *textoffset - start;
1280 return TRUE;
1281}
1282
1283/**
1284* Getting the previous base character offset, or the current offset if the
1285* current character is a base character
1286* @param text string
1287* @param textoffset one offset after the current character
1288* @return the offset of the next character after the base character or the first
1289* composed character with accents
1290*/
1291static
1292inline int32_t getPreviousBaseOffset(const UChar *text,
1293 int32_t textoffset)
1294{
1295 if (textoffset > 0) {
1296 for (;;) {
1297 int32_t result = textoffset;
1298 U16_BACK_1(text, 0, textoffset);
1299 int32_t temp = textoffset;
1300 uint16_t fcd = getFCD(text, &temp, result);
1301 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1302 if (fcd & LAST_BYTE_MASK_) {
1303 return textoffset;
1304 }
1305 return result;
1306 }
1307 if (textoffset == 0) {
1308 return 0;
1309 }
1310 }
1311 }
1312 return textoffset;
1313}
1314
1315/**
1316* Getting the indexes of the accents that are not blocked in the argument
1317* accent array
1318* @param accents array of accents in nfd terminated by a 0.
1319* @param accentsindex array of indexes of the accents that are not blocked
1320*/
1321static
1322inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1323{
1324 int32_t index = 0;
1325 int32_t length = u_strlen(accents);
1326 UChar32 codepoint = 0;
1327 int cclass = 0;
1328 int result = 0;
1329 int32_t temp;
1330 while (index < length) {
1331 temp = index;
1332 U16_NEXT(accents, index, length, codepoint);
1333 if (u_getCombiningClass(codepoint) != cclass) {
1334 cclass = u_getCombiningClass(codepoint);
1335 accentsindex[result] = temp;
1336 result ++;
1337 }
1338 }
1339 accentsindex[result] = length;
1340 return result;
1341}
1342
1343/**
1344* Appends 3 UChar arrays to a destination array.
1345* Creates a new array if we run out of space. The caller will have to
1346* manually deallocate the newly allocated array.
1347* Internal method, status assumed to be success, caller has to check status
1348* before calling this method. destination not to be NULL and has at least
1349* size destinationlength.
1350* @param destination target array
1351* @param destinationlength target array size, returning the appended length
1352* @param source1 null-terminated first array
1353* @param source2 second array
1354* @param source2length length of second array
1355* @param source3 null-terminated third array
1356* @param status error status if any
1357* @return new destination array, destination if there was no new allocation
1358*/
1359static
1360inline UChar * addToUCharArray( UChar *destination,
1361 int32_t *destinationlength,
1362 const UChar *source1,
1363 const UChar *source2,
1364 int32_t source2length,
1365 const UChar *source3,
1366 UErrorCode *status)
1367{
1368 int32_t source1length = source1 ? u_strlen(source1) : 0;
1369 int32_t source3length = source3 ? u_strlen(source3) : 0;
1370 if (*destinationlength < source1length + source2length + source3length +
1371 1)
1372 {
1373 destination = (UChar *)allocateMemory(
1374 (source1length + source2length + source3length + 1) * sizeof(UChar),
1375 status);
1376 // if error allocating memory, status will be
1377 // U_MEMORY_ALLOCATION_ERROR
1378 if (U_FAILURE(*status)) {
1379 *destinationlength = 0;
1380 return NULL;
1381 }
1382 }
1383 if (source1length != 0) {
1384 u_memcpy(destination, source1, source1length);
1385 }
1386 if (source2length != 0) {
1387 uprv_memcpy(destination + source1length, source2,
1388 sizeof(UChar) * source2length);
1389 }
1390 if (source3length != 0) {
1391 uprv_memcpy(destination + source1length + source2length, source3,
1392 sizeof(UChar) * source3length);
1393 }
1394 *destinationlength = source1length + source2length + source3length;
1395 return destination;
1396}
1397
1398/**
1399* Running through a collation element iterator to see if the contents matches
1400* pattern in string search data
1401* @param strsrch string search data
1402* @param coleiter collation element iterator
1403* @return TRUE if a match if found, FALSE otherwise
1404*/
1405static
1406inline UBool checkCollationMatch(const UStringSearch *strsrch,
1407 UCollationElements *coleiter)
1408{
1409 int patternceindex = strsrch->pattern.cesLength;
1410 int32_t *patternce = strsrch->pattern.ces;
1411 UErrorCode status = U_ZERO_ERROR;
1412 while (patternceindex > 0) {
1413 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1414 if (ce == UCOL_IGNORABLE) {
1415 continue;
1416 }
1417 if (U_FAILURE(status) || ce != *patternce) {
1418 return FALSE;
1419 }
1420 patternce ++;
1421 patternceindex --;
1422 }
1423 return TRUE;
1424}
1425
1426/**
1427* Rearranges the front accents to try matching.
1428* Prefix accents in the text will be grouped according to their combining
1429* class and the groups will be mixed and matched to try find the perfect
1430* match with the pattern.
1431* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1434* "\u0301\u0325".
1435* step 2: check if any of the generated substrings matches the pattern.
1436* Internal method, status is assumed to be success, caller has to check status
1437* before calling this method.
1438* @param strsrch string search match
1439* @param start first offset of the accents to start searching
1440* @param end start of the last accent set
1441* @param status output error status if any
1442* @return USEARCH_DONE if a match is not found, otherwise return the starting
1443* offset of the match. Note this start includes all preceding accents.
1444*/
1445static
1446int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1447 int32_t start,
1448 int32_t end,
1449 UErrorCode *status)
1450{
1451 const UChar *text = strsrch->search->text;
1452 int32_t textlength = strsrch->search->textLength;
1453 int32_t tempstart = start;
1454
1455 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1456 // die... failed at a base character
1457 return USEARCH_DONE;
1458 }
1459
1460 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1461 start = getPreviousBaseOffset(text, tempstart);
1462
1463 UChar accents[INITIAL_ARRAY_SIZE_];
1464 // normalizing the offensive string
1465 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1466 INITIAL_ARRAY_SIZE_, status);
1467 if (U_FAILURE(*status)) {
1468 return USEARCH_DONE;
1469 }
1470
1471 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1472 int32_t accentsize = getUnblockedAccentIndex(accents,
1473 accentsindex);
1474 int32_t count = (2 << (accentsize - 1)) - 1;
1475 UChar buffer[INITIAL_ARRAY_SIZE_];
1476 UCollationElements *coleiter = strsrch->utilIter;
1477 while (U_SUCCESS(*status) && count > 0) {
1478 UChar *rearrange = strsrch->canonicalPrefixAccents;
1479 // copy the base characters
1480 for (int k = 0; k < accentsindex[0]; k ++) {
1481 *rearrange ++ = accents[k];
1482 }
1483 // forming all possible canonical rearrangement by dropping
1484 // sets of accents
1485 for (int i = 0; i <= accentsize - 1; i ++) {
1486 int32_t mask = 1 << (accentsize - i - 1);
1487 if (count & mask) {
1488 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1489 *rearrange ++ = accents[j];
1490 }
1491 }
1492 }
1493 *rearrange = 0;
1494 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1495 UChar *match = addToUCharArray(buffer, &matchsize,
1496 strsrch->canonicalPrefixAccents,
1497 strsrch->search->text + offset,
1498 end - offset,
1499 strsrch->canonicalSuffixAccents,
1500 status);
1501
1502 // if status is a failure, ucol_setText does nothing.
1503 // run the collator iterator through this match
1504 ucol_setText(coleiter, match, matchsize, status);
1505 if (U_SUCCESS(*status)) {
1506 if (checkCollationMatch(strsrch, coleiter)) {
1507 if (match != buffer) {
1508 uprv_free(match);
1509 }
1510 return start;
1511 }
1512 }
1513 count --;
1514 }
1515 return USEARCH_DONE;
1516}
1517
1518/**
1519* Gets the offset to the safe point in text before textoffset.
1520* ie. not the middle of a contraction, swappable characters or supplementary
1521* characters.
1522* @param collator collation sata
1523* @param text string to work with
1524* @param textoffset offset in string
1525* @param textlength length of text string
1526* @return offset to the previous safe character
1527*/
1528static
1529inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1530 const UChar *text,
1531 int32_t textoffset)
1532{
1533 int32_t result = textoffset; // first contraction character
1534 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1535 result --;
1536 }
1537 if (result != 0) {
1538 // the first contraction character is consider unsafe here
1539 result --;
1540 }
1541 return result;
1542}
1543
1544/**
1545* Cleaning up after we passed the safe zone
1546* @param strsrch string search data
1547* @param safetext safe text array
1548* @param safebuffer safe text buffer
1549* @param coleiter collation element iterator for safe text
1550*/
1551static
1552inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1553 UChar *safebuffer)
1554{
1555 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1556 {
1557 uprv_free(safetext);
1558 }
1559}
1560
1561/**
1562* Take the rearranged end accents and tries matching. If match failed at
1563* a separate preceding set of accents (separated from the rearranged on by
1564* at least a base character) then we rearrange the preceding accents and
1565* tries matching again.
1566* We allow skipping of the ends of the accent set if the ces do not match.
1567* However if the failure is found before the accent set, it fails.
1568* Internal method, status assumed to be success, caller has to check status
1569* before calling this method.
1570* @param strsrch string search data
1571* @param textoffset of the start of the rearranged accent
1572* @param status output error status if any
1573* @return USEARCH_DONE if a match is not found, otherwise return the starting
1574* offset of the match. Note this start includes all preceding accents.
1575*/
1576static
1577int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1578 int32_t textoffset,
1579 UErrorCode *status)
1580{
1581 const UChar *text = strsrch->search->text;
1582 const UCollator *collator = strsrch->collator;
1583 int32_t safelength = 0;
1584 UChar *safetext;
1585 int32_t safetextlength;
1586 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1587 UCollationElements *coleiter = strsrch->utilIter;
1588 int32_t safeoffset = textoffset;
1589
1590 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1591 collator)) {
1592 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1593 safelength = textoffset - safeoffset;
1594 safetextlength = INITIAL_ARRAY_SIZE_;
1595 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1596 text + safeoffset, safelength,
1597 strsrch->canonicalSuffixAccents,
1598 status);
1599 }
1600 else {
1601 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1602 safetext = strsrch->canonicalSuffixAccents;
1603 }
1604
1605 // if status is a failure, ucol_setText does nothing
1606 ucol_setText(coleiter, safetext, safetextlength, status);
1607 // status checked in loop below
1608
1609 int32_t *ce = strsrch->pattern.ces;
1610 int32_t celength = strsrch->pattern.cesLength;
1611 int ceindex = celength - 1;
1612 UBool isSafe = TRUE; // indication flag for position in safe zone
1613
1614 while (ceindex >= 0) {
1615 int32_t textce = ucol_previous(coleiter, status);
1616 if (U_FAILURE(*status)) {
1617 if (isSafe) {
1618 cleanUpSafeText(strsrch, safetext, safebuffer);
1619 }
1620 return USEARCH_DONE;
1621 }
1622 if (textce == UCOL_NULLORDER) {
1623 // check if we have passed the safe buffer
1624 if (coleiter == strsrch->textIter) {
1625 cleanUpSafeText(strsrch, safetext, safebuffer);
1626 return USEARCH_DONE;
1627 }
1628 cleanUpSafeText(strsrch, safetext, safebuffer);
1629 safetext = safebuffer;
1630 coleiter = strsrch->textIter;
1631 setColEIterOffset(coleiter, safeoffset);
1632 // status checked at the start of the loop
1633 isSafe = FALSE;
1634 continue;
1635 }
1636 textce = getCE(strsrch, textce);
1637 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1638 // do the beginning stuff
1639 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1640 if (isSafe && failedoffset >= safelength) {
1641 // alas... no hope. failed at rearranged accent set
1642 cleanUpSafeText(strsrch, safetext, safebuffer);
1643 return USEARCH_DONE;
1644 }
1645 else {
1646 if (isSafe) {
1647 failedoffset += safeoffset;
1648 cleanUpSafeText(strsrch, safetext, safebuffer);
1649 }
1650
1651 // try rearranging the front accents
1652 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1653 failedoffset, textoffset, status);
1654 if (result != USEARCH_DONE) {
1655 // if status is a failure, ucol_setOffset does nothing
1656 setColEIterOffset(strsrch->textIter, result);
1657 }
1658 if (U_FAILURE(*status)) {
1659 return USEARCH_DONE;
1660 }
1661 return result;
1662 }
1663 }
1664 if (textce == ce[ceindex]) {
1665 ceindex --;
1666 }
1667 }
1668 // set offset here
1669 if (isSafe) {
1670 int32_t result = getColElemIterOffset(coleiter, FALSE);
1671 // sets the text iterator here with the correct expansion and offset
1672 int32_t leftoverces = getExpansionPrefix(coleiter);
1673 cleanUpSafeText(strsrch, safetext, safebuffer);
1674 if (result >= safelength) {
1675 result = textoffset;
1676 }
1677 else {
1678 result += safeoffset;
1679 }
1680 setColEIterOffset(strsrch->textIter, result);
1681 strsrch->textIter->iteratordata_.toReturn =
1682 setExpansionPrefix(strsrch->textIter, leftoverces);
1683 return result;
1684 }
1685
1686 return ucol_getOffset(coleiter);
1687}
1688
1689/**
1690* Trying out the substring and sees if it can be a canonical match.
1691* This will try normalizing the end accents and arranging them into canonical
1692* equivalents and check their corresponding ces with the pattern ce.
1693* Suffix accents in the text will be grouped according to their combining
1694* class and the groups will be mixed and matched to try find the perfect
1695* match with the pattern.
1696* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1699* "\u0301\u0325".
1700* step 2: check if any of the generated substrings matches the pattern.
1701* Internal method, status assumed to be success, caller has to check status
1702* before calling this method.
1703* @param strsrch string search data
1704* @param textoffset end offset in the collation element text that ends with
1705* the accents to be rearranged
1706* @param status error status if any
1707* @return TRUE if the match is valid, FALSE otherwise
1708*/
1709static
1710UBool doNextCanonicalMatch(UStringSearch *strsrch,
1711 int32_t textoffset,
1712 UErrorCode *status)
1713{
1714 const UChar *text = strsrch->search->text;
1715 int32_t temp = textoffset;
1716 U16_BACK_1(text, 0, temp);
1717 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1718 UCollationElements *coleiter = strsrch->textIter;
1719 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1720 if (strsrch->pattern.hasPrefixAccents) {
1721 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1722 status);
1723 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1724 setColEIterOffset(coleiter, offset);
1725 return TRUE;
1726 }
1727 }
1728 return FALSE;
1729 }
1730
1731 if (!strsrch->pattern.hasSuffixAccents) {
1732 return FALSE;
1733 }
1734
1735 UChar accents[INITIAL_ARRAY_SIZE_];
1736 // offset to the last base character in substring to search
1737 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1738 // normalizing the offensive string
1739 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1740 0, accents, INITIAL_ARRAY_SIZE_, status);
1741 // status checked in loop below
1742
1743 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1744 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1745
1746 // 2 power n - 1 plus the full set of accents
1747 int32_t count = (2 << (size - 1)) - 1;
1748 while (U_SUCCESS(*status) && count > 0) {
1749 UChar *rearrange = strsrch->canonicalSuffixAccents;
1750 // copy the base characters
1751 for (int k = 0; k < accentsindex[0]; k ++) {
1752 *rearrange ++ = accents[k];
1753 }
1754 // forming all possible canonical rearrangement by dropping
1755 // sets of accents
1756 for (int i = 0; i <= size - 1; i ++) {
1757 int32_t mask = 1 << (size - i - 1);
1758 if (count & mask) {
1759 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1760 *rearrange ++ = accents[j];
1761 }
1762 }
1763 }
1764 *rearrange = 0;
1765 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1766 status);
1767 if (offset != USEARCH_DONE) {
1768 return TRUE; // match found
1769 }
1770 count --;
1771 }
1772 return FALSE;
1773}
1774
1775/**
1776* Gets the previous base character offset depending on the string search
1777* pattern data
1778* @param strsrch string search data
1779* @param textoffset current offset, current character
1780* @return the offset of the next character after this base character or itself
1781* if it is a composed character with accents
1782*/
1783static
1784inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1785 int32_t textoffset)
1786{
1787 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1788 const UChar *text = strsrch->search->text;
1789 int32_t offset = textoffset;
1790 if (getFCD(text, &offset, strsrch->search->textLength) >>
1791 SECOND_LAST_BYTE_SHIFT_) {
1792 return getPreviousBaseOffset(text, textoffset);
1793 }
1794 }
1795 return textoffset;
1796}
1797
1798/**
1799* Checks match for contraction.
1800* If the match ends with a partial contraction we fail.
1801* If the match starts too far off (because of backwards iteration) we try to
1802* chip off the extra characters
1803* Internal method, status assumed to be success, caller has to check status
1804* before calling this method.
1805* @param strsrch string search data
1806* @param start offset of potential match, to be modified if necessary
1807* @param end offset of potential match, to be modified if necessary
1808* @param status output error status if any
1809* @return TRUE if match passes the contraction test, FALSE otherwise
1810*/
1811static
1812UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1813 int32_t *start,
1814 int32_t *end,
1815 UErrorCode *status)
1816{
1817 UCollationElements *coleiter = strsrch->textIter;
1818 int32_t textlength = strsrch->search->textLength;
1819 int32_t temp = *start;
1820 const UCollator *collator = strsrch->collator;
1821 const UChar *text = strsrch->search->text;
1822 // This part checks if either ends of the match contains potential
1823 // contraction. If so we'll have to iterate through them
1824 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1825 (*start + 1 < textlength
1826 && ucol_unsafeCP(text[*start + 1], collator))) {
1827 int32_t expansion = getExpansionPrefix(coleiter);
1828 UBool expandflag = expansion > 0;
1829 setColEIterOffset(coleiter, *start);
1830 while (expansion > 0) {
1831 // getting rid of the redundant ce, caused by setOffset.
1832 // since backward contraction/expansion may have extra ces if we
1833 // are in the normalization buffer, hasAccentsBeforeMatch would
1834 // have taken care of it.
1835 // E.g. the character \u01FA will have an expansion of 3, but if
1836 // we are only looking for acute and ring \u030A and \u0301, we'll
1837 // have to skip the first ce in the expansion buffer.
1838 ucol_next(coleiter, status);
1839 if (U_FAILURE(*status)) {
1840 return FALSE;
1841 }
1842 if (ucol_getOffset(coleiter) != temp) {
1843 *start = temp;
1844 temp = ucol_getOffset(coleiter);
1845 }
1846 expansion --;
1847 }
1848
1849 int32_t *patternce = strsrch->pattern.ces;
1850 int32_t patterncelength = strsrch->pattern.cesLength;
1851 int32_t count = 0;
1852 int32_t textlength = strsrch->search->textLength;
1853 while (count < patterncelength) {
1854 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1855 // status checked below, note that if status is a failure
1856 // ucol_next returns UCOL_NULLORDER
1857 if (ce == UCOL_IGNORABLE) {
1858 continue;
1859 }
1860 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1861 *start = temp;
1862 temp = ucol_getOffset(coleiter);
1863 }
1864
1865 if (count == 0 && ce != patternce[0]) {
1866 // accents may have extra starting ces, this occurs when a
1867 // pure accent pattern is matched without rearrangement
1868 // text \u0325\u0300 and looking for \u0300
1869 int32_t expected = patternce[0];
1870 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1871 ce = getCE(strsrch, ucol_next(coleiter, status));
1872 while (U_SUCCESS(*status) && ce != expected &&
1873 ce != UCOL_NULLORDER &&
1874 ucol_getOffset(coleiter) <= *end) {
1875 ce = getCE(strsrch, ucol_next(coleiter, status));
1876 }
1877 }
1878 }
1879 if (U_FAILURE(*status) || ce != patternce[count]) {
1880 (*end) ++;
1881 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1882 return FALSE;
1883 }
1884 count ++;
1885 }
1886 }
1887 return TRUE;
1888}
1889
1890/**
1891* Checks and sets the match information if found.
1892* Checks
1893* <ul>
1894* <li> the potential match does not repeat the previous match
1895* <li> boundaries are correct
1896* <li> potential match does not end in the middle of a contraction
1897* <li> identical matches
1898* <\ul>
1899* Otherwise the offset will be shifted to the next character.
1900* Internal method, status assumed to be success, caller has to check the
1901* status before calling this method.
1902* @param strsrch string search data
1903* @param textoffset offset in the collation element text. the returned value
1904* will be the truncated end offset of the match or the new start
1905* search offset.
1906* @param status output error status if any
1907* @return TRUE if the match is valid, FALSE otherwise
1908*/
1909static
1910inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1911 int32_t *textoffset,
1912 UErrorCode *status)
1913{
1914 // to ensure that the start and ends are not composite characters
1915 UCollationElements *coleiter = strsrch->textIter;
1916 // if we have a canonical accent match
1917 if ((strsrch->pattern.hasSuffixAccents &&
1918 strsrch->canonicalSuffixAccents[0]) ||
1919 (strsrch->pattern.hasPrefixAccents &&
1920 strsrch->canonicalPrefixAccents[0])) {
1921 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1922 strsrch,
1923 ucol_getOffset(coleiter));
1924 strsrch->search->matchedLength = *textoffset -
1925 strsrch->search->matchedIndex;
1926 return TRUE;
1927 }
1928
1929 int32_t start = getColElemIterOffset(coleiter, FALSE);
1930 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1931 status) || U_FAILURE(*status)) {
1932 return FALSE;
1933 }
1934
1935 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1936 // this totally matches, however we need to check if it is repeating
1937 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1938 !isBreakUnit(strsrch, start, *textoffset) ||
1939 !checkIdentical(strsrch, start, *textoffset)) {
1940 (*textoffset) ++;
1941 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1942 strsrch->search->textLength);
1943 return FALSE;
1944 }
1945
1946 strsrch->search->matchedIndex = start;
1947 strsrch->search->matchedLength = *textoffset - start;
1948 return TRUE;
1949}
1950
1951/**
1952* Shifting the collation element iterator position forward to prepare for
1953* a preceding match. If the first character is a unsafe character, we'll only
1954* shift by 1 to capture contractions, normalization etc.
1955* Internal method, status assumed to be success, caller has to check status
1956* before calling this method.
1957* @param text strsrch string search data
1958* @param textoffset start text position to do search
1959* @param ce the text ce which failed the match.
1960* @param patternceindex index of the ce within the pattern ce buffer which
1961* failed the match
1962* @return final offset
1963*/
1964static
1965inline int32_t reverseShift(UStringSearch *strsrch,
1966 int32_t textoffset,
1967 int32_t ce,
1968 int32_t patternceindex)
1969{
1970 if (strsrch->search->isOverlap) {
1971 if (textoffset != strsrch->search->textLength) {
1972 textoffset --;
1973 }
1974 else {
1975 textoffset -= strsrch->pattern.defaultShiftSize;
1976 }
1977 }
1978 else {
1979 if (ce != UCOL_NULLORDER) {
1980 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1981
1982 // this is to adjust for characters in the middle of the substring
1983 // for matching that failed.
1984 int32_t adjust = patternceindex;
1985 if (adjust > 1 && shift > adjust) {
1986 shift -= adjust - 1;
1987 }
1988 textoffset -= shift;
1989 }
1990 else {
1991 textoffset -= strsrch->pattern.defaultShiftSize;
1992 }
1993 }
1994 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1995 return textoffset;
1996}
1997
1998/**
1999* Checks match for contraction.
2000* If the match starts with a partial contraction we fail.
2001* Internal method, status assumed to be success, caller has to check status
2002* before calling this method.
2003* @param strsrch string search data
2004* @param start offset of potential match, to be modified if necessary
2005* @param end offset of potential match, to be modified if necessary
2006* @param status output error status if any
2007* @return TRUE if match passes the contraction test, FALSE otherwise
2008*/
2009static
2010UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2011 int32_t *start,
2012 int32_t *end, UErrorCode *status)
2013{
2014 UCollationElements *coleiter = strsrch->textIter;
2015 int32_t textlength = strsrch->search->textLength;
2016 int32_t temp = *end;
2017 const UCollator *collator = strsrch->collator;
2018 const UChar *text = strsrch->search->text;
2019 // This part checks if either if the start of the match contains potential
2020 // contraction. If so we'll have to iterate through them
2021 // Since we used ucol_next while previously looking for the potential
2022 // match, this guarantees that our end will not be a partial contraction,
2023 // or a partial supplementary character.
2024 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2025 int32_t expansion = getExpansionSuffix(coleiter);
2026 UBool expandflag = expansion > 0;
2027 setColEIterOffset(coleiter, *end);
2028 while (U_SUCCESS(*status) && expansion > 0) {
2029 // getting rid of the redundant ce
2030 // since forward contraction/expansion may have extra ces
2031 // if we are in the normalization buffer, hasAccentsBeforeMatch
2032 // would have taken care of it.
2033 // E.g. the character \u01FA will have an expansion of 3, but if
2034 // we are only looking for A ring A\u030A, we'll have to skip the
2035 // last ce in the expansion buffer
2036 ucol_previous(coleiter, status);
2037 if (U_FAILURE(*status)) {
2038 return FALSE;
2039 }
2040 if (ucol_getOffset(coleiter) != temp) {
2041 *end = temp;
2042 temp = ucol_getOffset(coleiter);
2043 }
2044 expansion --;
2045 }
2046
2047 int32_t *patternce = strsrch->pattern.ces;
2048 int32_t patterncelength = strsrch->pattern.cesLength;
2049 int32_t count = patterncelength;
2050 while (count > 0) {
2051 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2052 // status checked below, note that if status is a failure
2053 // ucol_previous returns UCOL_NULLORDER
2054 if (ce == UCOL_IGNORABLE) {
2055 continue;
2056 }
2057 if (expandflag && count == 0 &&
2058 getColElemIterOffset(coleiter, FALSE) != temp) {
2059 *end = temp;
2060 temp = ucol_getOffset(coleiter);
2061 }
2062 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2063 (*start) --;
2064 *start = getPreviousBaseOffset(text, *start);
2065 return FALSE;
2066 }
2067 count --;
2068 }
2069 }
2070 return TRUE;
2071}
2072
2073/**
2074* Checks and sets the match information if found.
2075* Checks
2076* <ul>
2077* <li> the current match does not repeat the last match
2078* <li> boundaries are correct
2079* <li> exact matches has no extra accents
2080* <li> identical matches
2081* <\ul>
2082* Otherwise the offset will be shifted to the preceding character.
2083* Internal method, status assumed to be success, caller has to check status
2084* before calling this method.
2085* @param strsrch string search data
2086* @param collator
2087* @param coleiter collation element iterator
2088* @param text string
2089* @param textoffset offset in the collation element text. the returned value
2090* will be the truncated start offset of the match or the new start
2091* search offset.
2092* @param status output error status if any
2093* @return TRUE if the match is valid, FALSE otherwise
2094*/
2095static
2096inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2097 int32_t *textoffset,
2098 UErrorCode *status)
2099{
2100 // to ensure that the start and ends are not composite characters
2101 int32_t end = ucol_getOffset(strsrch->textIter);
2102 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2103 || U_FAILURE(*status)) {
2104 return FALSE;
2105 }
2106
2107 // this totally matches, however we need to check if it is repeating
2108 // the old match
2109 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2110 !isBreakUnit(strsrch, *textoffset, end) ||
2111 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2112 !checkIdentical(strsrch, *textoffset, end) ||
2113 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2114 (*textoffset) --;
2115 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2116 *textoffset);
2117 return FALSE;
2118 }
2119
2120 //Add breakiterator boundary check for primary strength search.
2121 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2122 checkBreakBoundary(strsrch, textoffset, &end);
2123 }
2124
2125 strsrch->search->matchedIndex = *textoffset;
2126 strsrch->search->matchedLength = end - *textoffset;
2127 return TRUE;
2128}
2129
2130/**
2131* Rearranges the end accents to try matching.
2132* Suffix accents in the text will be grouped according to their combining
2133* class and the groups will be mixed and matched to try find the perfect
2134* match with the pattern.
2135* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2138* "\u0301\u0325".
2139* step 2: check if any of the generated substrings matches the pattern.
2140* Internal method, status assumed to be success, user has to check status
2141* before calling this method.
2142* @param strsrch string search match
2143* @param start offset of the first base character
2144* @param end start of the last accent set
2145* @param status only error status if any
2146* @return USEARCH_DONE if a match is not found, otherwise return the ending
2147* offset of the match. Note this start includes all following accents.
2148*/
2149static
2150int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2151 int32_t start,
2152 int32_t end,
2153 UErrorCode *status)
2154{
2155 const UChar *text = strsrch->search->text;
2156 int32_t tempend = end;
2157
2158 U16_BACK_1(text, 0, tempend);
2159 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2160 LAST_BYTE_MASK_)) {
2161 // die... failed at a base character
2162 return USEARCH_DONE;
2163 }
2164 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2165
2166 if (U_SUCCESS(*status)) {
2167 UChar accents[INITIAL_ARRAY_SIZE_];
2168 int32_t offset = getPreviousBaseOffset(text, end);
2169 // normalizing the offensive string
2170 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2171 INITIAL_ARRAY_SIZE_, status);
2172
2173 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2174 int32_t accentsize = getUnblockedAccentIndex(accents,
2175 accentsindex);
2176 int32_t count = (2 << (accentsize - 1)) - 1;
2177 UChar buffer[INITIAL_ARRAY_SIZE_];
2178 UCollationElements *coleiter = strsrch->utilIter;
2179 while (U_SUCCESS(*status) && count > 0) {
2180 UChar *rearrange = strsrch->canonicalSuffixAccents;
2181 // copy the base characters
2182 for (int k = 0; k < accentsindex[0]; k ++) {
2183 *rearrange ++ = accents[k];
2184 }
2185 // forming all possible canonical rearrangement by dropping
2186 // sets of accents
2187 for (int i = 0; i <= accentsize - 1; i ++) {
2188 int32_t mask = 1 << (accentsize - i - 1);
2189 if (count & mask) {
2190 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2191 *rearrange ++ = accents[j];
2192 }
2193 }
2194 }
2195 *rearrange = 0;
2196 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2197 UChar *match = addToUCharArray(buffer, &matchsize,
2198 strsrch->canonicalPrefixAccents,
2199 strsrch->search->text + start,
2200 offset - start,
2201 strsrch->canonicalSuffixAccents,
2202 status);
2203
2204 // run the collator iterator through this match
2205 // if status is a failure ucol_setText does nothing
2206 ucol_setText(coleiter, match, matchsize, status);
2207 if (U_SUCCESS(*status)) {
2208 if (checkCollationMatch(strsrch, coleiter)) {
2209 if (match != buffer) {
2210 uprv_free(match);
2211 }
2212 return end;
2213 }
2214 }
2215 count --;
2216 }
2217 }
2218 return USEARCH_DONE;
2219}
2220
2221/**
2222* Take the rearranged start accents and tries matching. If match failed at
2223* a separate following set of accents (separated from the rearranged on by
2224* at least a base character) then we rearrange the preceding accents and
2225* tries matching again.
2226* We allow skipping of the ends of the accent set if the ces do not match.
2227* However if the failure is found before the accent set, it fails.
2228* Internal method, status assumed to be success, caller has to check status
2229* before calling this method.
2230* @param strsrch string search data
2231* @param textoffset of the ends of the rearranged accent
2232* @param status output error status if any
2233* @return USEARCH_DONE if a match is not found, otherwise return the ending
2234* offset of the match. Note this start includes all following accents.
2235*/
2236static
2237int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2238 int32_t textoffset,
2239 UErrorCode *status)
2240{
2241 const UChar *text = strsrch->search->text;
2242 const UCollator *collator = strsrch->collator;
2243 int32_t safelength = 0;
2244 UChar *safetext;
2245 int32_t safetextlength;
2246 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2247 int32_t safeoffset = textoffset;
2248
2249 if (textoffset &&
2250 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2251 u_strlen(strsrch->canonicalPrefixAccents) - 1
2252 ], collator)) {
2253 safeoffset = getNextSafeOffset(collator, text, textoffset,
2254 strsrch->search->textLength);
2255 safelength = safeoffset - textoffset;
2256 safetextlength = INITIAL_ARRAY_SIZE_;
2257 safetext = addToUCharArray(safebuffer, &safetextlength,
2258 strsrch->canonicalPrefixAccents,
2259 text + textoffset, safelength,
2260 NULL, status);
2261 }
2262 else {
2263 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2264 safetext = strsrch->canonicalPrefixAccents;
2265 }
2266
2267 UCollationElements *coleiter = strsrch->utilIter;
2268 // if status is a failure, ucol_setText does nothing
2269 ucol_setText(coleiter, safetext, safetextlength, status);
2270 // status checked in loop below
2271
2272 int32_t *ce = strsrch->pattern.ces;
2273 int32_t celength = strsrch->pattern.cesLength;
2274 int ceindex = 0;
2275 UBool isSafe = TRUE; // safe zone indication flag for position
2276 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2277
2278 while (ceindex < celength) {
2279 int32_t textce = ucol_next(coleiter, status);
2280 if (U_FAILURE(*status)) {
2281 if (isSafe) {
2282 cleanUpSafeText(strsrch, safetext, safebuffer);
2283 }
2284 return USEARCH_DONE;
2285 }
2286 if (textce == UCOL_NULLORDER) {
2287 // check if we have passed the safe buffer
2288 if (coleiter == strsrch->textIter) {
2289 cleanUpSafeText(strsrch, safetext, safebuffer);
2290 return USEARCH_DONE;
2291 }
2292 cleanUpSafeText(strsrch, safetext, safebuffer);
2293 safetext = safebuffer;
2294 coleiter = strsrch->textIter;
2295 setColEIterOffset(coleiter, safeoffset);
2296 // status checked at the start of the loop
2297 isSafe = FALSE;
2298 continue;
2299 }
2300 textce = getCE(strsrch, textce);
2301 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2302 // do the beginning stuff
2303 int32_t failedoffset = ucol_getOffset(coleiter);
2304 if (isSafe && failedoffset <= prefixlength) {
2305 // alas... no hope. failed at rearranged accent set
2306 cleanUpSafeText(strsrch, safetext, safebuffer);
2307 return USEARCH_DONE;
2308 }
2309 else {
2310 if (isSafe) {
2311 failedoffset = safeoffset - failedoffset;
2312 cleanUpSafeText(strsrch, safetext, safebuffer);
2313 }
2314
2315 // try rearranging the end accents
2316 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2317 textoffset, failedoffset, status);
2318 if (result != USEARCH_DONE) {
2319 // if status is a failure, ucol_setOffset does nothing
2320 setColEIterOffset(strsrch->textIter, result);
2321 }
2322 if (U_FAILURE(*status)) {
2323 return USEARCH_DONE;
2324 }
2325 return result;
2326 }
2327 }
2328 if (textce == ce[ceindex]) {
2329 ceindex ++;
2330 }
2331 }
2332 // set offset here
2333 if (isSafe) {
2334 int32_t result = ucol_getOffset(coleiter);
2335 // sets the text iterator here with the correct expansion and offset
2336 int32_t leftoverces = getExpansionSuffix(coleiter);
2337 cleanUpSafeText(strsrch, safetext, safebuffer);
2338 if (result <= prefixlength) {
2339 result = textoffset;
2340 }
2341 else {
2342 result = textoffset + (safeoffset - result);
2343 }
2344 setColEIterOffset(strsrch->textIter, result);
2345 setExpansionSuffix(strsrch->textIter, leftoverces);
2346 return result;
2347 }
2348
2349 return ucol_getOffset(coleiter);
2350}
2351
2352/**
2353* Trying out the substring and sees if it can be a canonical match.
2354* This will try normalizing the starting accents and arranging them into
2355* canonical equivalents and check their corresponding ces with the pattern ce.
2356* Prefix accents in the text will be grouped according to their combining
2357* class and the groups will be mixed and matched to try find the perfect
2358* match with the pattern.
2359* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2362* "\u0301\u0325".
2363* step 2: check if any of the generated substrings matches the pattern.
2364* Internal method, status assumed to be success, caller has to check status
2365* before calling this method.
2366* @param strsrch string search data
2367* @param textoffset start offset in the collation element text that starts
2368* with the accents to be rearranged
2369* @param status output error status if any
2370* @return TRUE if the match is valid, FALSE otherwise
2371*/
2372static
2373UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2374 int32_t textoffset,
2375 UErrorCode *status)
2376{
2377 const UChar *text = strsrch->search->text;
2378 int32_t temp = textoffset;
2379 int32_t textlength = strsrch->search->textLength;
2380 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2381 UCollationElements *coleiter = strsrch->textIter;
2382 int32_t offset = ucol_getOffset(coleiter);
2383 if (strsrch->pattern.hasSuffixAccents) {
2384 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2385 offset, status);
2386 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2387 setColEIterOffset(coleiter, offset);
2388 return TRUE;
2389 }
2390 }
2391 return FALSE;
2392 }
2393
2394 if (!strsrch->pattern.hasPrefixAccents) {
2395 return FALSE;
2396 }
2397
2398 UChar accents[INITIAL_ARRAY_SIZE_];
2399 // offset to the last base character in substring to search
2400 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2401 // normalizing the offensive string
2402 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2403 0, accents, INITIAL_ARRAY_SIZE_, status);
2404 // status checked in loop
2405
2406 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2407 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2408
2409 // 2 power n - 1 plus the full set of accents
2410 int32_t count = (2 << (size - 1)) - 1;
2411 while (U_SUCCESS(*status) && count > 0) {
2412 UChar *rearrange = strsrch->canonicalPrefixAccents;
2413 // copy the base characters
2414 for (int k = 0; k < accentsindex[0]; k ++) {
2415 *rearrange ++ = accents[k];
2416 }
2417 // forming all possible canonical rearrangement by dropping
2418 // sets of accents
2419 for (int i = 0; i <= size - 1; i ++) {
2420 int32_t mask = 1 << (size - i - 1);
2421 if (count & mask) {
2422 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2423 *rearrange ++ = accents[j];
2424 }
2425 }
2426 }
2427 *rearrange = 0;
2428 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2429 baseoffset, status);
2430 if (offset != USEARCH_DONE) {
2431 return TRUE; // match found
2432 }
2433 count --;
2434 }
2435 return FALSE;
2436}
2437
2438/**
2439* Checks match for contraction.
2440* If the match starts with a partial contraction we fail.
2441* Internal method, status assumed to be success, caller has to check status
2442* before calling this method.
2443* @param strsrch string search data
2444* @param start offset of potential match, to be modified if necessary
2445* @param end offset of potential match, to be modified if necessary
2446* @param status only error status if any
2447* @return TRUE if match passes the contraction test, FALSE otherwise
2448*/
2449static
2450UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2451 int32_t *start,
2452 int32_t *end, UErrorCode *status)
2453{
2454 UCollationElements *coleiter = strsrch->textIter;
2455 int32_t textlength = strsrch->search->textLength;
2456 int32_t temp = *end;
2457 const UCollator *collator = strsrch->collator;
2458 const UChar *text = strsrch->search->text;
2459 // This part checks if either if the start of the match contains potential
2460 // contraction. If so we'll have to iterate through them
2461 // Since we used ucol_next while previously looking for the potential
2462 // match, this guarantees that our end will not be a partial contraction,
2463 // or a partial supplementary character.
2464 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2465 int32_t expansion = getExpansionSuffix(coleiter);
2466 UBool expandflag = expansion > 0;
2467 setColEIterOffset(coleiter, *end);
2468 while (expansion > 0) {
2469 // getting rid of the redundant ce
2470 // since forward contraction/expansion may have extra ces
2471 // if we are in the normalization buffer, hasAccentsBeforeMatch
2472 // would have taken care of it.
2473 // E.g. the character \u01FA will have an expansion of 3, but if
2474 // we are only looking for A ring A\u030A, we'll have to skip the
2475 // last ce in the expansion buffer
2476 ucol_previous(coleiter, status);
2477 if (U_FAILURE(*status)) {
2478 return FALSE;
2479 }
2480 if (ucol_getOffset(coleiter) != temp) {
2481 *end = temp;
2482 temp = ucol_getOffset(coleiter);
2483 }
2484 expansion --;
2485 }
2486
2487 int32_t *patternce = strsrch->pattern.ces;
2488 int32_t patterncelength = strsrch->pattern.cesLength;
2489 int32_t count = patterncelength;
2490 while (count > 0) {
2491 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2492 // status checked below, note that if status is a failure
2493 // ucol_previous returns UCOL_NULLORDER
2494 if (ce == UCOL_IGNORABLE) {
2495 continue;
2496 }
2497 if (expandflag && count == 0 &&
2498 getColElemIterOffset(coleiter, FALSE) != temp) {
2499 *end = temp;
2500 temp = ucol_getOffset(coleiter);
2501 }
2502 if (count == patterncelength &&
2503 ce != patternce[patterncelength - 1]) {
2504 // accents may have extra starting ces, this occurs when a
2505 // pure accent pattern is matched without rearrangement
2506 int32_t expected = patternce[patterncelength - 1];
2507 U16_BACK_1(text, 0, *end);
2508 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2509 ce = getCE(strsrch, ucol_previous(coleiter, status));
2510 while (U_SUCCESS(*status) && ce != expected &&
2511 ce != UCOL_NULLORDER &&
2512 ucol_getOffset(coleiter) <= *start) {
2513 ce = getCE(strsrch, ucol_previous(coleiter, status));
2514 }
2515 }
2516 }
2517 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2518 (*start) --;
2519 *start = getPreviousBaseOffset(text, *start);
2520 return FALSE;
2521 }
2522 count --;
2523 }
2524 }
2525 return TRUE;
2526}
2527
2528/**
2529* Checks and sets the match information if found.
2530* Checks
2531* <ul>
2532* <li> the potential match does not repeat the previous match
2533* <li> boundaries are correct
2534* <li> potential match does not end in the middle of a contraction
2535* <li> identical matches
2536* <\ul>
2537* Otherwise the offset will be shifted to the next character.
2538* Internal method, status assumed to be success, caller has to check status
2539* before calling this method.
2540* @param strsrch string search data
2541* @param textoffset offset in the collation element text. the returned value
2542* will be the truncated start offset of the match or the new start
2543* search offset.
2544* @param status only error status if any
2545* @return TRUE if the match is valid, FALSE otherwise
2546*/
2547static
2548inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2549 int32_t *textoffset,
2550 UErrorCode *status)
2551{
2552 // to ensure that the start and ends are not composite characters
2553 UCollationElements *coleiter = strsrch->textIter;
2554 // if we have a canonical accent match
2555 if ((strsrch->pattern.hasSuffixAccents &&
2556 strsrch->canonicalSuffixAccents[0]) ||
2557 (strsrch->pattern.hasPrefixAccents &&
2558 strsrch->canonicalPrefixAccents[0])) {
2559 strsrch->search->matchedIndex = *textoffset;
2560 strsrch->search->matchedLength =
2561 getNextUStringSearchBaseOffset(strsrch,
2562 getColElemIterOffset(coleiter, FALSE))
2563 - *textoffset;
2564 return TRUE;
2565 }
2566
2567 int32_t end = ucol_getOffset(coleiter);
2568 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2569 status) ||
2570 U_FAILURE(*status)) {
2571 return FALSE;
2572 }
2573
2574 end = getNextUStringSearchBaseOffset(strsrch, end);
2575 // this totally matches, however we need to check if it is repeating
2576 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2577 !isBreakUnit(strsrch, *textoffset, end) ||
2578 !checkIdentical(strsrch, *textoffset, end)) {
2579 (*textoffset) --;
2580 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2581 *textoffset);
2582 return FALSE;
2583 }
2584
2585 strsrch->search->matchedIndex = *textoffset;
2586 strsrch->search->matchedLength = end - *textoffset;
2587 return TRUE;
2588}
2589#endif // #if BOYER_MOORE
2590
2591// constructors and destructor -------------------------------------------
2592
2593U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2594 int32_t patternlength,
2595 const UChar *text,
2596 int32_t textlength,
2597 const char *locale,
2598 UBreakIterator *breakiter,
2599 UErrorCode *status)
2600{
2601 if (U_FAILURE(*status)) {
2602 return NULL;
2603 }
2604#if UCONFIG_NO_BREAK_ITERATION
2605 if (breakiter != NULL) {
2606 *status = U_UNSUPPORTED_ERROR;
2607 return NULL;
2608 }
2609#endif
2610 if (locale) {
2611 // ucol_open internally checks for status
2612 UCollator *collator = ucol_open(locale, status);
2613 // pattern, text checks are done in usearch_openFromCollator
2614 UStringSearch *result = usearch_openFromCollator(pattern,
2615 patternlength, text, textlength,
2616 collator, breakiter, status);
2617
2618 if (result == NULL || U_FAILURE(*status)) {
2619 if (collator) {
2620 ucol_close(collator);
2621 }
2622 return NULL;
2623 }
2624 else {
2625 result->ownCollator = TRUE;
2626 }
2627 return result;
2628 }
2629 *status = U_ILLEGAL_ARGUMENT_ERROR;
2630 return NULL;
2631}
2632
2633U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2634 const UChar *pattern,
2635 int32_t patternlength,
2636 const UChar *text,
2637 int32_t textlength,
2638 const UCollator *collator,
2639 UBreakIterator *breakiter,
2640 UErrorCode *status)
2641{
2642 if (U_FAILURE(*status)) {
2643 return NULL;
2644 }
2645#if UCONFIG_NO_BREAK_ITERATION
2646 if (breakiter != NULL) {
2647 *status = U_UNSUPPORTED_ERROR;
2648 return NULL;
2649 }
2650#endif
2651 if (pattern == NULL || text == NULL || collator == NULL) {
2652 *status = U_ILLEGAL_ARGUMENT_ERROR;
2653 return NULL;
2654 }
2655
2656 // string search does not really work when numeric collation is turned on
2657 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2658 *status = U_UNSUPPORTED_ERROR;
2659 return NULL;
2660 }
2661
2662 if (U_SUCCESS(*status)) {
2663 initializeFCD(status);
2664 if (U_FAILURE(*status)) {
2665 return NULL;
2666 }
2667
2668 UStringSearch *result;
2669 if (textlength == -1) {
2670 textlength = u_strlen(text);
2671 }
2672 if (patternlength == -1) {
2673 patternlength = u_strlen(pattern);
2674 }
2675 if (textlength <= 0 || patternlength <= 0) {
2676 *status = U_ILLEGAL_ARGUMENT_ERROR;
2677 return NULL;
2678 }
2679
2680 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2681 if (result == NULL) {
2682 *status = U_MEMORY_ALLOCATION_ERROR;
2683 return NULL;
2684 }
2685
2686 result->collator = collator;
2687 result->strength = ucol_getStrength(collator);
2688 result->ceMask = getMask(result->strength);
2689 result->toShift =
2690 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2691 UCOL_SHIFTED;
2692 result->variableTop = ucol_getVariableTop(collator, status);
2693
2694 result->nfd = Normalizer2::getNFDInstance(*status);
2695
2696 if (U_FAILURE(*status)) {
2697 uprv_free(result);
2698 return NULL;
2699 }
2700
2701 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2702 if (result->search == NULL) {
2703 *status = U_MEMORY_ALLOCATION_ERROR;
2704 uprv_free(result);
2705 return NULL;
2706 }
2707
2708 result->search->text = text;
2709 result->search->textLength = textlength;
2710
2711 result->pattern.text = pattern;
2712 result->pattern.textLength = patternlength;
2713 result->pattern.ces = NULL;
2714 result->pattern.pces = NULL;
2715
2716 result->search->breakIter = breakiter;
2717#if !UCONFIG_NO_BREAK_ITERATION
2718 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2719 if (breakiter) {
2720 ubrk_setText(breakiter, text, textlength, status);
2721 }
2722#endif
2723
2724 result->ownCollator = FALSE;
2725 result->search->matchedLength = 0;
2726 result->search->matchedIndex = USEARCH_DONE;
2727 result->utilIter = NULL;
2728 result->textIter = ucol_openElements(collator, text,
2729 textlength, status);
2730 result->textProcessedIter = NULL;
2731 if (U_FAILURE(*status)) {
2732 usearch_close(result);
2733 return NULL;
2734 }
2735
2736 result->search->isOverlap = FALSE;
2737 result->search->isCanonicalMatch = FALSE;
2738 result->search->elementComparisonType = 0;
2739 result->search->isForwardSearching = TRUE;
2740 result->search->reset = TRUE;
2741
2742 initialize(result, status);
2743
2744 if (U_FAILURE(*status)) {
2745 usearch_close(result);
2746 return NULL;
2747 }
2748
2749 return result;
2750 }
2751 return NULL;
2752}
2753
2754U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2755{
2756 if (strsrch) {
2757 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2758 strsrch->pattern.ces) {
2759 uprv_free(strsrch->pattern.ces);
2760 }
2761
2762 if (strsrch->pattern.pces != NULL &&
2763 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2764 uprv_free(strsrch->pattern.pces);
2765 }
2766
2767 delete strsrch->textProcessedIter;
2768 ucol_closeElements(strsrch->textIter);
2769 ucol_closeElements(strsrch->utilIter);
2770
2771 if (strsrch->ownCollator && strsrch->collator) {
2772 ucol_close((UCollator *)strsrch->collator);
2773 }
2774
2775#if !UCONFIG_NO_BREAK_ITERATION
2776 if (strsrch->search->internalBreakIter) {
2777 ubrk_close(strsrch->search->internalBreakIter);
2778 }
2779#endif
2780
2781 uprv_free(strsrch->search);
2782 uprv_free(strsrch);
2783 }
2784}
2785
2786namespace {
2787
2788UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2789 if (U_FAILURE(*status)) { return FALSE; }
2790 if (strsrch->textProcessedIter == NULL) {
2791 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2792 if (strsrch->textProcessedIter == NULL) {
2793 *status = U_MEMORY_ALLOCATION_ERROR;
2794 return FALSE;
2795 }
2796 } else {
2797 strsrch->textProcessedIter->init(strsrch->textIter);
2798 }
2799 return TRUE;
2800}
2801
2802}
2803
2804// set and get methods --------------------------------------------------
2805
2806U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2807 int32_t position,
2808 UErrorCode *status)
2809{
2810 if (U_SUCCESS(*status) && strsrch) {
2811 if (isOutOfBounds(strsrch->search->textLength, position)) {
2812 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2813 }
2814 else {
2815 setColEIterOffset(strsrch->textIter, position);
2816 }
2817 strsrch->search->matchedIndex = USEARCH_DONE;
2818 strsrch->search->matchedLength = 0;
2819 strsrch->search->reset = FALSE;
2820 }
2821}
2822
2823U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2824{
2825 if (strsrch) {
2826 int32_t result = ucol_getOffset(strsrch->textIter);
2827 if (isOutOfBounds(strsrch->search->textLength, result)) {
2828 return USEARCH_DONE;
2829 }
2830 return result;
2831 }
2832 return USEARCH_DONE;
2833}
2834
2835U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2836 USearchAttribute attribute,
2837 USearchAttributeValue value,
2838 UErrorCode *status)
2839{
2840 if (U_SUCCESS(*status) && strsrch) {
2841 switch (attribute)
2842 {
2843 case USEARCH_OVERLAP :
2844 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2845 break;
2846 case USEARCH_CANONICAL_MATCH :
2847 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2848 FALSE);
2849 break;
2850 case USEARCH_ELEMENT_COMPARISON :
2851 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2852 strsrch->search->elementComparisonType = (int16_t)value;
2853 } else {
2854 strsrch->search->elementComparisonType = 0;
2855 }
2856 break;
2857 case USEARCH_ATTRIBUTE_COUNT :
2858 default:
2859 *status = U_ILLEGAL_ARGUMENT_ERROR;
2860 }
2861 }
2862 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2863 *status = U_ILLEGAL_ARGUMENT_ERROR;
2864 }
2865}
2866
2867U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2868 const UStringSearch *strsrch,
2869 USearchAttribute attribute)
2870{
2871 if (strsrch) {
2872 switch (attribute) {
2873 case USEARCH_OVERLAP :
2874 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2875 USEARCH_OFF);
2876 case USEARCH_CANONICAL_MATCH :
2877 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2878 USEARCH_OFF);
2879 case USEARCH_ELEMENT_COMPARISON :
2880 {
2881 int16_t value = strsrch->search->elementComparisonType;
2882 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2883 return (USearchAttributeValue)value;
2884 } else {
2885 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2886 }
2887 }
2888 case USEARCH_ATTRIBUTE_COUNT :
2889 return USEARCH_DEFAULT;
2890 }
2891 }
2892 return USEARCH_DEFAULT;
2893}
2894
2895U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2896 const UStringSearch *strsrch)
2897{
2898 if (strsrch == NULL) {
2899 return USEARCH_DONE;
2900 }
2901 return strsrch->search->matchedIndex;
2902}
2903
2904
2905U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2906 UChar *result,
2907 int32_t resultCapacity,
2908 UErrorCode *status)
2909{
2910 if (U_FAILURE(*status)) {
2911 return USEARCH_DONE;
2912 }
2913 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2914 result == NULL)) {
2915 *status = U_ILLEGAL_ARGUMENT_ERROR;
2916 return USEARCH_DONE;
2917 }
2918
2919 int32_t copylength = strsrch->search->matchedLength;
2920 int32_t copyindex = strsrch->search->matchedIndex;
2921 if (copyindex == USEARCH_DONE) {
2922 u_terminateUChars(result, resultCapacity, 0, status);
2923 return USEARCH_DONE;
2924 }
2925
2926 if (resultCapacity < copylength) {
2927 copylength = resultCapacity;
2928 }
2929 if (copylength > 0) {
2930 uprv_memcpy(result, strsrch->search->text + copyindex,
2931 copylength * sizeof(UChar));
2932 }
2933 return u_terminateUChars(result, resultCapacity,
2934 strsrch->search->matchedLength, status);
2935}
2936
2937U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2938 const UStringSearch *strsrch)
2939{
2940 if (strsrch) {
2941 return strsrch->search->matchedLength;
2942 }
2943 return USEARCH_DONE;
2944}
2945
2946#if !UCONFIG_NO_BREAK_ITERATION
2947
2948U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2949 UBreakIterator *breakiter,
2950 UErrorCode *status)
2951{
2952 if (U_SUCCESS(*status) && strsrch) {
2953 strsrch->search->breakIter = breakiter;
2954 if (breakiter) {
2955 ubrk_setText(breakiter, strsrch->search->text,
2956 strsrch->search->textLength, status);
2957 }
2958 }
2959}
2960
2961U_CAPI const UBreakIterator* U_EXPORT2
2962usearch_getBreakIterator(const UStringSearch *strsrch)
2963{
2964 if (strsrch) {
2965 return strsrch->search->breakIter;
2966 }
2967 return NULL;
2968}
2969
2970#endif
2971
2972U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2973 const UChar *text,
2974 int32_t textlength,
2975 UErrorCode *status)
2976{
2977 if (U_SUCCESS(*status)) {
2978 if (strsrch == NULL || text == NULL || textlength < -1 ||
2979 textlength == 0) {
2980 *status = U_ILLEGAL_ARGUMENT_ERROR;
2981 }
2982 else {
2983 if (textlength == -1) {
2984 textlength = u_strlen(text);
2985 }
2986 strsrch->search->text = text;
2987 strsrch->search->textLength = textlength;
2988 ucol_setText(strsrch->textIter, text, textlength, status);
2989 strsrch->search->matchedIndex = USEARCH_DONE;
2990 strsrch->search->matchedLength = 0;
2991 strsrch->search->reset = TRUE;
2992#if !UCONFIG_NO_BREAK_ITERATION
2993 if (strsrch->search->breakIter != NULL) {
2994 ubrk_setText(strsrch->search->breakIter, text,
2995 textlength, status);
2996 }
2997 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2998#endif
2999 }
3000 }
3001}
3002
3003U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3004 int32_t *length)
3005{
3006 if (strsrch) {
3007 *length = strsrch->search->textLength;
3008 return strsrch->search->text;
3009 }
3010 return NULL;
3011}
3012
3013U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3014 const UCollator *collator,
3015 UErrorCode *status)
3016{
3017 if (U_SUCCESS(*status)) {
3018 if (collator == NULL) {
3019 *status = U_ILLEGAL_ARGUMENT_ERROR;
3020 return;
3021 }
3022
3023 if (strsrch) {
3024 delete strsrch->textProcessedIter;
3025 strsrch->textProcessedIter = NULL;
3026 ucol_closeElements(strsrch->textIter);
3027 ucol_closeElements(strsrch->utilIter);
3028 strsrch->textIter = strsrch->utilIter = NULL;
3029 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3030 ucol_close((UCollator *)strsrch->collator);
3031 strsrch->ownCollator = FALSE;
3032 }
3033 strsrch->collator = collator;
3034 strsrch->strength = ucol_getStrength(collator);
3035 strsrch->ceMask = getMask(strsrch->strength);
3036#if !UCONFIG_NO_BREAK_ITERATION
3037 ubrk_close(strsrch->search->internalBreakIter);
3038 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3039 strsrch->search->text, strsrch->search->textLength, status);
3040#endif
3041 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3042 strsrch->toShift =
3043 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3044 UCOL_SHIFTED;
3045 // if status is a failure, ucol_getVariableTop returns 0
3046 strsrch->variableTop = ucol_getVariableTop(collator, status);
3047 strsrch->textIter = ucol_openElements(collator,
3048 strsrch->search->text,
3049 strsrch->search->textLength,
3050 status);
3051 strsrch->utilIter = ucol_openElements(
3052 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3053 // initialize() _after_ setting the iterators for the new collator.
3054 initialize(strsrch, status);
3055 }
3056
3057 // **** are these calls needed?
3058 // **** we call uprv_init_pce in initializePatternPCETable
3059 // **** and the CEIBuffer constructor...
3060#if 0
3061 uprv_init_pce(strsrch->textIter);
3062 uprv_init_pce(strsrch->utilIter);
3063#endif
3064 }
3065}
3066
3067U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3068{
3069 if (strsrch) {
3070 return (UCollator *)strsrch->collator;
3071 }
3072 return NULL;
3073}
3074
3075U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3076 const UChar *pattern,
3077 int32_t patternlength,
3078 UErrorCode *status)
3079{
3080 if (U_SUCCESS(*status)) {
3081 if (strsrch == NULL || pattern == NULL) {
3082 *status = U_ILLEGAL_ARGUMENT_ERROR;
3083 }
3084 else {
3085 if (patternlength == -1) {
3086 patternlength = u_strlen(pattern);
3087 }
3088 if (patternlength == 0) {
3089 *status = U_ILLEGAL_ARGUMENT_ERROR;
3090 return;
3091 }
3092 strsrch->pattern.text = pattern;
3093 strsrch->pattern.textLength = patternlength;
3094 initialize(strsrch, status);
3095 }
3096 }
3097}
3098
3099U_CAPI const UChar* U_EXPORT2
3100usearch_getPattern(const UStringSearch *strsrch,
3101 int32_t *length)
3102{
3103 if (strsrch) {
3104 *length = strsrch->pattern.textLength;
3105 return strsrch->pattern.text;
3106 }
3107 return NULL;
3108}
3109
3110// miscellanous methods --------------------------------------------------
3111
3112U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3113 UErrorCode *status)
3114{
3115 if (strsrch && U_SUCCESS(*status)) {
3116 strsrch->search->isForwardSearching = TRUE;
3117 usearch_setOffset(strsrch, 0, status);
3118 if (U_SUCCESS(*status)) {
3119 return usearch_next(strsrch, status);
3120 }
3121 }
3122 return USEARCH_DONE;
3123}
3124
3125U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3126 int32_t position,
3127 UErrorCode *status)
3128{
3129 if (strsrch && U_SUCCESS(*status)) {
3130 strsrch->search->isForwardSearching = TRUE;
3131 // position checked in usearch_setOffset
3132 usearch_setOffset(strsrch, position, status);
3133 if (U_SUCCESS(*status)) {
3134 return usearch_next(strsrch, status);
3135 }
3136 }
3137 return USEARCH_DONE;
3138}
3139
3140U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3141 UErrorCode *status)
3142{
3143 if (strsrch && U_SUCCESS(*status)) {
3144 strsrch->search->isForwardSearching = FALSE;
3145 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3146 if (U_SUCCESS(*status)) {
3147 return usearch_previous(strsrch, status);
3148 }
3149 }
3150 return USEARCH_DONE;
3151}
3152
3153U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3154 int32_t position,
3155 UErrorCode *status)
3156{
3157 if (strsrch && U_SUCCESS(*status)) {
3158 strsrch->search->isForwardSearching = FALSE;
3159 // position checked in usearch_setOffset
3160 usearch_setOffset(strsrch, position, status);
3161 if (U_SUCCESS(*status)) {
3162 return usearch_previous(strsrch, status);
3163 }
3164 }
3165 return USEARCH_DONE;
3166}
3167
3168/**
3169* If a direction switch is required, we'll count the number of ces till the
3170* beginning of the collation element iterator and iterate forwards that
3171* number of times. This is so that we get to the correct point within the
3172* string to continue the search in. Imagine when we are in the middle of the
3173* normalization buffer when the change in direction is request. arrrgghh....
3174* After searching the offset within the collation element iterator will be
3175* shifted to the start of the match. If a match is not found, the offset would
3176* have been set to the end of the text string in the collation element
3177* iterator.
3178* Okay, here's my take on normalization buffer. The only time when there can
3179* be 2 matches within the same normalization is when the pattern is consists
3180* of all accents. But since the offset returned is from the text string, we
3181* should not confuse the caller by returning the second match within the
3182* same normalization buffer. If we do, the 2 results will have the same match
3183* offsets, and that'll be confusing. I'll return the next match that doesn't
3184* fall within the same normalization buffer. Note this does not affect the
3185* results of matches spanning the text and the normalization buffer.
3186* The position to start searching is taken from the collation element
3187* iterator. Callers of this API would have to set the offset in the collation
3188* element iterator before using this method.
3189*/
3190U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3191 UErrorCode *status)
3192{
3193 if (U_SUCCESS(*status) && strsrch) {
3194 // note offset is either equivalent to the start of the previous match
3195 // or is set by the user
3196 int32_t offset = usearch_getOffset(strsrch);
3197 USearch *search = strsrch->search;
3198 search->reset = FALSE;
3199 int32_t textlength = search->textLength;
3200 if (search->isForwardSearching) {
3201#if BOYER_MOORE
3202 if (offset == textlength
3203 || (!search->isOverlap &&
3204 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3205 (search->matchedIndex != USEARCH_DONE &&
3206 offset + search->matchedLength >= textlength)))) {
3207 // not enough characters to match
3208 setMatchNotFound(strsrch);
3209 return USEARCH_DONE;
3210 }
3211#else
3212 if (offset == textlength ||
3213 (! search->isOverlap &&
3214 (search->matchedIndex != USEARCH_DONE &&
3215 offset + search->matchedLength > textlength))) {
3216 // not enough characters to match
3217 setMatchNotFound(strsrch);
3218 return USEARCH_DONE;
3219 }
3220#endif
3221 }
3222 else {
3223 // switching direction.
3224 // if matchedIndex == USEARCH_DONE, it means that either a
3225 // setOffset has been called or that previous ran off the text
3226 // string. the iterator would have been set to offset 0 if a
3227 // match is not found.
3228 search->isForwardSearching = TRUE;
3229 if (search->matchedIndex != USEARCH_DONE) {
3230 // there's no need to set the collation element iterator
3231 // the next call to next will set the offset.
3232 return search->matchedIndex;
3233 }
3234 }
3235
3236 if (U_SUCCESS(*status)) {
3237 if (strsrch->pattern.cesLength == 0) {
3238 if (search->matchedIndex == USEARCH_DONE) {
3239 search->matchedIndex = offset;
3240 }
3241 else { // moves by codepoints
3242 U16_FWD_1(search->text, search->matchedIndex, textlength);
3243 }
3244
3245 search->matchedLength = 0;
3246 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3247 // status checked below
3248 if (search->matchedIndex == textlength) {
3249 search->matchedIndex = USEARCH_DONE;
3250 }
3251 }
3252 else {
3253 if (search->matchedLength > 0) {
3254 // if matchlength is 0 we are at the start of the iteration
3255 if (search->isOverlap) {
3256 ucol_setOffset(strsrch->textIter, offset + 1, status);
3257 }
3258 else {
3259 ucol_setOffset(strsrch->textIter,
3260 offset + search->matchedLength, status);
3261 }
3262 }
3263 else {
3264 // for boundary check purposes. this will ensure that the
3265 // next match will not preceed the current offset
3266 // note search->matchedIndex will always be set to something
3267 // in the code
3268 search->matchedIndex = offset - 1;
3269 }
3270
3271 if (search->isCanonicalMatch) {
3272 // can't use exact here since extra accents are allowed.
3273 usearch_handleNextCanonical(strsrch, status);
3274 }
3275 else {
3276 usearch_handleNextExact(strsrch, status);
3277 }
3278 }
3279
3280 if (U_FAILURE(*status)) {
3281 return USEARCH_DONE;
3282 }
3283
3284#if !BOYER_MOORE
3285 if (search->matchedIndex == USEARCH_DONE) {
3286 ucol_setOffset(strsrch->textIter, search->textLength, status);
3287 } else {
3288 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3289 }
3290#endif
3291
3292 return search->matchedIndex;
3293 }
3294 }
3295 return USEARCH_DONE;
3296}
3297
3298U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3299 UErrorCode *status)
3300{
3301 if (U_SUCCESS(*status) && strsrch) {
3302 int32_t offset;
3303 USearch *search = strsrch->search;
3304 if (search->reset) {
3305 offset = search->textLength;
3306 search->isForwardSearching = FALSE;
3307 search->reset = FALSE;
3308 setColEIterOffset(strsrch->textIter, offset);
3309 }
3310 else {
3311 offset = usearch_getOffset(strsrch);
3312 }
3313
3314 int32_t matchedindex = search->matchedIndex;
3315 if (search->isForwardSearching == TRUE) {
3316 // switching direction.
3317 // if matchedIndex == USEARCH_DONE, it means that either a
3318 // setOffset has been called or that next ran off the text
3319 // string. the iterator would have been set to offset textLength if
3320 // a match is not found.
3321 search->isForwardSearching = FALSE;
3322 if (matchedindex != USEARCH_DONE) {
3323 return matchedindex;
3324 }
3325 }
3326 else {
3327#if BOYER_MOORE
3328 if (offset == 0 || matchedindex == 0 ||
3329 (!search->isOverlap &&
3330 (offset < strsrch->pattern.defaultShiftSize ||
3331 (matchedindex != USEARCH_DONE &&
3332 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3333 // not enough characters to match
3334 setMatchNotFound(strsrch);
3335 return USEARCH_DONE;
3336 }
3337#else
3338 // Could check pattern length, but the
3339 // linear search will do the right thing
3340 if (offset == 0 || matchedindex == 0) {
3341 setMatchNotFound(strsrch);
3342 return USEARCH_DONE;
3343 }
3344#endif
3345 }
3346
3347 if (U_SUCCESS(*status)) {
3348 if (strsrch->pattern.cesLength == 0) {
3349 search->matchedIndex =
3350 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3351 if (search->matchedIndex == 0) {
3352 setMatchNotFound(strsrch);
3353 // status checked below
3354 }
3355 else { // move by codepoints
3356 U16_BACK_1(search->text, 0, search->matchedIndex);
3357 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3358 // status checked below
3359 search->matchedLength = 0;
3360 }
3361 }
3362 else {
3363 if (strsrch->search->isCanonicalMatch) {
3364 // can't use exact here since extra accents are allowed.
3365 usearch_handlePreviousCanonical(strsrch, status);
3366 // status checked below
3367 }
3368 else {
3369 usearch_handlePreviousExact(strsrch, status);
3370 // status checked below
3371 }
3372 }
3373
3374 if (U_FAILURE(*status)) {
3375 return USEARCH_DONE;
3376 }
3377
3378 return search->matchedIndex;
3379 }
3380 }
3381 return USEARCH_DONE;
3382}
3383
3384
3385
3386U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387{
3388 /*
3389 reset is setting the attributes that are already in
3390 string search, hence all attributes in the collator should
3391 be retrieved without any problems
3392 */
3393 if (strsrch) {
3394 UErrorCode status = U_ZERO_ERROR;
3395 UBool sameCollAttribute = TRUE;
3396 uint32_t ceMask;
3397 UBool shift;
3398 uint32_t varTop;
3399
3400 // **** hack to deal w/ how processed CEs encode quaternary ****
3401 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404 sameCollAttribute = FALSE;
3405 }
3406
3407 strsrch->strength = ucol_getStrength(strsrch->collator);
3408 ceMask = getMask(strsrch->strength);
3409 if (strsrch->ceMask != ceMask) {
3410 strsrch->ceMask = ceMask;
3411 sameCollAttribute = FALSE;
3412 }
3413
3414 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416 &status) == UCOL_SHIFTED;
3417 if (strsrch->toShift != shift) {
3418 strsrch->toShift = shift;
3419 sameCollAttribute = FALSE;
3420 }
3421
3422 // if status is a failure, ucol_getVariableTop returns 0
3423 varTop = ucol_getVariableTop(strsrch->collator, &status);
3424 if (strsrch->variableTop != varTop) {
3425 strsrch->variableTop = varTop;
3426 sameCollAttribute = FALSE;
3427 }
3428 if (!sameCollAttribute) {
3429 initialize(strsrch, &status);
3430 }
3431 ucol_setText(strsrch->textIter, strsrch->search->text,
3432 strsrch->search->textLength,
3433 &status);
3434 strsrch->search->matchedLength = 0;
3435 strsrch->search->matchedIndex = USEARCH_DONE;
3436 strsrch->search->isOverlap = FALSE;
3437 strsrch->search->isCanonicalMatch = FALSE;
3438 strsrch->search->elementComparisonType = 0;
3439 strsrch->search->isForwardSearching = TRUE;
3440 strsrch->search->reset = TRUE;
3441 }
3442}
3443
3444//
3445// CEI Collation Element + source text index.
3446// These structs are kept in the circular buffer.
3447//
3448struct CEI {
3449 int64_t ce;
3450 int32_t lowIndex;
3451 int32_t highIndex;
3452};
3453
3454U_NAMESPACE_BEGIN
3455
3456namespace {
3457//
3458// CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3459//
3460#define DEFAULT_CEBUFFER_SIZE 96
3461#define CEBUFFER_EXTRA 32
3462// Some typical max values to make buffer size more reasonable for asymmetric search.
3463// #8694 is for a better long-term solution to allocation of this buffer.
3464#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3467struct CEIBuffer {
3468 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3469 CEI *buf;
3470 int32_t bufSize;
3471 int32_t firstIx;
3472 int32_t limitIx;
3473 UCollationElements *ceIter;
3474 UStringSearch *strSearch;
3475
3476
3477
3478 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3479 ~CEIBuffer();
3480 const CEI *get(int32_t index);
3481 const CEI *getPrevious(int32_t index);
3482};
3483
3484
3485CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3486 buf = defBuf;
3487 strSearch = ss;
3488 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3489 if (ss->search->elementComparisonType != 0) {
3490 const UChar * patText = ss->pattern.text;
3491 if (patText) {
3492 const UChar * patTextLimit = patText + ss->pattern.textLength;
3493 while ( patText < patTextLimit ) {
3494 UChar c = *patText++;
3495 if (MIGHT_BE_JAMO_L(c)) {
3496 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3497 } else {
3498 // No check for surrogates, we might allocate slightly more buffer than necessary.
3499 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3500 }
3501 }
3502 }
3503 }
3504 ceIter = ss->textIter;
3505 firstIx = 0;
3506 limitIx = 0;
3507
3508 if (!initTextProcessedIter(ss, status)) { return; }
3509
3510 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3511 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3512 if (buf == NULL) {
3513 *status = U_MEMORY_ALLOCATION_ERROR;
3514 }
3515 }
3516}
3517
3518// TODO: add a reset or init function so that allocated
3519// buffers can be retained & reused.
3520
3521CEIBuffer::~CEIBuffer() {
3522 if (buf != defBuf) {
3523 uprv_free(buf);
3524 }
3525}
3526
3527
3528// Get the CE with the specified index.
3529// Index must be in the range
3530// n-history_size < index < n+1
3531// where n is the largest index to have been fetched by some previous call to this function.
3532// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3533//
3534const CEI *CEIBuffer::get(int32_t index) {
3535 int i = index % bufSize;
3536
3537 if (index>=firstIx && index<limitIx) {
3538 // The request was for an entry already in our buffer.
3539 // Just return it.
3540 return &buf[i];
3541 }
3542
3543 // Caller is requesting a new, never accessed before, CE.
3544 // Verify that it is the next one in sequence, which is all
3545 // that is allowed.
3546 if (index != limitIx) {
3547 U_ASSERT(FALSE);
3548 // TODO: In ICU 64 the above assert was changed to use UPRV_UNREACHABLE instead
3549 // which unconditionally calls abort(). However, there were cases where this was
3550 // being hit. This change is reverted for now, restoring the existing behavior.
3551 // ICU-20792 tracks the follow-up work/further investigation on this.
3552 return NULL;
3553 }
3554
3555 // Manage the circular CE buffer indexing
3556 limitIx++;
3557
3558 if (limitIx - firstIx >= bufSize) {
3559 // The buffer is full, knock out the lowest-indexed entry.
3560 firstIx++;
3561 }
3562
3563 UErrorCode status = U_ZERO_ERROR;
3564
3565 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3566
3567 return &buf[i];
3568}
3569
3570// Get the CE with the specified index.
3571// Index must be in the range
3572// n-history_size < index < n+1
3573// where n is the largest index to have been fetched by some previous call to this function.
3574// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3575//
3576const CEI *CEIBuffer::getPrevious(int32_t index) {
3577 int i = index % bufSize;
3578
3579 if (index>=firstIx && index<limitIx) {
3580 // The request was for an entry already in our buffer.
3581 // Just return it.
3582 return &buf[i];
3583 }
3584
3585 // Caller is requesting a new, never accessed before, CE.
3586 // Verify that it is the next one in sequence, which is all
3587 // that is allowed.
3588 if (index != limitIx) {
3589 U_ASSERT(FALSE);
3590 // TODO: In ICU 64 the above assert was changed to use UPRV_UNREACHABLE instead
3591 // which unconditionally calls abort(). However, there were cases where this was
3592 // being hit. This change is reverted for now, restoring the existing behavior.
3593 // ICU-20792 tracks the follow-up work/further investigation on this.
3594 return NULL;
3595 }
3596
3597 // Manage the circular CE buffer indexing
3598 limitIx++;
3599
3600 if (limitIx - firstIx >= bufSize) {
3601 // The buffer is full, knock out the lowest-indexed entry.
3602 firstIx++;
3603 }
3604
3605 UErrorCode status = U_ZERO_ERROR;
3606
3607 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3608
3609 return &buf[i];
3610}
3611
3612}
3613
3614U_NAMESPACE_END
3615
3616
3617// #define USEARCH_DEBUG
3618
3619#ifdef USEARCH_DEBUG
3620#include <stdio.h>
3621#include <stdlib.h>
3622#endif
3623
3624/*
3625 * Find the next break boundary after startIndex. If the UStringSearch object
3626 * has an external break iterator, use that. Otherwise use the internal character
3627 * break iterator.
3628 */
3629static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3630#if 0
3631 const UChar *text = strsrch->search->text;
3632 int32_t textLen = strsrch->search->textLength;
3633
3634 U_ASSERT(startIndex>=0);
3635 U_ASSERT(startIndex<=textLen);
3636
3637 if (startIndex >= textLen) {
3638 return startIndex;
3639 }
3640
3641 UChar32 c;
3642 int32_t i = startIndex;
3643 U16_NEXT(text, i, textLen, c);
3644
3645 // If we are on a control character, stop without looking for combining marks.
3646 // Control characters do not combine.
3647 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3648 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3649 return i;
3650 }
3651
3652 // The initial character was not a control, and can thus accept trailing
3653 // combining characters. Advance over however many of them there are.
3654 int32_t indexOfLastCharChecked;
3655 for (;;) {
3656 indexOfLastCharChecked = i;
3657 if (i>=textLen) {
3658 break;
3659 }
3660 U16_NEXT(text, i, textLen, c);
3661 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3662 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3663 break;
3664 }
3665 }
3666 return indexOfLastCharChecked;
3667#elif !UCONFIG_NO_BREAK_ITERATION
3668 UBreakIterator *breakiterator = strsrch->search->breakIter;
3669
3670 if (breakiterator == NULL) {
3671 breakiterator = strsrch->search->internalBreakIter;
3672 }
3673
3674 if (breakiterator != NULL) {
3675 return ubrk_following(breakiterator, startIndex);
3676 }
3677
3678 return startIndex;
3679#else
3680 // **** or should we use the original code? ****
3681 return startIndex;
3682#endif
3683
3684}
3685
3686/*
3687 * Returns TRUE if index is on a break boundary. If the UStringSearch
3688 * has an external break iterator, test using that, otherwise test
3689 * using the internal character break iterator.
3690 */
3691static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3692#if 0
3693 const UChar *text = strsrch->search->text;
3694 int32_t textLen = strsrch->search->textLength;
3695
3696 U_ASSERT(index>=0);
3697 U_ASSERT(index<=textLen);
3698
3699 if (index>=textLen || index<=0) {
3700 return TRUE;
3701 }
3702
3703 // If the character at the current index is not a GRAPHEME_EXTEND
3704 // then we can not be within a combining sequence.
3705 UChar32 c;
3706 U16_GET(text, 0, index, textLen, c);
3707 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3708 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3709 return TRUE;
3710 }
3711
3712 // We are at a combining mark. If the preceding character is anything
3713 // except a CONTROL, CR or LF, we are in a combining sequence.
3714 U16_PREV(text, 0, index, c);
3715 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3716 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3717 return !combining;
3718#elif !UCONFIG_NO_BREAK_ITERATION
3719 UBreakIterator *breakiterator = strsrch->search->breakIter;
3720
3721 if (breakiterator == NULL) {
3722 breakiterator = strsrch->search->internalBreakIter;
3723 }
3724
3725 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3726#else
3727 // **** or use the original code? ****
3728 return TRUE;
3729#endif
3730}
3731
3732#if 0
3733static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3734{
3735#if !UCONFIG_NO_BREAK_ITERATION
3736 UBreakIterator *breakiterator = strsrch->search->breakIter;
3737
3738 if (breakiterator != NULL) {
3739 int32_t startindex = ubrk_first(breakiterator);
3740 int32_t endindex = ubrk_last(breakiterator);
3741
3742 // out-of-range indexes are never boundary positions
3743 if (start < startindex || start > endindex ||
3744 end < startindex || end > endindex) {
3745 return FALSE;
3746 }
3747
3748 return ubrk_isBoundary(breakiterator, start) &&
3749 ubrk_isBoundary(breakiterator, end);
3750 }
3751#endif
3752
3753 return TRUE;
3754}
3755#endif
3756
3757typedef enum {
3758 U_CE_MATCH = -1,
3759 U_CE_NO_MATCH = 0,
3760 U_CE_SKIP_TARG,
3761 U_CE_SKIP_PATN
3762} UCompareCEsResult;
3763#define U_CE_LEVEL2_BASE 0x00000005
3764#define U_CE_LEVEL3_BASE 0x00050000
3765
3766static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3767 if (targCE == patCE) {
3768 return U_CE_MATCH;
3769 }
3770 if (compareType == 0) {
3771 return U_CE_NO_MATCH;
3772 }
3773
3774 int64_t targCEshifted = targCE >> 32;
3775 int64_t patCEshifted = patCE >> 32;
3776 int64_t mask;
3777
3778 mask = 0xFFFF0000;
3779 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3780 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3781 if ( targLev1 != patLev1 ) {
3782 if ( targLev1 == 0 ) {
3783 return U_CE_SKIP_TARG;
3784 }
3785 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3786 return U_CE_SKIP_PATN;
3787 }
3788 return U_CE_NO_MATCH;
3789 }
3790
3791 mask = 0x0000FFFF;
3792 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3793 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3794 if ( targLev2 != patLev2 ) {
3795 if ( targLev2 == 0 ) {
3796 return U_CE_SKIP_TARG;
3797 }
3798 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3799 return U_CE_SKIP_PATN;
3800 }
3801 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3802 U_CE_MATCH: U_CE_NO_MATCH;
3803 }
3804
3805 mask = 0xFFFF0000;
3806 int32_t targLev3 = (int32_t)(targCE & mask);
3807 int32_t patLev3 = (int32_t)(patCE & mask);
3808 if ( targLev3 != patLev3 ) {
3809 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3810 U_CE_MATCH: U_CE_NO_MATCH;
3811 }
3812
3813 return U_CE_MATCH;
3814}
3815
3816#if BOYER_MOORE
3817// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3818#endif
3819
3820namespace {
3821
3822UChar32 codePointAt(const USearch &search, int32_t index) {
3823 if (index < search.textLength) {
3824 UChar32 c;
3825 U16_NEXT(search.text, index, search.textLength, c);
3826 return c;
3827 }
3828 return U_SENTINEL;
3829}
3830
3831UChar32 codePointBefore(const USearch &search, int32_t index) {
3832 if (0 < index) {
3833 UChar32 c;
3834 U16_PREV(search.text, 0, index, c);
3835 return c;
3836 }
3837 return U_SENTINEL;
3838}
3839
3840} // namespace
3841
3842U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3843 int32_t startIdx,
3844 int32_t *matchStart,
3845 int32_t *matchLimit,
3846 UErrorCode *status)
3847{
3848 if (U_FAILURE(*status)) {
3849 return FALSE;
3850 }
3851
3852 // TODO: reject search patterns beginning with a combining char.
3853
3854#ifdef USEARCH_DEBUG
3855 if (getenv("USEARCH_DEBUG") != NULL) {
3856 printf("Pattern CEs\n");
3857 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3858 printf(" %8x", strsrch->pattern.ces[ii]);
3859 }
3860 printf("\n");
3861 }
3862
3863#endif
3864 // Input parameter sanity check.
3865 // TODO: should input indices clip to the text length
3866 // in the same way that UText does.
3867 if(strsrch->pattern.cesLength == 0 ||
3868 startIdx < 0 ||
3869 startIdx > strsrch->search->textLength ||
3870 strsrch->pattern.ces == NULL) {
3871 *status = U_ILLEGAL_ARGUMENT_ERROR;
3872 return FALSE;
3873 }
3874
3875 if (strsrch->pattern.pces == NULL) {
3876 initializePatternPCETable(strsrch, status);
3877 }
3878
3879 ucol_setOffset(strsrch->textIter, startIdx, status);
3880 CEIBuffer ceb(strsrch, status);
3881
3882
3883 int32_t targetIx = 0;
3884 const CEI *targetCEI = NULL;
3885 int32_t patIx;
3886 UBool found;
3887
3888 int32_t mStart = -1;
3889 int32_t mLimit = -1;
3890 int32_t minLimit;
3891 int32_t maxLimit;
3892
3893
3894
3895 // Outer loop moves over match starting positions in the
3896 // target CE space.
3897 // Here we see the target as a sequence of collation elements, resulting from the following:
3898 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3899 // (for example, digraphs such as IJ may be broken into two characters).
3900 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3901 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3902 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3903 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3904 // then the CE is deleted, so the following code sees only CEs that are relevant.
3905 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3906 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3907 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3908 //
3909 for(targetIx=0; ; targetIx++)
3910 {
3911 found = TRUE;
3912 // Inner loop checks for a match beginning at each
3913 // position from the outer loop.
3914 int32_t targetIxOffset = 0;
3915 int64_t patCE = 0;
3916 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3917 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3918 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3919 const CEI *firstCEI = ceb.get(targetIx);
3920 if (firstCEI == NULL) {
3921 *status = U_INTERNAL_PROGRAM_ERROR;
3922 found = FALSE;
3923 break;
3924 }
3925
3926 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3927 patCE = strsrch->pattern.pces[patIx];
3928 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3929 // Compare CE from target string with CE from the pattern.
3930 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3931 // which will fail the compare, below.
3932 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3933 if ( ceMatch == U_CE_NO_MATCH ) {
3934 found = FALSE;
3935 break;
3936 } else if ( ceMatch > U_CE_NO_MATCH ) {
3937 if ( ceMatch == U_CE_SKIP_TARG ) {
3938 // redo with same patCE, next targCE
3939 patIx--;
3940 targetIxOffset++;
3941 } else { // ceMatch == U_CE_SKIP_PATN
3942 // redo with same targCE, next patCE
3943 targetIxOffset--;
3944 }
3945 }
3946 }
3947 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3948
3949 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3950 // No match at this targetIx. Try again at the next.
3951 continue;
3952 }
3953
3954 if (!found) {
3955 // No match at all, we have run off the end of the target text.
3956 break;
3957 }
3958
3959
3960 // We have found a match in CE space.
3961 // Now determine the bounds in string index space.
3962 // There still is a chance of match failure if the CE range not correspond to
3963 // an acceptable character range.
3964 //
3965 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
3966
3967 mStart = firstCEI->lowIndex;
3968 minLimit = lastCEI->lowIndex;
3969
3970 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3971 // extended to the end of input, and the match is good.
3972
3973 // Look at the high and low indices of the CE following the match. If
3974 // they are the same it means one of two things:
3975 // 1. The match extended to the last CE from the target text, which is OK, or
3976 // 2. The last CE that was part of the match is in an expansion that extends
3977 // to the first CE after the match. In this case, we reject the match.
3978 const CEI *nextCEI = 0;
3979 if (strsrch->search->elementComparisonType == 0) {
3980 nextCEI = ceb.get(targetIx + targetIxOffset);
3981 maxLimit = nextCEI->lowIndex;
3982 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3983 found = FALSE;
3984 }
3985 } else {
3986 for ( ; ; ++targetIxOffset ) {
3987 nextCEI = ceb.get(targetIx + targetIxOffset);
3988 maxLimit = nextCEI->lowIndex;
3989 // If we are at the end of the target too, match succeeds
3990 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3991 break;
3992 }
3993 // As long as the next CE has primary weight of 0,
3994 // it is part of the last target element matched by the pattern;
3995 // make sure it can be part of a match with the last patCE
3996 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3997 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3998 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3999 found = FALSE;
4000 break;
4001 }
4002 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
4003 // target element, but it has non-zero primary weight => match fails
4004 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4005 found = false;
4006 break;
4007 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4008 } else {
4009 break;
4010 }
4011 }
4012 }
4013
4014
4015 // Check for the start of the match being within a combining sequence.
4016 // This can happen if the pattern itself begins with a combining char, and
4017 // the match found combining marks in the target text that were attached
4018 // to something else.
4019 // This type of match should be rejected for not completely consuming a
4020 // combining sequence.
4021 if (!isBreakBoundary(strsrch, mStart)) {
4022 found = FALSE;
4023 }
4024
4025 // Check for the start of the match being within an Collation Element Expansion,
4026 // meaning that the first char of the match is only partially matched.
4027 // With expansions, the first CE will report the index of the source
4028 // character, and all subsequent (expansions) CEs will report the source index of the
4029 // _following_ character.
4030 int32_t secondIx = firstCEI->highIndex;
4031 if (mStart == secondIx) {
4032 found = FALSE;
4033 }
4034
4035 // Allow matches to end in the middle of a grapheme cluster if the following
4036 // conditions are met; this is needed to make prefix search work properly in
4037 // Indic, see #11750
4038 // * the default breakIter is being used
4039 // * the next collation element after this combining sequence
4040 // - has non-zero primary weight
4041 // - corresponds to a separate character following the one at end of the current match
4042 // (the second of these conditions, and perhaps both, may be redundant given the
4043 // subsequent check for normalization boundary; however they are likely much faster
4044 // tests in any case)
4045 // * the match limit is a normalization boundary
4046 UBool allowMidclusterMatch = FALSE;
4047 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4048 allowMidclusterMatch =
4049 strsrch->search->breakIter == NULL &&
4050 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4051 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4052 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4053 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4054 }
4055 // If those conditions are met, then:
4056 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4057 // the match limit may be backed off to a previous break boundary. This handles
4058 // cases in which mLimit includes target characters that are ignorable with current
4059 // settings (such as space) and which extend beyond the pattern match.
4060 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4061 // * do NOT require that match limit be on a breakIter boundary
4062
4063 // Advance the match end position to the first acceptable match boundary.
4064 // This advances the index over any combining charcters.
4065 mLimit = maxLimit;
4066 if (minLimit < maxLimit) {
4067 // When the last CE's low index is same with its high index, the CE is likely
4068 // a part of expansion. In this case, the index is located just after the
4069 // character corresponding to the CEs compared above. If the index is right
4070 // at the break boundary, move the position to the next boundary will result
4071 // incorrect match length when there are ignorable characters exist between
4072 // the position and the next character produces CE(s). See ticket#8482.
4073 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4074 mLimit = minLimit;
4075 } else {
4076 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4077 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4078 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4079 // (i.e. we back off mLimit to the previous breakIterator boundary).
4080 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4081 mLimit = nba;
4082 }
4083 }
4084 }
4085
4086 #ifdef USEARCH_DEBUG
4087 if (getenv("USEARCH_DEBUG") != NULL) {
4088 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4089 }
4090 #endif
4091
4092 if (!allowMidclusterMatch) {
4093 // If advancing to the end of a combining sequence in character indexing space
4094 // advanced us beyond the end of the match in CE space, reject this match.
4095 if (mLimit > maxLimit) {
4096 found = FALSE;
4097 }
4098
4099 if (!isBreakBoundary(strsrch, mLimit)) {
4100 found = FALSE;
4101 }
4102 }
4103
4104 if (! checkIdentical(strsrch, mStart, mLimit)) {
4105 found = FALSE;
4106 }
4107
4108 if (found) {
4109 break;
4110 }
4111 }
4112
4113 #ifdef USEARCH_DEBUG
4114 if (getenv("USEARCH_DEBUG") != NULL) {
4115 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4116 int32_t lastToPrint = ceb.limitIx+2;
4117 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4118 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4119 }
4120 printf("\n%s\n", found? "match found" : "no match");
4121 }
4122 #endif
4123
4124 // All Done. Store back the match bounds to the caller.
4125 //
4126 if (found==FALSE) {
4127 mLimit = -1;
4128 mStart = -1;
4129 }
4130
4131 if (matchStart != NULL) {
4132 *matchStart= mStart;
4133 }
4134
4135 if (matchLimit != NULL) {
4136 *matchLimit = mLimit;
4137 }
4138
4139 return found;
4140}
4141
4142U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4143 int32_t startIdx,
4144 int32_t *matchStart,
4145 int32_t *matchLimit,
4146 UErrorCode *status)
4147{
4148 if (U_FAILURE(*status)) {
4149 return FALSE;
4150 }
4151
4152 // TODO: reject search patterns beginning with a combining char.
4153
4154#ifdef USEARCH_DEBUG
4155 if (getenv("USEARCH_DEBUG") != NULL) {
4156 printf("Pattern CEs\n");
4157 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4158 printf(" %8x", strsrch->pattern.ces[ii]);
4159 }
4160 printf("\n");
4161 }
4162
4163#endif
4164 // Input parameter sanity check.
4165 // TODO: should input indicies clip to the text length
4166 // in the same way that UText does.
4167 if(strsrch->pattern.cesLength == 0 ||
4168 startIdx < 0 ||
4169 startIdx > strsrch->search->textLength ||
4170 strsrch->pattern.ces == NULL) {
4171 *status = U_ILLEGAL_ARGUMENT_ERROR;
4172 return FALSE;
4173 }
4174
4175 if (strsrch->pattern.pces == NULL) {
4176 initializePatternPCETable(strsrch, status);
4177 }
4178
4179 CEIBuffer ceb(strsrch, status);
4180 int32_t targetIx = 0;
4181
4182 /*
4183 * Pre-load the buffer with the CE's for the grapheme
4184 * after our starting position so that we're sure that
4185 * we can look at the CE following the match when we
4186 * check the match boundaries.
4187 *
4188 * This will also pre-fetch the first CE that we'll
4189 * consider for the match.
4190 */
4191 if (startIdx < strsrch->search->textLength) {
4192 UBreakIterator *bi = strsrch->search->internalBreakIter;
4193 int32_t next = ubrk_following(bi, startIdx);
4194
4195 ucol_setOffset(strsrch->textIter, next, status);
4196
4197 for (targetIx = 0; ; targetIx += 1) {
4198 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4199 break;
4200 }
4201 }
4202 } else {
4203 ucol_setOffset(strsrch->textIter, startIdx, status);
4204 }
4205
4206
4207 const CEI *targetCEI = NULL;
4208 int32_t patIx;
4209 UBool found;
4210
4211 int32_t limitIx = targetIx;
4212 int32_t mStart = -1;
4213 int32_t mLimit = -1;
4214 int32_t minLimit;
4215 int32_t maxLimit;
4216
4217
4218
4219 // Outer loop moves over match starting positions in the
4220 // target CE space.
4221 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4222 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4223 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4224 // and the beginning of the base text.
4225 for(targetIx = limitIx; ; targetIx += 1)
4226 {
4227 found = TRUE;
4228 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4229 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4230 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4231 const CEI *lastCEI = ceb.getPrevious(targetIx);
4232 if (lastCEI == NULL) {
4233 *status = U_INTERNAL_PROGRAM_ERROR;
4234 found = FALSE;
4235 break;
4236 }
4237 // Inner loop checks for a match beginning at each
4238 // position from the outer loop.
4239 int32_t targetIxOffset = 0;
4240 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4241 int64_t patCE = strsrch->pattern.pces[patIx];
4242
4243 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4244 // Compare CE from target string with CE from the pattern.
4245 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4246 // which will fail the compare, below.
4247 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4248 if ( ceMatch == U_CE_NO_MATCH ) {
4249 found = FALSE;
4250 break;
4251 } else if ( ceMatch > U_CE_NO_MATCH ) {
4252 if ( ceMatch == U_CE_SKIP_TARG ) {
4253 // redo with same patCE, next targCE
4254 patIx++;
4255 targetIxOffset++;
4256 } else { // ceMatch == U_CE_SKIP_PATN
4257 // redo with same targCE, next patCE
4258 targetIxOffset--;
4259 }
4260 }
4261 }
4262
4263 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4264 // No match at this targetIx. Try again at the next.
4265 continue;
4266 }
4267
4268 if (!found) {
4269 // No match at all, we have run off the end of the target text.
4270 break;
4271 }
4272
4273
4274 // We have found a match in CE space.
4275 // Now determine the bounds in string index space.
4276 // There still is a chance of match failure if the CE range not correspond to
4277 // an acceptable character range.
4278 //
4279 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4280 mStart = firstCEI->lowIndex;
4281
4282 // Check for the start of the match being within a combining sequence.
4283 // This can happen if the pattern itself begins with a combining char, and
4284 // the match found combining marks in the target text that were attached
4285 // to something else.
4286 // This type of match should be rejected for not completely consuming a
4287 // combining sequence.
4288 if (!isBreakBoundary(strsrch, mStart)) {
4289 found = FALSE;
4290 }
4291
4292 // Look at the high index of the first CE in the match. If it's the same as the
4293 // low index, the first CE in the match is in the middle of an expansion.
4294 if (mStart == firstCEI->highIndex) {
4295 found = FALSE;
4296 }
4297
4298
4299 minLimit = lastCEI->lowIndex;
4300
4301 if (targetIx > 0) {
4302 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4303 // extended to the end of input, and the match is good.
4304
4305 // Look at the high and low indices of the CE following the match. If
4306 // they are the same it means one of two things:
4307 // 1. The match extended to the last CE from the target text, which is OK, or
4308 // 2. The last CE that was part of the match is in an expansion that extends
4309 // to the first CE after the match. In this case, we reject the match.
4310 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4311
4312 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4313 found = FALSE;
4314 }
4315
4316 mLimit = maxLimit = nextCEI->lowIndex;
4317
4318 // Allow matches to end in the middle of a grapheme cluster if the following
4319 // conditions are met; this is needed to make prefix search work properly in
4320 // Indic, see #11750
4321 // * the default breakIter is being used
4322 // * the next collation element after this combining sequence
4323 // - has non-zero primary weight
4324 // - corresponds to a separate character following the one at end of the current match
4325 // (the second of these conditions, and perhaps both, may be redundant given the
4326 // subsequent check for normalization boundary; however they are likely much faster
4327 // tests in any case)
4328 // * the match limit is a normalization boundary
4329 UBool allowMidclusterMatch = FALSE;
4330 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4331 allowMidclusterMatch =
4332 strsrch->search->breakIter == NULL &&
4333 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4334 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4335 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4336 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4337 }
4338 // If those conditions are met, then:
4339 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4340 // the match limit may be backed off to a previous break boundary. This handles
4341 // cases in which mLimit includes target characters that are ignorable with current
4342 // settings (such as space) and which extend beyond the pattern match.
4343 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4344 // * do NOT require that match limit be on a breakIter boundary
4345
4346 // Advance the match end position to the first acceptable match boundary.
4347 // This advances the index over any combining characters.
4348 if (minLimit < maxLimit) {
4349 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4350 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4351 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4352 // (i.e. we back off mLimit to the previous breakIterator boundary).
4353 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4354 mLimit = nba;
4355 }
4356 }
4357
4358 if (!allowMidclusterMatch) {
4359 // If advancing to the end of a combining sequence in character indexing space
4360 // advanced us beyond the end of the match in CE space, reject this match.
4361 if (mLimit > maxLimit) {
4362 found = FALSE;
4363 }
4364
4365 // Make sure the end of the match is on a break boundary
4366 if (!isBreakBoundary(strsrch, mLimit)) {
4367 found = FALSE;
4368 }
4369 }
4370
4371 } else {
4372 // No non-ignorable CEs after this point.
4373 // The maximum position is detected by boundary after
4374 // the last non-ignorable CE. Combining sequence
4375 // across the start index will be truncated.
4376 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4377 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4378 }
4379
4380 #ifdef USEARCH_DEBUG
4381 if (getenv("USEARCH_DEBUG") != NULL) {
4382 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4383 }
4384 #endif
4385
4386
4387 if (! checkIdentical(strsrch, mStart, mLimit)) {
4388 found = FALSE;
4389 }
4390
4391 if (found) {
4392 break;
4393 }
4394 }
4395
4396 #ifdef USEARCH_DEBUG
4397 if (getenv("USEARCH_DEBUG") != NULL) {
4398 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4399 int32_t lastToPrint = ceb.limitIx+2;
4400 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4401 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4402 }
4403 printf("\n%s\n", found? "match found" : "no match");
4404 }
4405 #endif
4406
4407 // All Done. Store back the match bounds to the caller.
4408 //
4409 if (found==FALSE) {
4410 mLimit = -1;
4411 mStart = -1;
4412 }
4413
4414 if (matchStart != NULL) {
4415 *matchStart= mStart;
4416 }
4417
4418 if (matchLimit != NULL) {
4419 *matchLimit = mLimit;
4420 }
4421
4422 return found;
4423}
4424
4425// internal use methods declared in usrchimp.h -----------------------------
4426
4427UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4428{
4429 if (U_FAILURE(*status)) {
4430 setMatchNotFound(strsrch);
4431 return FALSE;
4432 }
4433
4434#if BOYER_MOORE
4435 UCollationElements *coleiter = strsrch->textIter;
4436 int32_t textlength = strsrch->search->textLength;
4437 int32_t *patternce = strsrch->pattern.ces;
4438 int32_t patterncelength = strsrch->pattern.cesLength;
4439 int32_t textoffset = ucol_getOffset(coleiter);
4440
4441 // status used in setting coleiter offset, since offset is checked in
4442 // shiftForward before setting the coleiter offset, status never
4443 // a failure
4444 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4445 patterncelength);
4446 while (textoffset <= textlength)
4447 {
4448 uint32_t patternceindex = patterncelength - 1;
4449 int32_t targetce;
4450 UBool found = FALSE;
4451 int32_t lastce = UCOL_NULLORDER;
4452
4453 setColEIterOffset(coleiter, textoffset);
4454
4455 for (;;) {
4456 // finding the last pattern ce match, imagine composite characters
4457 // for example: search for pattern A in text \u00C0
4458 // we'll have to skip \u0300 the grave first before we get to A
4459 targetce = ucol_previous(coleiter, status);
4460 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4461 found = FALSE;
4462 break;
4463 }
4464 targetce = getCE(strsrch, targetce);
4465 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4466 // this is for the text \u0315\u0300 that requires
4467 // normalization and pattern \u0300, where \u0315 is ignorable
4468 continue;
4469 }
4470 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4471 lastce = targetce;
4472 }
4473 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4474 if (targetce == patternce[patternceindex]) {
4475 // the first ce can be a contraction
4476 found = TRUE;
4477 break;
4478 }
4479 if (!hasExpansion(coleiter)) {
4480 found = FALSE;
4481 break;
4482 }
4483 }
4484
4485 //targetce = lastce;
4486
4487 while (found && patternceindex > 0) {
4488 lastce = targetce;
4489 targetce = ucol_previous(coleiter, status);
4490 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4491 found = FALSE;
4492 break;
4493 }
4494 targetce = getCE(strsrch, targetce);
4495 if (targetce == UCOL_IGNORABLE) {
4496 continue;
4497 }
4498
4499 patternceindex --;
4500 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4501 found = found && targetce == patternce[patternceindex];
4502 }
4503
4504 targetce = lastce;
4505
4506 if (!found) {
4507 if (U_FAILURE(*status)) {
4508 break;
4509 }
4510 textoffset = shiftForward(strsrch, textoffset, lastce,
4511 patternceindex);
4512 // status checked at loop.
4513 patternceindex = patterncelength;
4514 continue;
4515 }
4516
4517 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4518 // status checked in ucol_setOffset
4519 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4520 return TRUE;
4521 }
4522 }
4523 setMatchNotFound(strsrch);
4524 return FALSE;
4525#else
4526 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4527 int32_t start = -1;
4528 int32_t end = -1;
4529
4530 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4531 strsrch->search->matchedIndex = start;
4532 strsrch->search->matchedLength = end - start;
4533 return TRUE;
4534 } else {
4535 setMatchNotFound(strsrch);
4536 return FALSE;
4537 }
4538#endif
4539}
4540
4541UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4542{
4543 if (U_FAILURE(*status)) {
4544 setMatchNotFound(strsrch);
4545 return FALSE;
4546 }
4547
4548#if BOYER_MOORE
4549 UCollationElements *coleiter = strsrch->textIter;
4550 int32_t textlength = strsrch->search->textLength;
4551 int32_t *patternce = strsrch->pattern.ces;
4552 int32_t patterncelength = strsrch->pattern.cesLength;
4553 int32_t textoffset = ucol_getOffset(coleiter);
4554 UBool hasPatternAccents =
4555 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4556
4557 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4558 patterncelength);
4559 strsrch->canonicalPrefixAccents[0] = 0;
4560 strsrch->canonicalSuffixAccents[0] = 0;
4561
4562 while (textoffset <= textlength)
4563 {
4564 int32_t patternceindex = patterncelength - 1;
4565 int32_t targetce;
4566 UBool found = FALSE;
4567 int32_t lastce = UCOL_NULLORDER;
4568
4569 setColEIterOffset(coleiter, textoffset);
4570
4571 for (;;) {
4572 // finding the last pattern ce match, imagine composite characters
4573 // for example: search for pattern A in text \u00C0
4574 // we'll have to skip \u0300 the grave first before we get to A
4575 targetce = ucol_previous(coleiter, status);
4576 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4577 found = FALSE;
4578 break;
4579 }
4580 targetce = getCE(strsrch, targetce);
4581 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4582 lastce = targetce;
4583 }
4584 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4585 if (targetce == patternce[patternceindex]) {
4586 // the first ce can be a contraction
4587 found = TRUE;
4588 break;
4589 }
4590 if (!hasExpansion(coleiter)) {
4591 found = FALSE;
4592 break;
4593 }
4594 }
4595
4596 while (found && patternceindex > 0) {
4597 targetce = ucol_previous(coleiter, status);
4598 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4599 found = FALSE;
4600 break;
4601 }
4602 targetce = getCE(strsrch, targetce);
4603 if (targetce == UCOL_IGNORABLE) {
4604 continue;
4605 }
4606
4607 patternceindex --;
4608 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4609 found = found && targetce == patternce[patternceindex];
4610 }
4611
4612 // initializing the rearranged accent array
4613 if (hasPatternAccents && !found) {
4614 strsrch->canonicalPrefixAccents[0] = 0;
4615 strsrch->canonicalSuffixAccents[0] = 0;
4616 if (U_FAILURE(*status)) {
4617 break;
4618 }
4619 found = doNextCanonicalMatch(strsrch, textoffset, status);
4620 }
4621
4622 if (!found) {
4623 if (U_FAILURE(*status)) {
4624 break;
4625 }
4626 textoffset = shiftForward(strsrch, textoffset, lastce,
4627 patternceindex);
4628 // status checked at loop
4629 patternceindex = patterncelength;
4630 continue;
4631 }
4632
4633 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4634 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4635 return TRUE;
4636 }
4637 }
4638 setMatchNotFound(strsrch);
4639 return FALSE;
4640#else
4641 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4642 int32_t start = -1;
4643 int32_t end = -1;
4644
4645 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4646 strsrch->search->matchedIndex = start;
4647 strsrch->search->matchedLength = end - start;
4648 return TRUE;
4649 } else {
4650 setMatchNotFound(strsrch);
4651 return FALSE;
4652 }
4653#endif
4654}
4655
4656UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4657{
4658 if (U_FAILURE(*status)) {
4659 setMatchNotFound(strsrch);
4660 return FALSE;
4661 }
4662
4663#if BOYER_MOORE
4664 UCollationElements *coleiter = strsrch->textIter;
4665 int32_t *patternce = strsrch->pattern.ces;
4666 int32_t patterncelength = strsrch->pattern.cesLength;
4667 int32_t textoffset = ucol_getOffset(coleiter);
4668
4669 // shifting it check for setting offset
4670 // if setOffset is called previously or there was no previous match, we
4671 // leave the offset as it is.
4672 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4673 textoffset = strsrch->search->matchedIndex;
4674 }
4675
4676 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4677 patterncelength);
4678
4679 while (textoffset >= 0)
4680 {
4681 int32_t patternceindex = 1;
4682 int32_t targetce;
4683 UBool found = FALSE;
4684 int32_t firstce = UCOL_NULLORDER;
4685
4686 // if status is a failure, ucol_setOffset does nothing
4687 setColEIterOffset(coleiter, textoffset);
4688
4689 for (;;) {
4690 // finding the first pattern ce match, imagine composite
4691 // characters. for example: search for pattern \u0300 in text
4692 // \u00C0, we'll have to skip A first before we get to
4693 // \u0300 the grave accent
4694 targetce = ucol_next(coleiter, status);
4695 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4696 found = FALSE;
4697 break;
4698 }
4699 targetce = getCE(strsrch, targetce);
4700 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4701 firstce = targetce;
4702 }
4703 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4704 continue;
4705 }
4706 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4707 if (targetce == patternce[0]) {
4708 found = TRUE;
4709 break;
4710 }
4711 if (!hasExpansion(coleiter)) {
4712 // checking for accents in composite character
4713 found = FALSE;
4714 break;
4715 }
4716 }
4717
4718 //targetce = firstce;
4719
4720 while (found && (patternceindex < patterncelength)) {
4721 firstce = targetce;
4722 targetce = ucol_next(coleiter, status);
4723 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4724 found = FALSE;
4725 break;
4726 }
4727 targetce = getCE(strsrch, targetce);
4728 if (targetce == UCOL_IGNORABLE) {
4729 continue;
4730 }
4731
4732 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4733 found = found && targetce == patternce[patternceindex];
4734 patternceindex ++;
4735 }
4736
4737 targetce = firstce;
4738
4739 if (!found) {
4740 if (U_FAILURE(*status)) {
4741 break;
4742 }
4743
4744 textoffset = reverseShift(strsrch, textoffset, targetce,
4745 patternceindex);
4746 patternceindex = 0;
4747 continue;
4748 }
4749
4750 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4751 setColEIterOffset(coleiter, textoffset);
4752 return TRUE;
4753 }
4754 }
4755 setMatchNotFound(strsrch);
4756 return FALSE;
4757#else
4758 int32_t textOffset;
4759
4760 if (strsrch->search->isOverlap) {
4761 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4762 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4763 } else {
4764 // move the start position at the end of possible match
4765 initializePatternPCETable(strsrch, status);
4766 if (!initTextProcessedIter(strsrch, status)) {
4767 setMatchNotFound(strsrch);
4768 return FALSE;
4769 }
4770 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4771 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4772 if (pce == UCOL_PROCESSED_NULLORDER) {
4773 // at the end of the text
4774 break;
4775 }
4776 }
4777 if (U_FAILURE(*status)) {
4778 setMatchNotFound(strsrch);
4779 return FALSE;
4780 }
4781 textOffset = ucol_getOffset(strsrch->textIter);
4782 }
4783 } else {
4784 textOffset = ucol_getOffset(strsrch->textIter);
4785 }
4786
4787 int32_t start = -1;
4788 int32_t end = -1;
4789
4790 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4791 strsrch->search->matchedIndex = start;
4792 strsrch->search->matchedLength = end - start;
4793 return TRUE;
4794 } else {
4795 setMatchNotFound(strsrch);
4796 return FALSE;
4797 }
4798#endif
4799}
4800
4801UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4802 UErrorCode *status)
4803{
4804 if (U_FAILURE(*status)) {
4805 setMatchNotFound(strsrch);
4806 return FALSE;
4807 }
4808
4809#if BOYER_MOORE
4810 UCollationElements *coleiter = strsrch->textIter;
4811 int32_t *patternce = strsrch->pattern.ces;
4812 int32_t patterncelength = strsrch->pattern.cesLength;
4813 int32_t textoffset = ucol_getOffset(coleiter);
4814 UBool hasPatternAccents =
4815 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4816
4817 // shifting it check for setting offset
4818 // if setOffset is called previously or there was no previous match, we
4819 // leave the offset as it is.
4820 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4821 textoffset = strsrch->search->matchedIndex;
4822 }
4823
4824 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4825 patterncelength);
4826 strsrch->canonicalPrefixAccents[0] = 0;
4827 strsrch->canonicalSuffixAccents[0] = 0;
4828
4829 while (textoffset >= 0)
4830 {
4831 int32_t patternceindex = 1;
4832 int32_t targetce;
4833 UBool found = FALSE;
4834 int32_t firstce = UCOL_NULLORDER;
4835
4836 setColEIterOffset(coleiter, textoffset);
4837 for (;;) {
4838 // finding the first pattern ce match, imagine composite
4839 // characters. for example: search for pattern \u0300 in text
4840 // \u00C0, we'll have to skip A first before we get to
4841 // \u0300 the grave accent
4842 targetce = ucol_next(coleiter, status);
4843 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4844 found = FALSE;
4845 break;
4846 }
4847 targetce = getCE(strsrch, targetce);
4848 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4849 firstce = targetce;
4850 }
4851
4852 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4853 if (targetce == patternce[0]) {
4854 // the first ce can be a contraction
4855 found = TRUE;
4856 break;
4857 }
4858 if (!hasExpansion(coleiter)) {
4859 // checking for accents in composite character
4860 found = FALSE;
4861 break;
4862 }
4863 }
4864
4865 targetce = firstce;
4866
4867 while (found && patternceindex < patterncelength) {
4868 targetce = ucol_next(coleiter, status);
4869 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4870 found = FALSE;
4871 break;
4872 }
4873 targetce = getCE(strsrch, targetce);
4874 if (targetce == UCOL_IGNORABLE) {
4875 continue;
4876 }
4877
4878 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4879 found = found && targetce == patternce[patternceindex];
4880 patternceindex ++;
4881 }
4882
4883 // initializing the rearranged accent array
4884 if (hasPatternAccents && !found) {
4885 strsrch->canonicalPrefixAccents[0] = 0;
4886 strsrch->canonicalSuffixAccents[0] = 0;
4887 if (U_FAILURE(*status)) {
4888 break;
4889 }
4890 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4891 }
4892
4893 if (!found) {
4894 if (U_FAILURE(*status)) {
4895 break;
4896 }
4897 textoffset = reverseShift(strsrch, textoffset, targetce,
4898 patternceindex);
4899 patternceindex = 0;
4900 continue;
4901 }
4902
4903 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4904 setColEIterOffset(coleiter, textoffset);
4905 return TRUE;
4906 }
4907 }
4908 setMatchNotFound(strsrch);
4909 return FALSE;
4910#else
4911 int32_t textOffset;
4912
4913 if (strsrch->search->isOverlap) {
4914 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4915 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4916 } else {
4917 // move the start position at the end of possible match
4918 initializePatternPCETable(strsrch, status);
4919 if (!initTextProcessedIter(strsrch, status)) {
4920 setMatchNotFound(strsrch);
4921 return FALSE;
4922 }
4923 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4924 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4925 if (pce == UCOL_PROCESSED_NULLORDER) {
4926 // at the end of the text
4927 break;
4928 }
4929 }
4930 if (U_FAILURE(*status)) {
4931 setMatchNotFound(strsrch);
4932 return FALSE;
4933 }
4934 textOffset = ucol_getOffset(strsrch->textIter);
4935 }
4936 } else {
4937 textOffset = ucol_getOffset(strsrch->textIter);
4938 }
4939
4940 int32_t start = -1;
4941 int32_t end = -1;
4942
4943 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4944 strsrch->search->matchedIndex = start;
4945 strsrch->search->matchedLength = end - start;
4946 return TRUE;
4947 } else {
4948 setMatchNotFound(strsrch);
4949 return FALSE;
4950 }
4951#endif
4952}
4953
4954#endif /* #if !UCONFIG_NO_COLLATION */
4955