1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 2001-2011 IBM and others. All rights reserved. |
6 | ********************************************************************** |
7 | * Date Name Description |
8 | * 03/22/2000 helena Creation. |
9 | ********************************************************************** |
10 | */ |
11 | |
12 | #ifndef SEARCH_H |
13 | #define SEARCH_H |
14 | |
15 | #include "unicode/utypes.h" |
16 | |
17 | #if U_SHOW_CPLUSPLUS_API |
18 | |
19 | /** |
20 | * \file |
21 | * \brief C++ API: SearchIterator object. |
22 | */ |
23 | |
24 | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION |
25 | |
26 | #include "unicode/uobject.h" |
27 | #include "unicode/unistr.h" |
28 | #include "unicode/chariter.h" |
29 | #include "unicode/brkiter.h" |
30 | #include "unicode/usearch.h" |
31 | |
32 | /** Forward declaration of the C search data struct used by SearchIterator. |
33 | * @stable ICU 2.0 |
34 | */ |
35 | struct USearch; |
36 | /** Typedef so the struct can be named as plain USearch. |
37 | * @stable ICU 2.0 |
38 | */ |
39 | typedef struct USearch USearch; |
40 | |
41 | U_NAMESPACE_BEGIN |
42 | |
43 | /** |
44 | * |
45 | * <tt>SearchIterator</tt> is an abstract base class that provides |
46 | * methods to search for a pattern within a text string. Instances of |
47 | * <tt>SearchIterator</tt> maintain a current position and scan over the |
48 | * target text, returning the indices at which the pattern is matched and |
49 | * the length of each match. |
50 | * <p> |
51 | * <tt>SearchIterator</tt> defines a protocol for text searching. |
52 | * Subclasses provide concrete implementations of various search algorithms. |
53 | * For example, <tt>StringSearch</tt> implements language-sensitive pattern |
54 | * matching based on the comparison rules defined in a |
55 | * <tt>RuleBasedCollator</tt> object. |
56 | * <p> |
57 | * Other options for searching include using a BreakIterator to restrict |
58 | * the points at which matches are detected. |
59 | * <p> |
60 | * <tt>SearchIterator</tt> provides an API that is similar to that of |
61 | * other text iteration classes such as <tt>BreakIterator</tt>. Using |
62 | * this class, it is easy to scan through text looking for all occurrences of |
63 | * a given pattern. The following example uses a <tt>StringSearch</tt> |
64 | * object to find all instances of "fox" in the target string. Any other |
65 | * subclass of <tt>SearchIterator</tt> can be used in an identical |
66 | * manner. |
67 | * <pre><code> |
68 | * UnicodeString target("The quick brown fox jumped over the lazy fox"); |
69 | * UnicodeString pattern("fox"); |
70 | * |
71 | * SearchIterator *iter = new StringSearch(pattern, target); |
72 | * UErrorCode error = U_ZERO_ERROR; |
73 | * for (int pos = iter->first(error); pos != USEARCH_DONE; |
74 | * pos = iter->next(error)) { |
75 | * printf("Found match at %d pos, length is %d\n", pos, iter->getMatchedLength()); |
76 | * } |
77 | * </code></pre> |
78 | * |
79 | * @see StringSearch |
80 | * @see RuleBasedCollator |
81 | */ |
82 | class U_I18N_API SearchIterator : public UObject { |
83 | |
84 | public: |
85 | |
86 | // public constructors and destructors ------------------------------- |
87 | |
88 | /** |
89 | * Copy constructor that creates a SearchIterator instance with the same |
90 | * behavior, and iterating over the same text. |
91 | * @param other the SearchIterator instance to be copied. |
92 | * @stable ICU 2.0 |
93 | */ |
94 | SearchIterator(const SearchIterator &other); |
95 | |
96 | /** |
97 | * Destructor. Cleans up the search iterator data struct. |
98 | * @stable ICU 2.0 |
99 | */ |
100 | virtual ~SearchIterator(); |
101 | |
102 | // public get and set methods ---------------------------------------- |
103 | |
104 | /** |
105 | * Sets the index to point to the given position, and clears any state |
106 | * that's affected. |
107 | * <p> |
108 | * This method takes the argument index and sets the position in the text |
109 | * string accordingly without checking if the index is pointing to a |
110 | * valid starting point to begin searching. |
111 | * @param position within the text to be set. If position is outside |
112 | * the text range for searching, |
113 | * a U_INDEX_OUTOFBOUNDS_ERROR will be returned in status. |
114 | * @param status for errors if it occurs |
115 | * @stable ICU 2.0 |
116 | */ |
117 | virtual void setOffset(int32_t position, UErrorCode &status) = 0; |
118 | |
119 | /** |
120 | * Return the current index in the text being searched. |
121 | * If the iteration has gone past the end of the text |
122 | * (or past the beginning for a backwards search), USEARCH_DONE |
123 | * is returned. |
124 | * @return current index in the text being searched. |
125 | * @stable ICU 2.0 |
126 | */ |
127 | virtual int32_t getOffset(void) const = 0; |
128 | |
129 | /** |
130 | * Sets the text searching attributes located in the enum |
131 | * USearchAttribute with values from the enum USearchAttributeValue. |
132 | * USEARCH_DEFAULT can be used for all attributes for resetting. |
133 | * @param attribute text attribute (enum USearchAttribute) to be set |
134 | * @param value text attribute value |
135 | * @param status for errors if it occurs |
136 | * @stable ICU 2.0 |
137 | */ |
138 | void setAttribute(USearchAttribute attribute, |
139 | USearchAttributeValue value, |
140 | UErrorCode &status); |
141 | |
142 | /** |
143 | * Gets the text searching attributes |
144 | * @param attribute text attribute (enum USearchAttribute) to be retrieved |
145 | * @return text attribute value |
146 | * @stable ICU 2.0 |
147 | */ |
148 | USearchAttributeValue getAttribute(USearchAttribute attribute) const; |
149 | |
150 | /** |
151 | * Returns the index to the match in the text string that was searched. |
152 | * This call returns a valid result only after a successful call to |
153 | * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. |
154 | * Just after construction, or after a searching method returns |
155 | * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. |
156 | * <p> |
157 | * Use getMatchedLength to get the matched string length. |
158 | * @return index of a substring within the text string that is being |
159 | * searched. |
160 | * @see #first |
161 | * @see #next |
162 | * @see #previous |
163 | * @see #last |
164 | * @stable ICU 2.0 |
165 | */ |
166 | int32_t getMatchedStart(void) const; |
167 | |
168 | /** |
169 | * Returns the length of text in the string which matches the search |
170 | * pattern. This call returns a valid result only after a successful call |
171 | * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. |
172 | * Just after construction, or after a searching method returns |
173 | * <tt>USEARCH_DONE</tt>, this method will return 0. |
174 | * @return The length of the match in the target text, or 0 if there |
175 | * is no match currently. |
176 | * @see #first |
177 | * @see #next |
178 | * @see #previous |
179 | * @see #last |
180 | * @stable ICU 2.0 |
181 | */ |
182 | int32_t getMatchedLength(void) const; |
183 | |
184 | /** |
185 | * Returns the text that was matched by the most recent call to |
186 | * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. |
187 | * If the iterator is not pointing at a valid match (e.g. just after |
188 | * construction or after <tt>USEARCH_DONE</tt> has been returned), |
189 | * returns an empty string. |
190 | * @param result stores the matched string or an empty string if a match |
191 | * is not found. |
192 | * @see #first |
193 | * @see #next |
194 | * @see #previous |
195 | * @see #last |
196 | * @stable ICU 2.0 |
197 | */ |
198 | void getMatchedText(UnicodeString &result) const; |
199 | |
200 | /** |
201 | * Set the BreakIterator that will be used to restrict the points |
202 | * at which matches are detected. The user is responsible for deleting |
203 | * the breakiterator. |
204 | * @param breakiter A BreakIterator that will be used to restrict the |
205 | * points at which matches are detected. If a match is |
206 | * found, but the match's start or end index is not a |
207 | * boundary as determined by the <tt>BreakIterator</tt>, |
208 | * the match will be rejected and another will be searched |
209 | * for. If this parameter is <tt>NULL</tt>, no break |
210 | * detection is attempted. |
211 | * @param status for errors if it occurs |
212 | * @see BreakIterator |
213 | * @stable ICU 2.0 |
214 | */ |
215 | void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); |
216 | |
217 | /** |
218 | * Returns the BreakIterator that is used to restrict the points at |
219 | * which matches are detected. This will be the same object that was |
220 | * passed to the constructor or to <tt>setBreakIterator</tt>. |
221 | * Note that <tt>NULL</tt> is a legal value; it means that break |
222 | * detection should not be attempted. |
223 | * @return BreakIterator used to restrict matchings. |
224 | * @see #setBreakIterator |
225 | * @stable ICU 2.0 |
226 | */ |
227 | const BreakIterator * getBreakIterator(void) const; |
228 | |
229 | /** |
230 | * Set the string text to be searched. Text iteration will hence begin at |
231 | * the start of the text string. This method is useful if you want to |
232 | * re-use an iterator to search for the same pattern within a different |
233 | * body of text. The user is responsible for deleting the text. |
234 | * @param text string to be searched. |
235 | * @param status for errors. If the text length is 0, |
236 | * a U_ILLEGAL_ARGUMENT_ERROR is returned. |
237 | * @stable ICU 2.0 |
238 | */ |
239 | virtual void setText(const UnicodeString &text, UErrorCode &status); |
240 | |
241 | /** |
242 | * Set the string text to be searched. Text iteration will hence begin at |
243 | * the start of the text string. This method is useful if you want to |
244 | * re-use an iterator to search for the same pattern within a different |
245 | * body of text. |
246 | * <p> |
247 | * Note: No parsing of the text within the <tt>CharacterIterator</tt> |
248 | * will be done during searching for this version. The block of text |
249 | * in <tt>CharacterIterator</tt> will be used as it is. |
250 | * The user is responsible for deleting the text. |
251 | * @param text string iterator to be searched. |
252 | * @param status for errors if any. If the text length is 0 then a |
253 | * U_ILLEGAL_ARGUMENT_ERROR is returned. |
254 | * @stable ICU 2.0 |
255 | */ |
256 | virtual void setText(CharacterIterator &text, UErrorCode &status); |
257 | |
258 | /** |
259 | * Return the string text to be searched. |
260 | * @return text string to be searched. |
261 | * @stable ICU 2.0 |
262 | */ |
263 | const UnicodeString & getText(void) const; |
264 | |
265 | // operator overloading ---------------------------------------------- |
266 | |
267 | /** |
268 | * Equality operator. |
269 | * @param that SearchIterator instance to be compared. |
270 | * @return TRUE if both SearchIterators are of the same class, have the |
271 | * same behavior, iterate over the same text and have the same |
272 | * attributes. FALSE otherwise. |
273 | * @stable ICU 2.0 |
274 | */ |
275 | virtual UBool operator==(const SearchIterator &that) const; |
276 | |
277 | /** |
278 | * Not-equal operator. |
279 | * @param that SearchIterator instance to be compared. |
280 | * @return FALSE if operator== returns TRUE, and vice versa. |
281 | * @stable ICU 2.0 |
282 | */ |
283 | UBool operator!=(const SearchIterator &that) const; |
284 | |
285 | // public methods ---------------------------------------------------- |
286 | |
287 | /** |
288 | * Returns a copy of SearchIterator with the same behavior, and |
289 | * iterating over the same text, as this one. Note that all data will be |
290 | * replicated, except for the text string to be searched. |
291 | * @return cloned object |
292 | * @stable ICU 2.0 |
293 | */ |
294 | virtual SearchIterator* safeClone(void) const = 0; |
295 | |
296 | /** |
297 | * Returns the first index at which the string text matches the search |
298 | * pattern. The iterator is adjusted so that its current index (as |
299 | * returned by <tt>getOffset</tt>) is the match position if one |
300 | * was found. |
301 | * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and |
302 | * the iterator will be adjusted to the index USEARCH_DONE. |
303 | * @param status for errors if it occurs |
304 | * @return The character index of the first match, or |
305 | * <tt>USEARCH_DONE</tt> if there are no matches. |
306 | * @see #getOffset |
307 | * @stable ICU 2.0 |
308 | */ |
309 | int32_t first(UErrorCode &status); |
310 | |
311 | /** |
312 | * Returns the first index equal or greater than <tt>position</tt> at which the |
313 | * string text matches the search pattern. The iterator is adjusted so |
314 | * that its current index (as returned by <tt>getOffset</tt>) is the |
315 | * match position if one was found. |
316 | * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the |
317 | * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. |
318 | * @param position where search is to start from. If position is outside |
319 | * the text range for searching, |
320 | * a U_INDEX_OUTOFBOUNDS_ERROR will be returned in status. |
321 | * @param status for errors if it occurs |
322 | * @return The character index of the first match following |
323 | * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no |
324 | * matches. |
325 | * @see #getOffset |
326 | * @stable ICU 2.0 |
327 | */ |
328 | int32_t following(int32_t position, UErrorCode &status); |
329 | |
330 | /** |
331 | * Returns the last index in the target text at which it matches the |
332 | * search pattern. The iterator is adjusted so that its current index |
333 | * (as returned by <tt>getOffset</tt>) is the match position if one was |
334 | * found. |
335 | * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and |
336 | * the iterator will be adjusted to the index USEARCH_DONE. |
337 | * @param status for errors if it occurs |
338 | * @return The index of the last match, or <tt>USEARCH_DONE</tt> if |
339 | * there are no matches. |
340 | * @see #getOffset |
341 | * @stable ICU 2.0 |
342 | */ |
343 | int32_t last(UErrorCode &status); |
344 | |
345 | /** |
346 | * Returns the first index less than <tt>position</tt> at which the string |
347 | * text matches the search pattern. The iterator is adjusted so that its |
348 | * current index (as returned by <tt>getOffset</tt>) is the match |
349 | * position if one was found. If a match is not found, |
350 | * <tt>USEARCH_DONE</tt> will be returned and the iterator will be |
351 | * adjusted to the index USEARCH_DONE. |
352 | * <p> |
353 | * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the |
354 | * result match is always less than <tt>position</tt>. |
355 | * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across |
356 | * <tt>position</tt>. |
357 | * |
358 | * @param position where search is to start from. If position is outside |
359 | * the text range for searching, |
360 | * a U_INDEX_OUTOFBOUNDS_ERROR will be returned in status. |
361 | * @param status for errors if it occurs |
362 | * @return The character index of the first match preceding |
363 | * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are |
364 | * no matches. |
365 | * @see #getOffset |
366 | * @stable ICU 2.0 |
367 | */ |
368 | int32_t preceding(int32_t position, UErrorCode &status); |
369 | |
370 | /** |
371 | * Returns the index of the next point at which the text matches the |
372 | * search pattern, starting from the current position. |
373 | * The iterator is adjusted so that its current index (as returned by |
374 | * <tt>getOffset</tt>) is the match position if one was found. |
375 | * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and |
376 | * the iterator will be adjusted to a position after the end of the text |
377 | * string. |
378 | * @param status for errors if it occurs |
379 | * @return The index of the next match after the current position, |
380 | * or <tt>USEARCH_DONE</tt> if there are no more matches. |
381 | * @see #getOffset |
382 | * @stable ICU 2.0 |
383 | */ |
384 | int32_t next(UErrorCode &status); |
385 | |
386 | /** |
387 | * Returns the index of the previous point at which the string text |
388 | * matches the search pattern, starting at the current position. |
389 | * The iterator is adjusted so that its current index (as returned by |
390 | * <tt>getOffset</tt>) is the match position if one was found. |
391 | * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and |
392 | * the iterator will be adjusted to the index USEARCH_DONE. |
393 | * @param status for errors if it occurs |
394 | * @return The index of the previous match before the current position, |
395 | * or <tt>USEARCH_DONE</tt> if there are no more matches. |
396 | * @see #getOffset |
397 | * @stable ICU 2.0 |
398 | */ |
399 | int32_t previous(UErrorCode &status); |
400 | |
401 | /** |
402 | * Resets the iteration. |
403 | * Search will begin at the start of the text string if a forward |
404 | * iteration is initiated before a backwards iteration. Otherwise if a |
405 | * backwards iteration is initiated before a forwards iteration, the |
406 | * search will begin at the end of the text string. |
407 | * @stable ICU 2.0 |
408 | */ |
409 | virtual void reset(); |
410 | |
411 | protected: |
412 | // protected data members --------------------------------------------- |
413 | |
414 | /** |
415 | * C search data struct |
416 | * @stable ICU 2.0 |
417 | */ |
418 | USearch *m_search_; |
419 | |
420 | /** |
421 | * Break iterator. |
422 | * Currently the C++ breakiterator does not have getRules etc to reproduce |
423 | * another in C. Hence we keep the original around and do the verification |
424 | * at the end of the match. The user is responsible for deleting this |
425 | * break iterator. |
426 | * @stable ICU 2.0 |
427 | */ |
428 | BreakIterator *m_breakiterator_; |
429 | |
430 | /** |
431 | * Unicode string version of the search text |
432 | * @stable ICU 2.0 |
433 | */ |
434 | UnicodeString m_text_; |
435 | |
436 | // protected constructors and destructors ----------------------------- |
437 | |
438 | /** |
439 | * Default constructor. |
440 | * Initializes data to the default values. |
441 | * @stable ICU 2.0 |
442 | */ |
443 | SearchIterator(); |
444 | |
445 | /** |
446 | * Constructor for use by subclasses. |
447 | * @param text The target text to be searched. |
448 | * @param breakiter A {@link BreakIterator} that is used to restrict the |
449 | * points at which matches are detected. If |
450 | * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a |
451 | * match, but the match's start or end index is not a |
452 | * boundary as determined by the <tt>BreakIterator</tt>, |
453 | * the match is rejected and <tt>handleNext</tt> or |
454 | * <tt>handlePrev</tt> is called again. If this parameter |
455 | * is <tt>NULL</tt>, no break detection is attempted. |
456 | * @see #handleNext |
457 | * @see #handlePrev |
458 | * @stable ICU 2.0 |
459 | */ |
460 | SearchIterator(const UnicodeString &text, |
461 | BreakIterator *breakiter = NULL); |
462 | |
463 | /** |
464 | * Constructor for use by subclasses. |
465 | * <p> |
466 | * Note: No parsing of the text within the <tt>CharacterIterator</tt> |
467 | * will be done during searching for this version. The block of text |
468 | * in <tt>CharacterIterator</tt> will be used as it is. |
469 | * @param text The target text to be searched. |
470 | * @param breakiter A {@link BreakIterator} that is used to restrict the |
471 | * points at which matches are detected. If |
472 | * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a |
473 | * match, but the match's start or end index is not a |
474 | * boundary as determined by the <tt>BreakIterator</tt>, |
475 | * the match is rejected and <tt>handleNext</tt> or |
476 | * <tt>handlePrev</tt> is called again. If this parameter |
477 | * is <tt>NULL</tt>, no break detection is attempted. |
478 | * @see #handleNext |
479 | * @see #handlePrev |
480 | * @stable ICU 2.0 |
481 | */ |
482 | SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); |
483 | |
484 | // protected methods -------------------------------------------------- |
485 | |
486 | /** |
487 | * Assignment operator. Sets this iterator to have the same behavior, |
488 | * and iterate over the same text, as the one passed in. |
489 | * @param that instance to be copied. |
490 | * @stable ICU 2.0 |
491 | */ |
492 | SearchIterator & operator=(const SearchIterator &that); |
493 | |
494 | /** |
495 | * Abstract method which subclasses override to provide the mechanism |
496 | * for finding the next match in the target text. This allows different |
497 | * subclasses to provide different search algorithms. |
498 | * <p> |
499 | * If a match is found, the implementation should return the index at |
500 | * which the match starts and should call |
501 | * <tt>setMatchLength</tt> with the number of characters |
502 | * in the target text that make up the match. If no match is found, the |
503 | * method should return USEARCH_DONE. |
504 | * <p> |
505 | * @param position The index in the target text at which the search |
506 | * should start. |
507 | * @param status for error codes if it occurs. |
508 | * @return index at which the match starts, else if match is not found |
509 | * USEARCH_DONE is returned |
510 | * @see #setMatchLength |
511 | * @stable ICU 2.0 |
512 | */ |
513 | virtual int32_t handleNext(int32_t position, UErrorCode &status) |
514 | = 0; |
515 | |
516 | /** |
517 | * Abstract method which subclasses override to provide the mechanism for |
518 | * finding the previous match in the target text. This allows different |
519 | * subclasses to provide different search algorithms. |
520 | * <p> |
521 | * If a match is found, the implementation should return the index at |
522 | * which the match starts and should call |
523 | * <tt>setMatchLength</tt> with the number of characters |
524 | * in the target text that make up the match. If no match is found, the |
525 | * method should return USEARCH_DONE. |
526 | * <p> |
527 | * @param position The index in the target text at which the search |
528 | * should start. |
529 | * @param status for error codes if it occurs. |
530 | * @return index at which the match starts, else if match is not found |
531 | * USEARCH_DONE is returned |
532 | * @see #setMatchLength |
533 | * @stable ICU 2.0 |
534 | */ |
535 | virtual int32_t handlePrev(int32_t position, UErrorCode &status) |
536 | = 0; |
537 | |
538 | /** |
539 | * Sets the length of the currently matched string in the text string to |
540 | * be searched. |
541 | * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> |
542 | * methods should call this when they find a match in the target text. |
543 | * @param length length of the matched text. |
544 | * @see #handleNext |
545 | * @see #handlePrev |
546 | * @stable ICU 2.0 |
547 | */ |
548 | virtual void setMatchLength(int32_t length); |
549 | |
550 | /** |
551 | * Sets the offset of the currently matched string in the text string to |
552 | * be searched. |
553 | * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> |
554 | * methods should call this when they find a match in the target text. |
555 | * @param position start offset of the matched text. |
556 | * @see #handleNext |
557 | * @see #handlePrev |
558 | * @stable ICU 2.0 |
559 | */ |
560 | virtual void setMatchStart(int32_t position); |
561 | |
562 | /** |
563 | * Sets match not found. |
564 | * @stable ICU 2.0 |
565 | */ |
566 | void setMatchNotFound(); |
567 | }; |
568 | |
569 | inline UBool SearchIterator::operator!=(const SearchIterator &that) const |
570 | { /* Inequality is defined as the exact negation of operator==. */ |
571 | return !(this->operator==(that)); |
572 | } |
573 | U_NAMESPACE_END |
574 | |
575 | #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ |
576 | |
577 | #endif /* U_SHOW_CPLUSPLUS_API */ |
578 | |
579 | #endif |
580 | |
581 | |