1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 2001-2014 IBM and others. All rights reserved. |
6 | ********************************************************************** |
7 | * Date Name Description |
8 | * 03/22/2000 helena Creation. |
9 | ********************************************************************** |
10 | */ |
11 | |
12 | #ifndef STSEARCH_H |
13 | #define STSEARCH_H |
14 | |
15 | #include "unicode/utypes.h" |
16 | |
17 | #if U_SHOW_CPLUSPLUS_API |
18 | |
19 | /** |
20 | * \file |
21 | * \brief C++ API: Service for searching text based on RuleBasedCollator. |
22 | */ |
23 | |
24 | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION |
25 | |
26 | #include "unicode/tblcoll.h" |
27 | #include "unicode/coleitr.h" |
28 | #include "unicode/search.h" |
29 | |
30 | U_NAMESPACE_BEGIN |
31 | |
32 | /** |
33 | * |
34 | * <tt>StringSearch</tt> is a <tt>SearchIterator</tt> that provides |
35 | * language-sensitive text searching based on the comparison rules defined |
36 | * in a {@link RuleBasedCollator} object. |
37 | * StringSearch ensures that language eccentricity can be |
38 | * handled, e.g. for the German collator, characters ß and SS will be matched |
39 | * if case is chosen to be ignored. |
40 | * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> |
41 | * "ICU Collation Design Document"</a> for more information. |
42 | * <p> |
43 | * There are 2 match options for selection:<br> |
44 | * Let S' be the sub-string of a text string S between the offsets start and |
45 | * end [start, end]. |
46 | * <br> |
47 | * A pattern string P matches a text string S at the offsets [start, end] |
48 | * if |
49 | * <pre> |
50 | * option 1. Some canonical equivalent of P matches some canonical equivalent |
51 | * of S' |
52 | * option 2. P matches S' and if P starts or ends with a combining mark, |
53 | * there exists no non-ignorable combining mark before or after S' |
54 | * in S respectively. |
55 | * </pre> |
56 | * Option 2. will be the default. |
57 | * <p> |
58 | * This search has APIs similar to that of other text iteration mechanisms |
59 | * such as the break iterators in <tt>BreakIterator</tt>. Using these |
60 | * APIs, it is easy to scan through text looking for all occurrences of |
61 | * a given pattern. This search iterator allows changing of direction by |
62 | * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>. |
63 | * Though a direction change can occur without calling <tt>reset</tt> first, |
64 | * this operation comes with some speed penalty. |
65 | * Matches found in the forward direction are the same as the matches found |
66 | * in the backward direction, but in reverse order. |
67 | * <p> |
68 | * <tt>SearchIterator</tt> provides APIs to specify the starting position |
69 | * within the text string to be searched, e.g. <tt>setOffset</tt>, |
70 | * <tt>preceding</tt> and <tt>following</tt>. Since the |
71 | * starting position will be set as it is specified, please take note that |
72 | * there are some danger points at which the search may render incorrect |
73 | * results: |
74 | * <ul> |
75 | * <li> The midst of a substring that requires normalization. |
76 | * <li> If the following match is to be found, the position should not be the |
77 | * second character which requires to be swapped with the preceding |
78 | * character. Vice versa, if the preceding match is to be found, |
79 | * position to search from should not be the first character which |
80 | * requires to be swapped with the next character. E.g certain Thai and |
81 | * Lao characters require swapping. |
82 | * <li> If a following pattern match is to be found, any position within a |
83 | * contracting sequence except the first will fail. Vice versa if a |
84 | * preceding pattern match is to be found, an invalid starting point |
85 | * would be any character within a contracting sequence except the last. |
86 | * </ul> |
87 | * <p> |
88 | * A <tt>BreakIterator</tt> can be used if only matches at logical breaks are desired. |
89 | * Using a <tt>BreakIterator</tt> will only give you results that exactly match the |
90 | * boundaries given by the break iterator. For instance the pattern "e" will |
91 | * not be found in the string "\u00e9" if a character break iterator is used. |
92 | * <p> |
93 | * Options are provided to handle overlapping matches. |
94 | * E.g. In English, overlapping matches produce the results 0 and 2 |
95 | * for the pattern "abab" in the text "ababab", whereas mutually |
96 | * exclusive matches only produce the result 0. |
97 | * <p> |
98 | * Though collator attributes will be taken into consideration while |
99 | * performing matches, there are no APIs here for setting and getting the |
100 | * attributes. These attributes can be set by getting the collator |
101 | * from <tt>getCollator</tt> and using the APIs in <tt>coll.h</tt>. |
102 | * Lastly to update <tt>StringSearch</tt> to the new collator attributes, |
103 | * <tt>reset</tt> has to be called. |
104 | * <p> |
105 | * Restriction: <br> |
106 | * Currently there are no composite characters that consist of a |
107 | * character with combining class > 0 before a character with combining |
108 | * class == 0. However, if such a character exists in the future, |
109 | * <tt>StringSearch</tt> does not guarantee the results for option 1. |
110 | * <p> |
111 | * Consult the <tt>SearchIterator</tt> documentation for information on |
112 | * and examples of how to use instances of this class to implement text |
113 | * searching. |
114 | * <pre><code> |
115 | * UnicodeString target("The quick brown fox jumps over the lazy dog."); |
116 | * UnicodeString pattern("fox"); |
117 | * |
118 | * UErrorCode error = U_ZERO_ERROR; |
119 | * StringSearch iter(pattern, target, Locale::getUS(), NULL, error); |
120 | * for (int pos = iter.first(error); |
121 | * pos != USEARCH_DONE; |
122 | * pos = iter.next(error)) |
123 | * { |
124 | * printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength()); |
125 | * } |
126 | * </code></pre> |
127 | * <p> |
128 | * Note, <tt>StringSearch</tt> is not to be subclassed. |
129 | * </p> |
130 | * @see SearchIterator |
131 | * @see RuleBasedCollator |
132 | * @since ICU 2.0 |
133 | */ |
134 | |
135 | class U_I18N_API StringSearch U_FINAL : public SearchIterator |
136 | { |
137 | public: |
138 | |
139 | // public constructors and destructors -------------------------------- |
140 | |
141 | /** |
142 | * Creating a <tt>StringSearch</tt> instance using the argument locale |
143 | * language rule set. A collator will be created in the process, which |
144 | * will be owned by this instance and will be deleted during |
145 | * destruction. |
146 | * @param pattern The text for which this object will search. |
147 | * @param text The text in which to search for the pattern. |
148 | * @param locale A locale which defines the language-sensitive |
149 | * comparison rules used to determine whether text in the |
150 | * pattern and target matches. |
151 | * @param breakiter A <tt>BreakIterator</tt> object used to constrain |
152 | * the matches that are found. Matches whose start and end |
153 | * indices in the target text are not boundaries as |
154 | * determined by the <tt>BreakIterator</tt> are |
155 | * ignored. If this behavior is not desired, |
156 | * <tt>NULL</tt> can be passed in instead. |
157 | * @param status for errors if any. If pattern or text is NULL, or if |
158 | * either the length of pattern or text is 0 then an |
159 | * U_ILLEGAL_ARGUMENT_ERROR is returned. |
160 | * @stable ICU 2.0 |
161 | */ |
162 | StringSearch(const UnicodeString &pattern, const UnicodeString &text, |
163 | const Locale &locale, |
164 | BreakIterator *breakiter, |
165 | UErrorCode &status); |
166 | |
167 | /** |
168 | * Creating a <tt>StringSearch</tt> instance using the argument collator |
169 | * language rule set. Note, user retains the ownership of this collator, |
170 | * it does not get destroyed during this instance's destruction. |
171 | * @param pattern The text for which this object will search. |
172 | * @param text The text in which to search for the pattern. |
173 | * @param coll A <tt>RuleBasedCollator</tt> object which defines |
174 | * the language-sensitive comparison rules used to |
175 | * determine whether text in the pattern and target |
176 | * matches. User is responsible for the clearing of this |
177 | * object. |
178 | * @param breakiter A <tt>BreakIterator</tt> object used to constrain |
179 | * the matches that are found. Matches whose start and end |
180 | * indices in the target text are not boundaries as |
181 | * determined by the <tt>BreakIterator</tt> are |
182 | * ignored. If this behavior is not desired, |
183 | * <tt>NULL</tt> can be passed in instead. |
184 | * @param status for errors if any. If either the length of pattern or |
185 | * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. |
186 | * @stable ICU 2.0 |
187 | */ |
188 | StringSearch(const UnicodeString &pattern, |
189 | const UnicodeString &text, |
190 | RuleBasedCollator *coll, |
191 | BreakIterator *breakiter, |
192 | UErrorCode &status); |
193 | |
194 | /** |
195 | * Creating a <tt>StringSearch</tt> instance using the argument locale |
196 | * language rule set. A collator will be created in the process, which |
197 | * will be owned by this instance and will be deleted during |
198 | * destruction. |
199 | * <p> |
200 | * Note: No parsing of the text within the <tt>CharacterIterator</tt> |
201 | * will be done during searching for this version. The block of text |
202 | * in <tt>CharacterIterator</tt> will be used as it is. |
203 | * @param pattern The text for which this object will search. |
204 | * @param text The text iterator in which to search for the pattern. |
205 | * @param locale A locale which defines the language-sensitive |
206 | * comparison rules used to determine whether text in the |
207 | * pattern and target matches. User is responsible for |
208 | * the clearing of this object. |
209 | * @param breakiter A <tt>BreakIterator</tt> object used to constrain |
210 | * the matches that are found. Matches whose start and end |
211 | * indices in the target text are not boundaries as |
212 | * determined by the <tt>BreakIterator</tt> are |
213 | * ignored. If this behavior is not desired, |
214 | * <tt>NULL</tt> can be passed in instead. |
215 | * @param status for errors if any. If either the length of pattern or |
216 | * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. |
217 | * @stable ICU 2.0 |
218 | */ |
219 | StringSearch(const UnicodeString &pattern, CharacterIterator &text, |
220 | const Locale &locale, |
221 | BreakIterator *breakiter, |
222 | UErrorCode &status); |
223 | |
224 | /** |
225 | * Creating a <tt>StringSearch</tt> instance using the argument collator |
226 | * language rule set. Note, user retains the ownership of this collator, |
227 | * it does not get destroyed during this instance's destruction. |
228 | * <p> |
229 | * Note: No parsing of the text within the <tt>CharacterIterator</tt> |
230 | * will be done during searching for this version. The block of text |
231 | * in <tt>CharacterIterator</tt> will be used as it is. |
232 | * @param pattern The text for which this object will search. |
233 | * @param text The text iterator in which to search for the pattern. |
234 | * @param coll A <tt>RuleBasedCollator</tt> object which defines |
235 | * the language-sensitive comparison rules used to |
236 | * determine whether text in the pattern and target |
237 | * matches. User is responsible for the clearing of this |
238 | * object. |
239 | * @param breakiter A <tt>BreakIterator</tt> object used to constrain |
240 | * the matches that are found. Matches whose start and end |
241 | * indices in the target text are not boundaries as |
242 | * determined by the <tt>BreakIterator</tt> are |
243 | * ignored. If this behavior is not desired, |
244 | * <tt>NULL</tt> can be passed in instead. |
245 | * @param status for errors if any. If either the length of pattern or |
246 | * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. |
247 | * @stable ICU 2.0 |
248 | */ |
249 | StringSearch(const UnicodeString &pattern, CharacterIterator &text, |
250 | RuleBasedCollator *coll, |
251 | BreakIterator *breakiter, |
252 | UErrorCode &status); |
253 | |
254 | /** |
255 | * Copy constructor that creates a StringSearch instance with the same |
256 | * behavior, and iterating over the same text. |
257 | * @param that StringSearch instance to be copied. |
258 | * @stable ICU 2.0 |
259 | */ |
260 | StringSearch(const StringSearch &that); |
261 | |
262 | /** |
263 | * Destructor. Cleans up the search iterator data struct. |
264 | * If a collator is created in the constructor, it will be destroyed here. |
265 | * @stable ICU 2.0 |
266 | */ |
267 | virtual ~StringSearch(void); |
268 | |
269 | /** |
270 | * Clone this object. |
271 | * Clones can be used concurrently in multiple threads. |
272 | * If an error occurs, then NULL is returned. |
273 | * The caller must delete the clone. |
274 | * |
275 | * @return a clone of this object |
276 | * |
277 | * @see getDynamicClassID |
278 | * @stable ICU 2.8 |
279 | */ |
280 | StringSearch *clone() const; |
281 | |
282 | // operator overloading --------------------------------------------- |
283 | |
284 | /** |
285 | * Assignment operator. Sets this iterator to have the same behavior, |
286 | * and iterate over the same text, as the one passed in. |
287 | * @param that instance to be copied. |
288 | * @stable ICU 2.0 |
289 | */ |
290 | StringSearch & operator=(const StringSearch &that); |
291 | |
292 | /** |
293 | * Equality operator. |
294 | * @param that instance to be compared. |
295 | * @return TRUE if both instances have the same attributes, |
296 | * breakiterators, collators and iterate over the same text |
297 | * while looking for the same pattern. |
298 | * @stable ICU 2.0 |
299 | */ |
300 | virtual UBool operator==(const SearchIterator &that) const; |
301 | |
302 | // public get and set methods ---------------------------------------- |
303 | |
304 | /** |
305 | * Sets the index to point to the given position, and clears any state |
306 | * that's affected. |
307 | * <p> |
308 | * This method takes the argument index and sets the position in the text |
309 | * string accordingly without checking if the index is pointing to a |
310 | * valid starting point to begin searching. |
311 | * @param position within the text to be set. If position is before the |
312 | * start or past the end of the text range for searching, |
313 | * an U_INDEX_OUTOFBOUNDS_ERROR will be returned. |
314 | * @param status for errors if any occur |
315 | * @stable ICU 2.0 |
316 | */ |
317 | virtual void setOffset(int32_t position, UErrorCode &status); |
318 | |
319 | /** |
320 | * Return the current index in the text being searched. |
321 | * If the iteration has gone past the end of the text |
322 | * (or past the beginning for a backwards search), USEARCH_DONE |
323 | * is returned. |
324 | * @return current index in the text being searched. |
325 | * @stable ICU 2.0 |
326 | */ |
327 | virtual int32_t getOffset(void) const; |
328 | |
329 | /** |
330 | * Set the target text to be searched. |
331 | * Text iteration will hence begin at the start of the text string. |
332 | * This method is |
333 | * useful if you want to re-use an iterator to search for the same |
334 | * pattern within a different body of text. |
335 | * @param text text string to be searched |
336 | * @param status for errors if any. If the text length is 0 then an |
337 | * U_ILLEGAL_ARGUMENT_ERROR is returned. |
338 | * @stable ICU 2.0 |
339 | */ |
340 | virtual void setText(const UnicodeString &text, UErrorCode &status); |
341 | |
342 | /** |
343 | * Set the target text to be searched. |
344 | * Text iteration will hence begin at the start of the text string. |
345 | * This method is |
346 | * useful if you want to re-use an iterator to search for the same |
347 | * pattern within a different body of text. |
348 | * Note: No parsing of the text within the <tt>CharacterIterator</tt> |
349 | * will be done during searching for this version. The block of text |
350 | * in <tt>CharacterIterator</tt> will be used as it is. |
351 | * @param text text iterator to be searched |
352 | * @param status for errors if any. If the text length is 0 then an |
353 | * U_ILLEGAL_ARGUMENT_ERROR is returned. |
354 | * @stable ICU 2.0 |
355 | */ |
356 | virtual void setText(CharacterIterator &text, UErrorCode &status); |
357 | |
358 | /** |
359 | * Gets the collator used for the language rules. |
360 | * <p> |
361 | * Caller may modify but <b>must not</b> delete the <tt>RuleBasedCollator</tt>! |
362 | * Modifications to this collator will affect the original collator passed in to |
363 | * the <tt>StringSearch</tt> constructor or to setCollator, if any. |
364 | * @return collator used for string search |
365 | * @stable ICU 2.0 |
366 | */ |
367 | RuleBasedCollator * getCollator() const; |
368 | |
369 | /** |
370 | * Sets the collator used for the language rules. User retains the |
371 | * ownership of this collator, thus the responsibility of deletion lies |
372 | * with the user. The iterator's position will not be changed by this method. |
373 | * @param coll collator |
374 | * @param status for errors if any |
375 | * @stable ICU 2.0 |
376 | */ |
377 | void setCollator(RuleBasedCollator *coll, UErrorCode &status); |
378 | |
379 | /** |
380 | * Sets the pattern used for matching. |
381 | * The iterator's position will not be changed by this method. |
382 | * @param pattern search pattern to be found |
383 | * @param status for errors if any. If the pattern length is 0 then an |
384 | * U_ILLEGAL_ARGUMENT_ERROR is returned. |
385 | * @stable ICU 2.0 |
386 | */ |
387 | void setPattern(const UnicodeString &pattern, UErrorCode &status); |
388 | |
389 | /** |
390 | * Gets the search pattern. |
391 | * @return pattern used for matching |
392 | * @stable ICU 2.0 |
393 | */ |
394 | const UnicodeString & getPattern() const; |
395 | |
396 | // public methods ---------------------------------------------------- |
397 | |
398 | /** |
399 | * Reset the iteration. |
400 | * Search will begin at the start of the text string if a forward |
401 | * iteration is initiated before a backwards iteration. Otherwise if |
402 | * a backwards iteration is initiated before a forwards iteration, the |
403 | * search will begin at the end of the text string. |
404 | * @stable ICU 2.0 |
405 | */ |
406 | virtual void reset(); |
407 | |
408 | /** |
409 | * Returns a copy of StringSearch with the same behavior, and |
410 | * iterating over the same text, as this one. Note that all data will be |
411 | * replicated, except for the user-specified collator and the |
412 | * breakiterator. |
413 | * @return cloned object |
414 | * @stable ICU 2.0 |
415 | */ |
416 | virtual StringSearch * safeClone() const; |
417 | |
418 | /** |
419 | * ICU "poor man's RTTI", returns a UClassID for the actual class. |
420 | * |
421 | * @stable ICU 2.2 |
422 | */ |
423 | virtual UClassID getDynamicClassID() const; |
424 | |
425 | /** |
426 | * ICU "poor man's RTTI", returns a UClassID for this class. |
427 | * |
428 | * @stable ICU 2.2 |
429 | */ |
430 | static UClassID U_EXPORT2 getStaticClassID(); |
431 | |
432 | protected: |
433 | |
434 | // protected method ------------------------------------------------- |
435 | |
436 | /** |
437 | * Search forward for matching text, starting at a given location. |
438 | * Clients should not call this method directly; instead they should |
439 | * call {@link SearchIterator#next }. |
440 | * <p> |
441 | * If a match is found, this method returns the index at which the match |
442 | * starts and calls {@link SearchIterator#setMatchLength } with the number |
443 | * of characters in the target text that make up the match. If no match |
444 | * is found, the method returns <tt>USEARCH_DONE</tt>. |
445 | * <p> |
446 | * The <tt>StringSearch</tt> is adjusted so that its current index |
447 | * (as returned by {@link #getOffset }) is the match position if one was |
448 | * found. |
449 | * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and |
450 | * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE. |
451 | * @param position The index in the target text at which the search |
452 | * starts |
453 | * @param status for errors if any occur |
454 | * @return The index at which the matched text in the target starts, or |
455 | * USEARCH_DONE if no match was found. |
456 | * @stable ICU 2.0 |
457 | */ |
458 | virtual int32_t handleNext(int32_t position, UErrorCode &status); |
459 | |
460 | /** |
461 | * Search backward for matching text, starting at a given location. |
462 | * Clients should not call this method directly; instead they should |
463 | * call {@link SearchIterator#previous }. |
464 | * <p> |
465 | * If a match is found, this method returns the index at which the match |
466 | * starts and calls {@link SearchIterator#setMatchLength } with the number |
467 | * of characters in the target text that make up the match. If no match |
468 | * is found, the method returns <tt>USEARCH_DONE</tt>. |
469 | * <p> |
470 | * The <tt>StringSearch</tt> is adjusted so that its current index |
471 | * (as returned by {@link #getOffset }) is the match position if one was |
472 | * found. |
473 | * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and |
474 | * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE. |
475 | * @param position The index in the target text at which the search |
476 | * starts. |
477 | * @param status for errors if any occur |
478 | * @return The index at which the matched text in the target starts, or |
479 | * USEARCH_DONE if no match was found. |
480 | * @stable ICU 2.0 |
481 | */ |
482 | virtual int32_t handlePrev(int32_t position, UErrorCode &status); |
483 | |
484 | private : |
485 | StringSearch(); // default constructor: declared private and not implemented |
486 | |
487 | // private data members ---------------------------------------------- |
488 | |
489 | /** |
490 | * Pattern text |
491 | * @stable ICU 2.0 |
492 | */ |
493 | UnicodeString m_pattern_; |
494 | /** |
495 | * String search struct data |
496 | * @stable ICU 2.0 |
497 | */ |
498 | UStringSearch *m_strsrch_; |
499 | |
500 | }; |
501 | |
502 | U_NAMESPACE_END |
503 | |
504 | #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ |
505 | |
506 | #endif /* U_SHOW_CPLUSPLUS_API */ |
507 | |
508 | #endif |
509 | |
510 | |