1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 2001-2014 IBM and others. All rights reserved.
6**********************************************************************
7* Date Name Description
8* 03/22/2000 helena Creation.
9**********************************************************************
10*/
11
12#ifndef STSEARCH_H
13#define STSEARCH_H
14
15#include "unicode/utypes.h"
16
17#if U_SHOW_CPLUSPLUS_API
18
19/**
20 * \file
21 * \brief C++ API: Service for searching text based on RuleBasedCollator.
22 */
23
24#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
25
26#include "unicode/tblcoll.h"
27#include "unicode/coleitr.h"
28#include "unicode/search.h"
29
30U_NAMESPACE_BEGIN
31
32/**
33 *
34 * <tt>StringSearch</tt> is a <tt>SearchIterator</tt> that provides
35 * language-sensitive text searching based on the comparison rules defined
36 * in a {@link RuleBasedCollator} object.
37 * StringSearch ensures that language eccentricity can be
38 * handled, e.g. for the German collator, characters &szlig; and SS will be matched
39 * if case is chosen to be ignored.
40 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm">
41 * "ICU Collation Design Document"</a> for more information.
42 * <p>
43 * There are 2 match options for selection:<br>
44 * Let S' be the sub-string of a text string S between the offsets start and
45 * end [start, end].
46 * <br>
47 * A pattern string P matches a text string S at the offsets [start, end]
48 * if
49 * <pre>
50 * option 1. Some canonical equivalent of P matches some canonical equivalent
51 * of S'
52 * option 2. P matches S' and if P starts or ends with a combining mark,
53 * there exists no non-ignorable combining mark before or after S'
54 * in S respectively.
55 * </pre>
56 * Option 2. will be the default.
57 * <p>
58 * This search has APIs similar to that of other text iteration mechanisms
59 * such as the break iterators in <tt>BreakIterator</tt>. Using these
60 * APIs, it is easy to scan through text looking for all occurrences of
61 * a given pattern. This search iterator allows changing of direction by
62 * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>.
63 * Though a direction change can occur without calling <tt>reset</tt> first,
64 * this operation comes with some speed penalty.
65 * Match results in the forward direction will match the result matches in
66 * the backwards direction in the reverse order.
67 * <p>
68 * <tt>SearchIterator</tt> provides APIs to specify the starting position
69 * within the text string to be searched, e.g. <tt>setOffset</tt>,
70 * <tt>preceding</tt> and <tt>following</tt>. Since the
71 * starting position will be set as it is specified, please take note that
72 * there are some danger points which the search may render incorrect
73 * results:
74 * <ul>
75 * <li> The midst of a substring that requires normalization.
76 * <li> If the following match is to be found, the position should not be the
77 * second character which requires to be swapped with the preceding
78 * character. Vice versa, if the preceding match is to be found,
79 * position to search from should not be the first character which
80 * requires to be swapped with the next character. E.g certain Thai and
81 * Lao characters require swapping.
82 * <li> If a following pattern match is to be found, any position within a
83 * contracting sequence except the first will fail. Vice versa if a
84 * preceding pattern match is to be found, an invalid starting point
85 * would be any character within a contracting sequence except the last.
86 * </ul>
87 * <p>
88 * A <tt>BreakIterator</tt> can be used if only matches at logical breaks are desired.
89 * Using a <tt>BreakIterator</tt> will only give you results that exactly match the
90 * boundaries given by the break iterator. For instance the pattern "e" will
91 * not be found in the string "\u00e9" if a character break iterator is used.
92 * <p>
93 * Options are provided to handle overlapping matches.
94 * E.g. In English, overlapping matches produce the result 0 and 2
95 * for the pattern "abab" in the text "ababab", whereas mutually
96 * exclusive matches only produce the result of 0.
97 * <p>
98 * Though collator attributes will be taken into consideration while
99 * performing matches, there are no APIs here for setting and getting the
100 * attributes. These attributes can be set by getting the collator
101 * from <tt>getCollator</tt> and using the APIs in <tt>coll.h</tt>.
102 * Lastly to update <tt>StringSearch</tt> to the new collator attributes,
103 * <tt>reset</tt> has to be called.
104 * <p>
105 * Restriction: <br>
106 * Currently there are no composite characters that consist of a
107 * character with combining class > 0 before a character with combining
108 * class == 0. However, if such a character exists in the future,
109 * <tt>StringSearch</tt> does not guarantee the results for option 1.
110 * <p>
111 * Consult the <tt>SearchIterator</tt> documentation for information on
112 * and examples of how to use instances of this class to implement text
113 * searching.
114 * <pre><code>
115 * UnicodeString target("The quick brown fox jumps over the lazy dog.");
116 * UnicodeString pattern("fox");
117 *
118 * UErrorCode error = U_ZERO_ERROR;
119 * StringSearch iter(pattern, target, Locale::getUS(), NULL, error);
120 * for (int pos = iter.first(error);
121 * pos != USEARCH_DONE;
122 * pos = iter.next(error))
123 * {
124 * printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength());
125 * }
126 * </code></pre>
127 * <p>
128 * Note, <tt>StringSearch</tt> is not to be subclassed.
129 * </p>
130 * @see SearchIterator
131 * @see RuleBasedCollator
132 * @since ICU 2.0
133 */
134
135class U_I18N_API StringSearch U_FINAL : public SearchIterator
136{
137public:
138
139 // public constructors and destructors --------------------------------
140
141 /**
142 * Creating a <tt>StringSearch</tt> instance using the argument locale
143 * language rule set. A collator will be created in the process, which
144 * will be owned by this instance and will be deleted during
145 * destruction.
146 * @param pattern The text for which this object will search.
147 * @param text The text in which to search for the pattern.
148 * @param locale A locale which defines the language-sensitive
149 * comparison rules used to determine whether text in the
150 * pattern and target matches.
151 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
152 * the matches that are found. Matches whose start and end
153 * indices in the target text are not boundaries as
154 * determined by the <tt>BreakIterator</tt> are
155 * ignored. If this behavior is not desired,
156 * <tt>NULL</tt> can be passed in instead.
157 * @param status for errors if any. If pattern or text is NULL, or if
158 * either the length of pattern or text is 0 then an
159 * U_ILLEGAL_ARGUMENT_ERROR is returned.
160 * @stable ICU 2.0
161 */
162 StringSearch(const UnicodeString &pattern, const UnicodeString &text,
163 const Locale &locale,
164 BreakIterator *breakiter,
165 UErrorCode &status);
166
167 /**
168 * Creating a <tt>StringSearch</tt> instance using the argument collator
169 * language rule set. Note, user retains the ownership of this collator,
170 * it does not get destroyed during this instance's destruction.
171 * @param pattern The text for which this object will search.
172 * @param text The text in which to search for the pattern.
173 * @param coll A <tt>RuleBasedCollator</tt> object which defines
174 * the language-sensitive comparison rules used to
175 * determine whether text in the pattern and target
176 * matches. User is responsible for the clearing of this
177 * object.
178 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
179 * the matches that are found. Matches whose start and end
180 * indices in the target text are not boundaries as
181 * determined by the <tt>BreakIterator</tt> are
182 * ignored. If this behavior is not desired,
183 * <tt>NULL</tt> can be passed in instead.
184 * @param status for errors if any. If either the length of pattern or
185 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
186 * @stable ICU 2.0
187 */
188 StringSearch(const UnicodeString &pattern,
189 const UnicodeString &text,
190 RuleBasedCollator *coll,
191 BreakIterator *breakiter,
192 UErrorCode &status);
193
194 /**
195 * Creating a <tt>StringSearch</tt> instance using the argument locale
196 * language rule set. A collator will be created in the process, which
197 * will be owned by this instance and will be deleted during
198 * destruction.
199 * <p>
200 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
201 * will be done during searching for this version. The block of text
202 * in <tt>CharacterIterator</tt> will be used as it is.
203 * @param pattern The text for which this object will search.
204 * @param text The text iterator in which to search for the pattern.
205 * @param locale A locale which defines the language-sensitive
206 * comparison rules used to determine whether text in the
207 * pattern and target matches. User is responsible for
208 * the clearing of this object.
209 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
210 * the matches that are found. Matches whose start and end
211 * indices in the target text are not boundaries as
212 * determined by the <tt>BreakIterator</tt> are
213 * ignored. If this behavior is not desired,
214 * <tt>NULL</tt> can be passed in instead.
215 * @param status for errors if any. If either the length of pattern or
216 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
217 * @stable ICU 2.0
218 */
219 StringSearch(const UnicodeString &pattern, CharacterIterator &text,
220 const Locale &locale,
221 BreakIterator *breakiter,
222 UErrorCode &status);
223
224 /**
225 * Creating a <tt>StringSearch</tt> instance using the argument collator
226 * language rule set. Note, user retains the ownership of this collator,
227 * it does not get destroyed during this instance's destruction.
228 * <p>
229 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
230 * will be done during searching for this version. The block of text
231 * in <tt>CharacterIterator</tt> will be used as it is.
232 * @param pattern The text for which this object will search.
233 * @param text The text iterator in which to search for the pattern.
234 * @param coll A <tt>RuleBasedCollator</tt> object which defines
235 * the language-sensitive comparison rules used to
236 * determine whether text in the pattern and target
237 * matches. User is responsible for the clearing of this
238 * object.
239 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
240 * the matches that are found. Matches whose start and end
241 * indices in the target text are not boundaries as
242 * determined by the <tt>BreakIterator</tt> are
243 * ignored. If this behavior is not desired,
244 * <tt>NULL</tt> can be passed in instead.
245 * @param status for errors if any. If either the length of pattern or
246 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
247 * @stable ICU 2.0
248 */
249 StringSearch(const UnicodeString &pattern, CharacterIterator &text,
250 RuleBasedCollator *coll,
251 BreakIterator *breakiter,
252 UErrorCode &status);
253
254 /**
255 * Copy constructor that creates a StringSearch instance with the same
256 * behavior, and iterating over the same text.
257 * @param that StringSearch instance to be copied.
258 * @stable ICU 2.0
259 */
260 StringSearch(const StringSearch &that);
261
262 /**
263 * Destructor. Cleans up the search iterator data struct.
264 * If a collator is created in the constructor, it will be destroyed here.
265 * @stable ICU 2.0
266 */
267 virtual ~StringSearch(void);
268
269 /**
270 * Clone this object.
271 * Clones can be used concurrently in multiple threads.
272 * If an error occurs, then NULL is returned.
273 * The caller must delete the clone.
274 *
275 * @return a clone of this object
276 *
277 * @see getDynamicClassID
278 * @stable ICU 2.8
279 */
280 StringSearch *clone() const;
281
282 // operator overloading ---------------------------------------------
283
284 /**
285 * Assignment operator. Sets this iterator to have the same behavior,
286 * and iterate over the same text, as the one passed in.
287 * @param that instance to be copied.
288 * @stable ICU 2.0
289 */
290 StringSearch & operator=(const StringSearch &that);
291
292 /**
293 * Equality operator.
294 * @param that instance to be compared.
295 * @return TRUE if both instances have the same attributes,
296 * breakiterators, collators and iterate over the same text
297 * while looking for the same pattern.
298 * @stable ICU 2.0
299 */
300 virtual UBool operator==(const SearchIterator &that) const;
301
302 // public get and set methods ----------------------------------------
303
304 /**
305 * Sets the index to point to the given position, and clears any state
306 * that's affected.
307 * <p>
308 * This method takes the argument index and sets the position in the text
309 * string accordingly without checking if the index is pointing to a
310 * valid starting point to begin searching.
311 * @param position within the text to be set. If position is less
312 * than or greater than the text range for searching,
313 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
314 * @param status for errors if it occurs
315 * @stable ICU 2.0
316 */
317 virtual void setOffset(int32_t position, UErrorCode &status);
318
319 /**
320 * Return the current index in the text being searched.
321 * If the iteration has gone past the end of the text
322 * (or past the beginning for a backwards search), USEARCH_DONE
323 * is returned.
324 * @return current index in the text being searched.
325 * @stable ICU 2.0
326 */
327 virtual int32_t getOffset(void) const;
328
329 /**
330 * Set the target text to be searched.
331 * Text iteration will hence begin at the start of the text string.
332 * This method is
333 * useful if you want to re-use an iterator to search for the same
334 * pattern within a different body of text.
335 * @param text text string to be searched
336 * @param status for errors if any. If the text length is 0 then an
337 * U_ILLEGAL_ARGUMENT_ERROR is returned.
338 * @stable ICU 2.0
339 */
340 virtual void setText(const UnicodeString &text, UErrorCode &status);
341
342 /**
343 * Set the target text to be searched.
344 * Text iteration will hence begin at the start of the text string.
345 * This method is
346 * useful if you want to re-use an iterator to search for the same
347 * pattern within a different body of text.
348 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
349 * will be done during searching for this version. The block of text
350 * in <tt>CharacterIterator</tt> will be used as it is.
351 * @param text text iterator to be searched
352 * @param status for errors if any. If the text length is 0 then an
353 * U_ILLEGAL_ARGUMENT_ERROR is returned.
354 * @stable ICU 2.0
355 */
356 virtual void setText(CharacterIterator &text, UErrorCode &status);
357
358 /**
359 * Gets the collator used for the language rules.
360 * <p>
361 * Caller may modify but <b>must not</b> delete the <tt>RuleBasedCollator</tt>!
362 * Modifications to this collator will affect the original collator passed in to
363 * the <tt>StringSearch</tt> constructor or to setCollator, if any.
364 * @return collator used for string search
365 * @stable ICU 2.0
366 */
367 RuleBasedCollator * getCollator() const;
368
369 /**
370 * Sets the collator used for the language rules. User retains the
371 * ownership of this collator, thus the responsibility of deletion lies
372 * with the user. The iterator's position will not be changed by this method.
373 * @param coll collator
374 * @param status for errors if any
375 * @stable ICU 2.0
376 */
377 void setCollator(RuleBasedCollator *coll, UErrorCode &status);
378
379 /**
380 * Sets the pattern used for matching.
381 * The iterator's position will not be changed by this method.
382 * @param pattern search pattern to be found
383 * @param status for errors if any. If the pattern length is 0 then an
384 * U_ILLEGAL_ARGUMENT_ERROR is returned.
385 * @stable ICU 2.0
386 */
387 void setPattern(const UnicodeString &pattern, UErrorCode &status);
388
389 /**
390 * Gets the search pattern.
391 * @return pattern used for matching
392 * @stable ICU 2.0
393 */
394 const UnicodeString & getPattern() const;
395
396 // public methods ----------------------------------------------------
397
398 /**
399 * Reset the iteration.
400 * Search will begin at the start of the text string if a forward
401 * iteration is initiated before a backwards iteration. Otherwise if
402 * a backwards iteration is initiated before a forwards iteration, the
403 * search will begin at the end of the text string.
404 * @stable ICU 2.0
405 */
406 virtual void reset();
407
408 /**
409 * Returns a copy of StringSearch with the same behavior, and
410 * iterating over the same text, as this one. Note that all data will be
411 * replicated, except for the user-specified collator and the
412 * breakiterator.
413 * @return cloned object
414 * @stable ICU 2.0
415 */
416 virtual StringSearch * safeClone() const;
417
418 /**
419 * ICU "poor man's RTTI", returns a UClassID for the actual class.
420 *
421 * @stable ICU 2.2
422 */
423 virtual UClassID getDynamicClassID() const;
424
425 /**
426 * ICU "poor man's RTTI", returns a UClassID for this class.
427 *
428 * @stable ICU 2.2
429 */
430 static UClassID U_EXPORT2 getStaticClassID();
431
432protected:
433
434 // protected method -------------------------------------------------
435
436 /**
437 * Search forward for matching text, starting at a given location.
438 * Clients should not call this method directly; instead they should
439 * call {@link SearchIterator#next }.
440 * <p>
441 * If a match is found, this method returns the index at which the match
442 * starts and calls {@link SearchIterator#setMatchLength } with the number
443 * of characters in the target text that make up the match. If no match
444 * is found, the method returns <tt>USEARCH_DONE</tt>.
445 * <p>
446 * The <tt>StringSearch</tt> is adjusted so that its current index
447 * (as returned by {@link #getOffset }) is the match position if one was
448 * found.
449 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
450 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE.
451 * @param position The index in the target text at which the search
452 * starts
453 * @param status for errors if any occurs
454 * @return The index at which the matched text in the target starts, or
455 * USEARCH_DONE if no match was found.
456 * @stable ICU 2.0
457 */
458 virtual int32_t handleNext(int32_t position, UErrorCode &status);
459
460 /**
461 * Search backward for matching text, starting at a given location.
462 * Clients should not call this method directly; instead they should call
463 * {@link SearchIterator#previous }.
464 * <p>
465 * If a match is found, this method returns the index at which the match
466 * starts and calls {@link SearchIterator#setMatchLength } with the number
467 * of characters in the target text that make up the match. If no match
468 * is found, the method returns <tt>USEARCH_DONE</tt>.
469 * <p>
470 * The <tt>StringSearch</tt> is adjusted so that its current index
471 * (as returned by {@link #getOffset }) is the match position if one was
472 * found.
473 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
474 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE.
475 * @param position The index in the target text at which the search
476 * starts.
477 * @param status for errors if any occurs
478 * @return The index at which the matched text in the target starts, or
479 * USEARCH_DONE if no match was found.
480 * @stable ICU 2.0
481 */
482 virtual int32_t handlePrev(int32_t position, UErrorCode &status);
483
484private :
485 StringSearch(); // default constructor: declared private and not implemented
486
487 // private data members ----------------------------------------------
488
489 /**
490 * Pattern text
491 * @stable ICU 2.0
492 */
493 UnicodeString m_pattern_;
494 /**
495 * String search struct data
496 * @stable ICU 2.0
497 */
498 UStringSearch *m_strsrch_;
499
500};
501
502U_NAMESPACE_END
503
504#endif /* #if !UCONFIG_NO_COLLATION */
505
506#endif /* U_SHOW_CPLUSPLUS_API */
507
508#endif
509
510