| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | *************************************************************************** | 
|---|
| 5 | *   Copyright (C) 1999-2016 International Business Machines Corporation   * | 
|---|
| 6 | *   and others. All rights reserved.                                      * | 
|---|
| 7 | *************************************************************************** | 
|---|
| 8 |  | 
|---|
| 9 | ********************************************************************** | 
|---|
| 10 | *   Date        Name        Description | 
|---|
| 11 | *   10/22/99    alan        Creation. | 
|---|
| 12 | *   11/11/99    rgillam     Complete port from Java. | 
|---|
| 13 | ********************************************************************** | 
|---|
| 14 | */ | 
|---|
| 15 |  | 
|---|
| 16 | #ifndef RBBI_H | 
|---|
| 17 | #define RBBI_H | 
|---|
| 18 |  | 
|---|
| 19 | #include "unicode/utypes.h" | 
|---|
| 20 |  | 
|---|
| 21 | #if U_SHOW_CPLUSPLUS_API | 
|---|
| 22 |  | 
|---|
| 23 | /** | 
|---|
| 24 | * \file | 
|---|
| 25 | * \brief C++ API: Rule Based Break Iterator | 
|---|
| 26 | */ | 
|---|
| 27 |  | 
|---|
| 28 | #if !UCONFIG_NO_BREAK_ITERATION | 
|---|
| 29 |  | 
|---|
| 30 | #include "unicode/brkiter.h" | 
|---|
| 31 | #include "unicode/udata.h" | 
|---|
| 32 | #include "unicode/parseerr.h" | 
|---|
| 33 | #include "unicode/schriter.h" | 
|---|
| 34 |  | 
|---|
| 35 | U_NAMESPACE_BEGIN | 
|---|
| 36 |  | 
|---|
| 37 | /** @internal */ | 
|---|
| 38 | class  LanguageBreakEngine; | 
|---|
| 39 | struct RBBIDataHeader;  // type of the flattened RBBI data taken by the private constructor | 
|---|
| 40 | class  RBBIDataWrapper; | 
|---|
| 41 | class  UnhandledEngine; | 
|---|
| 42 | class  UStack; | 
|---|
| 43 |  | 
|---|
| 44 | /** | 
|---|
| 45 | * | 
|---|
| 46 | * A subclass of BreakIterator whose behavior is specified using a list of rules. | 
|---|
| 47 | * <p>Instances of this class are most commonly created by the factory methods of | 
|---|
| 48 | *  BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc., | 
|---|
| 49 | *  and then used via the abstract API in class BreakIterator</p> | 
|---|
| 50 | * | 
|---|
| 51 | * <p>See the ICU User Guide for information on Break Iterator Rules.</p> | 
|---|
| 52 | * | 
|---|
| 53 | * <p>This class is not intended to be subclassed.</p> | 
|---|
| 54 | */ | 
|---|
| 55 | class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator { | 
|---|
| 56 |  | 
|---|
| 57 | private: | 
|---|
| 58 | /** | 
|---|
| 59 | * The UText through which this BreakIterator accesses the text | 
|---|
| 60 | * @internal (private) | 
|---|
| 61 | */ | 
|---|
| 62 | UText  fText; | 
|---|
| 63 |  | 
|---|
| 64 | #ifndef U_HIDE_INTERNAL_API | 
|---|
| 65 | public: | 
|---|
| 66 | #endif /* U_HIDE_INTERNAL_API */ | 
|---|
| 67 | /** | 
|---|
| 68 | * The rule data for this BreakIterator instance. | 
|---|
| 69 | * Not for general use; Public only for testing purposes. | 
|---|
| 70 | * @internal | 
|---|
| 71 | */ | 
|---|
| 72 | RBBIDataWrapper    *fData; | 
|---|
| 73 | private: | 
|---|
| 74 |  | 
|---|
| 75 | /** | 
|---|
| 76 | * The current  position of the iterator. Pinned, 0 <= fPosition <= text.length. | 
|---|
| 77 | * Never has the value UBRK_DONE (-1). | 
|---|
| 78 | */ | 
|---|
| 79 | int32_t         fPosition; | 
|---|
| 80 |  | 
|---|
| 81 | /** | 
|---|
| 82 | * TODO: document. Presumably the index of the rule status value(s) for the current boundary; confirm. | 
|---|
| 83 | */ | 
|---|
| 84 | int32_t         fRuleStatusIndex; | 
|---|
| 85 |  | 
|---|
| 86 | /** | 
|---|
| 87 | *   Cache of previously determined boundary positions. | 
|---|
| 88 | */ | 
|---|
| 89 | class BreakCache; | 
|---|
| 90 | BreakCache         *fBreakCache; | 
|---|
| 91 |  | 
|---|
| 92 | /** | 
|---|
| 93 | *  Cache of boundary positions within a region of text that has been | 
|---|
| 94 | *  sub-divided by dictionary based breaking. | 
|---|
| 95 | */ | 
|---|
| 96 | class DictionaryCache; | 
|---|
| 97 | DictionaryCache *fDictionaryCache; | 
|---|
| 98 |  | 
|---|
| 99 | /** | 
|---|
| 100 | * | 
|---|
| 101 | * If present, UStack of LanguageBreakEngine objects that might handle | 
|---|
| 102 | * dictionary characters. Searched from top to bottom to find an object to | 
|---|
| 103 | * handle a given character. | 
|---|
| 104 | * @internal (private) | 
|---|
| 105 | */ | 
|---|
| 106 | UStack              *fLanguageBreakEngines; | 
|---|
| 107 |  | 
|---|
| 108 | /** | 
|---|
| 109 | * | 
|---|
| 110 | * If present, the special LanguageBreakEngine used for handling | 
|---|
| 111 | * characters that are in the dictionary set, but not handled by any | 
|---|
| 112 | * LanguageBreakEngine. | 
|---|
| 113 | * @internal (private) | 
|---|
| 114 | */ | 
|---|
| 115 | UnhandledEngine     *fUnhandledBreakEngine; | 
|---|
| 116 |  | 
|---|
| 117 | /** | 
|---|
| 118 | * Counter for the number of characters encountered with the "dictionary" | 
|---|
| 119 | *   flag set. | 
|---|
| 120 | * @internal (private) | 
|---|
| 121 | */ | 
|---|
| 122 | uint32_t            fDictionaryCharCount; | 
|---|
| 123 |  | 
|---|
| 124 | /** | 
|---|
| 125 | *   A character iterator that refers to the same text as the UText, above. | 
|---|
| 126 | *   Only included for compatibility with old API, which was based on CharacterIterators. | 
|---|
| 127 | *   Value may be adopted from outside, or one of fSCharIter or fDCharIter, below. | 
|---|
| 128 | */ | 
|---|
| 129 | CharacterIterator  *fCharIter; | 
|---|
| 130 |  | 
|---|
| 131 | /** | 
|---|
| 132 | *   When the input text is provided by a UnicodeString, this will point to | 
|---|
| 133 | *    a characterIterator that wraps that data.  Needed only for the | 
|---|
| 134 | *    implementation of getText(), a backwards compatibility issue. | 
|---|
| 135 | */ | 
|---|
| 136 | StringCharacterIterator fSCharIter; | 
|---|
| 137 |  | 
|---|
| 138 | /** | 
|---|
| 139 | * True when iteration has run off the end, and iterator functions should return UBRK_DONE. | 
|---|
| 140 | */ | 
|---|
| 141 | UBool           fDone; | 
|---|
| 142 |  | 
|---|
| 143 | //======================================================================= | 
|---|
| 144 | // constructors | 
|---|
| 145 | //======================================================================= | 
|---|
| 146 |  | 
|---|
| 147 | /** | 
|---|
| 148 | * Constructor from a flattened set of RBBI data in malloced memory. | 
|---|
| 149 | *             RuleBasedBreakIterators built from a custom set of rules | 
|---|
| 150 | *             are created via this constructor; the rules are compiled | 
|---|
| 151 | *             into memory, then the break iterator is constructed here. | 
|---|
| 152 | * | 
|---|
| 153 | *             The break iterator adopts the memory, and will | 
|---|
| 154 | *             free it when done. | 
|---|
| 155 | * @internal (private) | 
|---|
| 156 | */ | 
|---|
| 157 | RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); | 
|---|
| 158 |  | 
|---|
| 159 | /** @internal */ | 
|---|
| 160 | friend class RBBIRuleBuilder; | 
|---|
| 161 | /** @internal */ | 
|---|
| 162 | friend class BreakIterator; | 
|---|
| 163 |  | 
|---|
| 164 | public: | 
|---|
| 165 |  | 
|---|
| 166 | /** Default constructor.  Creates an empty shell of an iterator, with no | 
|---|
| 167 | *  rules or text to iterate over.   Object can subsequently be assigned to. | 
|---|
| 168 | *  @stable ICU 2.2 | 
|---|
| 169 | */ | 
|---|
| 170 | RuleBasedBreakIterator(); | 
|---|
| 171 |  | 
|---|
| 172 | /** | 
|---|
| 173 | * Copy constructor.  Will produce a break iterator with the same behavior, | 
|---|
| 174 | * and which iterates over the same text, as the one passed in. | 
|---|
| 175 | * @param that The RuleBasedBreakIterator passed to be copied | 
|---|
| 176 | * @stable ICU 2.0 | 
|---|
| 177 | */ | 
|---|
| 178 | RuleBasedBreakIterator(const RuleBasedBreakIterator& that); | 
|---|
| 179 |  | 
|---|
| 180 | /** | 
|---|
| 181 | * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. | 
|---|
| 182 | * @param rules The break rules to be used. | 
|---|
| 183 | * @param parseError  In the event of a syntax error in the rules, provides the location | 
|---|
| 184 | *                    within the rules of the problem. | 
|---|
| 185 | * @param status Information on any errors encountered. | 
|---|
| 186 | * @stable ICU 2.2 | 
|---|
| 187 | */ | 
|---|
| 188 | RuleBasedBreakIterator( const UnicodeString    &rules, | 
|---|
| 189 | UParseError           &parseError, | 
|---|
| 190 | UErrorCode            &status); | 
|---|
| 191 |  | 
|---|
| 192 | /** | 
|---|
| 193 | * Construct a RuleBasedBreakIterator from a set of precompiled binary rules. | 
|---|
| 194 | * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules(). | 
|---|
| 195 | * Construction of a break iterator in this way is substantially faster than | 
|---|
| 196 | * construction from source rules. | 
|---|
| 197 | * | 
|---|
| 198 | * Ownership of the storage containing the compiled rules remains with the | 
|---|
| 199 | * caller of this function.  The compiled rules must not be  modified or | 
|---|
| 200 | * deleted during the life of the break iterator. | 
|---|
| 201 | * | 
|---|
| 202 | * The compiled rules are not compatible across different major versions of ICU. | 
|---|
| 203 | * The compiled rules are compatible only between machines with the same | 
|---|
| 204 | * byte ordering (little or big endian) and the same base character set family | 
|---|
| 205 | * (ASCII or EBCDIC). | 
|---|
| 206 | * | 
|---|
| 207 | * @see #getBinaryRules | 
|---|
| 208 | * @param compiledRules A pointer to the compiled break rules to be used. | 
|---|
| 209 | * @param ruleLength The length of the compiled break rules, in bytes.  This | 
|---|
| 210 | *   corresponds to the length value produced by getBinaryRules(). | 
|---|
| 211 | * @param status Information on any errors encountered, including invalid | 
|---|
| 212 | *   binary rules. | 
|---|
| 213 | * @stable ICU 4.8 | 
|---|
| 214 | */ | 
|---|
| 215 | RuleBasedBreakIterator(const uint8_t *compiledRules, | 
|---|
| 216 | uint32_t       ruleLength, | 
|---|
| 217 | UErrorCode    &status); | 
|---|
| 218 |  | 
|---|
| 219 | /** | 
|---|
| 220 | * This constructor uses the udata interface to create a BreakIterator | 
|---|
| 221 | * whose internal tables live in a memory-mapped file.  "image" is an | 
|---|
| 222 | * ICU UDataMemory handle for the pre-compiled break iterator tables. | 
|---|
| 223 | * @param image handle to the memory image for the break iterator data. | 
|---|
| 224 | *        Ownership of the UDataMemory handle passes to the Break Iterator, | 
|---|
| 225 | *        which will be responsible for closing it when it is no longer needed. | 
|---|
| 226 | * @param status Information on any errors encountered. | 
|---|
| 227 | * @see udata_open | 
|---|
| 228 | * @see #getBinaryRules | 
|---|
| 229 | * @stable ICU 2.8 | 
|---|
| 230 | */ | 
|---|
| 231 | RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status); | 
|---|
| 232 |  | 
|---|
| 233 | /** | 
|---|
| 234 | * Destructor | 
|---|
| 235 | *  @stable ICU 2.0 | 
|---|
| 236 | */ | 
|---|
| 237 | virtual ~RuleBasedBreakIterator(); | 
|---|
| 238 |  | 
|---|
| 239 | /** | 
|---|
| 240 | * Assignment operator.  Sets this iterator to have the same behavior, | 
|---|
| 241 | * and iterate over the same text, as the one passed in. | 
|---|
| 242 | * @param that The RuleBasedBreakIterator passed in | 
|---|
| 243 | * @return the newly created RuleBasedBreakIterator | 
|---|
| 244 | *  @stable ICU 2.0 | 
|---|
| 245 | */ | 
|---|
| 246 | RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); | 
|---|
| 247 |  | 
|---|
| 248 | /** | 
|---|
| 249 | * Equality operator.  Returns TRUE if both BreakIterators are of the | 
|---|
| 250 | * same class, have the same behavior, and iterate over the same text. | 
|---|
| 251 | * @param that The BreakIterator to be compared for equality | 
|---|
| 252 | * @return TRUE if both BreakIterators are of the | 
|---|
| 253 | * same class, have the same behavior, and iterate over the same text. | 
|---|
| 254 | *  @stable ICU 2.0 | 
|---|
| 255 | */ | 
|---|
| 256 | virtual UBool operator==(const BreakIterator& that) const; | 
|---|
| 257 |  | 
|---|
| 258 | /** | 
|---|
| 259 | * Not-equal operator.  If operator== returns TRUE, this returns FALSE, | 
|---|
| 260 | * and vice versa. | 
|---|
| 261 | * @param that The BreakIterator to be compared for inequality | 
|---|
| 262 | * @return TRUE if both BreakIterators are not the same. | 
|---|
| 263 | *  @stable ICU 2.0 | 
|---|
| 264 | */ | 
|---|
| 265 | inline UBool operator!=(const BreakIterator& that) const; | 
|---|
| 266 |  | 
|---|
| 267 | /** | 
|---|
| 268 | * Returns a newly-constructed RuleBasedBreakIterator with the same | 
|---|
| 269 | * behavior, and iterating over the same text, as this one. | 
|---|
| 270 | * Differs from the copy constructor in that it is polymorphic, and | 
|---|
| 271 | * will correctly clone (copy) a derived class. | 
|---|
| 272 | * clone() is thread safe.  Multiple threads may simultaneously | 
|---|
| 273 | * clone the same source break iterator. | 
|---|
| 274 | * @return a newly-constructed RuleBasedBreakIterator | 
|---|
| 275 | * @stable ICU 2.0 | 
|---|
| 276 | */ | 
|---|
| 277 | virtual RuleBasedBreakIterator* clone() const; | 
|---|
| 278 |  | 
|---|
| 279 | /** | 
|---|
| 280 | * Compute a hash code for this BreakIterator | 
|---|
| 281 | * @return A hash code | 
|---|
| 282 | *  @stable ICU 2.0 | 
|---|
| 283 | */ | 
|---|
| 284 | virtual int32_t hashCode(void) const; | 
|---|
| 285 |  | 
|---|
| 286 | /** | 
|---|
| 287 | * Returns the description used to create this iterator | 
|---|
| 288 | * @return the description used to create this iterator | 
|---|
| 289 | *  @stable ICU 2.0 | 
|---|
| 290 | */ | 
|---|
| 291 | virtual const UnicodeString& getRules(void) const; | 
|---|
| 292 |  | 
|---|
| 293 | //======================================================================= | 
|---|
| 294 | // BreakIterator overrides | 
|---|
| 295 | //======================================================================= | 
|---|
| 296 |  | 
|---|
| 297 | /** | 
|---|
| 298 | * <p> | 
|---|
| 299 | * Return a CharacterIterator over the text being analyzed. | 
|---|
| 300 | * The returned character iterator is owned by the break iterator, and must | 
|---|
| 301 | * not be deleted by the caller.  Repeated calls to this function may | 
|---|
| 302 | * return the same CharacterIterator. | 
|---|
| 303 | * </p> | 
|---|
| 304 | * <p> | 
|---|
| 305 | * The returned character iterator must not be used concurrently with | 
|---|
| 306 | * the break iterator.  If concurrent operation is needed, clone the | 
|---|
| 307 | * returned character iterator first and operate on the clone. | 
|---|
| 308 | * </p> | 
|---|
| 309 | * <p> | 
|---|
| 310 | * When the break iterator is operating on text supplied via a UText, | 
|---|
| 311 | * this function will fail.  Lacking any way to signal failures, it | 
|---|
| 312 | * returns a CharacterIterator containing no text. | 
|---|
| 313 | * The function getUText() provides similar functionality, | 
|---|
| 314 | * is reliable, and is more efficient. | 
|---|
| 315 | * </p> | 
|---|
| 316 | * | 
|---|
| 317 | * TODO:  deprecate this function? | 
|---|
| 318 | * | 
|---|
| 319 | * @return An iterator over the text being analyzed. | 
|---|
| 320 | * @stable ICU 2.0 | 
|---|
| 321 | */ | 
|---|
| 322 | virtual  CharacterIterator& getText(void) const; | 
|---|
| 323 |  | 
|---|
| 324 |  | 
|---|
| 325 | /** | 
|---|
| 326 | *  Get a UText for the text being analyzed. | 
|---|
| 327 | *  The returned UText is a shallow clone of the UText used internally | 
|---|
| 328 | *  by the break iterator implementation.  It can safely be used to | 
|---|
| 329 | *  access the text without impacting any break iterator operations, | 
|---|
| 330 | *  but the underlying text itself must not be altered. | 
|---|
| 331 | * | 
|---|
| 332 | * @param fillIn A UText to be filled in.  If NULL, a new UText will be | 
|---|
| 333 | *           allocated to hold the result. | 
|---|
| 334 | * @param status receives any error codes. | 
|---|
| 335 | * @return   The current UText for this break iterator.  If an input | 
|---|
| 336 | *           UText was provided, it will always be returned. | 
|---|
| 337 | * @stable ICU 3.4 | 
|---|
| 338 | */ | 
|---|
| 339 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const; | 
|---|
| 340 |  | 
|---|
| 341 | /** | 
|---|
| 342 | * Set the iterator to analyze a new piece of text.  This function resets | 
|---|
| 343 | * the current iteration position to the beginning of the text. | 
|---|
| 344 | * @param newText An iterator over the text to analyze.  The BreakIterator | 
|---|
| 345 | * takes ownership of the character iterator.  The caller MUST NOT delete it! | 
|---|
| 346 | *  @stable ICU 2.0 | 
|---|
| 347 | */ | 
|---|
| 348 | virtual void adoptText(CharacterIterator* newText); | 
|---|
| 349 |  | 
|---|
| 350 | /** | 
|---|
| 351 | * Set the iterator to analyze a new piece of text.  This function resets | 
|---|
| 352 | * the current iteration position to the beginning of the text. | 
|---|
| 353 | * | 
|---|
| 354 | * The BreakIterator will retain a reference to the supplied string. | 
|---|
| 355 | * The caller must not modify or delete the text while the BreakIterator | 
|---|
| 356 | * retains the reference. | 
|---|
| 357 | * | 
|---|
| 358 | * @param newText The text to analyze. | 
|---|
| 359 | *  @stable ICU 2.0 | 
|---|
| 360 | */ | 
|---|
| 361 | virtual void setText(const UnicodeString& newText); | 
|---|
| 362 |  | 
|---|
| 363 | /** | 
|---|
| 364 | * Reset the break iterator to operate over the text represented by | 
|---|
| 365 | * the UText.  The iterator position is reset to the start. | 
|---|
| 366 | * | 
|---|
| 367 | * This function makes a shallow clone of the supplied UText.  This means | 
|---|
| 368 | * that the caller is free to immediately close or otherwise reuse the | 
|---|
| 369 | * UText that was passed as a parameter, but that the underlying text itself | 
|---|
| 370 | * must not be altered while being referenced by the break iterator. | 
|---|
| 371 | * | 
|---|
| 372 | * @param text    The UText used to change the text. | 
|---|
| 373 | * @param status  Receives any error codes. | 
|---|
| 374 | * @stable ICU 3.4 | 
|---|
| 375 | */ | 
|---|
| 376 | virtual void  setText(UText *text, UErrorCode &status); | 
|---|
| 377 |  | 
|---|
| 378 | /** | 
|---|
| 379 | * Sets the current iteration position to the beginning of the text, position zero. | 
|---|
| 380 | * @return The offset of the beginning of the text, zero. | 
|---|
| 381 | *  @stable ICU 2.0 | 
|---|
| 382 | */ | 
|---|
| 383 | virtual int32_t first(void); | 
|---|
| 384 |  | 
|---|
| 385 | /** | 
|---|
| 386 | * Sets the current iteration position to the end of the text. | 
|---|
| 387 | * @return The text's past-the-end offset. | 
|---|
| 388 | *  @stable ICU 2.0 | 
|---|
| 389 | */ | 
|---|
| 390 | virtual int32_t last(void); | 
|---|
| 391 |  | 
|---|
| 392 | /** | 
|---|
| 393 | * Advances the iterator either forward or backward the specified number of steps. | 
|---|
| 394 | * Negative values move backward, and positive values move forward.  This is | 
|---|
| 395 | * equivalent to repeatedly calling next() or previous(). | 
|---|
| 396 | * @param n The number of steps to move.  The sign indicates the direction | 
|---|
| 397 | * (negative is backwards, and positive is forwards). | 
|---|
| 398 | * @return The character offset of the boundary position n boundaries away from | 
|---|
| 399 | * the current one. | 
|---|
| 400 | *  @stable ICU 2.0 | 
|---|
| 401 | */ | 
|---|
| 402 | virtual int32_t next(int32_t n); | 
|---|
| 403 |  | 
|---|
| 404 | /** | 
|---|
| 405 | * Advances the iterator to the next boundary position. | 
|---|
| 406 | * @return The position of the first boundary after this one. | 
|---|
| 407 | *  @stable ICU 2.0 | 
|---|
| 408 | */ | 
|---|
| 409 | virtual int32_t next(void); | 
|---|
| 410 |  | 
|---|
| 411 | /** | 
|---|
| 412 | * Moves the iterator backwards, to the last boundary preceding this one. | 
|---|
| 413 | * @return The position of the last boundary position preceding this one. | 
|---|
| 414 | *  @stable ICU 2.0 | 
|---|
| 415 | */ | 
|---|
| 416 | virtual int32_t previous(void); | 
|---|
| 417 |  | 
|---|
| 418 | /** | 
|---|
| 419 | * Sets the iterator to refer to the first boundary position following | 
|---|
| 420 | * the specified position. | 
|---|
| 421 | * @param offset The position from which to begin searching for a break position. | 
|---|
| 422 | * @return The position of the first break after the specified offset. | 
|---|
| 423 | *  @stable ICU 2.0 | 
|---|
| 424 | */ | 
|---|
| 425 | virtual int32_t following(int32_t offset); | 
|---|
| 426 |  | 
|---|
| 427 | /** | 
|---|
| 428 | * Sets the iterator to refer to the last boundary position before the | 
|---|
| 429 | * specified position. | 
|---|
| 430 | * @param offset The position to begin searching for a break from. | 
|---|
| 431 | * @return The position of the last boundary before the starting position. | 
|---|
| 432 | *  @stable ICU 2.0 | 
|---|
| 433 | */ | 
|---|
| 434 | virtual int32_t preceding(int32_t offset); | 
|---|
| 435 |  | 
|---|
| 436 | /** | 
|---|
| 437 | * Returns true if the specified position is a boundary position.  As a side | 
|---|
| 438 | * effect, leaves the iterator pointing to the first boundary position at | 
|---|
| 439 | * or after "offset". | 
|---|
| 440 | * @param offset the offset to check. | 
|---|
| 441 | * @return True if "offset" is a boundary position. | 
|---|
| 442 | *  @stable ICU 2.0 | 
|---|
| 443 | */ | 
|---|
| 444 | virtual UBool isBoundary(int32_t offset); | 
|---|
| 445 |  | 
|---|
| 446 | /** | 
|---|
| 447 | * Returns the current iteration position. Note that UBRK_DONE is never | 
|---|
| 448 | * returned from this function; if iteration has run to the end of a | 
|---|
| 449 | * string, current() will return the length of the string while | 
|---|
| 450 | * next() will return UBRK_DONE. | 
|---|
| 451 | * @return The current iteration position. | 
|---|
| 452 | * @stable ICU 2.0 | 
|---|
| 453 | */ | 
|---|
| 454 | virtual int32_t current(void) const; | 
|---|
| 455 |  | 
|---|
| 456 |  | 
|---|
| 457 | /** | 
|---|
| 458 | * Return the status tag from the break rule that determined the boundary at | 
|---|
| 459 | * the current iteration position.  For break rules that do not specify a | 
|---|
| 460 | * status, a default value of 0 is returned.  If more than one break rule | 
|---|
| 461 | * would cause a boundary to be located at some position in the text, | 
|---|
| 462 | * the numerically largest of the applicable status values is returned. | 
|---|
| 463 | * <p> | 
|---|
| 464 | * Of the standard types of ICU break iterators, only word break and | 
|---|
| 465 | * line break provide status values.  The values are defined in | 
|---|
| 466 | * the header file ubrk.h.  For Word breaks, the status allows distinguishing between words | 
|---|
| 467 | * that contain alphabetic letters, "words" that appear to be numbers, | 
|---|
| 468 | * punctuation and spaces, words containing ideographic characters, and | 
|---|
| 469 | * more.  For Line Break, the status distinguishes between hard (mandatory) breaks | 
|---|
| 470 | * and soft (potential) break positions. | 
|---|
| 471 | * <p> | 
|---|
| 472 | * <code>getRuleStatus()</code> can be called after obtaining a boundary | 
|---|
| 473 | * position from <code>next()</code>, <code>previous()</code>, or | 
|---|
| 474 | * any other break iterator function that returns a boundary position. | 
|---|
| 475 | * <p> | 
|---|
| 476 | * Note that <code>getRuleStatus()</code> returns the value corresponding to | 
|---|
| 477 | * <code>current()</code> index even after <code>next()</code> has returned DONE. | 
|---|
| 478 | * <p> | 
|---|
| 479 | * When creating custom break rules, one is free to define whatever | 
|---|
| 480 | * status values may be convenient for the application. | 
|---|
| 481 | * <p> | 
|---|
| 482 | * @return the status from the break rule that determined the boundary | 
|---|
| 483 | * at the current iteration position. | 
|---|
| 484 | * | 
|---|
| 485 | * @see UWordBreak | 
|---|
| 486 | * @stable ICU 2.2 | 
|---|
| 487 | */ | 
|---|
| 488 | virtual int32_t getRuleStatus() const; | 
|---|
| 489 |  | 
|---|
| 490 | /** | 
|---|
| 491 | * Get the status (tag) values from the break rule(s) that determined the boundary | 
|---|
| 492 | * at the current iteration position. | 
|---|
| 493 | * <p> | 
|---|
| 494 | * The returned status value(s) are stored into an array provided by the caller. | 
|---|
| 495 | * The values are stored in sorted (ascending) order. | 
|---|
| 496 | * If the capacity of the output array is insufficient to hold the data, | 
|---|
| 497 | *  the output will be truncated to the available length, and a | 
|---|
| 498 | *  U_BUFFER_OVERFLOW_ERROR will be signaled. | 
|---|
| 499 | * | 
|---|
| 500 | * @param fillInVec an array to be filled in with the status values. | 
|---|
| 501 | * @param capacity  the length of the supplied vector.  A length of zero causes | 
|---|
| 502 | *                  the function to return the number of status values, in the | 
|---|
| 503 | *                  normal way, without attempting to store any values. | 
|---|
| 504 | * @param status    receives error codes. | 
|---|
| 505 | * @return          The number of rule status values from the rules that determined | 
|---|
| 506 | *                  the boundary at the current iteration position. | 
|---|
| 507 | *                  In the event of a U_BUFFER_OVERFLOW_ERROR, the return value | 
|---|
| 508 | *                  is the total number of status values that were available, | 
|---|
| 509 | *                  not the reduced number that were actually returned. | 
|---|
| 510 | * @see getRuleStatus | 
|---|
| 511 | * @stable ICU 3.0 | 
|---|
| 512 | */ | 
|---|
| 513 | virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status); | 
|---|
| 514 |  | 
|---|
| 515 | /** | 
|---|
| 516 | * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override. | 
|---|
| 517 | * This method is to implement a simple version of RTTI, since not all | 
|---|
| 518 | * C++ compilers support genuine RTTI.  Polymorphic operator==() and | 
|---|
| 519 | * clone() methods call this method. | 
|---|
| 520 | * | 
|---|
| 521 | * @return          The class ID for this object. All objects of a | 
|---|
| 522 | *                  given class have the same class ID.  Objects of | 
|---|
| 523 | *                  other classes have different class IDs. | 
|---|
| 524 | * @stable ICU 2.0 | 
|---|
| 525 | */ | 
|---|
| 526 | virtual UClassID getDynamicClassID(void) const; | 
|---|
| 527 |  | 
|---|
| 528 | /** | 
|---|
| 529 | * Returns the class ID for this class.  This is useful only for | 
|---|
| 530 | * comparing to a return value from getDynamicClassID().  For example: | 
|---|
| 531 | * | 
|---|
| 532 | *      Base* polymorphic_pointer = createPolymorphicObject(); | 
|---|
| 533 | *      if (polymorphic_pointer->getDynamicClassID() == | 
|---|
| 534 | *          Derived::getStaticClassID()) ... | 
|---|
| 535 | * | 
|---|
| 536 | * @return          The class ID for all objects of this class. | 
|---|
| 537 | * @stable ICU 2.0 | 
|---|
| 538 | */ | 
|---|
| 539 | static UClassID U_EXPORT2 getStaticClassID(void); | 
|---|
| 540 |  | 
|---|
| 541 | #ifndef U_FORCE_HIDE_DEPRECATED_API | 
|---|
| 542 | /** | 
|---|
| 543 | * Deprecated functionality. Use clone() instead. | 
|---|
| 544 | * | 
|---|
| 545 | * Create a clone (copy) of this break iterator in memory provided | 
|---|
| 546 | *  by the caller.  The idea is to increase performance by avoiding | 
|---|
| 547 | *  a storage allocation.  Use of this function is NOT RECOMMENDED. | 
|---|
| 548 | *  Performance gains are minimal, and correct buffer management is | 
|---|
| 549 | *  tricky.  Use clone() instead. | 
|---|
| 550 | * | 
|---|
| 551 | * @param stackBuffer  The pointer to the memory into which the cloned object | 
|---|
| 552 | *                     should be placed.  If NULL,  allocate heap memory | 
|---|
| 553 | *                     for the cloned object. | 
|---|
| 554 | * @param BufferSize   The size of the buffer.  If zero, return the required | 
|---|
| 555 | *                     buffer size, but do not clone the object.  If the | 
|---|
| 556 | *                     size was too small (but not zero), allocate heap | 
|---|
| 557 | *                     storage for the cloned object. | 
|---|
| 558 | * | 
|---|
| 559 | * @param status       Error status.  U_SAFECLONE_ALLOCATED_WARNING will be | 
|---|
| 560 | *                     returned if the provided buffer was too small, and | 
|---|
| 561 | *                     the clone was therefore put on the heap. | 
|---|
| 562 | * | 
|---|
| 563 | * @return  Pointer to the clone object.  This may differ from the stackBuffer | 
|---|
| 564 | *          address if the byte alignment of the stack buffer was not suitable | 
|---|
| 565 | *          or if the stackBuffer was too small to hold the clone. | 
|---|
| 566 | * @deprecated ICU 52. Use clone() instead. | 
|---|
| 567 | */ | 
|---|
| 568 | virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer, | 
|---|
| 569 | int32_t &BufferSize, | 
|---|
| 570 | UErrorCode &status); | 
|---|
| 571 | #endif  // U_FORCE_HIDE_DEPRECATED_API | 
|---|
| 572 |  | 
|---|
| 573 | /** | 
|---|
| 574 | * Return the binary form of compiled break rules, | 
|---|
| 575 | * which can then be used to create a new break iterator at some | 
|---|
| 576 | * time in the future.  Creating a break iterator from pre-compiled rules | 
|---|
| 577 | * is much faster than building one from the source form of the | 
|---|
| 578 | * break rules. | 
|---|
| 579 | * | 
|---|
| 580 | * The binary data can only be used with the same version of ICU | 
|---|
| 581 | *  and on the same platform type (processor endian-ness) | 
|---|
| 582 | * | 
|---|
| 583 | * @param length Returns the length of the binary data.  (Out parameter.) | 
|---|
| 584 | * | 
|---|
| 585 | * @return   A pointer to the binary (compiled) rule data.  The storage | 
|---|
| 586 | *           belongs to the RuleBasedBreakIterator object, not the | 
|---|
| 587 | *           caller, and must not be modified or deleted. | 
|---|
| 588 | * @stable ICU 4.8 | 
|---|
| 589 | */ | 
|---|
| 590 | virtual const uint8_t *getBinaryRules(uint32_t &length); | 
|---|
| 591 |  | 
|---|
| 592 | /** | 
|---|
| 593 | *  Sets the subject text string upon which the break iterator is operating | 
|---|
| 594 | *  without changing any other aspect of the matching state. | 
|---|
| 595 | *  The new and previous text strings must have the same content. | 
|---|
| 596 | * | 
|---|
| 597 | *  This function is intended for use in environments where ICU is operating on | 
|---|
| 598 | *  strings that may move around in memory.  It provides a mechanism for notifying | 
|---|
| 599 | *  ICU that the string has been relocated, and for providing a new UText to access | 
|---|
| 600 | *  the string in its new position. | 
|---|
| 601 | * | 
|---|
| 602 | *  Note that the break iterator implementation never copies the underlying text | 
|---|
| 603 | *  of a string being processed, but always operates directly on the original text | 
|---|
| 604 | *  provided by the user. Refreshing simply drops the references to the old text | 
|---|
| 605 | *  and replaces them with references to the new text. | 
|---|
| 606 | * | 
|---|
| 607 | *  Caution:  this function is normally used only by very specialized, | 
|---|
| 608 | *  system-level code.  One example use case is with garbage collection that moves | 
|---|
| 609 | *  the text in memory. | 
|---|
| 610 | * | 
|---|
| 611 | * @param input      A UText providing access to the new (moved) text string. | 
|---|
| 612 | * @param status     Receives errors detected by this function. | 
|---|
| 613 | * @return           A reference to this break iterator (*this). | 
|---|
| 614 | * | 
|---|
| 615 | * @stable ICU 49 | 
|---|
| 616 | */ | 
|---|
| 617 | virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status); | 
|---|
| 618 |  | 
|---|
| 619 |  | 
|---|
| 620 | private: | 
|---|
| 621 | //======================================================================= | 
|---|
| 622 | // implementation | 
|---|
| 623 | //======================================================================= | 
|---|
| 624 | /** | 
|---|
| 625 | * Dumps the caches and performs the other actions associated with a complete | 
|---|
| 626 | * change in the text or in the iteration position.  NOTE(review): "dumps" presumably means "discards" here, unlike the debugging dumpCache(); confirm. | 
|---|
| 627 | * @internal (private) | 
|---|
| 628 | */ | 
|---|
| 629 | void reset(void); | 
|---|
| 630 |  | 
|---|
| 631 | /** | 
|---|
| 632 | * Common initialization function, shared by the constructors and bufferClone.  The UErrorCode is taken by reference; presumably it reports initialization failures (TODO confirm). | 
|---|
| 633 | * @internal (private) | 
|---|
| 634 | */ | 
|---|
| 635 | void init(UErrorCode &status); | 
|---|
| 636 |  | 
|---|
| 637 | /** | 
|---|
| 638 | * Iterates backwards from an arbitrary position in the input text using the | 
|---|
| 639 | * synthesized Safe Reverse rules. | 
|---|
| 640 | * This locates a "Safe Position" from which the forward break rules | 
|---|
| 641 | * will operate correctly. A Safe Position is not necessarily a boundary itself.  NOTE(review): the int32_t result is presumably that Safe Position; confirm. | 
|---|
| 642 | * | 
|---|
| 643 | * @param fromPosition The position in the input text at which to begin the iteration. | 
|---|
| 644 | * @internal (private) | 
|---|
| 645 | */ | 
|---|
| 646 | int32_t handleSafePrevious(int32_t fromPosition); | 
|---|
| 647 |  | 
|---|
| 648 | /** | 
|---|
| 649 | * Finds a rule-based boundary by running the state machine. | 
|---|
| 650 | * Input: | 
|---|
| 651 | *    fPosition:            the position in the text to begin from. | 
|---|
| 652 | * Output: | 
|---|
| 653 | *    fPosition:            the boundary following the starting position. | 
|---|
| 654 | *    fDictionaryCharCount: the number of dictionary characters encountered. | 
|---|
| 655 | *                          If > 0, the segment will be further subdivided. | 
|---|
| 656 | *    fRuleStatusIndex:     info from the state table indicating which rules caused the boundary. | 
|---|
| 657 | * | 
|---|
| 658 | * @internal (private) | 
|---|
| 659 | */ | 
|---|
| 660 | int32_t handleNext(); | 
|---|
| 661 |  | 
|---|
| 662 |  | 
|---|
| 663 | /** | 
|---|
| 664 | * Returns the appropriate LanguageBreakEngine for the | 
|---|
| 665 | * given character c. | 
|---|
| 666 | * @param c         A character in the dictionary set. | 
|---|
| 667 | * @internal (private) | 
|---|
| 668 | */ | 
|---|
| 669 | const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c); | 
|---|
| 670 |  | 
|---|
| 671 | public: | 
|---|
| 672 | #ifndef U_HIDE_INTERNAL_API | 
|---|
| 673 | /** | 
|---|
| 674 | * Debugging function only.  NOTE(review): presumably prints the contents of the break cache; confirm against the implementation. | 
|---|
| 675 | * @internal | 
|---|
| 676 | */ | 
|---|
| 677 | void dumpCache(); | 
|---|
| 678 |  | 
|---|
| 679 | /** | 
|---|
| 680 | * Debugging function only.  NOTE(review): presumably prints the compiled rule/state tables; confirm against the implementation. | 
|---|
| 681 | * @internal | 
|---|
| 682 | */ | 
|---|
| 683 | void dumpTables(); | 
|---|
| 684 |  | 
|---|
| 685 | #endif  /* U_HIDE_INTERNAL_API */ | 
|---|
| 686 | }; | 
|---|
| 687 |  | 
|---|
| 688 | //------------------------------------------------------------------------------ | 
|---|
| 689 | // | 
|---|
| 690 | //   Inline Function Definitions ... | 
|---|
| 691 | // | 
|---|
| 692 | //------------------------------------------------------------------------------ | 
|---|
| 693 |  | 
|---|
| 694 | inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const { | 
|---|
| 695 | return !(this->operator==(that));  // inequality is defined as the negation of operator== | 
|---|
| 696 | } | 
|---|
| 697 |  | 
|---|
| 698 | U_NAMESPACE_END | 
|---|
| 699 |  | 
|---|
| 700 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 
|---|
| 701 |  | 
|---|
| 702 | #endif /* U_SHOW_CPLUSPLUS_API */ | 
|---|
| 703 |  | 
|---|
| 704 | #endif | 
|---|
| 705 |  | 
|---|