| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | *************************************************************************** |
| 5 | * Copyright (C) 1999-2016 International Business Machines Corporation * |
| 6 | * and others. All rights reserved. * |
| 7 | *************************************************************************** |
| 8 | |
| 9 | ********************************************************************** |
| 10 | * Date Name Description |
| 11 | * 10/22/99 alan Creation. |
| 12 | * 11/11/99 rgillam Complete port from Java. |
| 13 | ********************************************************************** |
| 14 | */ |
| 15 | |
| 16 | #ifndef RBBI_H |
| 17 | #define RBBI_H |
| 18 | |
| 19 | #include "unicode/utypes.h" |
| 20 | |
| 21 | #if U_SHOW_CPLUSPLUS_API |
| 22 | |
| 23 | /** |
| 24 | * \file |
| 25 | * \brief C++ API: Rule Based Break Iterator |
| 26 | */ |
| 27 | |
| 28 | #if !UCONFIG_NO_BREAK_ITERATION |
| 29 | |
| 30 | #include "unicode/brkiter.h" |
| 31 | #include "unicode/udata.h" |
| 32 | #include "unicode/parseerr.h" |
| 33 | #include "unicode/schriter.h" |
| 34 | |
| 35 | U_NAMESPACE_BEGIN |
| 36 | |
| 37 | /** @internal */ |
| 38 | class LanguageBreakEngine; // returned (by pointer) from getLanguageBreakEngine() |
| 39 | struct RBBIDataHeader; // flattened rule data taken by the private adopting constructor |
| 40 | class RBBIDataWrapper; // pointee type of fData |
| 41 | class UnhandledEngine; // pointee type of fUnhandledBreakEngine |
| 42 | class UStack; // pointee type of fLanguageBreakEngines |
| 43 | |
| 44 | /** |
| 45 | * |
| 46 | * A subclass of BreakIterator whose behavior is specified using a list of rules. |
| 47 | * <p>Instances of this class are most commonly created by the factory methods of |
| 48 | * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc., |
| 49 | * and then used via the abstract API in class BreakIterator</p> |
| 50 | * |
| 51 | * <p>See the ICU User Guide for information on Break Iterator Rules.</p> |
| 52 | * |
| 53 | * <p>This class is not intended to be subclassed.</p> |
| 54 | */ |
| 55 | class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator { |
| 56 | |
| 57 | private: |
| 58 | /** |
| 59 | * The UText through which this BreakIterator accesses the text |
| 60 | * @internal (private) |
| 61 | */ |
| 62 | UText fText; |
| 63 | |
| 64 | #ifndef U_HIDE_INTERNAL_API |
| 65 | public: |
| 66 | #endif /* U_HIDE_INTERNAL_API */ |
| 67 | /** |
| 68 | * The rule data for this BreakIterator instance. |
| 69 | * Not for general use; Public only for testing purposes. |
| 70 | * @internal |
| 71 | */ |
| 72 | RBBIDataWrapper *fData; |
| 73 | private: |
| 74 | |
| 75 | /** |
| 76 | * The current position of the iterator. Pinned, 0 <= fPosition <= text.length. |
| 77 | * Never has the value UBRK_DONE (-1). |
| 78 | */ |
| 79 | int32_t fPosition; |
| 80 | |
| 81 | /** |
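| 82 | * Info from the state table indicating which rules caused the current boundary; see handleNext(). TODO: expand. |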
| 83 | */ |
| 84 | int32_t fRuleStatusIndex; |
| 85 | |
| 86 | /** |
| 87 | * Cache of previously determined boundary positions. |
| 88 | */ |
| 89 | class BreakCache; |
| 90 | BreakCache *fBreakCache; |
| 91 | |
| 92 | /** |
| 93 | * Cache of boundary positions within a region of text that has been |
| 94 | * sub-divided by dictionary based breaking. |
| 95 | */ |
| 96 | class DictionaryCache; |
| 97 | DictionaryCache *fDictionaryCache; |
| 98 | |
| 99 | /** |
| 100 | * |
| 101 | * If present, UStack of LanguageBreakEngine objects that might handle |
| 102 | * dictionary characters. Searched from top to bottom to find an object to |
| 103 | * handle a given character. |
| 104 | * @internal (private) |
| 105 | */ |
| 106 | UStack *fLanguageBreakEngines; |
| 107 | |
| 108 | /** |
| 109 | * |
| 110 | * If present, the special LanguageBreakEngine used for handling |
| 111 | * characters that are in the dictionary set, but not handled by any |
| 112 | * LanguageBreakEngine. |
| 113 | * @internal (private) |
| 114 | */ |
| 115 | UnhandledEngine *fUnhandledBreakEngine; |
| 116 | |
| 117 | /** |
| 118 | * Counter for the number of characters encountered with the "dictionary" |
| 119 | * flag set. |
| 120 | * @internal (private) |
| 121 | */ |
| 122 | uint32_t fDictionaryCharCount; |
| 123 | |
| 124 | /** |
| 125 | * A character iterator that refers to the same text as the UText, above. |
| 126 | * Only included for compatibility with old API, which was based on CharacterIterators. |
| 127 | * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below. |
| 128 | */ |
| 129 | CharacterIterator *fCharIter; |
| 130 | |
| 131 | /** |
| 132 | * When the input text is provided by a UnicodeString, this will point to |
| 133 | * a characterIterator that wraps that data. Needed only for the |
| 134 | * implementation of getText(), a backwards compatibility issue. |
| 135 | */ |
| 136 | StringCharacterIterator fSCharIter; |
| 137 | |
| 138 | /** |
| 139 | * True when iteration has run off the end, and iterator functions should return UBRK_DONE. |
| 140 | */ |
| 141 | UBool fDone; |
| 142 | |
| 143 | //======================================================================= |
| 144 | // constructors |
| 145 | //======================================================================= |
| 146 | |
| 147 | /** |
| 148 | * Constructor from a flattened set of RBBI data in malloced memory. |
| 149 | * RuleBasedBreakIterators built from a custom set of rules |
| 150 | * are created via this constructor; the rules are compiled |
| 151 | * into memory, then the break iterator is constructed here. |
| 152 | * |
| 153 | * The break iterator adopts the memory, and will free it when done. |
| 154 | * @param data The flattened RBBI data to adopt. @param status Information on any errors encountered. |
| 155 | * @internal (private) |
| 156 | */ |
| 157 | RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); |
| 158 | |
| 159 | /** @internal */ |
| 160 | friend class RBBIRuleBuilder; |
| 161 | /** @internal */ |
| 162 | friend class BreakIterator; |
| 163 | |
| 164 | public: |
| 165 | |
| 166 | /** Default constructor. Creates an empty shell of an iterator, with no |
| 167 | * rules or text to iterate over. Object can subsequently be assigned to. |
| 168 | * @stable ICU 2.2 |
| 169 | */ |
| 170 | RuleBasedBreakIterator(); |
| 171 | |
| 172 | /** |
| 173 | * Copy constructor. Will produce a break iterator with the same behavior, |
| 174 | * and which iterates over the same text, as the one passed in. |
| 175 | * @param that The RuleBasedBreakIterator passed to be copied |
| 176 | * @stable ICU 2.0 |
| 177 | */ |
| 178 | RuleBasedBreakIterator(const RuleBasedBreakIterator& that); |
| 179 | |
| 180 | /** |
| 181 | * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. |
| 182 | * @param rules The break rules to be used. |
| 183 | * @param parseError In the event of a syntax error in the rules, provides the location |
| 184 | * within the rules of the problem. |
| 185 | * @param status Information on any errors encountered. |
| 186 | * @stable ICU 2.2 |
| 187 | */ |
| 188 | RuleBasedBreakIterator( const UnicodeString &rules, |
| 189 | UParseError &parseError, |
| 190 | UErrorCode &status); |
| 191 | |
| 192 | /** |
| 193 | * Construct a RuleBasedBreakIterator from a set of precompiled binary rules. |
| 194 | * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules(). |
| 195 | * Construction of a break iterator in this way is substantially faster than |
| 196 | * construction from source rules. |
| 197 | * |
| 198 | * Ownership of the storage containing the compiled rules remains with the |
| 199 | * caller of this function. The compiled rules must not be modified or |
| 200 | * deleted during the life of the break iterator. |
| 201 | * |
| 202 | * The compiled rules are not compatible across different major versions of ICU. |
| 203 | * The compiled rules are compatible only between machines with the same |
| 204 | * byte ordering (little or big endian) and the same base character set family |
| 205 | * (ASCII or EBCDIC). |
| 206 | * |
| 207 | * @see #getBinaryRules |
| 208 | * @param compiledRules A pointer to the compiled break rules to be used. |
| 209 | * @param ruleLength The length of the compiled break rules, in bytes. This |
| 210 | * corresponds to the length value produced by getBinaryRules(). |
| 211 | * @param status Information on any errors encountered, including invalid |
| 212 | * binary rules. |
| 213 | * @stable ICU 4.8 |
| 214 | */ |
| 215 | RuleBasedBreakIterator(const uint8_t *compiledRules, |
| 216 | uint32_t ruleLength, |
| 217 | UErrorCode &status); |
| 218 | |
| 219 | /** |
| 220 | * This constructor uses the udata interface to create a BreakIterator |
| 221 | * whose internal tables live in a memory-mapped file. "image" is an |
| 222 | * ICU UDataMemory handle for the pre-compiled break iterator tables. |
| 223 | * @param image handle to the memory image for the break iterator data. |
| 224 | * Ownership of the UDataMemory handle passes to the Break Iterator, |
| 225 | * which will be responsible for closing it when it is no longer needed. |
| 226 | * @param status Information on any errors encountered. |
| 227 | * @see udata_open |
| 228 | * @see #getBinaryRules |
| 229 | * @stable ICU 2.8 |
| 230 | */ |
| 231 | RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status); |
| 232 | |
| 233 | /** |
| 234 | * Destructor |
| 235 | * @stable ICU 2.0 |
| 236 | */ |
| 237 | virtual ~RuleBasedBreakIterator(); |
| 238 | |
| 239 | /** |
| 240 | * Assignment operator. Sets this iterator to have the same behavior, |
| 241 | * and iterate over the same text, as the one passed in. |
| 242 | * @param that The RuleBasedBreakIterator passed in |
| 243 | * @return the newly created RuleBasedBreakIterator |
| 244 | * @stable ICU 2.0 |
| 245 | */ |
| 246 | RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); |
| 247 | |
| 248 | /** |
| 249 | * Equality operator. Returns TRUE if both BreakIterators are of the |
| 250 | * same class, have the same behavior, and iterate over the same text. |
| 251 | * @param that The BreakIterator to be compared for equality |
| 252 | * @return TRUE if both BreakIterators are of the |
| 253 | * same class, have the same behavior, and iterate over the same text. |
| 254 | * @stable ICU 2.0 |
| 255 | */ |
| 256 | virtual UBool operator==(const BreakIterator& that) const; |
| 257 | |
| 258 | /** |
| 259 | * Not-equal operator. If operator== returns TRUE, this returns FALSE, |
| 260 | * and vice versa. |
| 261 | * @param that The BreakIterator to be compared for inequality |
| 262 | * @return TRUE if both BreakIterators are not the same. |
| 263 | * @stable ICU 2.0 |
| 264 | */ |
| 265 | inline UBool operator!=(const BreakIterator& that) const; |
| 266 | |
| 267 | /** |
| 268 | * Returns a newly-constructed RuleBasedBreakIterator with the same |
| 269 | * behavior, and iterating over the same text, as this one. |
| 270 | * Differs from the copy constructor in that it is polymorphic, and |
| 271 | * will correctly clone (copy) a derived class. |
| 272 | * clone() is thread safe. Multiple threads may simultaneously |
| 273 | * clone the same source break iterator. |
| 274 | * @return a newly-constructed RuleBasedBreakIterator |
| 275 | * @stable ICU 2.0 |
| 276 | */ |
| 277 | virtual RuleBasedBreakIterator* clone() const; |
| 278 | |
| 279 | /** |
| 280 | * Compute a hash code for this BreakIterator |
| 281 | * @return A hash code |
| 282 | * @stable ICU 2.0 |
| 283 | */ |
| 284 | virtual int32_t hashCode(void) const; |
| 285 | |
| 286 | /** |
| 287 | * Returns the description used to create this iterator |
| 288 | * @return the description used to create this iterator |
| 289 | * @stable ICU 2.0 |
| 290 | */ |
| 291 | virtual const UnicodeString& getRules(void) const; |
| 292 | |
| 293 | //======================================================================= |
| 294 | // BreakIterator overrides |
| 295 | //======================================================================= |
| 296 | |
| 297 | /** |
| 298 | * <p> |
| 299 | * Return a CharacterIterator over the text being analyzed. |
| 300 | * The returned character iterator is owned by the break iterator, and must |
| 301 | * not be deleted by the caller. Repeated calls to this function may |
| 302 | * return the same CharacterIterator. |
| 303 | * </p> |
| 304 | * <p> |
| 305 | * The returned character iterator must not be used concurrently with |
| 306 | * the break iterator. If concurrent operation is needed, clone the |
| 307 | * returned character iterator first and operate on the clone. |
| 308 | * </p> |
| 309 | * <p> |
| 310 | * When the break iterator is operating on text supplied via a UText, |
| 311 | * this function will fail. Lacking any way to signal failures, it |
| 312 | * returns a CharacterIterator containing no text. |
| 313 | * The function getUText() provides similar functionality, |
| 314 | * is reliable, and is more efficient. |
| 315 | * </p> |
| 316 | * |
| 317 | * TODO: deprecate this function? |
| 318 | * |
| 319 | * @return An iterator over the text being analyzed. |
| 320 | * @stable ICU 2.0 |
| 321 | */ |
| 322 | virtual CharacterIterator& getText(void) const; |
| 323 | |
| 324 | |
| 325 | /** |
| 326 | * Get a UText for the text being analyzed. |
| 327 | * The returned UText is a shallow clone of the UText used internally |
| 328 | * by the break iterator implementation. It can safely be used to |
| 329 | * access the text without impacting any break iterator operations, |
| 330 | * but the underlying text itself must not be altered. |
| 331 | * |
| 332 | * @param fillIn A UText to be filled in. If NULL, a new UText will be |
| 333 | * allocated to hold the result. |
| 334 | * @param status receives any error codes. |
| 335 | * @return The current UText for this break iterator. If an input |
| 336 | * UText was provided, it will always be returned. |
| 337 | * @stable ICU 3.4 |
| 338 | */ |
| 339 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const; |
| 340 | |
| 341 | /** |
| 342 | * Set the iterator to analyze a new piece of text. This function resets |
| 343 | * the current iteration position to the beginning of the text. |
| 344 | * @param newText An iterator over the text to analyze. The BreakIterator |
| 345 | * takes ownership of the character iterator. The caller MUST NOT delete it! |
| 346 | * @stable ICU 2.0 |
| 347 | */ |
| 348 | virtual void adoptText(CharacterIterator* newText); |
| 349 | |
| 350 | /** |
| 351 | * Set the iterator to analyze a new piece of text. This function resets |
| 352 | * the current iteration position to the beginning of the text. |
| 353 | * |
| 354 | * The BreakIterator will retain a reference to the supplied string. |
| 355 | * The caller must not modify or delete the text while the BreakIterator |
| 356 | * retains the reference. |
| 357 | * |
| 358 | * @param newText The text to analyze. |
| 359 | * @stable ICU 2.0 |
| 360 | */ |
| 361 | virtual void setText(const UnicodeString& newText); |
| 362 | |
| 363 | /** |
| 364 | * Reset the break iterator to operate over the text represented by |
| 365 | * the UText. The iterator position is reset to the start. |
| 366 | * |
| 367 | * This function makes a shallow clone of the supplied UText. This means |
| 368 | * that the caller is free to immediately close or otherwise reuse the |
| 369 | * UText that was passed as a parameter, but that the underlying text itself |
| 370 | * must not be altered while being referenced by the break iterator. |
| 371 | * |
| 372 | * @param text The UText used to change the text. |
| 373 | * @param status Receives any error codes. |
| 374 | * @stable ICU 3.4 |
| 375 | */ |
| 376 | virtual void setText(UText *text, UErrorCode &status); |
| 377 | |
| 378 | /** |
| 379 | * Sets the current iteration position to the beginning of the text, position zero. |
| 380 | * @return The offset of the beginning of the text, zero. |
| 381 | * @stable ICU 2.0 |
| 382 | */ |
| 383 | virtual int32_t first(void); |
| 384 | |
| 385 | /** |
| 386 | * Sets the current iteration position to the end of the text. |
| 387 | * @return The text's past-the-end offset. |
| 388 | * @stable ICU 2.0 |
| 389 | */ |
| 390 | virtual int32_t last(void); |
| 391 | |
| 392 | /** |
| 393 | * Advances the iterator either forward or backward the specified number of steps. |
| 394 | * Negative values move backward, and positive values move forward. This is |
| 395 | * equivalent to repeatedly calling next() or previous(). |
| 396 | * @param n The number of steps to move. The sign indicates the direction |
| 397 | * (negative is backwards, and positive is forwards). |
| 398 | * @return The character offset of the boundary position n boundaries away from |
| 399 | * the current one. |
| 400 | * @stable ICU 2.0 |
| 401 | */ |
| 402 | virtual int32_t next(int32_t n); |
| 403 | |
| 404 | /** |
| 405 | * Advances the iterator to the next boundary position. |
| 406 | * @return The position of the first boundary after this one. |
| 407 | * @stable ICU 2.0 |
| 408 | */ |
| 409 | virtual int32_t next(void); |
| 410 | |
| 411 | /** |
| 412 | * Moves the iterator backwards, to the last boundary preceding this one. |
| 413 | * @return The position of the last boundary position preceding this one. |
| 414 | * @stable ICU 2.0 |
| 415 | */ |
| 416 | virtual int32_t previous(void); |
| 417 | |
| 418 | /** |
| 419 | * Sets the iterator to refer to the first boundary position following |
| 420 | * the specified position. |
| 421 | * @param offset The position from which to begin searching for a break position. |
| 422 | * @return The position of the first break after the current position. |
| 423 | * @stable ICU 2.0 |
| 424 | */ |
| 425 | virtual int32_t following(int32_t offset); |
| 426 | |
| 427 | /** |
| 428 | * Sets the iterator to refer to the last boundary position before the |
| 429 | * specified position. |
| 430 | * @param offset The position to begin searching for a break from. |
| 431 | * @return The position of the last boundary before the starting position. |
| 432 | * @stable ICU 2.0 |
| 433 | */ |
| 434 | virtual int32_t preceding(int32_t offset); |
| 435 | |
| 436 | /** |
| 437 | * Returns true if the specified position is a boundary position. As a side |
| 438 | * effect, leaves the iterator pointing to the first boundary position at |
| 439 | * or after "offset". |
| 440 | * @param offset the offset to check. |
| 441 | * @return True if "offset" is a boundary position. |
| 442 | * @stable ICU 2.0 |
| 443 | */ |
| 444 | virtual UBool isBoundary(int32_t offset); |
| 445 | |
| 446 | /** |
| 447 | * Returns the current iteration position. Note that UBRK_DONE is never |
| 448 | * returned from this function; if iteration has run to the end of a |
| 449 | * string, current() will return the length of the string while |
| 450 | * next() will return UBRK_DONE. |
| 451 | * @return The current iteration position. |
| 452 | * @stable ICU 2.0 |
| 453 | */ |
| 454 | virtual int32_t current(void) const; |
| 455 | |
| 456 | |
| 457 | /** |
| 458 | * Return the status tag from the break rule that determined the boundary at |
| 459 | * the current iteration position. For break rules that do not specify a |
| 460 | * status, a default value of 0 is returned. If more than one break rule |
| 461 | * would cause a boundary to be located at some position in the text, |
| 462 | * the numerically largest of the applicable status values is returned. |
| 463 | * <p> |
| 464 | * Of the standard types of ICU break iterators, only word break and |
| 465 | * line break provide status values. The values are defined in |
| 466 | * the header file ubrk.h. For Word breaks, the status allows distinguishing between words |
| 467 | * that contain alphabetic letters, "words" that appear to be numbers, |
| 468 | * punctuation and spaces, words containing ideographic characters, and |
| 469 | * more. For Line Break, the status distinguishes between hard (mandatory) breaks |
| 470 | * and soft (potential) break positions. |
| 471 | * <p> |
| 472 | * <code>getRuleStatus()</code> can be called after obtaining a boundary |
| 473 | * position from <code>next()</code>, <code>previous()</code>, or |
| 474 | * any other break iterator function that returns a boundary position. |
| 475 | * <p> |
| 476 | * Note that <code>getRuleStatus()</code> returns the value corresponding to |
| 477 | * <code>current()</code> index even after <code>next()</code> has returned DONE. |
| 478 | * <p> |
| 479 | * When creating custom break rules, one is free to define whatever |
| 480 | * status values may be convenient for the application. |
| 481 | * <p> |
| 482 | * @return the status from the break rule that determined the boundary |
| 483 | * at the current iteration position. |
| 484 | * |
| 485 | * @see UWordBreak |
| 486 | * @stable ICU 2.2 |
| 487 | */ |
| 488 | virtual int32_t getRuleStatus() const; |
| 489 | |
| 490 | /** |
| 491 | * Get the status (tag) values from the break rule(s) that determined the boundary |
| 492 | * at the current iteration position. |
| 493 | * <p> |
| 494 | * The returned status value(s) are stored into an array provided by the caller. |
| 495 | * The values are stored in sorted (ascending) order. |
| 496 | * If the capacity of the output array is insufficient to hold the data, |
| 497 | * the output will be truncated to the available length, and a |
| 498 | * U_BUFFER_OVERFLOW_ERROR will be signaled. |
| 499 | * |
| 500 | * @param fillInVec an array to be filled in with the status values. |
| 501 | * @param capacity the length of the supplied vector. A length of zero causes |
| 502 | * the function to return the number of status values, in the |
| 503 | * normal way, without attempting to store any values. |
| 504 | * @param status receives error codes. |
| 505 | * @return The number of rule status values from the rules that determined |
| 506 | * the boundary at the current iteration position. |
| 507 | * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value |
| 508 | * is the total number of status values that were available, |
| 509 | * not the reduced number that were actually returned. |
| 510 | * @see getRuleStatus |
| 511 | * @stable ICU 3.0 |
| 512 | */ |
| 513 | virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status); |
| 514 | |
| 515 | /** |
| 516 | * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. |
| 517 | * This method is to implement a simple version of RTTI, since not all |
| 518 | * C++ compilers support genuine RTTI. Polymorphic operator==() and |
| 519 | * clone() methods call this method. |
| 520 | * |
| 521 | * @return The class ID for this object. All objects of a |
| 522 | * given class have the same class ID. Objects of |
| 523 | * other classes have different class IDs. |
| 524 | * @stable ICU 2.0 |
| 525 | */ |
| 526 | virtual UClassID getDynamicClassID(void) const; |
| 527 | |
| 528 | /** |
| 529 | * Returns the class ID for this class. This is useful only for |
| 530 | * comparing to a return value from getDynamicClassID(). For example: |
| 531 | * |
| 532 | * Base* polymorphic_pointer = createPolymorphicObject(); |
| 533 | * if (polymorphic_pointer->getDynamicClassID() == |
| 534 | * Derived::getStaticClassID()) ... |
| 535 | * |
| 536 | * @return The class ID for all objects of this class. |
| 537 | * @stable ICU 2.0 |
| 538 | */ |
| 539 | static UClassID U_EXPORT2 getStaticClassID(void); |
| 540 | |
| 541 | #ifndef U_FORCE_HIDE_DEPRECATED_API |
| 542 | /** |
| 543 | * Deprecated functionality. Use clone() instead. |
| 544 | * |
| 545 | * Create a clone (copy) of this break iterator in memory provided |
| 546 | * by the caller. The idea is to increase performance by avoiding |
| 547 | * a storage allocation. Use of this function is NOT RECOMMENDED. |
| 548 | * Performance gains are minimal, and correct buffer management is |
| 549 | * tricky. Use clone() instead. |
| 550 | * |
| 551 | * @param stackBuffer The pointer to the memory into which the cloned object |
| 552 | * should be placed. If NULL, allocate heap memory |
| 553 | * for the cloned object. |
| 554 | * @param BufferSize The size of the buffer. If zero, return the required |
| 555 | * buffer size, but do not clone the object. If the |
| 556 | * size was too small (but not zero), allocate heap |
| 557 | * storage for the cloned object. |
| 558 | * |
| 559 | * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be |
| 560 | * returned if the provided buffer was too small, and |
| 561 | * the clone was therefore put on the heap. |
| 562 | * |
| 563 | * @return Pointer to the clone object. This may differ from the stackBuffer |
| 564 | * address if the byte alignment of the stack buffer was not suitable |
| 565 | * or if the stackBuffer was too small to hold the clone. |
| 566 | * @deprecated ICU 52. Use clone() instead. |
| 567 | */ |
| 568 | virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer, |
| 569 | int32_t &BufferSize, |
| 570 | UErrorCode &status); |
| 571 | #endif // U_FORCE_HIDE_DEPRECATED_API |
| 572 | |
| 573 | /** |
| 574 | * Return the binary form of compiled break rules, |
| 575 | * which can then be used to create a new break iterator at some |
| 576 | * time in the future. Creating a break iterator from pre-compiled rules |
| 577 | * is much faster than building one from the source form of the |
| 578 | * break rules. |
| 579 | * |
| 580 | * The binary data can only be used with the same version of ICU |
| 581 | * and on the same platform type (processor endian-ness) |
| 582 | * |
| 583 | * @param length Returns the length of the binary data. (Out parameter.) |
| 584 | * |
| 585 | * @return A pointer to the binary (compiled) rule data. The storage |
| 586 | * belongs to the RuleBasedBreakIterator object, not the |
| 587 | * caller, and must not be modified or deleted. |
| 588 | * @stable ICU 4.8 |
| 589 | */ |
| 590 | virtual const uint8_t *getBinaryRules(uint32_t &length); |
| 591 | |
| 592 | /** |
| 593 | * Set the subject text string upon which the break iterator is operating |
| 594 | * without changing any other aspect of the matching state. |
| 595 | * The new and previous text strings must have the same content. |
| 596 | * |
| 597 | * This function is intended for use in environments where ICU is operating on |
| 598 | * strings that may move around in memory. It provides a mechanism for notifying |
| 599 | * ICU that the string has been relocated, and providing a new UText to access the |
| 600 | * string in its new position. |
| 601 | * |
| 602 | * Note that the break iterator implementation never copies the underlying text |
| 603 | * of a string being processed, but always operates directly on the original text |
| 604 | * provided by the user. Refreshing simply drops the references to the old text |
| 605 | * and replaces them with references to the new. |
| 606 | * |
| 607 | * Caution: this function is normally used only by very specialized, |
| 608 | * system-level code. One example use case is with garbage collection that moves |
| 609 | * the text in memory. |
| 610 | * |
| 611 | * @param input The new (moved) text string. |
| 612 | * @param status Receives errors detected by this function. |
| 613 | * @return *this |
| 614 | * |
| 615 | * @stable ICU 49 |
| 616 | */ |
| 617 | virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status); |
| 618 | |
| 619 | |
| 620 | private: |
| 621 | //======================================================================= |
| 622 | // implementation |
| 623 | //======================================================================= |
| 624 | /** |
| 625 | * Dumps caches and performs other actions associated with a complete change |
| 626 | * in text or iteration position. |
| 627 | * @internal (private) |
| 628 | */ |
| 629 | void reset(void); |
| 630 | |
| 631 | /** |
| 632 | * Common initialization function, used by constructors and createBufferClone. |
| 633 | * @internal (private) |
| 634 | */ |
| 635 | void init(UErrorCode &status); |
| 636 | |
| 637 | /** |
| 638 | * Iterate backwards from an arbitrary position in the input text using the |
| 639 | * synthesized Safe Reverse rules. |
| 640 | * This locates a "Safe Position" from which the forward break rules |
| 641 | * will operate correctly. A Safe Position is not necessarily a boundary itself. |
| 642 | * |
| 643 | * @param fromPosition the position in the input text to begin the iteration. |
| 644 | * @internal (private) |
| 645 | */ |
| 646 | int32_t handleSafePrevious(int32_t fromPosition); |
| 647 | |
| 648 | /** |
| 649 | * Find a rule-based boundary by running the state machine. |
| 650 | * Input |
| 651 | * fPosition, the position in the text to begin from. |
| 652 | * Output |
| 653 | * fPosition: the boundary following the starting position. |
| 654 | * fDictionaryCharCount the number of dictionary characters encountered. |
| 655 | * If > 0, the segment will be further subdivided |
| 656 | * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. |
| 657 | * |
| 658 | * @internal (private) |
| 659 | */ |
| 660 | int32_t handleNext(); |
| 661 | |
| 662 | |
| 663 | /** |
| 664 | * This function returns the appropriate LanguageBreakEngine for a |
| 665 | * given character c. |
| 666 | * @param c A character in the dictionary set |
| 667 | * @internal (private) |
| 668 | */ |
| 669 | const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c); |
| 670 | |
| 671 | public: |
| 672 | #ifndef U_HIDE_INTERNAL_API |
| 673 | /** |
| 674 | * Debugging function only. |
| 675 | * @internal |
| 676 | */ |
| 677 | void dumpCache(); |
| 678 | |
| 679 | /** |
| 680 | * Debugging function only. |
| 681 | * @internal |
| 682 | */ |
| 683 | void dumpTables(); |
| 684 | |
| 685 | #endif /* U_HIDE_INTERNAL_API */ |
| 686 | }; |
| 687 | |
| 688 | //------------------------------------------------------------------------------ |
| 689 | // |
| 690 | // Inline Functions Definitions ... |
| 691 | // |
| 692 | //------------------------------------------------------------------------------ |
| 693 | |
| 694 | inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const { // logical negation of the virtual operator== |
| 695 | return !(this->operator==(that)); |
| 696 | } |
| 697 | |
| 698 | U_NAMESPACE_END |
| 699 | |
| 700 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| 701 | |
| 702 | #endif /* U_SHOW_CPLUSPLUS_API */ |
| 703 | |
| 704 | #endif |
| 705 | |