1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016 International Business Machines Corporation *
6* and others. All rights reserved. *
7***************************************************************************
8
9**********************************************************************
10* Date Name Description
11* 10/22/99 alan Creation.
12* 11/11/99 rgillam Complete port from Java.
13**********************************************************************
14*/
15
16#ifndef RBBI_H
17#define RBBI_H
18
19#include "unicode/utypes.h"
20
21#if U_SHOW_CPLUSPLUS_API
22
23/**
24 * \file
25 * \brief C++ API: Rule Based Break Iterator
26 */
27
28#if !UCONFIG_NO_BREAK_ITERATION
29
30#include "unicode/brkiter.h"
31#include "unicode/udata.h"
32#include "unicode/parseerr.h"
33#include "unicode/schriter.h"
34
35U_NAMESPACE_BEGIN
36
37/** @internal Forward declarations only; RuleBasedBreakIterator below uses these types solely through pointers. */
38class LanguageBreakEngine;
39struct RBBIDataHeader;
40class RBBIDataWrapper;
41class UnhandledEngine;
42class UStack;
43
44/**
45 *
46 * A subclass of BreakIterator whose behavior is specified using a list of rules.
47 * <p>Instances of this class are most commonly created by the factory methods of
48 * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
49 * and then used via the abstract API in class BreakIterator</p>
50 *
51 * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
52 *
53 * <p>This class is not intended to be subclassed.</p>
54 */
55class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator {
56
57private:
58 /**
59 * The UText through which this BreakIterator accesses the text
60 * @internal (private)
61 */
62 UText fText;
63
64#ifndef U_HIDE_INTERNAL_API
65public:
66#endif /* U_HIDE_INTERNAL_API */
67 /**
68 * The rule data for this BreakIterator instance.
69 * Not for general use; Public only for testing purposes.
70 * @internal
71 */
72 RBBIDataWrapper *fData;
73private:
74
75 /**
76 * The current position of the iterator. Pinned, 0 <= fPosition <= text.length.
77 * Never has the value UBRK_DONE (-1).
78 */
79 int32_t fPosition;
80
81 /**
82 * Info from the state table indicating which rules caused the current boundary (an output of handleNext()).
83 */
84 int32_t fRuleStatusIndex;
85
86 /**
87 * Cache of previously determined boundary positions.
88 */
89 class BreakCache;
90 BreakCache *fBreakCache;
91
92 /**
93 * Cache of boundary positions within a region of text that has been
94 * sub-divided by dictionary based breaking.
95 */
96 class DictionaryCache;
97 DictionaryCache *fDictionaryCache;
98
99 /**
100 *
101 * If present, UStack of LanguageBreakEngine objects that might handle
102 * dictionary characters. Searched from top to bottom to find an object to
103 * handle a given character.
104 * @internal (private)
105 */
106 UStack *fLanguageBreakEngines;
107
108 /**
109 *
110 * If present, the special LanguageBreakEngine used for handling
111 * characters that are in the dictionary set, but not handled by any
112 * LanguageBreakEngine.
113 * @internal (private)
114 */
115 UnhandledEngine *fUnhandledBreakEngine;
116
117 /**
118 * Counter for the number of characters encountered with the "dictionary"
119 * flag set.
120 * @internal (private)
121 */
122 uint32_t fDictionaryCharCount;
123
124 /**
125 * A character iterator that refers to the same text as the UText, above.
126 * Only included for compatibility with old API, which was based on CharacterIterators.
127 * Value may be adopted from outside, or point to fSCharIter, below.
128 */
129 CharacterIterator *fCharIter;
130
131 /**
132 * When the input text is provided by a UnicodeString, this will point to
133 * a CharacterIterator that wraps that data. Needed only for the
134 * implementation of getText(), a backwards compatibility issue.
135 */
136 StringCharacterIterator fSCharIter;
137
138 /**
139 * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
140 */
141 UBool fDone;
142
143 //=======================================================================
144 // constructors
145 //=======================================================================
146
147 /**
148 * Constructor from a flattened set of RBBI data in malloced memory.
149 * RuleBasedBreakIterators built from a custom set of rules
150 * are created via this constructor; the rules are compiled
151 * into memory, then the break iterator is constructed here.
152 *
153 * The break iterator adopts the memory, and will
154 * free it when done.
155 * @internal (private)
156 */
157 RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
158
159 /** @internal */
160 friend class RBBIRuleBuilder;
161 /** @internal */
162 friend class BreakIterator;
163
164public:
165
166 /** Default constructor. Creates an empty shell of an iterator, with no
167 * rules or text to iterate over. Object can subsequently be assigned to.
168 * @stable ICU 2.2
169 */
170 RuleBasedBreakIterator();
171
172 /**
173 * Copy constructor. Will produce a break iterator with the same behavior,
174 * and which iterates over the same text, as the one passed in.
175 * @param that The RuleBasedBreakIterator passed to be copied
176 * @stable ICU 2.0
177 */
178 RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
179
180 /**
181 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
182 * @param rules The break rules to be used.
183 * @param parseError In the event of a syntax error in the rules, provides the location
184 * within the rules of the problem.
185 * @param status Information on any errors encountered.
186 * @stable ICU 2.2
187 */
188 RuleBasedBreakIterator( const UnicodeString &rules,
189 UParseError &parseError,
190 UErrorCode &status);
191
192 /**
193 * Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
194 * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules().
195 * Construction of a break iterator in this way is substantially faster than
196 * construction from source rules.
197 *
198 * Ownership of the storage containing the compiled rules remains with the
199 * caller of this function. The compiled rules must not be modified or
200 * deleted during the life of the break iterator.
201 *
202 * The compiled rules are not compatible across different major versions of ICU.
203 * The compiled rules are compatible only between machines with the same
204 * byte ordering (little or big endian) and the same base character set family
205 * (ASCII or EBCDIC).
206 *
207 * @see #getBinaryRules
208 * @param compiledRules A pointer to the compiled break rules to be used.
209 * @param ruleLength The length of the compiled break rules, in bytes. This
210 * corresponds to the length value produced by getBinaryRules().
211 * @param status Information on any errors encountered, including invalid
212 * binary rules.
213 * @stable ICU 4.8
214 */
215 RuleBasedBreakIterator(const uint8_t *compiledRules,
216 uint32_t ruleLength,
217 UErrorCode &status);
218
219 /**
220 * This constructor uses the udata interface to create a BreakIterator
221 * whose internal tables live in a memory-mapped file. "image" is an
222 * ICU UDataMemory handle for the pre-compiled break iterator tables.
223 * @param image handle to the memory image for the break iterator data.
224 * Ownership of the UDataMemory handle passes to the Break Iterator,
225 * which will be responsible for closing it when it is no longer needed.
226 * @param status Information on any errors encountered.
227 * @see udata_open
228 * @see #getBinaryRules
229 * @stable ICU 2.8
230 */
231 RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
232
233 /**
234 * Destructor
235 * @stable ICU 2.0
236 */
237 virtual ~RuleBasedBreakIterator();
238
239 /**
240 * Assignment operator. Sets this iterator to have the same behavior,
241 * and iterate over the same text, as the one passed in.
242 * @param that The RuleBasedBreakIterator passed in
243 * @return a reference to the assigned-to RuleBasedBreakIterator (presumably *this; the definition is not in this header)
244 * @stable ICU 2.0
245 */
246 RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
247
248 /**
249 * Equality operator. Returns TRUE if both BreakIterators are of the
250 * same class, have the same behavior, and iterate over the same text.
251 * @param that The BreakIterator to be compared for equality
252 * @return TRUE if both BreakIterators are of the
253 * same class, have the same behavior, and iterate over the same text.
254 * @stable ICU 2.0
255 */
256 virtual UBool operator==(const BreakIterator& that) const;
257
258 /**
259 * Not-equal operator. If operator== returns TRUE, this returns FALSE,
260 * and vice versa.
261 * @param that The BreakIterator to be compared for inequality
262 * @return TRUE if the two BreakIterators are not equal, i.e. the negation of operator==(that).
263 * @stable ICU 2.0
264 */
265 inline UBool operator!=(const BreakIterator& that) const;
266
267 /**
268 * Returns a newly-constructed RuleBasedBreakIterator with the same
269 * behavior, and iterating over the same text, as this one.
270 * Differs from the copy constructor in that it is polymorphic, and
271 * will correctly clone (copy) a derived class.
272 * clone() is thread safe. Multiple threads may simultaneously
273 * clone the same source break iterator.
274 * @return a newly-constructed RuleBasedBreakIterator
275 * @stable ICU 2.0
276 */
277 virtual RuleBasedBreakIterator* clone() const;
278
279 /**
280 * Compute a hash code for this BreakIterator
281 * @return A hash code
282 * @stable ICU 2.0
283 */
284 virtual int32_t hashCode(void) const;
285
286 /**
287 * Returns the description used to create this iterator
288 * @return the description used to create this iterator
289 * @stable ICU 2.0
290 */
291 virtual const UnicodeString& getRules(void) const;
292
293 //=======================================================================
294 // BreakIterator overrides
295 //=======================================================================
296
297 /**
298 * <p>
299 * Return a CharacterIterator over the text being analyzed.
300 * The returned character iterator is owned by the break iterator, and must
301 * not be deleted by the caller. Repeated calls to this function may
302 * return the same CharacterIterator.
303 * </p>
304 * <p>
305 * The returned character iterator must not be used concurrently with
306 * the break iterator. If concurrent operation is needed, clone the
307 * returned character iterator first and operate on the clone.
308 * </p>
309 * <p>
310 * When the break iterator is operating on text supplied via a UText,
311 * this function will fail. Lacking any way to signal failures, it
312 * returns a CharacterIterator containing no text.
313 * The function getUText() provides similar functionality,
314 * is reliable, and is more efficient.
315 * </p>
316 *
317 * TODO: deprecate this function?
318 *
319 * @return An iterator over the text being analyzed.
320 * @stable ICU 2.0
321 */
322 virtual CharacterIterator& getText(void) const;
323
324
325 /**
326 * Get a UText for the text being analyzed.
327 * The returned UText is a shallow clone of the UText used internally
328 * by the break iterator implementation. It can safely be used to
329 * access the text without impacting any break iterator operations,
330 * but the underlying text itself must not be altered.
331 *
332 * @param fillIn A UText to be filled in. If NULL, a new UText will be
333 * allocated to hold the result.
334 * @param status receives any error codes.
335 * @return The current UText for this break iterator. If an input
336 * UText was provided, it will always be returned.
337 * @stable ICU 3.4
338 */
339 virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
340
341 /**
342 * Set the iterator to analyze a new piece of text. This function resets
343 * the current iteration position to the beginning of the text.
344 * @param newText An iterator over the text to analyze. The BreakIterator
345 * takes ownership of the character iterator. The caller MUST NOT delete it!
346 * @stable ICU 2.0
347 */
348 virtual void adoptText(CharacterIterator* newText);
349
350 /**
351 * Set the iterator to analyze a new piece of text. This function resets
352 * the current iteration position to the beginning of the text.
353 *
354 * The BreakIterator will retain a reference to the supplied string.
355 * The caller must not modify or delete the text while the BreakIterator
356 * retains the reference.
357 *
358 * @param newText The text to analyze.
359 * @stable ICU 2.0
360 */
361 virtual void setText(const UnicodeString& newText);
362
363 /**
364 * Reset the break iterator to operate over the text represented by
365 * the UText. The iterator position is reset to the start.
366 *
367 * This function makes a shallow clone of the supplied UText. This means
368 * that the caller is free to immediately close or otherwise reuse the
369 * UText that was passed as a parameter, but that the underlying text itself
370 * must not be altered while being referenced by the break iterator.
371 *
372 * @param text The UText used to change the text.
373 * @param status Receives any error codes.
374 * @stable ICU 3.4
375 */
376 virtual void setText(UText *text, UErrorCode &status);
377
378 /**
379 * Sets the current iteration position to the beginning of the text, position zero.
380 * @return The offset of the beginning of the text, zero.
381 * @stable ICU 2.0
382 */
383 virtual int32_t first(void);
384
385 /**
386 * Sets the current iteration position to the end of the text.
387 * @return The text's past-the-end offset.
388 * @stable ICU 2.0
389 */
390 virtual int32_t last(void);
391
392 /**
393 * Advances the iterator either forward or backward the specified number of steps.
394 * Negative values move backward, and positive values move forward. This is
395 * equivalent to repeatedly calling next() or previous().
396 * @param n The number of steps to move. The sign indicates the direction
397 * (negative is backwards, and positive is forwards).
398 * @return The character offset of the boundary position n boundaries away from
399 * the current one.
400 * @stable ICU 2.0
401 */
402 virtual int32_t next(int32_t n);
403
404 /**
405 * Advances the iterator to the next boundary position.
406 * @return The position of the first boundary after this one.
407 * @stable ICU 2.0
408 */
409 virtual int32_t next(void);
410
411 /**
412 * Moves the iterator backwards, to the last boundary preceding this one.
413 * @return The position of the last boundary position preceding this one.
414 * @stable ICU 2.0
415 */
416 virtual int32_t previous(void);
417
418 /**
419 * Sets the iterator to refer to the first boundary position following
420 * the specified position.
421 * @param offset The position from which to begin searching for a break position.
422 * @return The position of the first break after the specified position.
423 * @stable ICU 2.0
424 */
425 virtual int32_t following(int32_t offset);
426
427 /**
428 * Sets the iterator to refer to the last boundary position before the
429 * specified position.
430 * @param offset The position to begin searching for a break from.
431 * @return The position of the last boundary before the starting position.
432 * @stable ICU 2.0
433 */
434 virtual int32_t preceding(int32_t offset);
435
436 /**
437 * Returns true if the specified position is a boundary position. As a side
438 * effect, leaves the iterator pointing to the first boundary position at
439 * or after "offset".
440 * @param offset the offset to check.
441 * @return True if "offset" is a boundary position.
442 * @stable ICU 2.0
443 */
444 virtual UBool isBoundary(int32_t offset);
445
446 /**
447 * Returns the current iteration position. Note that UBRK_DONE is never
448 * returned from this function; if iteration has run to the end of a
449 * string, current() will return the length of the string while
450 * next() will return UBRK_DONE.
451 * @return The current iteration position.
452 * @stable ICU 2.0
453 */
454 virtual int32_t current(void) const;
455
456
457 /**
458 * Return the status tag from the break rule that determined the boundary at
459 * the current iteration position. For break rules that do not specify a
460 * status, a default value of 0 is returned. If more than one break rule
461 * would cause a boundary to be located at some position in the text,
462 * the numerically largest of the applicable status values is returned.
463 * <p>
464 * Of the standard types of ICU break iterators, only word break and
465 * line break provide status values. The values are defined in
466 * the header file ubrk.h. For Word breaks, the status allows distinguishing between words
467 * that contain alphabetic letters, "words" that appear to be numbers,
468 * punctuation and spaces, words containing ideographic characters, and
469 * more. For Line Break, the status distinguishes between hard (mandatory) breaks
470 * and soft (potential) break positions.
471 * <p>
472 * <code>getRuleStatus()</code> can be called after obtaining a boundary
473 * position from <code>next()</code>, <code>previous()</code>, or
474 * any other break iterator function that returns a boundary position.
475 * <p>
476 * Note that <code>getRuleStatus()</code> returns the value corresponding to
477 * <code>current()</code> index even after <code>next()</code> has returned DONE.
478 * <p>
479 * When creating custom break rules, one is free to define whatever
480 * status values may be convenient for the application.
481 * <p>
482 * @return the status from the break rule that determined the boundary
483 * at the current iteration position.
484 *
485 * @see UWordBreak
486 * @stable ICU 2.2
487 */
488 virtual int32_t getRuleStatus() const;
489
490 /**
491 * Get the status (tag) values from the break rule(s) that determined the boundary
492 * at the current iteration position.
493 * <p>
494 * The returned status value(s) are stored into an array provided by the caller.
495 * The values are stored in sorted (ascending) order.
496 * If the capacity of the output array is insufficient to hold the data,
497 * the output will be truncated to the available length, and a
498 * U_BUFFER_OVERFLOW_ERROR will be signaled.
499 *
500 * @param fillInVec an array to be filled in with the status values.
501 * @param capacity the length of the supplied vector. A length of zero causes
502 * the function to return the number of status values, in the
503 * normal way, without attempting to store any values.
504 * @param status receives error codes.
505 * @return The number of rule status values from the rules that determined
506 * the boundary at the current iteration position.
507 * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
508 * is the total number of status values that were available,
509 * not the reduced number that were actually returned.
510 * @see getRuleStatus
511 * @stable ICU 3.0
512 */
513 virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
514
515 /**
516 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
517 * This method is to implement a simple version of RTTI, since not all
518 * C++ compilers support genuine RTTI. Polymorphic operator==() and
519 * clone() methods call this method.
520 *
521 * @return The class ID for this object. All objects of a
522 * given class have the same class ID. Objects of
523 * other classes have different class IDs.
524 * @stable ICU 2.0
525 */
526 virtual UClassID getDynamicClassID(void) const;
527
528 /**
529 * Returns the class ID for this class. This is useful only for
530 * comparing to a return value from getDynamicClassID(). For example:
531 *
532 * Base* polymorphic_pointer = createPolymorphicObject();
533 * if (polymorphic_pointer->getDynamicClassID() ==
534 * Derived::getStaticClassID()) ...
535 *
536 * @return The class ID for all objects of this class.
537 * @stable ICU 2.0
538 */
539 static UClassID U_EXPORT2 getStaticClassID(void);
540
541#ifndef U_FORCE_HIDE_DEPRECATED_API
542 /**
543 * Deprecated functionality. Use clone() instead.
544 *
545 * Create a clone (copy) of this break iterator in memory provided
546 * by the caller. The idea is to increase performance by avoiding
547 * a storage allocation. Use of this function is NOT RECOMMENDED.
548 * Performance gains are minimal, and correct buffer management is
549 * tricky. Use clone() instead.
550 *
551 * @param stackBuffer The pointer to the memory into which the cloned object
552 * should be placed. If NULL, allocate heap memory
553 * for the cloned object.
554 * @param BufferSize The size of the buffer. If zero, return the required
555 * buffer size, but do not clone the object. If the
556 * size was too small (but not zero), allocate heap
557 * storage for the cloned object.
558 *
559 * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
560 * returned if the provided buffer was too small, and
561 * the clone was therefore put on the heap.
562 *
563 * @return Pointer to the clone object. This may differ from the stackBuffer
564 * address if the byte alignment of the stack buffer was not suitable
565 * or if the stackBuffer was too small to hold the clone.
566 * @deprecated ICU 52. Use clone() instead.
567 */
568 virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
569 int32_t &BufferSize,
570 UErrorCode &status);
571#endif // U_FORCE_HIDE_DEPRECATED_API
572
573 /**
574 * Return the binary form of compiled break rules,
575 * which can then be used to create a new break iterator at some
576 * time in the future. Creating a break iterator from pre-compiled rules
577 * is much faster than building one from the source form of the
578 * break rules.
579 *
580 * The binary data can only be used with the same version of ICU
581 * and on the same platform type (processor endian-ness)
582 *
583 * @param length Returns the length of the binary data. (Out parameter.)
584 *
585 * @return A pointer to the binary (compiled) rule data. The storage
586 * belongs to the RuleBasedBreakIterator object, not the
587 * caller, and must not be modified or deleted.
588 * @stable ICU 4.8
589 */
590 virtual const uint8_t *getBinaryRules(uint32_t &length);
591
592 /**
593 * Set the subject text string upon which the break iterator is operating
594 * without changing any other aspect of the matching state.
595 * The new and previous text strings must have the same content.
596 *
597 * This function is intended for use in environments where ICU is operating on
598 * strings that may move around in memory. It provides a mechanism for notifying
599 * ICU that the string has been relocated, and providing a new UText to access the
600 * string in its new position.
601 *
602 * Note that the break iterator implementation never copies the underlying text
603 * of a string being processed, but always operates directly on the original text
604 * provided by the user. Refreshing simply drops the references to the old text
605 * and replaces them with references to the new.
606 *
607 * Caution: this function is normally used only by very specialized,
608 * system-level code. One example use case is with garbage collection that moves
609 * the text in memory.
610 *
611 * @param input The new (moved) text string.
612 * @param status Receives errors detected by this function.
613 * @return *this
614 *
615 * @stable ICU 49
616 */
617 virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
618
619
620private:
621 //=======================================================================
622 // implementation
623 //=======================================================================
624 /**
625 * Dumps caches and performs other actions associated with a complete change
626 * in text or iteration position.
627 * @internal (private)
628 */
629 void reset(void);
630
631 /**
632 * Common initialization function, used by constructors and bufferClone.
633 * @internal (private)
634 */
635 void init(UErrorCode &status);
636
637 /**
638 * Iterate backwards from an arbitrary position in the input text using the
639 * synthesized Safe Reverse rules.
640 * This locates a "Safe Position" from which the forward break rules
641 * will operate correctly. A Safe Position is not necessarily a boundary itself.
642 *
643 * @param fromPosition the position in the input text to begin the iteration.
644 * @internal (private)
645 */
646 int32_t handleSafePrevious(int32_t fromPosition);
647
648 /**
649 * Find a rule-based boundary by running the state machine.
650 * Input
651 * fPosition, the position in the text to begin from.
652 * Output
653 * fPosition: the boundary following the starting position.
654 * fDictionaryCharCount the number of dictionary characters encountered.
655 * If > 0, the segment will be further subdivided
656 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
657 *
658 * @internal (private)
659 */
660 int32_t handleNext();
661
662
663 /**
664 * This function returns the appropriate LanguageBreakEngine for a
665 * given character c.
666 * @param c A character in the dictionary set
667 * @internal (private)
668 */
669 const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
670
671 public:
672#ifndef U_HIDE_INTERNAL_API
673 /**
674 * Debugging function only.
675 * @internal
676 */
677 void dumpCache();
678
679 /**
680 * Debugging function only.
681 * @internal
682 */
683 void dumpTables();
684
685#endif /* U_HIDE_INTERNAL_API */
686};
687
688//------------------------------------------------------------------------------
689//
690// Inline Functions Definitions ...
691//
692//------------------------------------------------------------------------------
693
694inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
695 return !(this->operator==(that)); // inequality is simply the negation of the virtual operator==()
696}
697
698U_NAMESPACE_END
699
700#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
701
702#endif /* U_SHOW_CPLUSPLUS_API */
703
704#endif
705