1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016 International Business Machines Corporation
6* and others. All rights reserved.
7***************************************************************************
8*/
9//
10// file: rbbi.cpp Contains the implementation of the rule based break iterator
11// runtime engine and the API implementation for
12// class RuleBasedBreakIterator
13//
14
15#include "utypeinfo.h" // for 'typeid' to work
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include <cinttypes>
22
23#include "unicode/rbbi.h"
24#include "unicode/schriter.h"
25#include "unicode/uchriter.h"
26#include "unicode/uclean.h"
27#include "unicode/udata.h"
28
29#include "brkeng.h"
30#include "ucln_cmn.h"
31#include "cmemory.h"
32#include "cstring.h"
33#include "localsvc.h"
34#include "rbbidata.h"
35#include "rbbi_cache.h"
36#include "rbbirb.h"
37#include "uassert.h"
38#include "umutex.h"
39#include "uvectr32.h"
40
41#ifdef RBBI_DEBUG
42static UBool gTrace = FALSE;
43#endif
44
45U_NAMESPACE_BEGIN
46
47// The state number of the starting state
48constexpr int32_t START_STATE = 1;
49
50// The state-transition value indicating "stop"
51constexpr int32_t STOP_STATE = 0;
52
53
54UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
55
56
57//=======================================================================
58// constructors
59//=======================================================================
60
61/**
62 * Constructs a RuleBasedBreakIterator that uses the already-created
63 * tables object that is passed in as a parameter.
64 */
65RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
66 : fSCharIter(UnicodeString())
67{
68 init(status);
69 fData = new RBBIDataWrapper(data, status); // status checked in constructor
70 if (U_FAILURE(status)) {return;}
71 if(fData == 0) {
72 status = U_MEMORY_ALLOCATION_ERROR;
73 return;
74 }
75}
76
77//
78// Construct from precompiled binary rules (tables). This constructor is public API,
79// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
80//
81RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
82 uint32_t ruleLength,
83 UErrorCode &status)
84 : fSCharIter(UnicodeString())
85{
86 init(status);
87 if (U_FAILURE(status)) {
88 return;
89 }
90 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
91 status = U_ILLEGAL_ARGUMENT_ERROR;
92 return;
93 }
94 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
95 if (data->fLength > ruleLength) {
96 status = U_ILLEGAL_ARGUMENT_ERROR;
97 return;
98 }
99 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
100 if (U_FAILURE(status)) {return;}
101 if(fData == 0) {
102 status = U_MEMORY_ALLOCATION_ERROR;
103 return;
104 }
105}
106
107
108//-------------------------------------------------------------------------------
109//
110// Constructor from a UDataMemory handle to precompiled break rules
111// stored in an ICU data file.
112//
113//-------------------------------------------------------------------------------
114RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
115 : fSCharIter(UnicodeString())
116{
117 init(status);
118 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
119 if (U_FAILURE(status)) {return;}
120 if(fData == 0) {
121 status = U_MEMORY_ALLOCATION_ERROR;
122 return;
123 }
124}
125
126
127
128//-------------------------------------------------------------------------------
129//
130// Constructor from a set of rules supplied as a string.
131//
132//-------------------------------------------------------------------------------
133RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
134 UParseError &parseError,
135 UErrorCode &status)
136 : fSCharIter(UnicodeString())
137{
138 init(status);
139 if (U_FAILURE(status)) {return;}
140 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
141 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
142 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
143 // creates and returns a complete RBBI. From here, in a constructor, we
144 // can't just return the object created by the builder factory, hence
145 // the assignment of the factory created object to "this".
146 if (U_SUCCESS(status)) {
147 *this = *bi;
148 delete bi;
149 }
150}
151
152
153//-------------------------------------------------------------------------------
154//
155// Default Constructor. Create an empty shell that can be set up later.
156// Used when creating a RuleBasedBreakIterator from a set
157// of rules.
158//-------------------------------------------------------------------------------
159RuleBasedBreakIterator::RuleBasedBreakIterator()
160 : fSCharIter(UnicodeString())
161{
162 UErrorCode status = U_ZERO_ERROR;
163 init(status);
164}
165
166
167//-------------------------------------------------------------------------------
168//
169// Copy constructor. Will produce a break iterator with the same behavior,
170// and which iterates over the same text, as the one passed in.
171//
172//-------------------------------------------------------------------------------
173RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
174: BreakIterator(other),
175 fSCharIter(UnicodeString())
176{
177 UErrorCode status = U_ZERO_ERROR;
178 this->init(status);
179 *this = other;
180}
181
182
183/**
184 * Destructor
185 */
186RuleBasedBreakIterator::~RuleBasedBreakIterator() {
187 if (fCharIter != &fSCharIter) {
188 // fCharIter was adopted from the outside.
189 delete fCharIter;
190 }
191 fCharIter = NULL;
192
193 utext_close(&fText);
194
195 if (fData != NULL) {
196 fData->removeReference();
197 fData = NULL;
198 }
199 delete fBreakCache;
200 fBreakCache = NULL;
201
202 delete fDictionaryCache;
203 fDictionaryCache = NULL;
204
205 delete fLanguageBreakEngines;
206 fLanguageBreakEngines = NULL;
207
208 delete fUnhandledBreakEngine;
209 fUnhandledBreakEngine = NULL;
210}
211
212/**
213 * Assignment operator. Sets this iterator to have the same behavior,
214 * and iterate over the same text, as the one passed in.
215 */
216RuleBasedBreakIterator&
217RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
218 if (this == &that) {
219 return *this;
220 }
221 BreakIterator::operator=(that);
222
223 if (fLanguageBreakEngines != NULL) {
224 delete fLanguageBreakEngines;
225 fLanguageBreakEngines = NULL; // Just rebuild for now
226 }
227 // TODO: clone fLanguageBreakEngines from "that"
228 UErrorCode status = U_ZERO_ERROR;
229 utext_clone(&fText, &that.fText, FALSE, TRUE, &status);
230
231 if (fCharIter != &fSCharIter) {
232 delete fCharIter;
233 }
234 fCharIter = &fSCharIter;
235
236 if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
237 // This is a little bit tricky - it will intially appear that
238 // this->fCharIter is adopted, even if that->fCharIter was
239 // not adopted. That's ok.
240 fCharIter = that.fCharIter->clone();
241 }
242 fSCharIter = that.fSCharIter;
243 if (fCharIter == NULL) {
244 fCharIter = &fSCharIter;
245 }
246
247 if (fData != NULL) {
248 fData->removeReference();
249 fData = NULL;
250 }
251 if (that.fData != NULL) {
252 fData = that.fData->addReference();
253 }
254
255 fPosition = that.fPosition;
256 fRuleStatusIndex = that.fRuleStatusIndex;
257 fDone = that.fDone;
258
259 // TODO: both the dictionary and the main cache need to be copied.
260 // Current position could be within a dictionary range. Trying to continue
261 // the iteration without the caches present would go to the rules, with
262 // the assumption that the current position is on a rule boundary.
263 fBreakCache->reset(fPosition, fRuleStatusIndex);
264 fDictionaryCache->reset();
265
266 return *this;
267}
268
269
270
271//-----------------------------------------------------------------------------
272//
273// init() Shared initialization routine. Used by all the constructors.
274// Initializes all fields, leaving the object in a consistent state.
275//
276//-----------------------------------------------------------------------------
277void RuleBasedBreakIterator::init(UErrorCode &status) {
278 fCharIter = NULL;
279 fData = NULL;
280 fPosition = 0;
281 fRuleStatusIndex = 0;
282 fDone = false;
283 fDictionaryCharCount = 0;
284 fLanguageBreakEngines = NULL;
285 fUnhandledBreakEngine = NULL;
286 fBreakCache = NULL;
287 fDictionaryCache = NULL;
288
289 // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
290 // fText = UTEXT_INITIALIZER;
291 static const UText initializedUText = UTEXT_INITIALIZER;
292 uprv_memcpy(&fText, &initializedUText, sizeof(UText));
293
294 if (U_FAILURE(status)) {
295 return;
296 }
297
298 utext_openUChars(&fText, NULL, 0, &status);
299 fDictionaryCache = new DictionaryCache(this, status);
300 fBreakCache = new BreakCache(this, status);
301 if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) {
302 status = U_MEMORY_ALLOCATION_ERROR;
303 }
304
305#ifdef RBBI_DEBUG
306 static UBool debugInitDone = FALSE;
307 if (debugInitDone == FALSE) {
308 char *debugEnv = getenv("U_RBBIDEBUG");
309 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
310 gTrace = TRUE;
311 }
312 debugInitDone = TRUE;
313 }
314#endif
315}
316
317
318
319//-----------------------------------------------------------------------------
320//
321// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
322// behavior, and iterating over the same text, as this one.
323// Virtual function: does the right thing with subclasses.
324//
325//-----------------------------------------------------------------------------
326RuleBasedBreakIterator*
327RuleBasedBreakIterator::clone() const {
328 return new RuleBasedBreakIterator(*this);
329}
330
331/**
332 * Equality operator. Returns TRUE if both BreakIterators are of the
333 * same class, have the same behavior, and iterate over the same text.
334 */
335UBool
336RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
337 if (typeid(*this) != typeid(that)) {
338 return FALSE;
339 }
340 if (this == &that) {
341 return TRUE;
342 }
343
344 // The base class BreakIterator carries no state that participates in equality,
345 // and does not implement an equality function that would otherwise be
346 // checked at this point.
347
348 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
349
350 if (!utext_equals(&fText, &that2.fText)) {
351 // The two break iterators are operating on different text,
352 // or have a different iteration position.
353 // Note that fText's position is always the same as the break iterator's position.
354 return FALSE;
355 }
356
357 if (!(fPosition == that2.fPosition &&
358 fRuleStatusIndex == that2.fRuleStatusIndex &&
359 fDone == that2.fDone)) {
360 return FALSE;
361 }
362
363 if (that2.fData == fData ||
364 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
365 // The two break iterators are using the same rules.
366 return TRUE;
367 }
368 return FALSE;
369}
370
371/**
372 * Compute a hash code for this BreakIterator
373 * @return A hash code
374 */
375int32_t
376RuleBasedBreakIterator::hashCode(void) const {
377 int32_t hash = 0;
378 if (fData != NULL) {
379 hash = fData->hashCode();
380 }
381 return hash;
382}
383
384
385void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
386 if (U_FAILURE(status)) {
387 return;
388 }
389 fBreakCache->reset();
390 fDictionaryCache->reset();
391 utext_clone(&fText, ut, FALSE, TRUE, &status);
392
393 // Set up a dummy CharacterIterator to be returned if anyone
394 // calls getText(). With input from UText, there is no reasonable
395 // way to return a characterIterator over the actual input text.
396 // Return one over an empty string instead - this is the closest
397 // we can come to signaling a failure.
398 // (GetText() is obsolete, this failure is sort of OK)
399 fSCharIter.setText(UnicodeString());
400
401 if (fCharIter != &fSCharIter) {
402 // existing fCharIter was adopted from the outside. Delete it now.
403 delete fCharIter;
404 }
405 fCharIter = &fSCharIter;
406
407 this->first();
408}
409
410
411UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
412 UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status);
413 return result;
414}
415
416
417//=======================================================================
418// BreakIterator overrides
419//=======================================================================
420
421/**
422 * Return a CharacterIterator over the text being analyzed.
423 */
424CharacterIterator&
425RuleBasedBreakIterator::getText() const {
426 return *fCharIter;
427}
428
429/**
430 * Set the iterator to analyze a new piece of text. This function resets
431 * the current iteration position to the beginning of the text.
432 * @param newText An iterator over the text to analyze.
433 */
434void
435RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
436 // If we are holding a CharacterIterator adopted from a
437 // previous call to this function, delete it now.
438 if (fCharIter != &fSCharIter) {
439 delete fCharIter;
440 }
441
442 fCharIter = newText;
443 UErrorCode status = U_ZERO_ERROR;
444 fBreakCache->reset();
445 fDictionaryCache->reset();
446 if (newText==NULL || newText->startIndex() != 0) {
447 // startIndex !=0 wants to be an error, but there's no way to report it.
448 // Make the iterator text be an empty string.
449 utext_openUChars(&fText, NULL, 0, &status);
450 } else {
451 utext_openCharacterIterator(&fText, newText, &status);
452 }
453 this->first();
454}
455
456/**
457 * Set the iterator to analyze a new piece of text. This function resets
458 * the current iteration position to the beginning of the text.
459 * @param newText An iterator over the text to analyze.
460 */
461void
462RuleBasedBreakIterator::setText(const UnicodeString& newText) {
463 UErrorCode status = U_ZERO_ERROR;
464 fBreakCache->reset();
465 fDictionaryCache->reset();
466 utext_openConstUnicodeString(&fText, &newText, &status);
467
468 // Set up a character iterator on the string.
469 // Needed in case someone calls getText().
470 // Can not, unfortunately, do this lazily on the (probably never)
471 // call to getText(), because getText is const.
472 fSCharIter.setText(newText);
473
474 if (fCharIter != &fSCharIter) {
475 // old fCharIter was adopted from the outside. Delete it.
476 delete fCharIter;
477 }
478 fCharIter = &fSCharIter;
479
480 this->first();
481}
482
483
484/**
485 * Provide a new UText for the input text. Must reference text with contents identical
486 * to the original.
487 * Intended for use with text data originating in Java (garbage collected) environments
488 * where the data may be moved in memory at arbitrary times.
489 */
490RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
491 if (U_FAILURE(status)) {
492 return *this;
493 }
494 if (input == NULL) {
495 status = U_ILLEGAL_ARGUMENT_ERROR;
496 return *this;
497 }
498 int64_t pos = utext_getNativeIndex(&fText);
499 // Shallow read-only clone of the new UText into the existing input UText
500 utext_clone(&fText, input, FALSE, TRUE, &status);
501 if (U_FAILURE(status)) {
502 return *this;
503 }
504 utext_setNativeIndex(&fText, pos);
505 if (utext_getNativeIndex(&fText) != pos) {
506 // Sanity check. The new input utext is supposed to have the exact same
507 // contents as the old. If we can't set to the same position, it doesn't.
508 // The contents underlying the old utext might be invalid at this point,
509 // so it's not safe to check directly.
510 status = U_ILLEGAL_ARGUMENT_ERROR;
511 }
512 return *this;
513}
514
515
516/**
517 * Sets the current iteration position to the beginning of the text, position zero.
518 * @return The new iterator position, which is zero.
519 */
520int32_t RuleBasedBreakIterator::first(void) {
521 UErrorCode status = U_ZERO_ERROR;
522 if (!fBreakCache->seek(0)) {
523 fBreakCache->populateNear(0, status);
524 }
525 fBreakCache->current();
526 U_ASSERT(fPosition == 0);
527 return 0;
528}
529
530/**
531 * Sets the current iteration position to the end of the text.
532 * @return The text's past-the-end offset.
533 */
534int32_t RuleBasedBreakIterator::last(void) {
535 int32_t endPos = (int32_t)utext_nativeLength(&fText);
536 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
537 (void)endShouldBeBoundary;
538 U_ASSERT(endShouldBeBoundary);
539 U_ASSERT(fPosition == endPos);
540 return endPos;
541}
542
543/**
544 * Advances the iterator either forward or backward the specified number of steps.
545 * Negative values move backward, and positive values move forward. This is
546 * equivalent to repeatedly calling next() or previous().
547 * @param n The number of steps to move. The sign indicates the direction
548 * (negative is backwards, and positive is forwards).
549 * @return The character offset of the boundary position n boundaries away from
550 * the current one.
551 */
552int32_t RuleBasedBreakIterator::next(int32_t n) {
553 int32_t result = 0;
554 if (n > 0) {
555 for (; n > 0 && result != UBRK_DONE; --n) {
556 result = next();
557 }
558 } else if (n < 0) {
559 for (; n < 0 && result != UBRK_DONE; ++n) {
560 result = previous();
561 }
562 } else {
563 result = current();
564 }
565 return result;
566}
567
568/**
569 * Advances the iterator to the next boundary position.
570 * @return The position of the first boundary after this one.
571 */
572int32_t RuleBasedBreakIterator::next(void) {
573 fBreakCache->next();
574 return fDone ? UBRK_DONE : fPosition;
575}
576
577/**
578 * Move the iterator backwards, to the boundary preceding the current one.
579 *
580 * Starts from the current position within fText.
581 * Starting position need not be on a boundary.
582 *
583 * @return The position of the boundary position immediately preceding the starting position.
584 */
585int32_t RuleBasedBreakIterator::previous(void) {
586 UErrorCode status = U_ZERO_ERROR;
587 fBreakCache->previous(status);
588 return fDone ? UBRK_DONE : fPosition;
589}
590
591/**
592 * Sets the iterator to refer to the first boundary position following
593 * the specified position.
594 * @param startPos The position from which to begin searching for a break position.
595 * @return The position of the first break after the current position.
596 */
597int32_t RuleBasedBreakIterator::following(int32_t startPos) {
598 // if the supplied position is before the beginning, return the
599 // text's starting offset
600 if (startPos < 0) {
601 return first();
602 }
603
604 // Move requested offset to a code point start. It might be on a trail surrogate,
605 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
606 utext_setNativeIndex(&fText, startPos);
607 startPos = (int32_t)utext_getNativeIndex(&fText);
608
609 UErrorCode status = U_ZERO_ERROR;
610 fBreakCache->following(startPos, status);
611 return fDone ? UBRK_DONE : fPosition;
612}
613
614/**
615 * Sets the iterator to refer to the last boundary position before the
616 * specified position.
617 * @param offset The position to begin searching for a break from.
618 * @return The position of the last boundary before the starting position.
619 */
620int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
621 if (offset > utext_nativeLength(&fText)) {
622 return last();
623 }
624
625 // Move requested offset to a code point start. It might be on a trail surrogate,
626 // or on a trail byte if the input is UTF-8.
627
628 utext_setNativeIndex(&fText, offset);
629 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
630
631 UErrorCode status = U_ZERO_ERROR;
632 fBreakCache->preceding(adjustedOffset, status);
633 return fDone ? UBRK_DONE : fPosition;
634}
635
636/**
637 * Returns true if the specfied position is a boundary position. As a side
638 * effect, leaves the iterator pointing to the first boundary position at
639 * or after "offset".
640 *
641 * @param offset the offset to check.
642 * @return True if "offset" is a boundary position.
643 */
644UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
645 // out-of-range indexes are never boundary positions
646 if (offset < 0) {
647 first(); // For side effects on current position, tag values.
648 return FALSE;
649 }
650
651 // Adjust offset to be on a code point boundary and not beyond the end of the text.
652 // Note that isBoundary() is always false for offsets that are not on code point boundaries.
653 // But we still need the side effect of leaving iteration at the following boundary.
654
655 utext_setNativeIndex(&fText, offset);
656 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
657
658 bool result = false;
659 UErrorCode status = U_ZERO_ERROR;
660 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
661 result = (fBreakCache->current() == offset);
662 }
663
664 if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
665 // Original offset is beyond the end of the text. Return FALSE, it's not a boundary,
666 // but the iteration position remains set to the end of the text, which is a boundary.
667 return FALSE;
668 }
669 if (!result) {
670 // Not on a boundary. isBoundary() must leave iterator on the following boundary.
671 // Cache->seek(), above, left us on the preceding boundary, so advance one.
672 next();
673 }
674 return result;
675}
676
677
678/**
679 * Returns the current iteration position.
680 * @return The current iteration position.
681 */
682int32_t RuleBasedBreakIterator::current(void) const {
683 return fPosition;
684}
685
686
687//=======================================================================
688// implementation
689//=======================================================================
690
691//
692// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
693// of user text. A variable with this enum type keeps track of where we
694// are. The state machine only fetches user input while in the RUN mode.
695//
696enum RBBIRunMode {
697 RBBI_START, // state machine processing is before first char of input
698 RBBI_RUN, // state machine processing is in the user text
699 RBBI_END // state machine processing is after end of user text.
700};
701
702
703// Map from look-ahead break states (corresponds to rules) to boundary positions.
704// Allows multiple lookahead break rules to be in flight at the same time.
705//
706// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
707// in the state table be sequential, then we can just index an array. And the
708// table could also tell us in advance how big that array needs to be.
709//
710// Before ICU 57 there was just a single simple variable for a look-ahead match that
711// was in progress. Two rules at once did not work.
712
713static const int32_t kMaxLookaheads = 8;
714struct LookAheadResults {
715 int32_t fUsedSlotLimit;
716 int32_t fPositions[8];
717 int16_t fKeys[8];
718
719 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
720
721 int32_t getPosition(int16_t key) {
722 for (int32_t i=0; i<fUsedSlotLimit; ++i) {
723 if (fKeys[i] == key) {
724 return fPositions[i];
725 }
726 }
727 UPRV_UNREACHABLE;
728 }
729
730 void setPosition(int16_t key, int32_t position) {
731 int32_t i;
732 for (i=0; i<fUsedSlotLimit; ++i) {
733 if (fKeys[i] == key) {
734 fPositions[i] = position;
735 return;
736 }
737 }
738 if (i >= kMaxLookaheads) {
739 UPRV_UNREACHABLE;
740 }
741 fKeys[i] = key;
742 fPositions[i] = position;
743 U_ASSERT(fUsedSlotLimit == i);
744 fUsedSlotLimit = i + 1;
745 }
746};
747
748
749//-----------------------------------------------------------------------------------
750//
751// handleNext()
752// Run the state machine to find a boundary
753//
754//-----------------------------------------------------------------------------------
755int32_t RuleBasedBreakIterator::handleNext() {
756 int32_t state;
757 uint16_t category = 0;
758 RBBIRunMode mode;
759
760 RBBIStateTableRow *row;
761 UChar32 c;
762 LookAheadResults lookAheadMatches;
763 int32_t result = 0;
764 int32_t initialPosition = 0;
765 const RBBIStateTable *statetable = fData->fForwardTable;
766 const char *tableData = statetable->fTableData;
767 uint32_t tableRowLen = statetable->fRowLen;
768 #ifdef RBBI_DEBUG
769 if (gTrace) {
770 RBBIDebugPuts("Handle Next pos char state category");
771 }
772 #endif
773
774 // handleNext alway sets the break tag value.
775 // Set the default for it.
776 fRuleStatusIndex = 0;
777
778 fDictionaryCharCount = 0;
779
780 // if we're already at the end of the text, return DONE.
781 initialPosition = fPosition;
782 UTEXT_SETNATIVEINDEX(&fText, initialPosition);
783 result = initialPosition;
784 c = UTEXT_NEXT32(&fText);
785 if (c==U_SENTINEL) {
786 fDone = TRUE;
787 return UBRK_DONE;
788 }
789
790 // Set the initial state for the state machine
791 state = START_STATE;
792 row = (RBBIStateTableRow *)
793 //(statetable->fTableData + (statetable->fRowLen * state));
794 (tableData + tableRowLen * state);
795
796
797 mode = RBBI_RUN;
798 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
799 category = 2;
800 mode = RBBI_START;
801 }
802
803
804 // loop until we reach the end of the text or transition to state 0
805 //
806 for (;;) {
807 if (c == U_SENTINEL) {
808 // Reached end of input string.
809 if (mode == RBBI_END) {
810 // We have already run the loop one last time with the
811 // character set to the psueudo {eof} value. Now it is time
812 // to unconditionally bail out.
813 break;
814 }
815 // Run the loop one last time with the fake end-of-input character category.
816 mode = RBBI_END;
817 category = 1;
818 }
819
820 //
821 // Get the char category. An incoming category of 1 or 2 means that
822 // we are preset for doing the beginning or end of input, and
823 // that we shouldn't get a category from an actual text input character.
824 //
825 if (mode == RBBI_RUN) {
826 // look up the current character's character category, which tells us
827 // which column in the state table to look at.
828 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
829 // not the size of the character going in, which is a UChar32.
830 //
831 category = UTRIE2_GET16(fData->fTrie, c);
832
833 // Check the dictionary bit in the character's category.
834 // Counter is only used by dictionary based iteration.
835 // Chars that need to be handled by a dictionary have a flag bit set
836 // in their category values.
837 //
838 if ((category & 0x4000) != 0) {
839 fDictionaryCharCount++;
840 // And off the dictionary flag bit.
841 category &= ~0x4000;
842 }
843 }
844
845 #ifdef RBBI_DEBUG
846 if (gTrace) {
847 RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText));
848 if (0x20<=c && c<0x7f) {
849 RBBIDebugPrintf("\"%c\" ", c);
850 } else {
851 RBBIDebugPrintf("%5x ", c);
852 }
853 RBBIDebugPrintf("%3d %3d\n", state, category);
854 }
855 #endif
856
857 // State Transition - move machine to its next state
858 //
859
860 // fNextState is a variable-length array.
861 U_ASSERT(category<fData->fHeader->fCatCount);
862 state = row->fNextState[category]; /*Not accessing beyond memory*/
863 row = (RBBIStateTableRow *)
864 // (statetable->fTableData + (statetable->fRowLen * state));
865 (tableData + tableRowLen * state);
866
867
868 if (row->fAccepting == -1) {
869 // Match found, common case.
870 if (mode != RBBI_START) {
871 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
872 }
873 fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
874 }
875
876 int16_t completedRule = row->fAccepting;
877 if (completedRule > 0) {
878 // Lookahead match is completed.
879 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
880 if (lookaheadResult >= 0) {
881 fRuleStatusIndex = row->fTagIdx;
882 fPosition = lookaheadResult;
883 return lookaheadResult;
884 }
885 }
886
887 // If we are at the position of the '/' in a look-ahead (hard break) rule;
888 // record the current position, to be returned later, if the full rule matches.
889 // TODO: Move this check before the previous check of fAccepting.
890 // This would enable hard-break rules with no following context.
891 // But there are line break test failures when trying this. Investigate.
892 // Issue ICU-20837
893 int16_t rule = row->fLookAhead;
894 if (rule != 0) {
895 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
896 lookAheadMatches.setPosition(rule, pos);
897 }
898
899 if (state == STOP_STATE) {
900 // This is the normal exit from the lookup state machine.
901 // We have advanced through the string until it is certain that no
902 // longer match is possible, no matter what characters follow.
903 break;
904 }
905
906 // Advance to the next character.
907 // If this is a beginning-of-input loop iteration, don't advance
908 // the input position. The next iteration will be processing the
909 // first real input character.
910 if (mode == RBBI_RUN) {
911 c = UTEXT_NEXT32(&fText);
912 } else {
913 if (mode == RBBI_START) {
914 mode = RBBI_RUN;
915 }
916 }
917 }
918
919 // The state machine is done. Check whether it found a match...
920
921 // If the iterator failed to advance in the match engine, force it ahead by one.
922 // (This really indicates a defect in the break rules. They should always match
923 // at least one character.)
924 if (result == initialPosition) {
925 utext_setNativeIndex(&fText, initialPosition);
926 utext_next32(&fText);
927 result = (int32_t)utext_getNativeIndex(&fText);
928 fRuleStatusIndex = 0;
929 }
930
931 // Leave the iterator at our result position.
932 fPosition = result;
933 #ifdef RBBI_DEBUG
934 if (gTrace) {
935 RBBIDebugPrintf("result = %d\n\n", result);
936 }
937 #endif
938 return result;
939}
940
941
942//-----------------------------------------------------------------------------------
943//
944// handleSafePrevious()
945//
946// Iterate backwards using the safe reverse rules.
947// The logic of this function is similar to handleNext(), but simpler
948// because the safe table does not require as many options.
949//
950//-----------------------------------------------------------------------------------
951int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
952 int32_t state;
953 uint16_t category = 0;
954 RBBIStateTableRow *row;
955 UChar32 c;
956 int32_t result = 0;
957
958 const RBBIStateTable *stateTable = fData->fReverseTable;
959 UTEXT_SETNATIVEINDEX(&fText, fromPosition);
960 #ifdef RBBI_DEBUG
961 if (gTrace) {
962 RBBIDebugPuts("Handle Previous pos char state category");
963 }
964 #endif
965
966 // if we're already at the start of the text, return DONE.
967 if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
968 return BreakIterator::DONE;
969 }
970
971 // Set the initial state for the state machine
972 c = UTEXT_PREVIOUS32(&fText);
973 state = START_STATE;
974 row = (RBBIStateTableRow *)
975 (stateTable->fTableData + (stateTable->fRowLen * state));
976
977 // loop until we reach the start of the text or transition to state 0
978 //
979 for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
980
981 // look up the current character's character category, which tells us
982 // which column in the state table to look at.
983 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
984 // not the size of the character going in, which is a UChar32.
985 //
986 // And off the dictionary flag bit. For reverse iteration it is not used.
987 category = UTRIE2_GET16(fData->fTrie, c);
988 category &= ~0x4000;
989
990 #ifdef RBBI_DEBUG
991 if (gTrace) {
992 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
993 if (0x20<=c && c<0x7f) {
994 RBBIDebugPrintf("\"%c\" ", c);
995 } else {
996 RBBIDebugPrintf("%5x ", c);
997 }
998 RBBIDebugPrintf("%3d %3d\n", state, category);
999 }
1000 #endif
1001
1002 // State Transition - move machine to its next state
1003 //
1004 // fNextState is a variable-length array.
1005 U_ASSERT(category<fData->fHeader->fCatCount);
1006 state = row->fNextState[category]; /*Not accessing beyond memory*/
1007 row = (RBBIStateTableRow *)
1008 (stateTable->fTableData + (stateTable->fRowLen * state));
1009
1010 if (state == STOP_STATE) {
1011 // This is the normal exit from the lookup state machine.
1012 // Transistion to state zero means we have found a safe point.
1013 break;
1014 }
1015 }
1016
1017 // The state machine is done. Check whether it found a match...
1018 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
1019 #ifdef RBBI_DEBUG
1020 if (gTrace) {
1021 RBBIDebugPrintf("result = %d\n\n", result);
1022 }
1023 #endif
1024 return result;
1025}
1026
1027//-------------------------------------------------------------------------------
1028//
1029// getRuleStatus() Return the break rule tag associated with the current
1030// iterator position. If the iterator arrived at its current
1031// position by iterating forwards, the value will have been
1032// cached by the handleNext() function.
1033//
1034//-------------------------------------------------------------------------------
1035
1036int32_t RuleBasedBreakIterator::getRuleStatus() const {
1037
1038 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1039 // (the number of status values.)
1040 // This function returns the last (largest) of the array of status values.
1041 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
1042 int32_t tagVal = fData->fRuleStatusTable[idx];
1043
1044 return tagVal;
1045}
1046
1047
1048int32_t RuleBasedBreakIterator::getRuleStatusVec(
1049 int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
1050 if (U_FAILURE(status)) {
1051 return 0;
1052 }
1053
1054 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
1055 int32_t numValsToCopy = numVals;
1056 if (numVals > capacity) {
1057 status = U_BUFFER_OVERFLOW_ERROR;
1058 numValsToCopy = capacity;
1059 }
1060 int i;
1061 for (i=0; i<numValsToCopy; i++) {
1062 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
1063 }
1064 return numVals;
1065}
1066
1067
1068
1069//-------------------------------------------------------------------------------
1070//
1071// getBinaryRules Access to the compiled form of the rules,
1072// for use by build system tools that save the data
1073// for standard iterator types.
1074//
1075//-------------------------------------------------------------------------------
1076const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1077 const uint8_t *retPtr = NULL;
1078 length = 0;
1079
1080 if (fData != NULL) {
1081 retPtr = (const uint8_t *)fData->fHeader;
1082 length = fData->fHeader->fLength;
1083 }
1084 return retPtr;
1085}
1086
1087
1088RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone(
1089 void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) {
1090 if (U_FAILURE(status)){
1091 return NULL;
1092 }
1093
1094 if (bufferSize == 0) {
1095 bufferSize = 1; // preflighting for deprecated functionality
1096 return NULL;
1097 }
1098
1099 BreakIterator *clonedBI = clone();
1100 if (clonedBI == NULL) {
1101 status = U_MEMORY_ALLOCATION_ERROR;
1102 } else {
1103 status = U_SAFECLONE_ALLOCATED_WARNING;
1104 }
1105 return (RuleBasedBreakIterator *)clonedBI;
1106}
1107
1108U_NAMESPACE_END
1109
1110
1111static icu::UStack *gLanguageBreakFactories = nullptr;
1112static const icu::UnicodeString *gEmptyString = nullptr;
1113static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1114static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER;
1115
1116/**
1117 * Release all static memory held by breakiterator.
1118 */
1119U_CDECL_BEGIN
1120static UBool U_CALLCONV rbbi_cleanup(void) {
1121 delete gLanguageBreakFactories;
1122 gLanguageBreakFactories = nullptr;
1123 delete gEmptyString;
1124 gEmptyString = nullptr;
1125 gLanguageBreakFactoriesInitOnce.reset();
1126 gRBBIInitOnce.reset();
1127 return TRUE;
1128}
1129U_CDECL_END
1130
1131U_CDECL_BEGIN
1132static void U_CALLCONV _deleteFactory(void *obj) {
1133 delete (icu::LanguageBreakFactory *) obj;
1134}
1135U_CDECL_END
1136U_NAMESPACE_BEGIN
1137
1138static void U_CALLCONV rbbiInit() {
1139 gEmptyString = new UnicodeString();
1140 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1141}
1142
1143static void U_CALLCONV initLanguageFactories() {
1144 UErrorCode status = U_ZERO_ERROR;
1145 U_ASSERT(gLanguageBreakFactories == NULL);
1146 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1147 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1148 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1149 gLanguageBreakFactories->push(builtIn, status);
1150#ifdef U_LOCAL_SERVICE_HOOK
1151 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1152 if (extra != NULL) {
1153 gLanguageBreakFactories->push(extra, status);
1154 }
1155#endif
1156 }
1157 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1158}
1159
1160
1161static const LanguageBreakEngine*
1162getLanguageBreakEngineFromFactory(UChar32 c)
1163{
1164 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1165 if (gLanguageBreakFactories == NULL) {
1166 return NULL;
1167 }
1168
1169 int32_t i = gLanguageBreakFactories->size();
1170 const LanguageBreakEngine *lbe = NULL;
1171 while (--i >= 0) {
1172 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1173 lbe = factory->getEngineFor(c);
1174 if (lbe != NULL) {
1175 break;
1176 }
1177 }
1178 return lbe;
1179}
1180
1181
1182//-------------------------------------------------------------------------------
1183//
1184// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1185// the character c.
1186//
1187//-------------------------------------------------------------------------------
1188const LanguageBreakEngine *
1189RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1190 const LanguageBreakEngine *lbe = NULL;
1191 UErrorCode status = U_ZERO_ERROR;
1192
1193 if (fLanguageBreakEngines == NULL) {
1194 fLanguageBreakEngines = new UStack(status);
1195 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1196 delete fLanguageBreakEngines;
1197 fLanguageBreakEngines = 0;
1198 return NULL;
1199 }
1200 }
1201
1202 int32_t i = fLanguageBreakEngines->size();
1203 while (--i >= 0) {
1204 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1205 if (lbe->handles(c)) {
1206 return lbe;
1207 }
1208 }
1209
1210 // No existing dictionary took the character. See if a factory wants to
1211 // give us a new LanguageBreakEngine for this character.
1212 lbe = getLanguageBreakEngineFromFactory(c);
1213
1214 // If we got one, use it and push it on our stack.
1215 if (lbe != NULL) {
1216 fLanguageBreakEngines->push((void *)lbe, status);
1217 // Even if we can't remember it, we can keep looking it up, so
1218 // return it even if the push fails.
1219 return lbe;
1220 }
1221
1222 // No engine is forthcoming for this character. Add it to the
1223 // reject set. Create the reject break engine if needed.
1224 if (fUnhandledBreakEngine == NULL) {
1225 fUnhandledBreakEngine = new UnhandledEngine(status);
1226 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1227 status = U_MEMORY_ALLOCATION_ERROR;
1228 return nullptr;
1229 }
1230 // Put it last so that scripts for which we have an engine get tried
1231 // first.
1232 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1233 // If we can't insert it, or creation failed, get rid of it
1234 if (U_FAILURE(status)) {
1235 delete fUnhandledBreakEngine;
1236 fUnhandledBreakEngine = 0;
1237 return NULL;
1238 }
1239 }
1240
1241 // Tell the reject engine about the character; at its discretion, it may
1242 // add more than just the one character.
1243 fUnhandledBreakEngine->handleCharacter(c);
1244
1245 return fUnhandledBreakEngine;
1246}
1247
1248void RuleBasedBreakIterator::dumpCache() {
1249 fBreakCache->dumpCache();
1250}
1251
1252void RuleBasedBreakIterator::dumpTables() {
1253 fData->printData();
1254}
1255
1256/**
1257 * Returns the description used to create this iterator
1258 */
1259
1260const UnicodeString&
1261RuleBasedBreakIterator::getRules() const {
1262 if (fData != NULL) {
1263 return fData->getRuleSourceString();
1264 } else {
1265 umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1266 return *gEmptyString;
1267 }
1268}
1269
1270U_NAMESPACE_END
1271
1272#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1273