1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016 International Business Machines Corporation
6* and others. All rights reserved.
7***************************************************************************
8*/
9//
10// file: rbbi.cpp Contains the implementation of the rule based break iterator
11// runtime engine and the API implementation for
12// class RuleBasedBreakIterator
13//
14
15#include "utypeinfo.h" // for 'typeid' to work
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include <cinttypes>
22
23#include "unicode/rbbi.h"
24#include "unicode/schriter.h"
25#include "unicode/uchriter.h"
26#include "unicode/uclean.h"
27#include "unicode/udata.h"
28
29#include "brkeng.h"
30#include "ucln_cmn.h"
31#include "cmemory.h"
32#include "cstring.h"
33#include "localsvc.h"
34#include "rbbidata.h"
35#include "rbbi_cache.h"
36#include "rbbirb.h"
37#include "uassert.h"
38#include "umutex.h"
39#include "uvectr32.h"
40
41#ifdef RBBI_DEBUG
42static UBool gTrace = false;
43#endif
44
45U_NAMESPACE_BEGIN
46
47// The state number of the starting state
48constexpr int32_t START_STATE = 1;
49
50// The state-transition value indicating "stop"
51constexpr int32_t STOP_STATE = 0;
52
53
54UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
55
56
57//=======================================================================
58// constructors
59//=======================================================================
60
61/**
62 * Constructs a RuleBasedBreakIterator that uses the already-created
63 * tables object that is passed in as a parameter.
64 */
65RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
66 : RuleBasedBreakIterator(&status)
67{
68 fData = new RBBIDataWrapper(data, status); // status checked in constructor
69 if (U_FAILURE(status)) {return;}
70 if(fData == nullptr) {
71 status = U_MEMORY_ALLOCATION_ERROR;
72 return;
73 }
74 if (fData->fForwardTable->fLookAheadResultsSize > 0) {
75 fLookAheadMatches = static_cast<int32_t *>(
76 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
77 if (fLookAheadMatches == nullptr) {
78 status = U_MEMORY_ALLOCATION_ERROR;
79 return;
80 }
81 }
82}
83
84//-------------------------------------------------------------------------------
85//
86// Constructor from a UDataMemory handle to precompiled break rules
87// stored in an ICU data file. This construcotr is private API,
88// only for internal use.
89//
90//-------------------------------------------------------------------------------
91RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
92 UErrorCode &status) : RuleBasedBreakIterator(udm, status)
93{
94 fIsPhraseBreaking = isPhraseBreaking;
95}
96
97//
98// Construct from precompiled binary rules (tables). This constructor is public API,
99// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
100//
101RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
102 uint32_t ruleLength,
103 UErrorCode &status)
104 : RuleBasedBreakIterator(&status)
105{
106 if (U_FAILURE(status)) {
107 return;
108 }
109 if (compiledRules == nullptr || ruleLength < sizeof(RBBIDataHeader)) {
110 status = U_ILLEGAL_ARGUMENT_ERROR;
111 return;
112 }
113 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
114 if (data->fLength > ruleLength) {
115 status = U_ILLEGAL_ARGUMENT_ERROR;
116 return;
117 }
118 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
119 if (U_FAILURE(status)) {return;}
120 if(fData == nullptr) {
121 status = U_MEMORY_ALLOCATION_ERROR;
122 return;
123 }
124 if (fData->fForwardTable->fLookAheadResultsSize > 0) {
125 fLookAheadMatches = static_cast<int32_t *>(
126 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
127 if (fLookAheadMatches == nullptr) {
128 status = U_MEMORY_ALLOCATION_ERROR;
129 return;
130 }
131 }
132}
133
134
135//-------------------------------------------------------------------------------
136//
137// Constructor from a UDataMemory handle to precompiled break rules
138// stored in an ICU data file.
139//
140//-------------------------------------------------------------------------------
141RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
142 : RuleBasedBreakIterator(&status)
143{
144 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
145 if (U_FAILURE(status)) {return;}
146 if(fData == nullptr) {
147 status = U_MEMORY_ALLOCATION_ERROR;
148 return;
149 }
150 if (fData->fForwardTable->fLookAheadResultsSize > 0) {
151 fLookAheadMatches = static_cast<int32_t *>(
152 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
153 if (fLookAheadMatches == nullptr) {
154 status = U_MEMORY_ALLOCATION_ERROR;
155 return;
156 }
157 }
158}
159
160
161
162//-------------------------------------------------------------------------------
163//
164// Constructor from a set of rules supplied as a string.
165//
166//-------------------------------------------------------------------------------
167RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
168 UParseError &parseError,
169 UErrorCode &status)
170 : RuleBasedBreakIterator(&status)
171{
172 if (U_FAILURE(status)) {return;}
173 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
174 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
175 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
176 // creates and returns a complete RBBI. From here, in a constructor, we
177 // can't just return the object created by the builder factory, hence
178 // the assignment of the factory created object to "this".
179 if (U_SUCCESS(status)) {
180 *this = *bi;
181 delete bi;
182 }
183}
184
185
186//-------------------------------------------------------------------------------
187//
188// Default Constructor. Create an empty shell that can be set up later.
189// Used when creating a RuleBasedBreakIterator from a set
190// of rules.
191//-------------------------------------------------------------------------------
192RuleBasedBreakIterator::RuleBasedBreakIterator()
193 : RuleBasedBreakIterator(nullptr)
194{
195}
196
197/**
198 * Simple Constructor with an error code.
199 * Handles common initialization for all other constructors.
200 */
201RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
202 UErrorCode ec = U_ZERO_ERROR;
203 if (status == nullptr) {
204 status = &ec;
205 }
206 utext_openUChars(&fText, nullptr, 0, status);
207 LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache(this, *status), *status);
208 LocalPointer<BreakCache> lpBreakCache(new BreakCache(this, *status), *status);
209 if (U_FAILURE(*status)) {
210 fErrorCode = *status;
211 return;
212 }
213 fDictionaryCache = lpDictionaryCache.orphan();
214 fBreakCache = lpBreakCache.orphan();
215
216#ifdef RBBI_DEBUG
217 static UBool debugInitDone = false;
218 if (debugInitDone == false) {
219 char *debugEnv = getenv("U_RBBIDEBUG");
220 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
221 gTrace = true;
222 }
223 debugInitDone = true;
224 }
225#endif
226}
227
228
229//-------------------------------------------------------------------------------
230//
231// Copy constructor. Will produce a break iterator with the same behavior,
232// and which iterates over the same text, as the one passed in.
233//
234//-------------------------------------------------------------------------------
235RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
236: RuleBasedBreakIterator()
237{
238 *this = other;
239}
240
241
242/**
243 * Destructor
244 */
245RuleBasedBreakIterator::~RuleBasedBreakIterator() {
246 if (fCharIter != &fSCharIter) {
247 // fCharIter was adopted from the outside.
248 delete fCharIter;
249 }
250 fCharIter = nullptr;
251
252 utext_close(&fText);
253
254 if (fData != nullptr) {
255 fData->removeReference();
256 fData = nullptr;
257 }
258 delete fBreakCache;
259 fBreakCache = nullptr;
260
261 delete fDictionaryCache;
262 fDictionaryCache = nullptr;
263
264 delete fLanguageBreakEngines;
265 fLanguageBreakEngines = nullptr;
266
267 delete fUnhandledBreakEngine;
268 fUnhandledBreakEngine = nullptr;
269
270 uprv_free(fLookAheadMatches);
271 fLookAheadMatches = nullptr;
272}
273
274/**
275 * Assignment operator. Sets this iterator to have the same behavior,
276 * and iterate over the same text, as the one passed in.
277 * TODO: needs better handling of memory allocation errors.
278 */
279RuleBasedBreakIterator&
280RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
281 if (this == &that) {
282 return *this;
283 }
284 BreakIterator::operator=(that);
285
286 if (fLanguageBreakEngines != nullptr) {
287 delete fLanguageBreakEngines;
288 fLanguageBreakEngines = nullptr; // Just rebuild for now
289 }
290 // TODO: clone fLanguageBreakEngines from "that"
291 UErrorCode status = U_ZERO_ERROR;
292 utext_clone(&fText, &that.fText, false, true, &status);
293
294 if (fCharIter != &fSCharIter) {
295 delete fCharIter;
296 }
297 fCharIter = &fSCharIter;
298
299 if (that.fCharIter != nullptr && that.fCharIter != &that.fSCharIter) {
300 // This is a little bit tricky - it will initially appear that
301 // this->fCharIter is adopted, even if that->fCharIter was
302 // not adopted. That's ok.
303 fCharIter = that.fCharIter->clone();
304 }
305 fSCharIter = that.fSCharIter;
306 if (fCharIter == nullptr) {
307 fCharIter = &fSCharIter;
308 }
309
310 if (fData != nullptr) {
311 fData->removeReference();
312 fData = nullptr;
313 }
314 if (that.fData != nullptr) {
315 fData = that.fData->addReference();
316 }
317
318 uprv_free(fLookAheadMatches);
319 fLookAheadMatches = nullptr;
320 if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) {
321 fLookAheadMatches = static_cast<int32_t *>(
322 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
323 }
324
325
326 fPosition = that.fPosition;
327 fRuleStatusIndex = that.fRuleStatusIndex;
328 fDone = that.fDone;
329
330 // TODO: both the dictionary and the main cache need to be copied.
331 // Current position could be within a dictionary range. Trying to continue
332 // the iteration without the caches present would go to the rules, with
333 // the assumption that the current position is on a rule boundary.
334 fBreakCache->reset(fPosition, fRuleStatusIndex);
335 fDictionaryCache->reset();
336
337 return *this;
338}
339
340//-----------------------------------------------------------------------------
341//
342// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
343// behavior, and iterating over the same text, as this one.
344// Virtual function: does the right thing with subclasses.
345//
346//-----------------------------------------------------------------------------
347RuleBasedBreakIterator*
348RuleBasedBreakIterator::clone() const {
349 return new RuleBasedBreakIterator(*this);
350}
351
352/**
353 * Equality operator. Returns true if both BreakIterators are of the
354 * same class, have the same behavior, and iterate over the same text.
355 */
356bool
357RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
358 if (typeid(*this) != typeid(that)) {
359 return false;
360 }
361 if (this == &that) {
362 return true;
363 }
364
365 // The base class BreakIterator carries no state that participates in equality,
366 // and does not implement an equality function that would otherwise be
367 // checked at this point.
368
369 const RuleBasedBreakIterator& that2 = static_cast<const RuleBasedBreakIterator&>(that);
370
371 if (!utext_equals(&fText, &that2.fText)) {
372 // The two break iterators are operating on different text,
373 // or have a different iteration position.
374 // Note that fText's position is always the same as the break iterator's position.
375 return false;
376 }
377
378 if (!(fPosition == that2.fPosition &&
379 fRuleStatusIndex == that2.fRuleStatusIndex &&
380 fDone == that2.fDone)) {
381 return false;
382 }
383
384 if (that2.fData == fData ||
385 (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
386 // The two break iterators are using the same rules.
387 return true;
388 }
389 return false;
390}
391
392/**
393 * Compute a hash code for this BreakIterator
394 * @return A hash code
395 */
396int32_t
397RuleBasedBreakIterator::hashCode() const {
398 int32_t hash = 0;
399 if (fData != nullptr) {
400 hash = fData->hashCode();
401 }
402 return hash;
403}
404
405
406void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
407 if (U_FAILURE(status)) {
408 return;
409 }
410 fBreakCache->reset();
411 fDictionaryCache->reset();
412 utext_clone(&fText, ut, false, true, &status);
413
414 // Set up a dummy CharacterIterator to be returned if anyone
415 // calls getText(). With input from UText, there is no reasonable
416 // way to return a characterIterator over the actual input text.
417 // Return one over an empty string instead - this is the closest
418 // we can come to signaling a failure.
419 // (GetText() is obsolete, this failure is sort of OK)
420 fSCharIter.setText(u"", 0);
421
422 if (fCharIter != &fSCharIter) {
423 // existing fCharIter was adopted from the outside. Delete it now.
424 delete fCharIter;
425 }
426 fCharIter = &fSCharIter;
427
428 this->first();
429}
430
431
432UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
433 UText *result = utext_clone(fillIn, &fText, false, true, &status);
434 return result;
435}
436
437
438//=======================================================================
439// BreakIterator overrides
440//=======================================================================
441
442/**
443 * Return a CharacterIterator over the text being analyzed.
444 */
445CharacterIterator&
446RuleBasedBreakIterator::getText() const {
447 return *fCharIter;
448}
449
450/**
451 * Set the iterator to analyze a new piece of text. This function resets
452 * the current iteration position to the beginning of the text.
453 * @param newText An iterator over the text to analyze.
454 */
455void
456RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
457 // If we are holding a CharacterIterator adopted from a
458 // previous call to this function, delete it now.
459 if (fCharIter != &fSCharIter) {
460 delete fCharIter;
461 }
462
463 fCharIter = newText;
464 UErrorCode status = U_ZERO_ERROR;
465 fBreakCache->reset();
466 fDictionaryCache->reset();
467 if (newText==nullptr || newText->startIndex() != 0) {
468 // startIndex !=0 wants to be an error, but there's no way to report it.
469 // Make the iterator text be an empty string.
470 utext_openUChars(&fText, nullptr, 0, &status);
471 } else {
472 utext_openCharacterIterator(&fText, newText, &status);
473 }
474 this->first();
475}
476
477/**
478 * Set the iterator to analyze a new piece of text. This function resets
479 * the current iteration position to the beginning of the text.
480 * @param newText An iterator over the text to analyze.
481 */
482void
483RuleBasedBreakIterator::setText(const UnicodeString& newText) {
484 UErrorCode status = U_ZERO_ERROR;
485 fBreakCache->reset();
486 fDictionaryCache->reset();
487 utext_openConstUnicodeString(&fText, &newText, &status);
488
489 // Set up a character iterator on the string.
490 // Needed in case someone calls getText().
491 // Can not, unfortunately, do this lazily on the (probably never)
492 // call to getText(), because getText is const.
493 fSCharIter.setText(newText.getBuffer(), newText.length());
494
495 if (fCharIter != &fSCharIter) {
496 // old fCharIter was adopted from the outside. Delete it.
497 delete fCharIter;
498 }
499 fCharIter = &fSCharIter;
500
501 this->first();
502}
503
504
505/**
506 * Provide a new UText for the input text. Must reference text with contents identical
507 * to the original.
508 * Intended for use with text data originating in Java (garbage collected) environments
509 * where the data may be moved in memory at arbitrary times.
510 */
511RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
512 if (U_FAILURE(status)) {
513 return *this;
514 }
515 if (input == nullptr) {
516 status = U_ILLEGAL_ARGUMENT_ERROR;
517 return *this;
518 }
519 int64_t pos = utext_getNativeIndex(&fText);
520 // Shallow read-only clone of the new UText into the existing input UText
521 utext_clone(&fText, input, false, true, &status);
522 if (U_FAILURE(status)) {
523 return *this;
524 }
525 utext_setNativeIndex(&fText, pos);
526 if (utext_getNativeIndex(&fText) != pos) {
527 // Sanity check. The new input utext is supposed to have the exact same
528 // contents as the old. If we can't set to the same position, it doesn't.
529 // The contents underlying the old utext might be invalid at this point,
530 // so it's not safe to check directly.
531 status = U_ILLEGAL_ARGUMENT_ERROR;
532 }
533 return *this;
534}
535
536
537/**
538 * Sets the current iteration position to the beginning of the text, position zero.
539 * @return The new iterator position, which is zero.
540 */
541int32_t RuleBasedBreakIterator::first() {
542 UErrorCode status = U_ZERO_ERROR;
543 if (!fBreakCache->seek(0)) {
544 fBreakCache->populateNear(0, status);
545 }
546 fBreakCache->current();
547 U_ASSERT(fPosition == 0);
548 return 0;
549}
550
551/**
552 * Sets the current iteration position to the end of the text.
553 * @return The text's past-the-end offset.
554 */
555int32_t RuleBasedBreakIterator::last() {
556 int32_t endPos = (int32_t)utext_nativeLength(&fText);
557 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
558 (void)endShouldBeBoundary;
559 U_ASSERT(endShouldBeBoundary);
560 U_ASSERT(fPosition == endPos);
561 return endPos;
562}
563
564/**
565 * Advances the iterator either forward or backward the specified number of steps.
566 * Negative values move backward, and positive values move forward. This is
567 * equivalent to repeatedly calling next() or previous().
568 * @param n The number of steps to move. The sign indicates the direction
569 * (negative is backwards, and positive is forwards).
570 * @return The character offset of the boundary position n boundaries away from
571 * the current one.
572 */
573int32_t RuleBasedBreakIterator::next(int32_t n) {
574 int32_t result = 0;
575 if (n > 0) {
576 for (; n > 0 && result != UBRK_DONE; --n) {
577 result = next();
578 }
579 } else if (n < 0) {
580 for (; n < 0 && result != UBRK_DONE; ++n) {
581 result = previous();
582 }
583 } else {
584 result = current();
585 }
586 return result;
587}
588
589/**
590 * Advances the iterator to the next boundary position.
591 * @return The position of the first boundary after this one.
592 */
593int32_t RuleBasedBreakIterator::next() {
594 fBreakCache->next();
595 return fDone ? UBRK_DONE : fPosition;
596}
597
598/**
599 * Move the iterator backwards, to the boundary preceding the current one.
600 *
601 * Starts from the current position within fText.
602 * Starting position need not be on a boundary.
603 *
604 * @return The position of the boundary position immediately preceding the starting position.
605 */
606int32_t RuleBasedBreakIterator::previous() {
607 UErrorCode status = U_ZERO_ERROR;
608 fBreakCache->previous(status);
609 return fDone ? UBRK_DONE : fPosition;
610}
611
612/**
613 * Sets the iterator to refer to the first boundary position following
614 * the specified position.
615 * @param startPos The position from which to begin searching for a break position.
616 * @return The position of the first break after the current position.
617 */
618int32_t RuleBasedBreakIterator::following(int32_t startPos) {
619 // if the supplied position is before the beginning, return the
620 // text's starting offset
621 if (startPos < 0) {
622 return first();
623 }
624
625 // Move requested offset to a code point start. It might be on a trail surrogate,
626 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
627 utext_setNativeIndex(&fText, startPos);
628 startPos = (int32_t)utext_getNativeIndex(&fText);
629
630 UErrorCode status = U_ZERO_ERROR;
631 fBreakCache->following(startPos, status);
632 return fDone ? UBRK_DONE : fPosition;
633}
634
635/**
636 * Sets the iterator to refer to the last boundary position before the
637 * specified position.
638 * @param offset The position to begin searching for a break from.
639 * @return The position of the last boundary before the starting position.
640 */
641int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
642 if (offset > utext_nativeLength(&fText)) {
643 return last();
644 }
645
646 // Move requested offset to a code point start. It might be on a trail surrogate,
647 // or on a trail byte if the input is UTF-8.
648
649 utext_setNativeIndex(&fText, offset);
650 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
651
652 UErrorCode status = U_ZERO_ERROR;
653 fBreakCache->preceding(adjustedOffset, status);
654 return fDone ? UBRK_DONE : fPosition;
655}
656
657/**
658 * Returns true if the specified position is a boundary position. As a side
659 * effect, leaves the iterator pointing to the first boundary position at
660 * or after "offset".
661 *
662 * @param offset the offset to check.
663 * @return True if "offset" is a boundary position.
664 */
665UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
666 // out-of-range indexes are never boundary positions
667 if (offset < 0) {
668 first(); // For side effects on current position, tag values.
669 return false;
670 }
671
672 // Adjust offset to be on a code point boundary and not beyond the end of the text.
673 // Note that isBoundary() is always false for offsets that are not on code point boundaries.
674 // But we still need the side effect of leaving iteration at the following boundary.
675
676 utext_setNativeIndex(&fText, offset);
677 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
678
679 bool result = false;
680 UErrorCode status = U_ZERO_ERROR;
681 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
682 result = (fBreakCache->current() == offset);
683 }
684
685 if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
686 // Original offset is beyond the end of the text. Return false, it's not a boundary,
687 // but the iteration position remains set to the end of the text, which is a boundary.
688 return false;
689 }
690 if (!result) {
691 // Not on a boundary. isBoundary() must leave iterator on the following boundary.
692 // Cache->seek(), above, left us on the preceding boundary, so advance one.
693 next();
694 }
695 return result;
696}
697
698
699/**
700 * Returns the current iteration position.
701 * @return The current iteration position.
702 */
703int32_t RuleBasedBreakIterator::current() const {
704 return fPosition;
705}
706
707
708//=======================================================================
709// implementation
710//=======================================================================
711
712//
713// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
714// of user text. A variable with this enum type keeps track of where we
715// are. The state machine only fetches user input while in the RUN mode.
716//
717enum RBBIRunMode {
718 RBBI_START, // state machine processing is before first char of input
719 RBBI_RUN, // state machine processing is in the user text
720 RBBI_END // state machine processing is after end of user text.
721};
722
723
724// Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
725// instantiation, based on whether an 8 or 16 bit table is required.
726//
727// These Trie access functions will be inlined within the handleNext()/Previous() instantions.
728static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) {
729 return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c);
730}
731
732static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) {
733 return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c);
734}
735
736int32_t RuleBasedBreakIterator::handleNext() {
737 const RBBIStateTable *statetable = fData->fForwardTable;
738 bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
739 if (statetable->fFlags & RBBI_8BITS_ROWS) {
740 if (use8BitsTrie) {
741 return handleNext<RBBIStateTableRow8, TrieFunc8>();
742 } else {
743 return handleNext<RBBIStateTableRow8, TrieFunc16>();
744 }
745 } else {
746 if (use8BitsTrie) {
747 return handleNext<RBBIStateTableRow16, TrieFunc8>();
748 } else {
749 return handleNext<RBBIStateTableRow16, TrieFunc16>();
750 }
751 }
752}
753
754int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
755 const RBBIStateTable *statetable = fData->fReverseTable;
756 bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
757 if (statetable->fFlags & RBBI_8BITS_ROWS) {
758 if (use8BitsTrie) {
759 return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
760 } else {
761 return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
762 }
763 } else {
764 if (use8BitsTrie) {
765 return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
766 } else {
767 return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
768 }
769 }
770}
771
772
773//-----------------------------------------------------------------------------------
774//
775// handleNext()
776// Run the state machine to find a boundary
777//
778//-----------------------------------------------------------------------------------
779template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
780int32_t RuleBasedBreakIterator::handleNext() {
781 int32_t state;
782 uint16_t category = 0;
783 RBBIRunMode mode;
784
785 RowType *row;
786 UChar32 c;
787 int32_t result = 0;
788 int32_t initialPosition = 0;
789 const RBBIStateTable *statetable = fData->fForwardTable;
790 const char *tableData = statetable->fTableData;
791 uint32_t tableRowLen = statetable->fRowLen;
792 uint32_t dictStart = statetable->fDictCategoriesStart;
793 #ifdef RBBI_DEBUG
794 if (gTrace) {
795 RBBIDebugPuts("Handle Next pos char state category");
796 }
797 #endif
798
799 // handleNext always sets the break tag value.
800 // Set the default for it.
801 fRuleStatusIndex = 0;
802
803 fDictionaryCharCount = 0;
804
805 // if we're already at the end of the text, return DONE.
806 initialPosition = fPosition;
807 UTEXT_SETNATIVEINDEX(&fText, initialPosition);
808 result = initialPosition;
809 c = UTEXT_NEXT32(&fText);
810 if (c==U_SENTINEL) {
811 fDone = true;
812 return UBRK_DONE;
813 }
814
815 // Set the initial state for the state machine
816 state = START_STATE;
817 row = (RowType *)
818 //(statetable->fTableData + (statetable->fRowLen * state));
819 (tableData + tableRowLen * state);
820
821
822 mode = RBBI_RUN;
823 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
824 category = 2;
825 mode = RBBI_START;
826 }
827
828
829 // loop until we reach the end of the text or transition to state 0
830 //
831 for (;;) {
832 if (c == U_SENTINEL) {
833 // Reached end of input string.
834 if (mode == RBBI_END) {
835 // We have already run the loop one last time with the
836 // character set to the psueudo {eof} value. Now it is time
837 // to unconditionally bail out.
838 break;
839 }
840 // Run the loop one last time with the fake end-of-input character category.
841 mode = RBBI_END;
842 category = 1;
843 }
844
845 //
846 // Get the char category. An incoming category of 1 or 2 means that
847 // we are preset for doing the beginning or end of input, and
848 // that we shouldn't get a category from an actual text input character.
849 //
850 if (mode == RBBI_RUN) {
851 // look up the current character's character category, which tells us
852 // which column in the state table to look at.
853 category = trieFunc(fData->fTrie, c);
854 fDictionaryCharCount += (category >= dictStart);
855 }
856
857 #ifdef RBBI_DEBUG
858 if (gTrace) {
859 RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText));
860 if (0x20<=c && c<0x7f) {
861 RBBIDebugPrintf("\"%c\" ", c);
862 } else {
863 RBBIDebugPrintf("%5x ", c);
864 }
865 RBBIDebugPrintf("%3d %3d\n", state, category);
866 }
867 #endif
868
869 // State Transition - move machine to its next state
870 //
871
872 // fNextState is a variable-length array.
873 U_ASSERT(category<fData->fHeader->fCatCount);
874 state = row->fNextState[category]; /*Not accessing beyond memory*/
875 row = (RowType *)
876 // (statetable->fTableData + (statetable->fRowLen * state));
877 (tableData + tableRowLen * state);
878
879
880 uint16_t accepting = row->fAccepting;
881 if (accepting == ACCEPTING_UNCONDITIONAL) {
882 // Match found, common case.
883 if (mode != RBBI_START) {
884 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
885 }
886 fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values.
887 } else if (accepting > ACCEPTING_UNCONDITIONAL) {
888 // Lookahead match is completed.
889 U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize);
890 int32_t lookaheadResult = fLookAheadMatches[accepting];
891 if (lookaheadResult >= 0) {
892 fRuleStatusIndex = row->fTagsIdx;
893 fPosition = lookaheadResult;
894 return lookaheadResult;
895 }
896 }
897
898 // If we are at the position of the '/' in a look-ahead (hard break) rule;
899 // record the current position, to be returned later, if the full rule matches.
900 // TODO: Move this check before the previous check of fAccepting.
901 // This would enable hard-break rules with no following context.
902 // But there are line break test failures when trying this. Investigate.
903 // Issue ICU-20837
904 uint16_t rule = row->fLookAhead;
905 U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL);
906 U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize);
907 if (rule > ACCEPTING_UNCONDITIONAL) {
908 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
909 fLookAheadMatches[rule] = pos;
910 }
911
912 if (state == STOP_STATE) {
913 // This is the normal exit from the lookup state machine.
914 // We have advanced through the string until it is certain that no
915 // longer match is possible, no matter what characters follow.
916 break;
917 }
918
919 // Advance to the next character.
920 // If this is a beginning-of-input loop iteration, don't advance
921 // the input position. The next iteration will be processing the
922 // first real input character.
923 if (mode == RBBI_RUN) {
924 c = UTEXT_NEXT32(&fText);
925 } else {
926 if (mode == RBBI_START) {
927 mode = RBBI_RUN;
928 }
929 }
930 }
931
932 // The state machine is done. Check whether it found a match...
933
934 // If the iterator failed to advance in the match engine, force it ahead by one.
935 // (This really indicates a defect in the break rules. They should always match
936 // at least one character.)
937 if (result == initialPosition) {
938 utext_setNativeIndex(&fText, initialPosition);
939 utext_next32(&fText);
940 result = (int32_t)utext_getNativeIndex(&fText);
941 fRuleStatusIndex = 0;
942 }
943
944 // Leave the iterator at our result position.
945 fPosition = result;
946 #ifdef RBBI_DEBUG
947 if (gTrace) {
948 RBBIDebugPrintf("result = %d\n\n", result);
949 }
950 #endif
951 return result;
952}
953
954
955//-----------------------------------------------------------------------------------
956//
957// handleSafePrevious()
958//
959// Iterate backwards using the safe reverse rules.
960// The logic of this function is similar to handleNext(), but simpler
961// because the safe table does not require as many options.
962//
963//-----------------------------------------------------------------------------------
964template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
965int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
966
967 int32_t state;
968 uint16_t category = 0;
969 RowType *row;
970 UChar32 c;
971 int32_t result = 0;
972
973 const RBBIStateTable *stateTable = fData->fReverseTable;
974 UTEXT_SETNATIVEINDEX(&fText, fromPosition);
975 #ifdef RBBI_DEBUG
976 if (gTrace) {
977 RBBIDebugPuts("Handle Previous pos char state category");
978 }
979 #endif
980
981 // if we're already at the start of the text, return DONE.
982 if (fData == nullptr || UTEXT_GETNATIVEINDEX(&fText)==0) {
983 return BreakIterator::DONE;
984 }
985
986 // Set the initial state for the state machine
987 c = UTEXT_PREVIOUS32(&fText);
988 state = START_STATE;
989 row = (RowType *)
990 (stateTable->fTableData + (stateTable->fRowLen * state));
991
992 // loop until we reach the start of the text or transition to state 0
993 //
994 for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
995
996 // look up the current character's character category, which tells us
997 // which column in the state table to look at.
998 //
999 // Off the dictionary flag bit. For reverse iteration it is not used.
1000 category = trieFunc(fData->fTrie, c);
1001
1002 #ifdef RBBI_DEBUG
1003 if (gTrace) {
1004 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
1005 if (0x20<=c && c<0x7f) {
1006 RBBIDebugPrintf("\"%c\" ", c);
1007 } else {
1008 RBBIDebugPrintf("%5x ", c);
1009 }
1010 RBBIDebugPrintf("%3d %3d\n", state, category);
1011 }
1012 #endif
1013
1014 // State Transition - move machine to its next state
1015 //
1016 // fNextState is a variable-length array.
1017 U_ASSERT(category<fData->fHeader->fCatCount);
1018 state = row->fNextState[category]; /*Not accessing beyond memory*/
1019 row = (RowType *)
1020 (stateTable->fTableData + (stateTable->fRowLen * state));
1021
1022 if (state == STOP_STATE) {
1023 // This is the normal exit from the lookup state machine.
1024 // Transition to state zero means we have found a safe point.
1025 break;
1026 }
1027 }
1028
1029 // The state machine is done. Check whether it found a match...
1030 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
1031 #ifdef RBBI_DEBUG
1032 if (gTrace) {
1033 RBBIDebugPrintf("result = %d\n\n", result);
1034 }
1035 #endif
1036 return result;
1037}
1038
1039
1040//-------------------------------------------------------------------------------
1041//
1042// getRuleStatus() Return the break rule tag associated with the current
1043// iterator position. If the iterator arrived at its current
1044// position by iterating forwards, the value will have been
1045// cached by the handleNext() function.
1046//
1047//-------------------------------------------------------------------------------
1048
1049int32_t RuleBasedBreakIterator::getRuleStatus() const {
1050
1051 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1052 // (the number of status values.)
1053 // This function returns the last (largest) of the array of status values.
1054 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
1055 int32_t tagVal = fData->fRuleStatusTable[idx];
1056
1057 return tagVal;
1058}
1059
1060
1061int32_t RuleBasedBreakIterator::getRuleStatusVec(
1062 int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
1063 if (U_FAILURE(status)) {
1064 return 0;
1065 }
1066
1067 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
1068 int32_t numValsToCopy = numVals;
1069 if (numVals > capacity) {
1070 status = U_BUFFER_OVERFLOW_ERROR;
1071 numValsToCopy = capacity;
1072 }
1073 int i;
1074 for (i=0; i<numValsToCopy; i++) {
1075 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
1076 }
1077 return numVals;
1078}
1079
1080
1081
1082//-------------------------------------------------------------------------------
1083//
1084// getBinaryRules Access to the compiled form of the rules,
1085// for use by build system tools that save the data
1086// for standard iterator types.
1087//
1088//-------------------------------------------------------------------------------
1089const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1090 const uint8_t *retPtr = nullptr;
1091 length = 0;
1092
1093 if (fData != nullptr) {
1094 retPtr = (const uint8_t *)fData->fHeader;
1095 length = fData->fHeader->fLength;
1096 }
1097 return retPtr;
1098}
1099
1100
1101RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone(
1102 void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) {
1103 if (U_FAILURE(status)){
1104 return nullptr;
1105 }
1106
1107 if (bufferSize == 0) {
1108 bufferSize = 1; // preflighting for deprecated functionality
1109 return nullptr;
1110 }
1111
1112 BreakIterator *clonedBI = clone();
1113 if (clonedBI == nullptr) {
1114 status = U_MEMORY_ALLOCATION_ERROR;
1115 } else {
1116 status = U_SAFECLONE_ALLOCATED_WARNING;
1117 }
1118 return (RuleBasedBreakIterator *)clonedBI;
1119}
1120
1121U_NAMESPACE_END
1122
1123
1124static icu::UStack *gLanguageBreakFactories = nullptr;
1125static const icu::UnicodeString *gEmptyString = nullptr;
1126static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
1127static icu::UInitOnce gRBBIInitOnce {};
1128
1129/**
1130 * Release all static memory held by breakiterator.
1131 */
1132U_CDECL_BEGIN
1133UBool U_CALLCONV rbbi_cleanup() {
1134 delete gLanguageBreakFactories;
1135 gLanguageBreakFactories = nullptr;
1136 delete gEmptyString;
1137 gEmptyString = nullptr;
1138 gLanguageBreakFactoriesInitOnce.reset();
1139 gRBBIInitOnce.reset();
1140 return true;
1141}
1142U_CDECL_END
1143
1144U_CDECL_BEGIN
1145static void U_CALLCONV _deleteFactory(void *obj) {
1146 delete (icu::LanguageBreakFactory *) obj;
1147}
1148U_CDECL_END
1149U_NAMESPACE_BEGIN
1150
1151static void U_CALLCONV rbbiInit() {
1152 gEmptyString = new UnicodeString();
1153 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1154}
1155
1156static void U_CALLCONV initLanguageFactories() {
1157 UErrorCode status = U_ZERO_ERROR;
1158 U_ASSERT(gLanguageBreakFactories == nullptr);
1159 gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
1160 if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
1161 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1162 gLanguageBreakFactories->push(builtIn, status);
1163#ifdef U_LOCAL_SERVICE_HOOK
1164 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1165 if (extra != nullptr) {
1166 gLanguageBreakFactories->push(extra, status);
1167 }
1168#endif
1169 }
1170 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1171}
1172
1173
1174static const LanguageBreakEngine*
1175getLanguageBreakEngineFromFactory(UChar32 c)
1176{
1177 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1178 if (gLanguageBreakFactories == nullptr) {
1179 return nullptr;
1180 }
1181
1182 int32_t i = gLanguageBreakFactories->size();
1183 const LanguageBreakEngine *lbe = nullptr;
1184 while (--i >= 0) {
1185 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1186 lbe = factory->getEngineFor(c);
1187 if (lbe != nullptr) {
1188 break;
1189 }
1190 }
1191 return lbe;
1192}
1193
1194
1195//-------------------------------------------------------------------------------
1196//
1197// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1198// the character c.
1199//
1200//-------------------------------------------------------------------------------
1201const LanguageBreakEngine *
1202RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1203 const LanguageBreakEngine *lbe = nullptr;
1204 UErrorCode status = U_ZERO_ERROR;
1205
1206 if (fLanguageBreakEngines == nullptr) {
1207 fLanguageBreakEngines = new UStack(status);
1208 if (fLanguageBreakEngines == nullptr || U_FAILURE(status)) {
1209 delete fLanguageBreakEngines;
1210 fLanguageBreakEngines = 0;
1211 return nullptr;
1212 }
1213 }
1214
1215 int32_t i = fLanguageBreakEngines->size();
1216 while (--i >= 0) {
1217 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1218 if (lbe->handles(c)) {
1219 return lbe;
1220 }
1221 }
1222
1223 // No existing dictionary took the character. See if a factory wants to
1224 // give us a new LanguageBreakEngine for this character.
1225 lbe = getLanguageBreakEngineFromFactory(c);
1226
1227 // If we got one, use it and push it on our stack.
1228 if (lbe != nullptr) {
1229 fLanguageBreakEngines->push((void *)lbe, status);
1230 // Even if we can't remember it, we can keep looking it up, so
1231 // return it even if the push fails.
1232 return lbe;
1233 }
1234
1235 // No engine is forthcoming for this character. Add it to the
1236 // reject set. Create the reject break engine if needed.
1237 if (fUnhandledBreakEngine == nullptr) {
1238 fUnhandledBreakEngine = new UnhandledEngine(status);
1239 if (U_SUCCESS(status) && fUnhandledBreakEngine == nullptr) {
1240 status = U_MEMORY_ALLOCATION_ERROR;
1241 return nullptr;
1242 }
1243 // Put it last so that scripts for which we have an engine get tried
1244 // first.
1245 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1246 // If we can't insert it, or creation failed, get rid of it
1247 U_ASSERT(!fLanguageBreakEngines->hasDeleter());
1248 if (U_FAILURE(status)) {
1249 delete fUnhandledBreakEngine;
1250 fUnhandledBreakEngine = 0;
1251 return nullptr;
1252 }
1253 }
1254
1255 // Tell the reject engine about the character; at its discretion, it may
1256 // add more than just the one character.
1257 fUnhandledBreakEngine->handleCharacter(c);
1258
1259 return fUnhandledBreakEngine;
1260}
1261
1262void RuleBasedBreakIterator::dumpCache() {
1263 fBreakCache->dumpCache();
1264}
1265
1266void RuleBasedBreakIterator::dumpTables() {
1267 fData->printData();
1268}
1269
1270/**
1271 * Returns the description used to create this iterator
1272 */
1273
1274const UnicodeString&
1275RuleBasedBreakIterator::getRules() const {
1276 if (fData != nullptr) {
1277 return fData->getRuleSourceString();
1278 } else {
1279 umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1280 return *gEmptyString;
1281 }
1282}
1283
1284U_NAMESPACE_END
1285
1286#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1287