rbbi.cpp source code [Godot/thirdparty/icu4c/common/rbbi.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	***************************************************************************
5	* Copyright (C) 1999-2016 International Business Machines Corporation
6	* and others. All rights reserved.
7	***************************************************************************
8	*/
9	//
10	// file: rbbi.cpp Contains the implementation of the rule based break iterator
11	// runtime engine and the API implementation for
12	// class RuleBasedBreakIterator
13	//
14
15	#include "utypeinfo.h" // for 'typeid' to work
16
17	#include "unicode/utypes.h"
18
19	#if !UCONFIG_NO_BREAK_ITERATION
20
21	#include <cinttypes>
22
23	#include "unicode/rbbi.h"
24	#include "unicode/schriter.h"
25	#include "unicode/uchriter.h"
26	#include "unicode/uclean.h"
27	#include "unicode/udata.h"
28
29	#include "brkeng.h"
30	#include "ucln_cmn.h"
31	#include "cmemory.h"
32	#include "cstring.h"
33	#include "localsvc.h"
34	#include "rbbidata.h"
35	#include "rbbi_cache.h"
36	#include "rbbirb.h"
37	#include "uassert.h"
38	#include "umutex.h"
39	#include "uvectr32.h"
40
41	#ifdef RBBI_DEBUG
42	static UBool gTrace = false;
43	#endif
44
45	U_NAMESPACE_BEGIN
46
47	// The state number of the starting state
48	constexpr int32_t START_STATE = `1`;
49
50	// The state-transition value indicating "stop"
51	constexpr int32_t STOP_STATE = `0`;
52
53
54	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
55
56
57	//=======================================================================
58	// constructors
59	//=======================================================================
60
61	/**
62	* Constructs a RuleBasedBreakIterator that uses the already-created
63	* tables object that is passed in as a parameter.
64	*/
65	RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
66	: RuleBasedBreakIterator (&status)
67	{
68	fData = new RBBIDataWrapper (data, status); // status checked in constructor
69	if (U_FAILURE(status)) {return;}
70	if(fData == nullptr) {
71	status = U_MEMORY_ALLOCATION_ERROR;
72	return;
73	}
74	if (fData->fForwardTable->fLookAheadResultsSize > `0`) {
75	fLookAheadMatches = static_cast<int32_t *>(
76	uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
77	if (fLookAheadMatches == nullptr) {
78	status = U_MEMORY_ALLOCATION_ERROR;
79	return;
80	}
81	}
82	}
83
84	//-------------------------------------------------------------------------------
85	//
86	// Constructor from a UDataMemory handle to precompiled break rules
87	// stored in an ICU data file. This construcotr is private API,
88	// only for internal use.
89	//
90	//-------------------------------------------------------------------------------
91	RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
92	UErrorCode &status) : RuleBasedBreakIterator (udm, status)
93	{
94	fIsPhraseBreaking = isPhraseBreaking;
95	}
96
97	//
98	// Construct from precompiled binary rules (tables). This constructor is public API,
99	// taking the rules as a (const uint8_t ) to match the type produced by getBinaryRules().*
100	//
101	RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
102	uint32_t ruleLength,
103	UErrorCode &status)
104	: RuleBasedBreakIterator (&status)
105	{
106	if (U_FAILURE(status)) {
107	return;
108	}
109	if (compiledRules == nullptr \|\| ruleLength < sizeof(RBBIDataHeader)) {
110	status = U_ILLEGAL_ARGUMENT_ERROR;
111	return;
112	}
113	const RBBIDataHeader data = (const* RBBIDataHeader *)compiledRules;
114	if (data->fLength > ruleLength) {
115	status = U_ILLEGAL_ARGUMENT_ERROR;
116	return;
117	}
118	fData = new RBBIDataWrapper (data, RBBIDataWrapper::kDontAdopt, status);
119	if (U_FAILURE(status)) {return;}
120	if(fData == nullptr) {
121	status = U_MEMORY_ALLOCATION_ERROR;
122	return;
123	}
124	if (fData->fForwardTable->fLookAheadResultsSize > `0`) {
125	fLookAheadMatches = static_cast<int32_t *>(
126	uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
127	if (fLookAheadMatches == nullptr) {
128	status = U_MEMORY_ALLOCATION_ERROR;
129	return;
130	}
131	}
132	}
133
134
135	//-------------------------------------------------------------------------------
136	//
137	// Constructor from a UDataMemory handle to precompiled break rules
138	// stored in an ICU data file.
139	//
140	//-------------------------------------------------------------------------------
141	RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
142	: RuleBasedBreakIterator (&status)
143	{
144	fData = new RBBIDataWrapper (udm, status); // status checked in constructor
145	if (U_FAILURE(status)) {return;}
146	if(fData == nullptr) {
147	status = U_MEMORY_ALLOCATION_ERROR;
148	return;
149	}
150	if (fData->fForwardTable->fLookAheadResultsSize > `0`) {
151	fLookAheadMatches = static_cast<int32_t *>(
152	uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
153	if (fLookAheadMatches == nullptr) {
154	status = U_MEMORY_ALLOCATION_ERROR;
155	return;
156	}
157	}
158	}
159
160
161
162	//-------------------------------------------------------------------------------
163	//
164	// Constructor from a set of rules supplied as a string.
165	//
166	//-------------------------------------------------------------------------------
167	RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
168	UParseError &parseError,
169	UErrorCode &status)
170	: RuleBasedBreakIterator (&status)
171	{
172	if (U_FAILURE(status)) {return;}
173	RuleBasedBreakIterator bi = (RuleBasedBreakIterator )
174	RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
175	// Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
176	// creates and returns a complete RBBI. From here, in a constructor, we
177	// can't just return the object created by the builder factory, hence
178	// the assignment of the factory created object to "this".
179	if (U_SUCCESS(status)) {
180	*this = *bi;
181	delete bi;
182	}
183	}
184
185
186	//-------------------------------------------------------------------------------
187	//
188	// Default Constructor. Create an empty shell that can be set up later.
189	// Used when creating a RuleBasedBreakIterator from a set
190	// of rules.
191	//-------------------------------------------------------------------------------
192	RuleBasedBreakIterator::RuleBasedBreakIterator()
193	: RuleBasedBreakIterator (nullptr)
194	{
195	}
196
197	/**
198	* Simple Constructor with an error code.
199	* Handles common initialization for all other constructors.
200	*/
201	RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
202	UErrorCode ec = U_ZERO_ERROR;
203	if (status == nullptr) {
204	status = &ec;
205	}
206	utext_openUChars(&fText, nullptr, `0`, status);
207	LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache (this, status), status);
208	LocalPointer<BreakCache> lpBreakCache(new BreakCache (this, status), status);
209	if (U_FAILURE(*status)) {
210	fErrorCode = *status;
211	return;
212	}
213	fDictionaryCache = lpDictionaryCache.orphan();
214	fBreakCache = lpBreakCache.orphan();
215
216	#ifdef RBBI_DEBUG
217	static UBool debugInitDone = false;
218	if (debugInitDone == false) {
219	char *debugEnv = getenv("U_RBBIDEBUG");
220	if (debugEnv && uprv_strstr(debugEnv, "trace")) {
221	gTrace = true;
222	}
223	debugInitDone = true;
224	}
225	#endif
226	}
227
228
229	//-------------------------------------------------------------------------------
230	//
231	// Copy constructor. Will produce a break iterator with the same behavior,
232	// and which iterates over the same text, as the one passed in.
233	//
234	//-------------------------------------------------------------------------------
235	RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
236	: RuleBasedBreakIterator ()
237	{
238	*this = other;
239	}
240
241
242	/**
243	* Destructor
244	*/
245	RuleBasedBreakIterator::~RuleBasedBreakIterator() {
246	if (fCharIter != &fSCharIter) {
247	// fCharIter was adopted from the outside.
248	delete fCharIter;
249	}
250	fCharIter = nullptr;
251
252	utext_close(&fText);
253
254	if (fData != nullptr) {
255	fData->removeReference();
256	fData = nullptr;
257	}
258	delete fBreakCache;
259	fBreakCache = nullptr;
260
261	delete fDictionaryCache;
262	fDictionaryCache = nullptr;
263
264	delete fLanguageBreakEngines;
265	fLanguageBreakEngines = nullptr;
266
267	delete fUnhandledBreakEngine;
268	fUnhandledBreakEngine = nullptr;
269
270	uprv_free(fLookAheadMatches);
271	fLookAheadMatches = nullptr;
272	}
273
274	/**
275	* Assignment operator. Sets this iterator to have the same behavior,
276	* and iterate over the same text, as the one passed in.
277	* TODO: needs better handling of memory allocation errors.
278	*/
279	RuleBasedBreakIterator&
280	RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
281	if (this == &that) {
282	return *this;
283	}
284	BreakIterator::operator=(that);
285
286	if (fLanguageBreakEngines != nullptr) {
287	delete fLanguageBreakEngines;
288	fLanguageBreakEngines = nullptr; // Just rebuild for now
289	}
290	// TODO: clone fLanguageBreakEngines from "that"
291	UErrorCode status = U_ZERO_ERROR;
292	utext_clone(&fText, &that.fText, false, true, &status);
293
294	if (fCharIter != &fSCharIter) {
295	delete fCharIter;
296	}
297	fCharIter = &fSCharIter;
298
299	if (that.fCharIter != nullptr && that.fCharIter != &that.fSCharIter) {
300	// This is a little bit tricky - it will initially appear that
301	// this->fCharIter is adopted, even if that->fCharIter was
302	// not adopted. That's ok.
303	fCharIter = that.fCharIter->clone();
304	}
305	fSCharIter = that.fSCharIter;
306	if (fCharIter == nullptr) {
307	fCharIter = &fSCharIter;
308	}
309
310	if (fData != nullptr) {
311	fData->removeReference();
312	fData = nullptr;
313	}
314	if (that.fData != nullptr) {
315	fData = that.fData->addReference();
316	}
317
318	uprv_free(fLookAheadMatches);
319	fLookAheadMatches = nullptr;
320	if (fData && fData->fForwardTable->fLookAheadResultsSize > `0`) {
321	fLookAheadMatches = static_cast<int32_t *>(
322	uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
323	}
324
325
326	fPosition = that.fPosition;
327	fRuleStatusIndex = that.fRuleStatusIndex;
328	fDone = that.fDone;
329
330	// TODO: both the dictionary and the main cache need to be copied.
331	// Current position could be within a dictionary range. Trying to continue
332	// the iteration without the caches present would go to the rules, with
333	// the assumption that the current position is on a rule boundary.
334	fBreakCache->reset(fPosition, fRuleStatusIndex);
335	fDictionaryCache->reset();
336
337	return *this;
338	}
339
340	//-----------------------------------------------------------------------------
341	//
342	// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
343	// behavior, and iterating over the same text, as this one.
344	// Virtual function: does the right thing with subclasses.
345	//
346	//-----------------------------------------------------------------------------
347	RuleBasedBreakIterator*
348	RuleBasedBreakIterator::clone() const {
349	return new RuleBasedBreakIterator (*this);
350	}
351
352	/**
353	* Equality operator. Returns true if both BreakIterators are of the
354	* same class, have the same behavior, and iterate over the same text.
355	*/
356	bool
357	RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
358	if (typeid(*this) != typeid(that)) {
359	return false;
360	}
361	if (this == &that) {
362	return true;
363	}
364
365	// The base class BreakIterator carries no state that participates in equality,
366	// and does not implement an equality function that would otherwise be
367	// checked at this point.
368
369	const RuleBasedBreakIterator& that2 = static_cast<const RuleBasedBreakIterator&>(that);
370
371	if (!utext_equals(&fText, &that2.fText)) {
372	// The two break iterators are operating on different text,
373	// or have a different iteration position.
374	// Note that fText's position is always the same as the break iterator's position.
375	return false;
376	}
377
378	if (!(fPosition == that2.fPosition &&
379	fRuleStatusIndex == that2.fRuleStatusIndex &&
380	fDone == that2.fDone)) {
381	return false;
382	}
383
384	if (that2.fData == fData \|\|
385	(fData != nullptr && that2.fData != nullptr && that2.fData == fData)) {
386	// The two break iterators are using the same rules.
387	return true;
388	}
389	return false;
390	}
391
392	/**
393	* Compute a hash code for this BreakIterator
394	* @return A hash code
395	*/
396	int32_t
397	RuleBasedBreakIterator::hashCode() const {
398	int32_t hash = `0`;
399	if (fData != nullptr) {
400	hash = fData->hashCode();
401	}
402	return hash;
403	}
404
405
406	void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
407	if (U_FAILURE(status)) {
408	return;
409	}
410	fBreakCache->reset();
411	fDictionaryCache->reset();
412	utext_clone(&fText, ut, false, true, &status);
413
414	// Set up a dummy CharacterIterator to be returned if anyone
415	// calls getText(). With input from UText, there is no reasonable
416	// way to return a characterIterator over the actual input text.
417	// Return one over an empty string instead - this is the closest
418	// we can come to signaling a failure.
419	// (GetText() is obsolete, this failure is sort of OK)
420	fSCharIter.setText(u"", `0`);
421
422	if (fCharIter != &fSCharIter) {
423	// existing fCharIter was adopted from the outside. Delete it now.
424	delete fCharIter;
425	}
426	fCharIter = &fSCharIter;
427
428	this->first();
429	}
430
431
432	UText RuleBasedBreakIterator::getUText(UText fillIn, UErrorCode &status) const {
433	UText result = utext_clone(fillIn, &fText, false, true*, &status);
434	return result;
435	}
436
437
438	//=======================================================================
439	// BreakIterator overrides
440	//=======================================================================
441
442	/**
443	* Return a CharacterIterator over the text being analyzed.
444	*/
445	CharacterIterator&
446	RuleBasedBreakIterator::getText() const {
447	return *fCharIter;
448	}
449
450	/**
451	* Set the iterator to analyze a new piece of text. This function resets
452	* the current iteration position to the beginning of the text.
453	* @param newText An iterator over the text to analyze.
454	*/
455	void
456	RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
457	// If we are holding a CharacterIterator adopted from a
458	// previous call to this function, delete it now.
459	if (fCharIter != &fSCharIter) {
460	delete fCharIter;
461	}
462
463	fCharIter = newText;
464	UErrorCode status = U_ZERO_ERROR;
465	fBreakCache->reset();
466	fDictionaryCache->reset();
467	if (newText==nullptr \|\| newText->startIndex() != `0`) {
468	// startIndex !=0 wants to be an error, but there's no way to report it.
469	// Make the iterator text be an empty string.
470	utext_openUChars(&fText, nullptr, `0`, &status);
471	} else {
472	utext_openCharacterIterator(&fText, newText, &status);
473	}
474	this->first();
475	}
476
477	/**
478	* Set the iterator to analyze a new piece of text. This function resets
479	* the current iteration position to the beginning of the text.
480	* @param newText An iterator over the text to analyze.
481	*/
482	void
483	RuleBasedBreakIterator::setText(const UnicodeString& newText) {
484	UErrorCode status = U_ZERO_ERROR;
485	fBreakCache->reset();
486	fDictionaryCache->reset();
487	utext_openConstUnicodeString(&fText, &newText, &status);
488
489	// Set up a character iterator on the string.
490	// Needed in case someone calls getText().
491	// Can not, unfortunately, do this lazily on the (probably never)
492	// call to getText(), because getText is const.
493	fSCharIter.setText(newText.getBuffer(), newText.length());
494
495	if (fCharIter != &fSCharIter) {
496	// old fCharIter was adopted from the outside. Delete it.
497	delete fCharIter;
498	}
499	fCharIter = &fSCharIter;
500
501	this->first();
502	}
503
504
505	/**
506	* Provide a new UText for the input text. Must reference text with contents identical
507	* to the original.
508	* Intended for use with text data originating in Java (garbage collected) environments
509	* where the data may be moved in memory at arbitrary times.
510	*/
511	RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
512	if (U_FAILURE(status)) {
513	return *this;
514	}
515	if (input == nullptr) {
516	status = U_ILLEGAL_ARGUMENT_ERROR;
517	return *this;
518	}
519	int64_t pos = utext_getNativeIndex(&fText);
520	// Shallow read-only clone of the new UText into the existing input UText
521	utext_clone(&fText, input, false, true, &status);
522	if (U_FAILURE(status)) {
523	return *this;
524	}
525	utext_setNativeIndex(&fText, pos);
526	if (utext_getNativeIndex(&fText) != pos) {
527	// Sanity check. The new input utext is supposed to have the exact same
528	// contents as the old. If we can't set to the same position, it doesn't.
529	// The contents underlying the old utext might be invalid at this point,
530	// so it's not safe to check directly.
531	status = U_ILLEGAL_ARGUMENT_ERROR;
532	}
533	return *this;
534	}
535
536
537	/**
538	* Sets the current iteration position to the beginning of the text, position zero.
539	* @return The new iterator position, which is zero.
540	*/
541	int32_t RuleBasedBreakIterator::first() {
542	UErrorCode status = U_ZERO_ERROR;
543	if (!fBreakCache->seek(`0`)) {
544	fBreakCache->populateNear(`0`, status);
545	}
546	fBreakCache->current();
547	U_ASSERT(fPosition == `0`);
548	return `0`;
549	}
550
551	/**
552	* Sets the current iteration position to the end of the text.
553	* @return The text's past-the-end offset.
554	*/
555	int32_t RuleBasedBreakIterator::last() {
556	int32_t endPos = (int32_t)utext_nativeLength(&fText);
557	UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
558	(void)endShouldBeBoundary;
559	U_ASSERT(endShouldBeBoundary);
560	U_ASSERT(fPosition == endPos);
561	return endPos;
562	}
563
564	/**
565	* Advances the iterator either forward or backward the specified number of steps.
566	* Negative values move backward, and positive values move forward. This is
567	* equivalent to repeatedly calling next() or previous().
568	* @param n The number of steps to move. The sign indicates the direction
569	* (negative is backwards, and positive is forwards).
570	* @return The character offset of the boundary position n boundaries away from
571	* the current one.
572	*/
573	int32_t RuleBasedBreakIterator::next(int32_t n) {
574	int32_t result = `0`;
575	if (n > `0`) {
576	for (; n > `0` && result != UBRK_DONE; --n) {
577	result = next();
578	}
579	} else if (n < `0`) {
580	for (; n < `0` && result != UBRK_DONE; ++n) {
581	result = previous();
582	}
583	} else {
584	result = current();
585	}
586	return result;
587	}
588
589	/**
590	* Advances the iterator to the next boundary position.
591	* @return The position of the first boundary after this one.
592	*/
593	int32_t RuleBasedBreakIterator::next() {
594	fBreakCache->next();
595	return fDone ? UBRK_DONE : fPosition;
596	}
597
598	/**
599	* Move the iterator backwards, to the boundary preceding the current one.
600	*
601	* Starts from the current position within fText.
602	* Starting position need not be on a boundary.
603	*
604	* @return The position of the boundary position immediately preceding the starting position.
605	*/
606	int32_t RuleBasedBreakIterator::previous() {
607	UErrorCode status = U_ZERO_ERROR;
608	fBreakCache->previous(status);
609	return fDone ? UBRK_DONE : fPosition;
610	}
611
612	/**
613	* Sets the iterator to refer to the first boundary position following
614	* the specified position.
615	* @param startPos The position from which to begin searching for a break position.
616	* @return The position of the first break after the current position.
617	*/
618	int32_t RuleBasedBreakIterator::following(int32_t startPos) {
619	// if the supplied position is before the beginning, return the
620	// text's starting offset
621	if (startPos < `0`) {
622	return first();
623	}
624
625	// Move requested offset to a code point start. It might be on a trail surrogate,
626	// or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
627	utext_setNativeIndex(&fText, startPos);
628	startPos = (int32_t)utext_getNativeIndex(&fText);
629
630	UErrorCode status = U_ZERO_ERROR;
631	fBreakCache->following(startPos, status);
632	return fDone ? UBRK_DONE : fPosition;
633	}
634
635	/**
636	* Sets the iterator to refer to the last boundary position before the
637	* specified position.
638	* @param offset The position to begin searching for a break from.
639	* @return The position of the last boundary before the starting position.
640	*/
641	int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
642	if (offset > utext_nativeLength(&fText)) {
643	return last();
644	}
645
646	// Move requested offset to a code point start. It might be on a trail surrogate,
647	// or on a trail byte if the input is UTF-8.
648
649	utext_setNativeIndex(&fText, offset);
650	int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
651
652	UErrorCode status = U_ZERO_ERROR;
653	fBreakCache->preceding(adjustedOffset, status);
654	return fDone ? UBRK_DONE : fPosition;
655	}
656
657	/**
658	* Returns true if the specified position is a boundary position. As a side
659	* effect, leaves the iterator pointing to the first boundary position at
660	* or after "offset".
661	*
662	* @param offset the offset to check.
663	* @return True if "offset" is a boundary position.
664	*/
665	UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
666	// out-of-range indexes are never boundary positions
667	if (offset < `0`) {
668	first(); // For side effects on current position, tag values.
669	return false;
670	}
671
672	// Adjust offset to be on a code point boundary and not beyond the end of the text.
673	// Note that isBoundary() is always false for offsets that are not on code point boundaries.
674	// But we still need the side effect of leaving iteration at the following boundary.
675
676	utext_setNativeIndex(&fText, offset);
677	int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
678
679	bool result = false;
680	UErrorCode status = U_ZERO_ERROR;
681	if (fBreakCache->seek(adjustedOffset) \|\| fBreakCache->populateNear(adjustedOffset, status)) {
682	result = (fBreakCache->current() == offset);
683	}
684
685	if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
686	// Original offset is beyond the end of the text. Return false, it's not a boundary,
687	// but the iteration position remains set to the end of the text, which is a boundary.
688	return false;
689	}
690	if (!result) {
691	// Not on a boundary. isBoundary() must leave iterator on the following boundary.
692	// Cache->seek(), above, left us on the preceding boundary, so advance one.
693	next();
694	}
695	return result;
696	}
697
698
699	/**
700	* Returns the current iteration position.
701	* @return The current iteration position.
702	*/
703	int32_t RuleBasedBreakIterator::current() const {
704	return fPosition;
705	}
706
707
708	//=======================================================================
709	// implementation
710	//=======================================================================
711
712	//
713	// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
714	// of user text. A variable with this enum type keeps track of where we
715	// are. The state machine only fetches user input while in the RUN mode.
716	//
717	enum RBBIRunMode {
718	RBBI_START, // state machine processing is before first char of input
719	RBBI_RUN, // state machine processing is in the user text
720	RBBI_END // state machine processing is after end of user text.
721	};
722
723
724	// Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
725	// instantiation, based on whether an 8 or 16 bit table is required.
726	//
727	// These Trie access functions will be inlined within the handleNext()/Previous() instantions.
728	static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) {
729	return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c);
730	}
731
732	static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) {
733	return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c);
734	}
735
736	int32_t RuleBasedBreakIterator::handleNext() {
737	const RBBIStateTable *statetable = fData->fForwardTable;
738	bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
739	if (statetable->fFlags & RBBI_8BITS_ROWS) {
740	if (use8BitsTrie) {
741	return handleNext<RBBIStateTableRow8, TrieFunc8>();
742	} else {
743	return handleNext<RBBIStateTableRow8, TrieFunc16>();
744	}
745	} else {
746	if (use8BitsTrie) {
747	return handleNext<RBBIStateTableRow16, TrieFunc8>();
748	} else {
749	return handleNext<RBBIStateTableRow16, TrieFunc16>();
750	}
751	}
752	}
753
754	int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
755	const RBBIStateTable *statetable = fData->fReverseTable;
756	bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
757	if (statetable->fFlags & RBBI_8BITS_ROWS) {
758	if (use8BitsTrie) {
759	return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
760	} else {
761	return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
762	}
763	} else {
764	if (use8BitsTrie) {
765	return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
766	} else {
767	return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
768	}
769	}
770	}
771
772
773	//-----------------------------------------------------------------------------------
774	//
775	// handleNext()
776	// Run the state machine to find a boundary
777	//
778	//-----------------------------------------------------------------------------------
779	template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
780	int32_t RuleBasedBreakIterator::handleNext() {
781	int32_t state;
782	uint16_t category = `0`;
783	RBBIRunMode mode;
784
785	RowType *row;
786	UChar32 c;
787	int32_t result = `0`;
788	int32_t initialPosition = `0`;
789	const RBBIStateTable *statetable = fData->fForwardTable;
790	const char *tableData = statetable->fTableData;
791	uint32_t tableRowLen = statetable->fRowLen;
792	uint32_t dictStart = statetable->fDictCategoriesStart;
793	#ifdef RBBI_DEBUG
794	if (gTrace) {
795	RBBIDebugPuts("Handle Next pos char state category");
796	}
797	#endif
798
799	// handleNext always sets the break tag value.
800	// Set the default for it.
801	fRuleStatusIndex = `0`;
802
803	fDictionaryCharCount = `0`;
804
805	// if we're already at the end of the text, return DONE.
806	initialPosition = fPosition;
807	UTEXT_SETNATIVEINDEX(&fText, initialPosition);
808	result = initialPosition;
809	c = UTEXT_NEXT32(&fText);
810	if (c==U_SENTINEL) {
811	fDone = true;
812	return UBRK_DONE;
813	}
814
815	// Set the initial state for the state machine
816	state = START_STATE;
817	row = (RowType *)
818	//(statetable->fTableData + (statetable->fRowLen state));*
819	(tableData + tableRowLen * state);
820
821
822	mode = RBBI_RUN;
823	if (statetable->fFlags & RBBI_BOF_REQUIRED) {
824	category = `2`;
825	mode = RBBI_START;
826	}
827
828
829	// loop until we reach the end of the text or transition to state 0
830	//
831	for (;;) {
832	if (c == U_SENTINEL) {
833	// Reached end of input string.
834	if (mode == RBBI_END) {
835	// We have already run the loop one last time with the
836	// character set to the psueudo {eof} value. Now it is time
837	// to unconditionally bail out.
838	break;
839	}
840	// Run the loop one last time with the fake end-of-input character category.
841	mode = RBBI_END;
842	category = `1`;
843	}
844
845	//
846	// Get the char category. An incoming category of 1 or 2 means that
847	// we are preset for doing the beginning or end of input, and
848	// that we shouldn't get a category from an actual text input character.
849	//
850	if (mode == RBBI_RUN) {
851	// look up the current character's character category, which tells us
852	// which column in the state table to look at.
853	category = trieFunc(fData->fTrie, c);
854	fDictionaryCharCount += (category >= dictStart);
855	}
856
857	#ifdef RBBI_DEBUG
858	if (gTrace) {
859	RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText));
860	if (`0x20`<=c && c<`0x7f`) {
861	RBBIDebugPrintf("\"%c\" ", c);
862	} else {
863	RBBIDebugPrintf("%5x ", c);
864	}
865	RBBIDebugPrintf("%3d %3d\n", state, category);
866	}
867	#endif
868
869	// State Transition - move machine to its next state
870	//
871
872	// fNextState is a variable-length array.
873	U_ASSERT(category<fData->fHeader->fCatCount);
874	state = row->fNextState[category]; /Not accessing beyond memory/
875	row = (RowType *)
876	// (statetable->fTableData + (statetable->fRowLen state));*
877	(tableData + tableRowLen * state);
878
879
880	uint16_t accepting = row->fAccepting;
881	if (accepting == ACCEPTING_UNCONDITIONAL) {
882	// Match found, common case.
883	if (mode != RBBI_START) {
884	result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
885	}
886	fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values.
887	} else if (accepting > ACCEPTING_UNCONDITIONAL) {
888	// Lookahead match is completed.
889	U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize);
890	int32_t lookaheadResult = fLookAheadMatches[accepting];
891	if (lookaheadResult >= `0`) {
892	fRuleStatusIndex = row->fTagsIdx;
893	fPosition = lookaheadResult;
894	return lookaheadResult;
895	}
896	}
897
898	// If we are at the position of the '/' in a look-ahead (hard break) rule;
899	// record the current position, to be returned later, if the full rule matches.
900	// TODO: Move this check before the previous check of fAccepting.
901	// This would enable hard-break rules with no following context.
902	// But there are line break test failures when trying this. Investigate.
903	// Issue ICU-20837
904	uint16_t rule = row->fLookAhead;
905	U_ASSERT(rule == `0` \|\| rule > ACCEPTING_UNCONDITIONAL);
906	U_ASSERT(rule == `0` \|\| rule < fData->fForwardTable->fLookAheadResultsSize);
907	if (rule > ACCEPTING_UNCONDITIONAL) {
908	int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
909	fLookAheadMatches[rule] = pos;
910	}
911
912	if (state == STOP_STATE) {
913	// This is the normal exit from the lookup state machine.
914	// We have advanced through the string until it is certain that no
915	// longer match is possible, no matter what characters follow.
916	break;
917	}
918
919	// Advance to the next character.
920	// If this is a beginning-of-input loop iteration, don't advance
921	// the input position. The next iteration will be processing the
922	// first real input character.
923	if (mode == RBBI_RUN) {
924	c = UTEXT_NEXT32(&fText);
925	} else {
926	if (mode == RBBI_START) {
927	mode = RBBI_RUN;
928	}
929	}
930	}
931
932	// The state machine is done. Check whether it found a match...
933
934	// If the iterator failed to advance in the match engine, force it ahead by one.
935	// (This really indicates a defect in the break rules. They should always match
936	// at least one character.)
937	if (result == initialPosition) {
938	utext_setNativeIndex(&fText, initialPosition);
939	utext_next32(&fText);
940	result = (int32_t)utext_getNativeIndex(&fText);
941	fRuleStatusIndex = `0`;
942	}
943
944	// Leave the iterator at our result position.
945	fPosition = result;
946	#ifdef RBBI_DEBUG
947	if (gTrace) {
948	RBBIDebugPrintf("result = %d\n\n", result);
949	}
950	#endif
951	return result;
952	}
953
954
955	//-----------------------------------------------------------------------------------
956	//
957	// handleSafePrevious()
958	//
959	// Iterate backwards using the safe reverse rules.
960	// The logic of this function is similar to handleNext(), but simpler
961	// because the safe table does not require as many options.
962	//
963	//-----------------------------------------------------------------------------------
964	template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
965	int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
966
967	int32_t state;
968	uint16_t category = `0`;
969	RowType *row;
970	UChar32 c;
971	int32_t result = `0`;
972
973	const RBBIStateTable *stateTable = fData->fReverseTable;
974	UTEXT_SETNATIVEINDEX(&fText, fromPosition);
975	#ifdef RBBI_DEBUG
976	if (gTrace) {
977	RBBIDebugPuts("Handle Previous pos char state category");
978	}
979	#endif
980
981	// if we're already at the start of the text, return DONE.
982	if (fData == nullptr \|\| UTEXT_GETNATIVEINDEX(&fText)==`0`) {
983	return BreakIterator::DONE;
984	}
985
986	// Set the initial state for the state machine
987	c = UTEXT_PREVIOUS32(&fText);
988	state = START_STATE;
989	row = (RowType *)
990	(stateTable->fTableData + (stateTable->fRowLen * state));
991
992	// loop until we reach the start of the text or transition to state 0
993	//
994	for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
995
996	// look up the current character's character category, which tells us
997	// which column in the state table to look at.
998	//
999	// Off the dictionary flag bit. For reverse iteration it is not used.
1000	category = trieFunc(fData->fTrie, c);
1001
1002	#ifdef RBBI_DEBUG
1003	if (gTrace) {
1004	RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
1005	if (`0x20`<=c && c<`0x7f`) {
1006	RBBIDebugPrintf("\"%c\" ", c);
1007	} else {
1008	RBBIDebugPrintf("%5x ", c);
1009	}
1010	RBBIDebugPrintf("%3d %3d\n", state, category);
1011	}
1012	#endif
1013
1014	// State Transition - move machine to its next state
1015	//
1016	// fNextState is a variable-length array.
1017	U_ASSERT(category<fData->fHeader->fCatCount);
1018	state = row->fNextState[category]; /Not accessing beyond memory/
1019	row = (RowType *)
1020	(stateTable->fTableData + (stateTable->fRowLen * state));
1021
1022	if (state == STOP_STATE) {
1023	// This is the normal exit from the lookup state machine.
1024	// Transition to state zero means we have found a safe point.
1025	break;
1026	}
1027	}
1028
1029	// The state machine is done. Check whether it found a match...
1030	result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
1031	#ifdef RBBI_DEBUG
1032	if (gTrace) {
1033	RBBIDebugPrintf("result = %d\n\n", result);
1034	}
1035	#endif
1036	return result;
1037	}
1038
1039
1040	//-------------------------------------------------------------------------------
1041	//
1042	// getRuleStatus() Return the break rule tag associated with the current
1043	// iterator position. If the iterator arrived at its current
1044	// position by iterating forwards, the value will have been
1045	// cached by the handleNext() function.
1046	//
1047	//-------------------------------------------------------------------------------
1048
1049	int32_t RuleBasedBreakIterator::getRuleStatus() const {
1050
1051	// fLastRuleStatusIndex indexes to the start of the appropriate status record
1052	// (the number of status values.)
1053	// This function returns the last (largest) of the array of status values.
1054	int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
1055	int32_t tagVal = fData->fRuleStatusTable[idx];
1056
1057	return tagVal;
1058	}
1059
1060
1061	int32_t RuleBasedBreakIterator::getRuleStatusVec(
1062	int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
1063	if (U_FAILURE(status)) {
1064	return `0`;
1065	}
1066
1067	int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
1068	int32_t numValsToCopy = numVals;
1069	if (numVals > capacity) {
1070	status = U_BUFFER_OVERFLOW_ERROR;
1071	numValsToCopy = capacity;
1072	}
1073	int i;
1074	for (i=`0`; i<numValsToCopy; i++) {
1075	fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + `1`];
1076	}
1077	return numVals;
1078	}
1079
1080
1081
1082	//-------------------------------------------------------------------------------
1083	//
1084	// getBinaryRules Access to the compiled form of the rules,
1085	// for use by build system tools that save the data
1086	// for standard iterator types.
1087	//
1088	//-------------------------------------------------------------------------------
1089	const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1090	const uint8_t retPtr = nullptr*;
1091	length = `0`;
1092
1093	if (fData != nullptr) {
1094	retPtr = (const uint8_t *)fData->fHeader;
1095	length = fData->fHeader->fLength;
1096	}
1097	return retPtr;
1098	}
1099
1100
1101	RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone(
1102	void * /stackBuffer/, int32_t &bufferSize, UErrorCode &status) {
1103	if (U_FAILURE(status)){
1104	return nullptr;
1105	}
1106
1107	if (bufferSize == `0`) {
1108	bufferSize = `1`; // preflighting for deprecated functionality
1109	return nullptr;
1110	}
1111
1112	BreakIterator *clonedBI = clone();
1113	if (clonedBI == nullptr) {
1114	status = U_MEMORY_ALLOCATION_ERROR;
1115	} else {
1116	status = U_SAFECLONE_ALLOCATED_WARNING;
1117	}
1118	return (RuleBasedBreakIterator *)clonedBI;
1119	}
1120
1121	U_NAMESPACE_END
1122
1123
1124	static icu::UStack gLanguageBreakFactories = nullptr*;
1125	static const icu::UnicodeString gEmptyString = nullptr*;
1126	static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
1127	static icu::UInitOnce gRBBIInitOnce {};
1128
1129	/**
1130	* Release all static memory held by breakiterator.
1131	*/
1132	U_CDECL_BEGIN
1133	UBool U_CALLCONV rbbi_cleanup() {
1134	delete gLanguageBreakFactories;
1135	gLanguageBreakFactories = nullptr;
1136	delete gEmptyString;
1137	gEmptyString = nullptr;
1138	gLanguageBreakFactoriesInitOnce.reset();
1139	gRBBIInitOnce.reset();
1140	return true;
1141	}
1142	U_CDECL_END
1143
1144	U_CDECL_BEGIN
1145	static void U_CALLCONV _deleteFactory(void *obj) {
1146	delete (icu::LanguageBreakFactory *) obj;
1147	}
1148	U_CDECL_END
1149	U_NAMESPACE_BEGIN
1150
1151	static void U_CALLCONV rbbiInit() {
1152	gEmptyString = new UnicodeString ();
1153	ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1154	}
1155
1156	static void U_CALLCONV initLanguageFactories() {
1157	UErrorCode status = U_ZERO_ERROR;
1158	U_ASSERT(gLanguageBreakFactories == nullptr);
1159	gLanguageBreakFactories = new UStack (_deleteFactory, nullptr, status);
1160	if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
1161	ICULanguageBreakFactory builtIn = new* ICULanguageBreakFactory (status);
1162	gLanguageBreakFactories->push(builtIn, status);
1163	#ifdef U_LOCAL_SERVICE_HOOK
1164	LanguageBreakFactory extra = (LanguageBreakFactory )uprv_svc_hook("languageBreakFactory", &status);
1165	if (extra != nullptr) {
1166	gLanguageBreakFactories->push(extra, status);
1167	}
1168	#endif
1169	}
1170	ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1171	}
1172
1173
1174	static const LanguageBreakEngine*
1175	getLanguageBreakEngineFromFactory(UChar32 c)
1176	{
1177	umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1178	if (gLanguageBreakFactories == nullptr) {
1179	return nullptr;
1180	}
1181
1182	int32_t i = gLanguageBreakFactories->size();
1183	const LanguageBreakEngine lbe = nullptr*;
1184	while (--i >= `0`) {
1185	LanguageBreakFactory factory = (LanguageBreakFactory )(gLanguageBreakFactories->elementAt(i));
1186	lbe = factory->getEngineFor(c);
1187	if (lbe != nullptr) {
1188	break;
1189	}
1190	}
1191	return lbe;
1192	}
1193
1194
1195	//-------------------------------------------------------------------------------
1196	//
1197	// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1198	// the character c.
1199	//
1200	//-------------------------------------------------------------------------------
1201	const LanguageBreakEngine *
1202	RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1203	const LanguageBreakEngine lbe = nullptr*;
1204	UErrorCode status = U_ZERO_ERROR;
1205
1206	if (fLanguageBreakEngines == nullptr) {
1207	fLanguageBreakEngines = new UStack (status);
1208	if (fLanguageBreakEngines == nullptr \|\| U_FAILURE(status)) {
1209	delete fLanguageBreakEngines;
1210	fLanguageBreakEngines = `0`;
1211	return nullptr;
1212	}
1213	}
1214
1215	int32_t i = fLanguageBreakEngines->size();
1216	while (--i >= `0`) {
1217	lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1218	if (lbe->handles(c)) {
1219	return lbe;
1220	}
1221	}
1222
1223	// No existing dictionary took the character. See if a factory wants to
1224	// give us a new LanguageBreakEngine for this character.
1225	lbe = getLanguageBreakEngineFromFactory(c);
1226
1227	// If we got one, use it and push it on our stack.
1228	if (lbe != nullptr) {
1229	fLanguageBreakEngines->push((void *)lbe, status);
1230	// Even if we can't remember it, we can keep looking it up, so
1231	// return it even if the push fails.
1232	return lbe;
1233	}
1234
1235	// No engine is forthcoming for this character. Add it to the
1236	// reject set. Create the reject break engine if needed.
1237	if (fUnhandledBreakEngine == nullptr) {
1238	fUnhandledBreakEngine = new UnhandledEngine (status);
1239	if (U_SUCCESS(status) && fUnhandledBreakEngine == nullptr) {
1240	status = U_MEMORY_ALLOCATION_ERROR;
1241	return nullptr;
1242	}
1243	// Put it last so that scripts for which we have an engine get tried
1244	// first.
1245	fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, `0`, status);
1246	// If we can't insert it, or creation failed, get rid of it
1247	U_ASSERT(!fLanguageBreakEngines->hasDeleter());
1248	if (U_FAILURE(status)) {
1249	delete fUnhandledBreakEngine;
1250	fUnhandledBreakEngine = `0`;
1251	return nullptr;
1252	}
1253	}
1254
1255	// Tell the reject engine about the character; at its discretion, it may
1256	// add more than just the one character.
1257	fUnhandledBreakEngine->handleCharacter(c);
1258
1259	return fUnhandledBreakEngine;
1260	}
1261
1262	void RuleBasedBreakIterator::dumpCache() {
1263	fBreakCache->dumpCache();
1264	}
1265
1266	void RuleBasedBreakIterator::dumpTables() {
1267	fData->printData();
1268	}
1269
1270	/**
1271	* Returns the description used to create this iterator
1272	*/
1273
1274	const UnicodeString&
1275	RuleBasedBreakIterator::getRules() const {
1276	if (fData != nullptr) {
1277	return fData->getRuleSourceString();
1278	} else {
1279	umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1280	return *gEmptyString;
1281	}
1282	}
1283
1284	U_NAMESPACE_END
1285
1286	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1287

Browse the source code of Godot/thirdparty/icu4c/common/rbbi.cpp