dictbe.cpp source code [Godot/thirdparty/icu4c/common/dictbe.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/**
4	*******************************************************************************
5	* Copyright (C) 2006-2016, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	*******************************************************************************
8	*/
9
10	#include <utility>
11
12	#include "unicode/utypes.h"
13
14	#if !UCONFIG_NO_BREAK_ITERATION
15
16	#include "brkeng.h"
17	#include "dictbe.h"
18	#include "unicode/uniset.h"
19	#include "unicode/chariter.h"
20	#include "unicode/resbund.h"
21	#include "unicode/ubrk.h"
22	#include "unicode/usetiter.h"
23	#include "ubrkimpl.h"
24	#include "utracimp.h"
25	#include "uvectr32.h"
26	#include "uvector.h"
27	#include "uassert.h"
28	#include "unicode/normlzr.h"
29	#include "cmemory.h"
30	#include "dictionarydata.h"
31
32	U_NAMESPACE_BEGIN
33
34	/*
35	******************************************************************
36	*/
37
38	DictionaryBreakEngine::DictionaryBreakEngine() {
39	}
40
41	DictionaryBreakEngine::~DictionaryBreakEngine() {
42	}
43
44	UBool
45	DictionaryBreakEngine::handles(UChar32 c) const {
46	return fSet.contains(c);
47	}
48
49	int32_t
50	DictionaryBreakEngine::findBreaks( UText *text,
51	int32_t startPos,
52	int32_t endPos,
53	UVector32 &foundBreaks,
54	UBool isPhraseBreaking,
55	UErrorCode& status) const {
56	if (U_FAILURE(status)) return `0`;
57	(void)startPos; // TODO: remove this param?
58	int32_t result = `0`;
59
60	// Find the span of characters included in the set.
61	// The span to break begins at the current position in the text, and
62	// extends towards the start or end of the text, depending on 'reverse'.
63
64	int32_t start = (int32_t)utext_getNativeIndex(text);
65	int32_t current;
66	int32_t rangeStart;
67	int32_t rangeEnd;
68	UChar32 c = utext_current32(text);
69	while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
70	utext_next32(text); // TODO: recast loop for postincrement
71	c = utext_current32(text);
72	}
73	rangeStart = start;
74	rangeEnd = current;
75	result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);
76	utext_setNativeIndex(text, current);
77
78	return result;
79	}
80
81	void
82	DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
83	fSet = set;
84	// Compact for caching
85	fSet.compact();
86	}
87
88	/*
89	******************************************************************
90	* PossibleWord
91	*/
92
93	// Helper class for improving readability of the Thai/Lao/Khmer word break
94	// algorithm. The implementation is completely inline.
95
// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
static constexpr int32_t POSSIBLE_WORD_LIST_MAX = 20;
99
100	class PossibleWord {
101	private:
102	// list of word candidate lengths, in increasing length order
103	// TODO: bytes would be sufficient for word lengths.
104	int32_t count; // Count of candidates
105	int32_t prefix; // The longest match with a dictionary word
106	int32_t offset; // Offset in the text of these candidates
107	int32_t mark; // The preferred candidate's offset
108	int32_t current; // The candidate we're currently looking at
109	int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
110	int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
111
112	public:
113	PossibleWord() : count(`0`), prefix(`0`), offset(-`1`), mark(`0`), current(`0`) {}
114	~PossibleWord() {}
115
116	// Fill the list of candidates if needed, select the longest, and return the number found
117	int32_t candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd );
118
119	// Select the currently marked candidate, point after it in the text, and invalidate self
120	int32_t acceptMarked( UText *text );
121
122	// Back up from the current candidate to the next shorter one; return true if that exists
123	// and point the text after it
124	UBool backUp( UText *text );
125
126	// Return the longest prefix this candidate location shares with a dictionary word
127	// Return value is in code points.
128	int32_t longestPrefix() { return prefix; }
129
130	// Mark the current candidate as the one we like
131	void markCurrent() { mark = current; }
132
133	// Get length in code points of the marked word.
134	int32_t markedCPLength() { return cpLengths[mark]; }
135	};
136
137
138	int32_t PossibleWord::candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd ) {
139	// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
140	int32_t start = (int32_t)utext_getNativeIndex(text);
141	if (start != offset) {
142	offset = start;
143	count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);
144	// Dictionary leaves text after longest prefix, not longest word. Back up.
145	if (count <= `0`) {
146	utext_setNativeIndex(text, start);
147	}
148	}
149	if (count > `0`) {
150	utext_setNativeIndex(text, start+cuLengths[count-`1`]);
151	}
152	current = count-`1`;
153	mark = current;
154	return count;
155	}
156
157	int32_t
158	PossibleWord::acceptMarked( UText *text ) {
159	utext_setNativeIndex(text, offset + cuLengths[mark]);
160	return cuLengths[mark];
161	}
162
163
164	UBool
165	PossibleWord::backUp( UText *text ) {
166	if (current > `0`) {
167	utext_setNativeIndex(text, offset + cuLengths[--current]);
168	return true;
169	}
170	return false;
171	}
172
173	/*
174	******************************************************************
175	* ThaiBreakEngine
176	*/
177
// How many words in a row are "good enough"?
static constexpr int32_t THAI_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static constexpr int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static constexpr int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;

// Elision character
static constexpr int32_t THAI_PAIYANNOI = 0x0E2F;

// Repeat character
static constexpr int32_t THAI_MAIYAMOK = 0x0E46;

// Minimum word size
static constexpr int32_t THAI_MIN_WORD = 2;

// Minimum number of characters for two words
static constexpr int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
199
200	ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
201	: DictionaryBreakEngine (),
202	fDictionary(adoptDictionary)
203	{
204	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
205	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
206	UnicodeSet thaiWordSet(UnicodeString (u"[[:Thai:]&[:LineBreak=SA:]]"), status);
207	if (U_SUCCESS(status)) {
208	setCharacters(thaiWordSet);
209	}
210	fMarkSet.applyPattern(UnicodeString (u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
211	fMarkSet.add(`0x0020`);
212	fEndWordSet = thaiWordSet;
213	fEndWordSet.remove(`0x0E31`); // MAI HAN-AKAT
214	fEndWordSet.remove(`0x0E40`, `0x0E44`); // SARA E through SARA AI MAIMALAI
215	fBeginWordSet.add(`0x0E01`, `0x0E2E`); // KO KAI through HO NOKHUK
216	fBeginWordSet.add(`0x0E40`, `0x0E44`); // SARA E through SARA AI MAIMALAI
217	fSuffixSet.add(THAI_PAIYANNOI);
218	fSuffixSet.add(THAI_MAIYAMOK);
219
220	// Compact for caching.
221	fMarkSet.compact();
222	fEndWordSet.compact();
223	fBeginWordSet.compact();
224	fSuffixSet.compact();
225	UTRACE_EXIT_STATUS(status);
226	}
227
228	ThaiBreakEngine::~ThaiBreakEngine() {
229	delete fDictionary;
230	}
231
232	int32_t
233	ThaiBreakEngine::divideUpDictionaryRange( UText *text,
234	int32_t rangeStart,
235	int32_t rangeEnd,
236	UVector32 &foundBreaks,
237	UBool / isPhraseBreaking /,
238	UErrorCode& status) const {
239	if (U_FAILURE(status)) return `0`;
240	utext_setNativeIndex(text, rangeStart);
241	utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
242	if (utext_getNativeIndex(text) >= rangeEnd) {
243	return `0`; // Not enough characters for two words
244	}
245	utext_setNativeIndex(text, rangeStart);
246
247
248	uint32_t wordsFound = `0`;
249	int32_t cpWordLength = `0`; // Word Length in Code Points.
250	int32_t cuWordLength = `0`; // Word length in code units (UText native indexing)
251	int32_t current;
252	PossibleWord words[THAI_LOOKAHEAD];
253
254	utext_setNativeIndex(text, rangeStart);
255
256	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
257	cpWordLength = `0`;
258	cuWordLength = `0`;
259
260	// Look for candidate words at the current position
261	int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
262
263	// If we found exactly one, use that
264	if (candidates == `1`) {
265	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
266	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
267	wordsFound += `1`;
268	}
269	// If there was more than one, see which one can take us forward the most words
270	else if (candidates > `1`) {
271	// If we're already at the end of the range, we're done
272	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
273	goto foundBest;
274	}
275	do {
276	if (words[(wordsFound + `1`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
277	// Followed by another dictionary word; mark first word as a good candidate
278	words[wordsFound%THAI_LOOKAHEAD].markCurrent();
279
280	// If we're already at the end of the range, we're done
281	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
282	goto foundBest;
283	}
284
285	// See if any of the possible second words is followed by a third word
286	do {
287	// If we find a third word, stop right away
288	if (words[(wordsFound + `2`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
289	words[wordsFound % THAI_LOOKAHEAD].markCurrent();
290	goto foundBest;
291	}
292	}
293	while (words[(wordsFound + `1`) % THAI_LOOKAHEAD].backUp(text));
294	}
295	}
296	while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
297	foundBest:
298	// Set UText position to after the accepted word.
299	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
300	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
301	wordsFound += `1`;
302	}
303
304	// We come here after having either found a word or not. We look ahead to the
305	// next word. If it's not a dictionary word, we will combine it with the word we
306	// just found (if there is one), but only if the preceding word does not exceed
307	// the threshold.
308	// The text iterator should now be positioned at the end of the word we found.
309
310	UChar32 uc = `0`;
311	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
312	// if it is a dictionary word, do nothing. If it isn't, then if there is
313	// no preceding word, or the non-word shares less than the minimum threshold
314	// of characters with a dictionary word, then scan to resynchronize
315	if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
316	&& (cuWordLength == `0`
317	\|\| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
318	// Look for a plausible word boundary
319	int32_t remaining = rangeEnd - (current+cuWordLength);
320	UChar32 pc;
321	int32_t chars = `0`;
322	for (;;) {
323	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
324	pc = utext_next32(text);
325	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
326	chars += pcSize;
327	remaining -= pcSize;
328	if (remaining <= `0`) {
329	break;
330	}
331	uc = utext_current32(text);
332	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
333	// Maybe. See if it's in the dictionary.
334	// NOTE: In the original Apple code, checked that the next
335	// two characters after uc were not 0x0E4C THANTHAKHAT before
336	// checking the dictionary. That is just a performance filter,
337	// but it's not clear it's faster than checking the trie.
338	int32_t num_candidates = words[(wordsFound + `1`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
339	utext_setNativeIndex(text, current + cuWordLength + chars);
340	if (num_candidates > `0`) {
341	break;
342	}
343	}
344	}
345
346	// Bump the word count if there wasn't already one
347	if (cuWordLength <= `0`) {
348	wordsFound += `1`;
349	}
350
351	// Update the length with the passed-over characters
352	cuWordLength += chars;
353	}
354	else {
355	// Back up to where we were for next iteration
356	utext_setNativeIndex(text, current+cuWordLength);
357	}
358	}
359
360	// Never stop before a combining mark.
361	int32_t currPos;
362	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
363	utext_next32(text);
364	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
365	}
366
367	// Look ahead for possible suffixes if a dictionary word does not follow.
368	// We do this in code rather than using a rule so that the heuristic
369	// resynch continues to function. For example, one of the suffix characters
370	// could be a typo in the middle of a word.
371	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > `0`) {
372	if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
373	&& fSuffixSet.contains(uc = utext_current32(text))) {
374	if (uc == THAI_PAIYANNOI) {
375	if (!fSuffixSet.contains(utext_previous32(text))) {
376	// Skip over previous end and PAIYANNOI
377	utext_next32(text);
378	int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
379	utext_next32(text);
380	cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
381	uc = utext_current32(text); // Fetch next character
382	}
383	else {
384	// Restore prior position
385	utext_next32(text);
386	}
387	}
388	if (uc == THAI_MAIYAMOK) {
389	if (utext_previous32(text) != THAI_MAIYAMOK) {
390	// Skip over previous end and MAIYAMOK
391	utext_next32(text);
392	int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
393	utext_next32(text);
394	cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
395	}
396	else {
397	// Restore prior position
398	utext_next32(text);
399	}
400	}
401	}
402	else {
403	utext_setNativeIndex(text, current+cuWordLength);
404	}
405	}
406
407	// Did we find a word on this iteration? If so, push it on the break stack
408	if (cuWordLength > `0`) {
409	foundBreaks.push((current+cuWordLength), status);
410	}
411	}
412
413	// Don't return a break for the end of the dictionary range if there is one there.
414	if (foundBreaks.peeki() >= rangeEnd) {
415	(void) foundBreaks.popi();
416	wordsFound -= `1`;
417	}
418
419	return wordsFound;
420	}
421
422	/*
423	******************************************************************
424	* LaoBreakEngine
425	*/
426
// How many words in a row are "good enough"?
static constexpr int32_t LAO_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static constexpr int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static constexpr int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static constexpr int32_t LAO_MIN_WORD = 2;

// Minimum number of characters for two words
static constexpr int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
442
443	LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
444	: DictionaryBreakEngine (),
445	fDictionary(adoptDictionary)
446	{
447	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
448	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
449	UnicodeSet laoWordSet(UnicodeString (u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
450	if (U_SUCCESS(status)) {
451	setCharacters(laoWordSet);
452	}
453	fMarkSet.applyPattern(UnicodeString (u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
454	fMarkSet.add(`0x0020`);
455	fEndWordSet = laoWordSet;
456	fEndWordSet.remove(`0x0EC0`, `0x0EC4`); // prefix vowels
457	fBeginWordSet.add(`0x0E81`, `0x0EAE`); // basic consonants (including holes for corresponding Thai characters)
458	fBeginWordSet.add(`0x0EDC`, `0x0EDD`); // digraph consonants (no Thai equivalent)
459	fBeginWordSet.add(`0x0EC0`, `0x0EC4`); // prefix vowels
460
461	// Compact for caching.
462	fMarkSet.compact();
463	fEndWordSet.compact();
464	fBeginWordSet.compact();
465	UTRACE_EXIT_STATUS(status);
466	}
467
468	LaoBreakEngine::~LaoBreakEngine() {
469	delete fDictionary;
470	}
471
472	int32_t
473	LaoBreakEngine::divideUpDictionaryRange( UText *text,
474	int32_t rangeStart,
475	int32_t rangeEnd,
476	UVector32 &foundBreaks,
477	UBool / isPhraseBreaking /,
478	UErrorCode& status) const {
479	if (U_FAILURE(status)) return `0`;
480	if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
481	return `0`; // Not enough characters for two words
482	}
483
484	uint32_t wordsFound = `0`;
485	int32_t cpWordLength = `0`;
486	int32_t cuWordLength = `0`;
487	int32_t current;
488	PossibleWord words[LAO_LOOKAHEAD];
489
490	utext_setNativeIndex(text, rangeStart);
491
492	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
493	cuWordLength = `0`;
494	cpWordLength = `0`;
495
496	// Look for candidate words at the current position
497	int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
498
499	// If we found exactly one, use that
500	if (candidates == `1`) {
501	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
502	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
503	wordsFound += `1`;
504	}
505	// If there was more than one, see which one can take us forward the most words
506	else if (candidates > `1`) {
507	// If we're already at the end of the range, we're done
508	if (utext_getNativeIndex(text) >= rangeEnd) {
509	goto foundBest;
510	}
511	do {
512	if (words[(wordsFound + `1`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
513	// Followed by another dictionary word; mark first word as a good candidate
514	words[wordsFound%LAO_LOOKAHEAD].markCurrent();
515
516	// If we're already at the end of the range, we're done
517	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
518	goto foundBest;
519	}
520
521	// See if any of the possible second words is followed by a third word
522	do {
523	// If we find a third word, stop right away
524	if (words[(wordsFound + `2`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
525	words[wordsFound % LAO_LOOKAHEAD].markCurrent();
526	goto foundBest;
527	}
528	}
529	while (words[(wordsFound + `1`) % LAO_LOOKAHEAD].backUp(text));
530	}
531	}
532	while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
533	foundBest:
534	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
535	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
536	wordsFound += `1`;
537	}
538
539	// We come here after having either found a word or not. We look ahead to the
540	// next word. If it's not a dictionary word, we will combine it with the word we
541	// just found (if there is one), but only if the preceding word does not exceed
542	// the threshold.
543	// The text iterator should now be positioned at the end of the word we found.
544	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
545	// if it is a dictionary word, do nothing. If it isn't, then if there is
546	// no preceding word, or the non-word shares less than the minimum threshold
547	// of characters with a dictionary word, then scan to resynchronize
548	if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
549	&& (cuWordLength == `0`
550	\|\| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
551	// Look for a plausible word boundary
552	int32_t remaining = rangeEnd - (current + cuWordLength);
553	UChar32 pc;
554	UChar32 uc;
555	int32_t chars = `0`;
556	for (;;) {
557	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
558	pc = utext_next32(text);
559	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
560	chars += pcSize;
561	remaining -= pcSize;
562	if (remaining <= `0`) {
563	break;
564	}
565	uc = utext_current32(text);
566	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
567	// Maybe. See if it's in the dictionary.
568	// TODO: this looks iffy; compare with old code.
569	int32_t num_candidates = words[(wordsFound + `1`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
570	utext_setNativeIndex(text, current + cuWordLength + chars);
571	if (num_candidates > `0`) {
572	break;
573	}
574	}
575	}
576
577	// Bump the word count if there wasn't already one
578	if (cuWordLength <= `0`) {
579	wordsFound += `1`;
580	}
581
582	// Update the length with the passed-over characters
583	cuWordLength += chars;
584	}
585	else {
586	// Back up to where we were for next iteration
587	utext_setNativeIndex(text, current + cuWordLength);
588	}
589	}
590
591	// Never stop before a combining mark.
592	int32_t currPos;
593	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
594	utext_next32(text);
595	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
596	}
597
598	// Look ahead for possible suffixes if a dictionary word does not follow.
599	// We do this in code rather than using a rule so that the heuristic
600	// resynch continues to function. For example, one of the suffix characters
601	// could be a typo in the middle of a word.
602	// NOT CURRENTLY APPLICABLE TO LAO
603
604	// Did we find a word on this iteration? If so, push it on the break stack
605	if (cuWordLength > `0`) {
606	foundBreaks.push((current+cuWordLength), status);
607	}
608	}
609
610	// Don't return a break for the end of the dictionary range if there is one there.
611	if (foundBreaks.peeki() >= rangeEnd) {
612	(void) foundBreaks.popi();
613	wordsFound -= `1`;
614	}
615
616	return wordsFound;
617	}
618
619	/*
620	******************************************************************
621	* BurmeseBreakEngine
622	*/
623
// How many words in a row are "good enough"?
static constexpr int32_t BURMESE_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static constexpr int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static constexpr int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static constexpr int32_t BURMESE_MIN_WORD = 2;

// Minimum number of characters for two words
static constexpr int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
639
640	BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
641	: DictionaryBreakEngine (),
642	fDictionary(adoptDictionary)
643	{
644	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
645	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
646	fBeginWordSet.add(`0x1000`, `0x102A`); // basic consonants and independent vowels
647	fEndWordSet.applyPattern(UnicodeString (u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
648	fMarkSet.applyPattern(UnicodeString (u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
649	fMarkSet.add(`0x0020`);
650	if (U_SUCCESS(status)) {
651	setCharacters(fEndWordSet);
652	}
653
654	// Compact for caching.
655	fMarkSet.compact();
656	fEndWordSet.compact();
657	fBeginWordSet.compact();
658	UTRACE_EXIT_STATUS(status);
659	}
660
661	BurmeseBreakEngine::~BurmeseBreakEngine() {
662	delete fDictionary;
663	}
664
665	int32_t
666	BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
667	int32_t rangeStart,
668	int32_t rangeEnd,
669	UVector32 &foundBreaks,
670	UBool / isPhraseBreaking /,
671	UErrorCode& status ) const {
672	if (U_FAILURE(status)) return `0`;
673	if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
674	return `0`; // Not enough characters for two words
675	}
676
677	uint32_t wordsFound = `0`;
678	int32_t cpWordLength = `0`;
679	int32_t cuWordLength = `0`;
680	int32_t current;
681	PossibleWord words[BURMESE_LOOKAHEAD];
682
683	utext_setNativeIndex(text, rangeStart);
684
685	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
686	cuWordLength = `0`;
687	cpWordLength = `0`;
688
689	// Look for candidate words at the current position
690	int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
691
692	// If we found exactly one, use that
693	if (candidates == `1`) {
694	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
695	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
696	wordsFound += `1`;
697	}
698	// If there was more than one, see which one can take us forward the most words
699	else if (candidates > `1`) {
700	// If we're already at the end of the range, we're done
701	if (utext_getNativeIndex(text) >= rangeEnd) {
702	goto foundBest;
703	}
704	do {
705	if (words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
706	// Followed by another dictionary word; mark first word as a good candidate
707	words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
708
709	// If we're already at the end of the range, we're done
710	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
711	goto foundBest;
712	}
713
714	// See if any of the possible second words is followed by a third word
715	do {
716	// If we find a third word, stop right away
717	if (words[(wordsFound + `2`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
718	words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
719	goto foundBest;
720	}
721	}
722	while (words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].backUp(text));
723	}
724	}
725	while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
726	foundBest:
727	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
728	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
729	wordsFound += `1`;
730	}
731
732	// We come here after having either found a word or not. We look ahead to the
733	// next word. If it's not a dictionary word, we will combine it with the word we
734	// just found (if there is one), but only if the preceding word does not exceed
735	// the threshold.
736	// The text iterator should now be positioned at the end of the word we found.
737	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
738	// if it is a dictionary word, do nothing. If it isn't, then if there is
739	// no preceding word, or the non-word shares less than the minimum threshold
740	// of characters with a dictionary word, then scan to resynchronize
741	if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
742	&& (cuWordLength == `0`
743	\|\| words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
744	// Look for a plausible word boundary
745	int32_t remaining = rangeEnd - (current + cuWordLength);
746	UChar32 pc;
747	UChar32 uc;
748	int32_t chars = `0`;
749	for (;;) {
750	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
751	pc = utext_next32(text);
752	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
753	chars += pcSize;
754	remaining -= pcSize;
755	if (remaining <= `0`) {
756	break;
757	}
758	uc = utext_current32(text);
759	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
760	// Maybe. See if it's in the dictionary.
761	// TODO: this looks iffy; compare with old code.
762	int32_t num_candidates = words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
763	utext_setNativeIndex(text, current + cuWordLength + chars);
764	if (num_candidates > `0`) {
765	break;
766	}
767	}
768	}
769
770	// Bump the word count if there wasn't already one
771	if (cuWordLength <= `0`) {
772	wordsFound += `1`;
773	}
774
775	// Update the length with the passed-over characters
776	cuWordLength += chars;
777	}
778	else {
779	// Back up to where we were for next iteration
780	utext_setNativeIndex(text, current + cuWordLength);
781	}
782	}
783
784	// Never stop before a combining mark.
785	int32_t currPos;
786	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
787	utext_next32(text);
788	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
789	}
790
791	// Look ahead for possible suffixes if a dictionary word does not follow.
792	// We do this in code rather than using a rule so that the heuristic
793	// resynch continues to function. For example, one of the suffix characters
794	// could be a typo in the middle of a word.
795	// NOT CURRENTLY APPLICABLE TO BURMESE
796
797	// Did we find a word on this iteration? If so, push it on the break stack
798	if (cuWordLength > `0`) {
799	foundBreaks.push((current+cuWordLength), status);
800	}
801	}
802
803	// Don't return a break for the end of the dictionary range if there is one there.
804	if (foundBreaks.peeki() >= rangeEnd) {
805	(void) foundBreaks.popi();
806	wordsFound -= `1`;
807	}
808
809	return wordsFound;
810	}
811
812	/*
813	******************************************************************
814	* KhmerBreakEngine
815	*/
816
// How many words in a row are "good enough"?
static constexpr int32_t KHMER_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static constexpr int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static constexpr int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static constexpr int32_t KHMER_MIN_WORD = 2;

// Minimum number of characters for two words
static constexpr int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
832
833	KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
834	: DictionaryBreakEngine (),
835	fDictionary(adoptDictionary)
836	{
837	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
838	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
839	UnicodeSet khmerWordSet(UnicodeString (u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
840	if (U_SUCCESS(status)) {
841	setCharacters(khmerWordSet);
842	}
843	fMarkSet.applyPattern(UnicodeString (u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
844	fMarkSet.add(`0x0020`);
845	fEndWordSet = khmerWordSet;
846	fBeginWordSet.add(`0x1780`, `0x17B3`);
847	//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
848	//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
849	//fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
850	fEndWordSet.remove(`0x17D2`); // KHMER SIGN COENG that combines some following characters
851	//fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
852	// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
853	// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
854	// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
855	// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
856	// fSuffixSet.add(THAI_PAIYANNOI);
857	// fSuffixSet.add(THAI_MAIYAMOK);
858
859	// Compact for caching.
860	fMarkSet.compact();
861	fEndWordSet.compact();
862	fBeginWordSet.compact();
863	// fSuffixSet.compact();
864	UTRACE_EXIT_STATUS(status);
865	}
866
867	KhmerBreakEngine::~KhmerBreakEngine() {
868	delete fDictionary;
869	}
870
871	int32_t
872	KhmerBreakEngine::divideUpDictionaryRange( UText *text,
873	int32_t rangeStart,
874	int32_t rangeEnd,
875	UVector32 &foundBreaks,
876	UBool / isPhraseBreaking /,
877	UErrorCode& status ) const {
878	if (U_FAILURE(status)) return `0`;
879	if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
880	return `0`; // Not enough characters for two words
881	}
882
883	uint32_t wordsFound = `0`;
884	int32_t cpWordLength = `0`;
885	int32_t cuWordLength = `0`;
886	int32_t current;
887	PossibleWord words[KHMER_LOOKAHEAD];
888
889	utext_setNativeIndex(text, rangeStart);
890
891	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
892	cuWordLength = `0`;
893	cpWordLength = `0`;
894
895	// Look for candidate words at the current position
896	int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
897
898	// If we found exactly one, use that
899	if (candidates == `1`) {
900	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
901	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
902	wordsFound += `1`;
903	}
904
905	// If there was more than one, see which one can take us forward the most words
906	else if (candidates > `1`) {
907	// If we're already at the end of the range, we're done
908	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
909	goto foundBest;
910	}
911	do {
912	if (words[(wordsFound + `1`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
913	// Followed by another dictionary word; mark first word as a good candidate
914	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
915
916	// If we're already at the end of the range, we're done
917	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
918	goto foundBest;
919	}
920
921	// See if any of the possible second words is followed by a third word
922	do {
923	// If we find a third word, stop right away
924	if (words[(wordsFound + `2`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
925	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
926	goto foundBest;
927	}
928	}
929	while (words[(wordsFound + `1`) % KHMER_LOOKAHEAD].backUp(text));
930	}
931	}
932	while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
933	foundBest:
934	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
935	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
936	wordsFound += `1`;
937	}
938
939	// We come here after having either found a word or not. We look ahead to the
940	// next word. If it's not a dictionary word, we will combine it with the word we
941	// just found (if there is one), but only if the preceding word does not exceed
942	// the threshold.
943	// The text iterator should now be positioned at the end of the word we found.
944	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
945	// if it is a dictionary word, do nothing. If it isn't, then if there is
946	// no preceding word, or the non-word shares less than the minimum threshold
947	// of characters with a dictionary word, then scan to resynchronize
948	if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
949	&& (cuWordLength == `0`
950	\|\| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
951	// Look for a plausible word boundary
952	int32_t remaining = rangeEnd - (current+cuWordLength);
953	UChar32 pc;
954	UChar32 uc;
955	int32_t chars = `0`;
956	for (;;) {
957	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
958	pc = utext_next32(text);
959	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
960	chars += pcSize;
961	remaining -= pcSize;
962	if (remaining <= `0`) {
963	break;
964	}
965	uc = utext_current32(text);
966	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
967	// Maybe. See if it's in the dictionary.
968	int32_t num_candidates = words[(wordsFound + `1`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
969	utext_setNativeIndex(text, current+cuWordLength+chars);
970	if (num_candidates > `0`) {
971	break;
972	}
973	}
974	}
975
976	// Bump the word count if there wasn't already one
977	if (cuWordLength <= `0`) {
978	wordsFound += `1`;
979	}
980
981	// Update the length with the passed-over characters
982	cuWordLength += chars;
983	}
984	else {
985	// Back up to where we were for next iteration
986	utext_setNativeIndex(text, current+cuWordLength);
987	}
988	}
989
990	// Never stop before a combining mark.
991	int32_t currPos;
992	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
993	utext_next32(text);
994	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
995	}
996
997	// Look ahead for possible suffixes if a dictionary word does not follow.
998	// We do this in code rather than using a rule so that the heuristic
999	// resynch continues to function. For example, one of the suffix characters
1000	// could be a typo in the middle of a word.
1001	// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1002	// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1003	// && fSuffixSet.contains(uc = utext_current32(text))) {
1004	// if (uc == KHMER_PAIYANNOI) {
1005	// if (!fSuffixSet.contains(utext_previous32(text))) {
1006	// // Skip over previous end and PAIYANNOI
1007	// utext_next32(text);
1008	// utext_next32(text);
1009	// wordLength += 1; // Add PAIYANNOI to word
1010	// uc = utext_current32(text); // Fetch next character
1011	// }
1012	// else {
1013	// // Restore prior position
1014	// utext_next32(text);
1015	// }
1016	// }
1017	// if (uc == KHMER_MAIYAMOK) {
1018	// if (utext_previous32(text) != KHMER_MAIYAMOK) {
1019	// // Skip over previous end and MAIYAMOK
1020	// utext_next32(text);
1021	// utext_next32(text);
1022	// wordLength += 1; // Add MAIYAMOK to word
1023	// }
1024	// else {
1025	// // Restore prior position
1026	// utext_next32(text);
1027	// }
1028	// }
1029	// }
1030	// else {
1031	// utext_setNativeIndex(text, current+wordLength);
1032	// }
1033	// }
1034
1035	// Did we find a word on this iteration? If so, push it on the break stack
1036	if (cuWordLength > `0`) {
1037	foundBreaks.push((current+cuWordLength), status);
1038	}
1039	}
1040
1041	// Don't return a break for the end of the dictionary range if there is one there.
1042	if (foundBreaks.peeki() >= rangeEnd) {
1043	(void) foundBreaks.popi();
1044	wordsFound -= `1`;
1045	}
1046
1047	return wordsFound;
1048	}
1049
1050	#if !UCONFIG_NO_NORMALIZATION
1051	/*
1052	******************************************************************
1053	* CjkBreakEngine
1054	*/
// Largest uint32_t value; used by CjkBreakEngine as the "no segmentation
// reaches this position" marker in its best-cost table.
static const uint32_t kuint32max = 0xFFFFFFFF;
1056	CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1057	: DictionaryBreakEngine (), fDictionary(adoptDictionary), isCj(false) {
1058	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1059	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1060	fMlBreakEngine = nullptr;
1061	nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1062	// Korean dictionary only includes Hangul syllables
1063	fHangulWordSet.applyPattern(UnicodeString (u"[\\uac00-\\ud7a3]"), status);
1064	fHangulWordSet.compact();
1065	// Digits, open puncutation and Alphabetic characters.
1066	fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
1067	UnicodeString (u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
1068	fDigitOrOpenPunctuationOrAlphabetSet.compact();
1069	fClosePunctuationSet.applyPattern(UnicodeString (u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
1070	fClosePunctuationSet.compact();
1071
1072	// handle Korean and Japanese/Chinese using different dictionaries
1073	if (type == kKorean) {
1074	if (U_SUCCESS(status)) {
1075	setCharacters(fHangulWordSet);
1076	}
1077	} else { // Chinese and Japanese
1078	UnicodeSet cjSet(UnicodeString (u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
1079	isCj = true;
1080	if (U_SUCCESS(status)) {
1081	setCharacters(cjSet);
1082	#if UCONFIG_USE_ML_PHRASE_BREAKING
1083	fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
1084	fClosePunctuationSet, status);
1085	if (fMlBreakEngine == nullptr) {
1086	status = U_MEMORY_ALLOCATION_ERROR;
1087	}
1088	#else
1089	initJapanesePhraseParameter(status);
1090	#endif
1091	}
1092	}
1093	UTRACE_EXIT_STATUS(status);
1094	}
1095
1096	CjkBreakEngine::~CjkBreakEngine(){
1097	delete fDictionary;
1098	delete fMlBreakEngine;
1099	}
1100
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = 8;
static const int32_t kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Returns the default cost of treating a run of wordLength Katakana
// characters as one word. Lengths outside [0, kMaxKatakanaLength] get the
// highest cost, 8192; rejecting negative lengths keeps the table lookup in
// bounds.
static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    if (wordLength < 0 || wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
1113
1114	static inline bool isKatakana(UChar32 value) {
1115	return (value >= `0x30A1` && value <= `0x30FE` && value != `0x30FB`) \|\|
1116	(value >= `0xFF66` && value <= `0xFF9f`);
1117	}
1118
// Function for accessing internal utext flags.
//   Replicates an internal UText function.
// Returns an int32_t with only bit number bitIndex set.

static inline int32_t utext_i32_flag(int32_t bitIndex) {
    return static_cast<int32_t>(1) << bitIndex;
}
1125
1126	/*
1127	* @param text A UText representing the text
1128	* @param rangeStart The start of the range of dictionary characters
1129	* @param rangeEnd The end of the range of dictionary characters
1130	* @param foundBreaks vector<int32> to receive the break positions
1131	* @return The number of breaks found
1132	*/
1133	int32_t
1134	CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1135	int32_t rangeStart,
1136	int32_t rangeEnd,
1137	UVector32 &foundBreaks,
1138	UBool isPhraseBreaking,
1139	UErrorCode& status) const {
1140	if (U_FAILURE(status)) return `0`;
1141	if (rangeStart >= rangeEnd) {
1142	return `0`;
1143	}
1144
1145	// UnicodeString version of input UText, NFKC normalized if necessary.
1146	UnicodeString inString;
1147
1148	// inputMap[inStringIndex] = corresponding native index from UText inText.
1149	// If nullptr then mapping is 1:1
1150	LocalPointer<UVector32> inputMap;
1151
1152	// if UText has the input string as one contiguous UTF-16 chunk
1153	if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1154	inText->chunkNativeStart <= rangeStart &&
1155	inText->chunkNativeLimit >= rangeEnd &&
1156	inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1157
1158	// Input UText is in one contiguous UTF-16 chunk.
1159	// Use Read-only aliasing UnicodeString.
1160	inString.setTo(false,
1161	inText->chunkContents + rangeStart - inText->chunkNativeStart,
1162	rangeEnd - rangeStart);
1163	} else {
1164	// Copy the text from the original inText (UText) to inString (UnicodeString).
1165	// Create a map from UnicodeString indices -> UText offsets.
1166	utext_setNativeIndex(inText, rangeStart);
1167	int32_t limit = rangeEnd;
1168	U_ASSERT(limit <= utext_nativeLength(inText));
1169	if (limit > utext_nativeLength(inText)) {
1170	limit = (int32_t)utext_nativeLength(inText);
1171	}
1172	inputMap.adoptInsteadAndCheckErrorCode(new UVector32 (status), status);
1173	if (U_FAILURE(status)) {
1174	return `0`;
1175	}
1176	while (utext_getNativeIndex(inText) < limit) {
1177	int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1178	UChar32 c = utext_next32(inText);
1179	U_ASSERT(c != U_SENTINEL);
1180	inString.append(c);
1181	while (inputMap ->size() < inString.length()) {
1182	inputMap ->addElement(nativePosition, status);
1183	}
1184	}
1185	inputMap ->addElement(limit, status);
1186	}
1187
1188
1189	if (!nfkcNorm2->isNormalized(inString, status)) {
1190	UnicodeString normalizedInput;
1191	// normalizedMap[normalizedInput position] == original UText position.
1192	LocalPointer<UVector32> normalizedMap(new UVector32 (status), status);
1193	if (U_FAILURE(status)) {
1194	return `0`;
1195	}
1196
1197	UnicodeString fragment;
1198	UnicodeString normalizedFragment;
1199	for (int32_t srcI = `0`; srcI < inString.length();) { // Once per normalization chunk
1200	fragment.remove();
1201	int32_t fragmentStartI = srcI;
1202	UChar32 c = inString.char32At(srcI);
1203	for (;;) {
1204	fragment.append(c);
1205	srcI = inString.moveIndex32(srcI, `1`);
1206	if (srcI == inString.length()) {
1207	break;
1208	}
1209	c = inString.char32At(srcI);
1210	if (nfkcNorm2->hasBoundaryBefore(c)) {
1211	break;
1212	}
1213	}
1214	nfkcNorm2->normalize(fragment, normalizedFragment, status);
1215	normalizedInput.append(normalizedFragment);
1216
1217	// Map every position in the normalized chunk to the start of the chunk
1218	// in the original input.
1219	int32_t fragmentOriginalStart = inputMap.isValid() ?
1220	inputMap ->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1221	while (normalizedMap ->size() < normalizedInput.length()) {
1222	normalizedMap ->addElement(fragmentOriginalStart, status);
1223	if (U_FAILURE(status)) {
1224	break;
1225	}
1226	}
1227	}
1228	U_ASSERT(normalizedMap->size() == normalizedInput.length());
1229	int32_t nativeEnd = inputMap.isValid() ?
1230	inputMap ->elementAti(inString.length()) : inString.length()+rangeStart;
1231	normalizedMap ->addElement(nativeEnd, status);
1232
1233	inputMap = std::move(normalizedMap);
1234	inString = std::move(normalizedInput);
1235	}
1236
1237	int32_t numCodePts = inString.countChar32();
1238	if (numCodePts != inString.length()) {
1239	// There are supplementary characters in the input.
1240	// The dictionary will produce boundary positions in terms of code point indexes,
1241	// not in terms of code unit string indexes.
1242	// Use the inputMap mechanism to take care of this in addition to indexing differences
1243	// from normalization and/or UTF-8 input.
1244	UBool hadExistingMap = inputMap.isValid();
1245	if (!hadExistingMap) {
1246	inputMap.adoptInsteadAndCheckErrorCode(new UVector32 (status), status);
1247	if (U_FAILURE(status)) {
1248	return `0`;
1249	}
1250	}
1251	int32_t cpIdx = `0`;
1252	for (int32_t cuIdx = `0`; ; cuIdx = inString.moveIndex32(cuIdx, `1`)) {
1253	U_ASSERT(cuIdx >= cpIdx);
1254	if (hadExistingMap) {
1255	inputMap ->setElementAt(inputMap ->elementAti(cuIdx), cpIdx);
1256	} else {
1257	inputMap ->addElement(cuIdx+rangeStart, status);
1258	}
1259	cpIdx++;
1260	if (cuIdx == inString.length()) {
1261	break;
1262	}
1263	}
1264	}
1265
1266	#if UCONFIG_USE_ML_PHRASE_BREAKING
1267	// PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
1268	if (isPhraseBreaking && isCj) {
1269	return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
1270	inputMap, status);
1271	}
1272	#endif
1273
1274	// bestSnlp[i] is the snlp of the best segmentation of the first i
1275	// code points in the range to be matched.
1276	UVector32 bestSnlp(numCodePts + `1`, status);
1277	bestSnlp.addElement(`0`, status);
1278	for(int32_t i = `1`; i <= numCodePts; i++) {
1279	bestSnlp.addElement(kuint32max, status);
1280	}
1281
1282
1283	// prev[i] is the index of the last CJK code point in the previous word in
1284	// the best segmentation of the first i characters.
1285	UVector32 prev(numCodePts + `1`, status);
1286	for(int32_t i = `0`; i <= numCodePts; i++){
1287	prev.addElement(-`1`, status);
1288	}
1289
1290	const int32_t maxWordSize = `20`;
1291	UVector32 values(numCodePts, status);
1292	values.setSize(numCodePts);
1293	UVector32 lengths(numCodePts, status);
1294	lengths.setSize(numCodePts);
1295
1296	UText fu = UTEXT_INITIALIZER;
1297	utext_openUnicodeString(&fu, &inString, &status);
1298
1299	// Dynamic programming to find the best segmentation.
1300
1301	// In outer loop, i is the code point index,
1302	// ix is the corresponding string (code unit) index.
1303	// They differ when the string contains supplementary characters.
1304	int32_t ix = `0`;
1305	bool is_prev_katakana = false;
1306	for (int32_t i = `0`; i < numCodePts; ++i, ix = inString.moveIndex32(ix, `1`)) {
1307	if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1308	continue;
1309	}
1310
1311	int32_t count;
1312	utext_setNativeIndex(&fu, ix);
1313	count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1314	nullptr, lengths.getBuffer(), values.getBuffer(), nullptr);
1315	// Note: lengths is filled with code point lengths
1316	// The nullptr parameter is the ignored code unit lengths.
1317
1318	// if there are no single character matches found in the dictionary
1319	// starting with this character, treat character as a 1-character word
1320	// with the highest value possible, i.e. the least likely to occur.
1321	// Exclude Korean characters from this treatment, as they should be left
1322	// together by default.
1323	if ((count == `0` \|\| lengths.elementAti(`0`) != `1`) &&
1324	!fHangulWordSet.contains(inString.char32At(ix))) {
1325	values.setElementAt(maxSnlp, count); // 255
1326	lengths.setElementAt(`1`, count++);
1327	}
1328
1329	for (int32_t j = `0`; j < count; j++) {
1330	uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1331	int32_t ln_j_i = lengths.elementAti(j) + i;
1332	if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1333	bestSnlp.setElementAt(newSnlp, ln_j_i);
1334	prev.setElementAt(i, ln_j_i);
1335	}
1336	}
1337
1338	// In Japanese,
1339	// Katakana word in single character is pretty rare. So we apply
1340	// the following heuristic to Katakana: any continuous run of Katakana
1341	// characters is considered a candidate word with a default cost
1342	// specified in the katakanaCost table according to its length.
1343
1344	bool is_katakana = isKatakana(inString.char32At(ix));
1345	int32_t katakanaRunLength = `1`;
1346	if (!is_prev_katakana && is_katakana) {
1347	int32_t j = inString.moveIndex32(ix, `1`);
1348	// Find the end of the continuous run of Katakana characters
1349	while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1350	isKatakana(inString.char32At(j))) {
1351	j = inString.moveIndex32(j, `1`);
1352	katakanaRunLength++;
1353	}
1354	if (katakanaRunLength < kMaxKatakanaGroupLength) {
1355	uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1356	if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1357	bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1358	prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1359	}
1360	}
1361	}
1362	is_prev_katakana = is_katakana;
1363	}
1364	utext_close(&fu);
1365
1366	// Start pushing the optimal offset index into t_boundary (t for tentative).
1367	// prev[numCodePts] is guaranteed to be meaningful.
1368	// We'll first push in the reverse order, i.e.,
1369	// t_boundary[0] = numCodePts, and afterwards do a swap.
1370	UVector32 t_boundary(numCodePts+`1`, status);
1371
1372	int32_t numBreaks = `0`;
1373	// No segmentation found, set boundary to end of range
1374	if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1375	t_boundary.addElement(numCodePts, status);
1376	numBreaks++;
1377	} else if (isPhraseBreaking) {
1378	t_boundary.addElement(numCodePts, status);
1379	if(U_SUCCESS(status)) {
1380	numBreaks++;
1381	int32_t prevIdx = numCodePts;
1382
1383	int32_t codeUnitIdx = -`1`;
1384	int32_t prevCodeUnitIdx = -`1`;
1385	int32_t length = -`1`;
1386	for (int32_t i = prev.elementAti(numCodePts); i > `0`; i = prev.elementAti(i)) {
1387	codeUnitIdx = inString.moveIndex32(`0`, i);
1388	prevCodeUnitIdx = inString.moveIndex32(`0`, prevIdx);
1389	// Calculate the length by using the code unit.
1390	length = prevCodeUnitIdx - codeUnitIdx;
1391	prevIdx = i;
1392	// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
1393	// characters don't occur.
1394	if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
1395	&& (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -`1`)))
1396	\|\| !isKatakana(inString.char32At(codeUnitIdx)))) {
1397	t_boundary.addElement(i, status);
1398	numBreaks++;
1399	}
1400	}
1401	}
1402	} else {
1403	for (int32_t i = numCodePts; i > `0`; i = prev.elementAti(i)) {
1404	t_boundary.addElement(i, status);
1405	numBreaks++;
1406	}
1407	U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - `1`)) == `0`);
1408	}
1409
1410	// Add a break for the start of the dictionary range if there is not one
1411	// there already.
1412	if (foundBreaks.size() == `0` \|\| foundBreaks.peeki() < rangeStart) {
1413	t_boundary.addElement(`0`, status);
1414	numBreaks++;
1415	}
1416
1417	// Now that we're done, convert positions in t_boundary[] (indices in
1418	// the normalized input string) back to indices in the original input UText
1419	// while reversing t_boundary and pushing values to foundBreaks.
1420	int32_t prevCPPos = -`1`;
1421	int32_t prevUTextPos = -`1`;
1422	int32_t correctedNumBreaks = `0`;
1423	for (int32_t i = numBreaks - `1`; i >= `0`; i--) {
1424	int32_t cpPos = t_boundary.elementAti(i);
1425	U_ASSERT(cpPos > prevCPPos);
1426	int32_t utextPos = inputMap.isValid() ? inputMap ->elementAti(cpPos) : cpPos + rangeStart;
1427	U_ASSERT(utextPos >= prevUTextPos);
1428	if (utextPos > prevUTextPos) {
1429	// Boundaries are added to foundBreaks output in ascending order.
1430	U_ASSERT(foundBreaks.size() == `0` \|\| foundBreaks.peeki() < utextPos);
1431	// In phrase breaking, there has to be a breakpoint between Cj character and close
1432	// punctuation.
1433	// E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
1434	if (utextPos != rangeStart
1435	\|\| (isPhraseBreaking && utextPos > `0`
1436	&& fClosePunctuationSet.contains(utext_char32At(inText, utextPos - `1`)))) {
1437	foundBreaks.push(utextPos, status);
1438	correctedNumBreaks++;
1439	}
1440	} else {
1441	// Normalization expanded the input text, the dictionary found a boundary
1442	// within the expansion, giving two boundaries with the same index in the
1443	// original text. Ignore the second. See ticket #12918.
1444	--numBreaks;
1445	}
1446	prevCPPos = cpPos;
1447	prevUTextPos = utextPos;
1448	}
1449	(void)prevCPPos; // suppress compiler warnings about unused variable
1450
1451	UChar32 nextChar = utext_char32At(inText, rangeEnd);
1452	if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
1453	// In phrase breaking, there has to be a breakpoint between Cj character and
1454	// the number/open punctuation.
1455	// E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
1456	// E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
1457	// E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
1458	if (isPhraseBreaking) {
1459	if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
1460	foundBreaks.popi();
1461	correctedNumBreaks--;
1462	}
1463	} else {
1464	foundBreaks.popi();
1465	correctedNumBreaks--;
1466	}
1467	}
1468
1469	// inString goes out of scope
1470	// inputMap goes out of scope
1471	return correctedNumBreaks;
1472	}
1473
1474	void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
1475	loadJapaneseExtensions(error);
1476	loadHiragana(error);
1477	}
1478
1479	void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
1480	const char* tag = "extensions";
1481	ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
1482	if (U_SUCCESS(error)) {
1483	ResourceBundle bundle = ja.get(tag, error);
1484	while (U_SUCCESS(error) && bundle.hasNext()) {
1485	fSkipSet.puti(bundle.getNextString(error), `1`, error);
1486	}
1487	}
1488	}
1489
1490	void CjkBreakEngine::loadHiragana(UErrorCode& error) {
1491	UnicodeSet hiraganaWordSet(UnicodeString (u"[:Hiragana:]"), error);
1492	hiraganaWordSet.compact();
1493	UnicodeSetIterator iterator(hiraganaWordSet);
1494	while (iterator.next()) {
1495	fSkipSet.puti(UnicodeString (iterator.getCodepoint()), `1`, error);
1496	}
1497	}
1498	#endif
1499
1500	U_NAMESPACE_END
1501
1502	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1503
1504

Browse the source code of Godot/thirdparty/icu4c/common/dictbe.cpp