dictbe.cpp source code [engine/third_party/icu/source/common/dictbe.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/**
4	*******************************************************************************
5	* Copyright (C) 2006-2016, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	*******************************************************************************
8	*/
9
10	#include <utility>
11
12	#include "unicode/utypes.h"
13
14	#if !UCONFIG_NO_BREAK_ITERATION
15
16	#include "brkeng.h"
17	#include "dictbe.h"
18	#include "unicode/uniset.h"
19	#include "unicode/chariter.h"
20	#include "unicode/ubrk.h"
21	#include "utracimp.h"
22	#include "uvectr32.h"
23	#include "uvector.h"
24	#include "uassert.h"
25	#include "unicode/normlzr.h"
26	#include "cmemory.h"
27	#include "dictionarydata.h"
28
29	U_NAMESPACE_BEGIN
30
31	/*
32	******************************************************************
33	*/
34
35	DictionaryBreakEngine::DictionaryBreakEngine() {
36	}
37
38	DictionaryBreakEngine::~DictionaryBreakEngine() {
39	}
40
41	UBool
42	DictionaryBreakEngine::handles(UChar32 c) const {
43	return fSet.contains(c);
44	}
45
46	int32_t
47	DictionaryBreakEngine::findBreaks( UText *text,
48	int32_t startPos,
49	int32_t endPos,
50	UVector32 &foundBreaks ) const {
51	(void)startPos; // TODO: remove this param?
52	int32_t result = `0`;
53
54	// Find the span of characters included in the set.
55	// The span to break begins at the current position in the text, and
56	// extends towards the start or end of the text, depending on 'reverse'.
57
58	int32_t start = (int32_t)utext_getNativeIndex(text);
59	int32_t current;
60	int32_t rangeStart;
61	int32_t rangeEnd;
62	UChar32 c = utext_current32(text);
63	while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
64	utext_next32(text); // TODO: recast loop for postincrement
65	c = utext_current32(text);
66	}
67	rangeStart = start;
68	rangeEnd = current;
69	result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
70	utext_setNativeIndex(text, current);
71
72	return result;
73	}
74
75	void
76	DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
77	fSet = set;
78	// Compact for caching
79	fSet.compact();
80	}
81
82	/*
83	******************************************************************
84	* PossibleWord
85	*/
86
87	// Helper class for improving readability of the Thai/Lao/Khmer word break
88	// algorithm. The implementation is completely inline.
89
90	// List size, limited by the maximum number of words in the dictionary
91	// that form a nested sequence.
92	static const int32_t POSSIBLE_WORD_LIST_MAX = `20`;
93
94	class PossibleWord {
95	private:
96	// list of word candidate lengths, in increasing length order
97	// TODO: bytes would be sufficient for word lengths.
98	int32_t count; // Count of candidates
99	int32_t prefix; // The longest match with a dictionary word
100	int32_t offset; // Offset in the text of these candidates
101	int32_t mark; // The preferred candidate's offset
102	int32_t current; // The candidate we're currently looking at
103	int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
104	int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
105
106	public:
107	PossibleWord() : count(`0`), prefix(`0`), offset(-`1`), mark(`0`), current(`0`) {}
108	~PossibleWord() {}
109
110	// Fill the list of candidates if needed, select the longest, and return the number found
111	int32_t candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd );
112
113	// Select the currently marked candidate, point after it in the text, and invalidate self
114	int32_t acceptMarked( UText *text );
115
116	// Back up from the current candidate to the next shorter one; return TRUE if that exists
117	// and point the text after it
118	UBool backUp( UText *text );
119
120	// Return the longest prefix this candidate location shares with a dictionary word
121	// Return value is in code points.
122	int32_t longestPrefix() { return prefix; }
123
124	// Mark the current candidate as the one we like
125	void markCurrent() { mark = current; }
126
127	// Get length in code points of the marked word.
128	int32_t markedCPLength() { return cpLengths[mark]; }
129	};
130
131
132	int32_t PossibleWord::candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd ) {
133	// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
134	int32_t start = (int32_t)utext_getNativeIndex(text);
135	if (start != offset) {
136	offset = start;
137	count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
138	// Dictionary leaves text after longest prefix, not longest word. Back up.
139	if (count <= `0`) {
140	utext_setNativeIndex(text, start);
141	}
142	}
143	if (count > `0`) {
144	utext_setNativeIndex(text, start+cuLengths[count-`1`]);
145	}
146	current = count-`1`;
147	mark = current;
148	return count;
149	}
150
151	int32_t
152	PossibleWord::acceptMarked( UText *text ) {
153	utext_setNativeIndex(text, offset + cuLengths[mark]);
154	return cuLengths[mark];
155	}
156
157
158	UBool
159	PossibleWord::backUp( UText *text ) {
160	if (current > `0`) {
161	utext_setNativeIndex(text, offset + cuLengths[--current]);
162	return TRUE;
163	}
164	return FALSE;
165	}
166
167	/*
168	******************************************************************
169	* ThaiBreakEngine
170	*/
171
172	// How many words in a row are "good enough"?
173	static const int32_t THAI_LOOKAHEAD = `3`;
174
175	// Will not combine a non-word with a preceding dictionary word longer than this
176	static const int32_t THAI_ROOT_COMBINE_THRESHOLD = `3`;
177
178	// Will not combine a non-word that shares at least this much prefix with a
179	// dictionary word, with a preceding word
180	static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = `3`;
181
182	// Ellision character
183	static const int32_t THAI_PAIYANNOI = `0x0E2F`;
184
185	// Repeat character
186	static const int32_t THAI_MAIYAMOK = `0x0E46`;
187
188	// Minimum word size
189	static const int32_t THAI_MIN_WORD = `2`;
190
191	// Minimum number of characters for two words
192	static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * `2`;
193
194	ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
195	: DictionaryBreakEngine (),
196	fDictionary(adoptDictionary)
197	{
198	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
199	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
200	fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
201	if (U_SUCCESS(status)) {
202	setCharacters(fThaiWordSet);
203	}
204	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
205	fMarkSet.add(`0x0020`);
206	fEndWordSet = fThaiWordSet;
207	fEndWordSet.remove(`0x0E31`); // MAI HAN-AKAT
208	fEndWordSet.remove(`0x0E40`, `0x0E44`); // SARA E through SARA AI MAIMALAI
209	fBeginWordSet.add(`0x0E01`, `0x0E2E`); // KO KAI through HO NOKHUK
210	fBeginWordSet.add(`0x0E40`, `0x0E44`); // SARA E through SARA AI MAIMALAI
211	fSuffixSet.add(THAI_PAIYANNOI);
212	fSuffixSet.add(THAI_MAIYAMOK);
213
214	// Compact for caching.
215	fMarkSet.compact();
216	fEndWordSet.compact();
217	fBeginWordSet.compact();
218	fSuffixSet.compact();
219	UTRACE_EXIT_STATUS(status);
220	}
221
222	ThaiBreakEngine::~ThaiBreakEngine() {
223	delete fDictionary;
224	}
225
226	int32_t
227	ThaiBreakEngine::divideUpDictionaryRange( UText *text,
228	int32_t rangeStart,
229	int32_t rangeEnd,
230	UVector32 &foundBreaks ) const {
231	utext_setNativeIndex(text, rangeStart);
232	utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
233	if (utext_getNativeIndex(text) >= rangeEnd) {
234	return `0`; // Not enough characters for two words
235	}
236	utext_setNativeIndex(text, rangeStart);
237
238
239	uint32_t wordsFound = `0`;
240	int32_t cpWordLength = `0`; // Word Length in Code Points.
241	int32_t cuWordLength = `0`; // Word length in code units (UText native indexing)
242	int32_t current;
243	UErrorCode status = U_ZERO_ERROR;
244	PossibleWord words[THAI_LOOKAHEAD];
245
246	utext_setNativeIndex(text, rangeStart);
247
248	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
249	cpWordLength = `0`;
250	cuWordLength = `0`;
251
252	// Look for candidate words at the current position
253	int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
254
255	// If we found exactly one, use that
256	if (candidates == `1`) {
257	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
258	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
259	wordsFound += `1`;
260	}
261	// If there was more than one, see which one can take us forward the most words
262	else if (candidates > `1`) {
263	// If we're already at the end of the range, we're done
264	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
265	goto foundBest;
266	}
267	do {
268	int32_t wordsMatched = `1`;
269	if (words[(wordsFound + `1`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
270	if (wordsMatched < `2`) {
271	// Followed by another dictionary word; mark first word as a good candidate
272	words[wordsFound%THAI_LOOKAHEAD].markCurrent();
273	wordsMatched = `2`;
274	}
275
276	// If we're already at the end of the range, we're done
277	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
278	goto foundBest;
279	}
280
281	// See if any of the possible second words is followed by a third word
282	do {
283	// If we find a third word, stop right away
284	if (words[(wordsFound + `2`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
285	words[wordsFound % THAI_LOOKAHEAD].markCurrent();
286	goto foundBest;
287	}
288	}
289	while (words[(wordsFound + `1`) % THAI_LOOKAHEAD].backUp(text));
290	}
291	}
292	while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
293	foundBest:
294	// Set UText position to after the accepted word.
295	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
296	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
297	wordsFound += `1`;
298	}
299
300	// We come here after having either found a word or not. We look ahead to the
301	// next word. If it's not a dictionary word, we will combine it with the word we
302	// just found (if there is one), but only if the preceding word does not exceed
303	// the threshold.
304	// The text iterator should now be positioned at the end of the word we found.
305
306	UChar32 uc = `0`;
307	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
308	// if it is a dictionary word, do nothing. If it isn't, then if there is
309	// no preceding word, or the non-word shares less than the minimum threshold
310	// of characters with a dictionary word, then scan to resynchronize
311	if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
312	&& (cuWordLength == `0`
313	\|\| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
314	// Look for a plausible word boundary
315	int32_t remaining = rangeEnd - (current+cuWordLength);
316	UChar32 pc;
317	int32_t chars = `0`;
318	for (;;) {
319	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
320	pc = utext_next32(text);
321	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
322	chars += pcSize;
323	remaining -= pcSize;
324	if (remaining <= `0`) {
325	break;
326	}
327	uc = utext_current32(text);
328	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
329	// Maybe. See if it's in the dictionary.
330	// NOTE: In the original Apple code, checked that the next
331	// two characters after uc were not 0x0E4C THANTHAKHAT before
332	// checking the dictionary. That is just a performance filter,
333	// but it's not clear it's faster than checking the trie.
334	int32_t num_candidates = words[(wordsFound + `1`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
335	utext_setNativeIndex(text, current + cuWordLength + chars);
336	if (num_candidates > `0`) {
337	break;
338	}
339	}
340	}
341
342	// Bump the word count if there wasn't already one
343	if (cuWordLength <= `0`) {
344	wordsFound += `1`;
345	}
346
347	// Update the length with the passed-over characters
348	cuWordLength += chars;
349	}
350	else {
351	// Back up to where we were for next iteration
352	utext_setNativeIndex(text, current+cuWordLength);
353	}
354	}
355
356	// Never stop before a combining mark.
357	int32_t currPos;
358	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
359	utext_next32(text);
360	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
361	}
362
363	// Look ahead for possible suffixes if a dictionary word does not follow.
364	// We do this in code rather than using a rule so that the heuristic
365	// resynch continues to function. For example, one of the suffix characters
366	// could be a typo in the middle of a word.
367	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > `0`) {
368	if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
369	&& fSuffixSet.contains(uc = utext_current32(text))) {
370	if (uc == THAI_PAIYANNOI) {
371	if (!fSuffixSet.contains(utext_previous32(text))) {
372	// Skip over previous end and PAIYANNOI
373	utext_next32(text);
374	int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
375	utext_next32(text);
376	cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
377	uc = utext_current32(text); // Fetch next character
378	}
379	else {
380	// Restore prior position
381	utext_next32(text);
382	}
383	}
384	if (uc == THAI_MAIYAMOK) {
385	if (utext_previous32(text) != THAI_MAIYAMOK) {
386	// Skip over previous end and MAIYAMOK
387	utext_next32(text);
388	int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
389	utext_next32(text);
390	cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
391	}
392	else {
393	// Restore prior position
394	utext_next32(text);
395	}
396	}
397	}
398	else {
399	utext_setNativeIndex(text, current+cuWordLength);
400	}
401	}
402
403	// Did we find a word on this iteration? If so, push it on the break stack
404	if (cuWordLength > `0`) {
405	foundBreaks.push((current+cuWordLength), status);
406	}
407	}
408
409	// Don't return a break for the end of the dictionary range if there is one there.
410	if (foundBreaks.peeki() >= rangeEnd) {
411	(void) foundBreaks.popi();
412	wordsFound -= `1`;
413	}
414
415	return wordsFound;
416	}
417
418	/*
419	******************************************************************
420	* LaoBreakEngine
421	*/
422
423	// How many words in a row are "good enough"?
424	static const int32_t LAO_LOOKAHEAD = `3`;
425
426	// Will not combine a non-word with a preceding dictionary word longer than this
427	static const int32_t LAO_ROOT_COMBINE_THRESHOLD = `3`;
428
429	// Will not combine a non-word that shares at least this much prefix with a
430	// dictionary word, with a preceding word
431	static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = `3`;
432
433	// Minimum word size
434	static const int32_t LAO_MIN_WORD = `2`;
435
436	// Minimum number of characters for two words
437	static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * `2`;
438
439	LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
440	: DictionaryBreakEngine (),
441	fDictionary(adoptDictionary)
442	{
443	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
444	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
445	fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
446	if (U_SUCCESS(status)) {
447	setCharacters(fLaoWordSet);
448	}
449	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
450	fMarkSet.add(`0x0020`);
451	fEndWordSet = fLaoWordSet;
452	fEndWordSet.remove(`0x0EC0`, `0x0EC4`); // prefix vowels
453	fBeginWordSet.add(`0x0E81`, `0x0EAE`); // basic consonants (including holes for corresponding Thai characters)
454	fBeginWordSet.add(`0x0EDC`, `0x0EDD`); // digraph consonants (no Thai equivalent)
455	fBeginWordSet.add(`0x0EC0`, `0x0EC4`); // prefix vowels
456
457	// Compact for caching.
458	fMarkSet.compact();
459	fEndWordSet.compact();
460	fBeginWordSet.compact();
461	UTRACE_EXIT_STATUS(status);
462	}
463
464	LaoBreakEngine::~LaoBreakEngine() {
465	delete fDictionary;
466	}
467
468	int32_t
469	LaoBreakEngine::divideUpDictionaryRange( UText *text,
470	int32_t rangeStart,
471	int32_t rangeEnd,
472	UVector32 &foundBreaks ) const {
473	if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
474	return `0`; // Not enough characters for two words
475	}
476
477	uint32_t wordsFound = `0`;
478	int32_t cpWordLength = `0`;
479	int32_t cuWordLength = `0`;
480	int32_t current;
481	UErrorCode status = U_ZERO_ERROR;
482	PossibleWord words[LAO_LOOKAHEAD];
483
484	utext_setNativeIndex(text, rangeStart);
485
486	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
487	cuWordLength = `0`;
488	cpWordLength = `0`;
489
490	// Look for candidate words at the current position
491	int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
492
493	// If we found exactly one, use that
494	if (candidates == `1`) {
495	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
496	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
497	wordsFound += `1`;
498	}
499	// If there was more than one, see which one can take us forward the most words
500	else if (candidates > `1`) {
501	// If we're already at the end of the range, we're done
502	if (utext_getNativeIndex(text) >= rangeEnd) {
503	goto foundBest;
504	}
505	do {
506	int32_t wordsMatched = `1`;
507	if (words[(wordsFound + `1`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
508	if (wordsMatched < `2`) {
509	// Followed by another dictionary word; mark first word as a good candidate
510	words[wordsFound%LAO_LOOKAHEAD].markCurrent();
511	wordsMatched = `2`;
512	}
513
514	// If we're already at the end of the range, we're done
515	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
516	goto foundBest;
517	}
518
519	// See if any of the possible second words is followed by a third word
520	do {
521	// If we find a third word, stop right away
522	if (words[(wordsFound + `2`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
523	words[wordsFound % LAO_LOOKAHEAD].markCurrent();
524	goto foundBest;
525	}
526	}
527	while (words[(wordsFound + `1`) % LAO_LOOKAHEAD].backUp(text));
528	}
529	}
530	while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
531	foundBest:
532	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
533	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
534	wordsFound += `1`;
535	}
536
537	// We come here after having either found a word or not. We look ahead to the
538	// next word. If it's not a dictionary word, we will combine it withe the word we
539	// just found (if there is one), but only if the preceding word does not exceed
540	// the threshold.
541	// The text iterator should now be positioned at the end of the word we found.
542	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
543	// if it is a dictionary word, do nothing. If it isn't, then if there is
544	// no preceding word, or the non-word shares less than the minimum threshold
545	// of characters with a dictionary word, then scan to resynchronize
546	if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
547	&& (cuWordLength == `0`
548	\|\| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
549	// Look for a plausible word boundary
550	int32_t remaining = rangeEnd - (current + cuWordLength);
551	UChar32 pc;
552	UChar32 uc;
553	int32_t chars = `0`;
554	for (;;) {
555	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
556	pc = utext_next32(text);
557	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
558	chars += pcSize;
559	remaining -= pcSize;
560	if (remaining <= `0`) {
561	break;
562	}
563	uc = utext_current32(text);
564	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
565	// Maybe. See if it's in the dictionary.
566	// TODO: this looks iffy; compare with old code.
567	int32_t num_candidates = words[(wordsFound + `1`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
568	utext_setNativeIndex(text, current + cuWordLength + chars);
569	if (num_candidates > `0`) {
570	break;
571	}
572	}
573	}
574
575	// Bump the word count if there wasn't already one
576	if (cuWordLength <= `0`) {
577	wordsFound += `1`;
578	}
579
580	// Update the length with the passed-over characters
581	cuWordLength += chars;
582	}
583	else {
584	// Back up to where we were for next iteration
585	utext_setNativeIndex(text, current + cuWordLength);
586	}
587	}
588
589	// Never stop before a combining mark.
590	int32_t currPos;
591	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
592	utext_next32(text);
593	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
594	}
595
596	// Look ahead for possible suffixes if a dictionary word does not follow.
597	// We do this in code rather than using a rule so that the heuristic
598	// resynch continues to function. For example, one of the suffix characters
599	// could be a typo in the middle of a word.
600	// NOT CURRENTLY APPLICABLE TO LAO
601
602	// Did we find a word on this iteration? If so, push it on the break stack
603	if (cuWordLength > `0`) {
604	foundBreaks.push((current+cuWordLength), status);
605	}
606	}
607
608	// Don't return a break for the end of the dictionary range if there is one there.
609	if (foundBreaks.peeki() >= rangeEnd) {
610	(void) foundBreaks.popi();
611	wordsFound -= `1`;
612	}
613
614	return wordsFound;
615	}
616
617	/*
618	******************************************************************
619	* BurmeseBreakEngine
620	*/
621
622	// How many words in a row are "good enough"?
623	static const int32_t BURMESE_LOOKAHEAD = `3`;
624
625	// Will not combine a non-word with a preceding dictionary word longer than this
626	static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = `3`;
627
628	// Will not combine a non-word that shares at least this much prefix with a
629	// dictionary word, with a preceding word
630	static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = `3`;
631
632	// Minimum word size
633	static const int32_t BURMESE_MIN_WORD = `2`;
634
635	// Minimum number of characters for two words
636	static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * `2`;
637
638	BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
639	: DictionaryBreakEngine (),
640	fDictionary(adoptDictionary)
641	{
642	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
643	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
644	fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
645	if (U_SUCCESS(status)) {
646	setCharacters(fBurmeseWordSet);
647	}
648	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
649	fMarkSet.add(`0x0020`);
650	fEndWordSet = fBurmeseWordSet;
651	fBeginWordSet.add(`0x1000`, `0x102A`); // basic consonants and independent vowels
652
653	// Compact for caching.
654	fMarkSet.compact();
655	fEndWordSet.compact();
656	fBeginWordSet.compact();
657	UTRACE_EXIT_STATUS(status);
658	}
659
660	BurmeseBreakEngine::~BurmeseBreakEngine() {
661	delete fDictionary;
662	}
663
664	int32_t
665	BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
666	int32_t rangeStart,
667	int32_t rangeEnd,
668	UVector32 &foundBreaks ) const {
669	if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
670	return `0`; // Not enough characters for two words
671	}
672
673	uint32_t wordsFound = `0`;
674	int32_t cpWordLength = `0`;
675	int32_t cuWordLength = `0`;
676	int32_t current;
677	UErrorCode status = U_ZERO_ERROR;
678	PossibleWord words[BURMESE_LOOKAHEAD];
679
680	utext_setNativeIndex(text, rangeStart);
681
682	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
683	cuWordLength = `0`;
684	cpWordLength = `0`;
685
686	// Look for candidate words at the current position
687	int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
688
689	// If we found exactly one, use that
690	if (candidates == `1`) {
691	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
692	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
693	wordsFound += `1`;
694	}
695	// If there was more than one, see which one can take us forward the most words
696	else if (candidates > `1`) {
697	// If we're already at the end of the range, we're done
698	if (utext_getNativeIndex(text) >= rangeEnd) {
699	goto foundBest;
700	}
701	do {
702	int32_t wordsMatched = `1`;
703	if (words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
704	if (wordsMatched < `2`) {
705	// Followed by another dictionary word; mark first word as a good candidate
706	words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
707	wordsMatched = `2`;
708	}
709
710	// If we're already at the end of the range, we're done
711	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
712	goto foundBest;
713	}
714
715	// See if any of the possible second words is followed by a third word
716	do {
717	// If we find a third word, stop right away
718	if (words[(wordsFound + `2`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
719	words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
720	goto foundBest;
721	}
722	}
723	while (words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].backUp(text));
724	}
725	}
726	while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
727	foundBest:
728	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
729	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
730	wordsFound += `1`;
731	}
732
733	// We come here after having either found a word or not. We look ahead to the
734	// next word. If it's not a dictionary word, we will combine it withe the word we
735	// just found (if there is one), but only if the preceding word does not exceed
736	// the threshold.
737	// The text iterator should now be positioned at the end of the word we found.
738	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
739	// if it is a dictionary word, do nothing. If it isn't, then if there is
740	// no preceding word, or the non-word shares less than the minimum threshold
741	// of characters with a dictionary word, then scan to resynchronize
742	if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
743	&& (cuWordLength == `0`
744	\|\| words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
745	// Look for a plausible word boundary
746	int32_t remaining = rangeEnd - (current + cuWordLength);
747	UChar32 pc;
748	UChar32 uc;
749	int32_t chars = `0`;
750	for (;;) {
751	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
752	pc = utext_next32(text);
753	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
754	chars += pcSize;
755	remaining -= pcSize;
756	if (remaining <= `0`) {
757	break;
758	}
759	uc = utext_current32(text);
760	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
761	// Maybe. See if it's in the dictionary.
762	// TODO: this looks iffy; compare with old code.
763	int32_t num_candidates = words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
764	utext_setNativeIndex(text, current + cuWordLength + chars);
765	if (num_candidates > `0`) {
766	break;
767	}
768	}
769	}
770
771	// Bump the word count if there wasn't already one
772	if (cuWordLength <= `0`) {
773	wordsFound += `1`;
774	}
775
776	// Update the length with the passed-over characters
777	cuWordLength += chars;
778	}
779	else {
780	// Back up to where we were for next iteration
781	utext_setNativeIndex(text, current + cuWordLength);
782	}
783	}
784
785	// Never stop before a combining mark.
786	int32_t currPos;
787	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
788	utext_next32(text);
789	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
790	}
791
792	// Look ahead for possible suffixes if a dictionary word does not follow.
793	// We do this in code rather than using a rule so that the heuristic
794	// resynch continues to function. For example, one of the suffix characters
795	// could be a typo in the middle of a word.
796	// NOT CURRENTLY APPLICABLE TO BURMESE
797
798	// Did we find a word on this iteration? If so, push it on the break stack
799	if (cuWordLength > `0`) {
800	foundBreaks.push((current+cuWordLength), status);
801	}
802	}
803
804	// Don't return a break for the end of the dictionary range if there is one there.
805	if (foundBreaks.peeki() >= rangeEnd) {
806	(void) foundBreaks.popi();
807	wordsFound -= `1`;
808	}
809
810	return wordsFound;
811	}
812
813	/*
814	******************************************************************
815	* KhmerBreakEngine
816	*/
817
818	// How many words in a row are "good enough"?
819	static const int32_t KHMER_LOOKAHEAD = `3`;
820
821	// Will not combine a non-word with a preceding dictionary word longer than this
822	static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = `10`;
823
824	// Will not combine a non-word that shares at least this much prefix with a
825	// dictionary word, with a preceding word
826	static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = `5`;
827
828	// Minimum word size
829	static const int32_t KHMER_MIN_WORD = `2`;
830
831	// Minimum number of characters for two words
832	static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * `2`;
833
834	KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
835	: DictionaryBreakEngine (),
836	fDictionary(adoptDictionary)
837	{
838	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
839	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
840	fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
841	if (U_SUCCESS(status)) {
842	setCharacters(fKhmerWordSet);
843	}
844	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
845	fMarkSet.add(`0x0020`);
846	fEndWordSet = fKhmerWordSet;
847	fBeginWordSet.add(`0x1780`, `0x17B3`);
848	//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
849	//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
850	//fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
851	fEndWordSet.remove(`0x17D2`); // KHMER SIGN COENG that combines some following characters
852	//fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
853	// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
854	// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
855	// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
856	// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
857	// fSuffixSet.add(THAI_PAIYANNOI);
858	// fSuffixSet.add(THAI_MAIYAMOK);
859
860	// Compact for caching.
861	fMarkSet.compact();
862	fEndWordSet.compact();
863	fBeginWordSet.compact();
864	// fSuffixSet.compact();
865	UTRACE_EXIT_STATUS(status);
866	}
867
868	KhmerBreakEngine::~KhmerBreakEngine() {
869	delete fDictionary;
870	}
871
872	int32_t
873	KhmerBreakEngine::divideUpDictionaryRange( UText *text,
874	int32_t rangeStart,
875	int32_t rangeEnd,
876	UVector32 &foundBreaks ) const {
877	if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
878	return `0`; // Not enough characters for two words
879	}
880
881	uint32_t wordsFound = `0`;
882	int32_t cpWordLength = `0`;
883	int32_t cuWordLength = `0`;
884	int32_t current;
885	UErrorCode status = U_ZERO_ERROR;
886	PossibleWord words[KHMER_LOOKAHEAD];
887
888	utext_setNativeIndex(text, rangeStart);
889
890	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
891	cuWordLength = `0`;
892	cpWordLength = `0`;
893
894	// Look for candidate words at the current position
895	int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
896
897	// If we found exactly one, use that
898	if (candidates == `1`) {
899	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
900	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
901	wordsFound += `1`;
902	}
903
904	// If there was more than one, see which one can take us forward the most words
905	else if (candidates > `1`) {
906	// If we're already at the end of the range, we're done
907	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
908	goto foundBest;
909	}
910	do {
911	int32_t wordsMatched = `1`;
912	if (words[(wordsFound + `1`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
913	if (wordsMatched < `2`) {
914	// Followed by another dictionary word; mark first word as a good candidate
915	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
916	wordsMatched = `2`;
917	}
918
919	// If we're already at the end of the range, we're done
920	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
921	goto foundBest;
922	}
923
924	// See if any of the possible second words is followed by a third word
925	do {
926	// If we find a third word, stop right away
927	if (words[(wordsFound + `2`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
928	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
929	goto foundBest;
930	}
931	}
932	while (words[(wordsFound + `1`) % KHMER_LOOKAHEAD].backUp(text));
933	}
934	}
935	while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
936	foundBest:
937	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
938	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
939	wordsFound += `1`;
940	}
941
942	// We come here after having either found a word or not. We look ahead to the
943	// next word. If it's not a dictionary word, we will combine it with the word we
944	// just found (if there is one), but only if the preceding word does not exceed
945	// the threshold.
946	// The text iterator should now be positioned at the end of the word we found.
947	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
948	// if it is a dictionary word, do nothing. If it isn't, then if there is
949	// no preceding word, or the non-word shares less than the minimum threshold
950	// of characters with a dictionary word, then scan to resynchronize
951	if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
952	&& (cuWordLength == `0`
953	\|\| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
954	// Look for a plausible word boundary
955	int32_t remaining = rangeEnd - (current+cuWordLength);
956	UChar32 pc;
957	UChar32 uc;
958	int32_t chars = `0`;
959	for (;;) {
960	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
961	pc = utext_next32(text);
962	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
963	chars += pcSize;
964	remaining -= pcSize;
965	if (remaining <= `0`) {
966	break;
967	}
968	uc = utext_current32(text);
969	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
970	// Maybe. See if it's in the dictionary.
971	int32_t num_candidates = words[(wordsFound + `1`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
972	utext_setNativeIndex(text, current+cuWordLength+chars);
973	if (num_candidates > `0`) {
974	break;
975	}
976	}
977	}
978
979	// Bump the word count if there wasn't already one
980	if (cuWordLength <= `0`) {
981	wordsFound += `1`;
982	}
983
984	// Update the length with the passed-over characters
985	cuWordLength += chars;
986	}
987	else {
988	// Back up to where we were for next iteration
989	utext_setNativeIndex(text, current+cuWordLength);
990	}
991	}
992
993	// Never stop before a combining mark.
994	int32_t currPos;
995	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
996	utext_next32(text);
997	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
998	}
999
1000	// Look ahead for possible suffixes if a dictionary word does not follow.
1001	// We do this in code rather than using a rule so that the heuristic
1002	// resynch continues to function. For example, one of the suffix characters
1003	// could be a typo in the middle of a word.
1004	// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1005	// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1006	// && fSuffixSet.contains(uc = utext_current32(text))) {
1007	// if (uc == KHMER_PAIYANNOI) {
1008	// if (!fSuffixSet.contains(utext_previous32(text))) {
1009	// // Skip over previous end and PAIYANNOI
1010	// utext_next32(text);
1011	// utext_next32(text);
1012	// wordLength += 1; // Add PAIYANNOI to word
1013	// uc = utext_current32(text); // Fetch next character
1014	// }
1015	// else {
1016	// // Restore prior position
1017	// utext_next32(text);
1018	// }
1019	// }
1020	// if (uc == KHMER_MAIYAMOK) {
1021	// if (utext_previous32(text) != KHMER_MAIYAMOK) {
1022	// // Skip over previous end and MAIYAMOK
1023	// utext_next32(text);
1024	// utext_next32(text);
1025	// wordLength += 1; // Add MAIYAMOK to word
1026	// }
1027	// else {
1028	// // Restore prior position
1029	// utext_next32(text);
1030	// }
1031	// }
1032	// }
1033	// else {
1034	// utext_setNativeIndex(text, current+wordLength);
1035	// }
1036	// }
1037
1038	// Did we find a word on this iteration? If so, push it on the break stack
1039	if (cuWordLength > `0`) {
1040	foundBreaks.push((current+cuWordLength), status);
1041	}
1042	}
1043
1044	// Don't return a break for the end of the dictionary range if there is one there.
1045	if (foundBreaks.peeki() >= rangeEnd) {
1046	(void) foundBreaks.popi();
1047	wordsFound -= `1`;
1048	}
1049
1050	return wordsFound;
1051	}
1052
1053	#if !UCONFIG_NO_NORMALIZATION
1054	/*
1055	******************************************************************
1056	* CjkBreakEngine
1057	*/
1058	static const uint32_t kuint32max = `0xFFFFFFFF`;
1059	CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1060	: DictionaryBreakEngine (), fDictionary(adoptDictionary) {
1061	UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1062	UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1063	// Korean dictionary only includes Hangul syllables
1064	fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1065	fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1066	fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1067	fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1068	nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1069
1070	if (U_SUCCESS(status)) {
1071	// handle Korean and Japanese/Chinese using different dictionaries
1072	if (type == kKorean) {
1073	setCharacters(fHangulWordSet);
1074	} else { //Chinese and Japanese
1075	UnicodeSet cjSet;
1076	cjSet.addAll(fHanWordSet);
1077	cjSet.addAll(fKatakanaWordSet);
1078	cjSet.addAll(fHiraganaWordSet);
1079	cjSet.add(`0xFF70`); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1080	cjSet.add(`0x30FC`); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1081	setCharacters(cjSet);
1082	}
1083	}
1084	UTRACE_EXIT_STATUS(status);
1085	}
1086
1087	CjkBreakEngine::~CjkBreakEngine(){
1088	delete fDictionary;
1089	}
1090
1091	// The katakanaCost values below are based on the length frequencies of all
1092	// katakana phrases in the dictionary
1093	static const int32_t kMaxKatakanaLength = `8`;
1094	static const int32_t kMaxKatakanaGroupLength = `20`;
1095	static const uint32_t maxSnlp = `255`;
1096
1097	static inline uint32_t getKatakanaCost(int32_t wordLength){
1098	//TODO: fill array with actual values from dictionary!
1099	static const uint32_t katakanaCost[kMaxKatakanaLength + `1`]
1100	= {`8192`, `984`, `408`, `240`, `204`, `252`, `300`, `372`, `480`};
1101	return (wordLength > kMaxKatakanaLength) ? `8192` : katakanaCost[wordLength];
1102	}
1103
1104	static inline bool isKatakana(UChar32 value) {
1105	return (value >= `0x30A1` && value <= `0x30FE` && value != `0x30FB`) \|\|
1106	(value >= `0xFF66` && value <= `0xFF9f`);
1107	}
1108
1109
1110	// Function for accessing internal utext flags.
1111	// Replicates an internal UText function.
1112
1113	static inline int32_t utext_i32_flag(int32_t bitIndex) {
1114	return (int32_t)`1` << bitIndex;
1115	}
1116
1117
1118	/*
1119	* @param text A UText representing the text
1120	* @param rangeStart The start of the range of dictionary characters
1121	* @param rangeEnd The end of the range of dictionary characters
1122	* @param foundBreaks vector<int32> to receive the break positions
1123	* @return The number of breaks found
1124	*/
1125	int32_t
1126	CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1127	int32_t rangeStart,
1128	int32_t rangeEnd,
1129	UVector32 &foundBreaks ) const {
1130	if (rangeStart >= rangeEnd) {
1131	return `0`;
1132	}
1133
1134	// UnicodeString version of input UText, NFKC normalized if necessary.
1135	UnicodeString inString;
1136
1137	// inputMap[inStringIndex] = corresponding native index from UText inText.
1138	// If NULL then mapping is 1:1
1139	LocalPointer<UVector32> inputMap;
1140
1141	UErrorCode status = U_ZERO_ERROR;
1142
1143
1144	// if UText has the input string as one contiguous UTF-16 chunk
1145	if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1146	inText->chunkNativeStart <= rangeStart &&
1147	inText->chunkNativeLimit >= rangeEnd &&
1148	inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1149
1150	// Input UText is in one contiguous UTF-16 chunk.
1151	// Use Read-only aliasing UnicodeString.
1152	inString.setTo(FALSE,
1153	inText->chunkContents + rangeStart - inText->chunkNativeStart,
1154	rangeEnd - rangeStart);
1155	} else {
1156	// Copy the text from the original inText (UText) to inString (UnicodeString).
1157	// Create a map from UnicodeString indices -> UText offsets.
1158	utext_setNativeIndex(inText, rangeStart);
1159	int32_t limit = rangeEnd;
1160	U_ASSERT(limit <= utext_nativeLength(inText));
1161	if (limit > utext_nativeLength(inText)) {
1162	limit = (int32_t)utext_nativeLength(inText);
1163	}
1164	inputMap.adoptInsteadAndCheckErrorCode(new UVector32 (status), status);
1165	if (U_FAILURE(status)) {
1166	return `0`;
1167	}
1168	while (utext_getNativeIndex(inText) < limit) {
1169	int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1170	UChar32 c = utext_next32(inText);
1171	U_ASSERT(c != U_SENTINEL);
1172	inString.append(c);
1173	while (inputMap ->size() < inString.length()) {
1174	inputMap ->addElement(nativePosition, status);
1175	}
1176	}
1177	inputMap ->addElement(limit, status);
1178	}
1179
1180
1181	if (!nfkcNorm2->isNormalized(inString, status)) {
1182	UnicodeString normalizedInput;
1183	// normalizedMap[normalizedInput position] == original UText position.
1184	LocalPointer<UVector32> normalizedMap(new UVector32 (status), status);
1185	if (U_FAILURE(status)) {
1186	return `0`;
1187	}
1188
1189	UnicodeString fragment;
1190	UnicodeString normalizedFragment;
1191	for (int32_t srcI = `0`; srcI < inString.length();) { // Once per normalization chunk
1192	fragment.remove();
1193	int32_t fragmentStartI = srcI;
1194	UChar32 c = inString.char32At(srcI);
1195	for (;;) {
1196	fragment.append(c);
1197	srcI = inString.moveIndex32(srcI, `1`);
1198	if (srcI == inString.length()) {
1199	break;
1200	}
1201	c = inString.char32At(srcI);
1202	if (nfkcNorm2->hasBoundaryBefore(c)) {
1203	break;
1204	}
1205	}
1206	nfkcNorm2->normalize(fragment, normalizedFragment, status);
1207	normalizedInput.append(normalizedFragment);
1208
1209	// Map every position in the normalized chunk to the start of the chunk
1210	// in the original input.
1211	int32_t fragmentOriginalStart = inputMap.isValid() ?
1212	inputMap ->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1213	while (normalizedMap ->size() < normalizedInput.length()) {
1214	normalizedMap ->addElement(fragmentOriginalStart, status);
1215	if (U_FAILURE(status)) {
1216	break;
1217	}
1218	}
1219	}
1220	U_ASSERT(normalizedMap ->size() == normalizedInput.length());
1221	int32_t nativeEnd = inputMap.isValid() ?
1222	inputMap ->elementAti(inString.length()) : inString.length()+rangeStart;
1223	normalizedMap ->addElement(nativeEnd, status);
1224
1225	inputMap = std::move(normalizedMap);
1226	inString = std::move(normalizedInput);
1227	}
1228
1229	int32_t numCodePts = inString.countChar32();
1230	if (numCodePts != inString.length()) {
1231	// There are supplementary characters in the input.
1232	// The dictionary will produce boundary positions in terms of code point indexes,
1233	// not in terms of code unit string indexes.
1234	// Use the inputMap mechanism to take care of this in addition to indexing differences
1235	// from normalization and/or UTF-8 input.
1236	UBool hadExistingMap = inputMap.isValid();
1237	if (!hadExistingMap) {
1238	inputMap.adoptInsteadAndCheckErrorCode(new UVector32 (status), status);
1239	if (U_FAILURE(status)) {
1240	return `0`;
1241	}
1242	}
1243	int32_t cpIdx = `0`;
1244	for (int32_t cuIdx = `0`; ; cuIdx = inString.moveIndex32(cuIdx, `1`)) {
1245	U_ASSERT(cuIdx >= cpIdx);
1246	if (hadExistingMap) {
1247	inputMap ->setElementAt(inputMap ->elementAti(cuIdx), cpIdx);
1248	} else {
1249	inputMap ->addElement(cuIdx+rangeStart, status);
1250	}
1251	cpIdx++;
1252	if (cuIdx == inString.length()) {
1253	break;
1254	}
1255	}
1256	}
1257
1258	// bestSnlp[i] is the snlp of the best segmentation of the first i
1259	// code points in the range to be matched.
1260	UVector32 bestSnlp(numCodePts + `1`, status);
1261	bestSnlp.addElement(`0`, status);
1262	for(int32_t i = `1`; i <= numCodePts; i++) {
1263	bestSnlp.addElement(kuint32max, status);
1264	}
1265
1266
1267	// prev[i] is the index of the last CJK code point in the previous word in
1268	// the best segmentation of the first i characters.
1269	UVector32 prev(numCodePts + `1`, status);
1270	for(int32_t i = `0`; i <= numCodePts; i++){
1271	prev.addElement(-`1`, status);
1272	}
1273
1274	const int32_t maxWordSize = `20`;
1275	UVector32 values(numCodePts, status);
1276	values.setSize(numCodePts);
1277	UVector32 lengths(numCodePts, status);
1278	lengths.setSize(numCodePts);
1279
1280	UText fu = UTEXT_INITIALIZER;
1281	utext_openUnicodeString(&fu, &inString, &status);
1282
1283	// Dynamic programming to find the best segmentation.
1284
1285	// In outer loop, i is the code point index,
1286	// ix is the corresponding string (code unit) index.
1287	// They differ when the string contains supplementary characters.
1288	int32_t ix = `0`;
1289	bool is_prev_katakana = false;
1290	for (int32_t i = `0`; i < numCodePts; ++i, ix = inString.moveIndex32(ix, `1`)) {
1291	if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1292	continue;
1293	}
1294
1295	int32_t count;
1296	utext_setNativeIndex(&fu, ix);
1297	count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1298	NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1299	// Note: lengths is filled with code point lengths
1300	// The NULL parameter is the ignored code unit lengths.
1301
1302	// if there are no single character matches found in the dictionary
1303	// starting with this character, treat character as a 1-character word
1304	// with the highest value possible, i.e. the least likely to occur.
1305	// Exclude Korean characters from this treatment, as they should be left
1306	// together by default.
1307	if ((count == `0` \|\| lengths.elementAti(`0`) != `1`) &&
1308	!fHangulWordSet.contains(inString.char32At(ix))) {
1309	values.setElementAt(maxSnlp, count); // 255
1310	lengths.setElementAt(`1`, count++);
1311	}
1312
1313	for (int32_t j = `0`; j < count; j++) {
1314	uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1315	int32_t ln_j_i = lengths.elementAti(j) + i;
1316	if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1317	bestSnlp.setElementAt(newSnlp, ln_j_i);
1318	prev.setElementAt(i, ln_j_i);
1319	}
1320	}
1321
1322	// In Japanese,
1323	// Katakana word in single character is pretty rare. So we apply
1324	// the following heuristic to Katakana: any continuous run of Katakana
1325	// characters is considered a candidate word with a default cost
1326	// specified in the katakanaCost table according to its length.
1327
1328	bool is_katakana = isKatakana(inString.char32At(ix));
1329	int32_t katakanaRunLength = `1`;
1330	if (!is_prev_katakana && is_katakana) {
1331	int32_t j = inString.moveIndex32(ix, `1`);
1332	// Find the end of the continuous run of Katakana characters
1333	while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1334	isKatakana(inString.char32At(j))) {
1335	j = inString.moveIndex32(j, `1`);
1336	katakanaRunLength++;
1337	}
1338	if (katakanaRunLength < kMaxKatakanaGroupLength) {
1339	uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1340	if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1341	bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1342	prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1343	}
1344	}
1345	}
1346	is_prev_katakana = is_katakana;
1347	}
1348	utext_close(&fu);
1349
1350	// Start pushing the optimal offset index into t_boundary (t for tentative).
1351	// prev[numCodePts] is guaranteed to be meaningful.
1352	// We'll first push in the reverse order, i.e.,
1353	// t_boundary[0] = numCodePts, and afterwards do a swap.
1354	UVector32 t_boundary(numCodePts+`1`, status);
1355
1356	int32_t numBreaks = `0`;
1357	// No segmentation found, set boundary to end of range
1358	if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1359	t_boundary.addElement(numCodePts, status);
1360	numBreaks++;
1361	} else {
1362	for (int32_t i = numCodePts; i > `0`; i = prev.elementAti(i)) {
1363	t_boundary.addElement(i, status);
1364	numBreaks++;
1365	}
1366	U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - `1`)) == `0`);
1367	}
1368
1369	// Add a break for the start of the dictionary range if there is not one
1370	// there already.
1371	if (foundBreaks.size() == `0` \|\| foundBreaks.peeki() < rangeStart) {
1372	t_boundary.addElement(`0`, status);
1373	numBreaks++;
1374	}
1375
1376	// Now that we're done, convert positions in t_boundary[] (indices in
1377	// the normalized input string) back to indices in the original input UText
1378	// while reversing t_boundary and pushing values to foundBreaks.
1379	int32_t prevCPPos = -`1`;
1380	int32_t prevUTextPos = -`1`;
1381	for (int32_t i = numBreaks-`1`; i >= `0`; i--) {
1382	int32_t cpPos = t_boundary.elementAti(i);
1383	U_ASSERT(cpPos > prevCPPos);
1384	int32_t utextPos = inputMap.isValid() ? inputMap ->elementAti(cpPos) : cpPos + rangeStart;
1385	U_ASSERT(utextPos >= prevUTextPos);
1386	if (utextPos > prevUTextPos) {
1387	// Boundaries are added to foundBreaks output in ascending order.
1388	U_ASSERT(foundBreaks.size() == `0` \|\| foundBreaks.peeki() < utextPos);
1389	foundBreaks.push(utextPos, status);
1390	} else {
1391	// Normalization expanded the input text, the dictionary found a boundary
1392	// within the expansion, giving two boundaries with the same index in the
1393	// original text. Ignore the second. See ticket #12918.
1394	--numBreaks;
1395	}
1396	prevCPPos = cpPos;
1397	prevUTextPos = utextPos;
1398	}
1399	(void)prevCPPos; // suppress compiler warnings about unused variable
1400
1401	// inString goes out of scope
1402	// inputMap goes out of scope
1403	return numBreaks;
1404	}
1405	#endif
1406
1407	U_NAMESPACE_END
1408
1409	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1410
1411

Browse the source code of engine/third_party/icu/source/common/dictbe.cpp