dictbe.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/dictbe.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/**
4	*******************************************************************************
5	* Copyright (C) 2006-2016, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	*******************************************************************************
8	*/
9
10	#include <utility>
11
12	#include "unicode/utypes.h"
13
14	#if !UCONFIG_NO_BREAK_ITERATION
15
16	#include "brkeng.h"
17	#include "dictbe.h"
18	#include "unicode/uniset.h"
19	#include "unicode/chariter.h"
20	#include "unicode/ubrk.h"
21	#include "uvectr32.h"
22	#include "uvector.h"
23	#include "uassert.h"
24	#include "unicode/normlzr.h"
25	#include "cmemory.h"
26	#include "dictionarydata.h"
27
28	U_NAMESPACE_BEGIN
29
30	/*
31	******************************************************************
32	*/
33
34	DictionaryBreakEngine::DictionaryBreakEngine() {
35	}
36
37	DictionaryBreakEngine::~DictionaryBreakEngine() {
38	}
39
40	UBool
41	DictionaryBreakEngine::handles(UChar32 c) const {
42	return fSet.contains(c);
43	}
44
45	int32_t
46	DictionaryBreakEngine::findBreaks( UText *text,
47	int32_t startPos,
48	int32_t endPos,
49	UVector32 &foundBreaks ) const {
50	(void)startPos; // TODO: remove this param?
51	int32_t result = `0`;
52
53	// Find the span of characters included in the set.
54	// The span to break begins at the current position in the text, and
55	// extends towards the start or end of the text, depending on 'reverse'.
56
57	int32_t start = (int32_t)utext_getNativeIndex(text);
58	int32_t current;
59	int32_t rangeStart;
60	int32_t rangeEnd;
61	UChar32 c = utext_current32(text);
62	while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
63	utext_next32(text); // TODO: recast loop for postincrement
64	c = utext_current32(text);
65	}
66	rangeStart = start;
67	rangeEnd = current;
68	result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
69	utext_setNativeIndex(text, current);
70
71	return result;
72	}
73
74	void
75	DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
76	fSet = set;
77	// Compact for caching
78	fSet.compact();
79	}
80
81	/*
82	******************************************************************
83	* PossibleWord
84	*/
85
86	// Helper class for improving readability of the Thai/Lao/Khmer word break
87	// algorithm. The implementation is completely inline.
88
89	// List size, limited by the maximum number of words in the dictionary
90	// that form a nested sequence.
91	static const int32_t POSSIBLE_WORD_LIST_MAX = `20`;
92
93	class PossibleWord {
94	private:
95	// list of word candidate lengths, in increasing length order
96	// TODO: bytes would be sufficient for word lengths.
97	int32_t count; // Count of candidates
98	int32_t prefix; // The longest match with a dictionary word
99	int32_t offset; // Offset in the text of these candidates
100	int32_t mark; // The preferred candidate's offset
101	int32_t current; // The candidate we're currently looking at
102	int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
103	int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
104
105	public:
106	PossibleWord() : count(`0`), prefix(`0`), offset(-`1`), mark(`0`), current(`0`) {}
107	~PossibleWord() {}
108
109	// Fill the list of candidates if needed, select the longest, and return the number found
110	int32_t candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd );
111
112	// Select the currently marked candidate, point after it in the text, and invalidate self
113	int32_t acceptMarked( UText *text );
114
115	// Back up from the current candidate to the next shorter one; return TRUE if that exists
116	// and point the text after it
117	UBool backUp( UText *text );
118
119	// Return the longest prefix this candidate location shares with a dictionary word
120	// Return value is in code points.
121	int32_t longestPrefix() { return prefix; }
122
123	// Mark the current candidate as the one we like
124	void markCurrent() { mark = current; }
125
126	// Get length in code points of the marked word.
127	int32_t markedCPLength() { return cpLengths[mark]; }
128	};
129
130
131	int32_t PossibleWord::candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd ) {
132	// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
133	int32_t start = (int32_t)utext_getNativeIndex(text);
134	if (start != offset) {
135	offset = start;
136	count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
137	// Dictionary leaves text after longest prefix, not longest word. Back up.
138	if (count <= `0`) {
139	utext_setNativeIndex(text, start);
140	}
141	}
142	if (count > `0`) {
143	utext_setNativeIndex(text, start+cuLengths[count-`1`]);
144	}
145	current = count-`1`;
146	mark = current;
147	return count;
148	}
149
150	int32_t
151	PossibleWord::acceptMarked( UText *text ) {
152	utext_setNativeIndex(text, offset + cuLengths[mark]);
153	return cuLengths[mark];
154	}
155
156
157	UBool
158	PossibleWord::backUp( UText *text ) {
159	if (current > `0`) {
160	utext_setNativeIndex(text, offset + cuLengths[--current]);
161	return TRUE;
162	}
163	return FALSE;
164	}
165
166	/*
167	******************************************************************
168	* ThaiBreakEngine
169	*/
170
171	// How many words in a row are "good enough"?
// How many words in a row are "good enough"?
static const int32_t THAI_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word that is
// this many code points long, or longer
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;

// Elision character
static const int32_t THAI_PAIYANNOI = 0x0E2F;

// Repeat character
static const int32_t THAI_MAIYAMOK = 0x0E46;

// Minimum word size
static const int32_t THAI_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
192
193	ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
194	: DictionaryBreakEngine (),
195	fDictionary(adoptDictionary)
196	{
197	fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
198	if (U_SUCCESS(status)) {
199	setCharacters(fThaiWordSet);
200	}
201	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
202	fMarkSet.add(`0x0020`);
203	fEndWordSet = fThaiWordSet;
204	fEndWordSet.remove(`0x0E31`); // MAI HAN-AKAT
205	fEndWordSet.remove(`0x0E40`, `0x0E44`); // SARA E through SARA AI MAIMALAI
206	fBeginWordSet.add(`0x0E01`, `0x0E2E`); // KO KAI through HO NOKHUK
207	fBeginWordSet.add(`0x0E40`, `0x0E44`); // SARA E through SARA AI MAIMALAI
208	fSuffixSet.add(THAI_PAIYANNOI);
209	fSuffixSet.add(THAI_MAIYAMOK);
210
211	// Compact for caching.
212	fMarkSet.compact();
213	fEndWordSet.compact();
214	fBeginWordSet.compact();
215	fSuffixSet.compact();
216	}
217
218	ThaiBreakEngine::~ThaiBreakEngine() {
219	delete fDictionary;
220	}
221
222	int32_t
223	ThaiBreakEngine::divideUpDictionaryRange( UText *text,
224	int32_t rangeStart,
225	int32_t rangeEnd,
226	UVector32 &foundBreaks ) const {
227	utext_setNativeIndex(text, rangeStart);
228	utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
229	if (utext_getNativeIndex(text) >= rangeEnd) {
230	return `0`; // Not enough characters for two words
231	}
232	utext_setNativeIndex(text, rangeStart);
233
234
235	uint32_t wordsFound = `0`;
236	int32_t cpWordLength = `0`; // Word Length in Code Points.
237	int32_t cuWordLength = `0`; // Word length in code units (UText native indexing)
238	int32_t current;
239	UErrorCode status = U_ZERO_ERROR;
240	PossibleWord words[THAI_LOOKAHEAD];
241
242	utext_setNativeIndex(text, rangeStart);
243
244	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
245	cpWordLength = `0`;
246	cuWordLength = `0`;
247
248	// Look for candidate words at the current position
249	int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
250
251	// If we found exactly one, use that
252	if (candidates == `1`) {
253	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
254	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
255	wordsFound += `1`;
256	}
257	// If there was more than one, see which one can take us forward the most words
258	else if (candidates > `1`) {
259	// If we're already at the end of the range, we're done
260	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
261	goto foundBest;
262	}
263	do {
264	int32_t wordsMatched = `1`;
265	if (words[(wordsFound + `1`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
266	if (wordsMatched < `2`) {
267	// Followed by another dictionary word; mark first word as a good candidate
268	words[wordsFound%THAI_LOOKAHEAD].markCurrent();
269	wordsMatched = `2`;
270	}
271
272	// If we're already at the end of the range, we're done
273	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
274	goto foundBest;
275	}
276
277	// See if any of the possible second words is followed by a third word
278	do {
279	// If we find a third word, stop right away
280	if (words[(wordsFound + `2`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
281	words[wordsFound % THAI_LOOKAHEAD].markCurrent();
282	goto foundBest;
283	}
284	}
285	while (words[(wordsFound + `1`) % THAI_LOOKAHEAD].backUp(text));
286	}
287	}
288	while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
289	foundBest:
290	// Set UText position to after the accepted word.
291	cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
292	cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
293	wordsFound += `1`;
294	}
295
296	// We come here after having either found a word or not. We look ahead to the
297	// next word. If it's not a dictionary word, we will combine it with the word we
298	// just found (if there is one), but only if the preceding word does not exceed
299	// the threshold.
300	// The text iterator should now be positioned at the end of the word we found.
301
302	UChar32 uc = `0`;
303	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
304	// if it is a dictionary word, do nothing. If it isn't, then if there is
305	// no preceding word, or the non-word shares less than the minimum threshold
306	// of characters with a dictionary word, then scan to resynchronize
307	if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
308	&& (cuWordLength == `0`
309	\|\| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
310	// Look for a plausible word boundary
311	int32_t remaining = rangeEnd - (current+cuWordLength);
312	UChar32 pc;
313	int32_t chars = `0`;
314	for (;;) {
315	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
316	pc = utext_next32(text);
317	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
318	chars += pcSize;
319	remaining -= pcSize;
320	if (remaining <= `0`) {
321	break;
322	}
323	uc = utext_current32(text);
324	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
325	// Maybe. See if it's in the dictionary.
326	// NOTE: In the original Apple code, checked that the next
327	// two characters after uc were not 0x0E4C THANTHAKHAT before
328	// checking the dictionary. That is just a performance filter,
329	// but it's not clear it's faster than checking the trie.
330	int32_t num_candidates = words[(wordsFound + `1`) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
331	utext_setNativeIndex(text, current + cuWordLength + chars);
332	if (num_candidates > `0`) {
333	break;
334	}
335	}
336	}
337
338	// Bump the word count if there wasn't already one
339	if (cuWordLength <= `0`) {
340	wordsFound += `1`;
341	}
342
343	// Update the length with the passed-over characters
344	cuWordLength += chars;
345	}
346	else {
347	// Back up to where we were for next iteration
348	utext_setNativeIndex(text, current+cuWordLength);
349	}
350	}
351
352	// Never stop before a combining mark.
353	int32_t currPos;
354	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
355	utext_next32(text);
356	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
357	}
358
359	// Look ahead for possible suffixes if a dictionary word does not follow.
360	// We do this in code rather than using a rule so that the heuristic
361	// resynch continues to function. For example, one of the suffix characters
362	// could be a typo in the middle of a word.
363	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > `0`) {
364	if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
365	&& fSuffixSet.contains(uc = utext_current32(text))) {
366	if (uc == THAI_PAIYANNOI) {
367	if (!fSuffixSet.contains(utext_previous32(text))) {
368	// Skip over previous end and PAIYANNOI
369	utext_next32(text);
370	int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
371	utext_next32(text);
372	cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
373	uc = utext_current32(text); // Fetch next character
374	}
375	else {
376	// Restore prior position
377	utext_next32(text);
378	}
379	}
380	if (uc == THAI_MAIYAMOK) {
381	if (utext_previous32(text) != THAI_MAIYAMOK) {
382	// Skip over previous end and MAIYAMOK
383	utext_next32(text);
384	int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
385	utext_next32(text);
386	cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
387	}
388	else {
389	// Restore prior position
390	utext_next32(text);
391	}
392	}
393	}
394	else {
395	utext_setNativeIndex(text, current+cuWordLength);
396	}
397	}
398
399	// Did we find a word on this iteration? If so, push it on the break stack
400	if (cuWordLength > `0`) {
401	foundBreaks.push((current+cuWordLength), status);
402	}
403	}
404
405	// Don't return a break for the end of the dictionary range if there is one there.
406	if (foundBreaks.peeki() >= rangeEnd) {
407	(void) foundBreaks.popi();
408	wordsFound -= `1`;
409	}
410
411	return wordsFound;
412	}
413
414	/*
415	******************************************************************
416	* LaoBreakEngine
417	*/
418
419	// How many words in a row are "good enough"?
// How many words in a row are "good enough"?
static const int32_t LAO_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word that is
// this many code points long, or longer
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static const int32_t LAO_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
434
435	LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
436	: DictionaryBreakEngine (),
437	fDictionary(adoptDictionary)
438	{
439	fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
440	if (U_SUCCESS(status)) {
441	setCharacters(fLaoWordSet);
442	}
443	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
444	fMarkSet.add(`0x0020`);
445	fEndWordSet = fLaoWordSet;
446	fEndWordSet.remove(`0x0EC0`, `0x0EC4`); // prefix vowels
447	fBeginWordSet.add(`0x0E81`, `0x0EAE`); // basic consonants (including holes for corresponding Thai characters)
448	fBeginWordSet.add(`0x0EDC`, `0x0EDD`); // digraph consonants (no Thai equivalent)
449	fBeginWordSet.add(`0x0EC0`, `0x0EC4`); // prefix vowels
450
451	// Compact for caching.
452	fMarkSet.compact();
453	fEndWordSet.compact();
454	fBeginWordSet.compact();
455	}
456
457	LaoBreakEngine::~LaoBreakEngine() {
458	delete fDictionary;
459	}
460
461	int32_t
462	LaoBreakEngine::divideUpDictionaryRange( UText *text,
463	int32_t rangeStart,
464	int32_t rangeEnd,
465	UVector32 &foundBreaks ) const {
466	if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
467	return `0`; // Not enough characters for two words
468	}
469
470	uint32_t wordsFound = `0`;
471	int32_t cpWordLength = `0`;
472	int32_t cuWordLength = `0`;
473	int32_t current;
474	UErrorCode status = U_ZERO_ERROR;
475	PossibleWord words[LAO_LOOKAHEAD];
476
477	utext_setNativeIndex(text, rangeStart);
478
479	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
480	cuWordLength = `0`;
481	cpWordLength = `0`;
482
483	// Look for candidate words at the current position
484	int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
485
486	// If we found exactly one, use that
487	if (candidates == `1`) {
488	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
489	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
490	wordsFound += `1`;
491	}
492	// If there was more than one, see which one can take us forward the most words
493	else if (candidates > `1`) {
494	// If we're already at the end of the range, we're done
495	if (utext_getNativeIndex(text) >= rangeEnd) {
496	goto foundBest;
497	}
498	do {
499	int32_t wordsMatched = `1`;
500	if (words[(wordsFound + `1`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
501	if (wordsMatched < `2`) {
502	// Followed by another dictionary word; mark first word as a good candidate
503	words[wordsFound%LAO_LOOKAHEAD].markCurrent();
504	wordsMatched = `2`;
505	}
506
507	// If we're already at the end of the range, we're done
508	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
509	goto foundBest;
510	}
511
512	// See if any of the possible second words is followed by a third word
513	do {
514	// If we find a third word, stop right away
515	if (words[(wordsFound + `2`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
516	words[wordsFound % LAO_LOOKAHEAD].markCurrent();
517	goto foundBest;
518	}
519	}
520	while (words[(wordsFound + `1`) % LAO_LOOKAHEAD].backUp(text));
521	}
522	}
523	while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
524	foundBest:
525	cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
526	cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
527	wordsFound += `1`;
528	}
529
530	// We come here after having either found a word or not. We look ahead to the
531	// next word. If it's not a dictionary word, we will combine it withe the word we
532	// just found (if there is one), but only if the preceding word does not exceed
533	// the threshold.
534	// The text iterator should now be positioned at the end of the word we found.
535	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
536	// if it is a dictionary word, do nothing. If it isn't, then if there is
537	// no preceding word, or the non-word shares less than the minimum threshold
538	// of characters with a dictionary word, then scan to resynchronize
539	if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
540	&& (cuWordLength == `0`
541	\|\| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
542	// Look for a plausible word boundary
543	int32_t remaining = rangeEnd - (current + cuWordLength);
544	UChar32 pc;
545	UChar32 uc;
546	int32_t chars = `0`;
547	for (;;) {
548	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
549	pc = utext_next32(text);
550	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
551	chars += pcSize;
552	remaining -= pcSize;
553	if (remaining <= `0`) {
554	break;
555	}
556	uc = utext_current32(text);
557	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
558	// Maybe. See if it's in the dictionary.
559	// TODO: this looks iffy; compare with old code.
560	int32_t num_candidates = words[(wordsFound + `1`) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
561	utext_setNativeIndex(text, current + cuWordLength + chars);
562	if (num_candidates > `0`) {
563	break;
564	}
565	}
566	}
567
568	// Bump the word count if there wasn't already one
569	if (cuWordLength <= `0`) {
570	wordsFound += `1`;
571	}
572
573	// Update the length with the passed-over characters
574	cuWordLength += chars;
575	}
576	else {
577	// Back up to where we were for next iteration
578	utext_setNativeIndex(text, current + cuWordLength);
579	}
580	}
581
582	// Never stop before a combining mark.
583	int32_t currPos;
584	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
585	utext_next32(text);
586	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
587	}
588
589	// Look ahead for possible suffixes if a dictionary word does not follow.
590	// We do this in code rather than using a rule so that the heuristic
591	// resynch continues to function. For example, one of the suffix characters
592	// could be a typo in the middle of a word.
593	// NOT CURRENTLY APPLICABLE TO LAO
594
595	// Did we find a word on this iteration? If so, push it on the break stack
596	if (cuWordLength > `0`) {
597	foundBreaks.push((current+cuWordLength), status);
598	}
599	}
600
601	// Don't return a break for the end of the dictionary range if there is one there.
602	if (foundBreaks.peeki() >= rangeEnd) {
603	(void) foundBreaks.popi();
604	wordsFound -= `1`;
605	}
606
607	return wordsFound;
608	}
609
610	/*
611	******************************************************************
612	* BurmeseBreakEngine
613	*/
614
615	// How many words in a row are "good enough"?
// How many words in a row are "good enough"?
static const int32_t BURMESE_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word that is
// this many code points long, or longer
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static const int32_t BURMESE_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
630
631	BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
632	: DictionaryBreakEngine (),
633	fDictionary(adoptDictionary)
634	{
635	fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
636	if (U_SUCCESS(status)) {
637	setCharacters(fBurmeseWordSet);
638	}
639	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
640	fMarkSet.add(`0x0020`);
641	fEndWordSet = fBurmeseWordSet;
642	fBeginWordSet.add(`0x1000`, `0x102A`); // basic consonants and independent vowels
643
644	// Compact for caching.
645	fMarkSet.compact();
646	fEndWordSet.compact();
647	fBeginWordSet.compact();
648	}
649
650	BurmeseBreakEngine::~BurmeseBreakEngine() {
651	delete fDictionary;
652	}
653
654	int32_t
655	BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
656	int32_t rangeStart,
657	int32_t rangeEnd,
658	UVector32 &foundBreaks ) const {
659	if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
660	return `0`; // Not enough characters for two words
661	}
662
663	uint32_t wordsFound = `0`;
664	int32_t cpWordLength = `0`;
665	int32_t cuWordLength = `0`;
666	int32_t current;
667	UErrorCode status = U_ZERO_ERROR;
668	PossibleWord words[BURMESE_LOOKAHEAD];
669
670	utext_setNativeIndex(text, rangeStart);
671
672	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
673	cuWordLength = `0`;
674	cpWordLength = `0`;
675
676	// Look for candidate words at the current position
677	int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
678
679	// If we found exactly one, use that
680	if (candidates == `1`) {
681	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
682	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
683	wordsFound += `1`;
684	}
685	// If there was more than one, see which one can take us forward the most words
686	else if (candidates > `1`) {
687	// If we're already at the end of the range, we're done
688	if (utext_getNativeIndex(text) >= rangeEnd) {
689	goto foundBest;
690	}
691	do {
692	int32_t wordsMatched = `1`;
693	if (words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
694	if (wordsMatched < `2`) {
695	// Followed by another dictionary word; mark first word as a good candidate
696	words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
697	wordsMatched = `2`;
698	}
699
700	// If we're already at the end of the range, we're done
701	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
702	goto foundBest;
703	}
704
705	// See if any of the possible second words is followed by a third word
706	do {
707	// If we find a third word, stop right away
708	if (words[(wordsFound + `2`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
709	words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
710	goto foundBest;
711	}
712	}
713	while (words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].backUp(text));
714	}
715	}
716	while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
717	foundBest:
718	cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
719	cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
720	wordsFound += `1`;
721	}
722
723	// We come here after having either found a word or not. We look ahead to the
724	// next word. If it's not a dictionary word, we will combine it withe the word we
725	// just found (if there is one), but only if the preceding word does not exceed
726	// the threshold.
727	// The text iterator should now be positioned at the end of the word we found.
728	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
729	// if it is a dictionary word, do nothing. If it isn't, then if there is
730	// no preceding word, or the non-word shares less than the minimum threshold
731	// of characters with a dictionary word, then scan to resynchronize
732	if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
733	&& (cuWordLength == `0`
734	\|\| words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
735	// Look for a plausible word boundary
736	int32_t remaining = rangeEnd - (current + cuWordLength);
737	UChar32 pc;
738	UChar32 uc;
739	int32_t chars = `0`;
740	for (;;) {
741	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
742	pc = utext_next32(text);
743	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
744	chars += pcSize;
745	remaining -= pcSize;
746	if (remaining <= `0`) {
747	break;
748	}
749	uc = utext_current32(text);
750	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
751	// Maybe. See if it's in the dictionary.
752	// TODO: this looks iffy; compare with old code.
753	int32_t num_candidates = words[(wordsFound + `1`) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
754	utext_setNativeIndex(text, current + cuWordLength + chars);
755	if (num_candidates > `0`) {
756	break;
757	}
758	}
759	}
760
761	// Bump the word count if there wasn't already one
762	if (cuWordLength <= `0`) {
763	wordsFound += `1`;
764	}
765
766	// Update the length with the passed-over characters
767	cuWordLength += chars;
768	}
769	else {
770	// Back up to where we were for next iteration
771	utext_setNativeIndex(text, current + cuWordLength);
772	}
773	}
774
775	// Never stop before a combining mark.
776	int32_t currPos;
777	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
778	utext_next32(text);
779	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
780	}
781
782	// Look ahead for possible suffixes if a dictionary word does not follow.
783	// We do this in code rather than using a rule so that the heuristic
784	// resynch continues to function. For example, one of the suffix characters
785	// could be a typo in the middle of a word.
786	// NOT CURRENTLY APPLICABLE TO BURMESE
787
788	// Did we find a word on this iteration? If so, push it on the break stack
789	if (cuWordLength > `0`) {
790	foundBreaks.push((current+cuWordLength), status);
791	}
792	}
793
794	// Don't return a break for the end of the dictionary range if there is one there.
795	if (foundBreaks.peeki() >= rangeEnd) {
796	(void) foundBreaks.popi();
797	wordsFound -= `1`;
798	}
799
800	return wordsFound;
801	}
802
803	/*
804	******************************************************************
805	* KhmerBreakEngine
806	*/
807
808	// How many words in a row are "good enough"?
// How many words in a row are "good enough"?
static const int32_t KHMER_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static const int32_t KHMER_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
823
824	KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
825	: DictionaryBreakEngine (),
826	fDictionary(adoptDictionary)
827	{
828	fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
829	if (U_SUCCESS(status)) {
830	setCharacters(fKhmerWordSet);
831	}
832	fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
833	fMarkSet.add(`0x0020`);
834	fEndWordSet = fKhmerWordSet;
835	fBeginWordSet.add(`0x1780`, `0x17B3`);
836	//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
837	//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
838	//fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
839	fEndWordSet.remove(`0x17D2`); // KHMER SIGN COENG that combines some following characters
840	//fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
841	// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
842	// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
843	// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
844	// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
845	// fSuffixSet.add(THAI_PAIYANNOI);
846	// fSuffixSet.add(THAI_MAIYAMOK);
847
848	// Compact for caching.
849	fMarkSet.compact();
850	fEndWordSet.compact();
851	fBeginWordSet.compact();
852	// fSuffixSet.compact();
853	}
854
855	KhmerBreakEngine::~KhmerBreakEngine() {
856	delete fDictionary;
857	}
858
859	int32_t
860	KhmerBreakEngine::divideUpDictionaryRange( UText *text,
861	int32_t rangeStart,
862	int32_t rangeEnd,
863	UVector32 &foundBreaks ) const {
864	if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
865	return `0`; // Not enough characters for two words
866	}
867
868	uint32_t wordsFound = `0`;
869	int32_t cpWordLength = `0`;
870	int32_t cuWordLength = `0`;
871	int32_t current;
872	UErrorCode status = U_ZERO_ERROR;
873	PossibleWord words[KHMER_LOOKAHEAD];
874
875	utext_setNativeIndex(text, rangeStart);
876
877	while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
878	cuWordLength = `0`;
879	cpWordLength = `0`;
880
881	// Look for candidate words at the current position
882	int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
883
884	// If we found exactly one, use that
885	if (candidates == `1`) {
886	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
887	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
888	wordsFound += `1`;
889	}
890
891	// If there was more than one, see which one can take us forward the most words
892	else if (candidates > `1`) {
893	// If we're already at the end of the range, we're done
894	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
895	goto foundBest;
896	}
897	do {
898	int32_t wordsMatched = `1`;
899	if (words[(wordsFound + `1`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > `0`) {
900	if (wordsMatched < `2`) {
901	// Followed by another dictionary word; mark first word as a good candidate
902	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
903	wordsMatched = `2`;
904	}
905
906	// If we're already at the end of the range, we're done
907	if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
908	goto foundBest;
909	}
910
911	// See if any of the possible second words is followed by a third word
912	do {
913	// If we find a third word, stop right away
914	if (words[(wordsFound + `2`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
915	words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
916	goto foundBest;
917	}
918	}
919	while (words[(wordsFound + `1`) % KHMER_LOOKAHEAD].backUp(text));
920	}
921	}
922	while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
923	foundBest:
924	cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
925	cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
926	wordsFound += `1`;
927	}
928
929	// We come here after having either found a word or not. We look ahead to the
930	// next word. If it's not a dictionary word, we will combine it with the word we
931	// just found (if there is one), but only if the preceding word does not exceed
932	// the threshold.
933	// The text iterator should now be positioned at the end of the word we found.
934	if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
935	// if it is a dictionary word, do nothing. If it isn't, then if there is
936	// no preceding word, or the non-word shares less than the minimum threshold
937	// of characters with a dictionary word, then scan to resynchronize
938	if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= `0`
939	&& (cuWordLength == `0`
940	\|\| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
941	// Look for a plausible word boundary
942	int32_t remaining = rangeEnd - (current+cuWordLength);
943	UChar32 pc;
944	UChar32 uc;
945	int32_t chars = `0`;
946	for (;;) {
947	int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
948	pc = utext_next32(text);
949	int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
950	chars += pcSize;
951	remaining -= pcSize;
952	if (remaining <= `0`) {
953	break;
954	}
955	uc = utext_current32(text);
956	if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
957	// Maybe. See if it's in the dictionary.
958	int32_t num_candidates = words[(wordsFound + `1`) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
959	utext_setNativeIndex(text, current+cuWordLength+chars);
960	if (num_candidates > `0`) {
961	break;
962	}
963	}
964	}
965
966	// Bump the word count if there wasn't already one
967	if (cuWordLength <= `0`) {
968	wordsFound += `1`;
969	}
970
971	// Update the length with the passed-over characters
972	cuWordLength += chars;
973	}
974	else {
975	// Back up to where we were for next iteration
976	utext_setNativeIndex(text, current+cuWordLength);
977	}
978	}
979
980	// Never stop before a combining mark.
981	int32_t currPos;
982	while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
983	utext_next32(text);
984	cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
985	}
986
987	// Look ahead for possible suffixes if a dictionary word does not follow.
988	// We do this in code rather than using a rule so that the heuristic
989	// resynch continues to function. For example, one of the suffix characters
990	// could be a typo in the middle of a word.
991	// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
992	// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
993	// && fSuffixSet.contains(uc = utext_current32(text))) {
994	// if (uc == KHMER_PAIYANNOI) {
995	// if (!fSuffixSet.contains(utext_previous32(text))) {
996	// // Skip over previous end and PAIYANNOI
997	// utext_next32(text);
998	// utext_next32(text);
999	// wordLength += 1; // Add PAIYANNOI to word
1000	// uc = utext_current32(text); // Fetch next character
1001	// }
1002	// else {
1003	// // Restore prior position
1004	// utext_next32(text);
1005	// }
1006	// }
1007	// if (uc == KHMER_MAIYAMOK) {
1008	// if (utext_previous32(text) != KHMER_MAIYAMOK) {
1009	// // Skip over previous end and MAIYAMOK
1010	// utext_next32(text);
1011	// utext_next32(text);
1012	// wordLength += 1; // Add MAIYAMOK to word
1013	// }
1014	// else {
1015	// // Restore prior position
1016	// utext_next32(text);
1017	// }
1018	// }
1019	// }
1020	// else {
1021	// utext_setNativeIndex(text, current+wordLength);
1022	// }
1023	// }
1024
1025	// Did we find a word on this iteration? If so, push it on the break stack
1026	if (cuWordLength > `0`) {
1027	foundBreaks.push((current+cuWordLength), status);
1028	}
1029	}
1030
1031	// Don't return a break for the end of the dictionary range if there is one there.
1032	if (foundBreaks.peeki() >= rangeEnd) {
1033	(void) foundBreaks.popi();
1034	wordsFound -= `1`;
1035	}
1036
1037	return wordsFound;
1038	}
1039
1040	#if !UCONFIG_NO_NORMALIZATION
1041	/*
1042	******************************************************************
1043	* CjkBreakEngine
1044	*/
// Largest value representable in a uint32_t.
static const uint32_t kuint32max = 0xFFFFFFFFu;
1046	CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1047	: DictionaryBreakEngine (), fDictionary(adoptDictionary) {
1048	// Korean dictionary only includes Hangul syllables
1049	fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1050	fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1051	fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1052	fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1053	nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1054
1055	if (U_SUCCESS(status)) {
1056	// handle Korean and Japanese/Chinese using different dictionaries
1057	if (type == kKorean) {
1058	setCharacters(fHangulWordSet);
1059	} else { //Chinese and Japanese
1060	UnicodeSet cjSet;
1061	cjSet.addAll(fHanWordSet);
1062	cjSet.addAll(fKatakanaWordSet);
1063	cjSet.addAll(fHiraganaWordSet);
1064	cjSet.add(`0xFF70`); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1065	cjSet.add(`0x30FC`); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1066	setCharacters(cjSet);
1067	}
1068	}
1069	}
1070
1071	CjkBreakEngine::~CjkBreakEngine(){
1072	delete fDictionary;
1073	}
1074
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = 8;
static const int32_t kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

/**
 * Returns the cost of treating a run of wordLength katakana characters as one word.
 *
 * @param wordLength Length of the katakana run.
 * @return The table entry for lengths 0..kMaxKatakanaLength; 8192 for any
 *         length outside that range.
 */
static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    // Check both ends of the table so that a negative length can never index
    // in front of the array.
    if (wordLength < 0 || wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
1087
1088	static inline bool isKatakana(UChar32 value) {
1089	return (value >= `0x30A1` && value <= `0x30FE` && value != `0x30FB`) \|\|
1090	(value >= `0xFF66` && value <= `0xFF9f`);
1091	}
1092
1093
// Returns an int32_t with only bit number bitIndex set, for testing UText flag words.
// Replicates an internal UText function.
static inline int32_t utext_i32_flag(int32_t bitIndex) {
    const int32_t one = 1;
    return one << bitIndex;
}
1100
1101
1102	/*
1103	* @param text A UText representing the text
1104	* @param rangeStart The start of the range of dictionary characters
1105	* @param rangeEnd The end of the range of dictionary characters
1106	* @param foundBreaks vector<int32> to receive the break positions
1107	* @return The number of breaks found
1108	*/
1109	int32_t
1110	CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1111	int32_t rangeStart,
1112	int32_t rangeEnd,
1113	UVector32 &foundBreaks ) const {
1114	if (rangeStart >= rangeEnd) {
1115	return `0`;
1116	}
1117
1118	// UnicodeString version of input UText, NFKC normalized if necessary.
1119	UnicodeString inString;
1120
1121	// inputMap[inStringIndex] = corresponding native index from UText inText.
1122	// If NULL then mapping is 1:1
1123	LocalPointer<UVector32> inputMap;
1124
1125	UErrorCode status = U_ZERO_ERROR;
1126
1127
1128	// if UText has the input string as one contiguous UTF-16 chunk
1129	if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1130	inText->chunkNativeStart <= rangeStart &&
1131	inText->chunkNativeLimit >= rangeEnd &&
1132	inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1133
1134	// Input UText is in one contiguous UTF-16 chunk.
1135	// Use Read-only aliasing UnicodeString.
1136	inString.setTo(FALSE,
1137	inText->chunkContents + rangeStart - inText->chunkNativeStart,
1138	rangeEnd - rangeStart);
1139	} else {
1140	// Copy the text from the original inText (UText) to inString (UnicodeString).
1141	// Create a map from UnicodeString indices -> UText offsets.
1142	utext_setNativeIndex(inText, rangeStart);
1143	int32_t limit = rangeEnd;
1144	U_ASSERT(limit <= utext_nativeLength(inText));
1145	if (limit > utext_nativeLength(inText)) {
1146	limit = (int32_t)utext_nativeLength(inText);
1147	}
1148	inputMap.adoptInsteadAndCheckErrorCode(new UVector32 (status), status);
1149	if (U_FAILURE(status)) {
1150	return `0`;
1151	}
1152	while (utext_getNativeIndex(inText) < limit) {
1153	int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1154	UChar32 c = utext_next32(inText);
1155	U_ASSERT(c != U_SENTINEL);
1156	inString.append(c);
1157	while (inputMap ->size() < inString.length()) {
1158	inputMap ->addElement(nativePosition, status);
1159	}
1160	}
1161	inputMap ->addElement(limit, status);
1162	}
1163
1164
1165	if (!nfkcNorm2->isNormalized(inString, status)) {
1166	UnicodeString normalizedInput;
1167	// normalizedMap[normalizedInput position] == original UText position.
1168	LocalPointer<UVector32> normalizedMap(new UVector32 (status), status);
1169	if (U_FAILURE(status)) {
1170	return `0`;
1171	}
1172
1173	UnicodeString fragment;
1174	UnicodeString normalizedFragment;
1175	for (int32_t srcI = `0`; srcI < inString.length();) { // Once per normalization chunk
1176	fragment.remove();
1177	int32_t fragmentStartI = srcI;
1178	UChar32 c = inString.char32At(srcI);
1179	for (;;) {
1180	fragment.append(c);
1181	srcI = inString.moveIndex32(srcI, `1`);
1182	if (srcI == inString.length()) {
1183	break;
1184	}
1185	c = inString.char32At(srcI);
1186	if (nfkcNorm2->hasBoundaryBefore(c)) {
1187	break;
1188	}
1189	}
1190	nfkcNorm2->normalize(fragment, normalizedFragment, status);
1191	normalizedInput.append(normalizedFragment);
1192
1193	// Map every position in the normalized chunk to the start of the chunk
1194	// in the original input.
1195	int32_t fragmentOriginalStart = inputMap.isValid() ?
1196	inputMap ->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1197	while (normalizedMap ->size() < normalizedInput.length()) {
1198	normalizedMap ->addElement(fragmentOriginalStart, status);
1199	if (U_FAILURE(status)) {
1200	break;
1201	}
1202	}
1203	}
1204	U_ASSERT(normalizedMap->size() == normalizedInput.length());
1205	int32_t nativeEnd = inputMap.isValid() ?
1206	inputMap ->elementAti(inString.length()) : inString.length()+rangeStart;
1207	normalizedMap ->addElement(nativeEnd, status);
1208
1209	inputMap = std::move(normalizedMap);
1210	inString = std::move(normalizedInput);
1211	}
1212
1213	int32_t numCodePts = inString.countChar32();
1214	if (numCodePts != inString.length()) {
1215	// There are supplementary characters in the input.
1216	// The dictionary will produce boundary positions in terms of code point indexes,
1217	// not in terms of code unit string indexes.
1218	// Use the inputMap mechanism to take care of this in addition to indexing differences
1219	// from normalization and/or UTF-8 input.
1220	UBool hadExistingMap = inputMap.isValid();
1221	if (!hadExistingMap) {
1222	inputMap.adoptInsteadAndCheckErrorCode(new UVector32 (status), status);
1223	if (U_FAILURE(status)) {
1224	return `0`;
1225	}
1226	}
1227	int32_t cpIdx = `0`;
1228	for (int32_t cuIdx = `0`; ; cuIdx = inString.moveIndex32(cuIdx, `1`)) {
1229	U_ASSERT(cuIdx >= cpIdx);
1230	if (hadExistingMap) {
1231	inputMap ->setElementAt(inputMap ->elementAti(cuIdx), cpIdx);
1232	} else {
1233	inputMap ->addElement(cuIdx+rangeStart, status);
1234	}
1235	cpIdx++;
1236	if (cuIdx == inString.length()) {
1237	break;
1238	}
1239	}
1240	}
1241
1242	// bestSnlp[i] is the snlp of the best segmentation of the first i
1243	// code points in the range to be matched.
1244	UVector32 bestSnlp(numCodePts + `1`, status);
1245	bestSnlp.addElement(`0`, status);
1246	for(int32_t i = `1`; i <= numCodePts; i++) {
1247	bestSnlp.addElement(kuint32max, status);
1248	}
1249
1250
1251	// prev[i] is the index of the last CJK code point in the previous word in
1252	// the best segmentation of the first i characters.
1253	UVector32 prev(numCodePts + `1`, status);
1254	for(int32_t i = `0`; i <= numCodePts; i++){
1255	prev.addElement(-`1`, status);
1256	}
1257
1258	const int32_t maxWordSize = `20`;
1259	UVector32 values(numCodePts, status);
1260	values.setSize(numCodePts);
1261	UVector32 lengths(numCodePts, status);
1262	lengths.setSize(numCodePts);
1263
1264	UText fu = UTEXT_INITIALIZER;
1265	utext_openUnicodeString(&fu, &inString, &status);
1266
1267	// Dynamic programming to find the best segmentation.
1268
1269	// In outer loop, i is the code point index,
1270	// ix is the corresponding string (code unit) index.
1271	// They differ when the string contains supplementary characters.
1272	int32_t ix = `0`;
1273	bool is_prev_katakana = false;
1274	for (int32_t i = `0`; i < numCodePts; ++i, ix = inString.moveIndex32(ix, `1`)) {
1275	if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1276	continue;
1277	}
1278
1279	int32_t count;
1280	utext_setNativeIndex(&fu, ix);
1281	count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1282	NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1283	// Note: lengths is filled with code point lengths
1284	// The NULL parameter is the ignored code unit lengths.
1285
1286	// if there are no single character matches found in the dictionary
1287	// starting with this character, treat character as a 1-character word
1288	// with the highest value possible, i.e. the least likely to occur.
1289	// Exclude Korean characters from this treatment, as they should be left
1290	// together by default.
1291	if ((count == `0` \|\| lengths.elementAti(`0`) != `1`) &&
1292	!fHangulWordSet.contains(inString.char32At(ix))) {
1293	values.setElementAt(maxSnlp, count); // 255
1294	lengths.setElementAt(`1`, count++);
1295	}
1296
1297	for (int32_t j = `0`; j < count; j++) {
1298	uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1299	int32_t ln_j_i = lengths.elementAti(j) + i;
1300	if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1301	bestSnlp.setElementAt(newSnlp, ln_j_i);
1302	prev.setElementAt(i, ln_j_i);
1303	}
1304	}
1305
1306	// In Japanese,
1307	// Katakana word in single character is pretty rare. So we apply
1308	// the following heuristic to Katakana: any continuous run of Katakana
1309	// characters is considered a candidate word with a default cost
1310	// specified in the katakanaCost table according to its length.
1311
1312	bool is_katakana = isKatakana(inString.char32At(ix));
1313	int32_t katakanaRunLength = `1`;
1314	if (!is_prev_katakana && is_katakana) {
1315	int32_t j = inString.moveIndex32(ix, `1`);
1316	// Find the end of the continuous run of Katakana characters
1317	while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1318	isKatakana(inString.char32At(j))) {
1319	j = inString.moveIndex32(j, `1`);
1320	katakanaRunLength++;
1321	}
1322	if (katakanaRunLength < kMaxKatakanaGroupLength) {
1323	uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1324	if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1325	bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1326	prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1327	}
1328	}
1329	}
1330	is_prev_katakana = is_katakana;
1331	}
1332	utext_close(&fu);
1333
1334	// Start pushing the optimal offset index into t_boundary (t for tentative).
1335	// prev[numCodePts] is guaranteed to be meaningful.
1336	// We'll first push in the reverse order, i.e.,
1337	// t_boundary[0] = numCodePts, and afterwards do a swap.
1338	UVector32 t_boundary(numCodePts+`1`, status);
1339
1340	int32_t numBreaks = `0`;
1341	// No segmentation found, set boundary to end of range
1342	if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1343	t_boundary.addElement(numCodePts, status);
1344	numBreaks++;
1345	} else {
1346	for (int32_t i = numCodePts; i > `0`; i = prev.elementAti(i)) {
1347	t_boundary.addElement(i, status);
1348	numBreaks++;
1349	}
1350	U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - `1`)) == `0`);
1351	}
1352
1353	// Add a break for the start of the dictionary range if there is not one
1354	// there already.
1355	if (foundBreaks.size() == `0` \|\| foundBreaks.peeki() < rangeStart) {
1356	t_boundary.addElement(`0`, status);
1357	numBreaks++;
1358	}
1359
1360	// Now that we're done, convert positions in t_boundary[] (indices in
1361	// the normalized input string) back to indices in the original input UText
1362	// while reversing t_boundary and pushing values to foundBreaks.
1363	int32_t prevCPPos = -`1`;
1364	int32_t prevUTextPos = -`1`;
1365	for (int32_t i = numBreaks-`1`; i >= `0`; i--) {
1366	int32_t cpPos = t_boundary.elementAti(i);
1367	U_ASSERT(cpPos > prevCPPos);
1368	int32_t utextPos = inputMap.isValid() ? inputMap ->elementAti(cpPos) : cpPos + rangeStart;
1369	U_ASSERT(utextPos >= prevUTextPos);
1370	if (utextPos > prevUTextPos) {
1371	// Boundaries are added to foundBreaks output in ascending order.
1372	U_ASSERT(foundBreaks.size() == `0` \|\| foundBreaks.peeki() < utextPos);
1373	foundBreaks.push(utextPos, status);
1374	} else {
1375	// Normalization expanded the input text, the dictionary found a boundary
1376	// within the expansion, giving two boundaries with the same index in the
1377	// original text. Ignore the second. See ticket #12918.
1378	--numBreaks;
1379	}
1380	prevCPPos = cpPos;
1381	prevUTextPos = utextPos;
1382	}
1383	(void)prevCPPos; // suppress compiler warnings about unused variable
1384
1385	// inString goes out of scope
1386	// inputMap goes out of scope
1387	return numBreaks;
1388	}
1389	#endif
1390
1391	U_NAMESPACE_END
1392
1393	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1394
1395

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/dictbe.cpp