1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/**
4 *******************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10#include <utility>
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "brkeng.h"
17#include "dictbe.h"
18#include "unicode/uniset.h"
19#include "unicode/chariter.h"
20#include "unicode/ubrk.h"
21#include "utracimp.h"
22#include "uvectr32.h"
23#include "uvector.h"
24#include "uassert.h"
25#include "unicode/normlzr.h"
26#include "cmemory.h"
27#include "dictionarydata.h"
28
29U_NAMESPACE_BEGIN
30
31/*
32 ******************************************************************
33 */
34
35DictionaryBreakEngine::DictionaryBreakEngine() {
36}
37
38DictionaryBreakEngine::~DictionaryBreakEngine() {
39}
40
41UBool
42DictionaryBreakEngine::handles(UChar32 c) const {
43 return fSet.contains(c);
44}
45
46int32_t
47DictionaryBreakEngine::findBreaks( UText *text,
48 int32_t startPos,
49 int32_t endPos,
50 UVector32 &foundBreaks ) const {
51 (void)startPos; // TODO: remove this param?
52 int32_t result = 0;
53
54 // Find the span of characters included in the set.
55 // The span to break begins at the current position in the text, and
56 // extends towards the start or end of the text, depending on 'reverse'.
57
58 int32_t start = (int32_t)utext_getNativeIndex(text);
59 int32_t current;
60 int32_t rangeStart;
61 int32_t rangeEnd;
62 UChar32 c = utext_current32(text);
63 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
64 utext_next32(text); // TODO: recast loop for postincrement
65 c = utext_current32(text);
66 }
67 rangeStart = start;
68 rangeEnd = current;
69 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
70 utext_setNativeIndex(text, current);
71
72 return result;
73}
74
75void
76DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
77 fSet = set;
78 // Compact for caching
79 fSet.compact();
80}
81
82/*
83 ******************************************************************
84 * PossibleWord
85 */
86
87// Helper class for improving readability of the Thai/Lao/Khmer word break
88// algorithm. The implementation is completely inline.
89
90// List size, limited by the maximum number of words in the dictionary
91// that form a nested sequence.
92static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
93
94class PossibleWord {
95private:
96 // list of word candidate lengths, in increasing length order
97 // TODO: bytes would be sufficient for word lengths.
98 int32_t count; // Count of candidates
99 int32_t prefix; // The longest match with a dictionary word
100 int32_t offset; // Offset in the text of these candidates
101 int32_t mark; // The preferred candidate's offset
102 int32_t current; // The candidate we're currently looking at
103 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
104 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
105
106public:
107 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
108 ~PossibleWord() {}
109
110 // Fill the list of candidates if needed, select the longest, and return the number found
111 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
112
113 // Select the currently marked candidate, point after it in the text, and invalidate self
114 int32_t acceptMarked( UText *text );
115
116 // Back up from the current candidate to the next shorter one; return TRUE if that exists
117 // and point the text after it
118 UBool backUp( UText *text );
119
120 // Return the longest prefix this candidate location shares with a dictionary word
121 // Return value is in code points.
122 int32_t longestPrefix() { return prefix; }
123
124 // Mark the current candidate as the one we like
125 void markCurrent() { mark = current; }
126
127 // Get length in code points of the marked word.
128 int32_t markedCPLength() { return cpLengths[mark]; }
129};
130
131
132int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
133 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
134 int32_t start = (int32_t)utext_getNativeIndex(text);
135 if (start != offset) {
136 offset = start;
137 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
138 // Dictionary leaves text after longest prefix, not longest word. Back up.
139 if (count <= 0) {
140 utext_setNativeIndex(text, start);
141 }
142 }
143 if (count > 0) {
144 utext_setNativeIndex(text, start+cuLengths[count-1]);
145 }
146 current = count-1;
147 mark = current;
148 return count;
149}
150
151int32_t
152PossibleWord::acceptMarked( UText *text ) {
153 utext_setNativeIndex(text, offset + cuLengths[mark]);
154 return cuLengths[mark];
155}
156
157
158UBool
159PossibleWord::backUp( UText *text ) {
160 if (current > 0) {
161 utext_setNativeIndex(text, offset + cuLengths[--current]);
162 return TRUE;
163 }
164 return FALSE;
165}
166
167/*
168 ******************************************************************
169 * ThaiBreakEngine
170 */
171
172// How many words in a row are "good enough"?
173static const int32_t THAI_LOOKAHEAD = 3;
174
175// Will not combine a non-word with a preceding dictionary word longer than this
176static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
177
178// Will not combine a non-word that shares at least this much prefix with a
179// dictionary word, with a preceding word
180static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
181
182// Ellision character
183static const int32_t THAI_PAIYANNOI = 0x0E2F;
184
185// Repeat character
186static const int32_t THAI_MAIYAMOK = 0x0E46;
187
188// Minimum word size
189static const int32_t THAI_MIN_WORD = 2;
190
191// Minimum number of characters for two words
192static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
193
194ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
195 : DictionaryBreakEngine(),
196 fDictionary(adoptDictionary)
197{
198 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
199 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
200 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
201 if (U_SUCCESS(status)) {
202 setCharacters(fThaiWordSet);
203 }
204 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
205 fMarkSet.add(0x0020);
206 fEndWordSet = fThaiWordSet;
207 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
208 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
209 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
210 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
211 fSuffixSet.add(THAI_PAIYANNOI);
212 fSuffixSet.add(THAI_MAIYAMOK);
213
214 // Compact for caching.
215 fMarkSet.compact();
216 fEndWordSet.compact();
217 fBeginWordSet.compact();
218 fSuffixSet.compact();
219 UTRACE_EXIT_STATUS(status);
220}
221
222ThaiBreakEngine::~ThaiBreakEngine() {
223 delete fDictionary;
224}
225
226int32_t
227ThaiBreakEngine::divideUpDictionaryRange( UText *text,
228 int32_t rangeStart,
229 int32_t rangeEnd,
230 UVector32 &foundBreaks ) const {
231 utext_setNativeIndex(text, rangeStart);
232 utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
233 if (utext_getNativeIndex(text) >= rangeEnd) {
234 return 0; // Not enough characters for two words
235 }
236 utext_setNativeIndex(text, rangeStart);
237
238
239 uint32_t wordsFound = 0;
240 int32_t cpWordLength = 0; // Word Length in Code Points.
241 int32_t cuWordLength = 0; // Word length in code units (UText native indexing)
242 int32_t current;
243 UErrorCode status = U_ZERO_ERROR;
244 PossibleWord words[THAI_LOOKAHEAD];
245
246 utext_setNativeIndex(text, rangeStart);
247
248 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
249 cpWordLength = 0;
250 cuWordLength = 0;
251
252 // Look for candidate words at the current position
253 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
254
255 // If we found exactly one, use that
256 if (candidates == 1) {
257 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
258 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
259 wordsFound += 1;
260 }
261 // If there was more than one, see which one can take us forward the most words
262 else if (candidates > 1) {
263 // If we're already at the end of the range, we're done
264 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
265 goto foundBest;
266 }
267 do {
268 int32_t wordsMatched = 1;
269 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
270 if (wordsMatched < 2) {
271 // Followed by another dictionary word; mark first word as a good candidate
272 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
273 wordsMatched = 2;
274 }
275
276 // If we're already at the end of the range, we're done
277 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
278 goto foundBest;
279 }
280
281 // See if any of the possible second words is followed by a third word
282 do {
283 // If we find a third word, stop right away
284 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
285 words[wordsFound % THAI_LOOKAHEAD].markCurrent();
286 goto foundBest;
287 }
288 }
289 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
290 }
291 }
292 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
293foundBest:
294 // Set UText position to after the accepted word.
295 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
296 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
297 wordsFound += 1;
298 }
299
300 // We come here after having either found a word or not. We look ahead to the
301 // next word. If it's not a dictionary word, we will combine it with the word we
302 // just found (if there is one), but only if the preceding word does not exceed
303 // the threshold.
304 // The text iterator should now be positioned at the end of the word we found.
305
306 UChar32 uc = 0;
307 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
308 // if it is a dictionary word, do nothing. If it isn't, then if there is
309 // no preceding word, or the non-word shares less than the minimum threshold
310 // of characters with a dictionary word, then scan to resynchronize
311 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
312 && (cuWordLength == 0
313 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
314 // Look for a plausible word boundary
315 int32_t remaining = rangeEnd - (current+cuWordLength);
316 UChar32 pc;
317 int32_t chars = 0;
318 for (;;) {
319 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
320 pc = utext_next32(text);
321 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
322 chars += pcSize;
323 remaining -= pcSize;
324 if (remaining <= 0) {
325 break;
326 }
327 uc = utext_current32(text);
328 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
329 // Maybe. See if it's in the dictionary.
330 // NOTE: In the original Apple code, checked that the next
331 // two characters after uc were not 0x0E4C THANTHAKHAT before
332 // checking the dictionary. That is just a performance filter,
333 // but it's not clear it's faster than checking the trie.
334 int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
335 utext_setNativeIndex(text, current + cuWordLength + chars);
336 if (num_candidates > 0) {
337 break;
338 }
339 }
340 }
341
342 // Bump the word count if there wasn't already one
343 if (cuWordLength <= 0) {
344 wordsFound += 1;
345 }
346
347 // Update the length with the passed-over characters
348 cuWordLength += chars;
349 }
350 else {
351 // Back up to where we were for next iteration
352 utext_setNativeIndex(text, current+cuWordLength);
353 }
354 }
355
356 // Never stop before a combining mark.
357 int32_t currPos;
358 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
359 utext_next32(text);
360 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
361 }
362
363 // Look ahead for possible suffixes if a dictionary word does not follow.
364 // We do this in code rather than using a rule so that the heuristic
365 // resynch continues to function. For example, one of the suffix characters
366 // could be a typo in the middle of a word.
367 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
368 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
369 && fSuffixSet.contains(uc = utext_current32(text))) {
370 if (uc == THAI_PAIYANNOI) {
371 if (!fSuffixSet.contains(utext_previous32(text))) {
372 // Skip over previous end and PAIYANNOI
373 utext_next32(text);
374 int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
375 utext_next32(text);
376 cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
377 uc = utext_current32(text); // Fetch next character
378 }
379 else {
380 // Restore prior position
381 utext_next32(text);
382 }
383 }
384 if (uc == THAI_MAIYAMOK) {
385 if (utext_previous32(text) != THAI_MAIYAMOK) {
386 // Skip over previous end and MAIYAMOK
387 utext_next32(text);
388 int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
389 utext_next32(text);
390 cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
391 }
392 else {
393 // Restore prior position
394 utext_next32(text);
395 }
396 }
397 }
398 else {
399 utext_setNativeIndex(text, current+cuWordLength);
400 }
401 }
402
403 // Did we find a word on this iteration? If so, push it on the break stack
404 if (cuWordLength > 0) {
405 foundBreaks.push((current+cuWordLength), status);
406 }
407 }
408
409 // Don't return a break for the end of the dictionary range if there is one there.
410 if (foundBreaks.peeki() >= rangeEnd) {
411 (void) foundBreaks.popi();
412 wordsFound -= 1;
413 }
414
415 return wordsFound;
416}
417
418/*
419 ******************************************************************
420 * LaoBreakEngine
421 */
422
423// How many words in a row are "good enough"?
424static const int32_t LAO_LOOKAHEAD = 3;
425
426// Will not combine a non-word with a preceding dictionary word longer than this
427static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
428
429// Will not combine a non-word that shares at least this much prefix with a
430// dictionary word, with a preceding word
431static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
432
433// Minimum word size
434static const int32_t LAO_MIN_WORD = 2;
435
436// Minimum number of characters for two words
437static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
438
439LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
440 : DictionaryBreakEngine(),
441 fDictionary(adoptDictionary)
442{
443 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
444 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
445 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
446 if (U_SUCCESS(status)) {
447 setCharacters(fLaoWordSet);
448 }
449 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
450 fMarkSet.add(0x0020);
451 fEndWordSet = fLaoWordSet;
452 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
453 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
454 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
455 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
456
457 // Compact for caching.
458 fMarkSet.compact();
459 fEndWordSet.compact();
460 fBeginWordSet.compact();
461 UTRACE_EXIT_STATUS(status);
462}
463
464LaoBreakEngine::~LaoBreakEngine() {
465 delete fDictionary;
466}
467
468int32_t
469LaoBreakEngine::divideUpDictionaryRange( UText *text,
470 int32_t rangeStart,
471 int32_t rangeEnd,
472 UVector32 &foundBreaks ) const {
473 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
474 return 0; // Not enough characters for two words
475 }
476
477 uint32_t wordsFound = 0;
478 int32_t cpWordLength = 0;
479 int32_t cuWordLength = 0;
480 int32_t current;
481 UErrorCode status = U_ZERO_ERROR;
482 PossibleWord words[LAO_LOOKAHEAD];
483
484 utext_setNativeIndex(text, rangeStart);
485
486 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
487 cuWordLength = 0;
488 cpWordLength = 0;
489
490 // Look for candidate words at the current position
491 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
492
493 // If we found exactly one, use that
494 if (candidates == 1) {
495 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
496 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
497 wordsFound += 1;
498 }
499 // If there was more than one, see which one can take us forward the most words
500 else if (candidates > 1) {
501 // If we're already at the end of the range, we're done
502 if (utext_getNativeIndex(text) >= rangeEnd) {
503 goto foundBest;
504 }
505 do {
506 int32_t wordsMatched = 1;
507 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
508 if (wordsMatched < 2) {
509 // Followed by another dictionary word; mark first word as a good candidate
510 words[wordsFound%LAO_LOOKAHEAD].markCurrent();
511 wordsMatched = 2;
512 }
513
514 // If we're already at the end of the range, we're done
515 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
516 goto foundBest;
517 }
518
519 // See if any of the possible second words is followed by a third word
520 do {
521 // If we find a third word, stop right away
522 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
523 words[wordsFound % LAO_LOOKAHEAD].markCurrent();
524 goto foundBest;
525 }
526 }
527 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
528 }
529 }
530 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
531foundBest:
532 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
533 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
534 wordsFound += 1;
535 }
536
537 // We come here after having either found a word or not. We look ahead to the
538 // next word. If it's not a dictionary word, we will combine it withe the word we
539 // just found (if there is one), but only if the preceding word does not exceed
540 // the threshold.
541 // The text iterator should now be positioned at the end of the word we found.
542 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
543 // if it is a dictionary word, do nothing. If it isn't, then if there is
544 // no preceding word, or the non-word shares less than the minimum threshold
545 // of characters with a dictionary word, then scan to resynchronize
546 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
547 && (cuWordLength == 0
548 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
549 // Look for a plausible word boundary
550 int32_t remaining = rangeEnd - (current + cuWordLength);
551 UChar32 pc;
552 UChar32 uc;
553 int32_t chars = 0;
554 for (;;) {
555 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
556 pc = utext_next32(text);
557 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
558 chars += pcSize;
559 remaining -= pcSize;
560 if (remaining <= 0) {
561 break;
562 }
563 uc = utext_current32(text);
564 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
565 // Maybe. See if it's in the dictionary.
566 // TODO: this looks iffy; compare with old code.
567 int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
568 utext_setNativeIndex(text, current + cuWordLength + chars);
569 if (num_candidates > 0) {
570 break;
571 }
572 }
573 }
574
575 // Bump the word count if there wasn't already one
576 if (cuWordLength <= 0) {
577 wordsFound += 1;
578 }
579
580 // Update the length with the passed-over characters
581 cuWordLength += chars;
582 }
583 else {
584 // Back up to where we were for next iteration
585 utext_setNativeIndex(text, current + cuWordLength);
586 }
587 }
588
589 // Never stop before a combining mark.
590 int32_t currPos;
591 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
592 utext_next32(text);
593 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
594 }
595
596 // Look ahead for possible suffixes if a dictionary word does not follow.
597 // We do this in code rather than using a rule so that the heuristic
598 // resynch continues to function. For example, one of the suffix characters
599 // could be a typo in the middle of a word.
600 // NOT CURRENTLY APPLICABLE TO LAO
601
602 // Did we find a word on this iteration? If so, push it on the break stack
603 if (cuWordLength > 0) {
604 foundBreaks.push((current+cuWordLength), status);
605 }
606 }
607
608 // Don't return a break for the end of the dictionary range if there is one there.
609 if (foundBreaks.peeki() >= rangeEnd) {
610 (void) foundBreaks.popi();
611 wordsFound -= 1;
612 }
613
614 return wordsFound;
615}
616
617/*
618 ******************************************************************
619 * BurmeseBreakEngine
620 */
621
622// How many words in a row are "good enough"?
623static const int32_t BURMESE_LOOKAHEAD = 3;
624
625// Will not combine a non-word with a preceding dictionary word longer than this
626static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
627
628// Will not combine a non-word that shares at least this much prefix with a
629// dictionary word, with a preceding word
630static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
631
632// Minimum word size
633static const int32_t BURMESE_MIN_WORD = 2;
634
635// Minimum number of characters for two words
636static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
637
638BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
639 : DictionaryBreakEngine(),
640 fDictionary(adoptDictionary)
641{
642 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
643 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
644 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
645 if (U_SUCCESS(status)) {
646 setCharacters(fBurmeseWordSet);
647 }
648 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
649 fMarkSet.add(0x0020);
650 fEndWordSet = fBurmeseWordSet;
651 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
652
653 // Compact for caching.
654 fMarkSet.compact();
655 fEndWordSet.compact();
656 fBeginWordSet.compact();
657 UTRACE_EXIT_STATUS(status);
658}
659
660BurmeseBreakEngine::~BurmeseBreakEngine() {
661 delete fDictionary;
662}
663
664int32_t
665BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
666 int32_t rangeStart,
667 int32_t rangeEnd,
668 UVector32 &foundBreaks ) const {
669 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
670 return 0; // Not enough characters for two words
671 }
672
673 uint32_t wordsFound = 0;
674 int32_t cpWordLength = 0;
675 int32_t cuWordLength = 0;
676 int32_t current;
677 UErrorCode status = U_ZERO_ERROR;
678 PossibleWord words[BURMESE_LOOKAHEAD];
679
680 utext_setNativeIndex(text, rangeStart);
681
682 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
683 cuWordLength = 0;
684 cpWordLength = 0;
685
686 // Look for candidate words at the current position
687 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
688
689 // If we found exactly one, use that
690 if (candidates == 1) {
691 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
692 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
693 wordsFound += 1;
694 }
695 // If there was more than one, see which one can take us forward the most words
696 else if (candidates > 1) {
697 // If we're already at the end of the range, we're done
698 if (utext_getNativeIndex(text) >= rangeEnd) {
699 goto foundBest;
700 }
701 do {
702 int32_t wordsMatched = 1;
703 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
704 if (wordsMatched < 2) {
705 // Followed by another dictionary word; mark first word as a good candidate
706 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
707 wordsMatched = 2;
708 }
709
710 // If we're already at the end of the range, we're done
711 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
712 goto foundBest;
713 }
714
715 // See if any of the possible second words is followed by a third word
716 do {
717 // If we find a third word, stop right away
718 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
719 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
720 goto foundBest;
721 }
722 }
723 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
724 }
725 }
726 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
727foundBest:
728 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
729 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
730 wordsFound += 1;
731 }
732
733 // We come here after having either found a word or not. We look ahead to the
734 // next word. If it's not a dictionary word, we will combine it withe the word we
735 // just found (if there is one), but only if the preceding word does not exceed
736 // the threshold.
737 // The text iterator should now be positioned at the end of the word we found.
738 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
739 // if it is a dictionary word, do nothing. If it isn't, then if there is
740 // no preceding word, or the non-word shares less than the minimum threshold
741 // of characters with a dictionary word, then scan to resynchronize
742 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
743 && (cuWordLength == 0
744 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
745 // Look for a plausible word boundary
746 int32_t remaining = rangeEnd - (current + cuWordLength);
747 UChar32 pc;
748 UChar32 uc;
749 int32_t chars = 0;
750 for (;;) {
751 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
752 pc = utext_next32(text);
753 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
754 chars += pcSize;
755 remaining -= pcSize;
756 if (remaining <= 0) {
757 break;
758 }
759 uc = utext_current32(text);
760 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
761 // Maybe. See if it's in the dictionary.
762 // TODO: this looks iffy; compare with old code.
763 int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
764 utext_setNativeIndex(text, current + cuWordLength + chars);
765 if (num_candidates > 0) {
766 break;
767 }
768 }
769 }
770
771 // Bump the word count if there wasn't already one
772 if (cuWordLength <= 0) {
773 wordsFound += 1;
774 }
775
776 // Update the length with the passed-over characters
777 cuWordLength += chars;
778 }
779 else {
780 // Back up to where we were for next iteration
781 utext_setNativeIndex(text, current + cuWordLength);
782 }
783 }
784
785 // Never stop before a combining mark.
786 int32_t currPos;
787 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
788 utext_next32(text);
789 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
790 }
791
792 // Look ahead for possible suffixes if a dictionary word does not follow.
793 // We do this in code rather than using a rule so that the heuristic
794 // resynch continues to function. For example, one of the suffix characters
795 // could be a typo in the middle of a word.
796 // NOT CURRENTLY APPLICABLE TO BURMESE
797
798 // Did we find a word on this iteration? If so, push it on the break stack
799 if (cuWordLength > 0) {
800 foundBreaks.push((current+cuWordLength), status);
801 }
802 }
803
804 // Don't return a break for the end of the dictionary range if there is one there.
805 if (foundBreaks.peeki() >= rangeEnd) {
806 (void) foundBreaks.popi();
807 wordsFound -= 1;
808 }
809
810 return wordsFound;
811}
812
813/*
814 ******************************************************************
815 * KhmerBreakEngine
816 */
817
818// How many words in a row are "good enough"?
819static const int32_t KHMER_LOOKAHEAD = 3;
820
821// Will not combine a non-word with a preceding dictionary word longer than this
822static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 10;
823
824// Will not combine a non-word that shares at least this much prefix with a
825// dictionary word, with a preceding word
826static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 5;
827
828// Minimum word size
829static const int32_t KHMER_MIN_WORD = 2;
830
831// Minimum number of characters for two words
832static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
833
834KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
835 : DictionaryBreakEngine(),
836 fDictionary(adoptDictionary)
837{
838 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
839 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
840 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
841 if (U_SUCCESS(status)) {
842 setCharacters(fKhmerWordSet);
843 }
844 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
845 fMarkSet.add(0x0020);
846 fEndWordSet = fKhmerWordSet;
847 fBeginWordSet.add(0x1780, 0x17B3);
848 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
849 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
850 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
851 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
852 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
853// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
854// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
855// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
856// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
857// fSuffixSet.add(THAI_PAIYANNOI);
858// fSuffixSet.add(THAI_MAIYAMOK);
859
860 // Compact for caching.
861 fMarkSet.compact();
862 fEndWordSet.compact();
863 fBeginWordSet.compact();
864// fSuffixSet.compact();
865 UTRACE_EXIT_STATUS(status);
866}
867
868KhmerBreakEngine::~KhmerBreakEngine() {
869 delete fDictionary;
870}
871
872int32_t
873KhmerBreakEngine::divideUpDictionaryRange( UText *text,
874 int32_t rangeStart,
875 int32_t rangeEnd,
876 UVector32 &foundBreaks ) const {
877 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
878 return 0; // Not enough characters for two words
879 }
880
881 uint32_t wordsFound = 0;
882 int32_t cpWordLength = 0;
883 int32_t cuWordLength = 0;
884 int32_t current;
885 UErrorCode status = U_ZERO_ERROR;
886 PossibleWord words[KHMER_LOOKAHEAD];
887
888 utext_setNativeIndex(text, rangeStart);
889
890 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
891 cuWordLength = 0;
892 cpWordLength = 0;
893
894 // Look for candidate words at the current position
895 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
896
897 // If we found exactly one, use that
898 if (candidates == 1) {
899 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
900 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
901 wordsFound += 1;
902 }
903
904 // If there was more than one, see which one can take us forward the most words
905 else if (candidates > 1) {
906 // If we're already at the end of the range, we're done
907 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
908 goto foundBest;
909 }
910 do {
911 int32_t wordsMatched = 1;
912 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
913 if (wordsMatched < 2) {
914 // Followed by another dictionary word; mark first word as a good candidate
915 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
916 wordsMatched = 2;
917 }
918
919 // If we're already at the end of the range, we're done
920 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
921 goto foundBest;
922 }
923
924 // See if any of the possible second words is followed by a third word
925 do {
926 // If we find a third word, stop right away
927 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
928 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
929 goto foundBest;
930 }
931 }
932 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
933 }
934 }
935 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
936foundBest:
937 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
938 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
939 wordsFound += 1;
940 }
941
942 // We come here after having either found a word or not. We look ahead to the
943 // next word. If it's not a dictionary word, we will combine it with the word we
944 // just found (if there is one), but only if the preceding word does not exceed
945 // the threshold.
946 // The text iterator should now be positioned at the end of the word we found.
947 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
948 // if it is a dictionary word, do nothing. If it isn't, then if there is
949 // no preceding word, or the non-word shares less than the minimum threshold
950 // of characters with a dictionary word, then scan to resynchronize
951 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
952 && (cuWordLength == 0
953 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
954 // Look for a plausible word boundary
955 int32_t remaining = rangeEnd - (current+cuWordLength);
956 UChar32 pc;
957 UChar32 uc;
958 int32_t chars = 0;
959 for (;;) {
960 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
961 pc = utext_next32(text);
962 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
963 chars += pcSize;
964 remaining -= pcSize;
965 if (remaining <= 0) {
966 break;
967 }
968 uc = utext_current32(text);
969 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
970 // Maybe. See if it's in the dictionary.
971 int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
972 utext_setNativeIndex(text, current+cuWordLength+chars);
973 if (num_candidates > 0) {
974 break;
975 }
976 }
977 }
978
979 // Bump the word count if there wasn't already one
980 if (cuWordLength <= 0) {
981 wordsFound += 1;
982 }
983
984 // Update the length with the passed-over characters
985 cuWordLength += chars;
986 }
987 else {
988 // Back up to where we were for next iteration
989 utext_setNativeIndex(text, current+cuWordLength);
990 }
991 }
992
993 // Never stop before a combining mark.
994 int32_t currPos;
995 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
996 utext_next32(text);
997 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
998 }
999
1000 // Look ahead for possible suffixes if a dictionary word does not follow.
1001 // We do this in code rather than using a rule so that the heuristic
1002 // resynch continues to function. For example, one of the suffix characters
1003 // could be a typo in the middle of a word.
1004// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1005// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1006// && fSuffixSet.contains(uc = utext_current32(text))) {
1007// if (uc == KHMER_PAIYANNOI) {
1008// if (!fSuffixSet.contains(utext_previous32(text))) {
1009// // Skip over previous end and PAIYANNOI
1010// utext_next32(text);
1011// utext_next32(text);
1012// wordLength += 1; // Add PAIYANNOI to word
1013// uc = utext_current32(text); // Fetch next character
1014// }
1015// else {
1016// // Restore prior position
1017// utext_next32(text);
1018// }
1019// }
1020// if (uc == KHMER_MAIYAMOK) {
1021// if (utext_previous32(text) != KHMER_MAIYAMOK) {
1022// // Skip over previous end and MAIYAMOK
1023// utext_next32(text);
1024// utext_next32(text);
1025// wordLength += 1; // Add MAIYAMOK to word
1026// }
1027// else {
1028// // Restore prior position
1029// utext_next32(text);
1030// }
1031// }
1032// }
1033// else {
1034// utext_setNativeIndex(text, current+wordLength);
1035// }
1036// }
1037
1038 // Did we find a word on this iteration? If so, push it on the break stack
1039 if (cuWordLength > 0) {
1040 foundBreaks.push((current+cuWordLength), status);
1041 }
1042 }
1043
1044 // Don't return a break for the end of the dictionary range if there is one there.
1045 if (foundBreaks.peeki() >= rangeEnd) {
1046 (void) foundBreaks.popi();
1047 wordsFound -= 1;
1048 }
1049
1050 return wordsFound;
1051}
1052
1053#if !UCONFIG_NO_NORMALIZATION
1054/*
1055 ******************************************************************
1056 * CjkBreakEngine
1057 */
1058static const uint32_t kuint32max = 0xFFFFFFFF;
1059CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1060: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1061 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1062 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1063 // Korean dictionary only includes Hangul syllables
1064 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1065 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1066 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1067 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1068 nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1069
1070 if (U_SUCCESS(status)) {
1071 // handle Korean and Japanese/Chinese using different dictionaries
1072 if (type == kKorean) {
1073 setCharacters(fHangulWordSet);
1074 } else { //Chinese and Japanese
1075 UnicodeSet cjSet;
1076 cjSet.addAll(fHanWordSet);
1077 cjSet.addAll(fKatakanaWordSet);
1078 cjSet.addAll(fHiraganaWordSet);
1079 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1080 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1081 setCharacters(cjSet);
1082 }
1083 }
1084 UTRACE_EXIT_STATUS(status);
1085}
1086
1087CjkBreakEngine::~CjkBreakEngine(){
1088 delete fDictionary;
1089}
1090
1091// The katakanaCost values below are based on the length frequencies of all
1092// katakana phrases in the dictionary
1093static const int32_t kMaxKatakanaLength = 8;
1094static const int32_t kMaxKatakanaGroupLength = 20;
1095static const uint32_t maxSnlp = 255;
1096
1097static inline uint32_t getKatakanaCost(int32_t wordLength){
1098 //TODO: fill array with actual values from dictionary!
1099 static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1100 = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1101 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1102}
1103
1104static inline bool isKatakana(UChar32 value) {
1105 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1106 (value >= 0xFF66 && value <= 0xFF9f);
1107}
1108
1109
1110// Function for accessing internal utext flags.
1111// Replicates an internal UText function.
1112
1113static inline int32_t utext_i32_flag(int32_t bitIndex) {
1114 return (int32_t)1 << bitIndex;
1115}
1116
1117
1118/*
1119 * @param text A UText representing the text
1120 * @param rangeStart The start of the range of dictionary characters
1121 * @param rangeEnd The end of the range of dictionary characters
1122 * @param foundBreaks vector<int32> to receive the break positions
1123 * @return The number of breaks found
1124 */
1125int32_t
1126CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1127 int32_t rangeStart,
1128 int32_t rangeEnd,
1129 UVector32 &foundBreaks ) const {
1130 if (rangeStart >= rangeEnd) {
1131 return 0;
1132 }
1133
1134 // UnicodeString version of input UText, NFKC normalized if necessary.
1135 UnicodeString inString;
1136
1137 // inputMap[inStringIndex] = corresponding native index from UText inText.
1138 // If NULL then mapping is 1:1
1139 LocalPointer<UVector32> inputMap;
1140
1141 UErrorCode status = U_ZERO_ERROR;
1142
1143
1144 // if UText has the input string as one contiguous UTF-16 chunk
1145 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1146 inText->chunkNativeStart <= rangeStart &&
1147 inText->chunkNativeLimit >= rangeEnd &&
1148 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1149
1150 // Input UText is in one contiguous UTF-16 chunk.
1151 // Use Read-only aliasing UnicodeString.
1152 inString.setTo(FALSE,
1153 inText->chunkContents + rangeStart - inText->chunkNativeStart,
1154 rangeEnd - rangeStart);
1155 } else {
1156 // Copy the text from the original inText (UText) to inString (UnicodeString).
1157 // Create a map from UnicodeString indices -> UText offsets.
1158 utext_setNativeIndex(inText, rangeStart);
1159 int32_t limit = rangeEnd;
1160 U_ASSERT(limit <= utext_nativeLength(inText));
1161 if (limit > utext_nativeLength(inText)) {
1162 limit = (int32_t)utext_nativeLength(inText);
1163 }
1164 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1165 if (U_FAILURE(status)) {
1166 return 0;
1167 }
1168 while (utext_getNativeIndex(inText) < limit) {
1169 int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1170 UChar32 c = utext_next32(inText);
1171 U_ASSERT(c != U_SENTINEL);
1172 inString.append(c);
1173 while (inputMap->size() < inString.length()) {
1174 inputMap->addElement(nativePosition, status);
1175 }
1176 }
1177 inputMap->addElement(limit, status);
1178 }
1179
1180
1181 if (!nfkcNorm2->isNormalized(inString, status)) {
1182 UnicodeString normalizedInput;
1183 // normalizedMap[normalizedInput position] == original UText position.
1184 LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1185 if (U_FAILURE(status)) {
1186 return 0;
1187 }
1188
1189 UnicodeString fragment;
1190 UnicodeString normalizedFragment;
1191 for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
1192 fragment.remove();
1193 int32_t fragmentStartI = srcI;
1194 UChar32 c = inString.char32At(srcI);
1195 for (;;) {
1196 fragment.append(c);
1197 srcI = inString.moveIndex32(srcI, 1);
1198 if (srcI == inString.length()) {
1199 break;
1200 }
1201 c = inString.char32At(srcI);
1202 if (nfkcNorm2->hasBoundaryBefore(c)) {
1203 break;
1204 }
1205 }
1206 nfkcNorm2->normalize(fragment, normalizedFragment, status);
1207 normalizedInput.append(normalizedFragment);
1208
1209 // Map every position in the normalized chunk to the start of the chunk
1210 // in the original input.
1211 int32_t fragmentOriginalStart = inputMap.isValid() ?
1212 inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1213 while (normalizedMap->size() < normalizedInput.length()) {
1214 normalizedMap->addElement(fragmentOriginalStart, status);
1215 if (U_FAILURE(status)) {
1216 break;
1217 }
1218 }
1219 }
1220 U_ASSERT(normalizedMap->size() == normalizedInput.length());
1221 int32_t nativeEnd = inputMap.isValid() ?
1222 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1223 normalizedMap->addElement(nativeEnd, status);
1224
1225 inputMap = std::move(normalizedMap);
1226 inString = std::move(normalizedInput);
1227 }
1228
1229 int32_t numCodePts = inString.countChar32();
1230 if (numCodePts != inString.length()) {
1231 // There are supplementary characters in the input.
1232 // The dictionary will produce boundary positions in terms of code point indexes,
1233 // not in terms of code unit string indexes.
1234 // Use the inputMap mechanism to take care of this in addition to indexing differences
1235 // from normalization and/or UTF-8 input.
1236 UBool hadExistingMap = inputMap.isValid();
1237 if (!hadExistingMap) {
1238 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1239 if (U_FAILURE(status)) {
1240 return 0;
1241 }
1242 }
1243 int32_t cpIdx = 0;
1244 for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1245 U_ASSERT(cuIdx >= cpIdx);
1246 if (hadExistingMap) {
1247 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1248 } else {
1249 inputMap->addElement(cuIdx+rangeStart, status);
1250 }
1251 cpIdx++;
1252 if (cuIdx == inString.length()) {
1253 break;
1254 }
1255 }
1256 }
1257
1258 // bestSnlp[i] is the snlp of the best segmentation of the first i
1259 // code points in the range to be matched.
1260 UVector32 bestSnlp(numCodePts + 1, status);
1261 bestSnlp.addElement(0, status);
1262 for(int32_t i = 1; i <= numCodePts; i++) {
1263 bestSnlp.addElement(kuint32max, status);
1264 }
1265
1266
1267 // prev[i] is the index of the last CJK code point in the previous word in
1268 // the best segmentation of the first i characters.
1269 UVector32 prev(numCodePts + 1, status);
1270 for(int32_t i = 0; i <= numCodePts; i++){
1271 prev.addElement(-1, status);
1272 }
1273
1274 const int32_t maxWordSize = 20;
1275 UVector32 values(numCodePts, status);
1276 values.setSize(numCodePts);
1277 UVector32 lengths(numCodePts, status);
1278 lengths.setSize(numCodePts);
1279
1280 UText fu = UTEXT_INITIALIZER;
1281 utext_openUnicodeString(&fu, &inString, &status);
1282
1283 // Dynamic programming to find the best segmentation.
1284
1285 // In outer loop, i is the code point index,
1286 // ix is the corresponding string (code unit) index.
1287 // They differ when the string contains supplementary characters.
1288 int32_t ix = 0;
1289 bool is_prev_katakana = false;
1290 for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
1291 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1292 continue;
1293 }
1294
1295 int32_t count;
1296 utext_setNativeIndex(&fu, ix);
1297 count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1298 NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1299 // Note: lengths is filled with code point lengths
1300 // The NULL parameter is the ignored code unit lengths.
1301
1302 // if there are no single character matches found in the dictionary
1303 // starting with this character, treat character as a 1-character word
1304 // with the highest value possible, i.e. the least likely to occur.
1305 // Exclude Korean characters from this treatment, as they should be left
1306 // together by default.
1307 if ((count == 0 || lengths.elementAti(0) != 1) &&
1308 !fHangulWordSet.contains(inString.char32At(ix))) {
1309 values.setElementAt(maxSnlp, count); // 255
1310 lengths.setElementAt(1, count++);
1311 }
1312
1313 for (int32_t j = 0; j < count; j++) {
1314 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1315 int32_t ln_j_i = lengths.elementAti(j) + i;
1316 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1317 bestSnlp.setElementAt(newSnlp, ln_j_i);
1318 prev.setElementAt(i, ln_j_i);
1319 }
1320 }
1321
1322 // In Japanese,
1323 // Katakana word in single character is pretty rare. So we apply
1324 // the following heuristic to Katakana: any continuous run of Katakana
1325 // characters is considered a candidate word with a default cost
1326 // specified in the katakanaCost table according to its length.
1327
1328 bool is_katakana = isKatakana(inString.char32At(ix));
1329 int32_t katakanaRunLength = 1;
1330 if (!is_prev_katakana && is_katakana) {
1331 int32_t j = inString.moveIndex32(ix, 1);
1332 // Find the end of the continuous run of Katakana characters
1333 while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1334 isKatakana(inString.char32At(j))) {
1335 j = inString.moveIndex32(j, 1);
1336 katakanaRunLength++;
1337 }
1338 if (katakanaRunLength < kMaxKatakanaGroupLength) {
1339 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1340 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1341 bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1342 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1343 }
1344 }
1345 }
1346 is_prev_katakana = is_katakana;
1347 }
1348 utext_close(&fu);
1349
1350 // Start pushing the optimal offset index into t_boundary (t for tentative).
1351 // prev[numCodePts] is guaranteed to be meaningful.
1352 // We'll first push in the reverse order, i.e.,
1353 // t_boundary[0] = numCodePts, and afterwards do a swap.
1354 UVector32 t_boundary(numCodePts+1, status);
1355
1356 int32_t numBreaks = 0;
1357 // No segmentation found, set boundary to end of range
1358 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1359 t_boundary.addElement(numCodePts, status);
1360 numBreaks++;
1361 } else {
1362 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1363 t_boundary.addElement(i, status);
1364 numBreaks++;
1365 }
1366 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1367 }
1368
1369 // Add a break for the start of the dictionary range if there is not one
1370 // there already.
1371 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1372 t_boundary.addElement(0, status);
1373 numBreaks++;
1374 }
1375
1376 // Now that we're done, convert positions in t_boundary[] (indices in
1377 // the normalized input string) back to indices in the original input UText
1378 // while reversing t_boundary and pushing values to foundBreaks.
1379 int32_t prevCPPos = -1;
1380 int32_t prevUTextPos = -1;
1381 for (int32_t i = numBreaks-1; i >= 0; i--) {
1382 int32_t cpPos = t_boundary.elementAti(i);
1383 U_ASSERT(cpPos > prevCPPos);
1384 int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1385 U_ASSERT(utextPos >= prevUTextPos);
1386 if (utextPos > prevUTextPos) {
1387 // Boundaries are added to foundBreaks output in ascending order.
1388 U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1389 foundBreaks.push(utextPos, status);
1390 } else {
1391 // Normalization expanded the input text, the dictionary found a boundary
1392 // within the expansion, giving two boundaries with the same index in the
1393 // original text. Ignore the second. See ticket #12918.
1394 --numBreaks;
1395 }
1396 prevCPPos = cpPos;
1397 prevUTextPos = utextPos;
1398 }
1399 (void)prevCPPos; // suppress compiler warnings about unused variable
1400
1401 // inString goes out of scope
1402 // inputMap goes out of scope
1403 return numBreaks;
1404}
1405#endif
1406
1407U_NAMESPACE_END
1408
1409#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1410
1411