1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2014-2015, International Business Machines Corporation and
6* others. All Rights Reserved.
7*******************************************************************************
8*/
9
10#include "unicode/utypes.h"
11#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
12
13#include "cmemory.h"
14
15#include "unicode/filteredbrk.h"
16#include "unicode/ucharstriebuilder.h"
17#include "unicode/ures.h"
18
19#include "uresimp.h" // ures_getByKeyWithFallback
20#include "ubrkimpl.h" // U_ICUDATA_BRKITR
21#include "uvector.h"
22#include "cmemory.h"
23
24U_NAMESPACE_BEGIN
25
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only trace helper behind the FB_TRACE macro; writes one line to stderr.
 * @param m message text
 * @param s string to print, or NULL (printed as "NULL")
 * @param b flag, printed as 'T' or 'F'
 * @param d integer detail value
 * @param f source file name of the call site
 * @param l source line of the call site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
  char buf[2048];
  const char *text = "NULL";
  if(s) {
    // Reserve one byte so that the buffer can always be NUL-terminated, even when
    // the converted string would not fit (extract() only terminates if there is room).
    const int32_t capacity = (int32_t)sizeof(buf) - 1;
    int32_t len = s->extract(0, s->length(), buf, (uint32_t)capacity);
    if(len < 0) {
      len = 0;
    } else if(len > capacity) {
      len = capacity; // output was truncated
    }
    buf[len] = 0;
    text = buf;
  }
  fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
          f, l, m, text, (const void*)s, b?'T':'F',(int)d);
}

#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
// Tracing disabled: FB_TRACE expands to nothing.
#define FB_TRACE(m,s,b,d)
#endif
47
48/**
49 * Used with sortedInsert()
50 */
51static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
52 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
53 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
54 return a.compare(b);
55}
56
57/**
58 * A UVector which implements a set of strings.
59 */
60class U_COMMON_API UStringSet : public UVector {
61 public:
62 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
63 uhash_compareUnicodeString,
64 1,
65 status) {}
66 virtual ~UStringSet();
67 /**
68 * Is this UnicodeSet contained?
69 */
70 inline UBool contains(const UnicodeString& s) {
71 return contains((void*) &s);
72 }
73 using UVector::contains;
74 /**
75 * Return the ith UnicodeString alias
76 */
77 inline const UnicodeString* getStringAt(int32_t i) const {
78 return (const UnicodeString*)elementAt(i);
79 }
80 /**
81 * Adopt the UnicodeString if not already contained.
82 * Caller no longer owns the pointer in any case.
83 * @return true if adopted successfully, false otherwise (error, or else duplicate)
84 */
85 inline UBool adopt(UnicodeString *str, UErrorCode &status) {
86 if(U_FAILURE(status) || contains(*str)) {
87 delete str;
88 return false;
89 } else {
90 sortedInsert(str, compareUnicodeString, status);
91 if(U_FAILURE(status)) {
92 delete str;
93 return false;
94 }
95 return true;
96 }
97 }
98 /**
99 * Add by value.
100 * @return true if successfully adopted.
101 */
102 inline UBool add(const UnicodeString& str, UErrorCode &status) {
103 if(U_FAILURE(status)) return false;
104 UnicodeString *t = new UnicodeString(str);
105 if(t==NULL) {
106 status = U_MEMORY_ALLOCATION_ERROR; return false;
107 }
108 return adopt(t, status);
109 }
110 /**
111 * Remove this string.
112 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
113 */
114 inline UBool remove(const UnicodeString &s, UErrorCode &status) {
115 if(U_FAILURE(status)) return false;
116 return removeElement((void*) &s);
117 }
118};
119
120/**
121 * Virtual, won't be inlined
122 */
123UStringSet::~UStringSet() {}
124
125/* ----------------------------------------------------------- */
126
127
/* Filtered Break constants */

// Values stored in the tries (see the add() calls in build()).
static const int32_t kPARTIAL = 1 << 0; //< partial - need to run through forward trie
static const int32_t kMATCH   = 1 << 1; //< exact match - skip this one.

// Per-string flags kept in the 'partials' array while build() runs.
static const int32_t kSuppressInReverse = 1 << 0;
static const int32_t kAddToForward      = 1 << 1;
133static const UChar kFULLSTOP = 0x002E; // '.'
134
135/**
136 * Shared data for SimpleFilteredSentenceBreakIterator
137 */
138class SimpleFilteredSentenceBreakData : public UMemory {
139public:
140 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
141 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
142 SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
143 SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
144 virtual ~SimpleFilteredSentenceBreakData();
145
146 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
147 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
148 int32_t refcount;
149};
150
151SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
152
153/**
154 * Concrete implementation
155 */
156class SimpleFilteredSentenceBreakIterator : public BreakIterator {
157public:
158 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
159 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
160 virtual ~SimpleFilteredSentenceBreakIterator();
161private:
162 SimpleFilteredSentenceBreakData *fData;
163 LocalPointer<BreakIterator> fDelegate;
164 LocalUTextPointer fText;
165
166 /* -- subclass interface -- */
167public:
168 /* -- cloning and other subclass stuff -- */
169 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
170 int32_t &/*BufferSize*/,
171 UErrorCode &status) {
172 // for now - always deep clone
173 status = U_SAFECLONE_ALLOCATED_WARNING;
174 return clone();
175 }
176 virtual SimpleFilteredSentenceBreakIterator* clone() const { return new SimpleFilteredSentenceBreakIterator(*this); }
177 virtual UClassID getDynamicClassID(void) const { return NULL; }
178 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
179
180 /* -- text modifying -- */
181 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
182 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
183 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
184 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
185
186 /* -- other functions that are just delegated -- */
187 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
188 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
189
190 /* -- ITERATION -- */
191 virtual int32_t first(void);
192 virtual int32_t preceding(int32_t offset);
193 virtual int32_t previous(void);
194 virtual UBool isBoundary(int32_t offset);
195 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
196
197 virtual int32_t next(void);
198
199 virtual int32_t next(int32_t n);
200 virtual int32_t following(int32_t offset);
201 virtual int32_t last(void);
202
203private:
204 /**
205 * Given that the fDelegate has already given its "initial" answer,
206 * find the NEXT actual (non-excepted) break.
207 * @param n initial position from delegate
208 * @return new break position or UBRK_DONE
209 */
210 int32_t internalNext(int32_t n);
211 /**
212 * Given that the fDelegate has already given its "initial" answer,
213 * find the PREV actual (non-excepted) break.
214 * @param n initial position from delegate
215 * @return new break position or UBRK_DONE
216 */
217 int32_t internalPrev(int32_t n);
218 /**
219 * set up the UText with the value of the fDelegate.
220 * Call this before calling breakExceptionAt.
221 * May be able to avoid excess calls
222 */
223 void resetState(UErrorCode &status);
224 /**
225 * Is there a match (exception) at this spot?
226 */
227 enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
228 /**
229 * Determine if there is an exception at this spot
230 * @param n spot to check
231 * @return kNoExceptionHere or kExceptionHere
232 **/
233 enum EFBMatchResult breakExceptionAt(int32_t n);
234};
235
236SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
237 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
238{
239}
240
241
242SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
243 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
244 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
245 fDelegate(adopt)
246{
247 // all set..
248}
249
250SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
251 fData = fData->decr();
252}
253
254void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
255 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
256}
257
258SimpleFilteredSentenceBreakIterator::EFBMatchResult
259SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
260 int64_t bestPosn = -1;
261 int32_t bestValue = -1;
262 // loops while 'n' points to an exception.
263 utext_setNativeIndex(fText.getAlias(), n); // from n..
264 fData->fBackwardsTrie->reset();
265 UChar32 uch;
266
267 //if(debug2) u_printf(" n@ %d\n", n);
268 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
269 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
270 // TODO only do this the 1st time?
271 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
272 } else {
273 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
274 uch = utext_next32(fText.getAlias());
275 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
276 }
277
278 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
279
280 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
281 USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
282 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
283 bestPosn = utext_getNativeIndex(fText.getAlias());
284 bestValue = fData->fBackwardsTrie->getValue();
285 }
286 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
287 }
288
289 if(USTRINGTRIE_MATCHES(r)) { // exact match?
290 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
291 bestValue = fData->fBackwardsTrie->getValue();
292 bestPosn = utext_getNativeIndex(fText.getAlias());
293 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
294 }
295
296 if(bestPosn>=0) {
297 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
298
299 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
300 //int32_t bestValue = fBackwardsTrie->getValue();
301 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
302
303 if(bestValue == kMATCH) { // exact match!
304 //if(debug2) u_printf(" exact backward match\n");
305 return kExceptionHere; // See if the next is another exception.
306 } else if(bestValue == kPARTIAL
307 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
308 //if(debug2) u_printf(" partial backward match\n");
309 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
310 // to see if it matches something going forward.
311 fData->fForwardsPartialTrie->reset();
312 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
313 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
314 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
315 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
316 USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
317 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
318 }
319 if(USTRINGTRIE_MATCHES(rfwd)) {
320 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
321 // only full matches here, nothing to check
322 // skip the next:
323 return kExceptionHere;
324 } else {
325 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
326 // no match (no exception) -return the 'underlying' break
327 return kNoExceptionHere;
328 }
329 } else {
330 return kNoExceptionHere; // internal error and/or no forwards trie
331 }
332 } else {
333 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
334 return kNoExceptionHere; // No match - so exit. Not an exception.
335 }
336}
337
338// the workhorse single next.
339int32_t
340SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
341 if(n == UBRK_DONE || // at end or
342 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
343 return n;
344 }
345 // OK, do we need to break here?
346 UErrorCode status = U_ZERO_ERROR;
347 // refresh text
348 resetState(status);
349 if(U_FAILURE(status)) return UBRK_DONE; // bail out
350 int64_t utextLen = utext_nativeLength(fText.getAlias());
351
352 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
353 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
354 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
355
356 switch(m) {
357 case kExceptionHere:
358 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
359 continue;
360
361 default:
362 case kNoExceptionHere:
363 return n;
364 }
365 }
366 return n;
367}
368
369int32_t
370SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
371 if(n == 0 || n == UBRK_DONE || // at end or
372 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
373 return n;
374 }
375 // OK, do we need to break here?
376 UErrorCode status = U_ZERO_ERROR;
377 // refresh text
378 resetState(status);
379 if(U_FAILURE(status)) return UBRK_DONE; // bail out
380
381 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
382 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
383 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
384
385 switch(m) {
386 case kExceptionHere:
387 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
388 continue;
389
390 default:
391 case kNoExceptionHere:
392 return n;
393 }
394 }
395 return n;
396}
397
398
399int32_t
400SimpleFilteredSentenceBreakIterator::next() {
401 return internalNext(fDelegate->next());
402}
403
404int32_t
405SimpleFilteredSentenceBreakIterator::first(void) {
406 // Don't suppress a break opportunity at the beginning of text.
407 return fDelegate->first();
408}
409
410int32_t
411SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
412 return internalPrev(fDelegate->preceding(offset));
413}
414
415int32_t
416SimpleFilteredSentenceBreakIterator::previous(void) {
417 return internalPrev(fDelegate->previous());
418}
419
420UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
421 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
422
423 if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
424
425 UErrorCode status = U_ZERO_ERROR;
426 resetState(status);
427
428 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
429
430 switch(m) {
431 case kExceptionHere:
432 return false;
433 default:
434 case kNoExceptionHere:
435 return true;
436 }
437}
438
439int32_t
440SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
441 return internalNext(fDelegate->next(offset));
442}
443
444int32_t
445SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
446 return internalNext(fDelegate->following(offset));
447}
448
449int32_t
450SimpleFilteredSentenceBreakIterator::last(void) {
451 // Don't suppress a break opportunity at the end of text.
452 return fDelegate->last();
453}
454
455
456/**
457 * Concrete implementation of builder class.
458 */
459class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
460public:
461 virtual ~SimpleFilteredBreakIteratorBuilder();
462 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
463 SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
464 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
465 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
466 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
467private:
468 UStringSet fSet;
469};
470
471SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
472{
473}
474
475SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
476 : fSet(status)
477{
478}
479
480SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
481 : fSet(status)
482{
483 if(U_SUCCESS(status)) {
484 UErrorCode subStatus = U_ZERO_ERROR;
485 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
486 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
487 status = subStatus; // copy the failing status
488#if FB_DEBUG
489 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
490#endif
491 return; // leaves the builder empty, if you try to use it.
492 }
493 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus));
494 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
495 status = subStatus; // copy the failing status
496#if FB_DEBUG
497 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
498#endif
499 return; // leaves the builder empty, if you try to use it.
500 }
501 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus));
502
503#if FB_DEBUG
504 {
505 UErrorCode subsub = subStatus;
506 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
507 }
508#endif
509
510 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
511 status = subStatus; // copy the failing status
512#if FB_DEBUG
513 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
514#endif
515 return; // leaves the builder empty, if you try to use it.
516 }
517
518 LocalUResourceBundlePointer strs;
519 subStatus = status; // Pick up inherited warning status now
520 do {
521 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
522 if(strs.isValid() && U_SUCCESS(subStatus)) {
523 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
524 suppressBreakAfter(str, status); // load the string
525 }
526 } while (strs.isValid() && U_SUCCESS(subStatus));
527 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
528 status = subStatus;
529 }
530 }
531}
532
533UBool
534SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
535{
536 UBool r = fSet.add(exception, status);
537 FB_TRACE("suppressBreakAfter",&exception,r,0);
538 return r;
539}
540
541UBool
542SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
543{
544 UBool r = fSet.remove(exception, status);
545 FB_TRACE("unsuppressBreakAfter",&exception,r,0);
546 return r;
547}
548
549/**
550 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
551 * Work around this.
552 *
553 * Note: "new UnicodeString[subCount]" ends up calling global operator new
554 * on MSVC2012 for some reason.
555 */
556static inline UnicodeString* newUnicodeStringArray(size_t count) {
557 return new UnicodeString[count ? count : 1];
558}
559
560BreakIterator *
561SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
562 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
563
564 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
565 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
566 if(U_FAILURE(status)) {
567 return NULL;
568 }
569
570 int32_t revCount = 0;
571 int32_t fwdCount = 0;
572
573 int32_t subCount = fSet.size();
574
575 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
576
577 LocalArray<UnicodeString> ustrs(ustrs_ptr);
578
579 LocalMemory<int> partials;
580 partials.allocateInsteadAndReset(subCount);
581
582 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
583 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
584
585 int n=0;
586 for ( int32_t i = 0;
587 i<fSet.size();
588 i++) {
589 const UnicodeString *abbr = fSet.getStringAt(i);
590 if(abbr) {
591 FB_TRACE("build",abbr,TRUE,i);
592 ustrs[n] = *abbr; // copy by value
593 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
594 } else {
595 FB_TRACE("build",abbr,FALSE,i);
596 status = U_MEMORY_ALLOCATION_ERROR;
597 return NULL;
598 }
599 partials[n] = 0; // default: not partial
600 n++;
601 }
602 // first pass - find partials.
603 for(int i=0;i<subCount;i++) {
604 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
605 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
606 FB_TRACE("partial",&ustrs[i],FALSE,i);
607 // is partial.
608 // is it unique?
609 int sameAs = -1;
610 for(int j=0;j<subCount;j++) {
611 if(j==i) continue;
612 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
613 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
614 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
615 if(partials[j]==0) { // hasn't been processed yet
616 partials[j] = kSuppressInReverse | kAddToForward;
617 FB_TRACE("suppressing",&ustrs[j],FALSE,j);
618 } else if(partials[j] & kSuppressInReverse) {
619 sameAs = j; // the other entry is already in the reverse table.
620 }
621 }
622 }
623 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
624 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
625 UnicodeString prefix(ustrs[i], 0, nn+1);
626 if(sameAs == -1 && partials[i] == 0) {
627 // first one - add the prefix to the reverse table.
628 prefix.reverse();
629 builder->add(prefix, kPARTIAL, status);
630 revCount++;
631 FB_TRACE("Added partial",&prefix,FALSE, i);
632 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
633 partials[i] = kSuppressInReverse | kAddToForward;
634 } else {
635 FB_TRACE("NOT adding partial",&prefix,FALSE, i);
636 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
637 }
638 }
639 }
640 for(int i=0;i<subCount;i++) {
641 if(partials[i]==0) {
642 ustrs[i].reverse();
643 builder->add(ustrs[i], kMATCH, status);
644 revCount++;
645 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
646 } else {
647 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
648
649 // an optimization would be to only add the portion after the '.'
650 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
651 // instead of "Ph.D." since we already know the "Ph." part is a match.
652 // would need the trie to be able to hold 0-length strings, though.
653 builder2->add(ustrs[i], kMATCH, status); // forward
654 fwdCount++;
655 //ustrs[i].reverse();
656 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
657 }
658 }
659 FB_TRACE("AbbrCount",NULL,FALSE, subCount);
660
661 if(revCount>0) {
662 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
663 if(U_FAILURE(status)) {
664 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
665 return NULL;
666 }
667 }
668
669 if(fwdCount>0) {
670 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
671 if(U_FAILURE(status)) {
672 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
673 return NULL;
674 }
675 }
676
677 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
678}
679
680
681// ----------- Base class implementation
682
683FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
684}
685
686FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
687}
688
689FilteredBreakIteratorBuilder *
690FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
691 if(U_FAILURE(status)) return NULL;
692 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
693 return (U_SUCCESS(status))? ret.orphan(): NULL;
694}
695
696FilteredBreakIteratorBuilder *
697FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
698 return createEmptyInstance(status);
699}
700
701FilteredBreakIteratorBuilder *
702FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
703 if(U_FAILURE(status)) return NULL;
704 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
705 return (U_SUCCESS(status))? ret.orphan(): NULL;
706}
707
708U_NAMESPACE_END
709
710#endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
711