1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2014-2015, International Business Machines Corporation and
6* others. All Rights Reserved.
7*******************************************************************************
8*/
9
10#include "unicode/utypes.h"
11#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
12
13#include "cmemory.h"
14
15#include "unicode/filteredbrk.h"
16#include "unicode/ucharstriebuilder.h"
17#include "unicode/ures.h"
18
19#include "uresimp.h" // ures_getByKeyWithFallback
20#include "ubrkimpl.h" // U_ICUDATA_BRKITR
21#include "uvector.h"
22#include "cmemory.h"
23#include "umutex.h"
24
25U_NAMESPACE_BEGIN
26
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only trace helper: prints one line to stderr describing a string and
 * two scalar values, prefixed with the source location of the call.
 *
 * @param m message text
 * @param s string to print, or nullptr (printed as "nullptr")
 * @param b boolean flag, printed as 'T' or 'F'
 * @param d integer value to print
 * @param f source file name of the call site
 * @param l source line of the call site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
  char buf[2048];
  if(s) {
    // extract() only NUL-terminates when the converted text fits, so keep the
    // last byte in reserve and terminate explicitly before handing buf to "%s".
    s->extract(0, s->length(), buf, static_cast<uint32_t>(sizeof(buf) - 1));
    buf[sizeof(buf) - 1] = 0;
  } else {
    strcpy(buf,"nullptr");
  }
  fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
          f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
}

// Trace macro: forwards to _fb_trace() with the call site's file and line.
#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
// Tracing disabled: FB_TRACE expands to nothing.
#define FB_TRACE(m,s,b,d)
#endif
48
49/**
50 * Used with sortedInsert()
51 */
52static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
53 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
54 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
55 return a.compare(b);
56}
57
58/**
59 * A UVector which implements a set of strings.
60 */
61class UStringSet : public UVector {
62 public:
63 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
64 uhash_compareUnicodeString,
65 1,
66 status) {}
67 virtual ~UStringSet();
68 /**
69 * Is this UnicodeSet contained?
70 */
71 inline UBool contains(const UnicodeString& s) {
72 return contains((void*) &s);
73 }
74 using UVector::contains;
75 /**
76 * Return the ith UnicodeString alias
77 */
78 inline const UnicodeString* getStringAt(int32_t i) const {
79 return (const UnicodeString*)elementAt(i);
80 }
81 /**
82 * Adopt the UnicodeString if not already contained.
83 * Caller no longer owns the pointer in any case.
84 * @return true if adopted successfully, false otherwise (error, or else duplicate)
85 */
86 inline UBool adopt(UnicodeString *str, UErrorCode &status) {
87 if(U_FAILURE(status) || contains(*str)) {
88 delete str;
89 return false;
90 } else {
91 sortedInsert(str, compareUnicodeString, status);
92 if(U_FAILURE(status)) {
93 return false;
94 }
95 return true;
96 }
97 }
98 /**
99 * Add by value.
100 * @return true if successfully adopted.
101 */
102 inline UBool add(const UnicodeString& str, UErrorCode &status) {
103 if(U_FAILURE(status)) return false;
104 UnicodeString *t = new UnicodeString(str);
105 if(t==nullptr) {
106 status = U_MEMORY_ALLOCATION_ERROR; return false;
107 }
108 return adopt(t, status);
109 }
110 /**
111 * Remove this string.
112 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
113 */
114 inline UBool remove(const UnicodeString &s, UErrorCode &status) {
115 if(U_FAILURE(status)) return false;
116 return removeElement((void*) &s);
117 }
118};
119
120/**
121 * Virtual, won't be inlined
122 */
123UStringSet::~UStringSet() {}
124
125/* ----------------------------------------------------------- */
126
127
/* Filtered Break constants */
// Values stored in the tries.
static constexpr int32_t kPARTIAL = 0x1;  //< partial - need to run through forward trie
static constexpr int32_t kMATCH = 0x2;    //< exact match - skip this one.
// Per-string flags used while building the tries.
static constexpr int32_t kSuppressInReverse = 0x1;
static constexpr int32_t kAddToForward = 0x2;
static constexpr char16_t kFULLSTOP = u'.';  // U+002E
134
135/**
136 * Shared data for SimpleFilteredSentenceBreakIterator
137 */
138class SimpleFilteredSentenceBreakData : public UMemory {
139public:
140 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
141 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
142 SimpleFilteredSentenceBreakData *incr() {
143 umtx_atomic_inc(&refcount);
144 return this;
145 }
146 SimpleFilteredSentenceBreakData *decr() {
147 if(umtx_atomic_dec(&refcount) <= 0) {
148 delete this;
149 }
150 return 0;
151 }
152 virtual ~SimpleFilteredSentenceBreakData();
153
154 bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
155 bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
156
157 const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
158 const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
159
160private:
161 // These tries own their data arrays.
162 // They are shared and must therefore not be modified.
163 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
164 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
165 u_atomic_int32_t refcount;
166};
167
168SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
169
170/**
171 * Concrete implementation
172 */
173class SimpleFilteredSentenceBreakIterator : public BreakIterator {
174public:
175 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
176 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
177 virtual ~SimpleFilteredSentenceBreakIterator();
178private:
179 SimpleFilteredSentenceBreakData *fData;
180 LocalPointer<BreakIterator> fDelegate;
181 LocalUTextPointer fText;
182
183 /* -- subclass interface -- */
184public:
185 /* -- cloning and other subclass stuff -- */
186 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
187 int32_t &/*BufferSize*/,
188 UErrorCode &status) override {
189 // for now - always deep clone
190 status = U_SAFECLONE_ALLOCATED_WARNING;
191 return clone();
192 }
193 virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); }
194 virtual UClassID getDynamicClassID() const override { return nullptr; }
195 virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; }
196
197 /* -- text modifying -- */
198 virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); }
199 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; }
200 virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); }
201 virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); }
202
203 /* -- other functions that are just delegated -- */
204 virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); }
205 virtual CharacterIterator& getText() const override { return fDelegate->getText(); }
206
207 /* -- ITERATION -- */
208 virtual int32_t first() override;
209 virtual int32_t preceding(int32_t offset) override;
210 virtual int32_t previous() override;
211 virtual UBool isBoundary(int32_t offset) override;
212 virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
213
214 virtual int32_t next() override;
215
216 virtual int32_t next(int32_t n) override;
217 virtual int32_t following(int32_t offset) override;
218 virtual int32_t last() override;
219
220private:
221 /**
222 * Given that the fDelegate has already given its "initial" answer,
223 * find the NEXT actual (non-excepted) break.
224 * @param n initial position from delegate
225 * @return new break position or UBRK_DONE
226 */
227 int32_t internalNext(int32_t n);
228 /**
229 * Given that the fDelegate has already given its "initial" answer,
230 * find the PREV actual (non-excepted) break.
231 * @param n initial position from delegate
232 * @return new break position or UBRK_DONE
233 */
234 int32_t internalPrev(int32_t n);
235 /**
236 * set up the UText with the value of the fDelegate.
237 * Call this before calling breakExceptionAt.
238 * May be able to avoid excess calls
239 */
240 void resetState(UErrorCode &status);
241 /**
242 * Is there a match (exception) at this spot?
243 */
244 enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
245 /**
246 * Determine if there is an exception at this spot
247 * @param n spot to check
248 * @return kNoExceptionHere or kExceptionHere
249 **/
250 enum EFBMatchResult breakExceptionAt(int32_t n);
251};
252
253SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
254 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
255{
256}
257
258
259SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
260 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
261 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
262 fDelegate(adopt)
263{
264 if (fData == nullptr) {
265 delete forwards;
266 delete backwards;
267 if (U_SUCCESS(status)) {
268 status = U_MEMORY_ALLOCATION_ERROR;
269 }
270 }
271}
272
273SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
274 fData = fData->decr();
275}
276
277void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
278 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
279}
280
281SimpleFilteredSentenceBreakIterator::EFBMatchResult
282SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
283 int64_t bestPosn = -1;
284 int32_t bestValue = -1;
285 // loops while 'n' points to an exception.
286 utext_setNativeIndex(fText.getAlias(), n); // from n..
287
288 //if(debug2) u_printf(" n@ %d\n", n);
289 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
290 if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here??
291 // TODO only do this the 1st time?
292 //if(debug2) u_printf("skipping prev: |%C| \n", (char16_t)uch);
293 } else {
294 //if(debug2) u_printf("not skipping prev: |%C| \n", (char16_t)uch);
295 utext_next32(fText.getAlias());
296 //if(debug2) u_printf(" -> : |%C| \n", (char16_t)uch);
297 }
298
299 {
300 // Do not modify the shared trie!
301 UCharsTrie iter(fData->getBackwardsTrie());
302 UChar32 uch;
303 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards
304 UStringTrieResult r = iter.nextForCodePoint(uch);
305 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
306 bestPosn = utext_getNativeIndex(fText.getAlias());
307 bestValue = iter.getValue();
308 }
309 if(!USTRINGTRIE_HAS_NEXT(r)) {
310 break;
311 }
312 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (char16_t)uch, r, utext_getNativeIndex(fText.getAlias()));
313 }
314 }
315
316 //if(bestValue >= 0) {
317 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
318 //}
319
320 if(bestPosn>=0) {
321 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
322
323 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
324 //int32_t bestValue = iter.getValue();
325 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (char16_t)uch, r, bestValue);
326
327 if(bestValue == kMATCH) { // exact match!
328 //if(debug2) u_printf(" exact backward match\n");
329 return kExceptionHere; // See if the next is another exception.
330 } else if(bestValue == kPARTIAL
331 && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
332 //if(debug2) u_printf(" partial backward match\n");
333 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
334 // to see if it matches something going forward.
335 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
336 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
337 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
338 // Do not modify the shared trie!
339 UCharsTrie iter(fData->getForwardsPartialTrie());
340 UChar32 uch;
341 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
342 USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
343 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (char16_t)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
344 }
345 if(USTRINGTRIE_MATCHES(rfwd)) {
346 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (char16_t)uch);
347 // only full matches here, nothing to check
348 // skip the next:
349 return kExceptionHere;
350 } else {
351 //if(debug2) u_printf("fwd> /%C/ no match.\n", (char16_t)uch);
352 // no match (no exception) -return the 'underlying' break
353 return kNoExceptionHere;
354 }
355 } else {
356 return kNoExceptionHere; // internal error and/or no forwards trie
357 }
358 } else {
359 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (char16_t)uch, r); // no best match
360 return kNoExceptionHere; // No match - so exit. Not an exception.
361 }
362}
363
364// the workhorse single next.
365int32_t
366SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
367 if(n == UBRK_DONE || // at end or
368 !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
369 return n;
370 }
371 // OK, do we need to break here?
372 UErrorCode status = U_ZERO_ERROR;
373 // refresh text
374 resetState(status);
375 if(U_FAILURE(status)) return UBRK_DONE; // bail out
376 int64_t utextLen = utext_nativeLength(fText.getAlias());
377
378 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
379 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
380 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
381
382 switch(m) {
383 case kExceptionHere:
384 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
385 continue;
386
387 default:
388 case kNoExceptionHere:
389 return n;
390 }
391 }
392 return n;
393}
394
395int32_t
396SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
397 if(n == 0 || n == UBRK_DONE || // at end or
398 !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
399 return n;
400 }
401 // OK, do we need to break here?
402 UErrorCode status = U_ZERO_ERROR;
403 // refresh text
404 resetState(status);
405 if(U_FAILURE(status)) return UBRK_DONE; // bail out
406
407 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
408 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
409 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
410
411 switch(m) {
412 case kExceptionHere:
413 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
414 continue;
415
416 default:
417 case kNoExceptionHere:
418 return n;
419 }
420 }
421 return n;
422}
423
424
425int32_t
426SimpleFilteredSentenceBreakIterator::next() {
427 return internalNext(fDelegate->next());
428}
429
430int32_t
431SimpleFilteredSentenceBreakIterator::first() {
432 // Don't suppress a break opportunity at the beginning of text.
433 return fDelegate->first();
434}
435
436int32_t
437SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
438 return internalPrev(fDelegate->preceding(offset));
439}
440
441int32_t
442SimpleFilteredSentenceBreakIterator::previous() {
443 return internalPrev(fDelegate->previous());
444}
445
446UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
447 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
448
449 if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
450
451 UErrorCode status = U_ZERO_ERROR;
452 resetState(status);
453
454 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
455
456 switch(m) {
457 case kExceptionHere:
458 return false;
459 default:
460 case kNoExceptionHere:
461 return true;
462 }
463}
464
465int32_t
466SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
467 return internalNext(fDelegate->next(offset));
468}
469
470int32_t
471SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
472 return internalNext(fDelegate->following(offset));
473}
474
475int32_t
476SimpleFilteredSentenceBreakIterator::last() {
477 // Don't suppress a break opportunity at the end of text.
478 return fDelegate->last();
479}
480
481
482/**
483 * Concrete implementation of builder class.
484 */
485class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
486public:
487 virtual ~SimpleFilteredBreakIteratorBuilder();
488 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
489 SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
490 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
491 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
492 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override;
493private:
494 UStringSet fSet;
495};
496
497SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
498{
499}
500
501SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
502 : fSet(status)
503{
504}
505
506SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
507 : fSet(status)
508{
509 if(U_SUCCESS(status)) {
510 UErrorCode subStatus = U_ZERO_ERROR;
511 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
512 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
513 status = subStatus; // copy the failing status
514#if FB_DEBUG
515 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
516#endif
517 return; // leaves the builder empty, if you try to use it.
518 }
519 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", nullptr, &subStatus));
520 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
521 status = subStatus; // copy the failing status
522#if FB_DEBUG
523 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
524#endif
525 return; // leaves the builder empty, if you try to use it.
526 }
527 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", nullptr, &subStatus));
528
529#if FB_DEBUG
530 {
531 UErrorCode subsub = subStatus;
532 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
533 }
534#endif
535
536 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
537 status = subStatus; // copy the failing status
538#if FB_DEBUG
539 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
540#endif
541 return; // leaves the builder empty, if you try to use it.
542 }
543
544 LocalUResourceBundlePointer strs;
545 subStatus = status; // Pick up inherited warning status now
546 do {
547 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
548 if(strs.isValid() && U_SUCCESS(subStatus)) {
549 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
550 suppressBreakAfter(str, status); // load the string
551 }
552 } while (strs.isValid() && U_SUCCESS(subStatus));
553 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
554 status = subStatus;
555 }
556 }
557}
558
559UBool
560SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
561{
562 UBool r = fSet.add(exception, status);
563 FB_TRACE("suppressBreakAfter",&exception,r,0);
564 return r;
565}
566
567UBool
568SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
569{
570 UBool r = fSet.remove(exception, status);
571 FB_TRACE("unsuppressBreakAfter",&exception,r,0);
572 return r;
573}
574
575/**
576 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
577 * Work around this.
578 *
579 * Note: "new UnicodeString[subCount]" ends up calling global operator new
580 * on MSVC2012 for some reason.
581 */
582static inline UnicodeString* newUnicodeStringArray(size_t count) {
583 return new UnicodeString[count ? count : 1];
584}
585
586BreakIterator *
587SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
588 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
589
590 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
591 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
592 if(U_FAILURE(status)) {
593 return nullptr;
594 }
595
596 int32_t revCount = 0;
597 int32_t fwdCount = 0;
598
599 int32_t subCount = fSet.size();
600
601 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
602
603 LocalArray<UnicodeString> ustrs(ustrs_ptr);
604
605 LocalMemory<int> partials;
606 partials.allocateInsteadAndReset(subCount);
607
608 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
609 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
610
611 int n=0;
612 for ( int32_t i = 0;
613 i<fSet.size();
614 i++) {
615 const UnicodeString *abbr = fSet.getStringAt(i);
616 if(abbr) {
617 FB_TRACE("build",abbr,true,i);
618 ustrs[n] = *abbr; // copy by value
619 FB_TRACE("ustrs[n]",&ustrs[n],true,i);
620 } else {
621 FB_TRACE("build",abbr,false,i);
622 status = U_MEMORY_ALLOCATION_ERROR;
623 return nullptr;
624 }
625 partials[n] = 0; // default: not partial
626 n++;
627 }
628 // first pass - find partials.
629 for(int i=0;i<subCount;i++) {
630 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
631 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
632 FB_TRACE("partial",&ustrs[i],false,i);
633 // is partial.
634 // is it unique?
635 int sameAs = -1;
636 for(int j=0;j<subCount;j++) {
637 if(j==i) continue;
638 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
639 FB_TRACE("prefix",&ustrs[j],false,nn+1);
640 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
641 if(partials[j]==0) { // hasn't been processed yet
642 partials[j] = kSuppressInReverse | kAddToForward;
643 FB_TRACE("suppressing",&ustrs[j],false,j);
644 } else if(partials[j] & kSuppressInReverse) {
645 sameAs = j; // the other entry is already in the reverse table.
646 }
647 }
648 }
649 FB_TRACE("for partial same-",&ustrs[i],false,sameAs);
650 FB_TRACE(" == partial #",&ustrs[i],false,partials[i]);
651 UnicodeString prefix(ustrs[i], 0, nn+1);
652 if(sameAs == -1 && partials[i] == 0) {
653 // first one - add the prefix to the reverse table.
654 prefix.reverse();
655 builder->add(prefix, kPARTIAL, status);
656 revCount++;
657 FB_TRACE("Added partial",&prefix,false, i);
658 FB_TRACE(u_errorName(status),&ustrs[i],false,i);
659 partials[i] = kSuppressInReverse | kAddToForward;
660 } else {
661 FB_TRACE("NOT adding partial",&prefix,false, i);
662 FB_TRACE(u_errorName(status),&ustrs[i],false,i);
663 }
664 }
665 }
666 for(int i=0;i<subCount;i++) {
667 if(partials[i]==0) {
668 ustrs[i].reverse();
669 builder->add(ustrs[i], kMATCH, status);
670 revCount++;
671 FB_TRACE(u_errorName(status), &ustrs[i], false, i);
672 } else {
673 FB_TRACE("Adding fwd",&ustrs[i], false, i);
674
675 // an optimization would be to only add the portion after the '.'
676 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
677 // instead of "Ph.D." since we already know the "Ph." part is a match.
678 // would need the trie to be able to hold 0-length strings, though.
679 builder2->add(ustrs[i], kMATCH, status); // forward
680 fwdCount++;
681 //ustrs[i].reverse();
682 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
683 }
684 }
685 FB_TRACE("AbbrCount",nullptr,false, subCount);
686
687 if(revCount>0) {
688 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
689 if(U_FAILURE(status)) {
690 FB_TRACE(u_errorName(status),nullptr,false, -1);
691 return nullptr;
692 }
693 }
694
695 if(fwdCount>0) {
696 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
697 if(U_FAILURE(status)) {
698 FB_TRACE(u_errorName(status),nullptr,false, -1);
699 return nullptr;
700 }
701 }
702
703 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
704}
705
706
707// ----------- Base class implementation
708
709FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
710}
711
712FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
713}
714
715FilteredBreakIteratorBuilder *
716FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
717 if(U_FAILURE(status)) return nullptr;
718 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
719 return (U_SUCCESS(status))? ret.orphan(): nullptr;
720}
721
722FilteredBreakIteratorBuilder *
723FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
724 return createEmptyInstance(status);
725}
726
727FilteredBreakIteratorBuilder *
728FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
729 if(U_FAILURE(status)) return nullptr;
730 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
731 return (U_SUCCESS(status))? ret.orphan(): nullptr;
732}
733
734U_NAMESPACE_END
735
736#endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
737