| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * Copyright (C) 2014-2015, International Business Machines Corporation and |
| 6 | * others. All Rights Reserved. |
| 7 | ******************************************************************************* |
| 8 | */ |
| 9 | |
| 10 | #include "unicode/utypes.h" |
| 11 | #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION |
| 12 | |
| 13 | #include "cmemory.h" |
| 14 | |
| 15 | #include "unicode/filteredbrk.h" |
| 16 | #include "unicode/ucharstriebuilder.h" |
| 17 | #include "unicode/ures.h" |
| 18 | |
| 19 | #include "uresimp.h" // ures_getByKeyWithFallback |
| 20 | #include "ubrkimpl.h" // U_ICUDATA_BRKITR |
| 21 | #include "uvector.h" |
| 22 | #include "cmemory.h" |
| 23 | #include "umutex.h" |
| 24 | |
| 25 | U_NAMESPACE_BEGIN |
| 26 | |
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only tracer. Writes one line to stderr containing the call site
 * (file and line), a message, the contents and address of an optional
 * string, a boolean shown as 'T'/'F', and an integer.
 */
static void _fb_trace(const char *msg, const UnicodeString *str, UBool flag, int32_t num, const char *file, int line) {
  char text[2048];
  if(str == nullptr) {
    strcpy(text, "nullptr");
  } else {
    str->extract(0, str->length(), text, 2048);
  }
  fprintf(stderr, "%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
          file, line, msg, text, (const void*)str, flag ? 'T' : 'F', (int)num);
}

// Tracing enabled: forward to _fb_trace with the call site attached.
#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
// Tracing disabled: FB_TRACE expands to nothing.
#define FB_TRACE(m,s,b,d)
#endif
| 48 | |
| 49 | /** |
| 50 | * Used with sortedInsert() |
| 51 | */ |
| 52 | static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
| 53 | const UnicodeString &a = *(const UnicodeString*)t1.pointer; |
| 54 | const UnicodeString &b = *(const UnicodeString*)t2.pointer; |
| 55 | return a.compare(b); |
| 56 | } |
| 57 | |
| 58 | /** |
| 59 | * A UVector which implements a set of strings. |
| 60 | */ |
| 61 | class UStringSet : public UVector { |
| 62 | public: |
| 63 | UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, |
| 64 | uhash_compareUnicodeString, |
| 65 | 1, |
| 66 | status) {} |
| 67 | virtual ~UStringSet(); |
| 68 | /** |
| 69 | * Is this UnicodeSet contained? |
| 70 | */ |
| 71 | inline UBool contains(const UnicodeString& s) { |
| 72 | return contains((void*) &s); |
| 73 | } |
| 74 | using UVector::contains; |
| 75 | /** |
| 76 | * Return the ith UnicodeString alias |
| 77 | */ |
| 78 | inline const UnicodeString* getStringAt(int32_t i) const { |
| 79 | return (const UnicodeString*)elementAt(i); |
| 80 | } |
| 81 | /** |
| 82 | * Adopt the UnicodeString if not already contained. |
| 83 | * Caller no longer owns the pointer in any case. |
| 84 | * @return true if adopted successfully, false otherwise (error, or else duplicate) |
| 85 | */ |
| 86 | inline UBool adopt(UnicodeString *str, UErrorCode &status) { |
| 87 | if(U_FAILURE(status) || contains(*str)) { |
| 88 | delete str; |
| 89 | return false; |
| 90 | } else { |
| 91 | sortedInsert(str, compareUnicodeString, status); |
| 92 | if(U_FAILURE(status)) { |
| 93 | return false; |
| 94 | } |
| 95 | return true; |
| 96 | } |
| 97 | } |
| 98 | /** |
| 99 | * Add by value. |
| 100 | * @return true if successfully adopted. |
| 101 | */ |
| 102 | inline UBool add(const UnicodeString& str, UErrorCode &status) { |
| 103 | if(U_FAILURE(status)) return false; |
| 104 | UnicodeString *t = new UnicodeString(str); |
| 105 | if(t==nullptr) { |
| 106 | status = U_MEMORY_ALLOCATION_ERROR; return false; |
| 107 | } |
| 108 | return adopt(t, status); |
| 109 | } |
| 110 | /** |
| 111 | * Remove this string. |
| 112 | * @return true if successfully removed, false otherwise (error, or else it wasn't there) |
| 113 | */ |
| 114 | inline UBool remove(const UnicodeString &s, UErrorCode &status) { |
| 115 | if(U_FAILURE(status)) return false; |
| 116 | return removeElement((void*) &s); |
| 117 | } |
| 118 | }; |
| 119 | |
| 120 | /** |
| 121 | * Virtual, won't be inlined |
| 122 | */ |
| 123 | UStringSet::~UStringSet() {} |
| 124 | |
| 125 | /* ----------------------------------------------------------- */ |
| 126 | |
| 127 | |
/* Filtered Break constants */

// Values stored in the tries.
static constexpr int32_t kPARTIAL = 1 << 0; //< partial - need to run through forward trie
static constexpr int32_t kMATCH = 1 << 1; //< exact match - skip this one.

// Per-entry flags.
static constexpr int32_t kSuppressInReverse = 1 << 0;
static constexpr int32_t kAddToForward = 1 << 1;

static constexpr char16_t kFULLSTOP = 0x002E; // '.'
| 134 | |
| 135 | /** |
| 136 | * Shared data for SimpleFilteredSentenceBreakIterator |
| 137 | */ |
| 138 | class SimpleFilteredSentenceBreakData : public UMemory { |
| 139 | public: |
| 140 | SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) |
| 141 | : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } |
| 142 | SimpleFilteredSentenceBreakData *incr() { |
| 143 | umtx_atomic_inc(&refcount); |
| 144 | return this; |
| 145 | } |
| 146 | SimpleFilteredSentenceBreakData *decr() { |
| 147 | if(umtx_atomic_dec(&refcount) <= 0) { |
| 148 | delete this; |
| 149 | } |
| 150 | return 0; |
| 151 | } |
| 152 | virtual ~SimpleFilteredSentenceBreakData(); |
| 153 | |
| 154 | bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); } |
| 155 | bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); } |
| 156 | |
| 157 | const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; } |
| 158 | const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; } |
| 159 | |
| 160 | private: |
| 161 | // These tries own their data arrays. |
| 162 | // They are shared and must therefore not be modified. |
| 163 | LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." |
| 164 | LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. |
| 165 | u_atomic_int32_t refcount; |
| 166 | }; |
| 167 | |
| 168 | SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} |
| 169 | |
| 170 | /** |
| 171 | * Concrete implementation |
| 172 | */ |
| 173 | class SimpleFilteredSentenceBreakIterator : public BreakIterator { |
| 174 | public: |
| 175 | SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); |
| 176 | SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); |
| 177 | virtual ~SimpleFilteredSentenceBreakIterator(); |
| 178 | private: |
| 179 | SimpleFilteredSentenceBreakData *fData; |
| 180 | LocalPointer<BreakIterator> fDelegate; |
| 181 | LocalUTextPointer fText; |
| 182 | |
| 183 | /* -- subclass interface -- */ |
| 184 | public: |
| 185 | /* -- cloning and other subclass stuff -- */ |
| 186 | virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, |
| 187 | int32_t &/*BufferSize*/, |
| 188 | UErrorCode &status) override { |
| 189 | // for now - always deep clone |
| 190 | status = U_SAFECLONE_ALLOCATED_WARNING; |
| 191 | return clone(); |
| 192 | } |
| 193 | virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); } |
| 194 | virtual UClassID getDynamicClassID() const override { return nullptr; } |
| 195 | virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; } |
| 196 | |
| 197 | /* -- text modifying -- */ |
| 198 | virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); } |
| 199 | virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; } |
| 200 | virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); } |
| 201 | virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); } |
| 202 | |
| 203 | /* -- other functions that are just delegated -- */ |
| 204 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); } |
| 205 | virtual CharacterIterator& getText() const override { return fDelegate->getText(); } |
| 206 | |
| 207 | /* -- ITERATION -- */ |
| 208 | virtual int32_t first() override; |
| 209 | virtual int32_t preceding(int32_t offset) override; |
| 210 | virtual int32_t previous() override; |
| 211 | virtual UBool isBoundary(int32_t offset) override; |
| 212 | virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct. |
| 213 | |
| 214 | virtual int32_t next() override; |
| 215 | |
| 216 | virtual int32_t next(int32_t n) override; |
| 217 | virtual int32_t following(int32_t offset) override; |
| 218 | virtual int32_t last() override; |
| 219 | |
| 220 | private: |
| 221 | /** |
| 222 | * Given that the fDelegate has already given its "initial" answer, |
| 223 | * find the NEXT actual (non-excepted) break. |
| 224 | * @param n initial position from delegate |
| 225 | * @return new break position or UBRK_DONE |
| 226 | */ |
| 227 | int32_t internalNext(int32_t n); |
| 228 | /** |
| 229 | * Given that the fDelegate has already given its "initial" answer, |
| 230 | * find the PREV actual (non-excepted) break. |
| 231 | * @param n initial position from delegate |
| 232 | * @return new break position or UBRK_DONE |
| 233 | */ |
| 234 | int32_t internalPrev(int32_t n); |
| 235 | /** |
| 236 | * set up the UText with the value of the fDelegate. |
| 237 | * Call this before calling breakExceptionAt. |
| 238 | * May be able to avoid excess calls |
| 239 | */ |
| 240 | void resetState(UErrorCode &status); |
| 241 | /** |
| 242 | * Is there a match (exception) at this spot? |
| 243 | */ |
| 244 | enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; |
| 245 | /** |
| 246 | * Determine if there is an exception at this spot |
| 247 | * @param n spot to check |
| 248 | * @return kNoExceptionHere or kExceptionHere |
| 249 | **/ |
| 250 | enum EFBMatchResult breakExceptionAt(int32_t n); |
| 251 | }; |
| 252 | |
| 253 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) |
| 254 | : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) |
| 255 | { |
| 256 | } |
| 257 | |
| 258 | |
| 259 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : |
| 260 | BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), |
| 261 | fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), |
| 262 | fDelegate(adopt) |
| 263 | { |
| 264 | if (fData == nullptr) { |
| 265 | delete forwards; |
| 266 | delete backwards; |
| 267 | if (U_SUCCESS(status)) { |
| 268 | status = U_MEMORY_ALLOCATION_ERROR; |
| 269 | } |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { |
| 274 | fData = fData->decr(); |
| 275 | } |
| 276 | |
| 277 | void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { |
| 278 | fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); |
| 279 | } |
| 280 | |
| 281 | SimpleFilteredSentenceBreakIterator::EFBMatchResult |
| 282 | SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { |
| 283 | int64_t bestPosn = -1; |
| 284 | int32_t bestValue = -1; |
| 285 | // loops while 'n' points to an exception. |
| 286 | utext_setNativeIndex(fText.getAlias(), n); // from n.. |
| 287 | |
| 288 | //if(debug2) u_printf(" n@ %d\n", n); |
| 289 | // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") |
| 290 | if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here?? |
| 291 | // TODO only do this the 1st time? |
| 292 | //if(debug2) u_printf("skipping prev: |%C| \n", (char16_t)uch); |
| 293 | } else { |
| 294 | //if(debug2) u_printf("not skipping prev: |%C| \n", (char16_t)uch); |
| 295 | utext_next32(fText.getAlias()); |
| 296 | //if(debug2) u_printf(" -> : |%C| \n", (char16_t)uch); |
| 297 | } |
| 298 | |
| 299 | { |
| 300 | // Do not modify the shared trie! |
| 301 | UCharsTrie iter(fData->getBackwardsTrie()); |
| 302 | UChar32 uch; |
| 303 | while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards |
| 304 | UStringTrieResult r = iter.nextForCodePoint(uch); |
| 305 | if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far |
| 306 | bestPosn = utext_getNativeIndex(fText.getAlias()); |
| 307 | bestValue = iter.getValue(); |
| 308 | } |
| 309 | if(!USTRINGTRIE_HAS_NEXT(r)) { |
| 310 | break; |
| 311 | } |
| 312 | //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (char16_t)uch, r, utext_getNativeIndex(fText.getAlias())); |
| 313 | } |
| 314 | } |
| 315 | |
| 316 | //if(bestValue >= 0) { |
| 317 | //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue); |
| 318 | //} |
| 319 | |
| 320 | if(bestPosn>=0) { |
| 321 | //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue); |
| 322 | |
| 323 | //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? |
| 324 | //int32_t bestValue = iter.getValue(); |
| 325 | ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (char16_t)uch, r, bestValue); |
| 326 | |
| 327 | if(bestValue == kMATCH) { // exact match! |
| 328 | //if(debug2) u_printf(" exact backward match\n"); |
| 329 | return kExceptionHere; // See if the next is another exception. |
| 330 | } else if(bestValue == kPARTIAL |
| 331 | && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie |
| 332 | //if(debug2) u_printf(" partial backward match\n"); |
| 333 | // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie |
| 334 | // to see if it matches something going forward. |
| 335 | UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; |
| 336 | utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. |
| 337 | //if(debug2) u_printf("Retrying at %d\n", bestPosn); |
| 338 | // Do not modify the shared trie! |
| 339 | UCharsTrie iter(fData->getForwardsPartialTrie()); |
| 340 | UChar32 uch; |
| 341 | while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && |
| 342 | USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) { |
| 343 | //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (char16_t)uch, rfwd, utext_getNativeIndex(fText.getAlias())); |
| 344 | } |
| 345 | if(USTRINGTRIE_MATCHES(rfwd)) { |
| 346 | //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (char16_t)uch); |
| 347 | // only full matches here, nothing to check |
| 348 | // skip the next: |
| 349 | return kExceptionHere; |
| 350 | } else { |
| 351 | //if(debug2) u_printf("fwd> /%C/ no match.\n", (char16_t)uch); |
| 352 | // no match (no exception) -return the 'underlying' break |
| 353 | return kNoExceptionHere; |
| 354 | } |
| 355 | } else { |
| 356 | return kNoExceptionHere; // internal error and/or no forwards trie |
| 357 | } |
| 358 | } else { |
| 359 | //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (char16_t)uch, r); // no best match |
| 360 | return kNoExceptionHere; // No match - so exit. Not an exception. |
| 361 | } |
| 362 | } |
| 363 | |
| 364 | // the workhorse single next. |
| 365 | int32_t |
| 366 | SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { |
| 367 | if(n == UBRK_DONE || // at end or |
| 368 | !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions |
| 369 | return n; |
| 370 | } |
| 371 | // OK, do we need to break here? |
| 372 | UErrorCode status = U_ZERO_ERROR; |
| 373 | // refresh text |
| 374 | resetState(status); |
| 375 | if(U_FAILURE(status)) return UBRK_DONE; // bail out |
| 376 | int64_t utextLen = utext_nativeLength(fText.getAlias()); |
| 377 | |
| 378 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
| 379 | while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). |
| 380 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
| 381 | |
| 382 | switch(m) { |
| 383 | case kExceptionHere: |
| 384 | n = fDelegate->next(); // skip this one. Find the next lowerlevel break. |
| 385 | continue; |
| 386 | |
| 387 | default: |
| 388 | case kNoExceptionHere: |
| 389 | return n; |
| 390 | } |
| 391 | } |
| 392 | return n; |
| 393 | } |
| 394 | |
| 395 | int32_t |
| 396 | SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { |
| 397 | if(n == 0 || n == UBRK_DONE || // at end or |
| 398 | !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions |
| 399 | return n; |
| 400 | } |
| 401 | // OK, do we need to break here? |
| 402 | UErrorCode status = U_ZERO_ERROR; |
| 403 | // refresh text |
| 404 | resetState(status); |
| 405 | if(U_FAILURE(status)) return UBRK_DONE; // bail out |
| 406 | |
| 407 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
| 408 | while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). |
| 409 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
| 410 | |
| 411 | switch(m) { |
| 412 | case kExceptionHere: |
| 413 | n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. |
| 414 | continue; |
| 415 | |
| 416 | default: |
| 417 | case kNoExceptionHere: |
| 418 | return n; |
| 419 | } |
| 420 | } |
| 421 | return n; |
| 422 | } |
| 423 | |
| 424 | |
| 425 | int32_t |
| 426 | SimpleFilteredSentenceBreakIterator::next() { |
| 427 | return internalNext(fDelegate->next()); |
| 428 | } |
| 429 | |
| 430 | int32_t |
| 431 | SimpleFilteredSentenceBreakIterator::first() { |
| 432 | // Don't suppress a break opportunity at the beginning of text. |
| 433 | return fDelegate->first(); |
| 434 | } |
| 435 | |
| 436 | int32_t |
| 437 | SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { |
| 438 | return internalPrev(fDelegate->preceding(offset)); |
| 439 | } |
| 440 | |
| 441 | int32_t |
| 442 | SimpleFilteredSentenceBreakIterator::previous() { |
| 443 | return internalPrev(fDelegate->previous()); |
| 444 | } |
| 445 | |
| 446 | UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { |
| 447 | if (!fDelegate->isBoundary(offset)) return false; // no break to suppress |
| 448 | |
| 449 | if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions |
| 450 | |
| 451 | UErrorCode status = U_ZERO_ERROR; |
| 452 | resetState(status); |
| 453 | |
| 454 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); |
| 455 | |
| 456 | switch(m) { |
| 457 | case kExceptionHere: |
| 458 | return false; |
| 459 | default: |
| 460 | case kNoExceptionHere: |
| 461 | return true; |
| 462 | } |
| 463 | } |
| 464 | |
| 465 | int32_t |
| 466 | SimpleFilteredSentenceBreakIterator::next(int32_t offset) { |
| 467 | return internalNext(fDelegate->next(offset)); |
| 468 | } |
| 469 | |
| 470 | int32_t |
| 471 | SimpleFilteredSentenceBreakIterator::following(int32_t offset) { |
| 472 | return internalNext(fDelegate->following(offset)); |
| 473 | } |
| 474 | |
| 475 | int32_t |
| 476 | SimpleFilteredSentenceBreakIterator::last() { |
| 477 | // Don't suppress a break opportunity at the end of text. |
| 478 | return fDelegate->last(); |
| 479 | } |
| 480 | |
| 481 | |
| 482 | /** |
| 483 | * Concrete implementation of builder class. |
| 484 | */ |
| 485 | class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { |
| 486 | public: |
| 487 | virtual ~SimpleFilteredBreakIteratorBuilder(); |
| 488 | SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); |
| 489 | SimpleFilteredBreakIteratorBuilder(UErrorCode &status); |
| 490 | virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override; |
| 491 | virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override; |
| 492 | virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override; |
| 493 | private: |
| 494 | UStringSet fSet; |
| 495 | }; |
| 496 | |
| 497 | SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() |
| 498 | { |
| 499 | } |
| 500 | |
| 501 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) |
| 502 | : fSet(status) |
| 503 | { |
| 504 | } |
| 505 | |
| 506 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) |
| 507 | : fSet(status) |
| 508 | { |
| 509 | if(U_SUCCESS(status)) { |
| 510 | UErrorCode subStatus = U_ZERO_ERROR; |
| 511 | LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus)); |
| 512 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
| 513 | status = subStatus; // copy the failing status |
| 514 | #if FB_DEBUG |
| 515 | fprintf(stderr, "open BUNDLE %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
| 516 | #endif |
| 517 | return; // leaves the builder empty, if you try to use it. |
| 518 | } |
| 519 | LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions" , nullptr, &subStatus)); |
| 520 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
| 521 | status = subStatus; // copy the failing status |
| 522 | #if FB_DEBUG |
| 523 | fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
| 524 | #endif |
| 525 | return; // leaves the builder empty, if you try to use it. |
| 526 | } |
| 527 | LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak" , nullptr, &subStatus)); |
| 528 | |
| 529 | #if FB_DEBUG |
| 530 | { |
| 531 | UErrorCode subsub = subStatus; |
| 532 | fprintf(stderr, "open SentenceBreak %s => %s, %s\n" , fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus)); |
| 533 | } |
| 534 | #endif |
| 535 | |
| 536 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
| 537 | status = subStatus; // copy the failing status |
| 538 | #if FB_DEBUG |
| 539 | fprintf(stderr, "open %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
| 540 | #endif |
| 541 | return; // leaves the builder empty, if you try to use it. |
| 542 | } |
| 543 | |
| 544 | LocalUResourceBundlePointer strs; |
| 545 | subStatus = status; // Pick up inherited warning status now |
| 546 | do { |
| 547 | strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); |
| 548 | if(strs.isValid() && U_SUCCESS(subStatus)) { |
| 549 | UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); |
| 550 | suppressBreakAfter(str, status); // load the string |
| 551 | } |
| 552 | } while (strs.isValid() && U_SUCCESS(subStatus)); |
| 553 | if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { |
| 554 | status = subStatus; |
| 555 | } |
| 556 | } |
| 557 | } |
| 558 | |
| 559 | UBool |
| 560 | SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) |
| 561 | { |
| 562 | UBool r = fSet.add(exception, status); |
| 563 | FB_TRACE("suppressBreakAfter" ,&exception,r,0); |
| 564 | return r; |
| 565 | } |
| 566 | |
| 567 | UBool |
| 568 | SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) |
| 569 | { |
| 570 | UBool r = fSet.remove(exception, status); |
| 571 | FB_TRACE("unsuppressBreakAfter" ,&exception,r,0); |
| 572 | return r; |
| 573 | } |
| 574 | |
| 575 | /** |
| 576 | * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. |
| 577 | * Work around this. |
| 578 | * |
| 579 | * Note: "new UnicodeString[subCount]" ends up calling global operator new |
| 580 | * on MSVC2012 for some reason. |
| 581 | */ |
| 582 | static inline UnicodeString* newUnicodeStringArray(size_t count) { |
| 583 | return new UnicodeString[count ? count : 1]; |
| 584 | } |
| 585 | |
| 586 | BreakIterator * |
| 587 | SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { |
| 588 | LocalPointer<BreakIterator> adopt(adoptBreakIterator); |
| 589 | |
| 590 | LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); |
| 591 | LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); |
| 592 | if(U_FAILURE(status)) { |
| 593 | return nullptr; |
| 594 | } |
| 595 | |
| 596 | int32_t revCount = 0; |
| 597 | int32_t fwdCount = 0; |
| 598 | |
| 599 | int32_t subCount = fSet.size(); |
| 600 | |
| 601 | UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); |
| 602 | |
| 603 | LocalArray<UnicodeString> ustrs(ustrs_ptr); |
| 604 | |
| 605 | LocalMemory<int> partials; |
| 606 | partials.allocateInsteadAndReset(subCount); |
| 607 | |
| 608 | LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. |
| 609 | LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." |
| 610 | |
| 611 | int n=0; |
| 612 | for ( int32_t i = 0; |
| 613 | i<fSet.size(); |
| 614 | i++) { |
| 615 | const UnicodeString *abbr = fSet.getStringAt(i); |
| 616 | if(abbr) { |
| 617 | FB_TRACE("build" ,abbr,true,i); |
| 618 | ustrs[n] = *abbr; // copy by value |
| 619 | FB_TRACE("ustrs[n]" ,&ustrs[n],true,i); |
| 620 | } else { |
| 621 | FB_TRACE("build" ,abbr,false,i); |
| 622 | status = U_MEMORY_ALLOCATION_ERROR; |
| 623 | return nullptr; |
| 624 | } |
| 625 | partials[n] = 0; // default: not partial |
| 626 | n++; |
| 627 | } |
| 628 | // first pass - find partials. |
| 629 | for(int i=0;i<subCount;i++) { |
| 630 | int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations |
| 631 | if(nn>-1 && (nn+1)!=ustrs[i].length()) { |
| 632 | FB_TRACE("partial" ,&ustrs[i],false,i); |
| 633 | // is partial. |
| 634 | // is it unique? |
| 635 | int sameAs = -1; |
| 636 | for(int j=0;j<subCount;j++) { |
| 637 | if(j==i) continue; |
| 638 | if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { |
| 639 | FB_TRACE("prefix" ,&ustrs[j],false,nn+1); |
| 640 | //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn |
| 641 | if(partials[j]==0) { // hasn't been processed yet |
| 642 | partials[j] = kSuppressInReverse | kAddToForward; |
| 643 | FB_TRACE("suppressing" ,&ustrs[j],false,j); |
| 644 | } else if(partials[j] & kSuppressInReverse) { |
| 645 | sameAs = j; // the other entry is already in the reverse table. |
| 646 | } |
| 647 | } |
| 648 | } |
| 649 | FB_TRACE("for partial same-" ,&ustrs[i],false,sameAs); |
| 650 | FB_TRACE(" == partial #" ,&ustrs[i],false,partials[i]); |
| 651 | UnicodeString prefix(ustrs[i], 0, nn+1); |
| 652 | if(sameAs == -1 && partials[i] == 0) { |
| 653 | // first one - add the prefix to the reverse table. |
| 654 | prefix.reverse(); |
| 655 | builder->add(prefix, kPARTIAL, status); |
| 656 | revCount++; |
| 657 | FB_TRACE("Added partial" ,&prefix,false, i); |
| 658 | FB_TRACE(u_errorName(status),&ustrs[i],false,i); |
| 659 | partials[i] = kSuppressInReverse | kAddToForward; |
| 660 | } else { |
| 661 | FB_TRACE("NOT adding partial" ,&prefix,false, i); |
| 662 | FB_TRACE(u_errorName(status),&ustrs[i],false,i); |
| 663 | } |
| 664 | } |
| 665 | } |
| 666 | for(int i=0;i<subCount;i++) { |
| 667 | if(partials[i]==0) { |
| 668 | ustrs[i].reverse(); |
| 669 | builder->add(ustrs[i], kMATCH, status); |
| 670 | revCount++; |
| 671 | FB_TRACE(u_errorName(status), &ustrs[i], false, i); |
| 672 | } else { |
| 673 | FB_TRACE("Adding fwd" ,&ustrs[i], false, i); |
| 674 | |
| 675 | // an optimization would be to only add the portion after the '.' |
| 676 | // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, |
| 677 | // instead of "Ph.D." since we already know the "Ph." part is a match. |
| 678 | // would need the trie to be able to hold 0-length strings, though. |
| 679 | builder2->add(ustrs[i], kMATCH, status); // forward |
| 680 | fwdCount++; |
| 681 | //ustrs[i].reverse(); |
| 682 | ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); |
| 683 | } |
| 684 | } |
| 685 | FB_TRACE("AbbrCount" ,nullptr,false, subCount); |
| 686 | |
| 687 | if(revCount>0) { |
| 688 | backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); |
| 689 | if(U_FAILURE(status)) { |
| 690 | FB_TRACE(u_errorName(status),nullptr,false, -1); |
| 691 | return nullptr; |
| 692 | } |
| 693 | } |
| 694 | |
| 695 | if(fwdCount>0) { |
| 696 | forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); |
| 697 | if(U_FAILURE(status)) { |
| 698 | FB_TRACE(u_errorName(status),nullptr,false, -1); |
| 699 | return nullptr; |
| 700 | } |
| 701 | } |
| 702 | |
| 703 | return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); |
| 704 | } |
| 705 | |
| 706 | |
| 707 | // ----------- Base class implementation |
| 708 | |
| 709 | FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { |
| 710 | } |
| 711 | |
| 712 | FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { |
| 713 | } |
| 714 | |
| 715 | FilteredBreakIteratorBuilder * |
| 716 | FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { |
| 717 | if(U_FAILURE(status)) return nullptr; |
| 718 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); |
| 719 | return (U_SUCCESS(status))? ret.orphan(): nullptr; |
| 720 | } |
| 721 | |
| 722 | FilteredBreakIteratorBuilder * |
| 723 | FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) { |
| 724 | return createEmptyInstance(status); |
| 725 | } |
| 726 | |
| 727 | FilteredBreakIteratorBuilder * |
| 728 | FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) { |
| 729 | if(U_FAILURE(status)) return nullptr; |
| 730 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); |
| 731 | return (U_SUCCESS(status))? ret.orphan(): nullptr; |
| 732 | } |
| 733 | |
| 734 | U_NAMESPACE_END |
| 735 | |
| 736 | #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION |
| 737 | |