1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 2014-2015, International Business Machines Corporation and |
6 | * others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | */ |
9 | |
10 | #include "unicode/utypes.h" |
11 | #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION |
12 | |
13 | #include "cmemory.h" |
14 | |
15 | #include "unicode/filteredbrk.h" |
16 | #include "unicode/ucharstriebuilder.h" |
17 | #include "unicode/ures.h" |
18 | |
19 | #include "uresimp.h" // ures_getByKeyWithFallback |
20 | #include "ubrkimpl.h" // U_ICUDATA_BRKITR |
21 | #include "uvector.h" |
22 | #include "cmemory.h" |
23 | #include "umutex.h" |
24 | |
25 | U_NAMESPACE_BEGIN |
26 | |
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only helper behind FB_TRACE: writes one trace line to stderr.
 * @param m message text
 * @param s string to show, or nullptr (printed as "nullptr")
 * @param b flag, printed as 'T' or 'F'
 * @param d integer detail value
 * @param f source file name of the trace site
 * @param l source line number of the trace site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
    char buf[2048];
    const char *shown = "nullptr";
    if (s != nullptr) {
        // Keep the last byte as a guaranteed terminator: extract() is only given
        // the bytes before it, so an over-long string cannot leave buf unterminated.
        buf[sizeof(buf) - 1] = 0;
        s->extract(0, s->length(), buf, static_cast<uint32_t>(sizeof(buf) - 1));
        shown = buf;
    }
    fprintf(stderr, "%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
            f, l, m, shown, (const void*)s, b ? 'T' : 'F', (int)d);
}

#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
// Tracing disabled: FB_TRACE expands to nothing.
#define FB_TRACE(m,s,b,d)
#endif
48 | |
49 | /** |
50 | * Used with sortedInsert() |
51 | */ |
52 | static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
53 | const UnicodeString &a = *(const UnicodeString*)t1.pointer; |
54 | const UnicodeString &b = *(const UnicodeString*)t2.pointer; |
55 | return a.compare(b); |
56 | } |
57 | |
58 | /** |
59 | * A UVector which implements a set of strings. |
60 | */ |
61 | class UStringSet : public UVector { |
62 | public: |
63 | UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, |
64 | uhash_compareUnicodeString, |
65 | 1, |
66 | status) {} |
67 | virtual ~UStringSet(); |
68 | /** |
69 | * Is this UnicodeSet contained? |
70 | */ |
71 | inline UBool contains(const UnicodeString& s) { |
72 | return contains((void*) &s); |
73 | } |
74 | using UVector::contains; |
75 | /** |
76 | * Return the ith UnicodeString alias |
77 | */ |
78 | inline const UnicodeString* getStringAt(int32_t i) const { |
79 | return (const UnicodeString*)elementAt(i); |
80 | } |
81 | /** |
82 | * Adopt the UnicodeString if not already contained. |
83 | * Caller no longer owns the pointer in any case. |
84 | * @return true if adopted successfully, false otherwise (error, or else duplicate) |
85 | */ |
86 | inline UBool adopt(UnicodeString *str, UErrorCode &status) { |
87 | if(U_FAILURE(status) || contains(*str)) { |
88 | delete str; |
89 | return false; |
90 | } else { |
91 | sortedInsert(str, compareUnicodeString, status); |
92 | if(U_FAILURE(status)) { |
93 | return false; |
94 | } |
95 | return true; |
96 | } |
97 | } |
98 | /** |
99 | * Add by value. |
100 | * @return true if successfully adopted. |
101 | */ |
102 | inline UBool add(const UnicodeString& str, UErrorCode &status) { |
103 | if(U_FAILURE(status)) return false; |
104 | UnicodeString *t = new UnicodeString(str); |
105 | if(t==nullptr) { |
106 | status = U_MEMORY_ALLOCATION_ERROR; return false; |
107 | } |
108 | return adopt(t, status); |
109 | } |
110 | /** |
111 | * Remove this string. |
112 | * @return true if successfully removed, false otherwise (error, or else it wasn't there) |
113 | */ |
114 | inline UBool remove(const UnicodeString &s, UErrorCode &status) { |
115 | if(U_FAILURE(status)) return false; |
116 | return removeElement((void*) &s); |
117 | } |
118 | }; |
119 | |
120 | /** |
121 | * Virtual, won't be inlined |
122 | */ |
123 | UStringSet::~UStringSet() {} |
124 | |
125 | /* ----------------------------------------------------------- */ |
126 | |
127 | |
/* Filtered Break constants */

// Values stored in the tries.
static constexpr int32_t kPARTIAL = 1 << 0;  //< partial - need to run through forward trie
static constexpr int32_t kMATCH = 1 << 1;    //< exact match - skip this one.

// Per-string flags used while building the tries.
static constexpr int32_t kSuppressInReverse = 1 << 0;
static constexpr int32_t kAddToForward = 1 << 1;

static constexpr char16_t kFULLSTOP = u'.';  // U+002E
134 | |
135 | /** |
136 | * Shared data for SimpleFilteredSentenceBreakIterator |
137 | */ |
138 | class SimpleFilteredSentenceBreakData : public UMemory { |
139 | public: |
140 | SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) |
141 | : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } |
142 | SimpleFilteredSentenceBreakData *incr() { |
143 | umtx_atomic_inc(&refcount); |
144 | return this; |
145 | } |
146 | SimpleFilteredSentenceBreakData *decr() { |
147 | if(umtx_atomic_dec(&refcount) <= 0) { |
148 | delete this; |
149 | } |
150 | return 0; |
151 | } |
152 | virtual ~SimpleFilteredSentenceBreakData(); |
153 | |
154 | bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); } |
155 | bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); } |
156 | |
157 | const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; } |
158 | const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; } |
159 | |
160 | private: |
161 | // These tries own their data arrays. |
162 | // They are shared and must therefore not be modified. |
163 | LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." |
164 | LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. |
165 | u_atomic_int32_t refcount; |
166 | }; |
167 | |
168 | SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} |
169 | |
170 | /** |
171 | * Concrete implementation |
172 | */ |
173 | class SimpleFilteredSentenceBreakIterator : public BreakIterator { |
174 | public: |
175 | SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); |
176 | SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); |
177 | virtual ~SimpleFilteredSentenceBreakIterator(); |
178 | private: |
179 | SimpleFilteredSentenceBreakData *fData; |
180 | LocalPointer<BreakIterator> fDelegate; |
181 | LocalUTextPointer fText; |
182 | |
183 | /* -- subclass interface -- */ |
184 | public: |
185 | /* -- cloning and other subclass stuff -- */ |
186 | virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, |
187 | int32_t &/*BufferSize*/, |
188 | UErrorCode &status) override { |
189 | // for now - always deep clone |
190 | status = U_SAFECLONE_ALLOCATED_WARNING; |
191 | return clone(); |
192 | } |
193 | virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); } |
194 | virtual UClassID getDynamicClassID() const override { return nullptr; } |
195 | virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; } |
196 | |
197 | /* -- text modifying -- */ |
198 | virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); } |
199 | virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; } |
200 | virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); } |
201 | virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); } |
202 | |
203 | /* -- other functions that are just delegated -- */ |
204 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); } |
205 | virtual CharacterIterator& getText() const override { return fDelegate->getText(); } |
206 | |
207 | /* -- ITERATION -- */ |
208 | virtual int32_t first() override; |
209 | virtual int32_t preceding(int32_t offset) override; |
210 | virtual int32_t previous() override; |
211 | virtual UBool isBoundary(int32_t offset) override; |
212 | virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct. |
213 | |
214 | virtual int32_t next() override; |
215 | |
216 | virtual int32_t next(int32_t n) override; |
217 | virtual int32_t following(int32_t offset) override; |
218 | virtual int32_t last() override; |
219 | |
220 | private: |
221 | /** |
222 | * Given that the fDelegate has already given its "initial" answer, |
223 | * find the NEXT actual (non-excepted) break. |
224 | * @param n initial position from delegate |
225 | * @return new break position or UBRK_DONE |
226 | */ |
227 | int32_t internalNext(int32_t n); |
228 | /** |
229 | * Given that the fDelegate has already given its "initial" answer, |
230 | * find the PREV actual (non-excepted) break. |
231 | * @param n initial position from delegate |
232 | * @return new break position or UBRK_DONE |
233 | */ |
234 | int32_t internalPrev(int32_t n); |
235 | /** |
236 | * set up the UText with the value of the fDelegate. |
237 | * Call this before calling breakExceptionAt. |
238 | * May be able to avoid excess calls |
239 | */ |
240 | void resetState(UErrorCode &status); |
241 | /** |
242 | * Is there a match (exception) at this spot? |
243 | */ |
244 | enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; |
245 | /** |
246 | * Determine if there is an exception at this spot |
247 | * @param n spot to check |
248 | * @return kNoExceptionHere or kExceptionHere |
249 | **/ |
250 | enum EFBMatchResult breakExceptionAt(int32_t n); |
251 | }; |
252 | |
253 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) |
254 | : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) |
255 | { |
256 | } |
257 | |
258 | |
259 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : |
260 | BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), |
261 | fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), |
262 | fDelegate(adopt) |
263 | { |
264 | if (fData == nullptr) { |
265 | delete forwards; |
266 | delete backwards; |
267 | if (U_SUCCESS(status)) { |
268 | status = U_MEMORY_ALLOCATION_ERROR; |
269 | } |
270 | } |
271 | } |
272 | |
273 | SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { |
274 | fData = fData->decr(); |
275 | } |
276 | |
277 | void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { |
278 | fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); |
279 | } |
280 | |
281 | SimpleFilteredSentenceBreakIterator::EFBMatchResult |
282 | SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { |
283 | int64_t bestPosn = -1; |
284 | int32_t bestValue = -1; |
285 | // loops while 'n' points to an exception. |
286 | utext_setNativeIndex(fText.getAlias(), n); // from n.. |
287 | |
288 | //if(debug2) u_printf(" n@ %d\n", n); |
289 | // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") |
290 | if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here?? |
291 | // TODO only do this the 1st time? |
292 | //if(debug2) u_printf("skipping prev: |%C| \n", (char16_t)uch); |
293 | } else { |
294 | //if(debug2) u_printf("not skipping prev: |%C| \n", (char16_t)uch); |
295 | utext_next32(fText.getAlias()); |
296 | //if(debug2) u_printf(" -> : |%C| \n", (char16_t)uch); |
297 | } |
298 | |
299 | { |
300 | // Do not modify the shared trie! |
301 | UCharsTrie iter(fData->getBackwardsTrie()); |
302 | UChar32 uch; |
303 | while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards |
304 | UStringTrieResult r = iter.nextForCodePoint(uch); |
305 | if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far |
306 | bestPosn = utext_getNativeIndex(fText.getAlias()); |
307 | bestValue = iter.getValue(); |
308 | } |
309 | if(!USTRINGTRIE_HAS_NEXT(r)) { |
310 | break; |
311 | } |
312 | //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (char16_t)uch, r, utext_getNativeIndex(fText.getAlias())); |
313 | } |
314 | } |
315 | |
316 | //if(bestValue >= 0) { |
317 | //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue); |
318 | //} |
319 | |
320 | if(bestPosn>=0) { |
321 | //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue); |
322 | |
323 | //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? |
324 | //int32_t bestValue = iter.getValue(); |
325 | ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (char16_t)uch, r, bestValue); |
326 | |
327 | if(bestValue == kMATCH) { // exact match! |
328 | //if(debug2) u_printf(" exact backward match\n"); |
329 | return kExceptionHere; // See if the next is another exception. |
330 | } else if(bestValue == kPARTIAL |
331 | && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie |
332 | //if(debug2) u_printf(" partial backward match\n"); |
333 | // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie |
334 | // to see if it matches something going forward. |
335 | UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; |
336 | utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. |
337 | //if(debug2) u_printf("Retrying at %d\n", bestPosn); |
338 | // Do not modify the shared trie! |
339 | UCharsTrie iter(fData->getForwardsPartialTrie()); |
340 | UChar32 uch; |
341 | while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && |
342 | USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) { |
343 | //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (char16_t)uch, rfwd, utext_getNativeIndex(fText.getAlias())); |
344 | } |
345 | if(USTRINGTRIE_MATCHES(rfwd)) { |
346 | //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (char16_t)uch); |
347 | // only full matches here, nothing to check |
348 | // skip the next: |
349 | return kExceptionHere; |
350 | } else { |
351 | //if(debug2) u_printf("fwd> /%C/ no match.\n", (char16_t)uch); |
352 | // no match (no exception) -return the 'underlying' break |
353 | return kNoExceptionHere; |
354 | } |
355 | } else { |
356 | return kNoExceptionHere; // internal error and/or no forwards trie |
357 | } |
358 | } else { |
359 | //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (char16_t)uch, r); // no best match |
360 | return kNoExceptionHere; // No match - so exit. Not an exception. |
361 | } |
362 | } |
363 | |
364 | // the workhorse single next. |
365 | int32_t |
366 | SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { |
367 | if(n == UBRK_DONE || // at end or |
368 | !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions |
369 | return n; |
370 | } |
371 | // OK, do we need to break here? |
372 | UErrorCode status = U_ZERO_ERROR; |
373 | // refresh text |
374 | resetState(status); |
375 | if(U_FAILURE(status)) return UBRK_DONE; // bail out |
376 | int64_t utextLen = utext_nativeLength(fText.getAlias()); |
377 | |
378 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
379 | while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). |
380 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
381 | |
382 | switch(m) { |
383 | case kExceptionHere: |
384 | n = fDelegate->next(); // skip this one. Find the next lowerlevel break. |
385 | continue; |
386 | |
387 | default: |
388 | case kNoExceptionHere: |
389 | return n; |
390 | } |
391 | } |
392 | return n; |
393 | } |
394 | |
395 | int32_t |
396 | SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { |
397 | if(n == 0 || n == UBRK_DONE || // at end or |
398 | !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions |
399 | return n; |
400 | } |
401 | // OK, do we need to break here? |
402 | UErrorCode status = U_ZERO_ERROR; |
403 | // refresh text |
404 | resetState(status); |
405 | if(U_FAILURE(status)) return UBRK_DONE; // bail out |
406 | |
407 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
408 | while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). |
409 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
410 | |
411 | switch(m) { |
412 | case kExceptionHere: |
413 | n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. |
414 | continue; |
415 | |
416 | default: |
417 | case kNoExceptionHere: |
418 | return n; |
419 | } |
420 | } |
421 | return n; |
422 | } |
423 | |
424 | |
425 | int32_t |
426 | SimpleFilteredSentenceBreakIterator::next() { |
427 | return internalNext(fDelegate->next()); |
428 | } |
429 | |
430 | int32_t |
431 | SimpleFilteredSentenceBreakIterator::first() { |
432 | // Don't suppress a break opportunity at the beginning of text. |
433 | return fDelegate->first(); |
434 | } |
435 | |
436 | int32_t |
437 | SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { |
438 | return internalPrev(fDelegate->preceding(offset)); |
439 | } |
440 | |
441 | int32_t |
442 | SimpleFilteredSentenceBreakIterator::previous() { |
443 | return internalPrev(fDelegate->previous()); |
444 | } |
445 | |
446 | UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { |
447 | if (!fDelegate->isBoundary(offset)) return false; // no break to suppress |
448 | |
449 | if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions |
450 | |
451 | UErrorCode status = U_ZERO_ERROR; |
452 | resetState(status); |
453 | |
454 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); |
455 | |
456 | switch(m) { |
457 | case kExceptionHere: |
458 | return false; |
459 | default: |
460 | case kNoExceptionHere: |
461 | return true; |
462 | } |
463 | } |
464 | |
465 | int32_t |
466 | SimpleFilteredSentenceBreakIterator::next(int32_t offset) { |
467 | return internalNext(fDelegate->next(offset)); |
468 | } |
469 | |
470 | int32_t |
471 | SimpleFilteredSentenceBreakIterator::following(int32_t offset) { |
472 | return internalNext(fDelegate->following(offset)); |
473 | } |
474 | |
475 | int32_t |
476 | SimpleFilteredSentenceBreakIterator::last() { |
477 | // Don't suppress a break opportunity at the end of text. |
478 | return fDelegate->last(); |
479 | } |
480 | |
481 | |
482 | /** |
483 | * Concrete implementation of builder class. |
484 | */ |
485 | class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { |
486 | public: |
487 | virtual ~SimpleFilteredBreakIteratorBuilder(); |
488 | SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); |
489 | SimpleFilteredBreakIteratorBuilder(UErrorCode &status); |
490 | virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override; |
491 | virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override; |
492 | virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override; |
493 | private: |
494 | UStringSet fSet; |
495 | }; |
496 | |
497 | SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() |
498 | { |
499 | } |
500 | |
501 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) |
502 | : fSet(status) |
503 | { |
504 | } |
505 | |
506 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) |
507 | : fSet(status) |
508 | { |
509 | if(U_SUCCESS(status)) { |
510 | UErrorCode subStatus = U_ZERO_ERROR; |
511 | LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus)); |
512 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
513 | status = subStatus; // copy the failing status |
514 | #if FB_DEBUG |
515 | fprintf(stderr, "open BUNDLE %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
516 | #endif |
517 | return; // leaves the builder empty, if you try to use it. |
518 | } |
519 | LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions" , nullptr, &subStatus)); |
520 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
521 | status = subStatus; // copy the failing status |
522 | #if FB_DEBUG |
523 | fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
524 | #endif |
525 | return; // leaves the builder empty, if you try to use it. |
526 | } |
527 | LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak" , nullptr, &subStatus)); |
528 | |
529 | #if FB_DEBUG |
530 | { |
531 | UErrorCode subsub = subStatus; |
532 | fprintf(stderr, "open SentenceBreak %s => %s, %s\n" , fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus)); |
533 | } |
534 | #endif |
535 | |
536 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
537 | status = subStatus; // copy the failing status |
538 | #if FB_DEBUG |
539 | fprintf(stderr, "open %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
540 | #endif |
541 | return; // leaves the builder empty, if you try to use it. |
542 | } |
543 | |
544 | LocalUResourceBundlePointer strs; |
545 | subStatus = status; // Pick up inherited warning status now |
546 | do { |
547 | strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); |
548 | if(strs.isValid() && U_SUCCESS(subStatus)) { |
549 | UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); |
550 | suppressBreakAfter(str, status); // load the string |
551 | } |
552 | } while (strs.isValid() && U_SUCCESS(subStatus)); |
553 | if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { |
554 | status = subStatus; |
555 | } |
556 | } |
557 | } |
558 | |
559 | UBool |
560 | SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) |
561 | { |
562 | UBool r = fSet.add(exception, status); |
563 | FB_TRACE("suppressBreakAfter" ,&exception,r,0); |
564 | return r; |
565 | } |
566 | |
567 | UBool |
568 | SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) |
569 | { |
570 | UBool r = fSet.remove(exception, status); |
571 | FB_TRACE("unsuppressBreakAfter" ,&exception,r,0); |
572 | return r; |
573 | } |
574 | |
575 | /** |
576 | * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. |
577 | * Work around this. |
578 | * |
579 | * Note: "new UnicodeString[subCount]" ends up calling global operator new |
580 | * on MSVC2012 for some reason. |
581 | */ |
582 | static inline UnicodeString* newUnicodeStringArray(size_t count) { |
583 | return new UnicodeString[count ? count : 1]; |
584 | } |
585 | |
586 | BreakIterator * |
587 | SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { |
588 | LocalPointer<BreakIterator> adopt(adoptBreakIterator); |
589 | |
590 | LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); |
591 | LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); |
592 | if(U_FAILURE(status)) { |
593 | return nullptr; |
594 | } |
595 | |
596 | int32_t revCount = 0; |
597 | int32_t fwdCount = 0; |
598 | |
599 | int32_t subCount = fSet.size(); |
600 | |
601 | UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); |
602 | |
603 | LocalArray<UnicodeString> ustrs(ustrs_ptr); |
604 | |
605 | LocalMemory<int> partials; |
606 | partials.allocateInsteadAndReset(subCount); |
607 | |
608 | LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. |
609 | LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." |
610 | |
611 | int n=0; |
612 | for ( int32_t i = 0; |
613 | i<fSet.size(); |
614 | i++) { |
615 | const UnicodeString *abbr = fSet.getStringAt(i); |
616 | if(abbr) { |
617 | FB_TRACE("build" ,abbr,true,i); |
618 | ustrs[n] = *abbr; // copy by value |
619 | FB_TRACE("ustrs[n]" ,&ustrs[n],true,i); |
620 | } else { |
621 | FB_TRACE("build" ,abbr,false,i); |
622 | status = U_MEMORY_ALLOCATION_ERROR; |
623 | return nullptr; |
624 | } |
625 | partials[n] = 0; // default: not partial |
626 | n++; |
627 | } |
628 | // first pass - find partials. |
629 | for(int i=0;i<subCount;i++) { |
630 | int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations |
631 | if(nn>-1 && (nn+1)!=ustrs[i].length()) { |
632 | FB_TRACE("partial" ,&ustrs[i],false,i); |
633 | // is partial. |
634 | // is it unique? |
635 | int sameAs = -1; |
636 | for(int j=0;j<subCount;j++) { |
637 | if(j==i) continue; |
638 | if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { |
639 | FB_TRACE("prefix" ,&ustrs[j],false,nn+1); |
640 | //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn |
641 | if(partials[j]==0) { // hasn't been processed yet |
642 | partials[j] = kSuppressInReverse | kAddToForward; |
643 | FB_TRACE("suppressing" ,&ustrs[j],false,j); |
644 | } else if(partials[j] & kSuppressInReverse) { |
645 | sameAs = j; // the other entry is already in the reverse table. |
646 | } |
647 | } |
648 | } |
649 | FB_TRACE("for partial same-" ,&ustrs[i],false,sameAs); |
650 | FB_TRACE(" == partial #" ,&ustrs[i],false,partials[i]); |
651 | UnicodeString prefix(ustrs[i], 0, nn+1); |
652 | if(sameAs == -1 && partials[i] == 0) { |
653 | // first one - add the prefix to the reverse table. |
654 | prefix.reverse(); |
655 | builder->add(prefix, kPARTIAL, status); |
656 | revCount++; |
657 | FB_TRACE("Added partial" ,&prefix,false, i); |
658 | FB_TRACE(u_errorName(status),&ustrs[i],false,i); |
659 | partials[i] = kSuppressInReverse | kAddToForward; |
660 | } else { |
661 | FB_TRACE("NOT adding partial" ,&prefix,false, i); |
662 | FB_TRACE(u_errorName(status),&ustrs[i],false,i); |
663 | } |
664 | } |
665 | } |
666 | for(int i=0;i<subCount;i++) { |
667 | if(partials[i]==0) { |
668 | ustrs[i].reverse(); |
669 | builder->add(ustrs[i], kMATCH, status); |
670 | revCount++; |
671 | FB_TRACE(u_errorName(status), &ustrs[i], false, i); |
672 | } else { |
673 | FB_TRACE("Adding fwd" ,&ustrs[i], false, i); |
674 | |
675 | // an optimization would be to only add the portion after the '.' |
676 | // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, |
677 | // instead of "Ph.D." since we already know the "Ph." part is a match. |
678 | // would need the trie to be able to hold 0-length strings, though. |
679 | builder2->add(ustrs[i], kMATCH, status); // forward |
680 | fwdCount++; |
681 | //ustrs[i].reverse(); |
682 | ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); |
683 | } |
684 | } |
685 | FB_TRACE("AbbrCount" ,nullptr,false, subCount); |
686 | |
687 | if(revCount>0) { |
688 | backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); |
689 | if(U_FAILURE(status)) { |
690 | FB_TRACE(u_errorName(status),nullptr,false, -1); |
691 | return nullptr; |
692 | } |
693 | } |
694 | |
695 | if(fwdCount>0) { |
696 | forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); |
697 | if(U_FAILURE(status)) { |
698 | FB_TRACE(u_errorName(status),nullptr,false, -1); |
699 | return nullptr; |
700 | } |
701 | } |
702 | |
703 | return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); |
704 | } |
705 | |
706 | |
707 | // ----------- Base class implementation |
708 | |
709 | FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { |
710 | } |
711 | |
712 | FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { |
713 | } |
714 | |
715 | FilteredBreakIteratorBuilder * |
716 | FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { |
717 | if(U_FAILURE(status)) return nullptr; |
718 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); |
719 | return (U_SUCCESS(status))? ret.orphan(): nullptr; |
720 | } |
721 | |
722 | FilteredBreakIteratorBuilder * |
723 | FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) { |
724 | return createEmptyInstance(status); |
725 | } |
726 | |
727 | FilteredBreakIteratorBuilder * |
728 | FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) { |
729 | if(U_FAILURE(status)) return nullptr; |
730 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); |
731 | return (U_SUCCESS(status))? ret.orphan(): nullptr; |
732 | } |
733 | |
734 | U_NAMESPACE_END |
735 | |
736 | #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION |
737 | |