1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * Copyright (C) 2014-2015, International Business Machines Corporation and |
6 | * others. All Rights Reserved. |
7 | ******************************************************************************* |
8 | */ |
9 | |
10 | #include "unicode/utypes.h" |
11 | #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION |
12 | |
13 | #include "cmemory.h" |
14 | |
15 | #include "unicode/filteredbrk.h" |
16 | #include "unicode/ucharstriebuilder.h" |
17 | #include "unicode/ures.h" |
18 | |
19 | #include "uresimp.h" // ures_getByKeyWithFallback |
20 | #include "ubrkimpl.h" // U_ICUDATA_BRKITR |
21 | #include "uvector.h" |
22 | #include "cmemory.h" |
23 | |
24 | U_NAMESPACE_BEGIN |
25 | |
26 | #ifndef FB_DEBUG |
27 | #define FB_DEBUG 0 |
28 | #endif |
29 | |
30 | #if FB_DEBUG |
31 | #include <stdio.h> |
32 | static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) { |
33 | char buf[2048]; |
34 | if(s) { |
35 | s->extract(0,s->length(),buf,2048); |
36 | } else { |
37 | strcpy(buf,"NULL" ); |
38 | } |
39 | fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n" , |
40 | f, l, m, buf, (const void*)s, b?'T':'F',(int)d); |
41 | } |
42 | |
43 | #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__) |
44 | #else |
45 | #define FB_TRACE(m,s,b,d) |
46 | #endif |
47 | |
48 | /** |
49 | * Used with sortedInsert() |
50 | */ |
51 | static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
52 | const UnicodeString &a = *(const UnicodeString*)t1.pointer; |
53 | const UnicodeString &b = *(const UnicodeString*)t2.pointer; |
54 | return a.compare(b); |
55 | } |
56 | |
57 | /** |
58 | * A UVector which implements a set of strings. |
59 | */ |
60 | class U_COMMON_API UStringSet : public UVector { |
61 | public: |
62 | UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, |
63 | uhash_compareUnicodeString, |
64 | 1, |
65 | status) {} |
66 | virtual ~UStringSet(); |
67 | /** |
68 | * Is this UnicodeSet contained? |
69 | */ |
70 | inline UBool contains(const UnicodeString& s) { |
71 | return contains((void*) &s); |
72 | } |
73 | using UVector::contains; |
74 | /** |
75 | * Return the ith UnicodeString alias |
76 | */ |
77 | inline const UnicodeString* getStringAt(int32_t i) const { |
78 | return (const UnicodeString*)elementAt(i); |
79 | } |
80 | /** |
81 | * Adopt the UnicodeString if not already contained. |
82 | * Caller no longer owns the pointer in any case. |
83 | * @return true if adopted successfully, false otherwise (error, or else duplicate) |
84 | */ |
85 | inline UBool adopt(UnicodeString *str, UErrorCode &status) { |
86 | if(U_FAILURE(status) || contains(*str)) { |
87 | delete str; |
88 | return false; |
89 | } else { |
90 | sortedInsert(str, compareUnicodeString, status); |
91 | if(U_FAILURE(status)) { |
92 | delete str; |
93 | return false; |
94 | } |
95 | return true; |
96 | } |
97 | } |
98 | /** |
99 | * Add by value. |
100 | * @return true if successfully adopted. |
101 | */ |
102 | inline UBool add(const UnicodeString& str, UErrorCode &status) { |
103 | if(U_FAILURE(status)) return false; |
104 | UnicodeString *t = new UnicodeString(str); |
105 | if(t==NULL) { |
106 | status = U_MEMORY_ALLOCATION_ERROR; return false; |
107 | } |
108 | return adopt(t, status); |
109 | } |
110 | /** |
111 | * Remove this string. |
112 | * @return true if successfully removed, false otherwise (error, or else it wasn't there) |
113 | */ |
114 | inline UBool remove(const UnicodeString &s, UErrorCode &status) { |
115 | if(U_FAILURE(status)) return false; |
116 | return removeElement((void*) &s); |
117 | } |
118 | }; |
119 | |
120 | /** |
121 | * Virtual, won't be inlined |
122 | */ |
123 | UStringSet::~UStringSet() {} |
124 | |
125 | /* ----------------------------------------------------------- */ |
126 | |
127 | |
128 | /* Filtered Break constants */ |
129 | static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie |
130 | static const int32_t kMATCH = (1<<1); //< exact match - skip this one. |
131 | static const int32_t kSuppressInReverse = (1<<0); |
132 | static const int32_t kAddToForward = (1<<1); |
133 | static const UChar kFULLSTOP = 0x002E; // '.' |
134 | |
135 | /** |
136 | * Shared data for SimpleFilteredSentenceBreakIterator |
137 | */ |
138 | class SimpleFilteredSentenceBreakData : public UMemory { |
139 | public: |
140 | SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) |
141 | : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } |
142 | SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } |
143 | SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } |
144 | virtual ~SimpleFilteredSentenceBreakData(); |
145 | |
146 | LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." |
147 | LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. |
148 | int32_t refcount; |
149 | }; |
150 | |
151 | SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} |
152 | |
153 | /** |
154 | * Concrete implementation |
155 | */ |
156 | class SimpleFilteredSentenceBreakIterator : public BreakIterator { |
157 | public: |
158 | SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); |
159 | SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); |
160 | virtual ~SimpleFilteredSentenceBreakIterator(); |
161 | private: |
162 | SimpleFilteredSentenceBreakData *fData; |
163 | LocalPointer<BreakIterator> fDelegate; |
164 | LocalUTextPointer fText; |
165 | |
166 | /* -- subclass interface -- */ |
167 | public: |
168 | /* -- cloning and other subclass stuff -- */ |
169 | virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, |
170 | int32_t &/*BufferSize*/, |
171 | UErrorCode &status) { |
172 | // for now - always deep clone |
173 | status = U_SAFECLONE_ALLOCATED_WARNING; |
174 | return clone(); |
175 | } |
176 | virtual SimpleFilteredSentenceBreakIterator* clone() const { return new SimpleFilteredSentenceBreakIterator(*this); } |
177 | virtual UClassID getDynamicClassID(void) const { return NULL; } |
178 | virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; } |
179 | |
180 | /* -- text modifying -- */ |
181 | virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); } |
182 | virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } |
183 | virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } |
184 | virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } |
185 | |
186 | /* -- other functions that are just delegated -- */ |
187 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } |
188 | virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } |
189 | |
190 | /* -- ITERATION -- */ |
191 | virtual int32_t first(void); |
192 | virtual int32_t preceding(int32_t offset); |
193 | virtual int32_t previous(void); |
194 | virtual UBool isBoundary(int32_t offset); |
195 | virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. |
196 | |
197 | virtual int32_t next(void); |
198 | |
199 | virtual int32_t next(int32_t n); |
200 | virtual int32_t following(int32_t offset); |
201 | virtual int32_t last(void); |
202 | |
203 | private: |
204 | /** |
205 | * Given that the fDelegate has already given its "initial" answer, |
206 | * find the NEXT actual (non-excepted) break. |
207 | * @param n initial position from delegate |
208 | * @return new break position or UBRK_DONE |
209 | */ |
210 | int32_t internalNext(int32_t n); |
211 | /** |
212 | * Given that the fDelegate has already given its "initial" answer, |
213 | * find the PREV actual (non-excepted) break. |
214 | * @param n initial position from delegate |
215 | * @return new break position or UBRK_DONE |
216 | */ |
217 | int32_t internalPrev(int32_t n); |
218 | /** |
219 | * set up the UText with the value of the fDelegate. |
220 | * Call this before calling breakExceptionAt. |
221 | * May be able to avoid excess calls |
222 | */ |
223 | void resetState(UErrorCode &status); |
224 | /** |
225 | * Is there a match (exception) at this spot? |
226 | */ |
227 | enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; |
228 | /** |
229 | * Determine if there is an exception at this spot |
230 | * @param n spot to check |
231 | * @return kNoExceptionHere or kExceptionHere |
232 | **/ |
233 | enum EFBMatchResult breakExceptionAt(int32_t n); |
234 | }; |
235 | |
236 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) |
237 | : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) |
238 | { |
239 | } |
240 | |
241 | |
242 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : |
243 | BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), |
244 | fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), |
245 | fDelegate(adopt) |
246 | { |
247 | // all set.. |
248 | } |
249 | |
250 | SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { |
251 | fData = fData->decr(); |
252 | } |
253 | |
254 | void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { |
255 | fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); |
256 | } |
257 | |
258 | SimpleFilteredSentenceBreakIterator::EFBMatchResult |
259 | SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { |
260 | int64_t bestPosn = -1; |
261 | int32_t bestValue = -1; |
262 | // loops while 'n' points to an exception. |
263 | utext_setNativeIndex(fText.getAlias(), n); // from n.. |
264 | fData->fBackwardsTrie->reset(); |
265 | UChar32 uch; |
266 | |
267 | //if(debug2) u_printf(" n@ %d\n", n); |
268 | // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") |
269 | if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? |
270 | // TODO only do this the 1st time? |
271 | //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); |
272 | } else { |
273 | //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); |
274 | uch = utext_next32(fText.getAlias()); |
275 | //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); |
276 | } |
277 | |
278 | UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; |
279 | |
280 | while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. |
281 | USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie |
282 | if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far |
283 | bestPosn = utext_getNativeIndex(fText.getAlias()); |
284 | bestValue = fData->fBackwardsTrie->getValue(); |
285 | } |
286 | //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); |
287 | } |
288 | |
289 | if(USTRINGTRIE_MATCHES(r)) { // exact match? |
290 | //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); |
291 | bestValue = fData->fBackwardsTrie->getValue(); |
292 | bestPosn = utext_getNativeIndex(fText.getAlias()); |
293 | //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); |
294 | } |
295 | |
296 | if(bestPosn>=0) { |
297 | //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); |
298 | |
299 | //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? |
300 | //int32_t bestValue = fBackwardsTrie->getValue(); |
301 | ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); |
302 | |
303 | if(bestValue == kMATCH) { // exact match! |
304 | //if(debug2) u_printf(" exact backward match\n"); |
305 | return kExceptionHere; // See if the next is another exception. |
306 | } else if(bestValue == kPARTIAL |
307 | && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie |
308 | //if(debug2) u_printf(" partial backward match\n"); |
309 | // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie |
310 | // to see if it matches something going forward. |
311 | fData->fForwardsPartialTrie->reset(); |
312 | UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; |
313 | utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. |
314 | //if(debug2) u_printf("Retrying at %d\n", bestPosn); |
315 | while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && |
316 | USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { |
317 | //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); |
318 | } |
319 | if(USTRINGTRIE_MATCHES(rfwd)) { |
320 | //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); |
321 | // only full matches here, nothing to check |
322 | // skip the next: |
323 | return kExceptionHere; |
324 | } else { |
325 | //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); |
326 | // no match (no exception) -return the 'underlying' break |
327 | return kNoExceptionHere; |
328 | } |
329 | } else { |
330 | return kNoExceptionHere; // internal error and/or no forwards trie |
331 | } |
332 | } else { |
333 | //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match |
334 | return kNoExceptionHere; // No match - so exit. Not an exception. |
335 | } |
336 | } |
337 | |
338 | // the workhorse single next. |
339 | int32_t |
340 | SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { |
341 | if(n == UBRK_DONE || // at end or |
342 | fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions |
343 | return n; |
344 | } |
345 | // OK, do we need to break here? |
346 | UErrorCode status = U_ZERO_ERROR; |
347 | // refresh text |
348 | resetState(status); |
349 | if(U_FAILURE(status)) return UBRK_DONE; // bail out |
350 | int64_t utextLen = utext_nativeLength(fText.getAlias()); |
351 | |
352 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
353 | while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). |
354 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
355 | |
356 | switch(m) { |
357 | case kExceptionHere: |
358 | n = fDelegate->next(); // skip this one. Find the next lowerlevel break. |
359 | continue; |
360 | |
361 | default: |
362 | case kNoExceptionHere: |
363 | return n; |
364 | } |
365 | } |
366 | return n; |
367 | } |
368 | |
369 | int32_t |
370 | SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { |
371 | if(n == 0 || n == UBRK_DONE || // at end or |
372 | fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions |
373 | return n; |
374 | } |
375 | // OK, do we need to break here? |
376 | UErrorCode status = U_ZERO_ERROR; |
377 | // refresh text |
378 | resetState(status); |
379 | if(U_FAILURE(status)) return UBRK_DONE; // bail out |
380 | |
381 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
382 | while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). |
383 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); |
384 | |
385 | switch(m) { |
386 | case kExceptionHere: |
387 | n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. |
388 | continue; |
389 | |
390 | default: |
391 | case kNoExceptionHere: |
392 | return n; |
393 | } |
394 | } |
395 | return n; |
396 | } |
397 | |
398 | |
399 | int32_t |
400 | SimpleFilteredSentenceBreakIterator::next() { |
401 | return internalNext(fDelegate->next()); |
402 | } |
403 | |
404 | int32_t |
405 | SimpleFilteredSentenceBreakIterator::first(void) { |
406 | // Don't suppress a break opportunity at the beginning of text. |
407 | return fDelegate->first(); |
408 | } |
409 | |
410 | int32_t |
411 | SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { |
412 | return internalPrev(fDelegate->preceding(offset)); |
413 | } |
414 | |
415 | int32_t |
416 | SimpleFilteredSentenceBreakIterator::previous(void) { |
417 | return internalPrev(fDelegate->previous()); |
418 | } |
419 | |
420 | UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { |
421 | if (!fDelegate->isBoundary(offset)) return false; // no break to suppress |
422 | |
423 | if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions |
424 | |
425 | UErrorCode status = U_ZERO_ERROR; |
426 | resetState(status); |
427 | |
428 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); |
429 | |
430 | switch(m) { |
431 | case kExceptionHere: |
432 | return false; |
433 | default: |
434 | case kNoExceptionHere: |
435 | return true; |
436 | } |
437 | } |
438 | |
439 | int32_t |
440 | SimpleFilteredSentenceBreakIterator::next(int32_t offset) { |
441 | return internalNext(fDelegate->next(offset)); |
442 | } |
443 | |
444 | int32_t |
445 | SimpleFilteredSentenceBreakIterator::following(int32_t offset) { |
446 | return internalNext(fDelegate->following(offset)); |
447 | } |
448 | |
449 | int32_t |
450 | SimpleFilteredSentenceBreakIterator::last(void) { |
451 | // Don't suppress a break opportunity at the end of text. |
452 | return fDelegate->last(); |
453 | } |
454 | |
455 | |
456 | /** |
457 | * Concrete implementation of builder class. |
458 | */ |
459 | class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { |
460 | public: |
461 | virtual ~SimpleFilteredBreakIteratorBuilder(); |
462 | SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); |
463 | SimpleFilteredBreakIteratorBuilder(UErrorCode &status); |
464 | virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); |
465 | virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); |
466 | virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); |
467 | private: |
468 | UStringSet fSet; |
469 | }; |
470 | |
471 | SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() |
472 | { |
473 | } |
474 | |
475 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) |
476 | : fSet(status) |
477 | { |
478 | } |
479 | |
480 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) |
481 | : fSet(status) |
482 | { |
483 | if(U_SUCCESS(status)) { |
484 | UErrorCode subStatus = U_ZERO_ERROR; |
485 | LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus)); |
486 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
487 | status = subStatus; // copy the failing status |
488 | #if FB_DEBUG |
489 | fprintf(stderr, "open BUNDLE %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
490 | #endif |
491 | return; // leaves the builder empty, if you try to use it. |
492 | } |
493 | LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions" , NULL, &subStatus)); |
494 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
495 | status = subStatus; // copy the failing status |
496 | #if FB_DEBUG |
497 | fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
498 | #endif |
499 | return; // leaves the builder empty, if you try to use it. |
500 | } |
501 | LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak" , NULL, &subStatus)); |
502 | |
503 | #if FB_DEBUG |
504 | { |
505 | UErrorCode subsub = subStatus; |
506 | fprintf(stderr, "open SentenceBreak %s => %s, %s\n" , fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus)); |
507 | } |
508 | #endif |
509 | |
510 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { |
511 | status = subStatus; // copy the failing status |
512 | #if FB_DEBUG |
513 | fprintf(stderr, "open %s : %s, %s\n" , fromLocale.getBaseName(), "[exit]" , u_errorName(status)); |
514 | #endif |
515 | return; // leaves the builder empty, if you try to use it. |
516 | } |
517 | |
518 | LocalUResourceBundlePointer strs; |
519 | subStatus = status; // Pick up inherited warning status now |
520 | do { |
521 | strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); |
522 | if(strs.isValid() && U_SUCCESS(subStatus)) { |
523 | UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); |
524 | suppressBreakAfter(str, status); // load the string |
525 | } |
526 | } while (strs.isValid() && U_SUCCESS(subStatus)); |
527 | if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { |
528 | status = subStatus; |
529 | } |
530 | } |
531 | } |
532 | |
533 | UBool |
534 | SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) |
535 | { |
536 | UBool r = fSet.add(exception, status); |
537 | FB_TRACE("suppressBreakAfter" ,&exception,r,0); |
538 | return r; |
539 | } |
540 | |
541 | UBool |
542 | SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) |
543 | { |
544 | UBool r = fSet.remove(exception, status); |
545 | FB_TRACE("unsuppressBreakAfter" ,&exception,r,0); |
546 | return r; |
547 | } |
548 | |
549 | /** |
550 | * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. |
551 | * Work around this. |
552 | * |
553 | * Note: "new UnicodeString[subCount]" ends up calling global operator new |
554 | * on MSVC2012 for some reason. |
555 | */ |
556 | static inline UnicodeString* newUnicodeStringArray(size_t count) { |
557 | return new UnicodeString[count ? count : 1]; |
558 | } |
559 | |
560 | BreakIterator * |
561 | SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { |
562 | LocalPointer<BreakIterator> adopt(adoptBreakIterator); |
563 | |
564 | LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); |
565 | LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); |
566 | if(U_FAILURE(status)) { |
567 | return NULL; |
568 | } |
569 | |
570 | int32_t revCount = 0; |
571 | int32_t fwdCount = 0; |
572 | |
573 | int32_t subCount = fSet.size(); |
574 | |
575 | UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); |
576 | |
577 | LocalArray<UnicodeString> ustrs(ustrs_ptr); |
578 | |
579 | LocalMemory<int> partials; |
580 | partials.allocateInsteadAndReset(subCount); |
581 | |
582 | LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. |
583 | LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." |
584 | |
585 | int n=0; |
586 | for ( int32_t i = 0; |
587 | i<fSet.size(); |
588 | i++) { |
589 | const UnicodeString *abbr = fSet.getStringAt(i); |
590 | if(abbr) { |
591 | FB_TRACE("build" ,abbr,TRUE,i); |
592 | ustrs[n] = *abbr; // copy by value |
593 | FB_TRACE("ustrs[n]" ,&ustrs[n],TRUE,i); |
594 | } else { |
595 | FB_TRACE("build" ,abbr,FALSE,i); |
596 | status = U_MEMORY_ALLOCATION_ERROR; |
597 | return NULL; |
598 | } |
599 | partials[n] = 0; // default: not partial |
600 | n++; |
601 | } |
602 | // first pass - find partials. |
603 | for(int i=0;i<subCount;i++) { |
604 | int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations |
605 | if(nn>-1 && (nn+1)!=ustrs[i].length()) { |
606 | FB_TRACE("partial" ,&ustrs[i],FALSE,i); |
607 | // is partial. |
608 | // is it unique? |
609 | int sameAs = -1; |
610 | for(int j=0;j<subCount;j++) { |
611 | if(j==i) continue; |
612 | if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { |
613 | FB_TRACE("prefix" ,&ustrs[j],FALSE,nn+1); |
614 | //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn |
615 | if(partials[j]==0) { // hasn't been processed yet |
616 | partials[j] = kSuppressInReverse | kAddToForward; |
617 | FB_TRACE("suppressing" ,&ustrs[j],FALSE,j); |
618 | } else if(partials[j] & kSuppressInReverse) { |
619 | sameAs = j; // the other entry is already in the reverse table. |
620 | } |
621 | } |
622 | } |
623 | FB_TRACE("for partial same-" ,&ustrs[i],FALSE,sameAs); |
624 | FB_TRACE(" == partial #" ,&ustrs[i],FALSE,partials[i]); |
625 | UnicodeString prefix(ustrs[i], 0, nn+1); |
626 | if(sameAs == -1 && partials[i] == 0) { |
627 | // first one - add the prefix to the reverse table. |
628 | prefix.reverse(); |
629 | builder->add(prefix, kPARTIAL, status); |
630 | revCount++; |
631 | FB_TRACE("Added partial" ,&prefix,FALSE, i); |
632 | FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); |
633 | partials[i] = kSuppressInReverse | kAddToForward; |
634 | } else { |
635 | FB_TRACE("NOT adding partial" ,&prefix,FALSE, i); |
636 | FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); |
637 | } |
638 | } |
639 | } |
640 | for(int i=0;i<subCount;i++) { |
641 | if(partials[i]==0) { |
642 | ustrs[i].reverse(); |
643 | builder->add(ustrs[i], kMATCH, status); |
644 | revCount++; |
645 | FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i); |
646 | } else { |
647 | FB_TRACE("Adding fwd" ,&ustrs[i], FALSE, i); |
648 | |
649 | // an optimization would be to only add the portion after the '.' |
650 | // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, |
651 | // instead of "Ph.D." since we already know the "Ph." part is a match. |
652 | // would need the trie to be able to hold 0-length strings, though. |
653 | builder2->add(ustrs[i], kMATCH, status); // forward |
654 | fwdCount++; |
655 | //ustrs[i].reverse(); |
656 | ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); |
657 | } |
658 | } |
659 | FB_TRACE("AbbrCount" ,NULL,FALSE, subCount); |
660 | |
661 | if(revCount>0) { |
662 | backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); |
663 | if(U_FAILURE(status)) { |
664 | FB_TRACE(u_errorName(status),NULL,FALSE, -1); |
665 | return NULL; |
666 | } |
667 | } |
668 | |
669 | if(fwdCount>0) { |
670 | forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); |
671 | if(U_FAILURE(status)) { |
672 | FB_TRACE(u_errorName(status),NULL,FALSE, -1); |
673 | return NULL; |
674 | } |
675 | } |
676 | |
677 | return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); |
678 | } |
679 | |
680 | |
681 | // ----------- Base class implementation |
682 | |
683 | FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { |
684 | } |
685 | |
686 | FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { |
687 | } |
688 | |
689 | FilteredBreakIteratorBuilder * |
690 | FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { |
691 | if(U_FAILURE(status)) return NULL; |
692 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); |
693 | return (U_SUCCESS(status))? ret.orphan(): NULL; |
694 | } |
695 | |
696 | FilteredBreakIteratorBuilder * |
697 | FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) { |
698 | return createEmptyInstance(status); |
699 | } |
700 | |
701 | FilteredBreakIteratorBuilder * |
702 | FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) { |
703 | if(U_FAILURE(status)) return NULL; |
704 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); |
705 | return (U_SUCCESS(status))? ret.orphan(): NULL; |
706 | } |
707 | |
708 | U_NAMESPACE_END |
709 | |
710 | #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION |
711 | |