1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 2008-2015, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * Date Name Description |
9 | * 05/11/2008 Andy Heninger Port from Java |
10 | ********************************************************************** |
11 | */ |
12 | |
13 | #include <utility> |
14 | |
15 | #include "unicode/utypes.h" |
16 | |
17 | #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION |
18 | |
19 | #include "unicode/brkiter.h" |
20 | #include "unicode/localpointer.h" |
21 | #include "unicode/uchar.h" |
22 | #include "unicode/unifilt.h" |
23 | #include "unicode/uniset.h" |
24 | |
25 | #include "brktrans.h" |
26 | #include "cmemory.h" |
27 | #include "mutex.h" |
28 | #include "uprops.h" |
29 | #include "uinvchar.h" |
30 | #include "util.h" |
31 | #include "uvectr32.h" |
32 | |
33 | U_NAMESPACE_BEGIN |
34 | |
35 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) |
36 | |
37 | static const UChar SPACE = 32; // ' ' |
38 | |
39 | |
40 | /** |
41 | * Constructs a transliterator with the default delimiters '{' and |
42 | * '}'. |
43 | */ |
44 | BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : |
45 | Transliterator(UNICODE_STRING("Any-BreakInternal" , 17), adoptedFilter), |
46 | cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { |
47 | } |
48 | |
49 | |
50 | /** |
51 | * Destructor. |
52 | */ |
53 | BreakTransliterator::~BreakTransliterator() { |
54 | } |
55 | |
56 | /** |
57 | * Copy constructor. |
58 | */ |
59 | BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : |
60 | Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { |
61 | } |
62 | |
63 | |
64 | /** |
65 | * Transliterator API. |
66 | */ |
67 | BreakTransliterator* BreakTransliterator::clone() const { |
68 | return new BreakTransliterator(*this); |
69 | } |
70 | |
71 | /** |
72 | * Implements {@link Transliterator#handleTransliterate}. |
73 | */ |
74 | void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, |
75 | UBool isIncremental ) const { |
76 | |
77 | UErrorCode status = U_ZERO_ERROR; |
78 | LocalPointer<BreakIterator> bi; |
79 | LocalPointer<UVector32> boundaries; |
80 | |
81 | { |
82 | Mutex m; |
83 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); |
84 | boundaries = std::move(nonConstThis->cachedBoundaries); |
85 | bi = std::move(nonConstThis->cachedBI); |
86 | } |
87 | if (bi.isNull()) { |
88 | bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); |
89 | } |
90 | if (boundaries.isNull()) { |
91 | boundaries.adoptInstead(new UVector32(status)); |
92 | } |
93 | |
94 | if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { |
95 | return; |
96 | } |
97 | |
98 | boundaries->removeAllElements(); |
99 | UnicodeString sText = replaceableAsString(text); |
100 | bi->setText(sText); |
101 | bi->preceding(offsets.start); |
102 | |
103 | // To make things much easier, we will stack the boundaries, and then insert at the end. |
104 | // generally, we won't need too many, since we will be filtered. |
105 | |
106 | int32_t boundary; |
107 | for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { |
108 | if (boundary == 0) continue; |
109 | // HACK: Check to see that preceeding item was a letter |
110 | |
111 | UChar32 cp = sText.char32At(boundary-1); |
112 | int type = u_charType(cp); |
113 | //System.out.println(Integer.toString(cp,16) + " (before): " + type); |
114 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
115 | |
116 | cp = sText.char32At(boundary); |
117 | type = u_charType(cp); |
118 | //System.out.println(Integer.toString(cp,16) + " (after): " + type); |
119 | if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
120 | |
121 | boundaries->addElement(boundary, status); |
122 | // printf("Boundary at %d\n", boundary); |
123 | } |
124 | |
125 | int delta = 0; |
126 | int lastBoundary = 0; |
127 | |
128 | if (boundaries->size() != 0) { // if we found something, adjust |
129 | delta = boundaries->size() * fInsertion.length(); |
130 | lastBoundary = boundaries->lastElementi(); |
131 | |
132 | // we do this from the end backwards, so that we don't have to keep updating. |
133 | |
134 | while (boundaries->size() > 0) { |
135 | boundary = boundaries->popi(); |
136 | text.handleReplaceBetween(boundary, boundary, fInsertion); |
137 | } |
138 | } |
139 | |
140 | // Now fix up the return values |
141 | offsets.contextLimit += delta; |
142 | offsets.limit += delta; |
143 | offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; |
144 | |
145 | // Return break iterator & boundaries vector to the cache. |
146 | { |
147 | Mutex m; |
148 | BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); |
149 | if (nonConstThis->cachedBI.isNull()) { |
150 | nonConstThis->cachedBI = std::move(bi); |
151 | } |
152 | if (nonConstThis->cachedBoundaries.isNull()) { |
153 | nonConstThis->cachedBoundaries = std::move(boundaries); |
154 | } |
155 | } |
156 | |
157 | // TODO: do something with U_FAILURE(status); |
158 | // (need to look at transliterators overall, not just here.) |
159 | } |
160 | |
161 | // |
162 | // getInsertion() |
163 | // |
164 | const UnicodeString &BreakTransliterator::getInsertion() const { |
165 | return fInsertion; |
166 | } |
167 | |
168 | // |
169 | // setInsertion() |
170 | // |
171 | void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
172 | this->fInsertion = insertion; |
173 | } |
174 | |
175 | // |
176 | // replaceableAsString Hack to let break iterators work |
177 | // on the replaceable text from transliterators. |
178 | // In practice, the only real Replaceable type that we |
179 | // will be seeing is UnicodeString, so this function |
180 | // will normally be efficient. |
181 | // |
182 | UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { |
183 | UnicodeString s; |
184 | UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); |
185 | if (rs != NULL) { |
186 | s = *rs; |
187 | } else { |
188 | r.extractBetween(0, r.length(), s); |
189 | } |
190 | return s; |
191 | } |
192 | |
193 | U_NAMESPACE_END |
194 | |
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
196 | |