1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ************************************************************************************ |
5 | * Copyright (C) 2006-2016, International Business Machines Corporation |
6 | * and others. All Rights Reserved. |
7 | ************************************************************************************ |
8 | */ |
9 | |
10 | #include "unicode/utypes.h" |
11 | |
12 | #if !UCONFIG_NO_BREAK_ITERATION |
13 | |
14 | #include "unicode/uchar.h" |
15 | #include "unicode/uniset.h" |
16 | #include "unicode/chariter.h" |
17 | #include "unicode/ures.h" |
18 | #include "unicode/udata.h" |
19 | #include "unicode/putil.h" |
20 | #include "unicode/ustring.h" |
21 | #include "unicode/uscript.h" |
22 | #include "unicode/ucharstrie.h" |
23 | #include "unicode/bytestrie.h" |
24 | |
25 | #include "brkeng.h" |
26 | #include "cmemory.h" |
27 | #include "dictbe.h" |
28 | #include "lstmbe.h" |
29 | #include "charstr.h" |
30 | #include "dictionarydata.h" |
31 | #include "mutex.h" |
32 | #include "uvector.h" |
33 | #include "umutex.h" |
34 | #include "uresimp.h" |
35 | #include "ubrkimpl.h" |
36 | |
37 | U_NAMESPACE_BEGIN |
38 | |
39 | /* |
40 | ****************************************************************** |
41 | */ |
42 | |
43 | LanguageBreakEngine::LanguageBreakEngine() { |
44 | } |
45 | |
46 | LanguageBreakEngine::~LanguageBreakEngine() { |
47 | } |
48 | |
49 | /* |
50 | ****************************************************************** |
51 | */ |
52 | |
53 | LanguageBreakFactory::LanguageBreakFactory() { |
54 | } |
55 | |
56 | LanguageBreakFactory::~LanguageBreakFactory() { |
57 | } |
58 | |
59 | /* |
60 | ****************************************************************** |
61 | */ |
62 | |
63 | UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { |
64 | (void)status; |
65 | } |
66 | |
67 | UnhandledEngine::~UnhandledEngine() { |
68 | delete fHandled; |
69 | fHandled = nullptr; |
70 | } |
71 | |
72 | UBool |
73 | UnhandledEngine::handles(UChar32 c) const { |
74 | return fHandled && fHandled->contains(c); |
75 | } |
76 | |
77 | int32_t |
78 | UnhandledEngine::findBreaks( UText *text, |
79 | int32_t /* startPos */, |
80 | int32_t endPos, |
81 | UVector32 &/*foundBreaks*/, |
82 | UBool /* isPhraseBreaking */, |
83 | UErrorCode &status) const { |
84 | if (U_FAILURE(status)) return 0; |
85 | UChar32 c = utext_current32(text); |
86 | while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { |
87 | utext_next32(text); // TODO: recast loop to work with post-increment operations. |
88 | c = utext_current32(text); |
89 | } |
90 | return 0; |
91 | } |
92 | |
93 | void |
94 | UnhandledEngine::handleCharacter(UChar32 c) { |
95 | if (fHandled == nullptr) { |
96 | fHandled = new UnicodeSet(); |
97 | if (fHandled == nullptr) { |
98 | return; |
99 | } |
100 | } |
101 | if (!fHandled->contains(c)) { |
102 | UErrorCode status = U_ZERO_ERROR; |
103 | // Apply the entire script of the character. |
104 | int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); |
105 | fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); |
106 | } |
107 | } |
108 | |
109 | /* |
110 | ****************************************************************** |
111 | */ |
112 | |
113 | ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { |
114 | fEngines = 0; |
115 | } |
116 | |
117 | ICULanguageBreakFactory::~ICULanguageBreakFactory() { |
118 | if (fEngines != 0) { |
119 | delete fEngines; |
120 | } |
121 | } |
122 | |
123 | U_NAMESPACE_END |
124 | U_CDECL_BEGIN |
125 | static void U_CALLCONV _deleteEngine(void *obj) { |
126 | delete (const icu::LanguageBreakEngine *) obj; |
127 | } |
128 | U_CDECL_END |
129 | U_NAMESPACE_BEGIN |
130 | |
131 | const LanguageBreakEngine * |
132 | ICULanguageBreakFactory::getEngineFor(UChar32 c) { |
133 | const LanguageBreakEngine *lbe = nullptr; |
134 | UErrorCode status = U_ZERO_ERROR; |
135 | |
136 | static UMutex gBreakEngineMutex; |
137 | Mutex m(&gBreakEngineMutex); |
138 | |
139 | if (fEngines == nullptr) { |
140 | LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status); |
141 | if (U_FAILURE(status) ) { |
142 | // Note: no way to return error code to caller. |
143 | return nullptr; |
144 | } |
145 | fEngines = engines.orphan(); |
146 | } else { |
147 | int32_t i = fEngines->size(); |
148 | while (--i >= 0) { |
149 | lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); |
150 | if (lbe != nullptr && lbe->handles(c)) { |
151 | return lbe; |
152 | } |
153 | } |
154 | } |
155 | |
156 | // We didn't find an engine. Create one. |
157 | lbe = loadEngineFor(c); |
158 | if (lbe != nullptr) { |
159 | fEngines->push((void *)lbe, status); |
160 | } |
161 | return U_SUCCESS(status) ? lbe : nullptr; |
162 | } |
163 | |
164 | const LanguageBreakEngine * |
165 | ICULanguageBreakFactory::loadEngineFor(UChar32 c) { |
166 | UErrorCode status = U_ZERO_ERROR; |
167 | UScriptCode code = uscript_getScript(c, &status); |
168 | if (U_SUCCESS(status)) { |
169 | const LanguageBreakEngine *engine = nullptr; |
170 | // Try to use LSTM first |
171 | const LSTMData *data = CreateLSTMDataForScript(code, status); |
172 | if (U_SUCCESS(status)) { |
173 | if (data != nullptr) { |
174 | engine = CreateLSTMBreakEngine(code, data, status); |
175 | if (U_SUCCESS(status) && engine != nullptr) { |
176 | return engine; |
177 | } |
178 | if (engine != nullptr) { |
179 | delete engine; |
180 | engine = nullptr; |
181 | } else { |
182 | DeleteLSTMData(data); |
183 | } |
184 | } |
185 | } |
186 | status = U_ZERO_ERROR; // fallback to dictionary based |
187 | DictionaryMatcher *m = loadDictionaryMatcherFor(code); |
188 | if (m != nullptr) { |
189 | switch(code) { |
190 | case USCRIPT_THAI: |
191 | engine = new ThaiBreakEngine(m, status); |
192 | break; |
193 | case USCRIPT_LAO: |
194 | engine = new LaoBreakEngine(m, status); |
195 | break; |
196 | case USCRIPT_MYANMAR: |
197 | engine = new BurmeseBreakEngine(m, status); |
198 | break; |
199 | case USCRIPT_KHMER: |
200 | engine = new KhmerBreakEngine(m, status); |
201 | break; |
202 | |
203 | #if !UCONFIG_NO_NORMALIZATION |
204 | // CJK not available w/o normalization |
205 | case USCRIPT_HANGUL: |
206 | engine = new CjkBreakEngine(m, kKorean, status); |
207 | break; |
208 | |
209 | // use same BreakEngine and dictionary for both Chinese and Japanese |
210 | case USCRIPT_HIRAGANA: |
211 | case USCRIPT_KATAKANA: |
212 | case USCRIPT_HAN: |
213 | engine = new CjkBreakEngine(m, kChineseJapanese, status); |
214 | break; |
215 | #if 0 |
216 | // TODO: Have to get some characters with script=common handled |
217 | // by CjkBreakEngine (e.g. U+309B). Simply subjecting |
218 | // them to CjkBreakEngine does not work. The engine has to |
219 | // special-case them. |
220 | case USCRIPT_COMMON: |
221 | { |
222 | UBlockCode block = ublock_getCode(code); |
223 | if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) |
224 | engine = new CjkBreakEngine(dict, kChineseJapanese, status); |
225 | break; |
226 | } |
227 | #endif |
228 | #endif |
229 | |
230 | default: |
231 | break; |
232 | } |
233 | if (engine == nullptr) { |
234 | delete m; |
235 | } |
236 | else if (U_FAILURE(status)) { |
237 | delete engine; |
238 | engine = nullptr; |
239 | } |
240 | return engine; |
241 | } |
242 | } |
243 | return nullptr; |
244 | } |
245 | |
246 | DictionaryMatcher * |
247 | ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { |
248 | UErrorCode status = U_ZERO_ERROR; |
249 | // open root from brkitr tree. |
250 | UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "" , &status); |
251 | b = ures_getByKeyWithFallback(b, "dictionaries" , b, &status); |
252 | int32_t dictnlength = 0; |
253 | const char16_t *dictfname = |
254 | ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); |
255 | if (U_FAILURE(status)) { |
256 | ures_close(b); |
257 | return nullptr; |
258 | } |
259 | CharString dictnbuf; |
260 | CharString ext; |
261 | const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot |
262 | if (extStart != nullptr) { |
263 | int32_t len = (int32_t)(extStart - dictfname); |
264 | ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status); |
265 | dictnlength = len; |
266 | } |
267 | dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status); |
268 | ures_close(b); |
269 | |
270 | UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); |
271 | if (U_SUCCESS(status)) { |
272 | // build trie |
273 | const uint8_t *data = (const uint8_t *)udata_getMemory(file); |
274 | const int32_t *indexes = (const int32_t *)data; |
275 | const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; |
276 | const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
277 | DictionaryMatcher *m = nullptr; |
278 | if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
279 | const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; |
280 | const char *characters = (const char *)(data + offset); |
281 | m = new BytesDictionaryMatcher(characters, transform, file); |
282 | } |
283 | else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
284 | const char16_t *characters = (const char16_t *)(data + offset); |
285 | m = new UCharsDictionaryMatcher(characters, file); |
286 | } |
287 | if (m == nullptr) { |
288 | // no matcher exists to take ownership - either we are an invalid |
289 | // type or memory allocation failed |
290 | udata_close(file); |
291 | } |
292 | return m; |
293 | } else if (dictfname != nullptr) { |
294 | // we don't have a dictionary matcher. |
295 | // returning nullptr here will cause us to fail to find a dictionary break engine, as expected |
296 | status = U_ZERO_ERROR; |
297 | return nullptr; |
298 | } |
299 | return nullptr; |
300 | } |
301 | |
302 | U_NAMESPACE_END |
303 | |
304 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
305 | |