1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /** |
4 | ******************************************************************************* |
5 | * Copyright (C) 2006-2014, International Business Machines Corporation * |
6 | * and others. All Rights Reserved. * |
7 | ******************************************************************************* |
8 | */ |
9 | |
10 | #ifndef DICTBE_H |
11 | #define DICTBE_H |
12 | |
13 | #include "unicode/utypes.h" |
14 | #include "unicode/uniset.h" |
15 | #include "unicode/utext.h" |
16 | |
17 | #include "brkeng.h" |
18 | #include "hash.h" |
19 | #include "mlbe.h" |
20 | #include "uvectr32.h" |
21 | |
22 | U_NAMESPACE_BEGIN |
23 | |
// Forward declarations. Within this header these types are referenced only
// through pointers (engine data members and constructor parameters).
class DictionaryMatcher;
class MlBreakEngine;
class Normalizer2;
27 | |
28 | /******************************************************************* |
29 | * DictionaryBreakEngine |
30 | */ |
31 | |
32 | /** |
33 | * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a |
34 | * dictionary to determine language-specific breaks.</p> |
35 | * |
36 | * <p>After it is constructed a DictionaryBreakEngine may be shared between |
37 | * threads without synchronization.</p> |
38 | */ |
39 | class DictionaryBreakEngine : public LanguageBreakEngine { |
40 | private: |
41 | /** |
42 | * The set of characters handled by this engine |
43 | * @internal |
44 | */ |
45 | |
46 | UnicodeSet fSet; |
47 | |
48 | public: |
49 | |
50 | /** |
51 | * <p>Constructor </p> |
52 | */ |
53 | DictionaryBreakEngine(); |
54 | |
55 | /** |
56 | * <p>Virtual destructor.</p> |
57 | */ |
58 | virtual ~DictionaryBreakEngine(); |
59 | |
60 | /** |
61 | * <p>Indicate whether this engine handles a particular character for |
62 | * a particular kind of break.</p> |
63 | * |
64 | * @param c A character which begins a run that the engine might handle |
65 | * @return true if this engine handles the particular character and break |
66 | * type. |
67 | */ |
68 | virtual UBool handles(UChar32 c) const override; |
69 | |
70 | /** |
71 | * <p>Find any breaks within a run in the supplied text.</p> |
72 | * |
73 | * @param text A UText representing the text. The iterator is left at |
74 | * the end of the run of characters which the engine is capable of handling |
75 | * that starts from the first character in the range. |
76 | * @param startPos The start of the run within the supplied text. |
77 | * @param endPos The end of the run within the supplied text. |
78 | * @param foundBreaks vector of int32_t to receive the break positions |
79 | * @param status Information on any errors encountered. |
80 | * @return The number of breaks found. |
81 | */ |
82 | virtual int32_t findBreaks( UText *text, |
83 | int32_t startPos, |
84 | int32_t endPos, |
85 | UVector32 &foundBreaks, |
86 | UBool isPhraseBreaking, |
87 | UErrorCode& status ) const override; |
88 | |
89 | protected: |
90 | |
91 | /** |
92 | * <p>Set the character set handled by this engine.</p> |
93 | * |
94 | * @param set A UnicodeSet of the set of characters handled by the engine |
95 | */ |
96 | virtual void setCharacters( const UnicodeSet &set ); |
97 | |
98 | /** |
99 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
100 | * |
101 | * @param text A UText representing the text |
102 | * @param rangeStart The start of the range of dictionary characters |
103 | * @param rangeEnd The end of the range of dictionary characters |
104 | * @param foundBreaks Output of C array of int32_t break positions, or 0 |
105 | * @param status Information on any errors encountered. |
106 | * @return The number of breaks found |
107 | */ |
108 | virtual int32_t divideUpDictionaryRange( UText *text, |
109 | int32_t rangeStart, |
110 | int32_t rangeEnd, |
111 | UVector32 &foundBreaks, |
112 | UBool isPhraseBreaking, |
113 | UErrorCode& status) const = 0; |
114 | |
115 | }; |
116 | |
117 | /******************************************************************* |
118 | * ThaiBreakEngine |
119 | */ |
120 | |
121 | /** |
122 | * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a |
123 | * dictionary and heuristics to determine Thai-specific breaks.</p> |
124 | * |
125 | * <p>After it is constructed a ThaiBreakEngine may be shared between |
126 | * threads without synchronization.</p> |
127 | */ |
128 | class ThaiBreakEngine : public DictionaryBreakEngine { |
129 | private: |
130 | /** |
131 | * The set of characters handled by this engine |
132 | * @internal |
133 | */ |
134 | |
135 | UnicodeSet fEndWordSet; |
136 | UnicodeSet fBeginWordSet; |
137 | UnicodeSet fSuffixSet; |
138 | UnicodeSet fMarkSet; |
139 | DictionaryMatcher *fDictionary; |
140 | |
141 | public: |
142 | |
143 | /** |
144 | * <p>Default constructor.</p> |
145 | * |
146 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
147 | * engine is deleted. |
148 | */ |
149 | ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
150 | |
151 | /** |
152 | * <p>Virtual destructor.</p> |
153 | */ |
154 | virtual ~ThaiBreakEngine(); |
155 | |
156 | protected: |
157 | /** |
158 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
159 | * |
160 | * @param text A UText representing the text |
161 | * @param rangeStart The start of the range of dictionary characters |
162 | * @param rangeEnd The end of the range of dictionary characters |
163 | * @param foundBreaks Output of C array of int32_t break positions, or 0 |
164 | * @param status Information on any errors encountered. |
165 | * @return The number of breaks found |
166 | */ |
167 | virtual int32_t divideUpDictionaryRange( UText *text, |
168 | int32_t rangeStart, |
169 | int32_t rangeEnd, |
170 | UVector32 &foundBreaks, |
171 | UBool isPhraseBreaking, |
172 | UErrorCode& status) const override; |
173 | |
174 | }; |
175 | |
176 | /******************************************************************* |
177 | * LaoBreakEngine |
178 | */ |
179 | |
180 | /** |
181 | * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a |
182 | * dictionary and heuristics to determine Lao-specific breaks.</p> |
183 | * |
184 | * <p>After it is constructed a LaoBreakEngine may be shared between |
185 | * threads without synchronization.</p> |
186 | */ |
187 | class LaoBreakEngine : public DictionaryBreakEngine { |
188 | private: |
189 | /** |
190 | * The set of characters handled by this engine |
191 | * @internal |
192 | */ |
193 | |
194 | UnicodeSet fEndWordSet; |
195 | UnicodeSet fBeginWordSet; |
196 | UnicodeSet fMarkSet; |
197 | DictionaryMatcher *fDictionary; |
198 | |
199 | public: |
200 | |
201 | /** |
202 | * <p>Default constructor.</p> |
203 | * |
204 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
205 | * engine is deleted. |
206 | */ |
207 | LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
208 | |
209 | /** |
210 | * <p>Virtual destructor.</p> |
211 | */ |
212 | virtual ~LaoBreakEngine(); |
213 | |
214 | protected: |
215 | /** |
216 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
217 | * |
218 | * @param text A UText representing the text |
219 | * @param rangeStart The start of the range of dictionary characters |
220 | * @param rangeEnd The end of the range of dictionary characters |
221 | * @param foundBreaks Output of C array of int32_t break positions, or 0 |
222 | * @param status Information on any errors encountered. |
223 | * @return The number of breaks found |
224 | */ |
225 | virtual int32_t divideUpDictionaryRange( UText *text, |
226 | int32_t rangeStart, |
227 | int32_t rangeEnd, |
228 | UVector32 &foundBreaks, |
229 | UBool isPhraseBreaking, |
230 | UErrorCode& status) const override; |
231 | |
232 | }; |
233 | |
234 | /******************************************************************* |
235 | * BurmeseBreakEngine |
236 | */ |
237 | |
238 | /** |
239 | * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a |
240 | * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> |
241 | * |
242 | * <p>After it is constructed a BurmeseBreakEngine may be shared between |
243 | * threads without synchronization.</p> |
244 | */ |
245 | class BurmeseBreakEngine : public DictionaryBreakEngine { |
246 | private: |
247 | /** |
248 | * The set of characters handled by this engine |
249 | * @internal |
250 | */ |
251 | |
252 | UnicodeSet fEndWordSet; |
253 | UnicodeSet fBeginWordSet; |
254 | UnicodeSet fMarkSet; |
255 | DictionaryMatcher *fDictionary; |
256 | |
257 | public: |
258 | |
259 | /** |
260 | * <p>Default constructor.</p> |
261 | * |
262 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
263 | * engine is deleted. |
264 | */ |
265 | BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
266 | |
267 | /** |
268 | * <p>Virtual destructor.</p> |
269 | */ |
270 | virtual ~BurmeseBreakEngine(); |
271 | |
272 | protected: |
273 | /** |
274 | * <p>Divide up a range of known dictionary characters.</p> |
275 | * |
276 | * @param text A UText representing the text |
277 | * @param rangeStart The start of the range of dictionary characters |
278 | * @param rangeEnd The end of the range of dictionary characters |
279 | * @param foundBreaks Output of C array of int32_t break positions, or 0 |
280 | * @param status Information on any errors encountered. |
281 | * @return The number of breaks found |
282 | */ |
283 | virtual int32_t divideUpDictionaryRange( UText *text, |
284 | int32_t rangeStart, |
285 | int32_t rangeEnd, |
286 | UVector32 &foundBreaks, |
287 | UBool isPhraseBreaking, |
288 | UErrorCode& status) const override; |
289 | |
290 | }; |
291 | |
292 | /******************************************************************* |
293 | * KhmerBreakEngine |
294 | */ |
295 | |
296 | /** |
297 | * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a |
298 | * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> |
299 | * |
300 | * <p>After it is constructed a KhmerBreakEngine may be shared between |
301 | * threads without synchronization.</p> |
302 | */ |
303 | class KhmerBreakEngine : public DictionaryBreakEngine { |
304 | private: |
305 | /** |
306 | * The set of characters handled by this engine |
307 | * @internal |
308 | */ |
309 | |
310 | UnicodeSet fEndWordSet; |
311 | UnicodeSet fBeginWordSet; |
312 | UnicodeSet fMarkSet; |
313 | DictionaryMatcher *fDictionary; |
314 | |
315 | public: |
316 | |
317 | /** |
318 | * <p>Default constructor.</p> |
319 | * |
320 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
321 | * engine is deleted. |
322 | */ |
323 | KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
324 | |
325 | /** |
326 | * <p>Virtual destructor.</p> |
327 | */ |
328 | virtual ~KhmerBreakEngine(); |
329 | |
330 | protected: |
331 | /** |
332 | * <p>Divide up a range of known dictionary characters.</p> |
333 | * |
334 | * @param text A UText representing the text |
335 | * @param rangeStart The start of the range of dictionary characters |
336 | * @param rangeEnd The end of the range of dictionary characters |
337 | * @param foundBreaks Output of C array of int32_t break positions, or 0 |
338 | * @param status Information on any errors encountered. |
339 | * @return The number of breaks found |
340 | */ |
341 | virtual int32_t divideUpDictionaryRange( UText *text, |
342 | int32_t rangeStart, |
343 | int32_t rangeEnd, |
344 | UVector32 &foundBreaks, |
345 | UBool isPhraseBreaking, |
346 | UErrorCode& status) const override; |
347 | |
348 | }; |
349 | |
350 | #if !UCONFIG_NO_NORMALIZATION |
351 | |
352 | /******************************************************************* |
353 | * CjkBreakEngine |
354 | */ |
355 | |
/**
 * Identifies the language/script group that a CjkBreakEngine will handle;
 * passed to the CjkBreakEngine constructor.
 */
enum LanguageType {
    kKorean = 0,
    kChineseJapanese = 1
};
361 | |
362 | /** |
363 | * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a |
364 | * dictionary with costs associated with each word and |
365 | * Viterbi decoding to determine CJK-specific breaks.</p> |
366 | */ |
367 | class CjkBreakEngine : public DictionaryBreakEngine { |
368 | protected: |
369 | /** |
370 | * The set of characters handled by this engine |
371 | * @internal |
372 | */ |
373 | UnicodeSet fHangulWordSet; |
374 | UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; |
375 | UnicodeSet fClosePunctuationSet; |
376 | |
377 | DictionaryMatcher *fDictionary; |
378 | const Normalizer2 *nfkcNorm2; |
379 | MlBreakEngine *fMlBreakEngine; |
380 | bool isCj; |
381 | |
382 | private: |
383 | // Load Japanese extensions. |
384 | void loadJapaneseExtensions(UErrorCode& error); |
385 | // Load Japanese Hiragana. |
386 | void loadHiragana(UErrorCode& error); |
387 | // Initialize fSkipSet by loading Japanese Hiragana and extensions. |
388 | void initJapanesePhraseParameter(UErrorCode& error); |
389 | |
390 | Hashtable fSkipSet; |
391 | |
392 | public: |
393 | |
394 | /** |
395 | * <p>Default constructor.</p> |
396 | * |
397 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
398 | * engine is deleted. The DictionaryMatcher must contain costs for each word |
399 | * in order for the dictionary to work properly. |
400 | */ |
401 | CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); |
402 | |
403 | /** |
404 | * <p>Virtual destructor.</p> |
405 | */ |
406 | virtual ~CjkBreakEngine(); |
407 | |
408 | protected: |
409 | /** |
410 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
411 | * |
412 | * @param text A UText representing the text |
413 | * @param rangeStart The start of the range of dictionary characters |
414 | * @param rangeEnd The end of the range of dictionary characters |
415 | * @param foundBreaks Output of C array of int32_t break positions, or 0 |
416 | * @param status Information on any errors encountered. |
417 | * @return The number of breaks found |
418 | */ |
419 | virtual int32_t divideUpDictionaryRange( UText *text, |
420 | int32_t rangeStart, |
421 | int32_t rangeEnd, |
422 | UVector32 &foundBreaks, |
423 | UBool isPhraseBreaking, |
424 | UErrorCode& status) const override; |
425 | |
426 | }; |
427 | |
428 | #endif |
429 | |
430 | U_NAMESPACE_END |
431 | |
432 | /* DICTBE_H */ |
433 | #endif |
434 | |