1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/**
4 *******************************************************************************
5 * Copyright (C) 2006-2014, International Business Machines Corporation *
6 * and others. All Rights Reserved. *
7 *******************************************************************************
8 */
9
10#ifndef DICTBE_H
11#define DICTBE_H
12
13#include "unicode/utypes.h"
14#include "unicode/uniset.h"
15#include "unicode/utext.h"
16
17#include "brkeng.h"
18#include "hash.h"
19#include "mlbe.h"
20#include "uvectr32.h"
21
22U_NAMESPACE_BEGIN
23
24class DictionaryMatcher;
25class MlBreakEngine;
26class Normalizer2;
27
28/*******************************************************************
29 * DictionaryBreakEngine
30 */
31
32/**
33 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
34 * dictionary to determine language-specific breaks.</p>
35 *
36 * <p>After it is constructed a DictionaryBreakEngine may be shared between
37 * threads without synchronization.</p>
38 */
39class DictionaryBreakEngine : public LanguageBreakEngine {
40 private:
41 /**
42 * The set of characters handled by this engine
43 * @internal
44 */
45
46 UnicodeSet fSet;
47
48 public:
49
50 /**
51 * <p>Constructor </p>
52 */
53 DictionaryBreakEngine();
54
55 /**
56 * <p>Virtual destructor.</p>
57 */
58 virtual ~DictionaryBreakEngine();
59
60 /**
61 * <p>Indicate whether this engine handles a particular character for
62 * a particular kind of break.</p>
63 *
64 * @param c A character which begins a run that the engine might handle
65 * @return true if this engine handles the particular character and break
66 * type.
67 */
68 virtual UBool handles(UChar32 c) const override;
69
70 /**
71 * <p>Find any breaks within a run in the supplied text.</p>
72 *
73 * @param text A UText representing the text. The iterator is left at
74 * the end of the run of characters which the engine is capable of handling
75 * that starts from the first character in the range.
76 * @param startPos The start of the run within the supplied text.
77 * @param endPos The end of the run within the supplied text.
78 * @param foundBreaks vector of int32_t to receive the break positions
79 * @param status Information on any errors encountered.
80 * @return The number of breaks found.
81 */
82 virtual int32_t findBreaks( UText *text,
83 int32_t startPos,
84 int32_t endPos,
85 UVector32 &foundBreaks,
86 UBool isPhraseBreaking,
87 UErrorCode& status ) const override;
88
89 protected:
90
91 /**
92 * <p>Set the character set handled by this engine.</p>
93 *
94 * @param set A UnicodeSet of the set of characters handled by the engine
95 */
96 virtual void setCharacters( const UnicodeSet &set );
97
98 /**
99 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
100 *
101 * @param text A UText representing the text
102 * @param rangeStart The start of the range of dictionary characters
103 * @param rangeEnd The end of the range of dictionary characters
104 * @param foundBreaks Output of C array of int32_t break positions, or 0
105 * @param status Information on any errors encountered.
106 * @return The number of breaks found
107 */
108 virtual int32_t divideUpDictionaryRange( UText *text,
109 int32_t rangeStart,
110 int32_t rangeEnd,
111 UVector32 &foundBreaks,
112 UBool isPhraseBreaking,
113 UErrorCode& status) const = 0;
114
115};
116
117/*******************************************************************
118 * ThaiBreakEngine
119 */
120
121/**
122 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
123 * dictionary and heuristics to determine Thai-specific breaks.</p>
124 *
125 * <p>After it is constructed a ThaiBreakEngine may be shared between
126 * threads without synchronization.</p>
127 */
128class ThaiBreakEngine : public DictionaryBreakEngine {
129 private:
130 /**
131 * The set of characters handled by this engine
132 * @internal
133 */
134
135 UnicodeSet fEndWordSet;
136 UnicodeSet fBeginWordSet;
137 UnicodeSet fSuffixSet;
138 UnicodeSet fMarkSet;
139 DictionaryMatcher *fDictionary;
140
141 public:
142
143 /**
144 * <p>Default constructor.</p>
145 *
146 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
147 * engine is deleted.
148 */
149 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
150
151 /**
152 * <p>Virtual destructor.</p>
153 */
154 virtual ~ThaiBreakEngine();
155
156 protected:
157 /**
158 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
159 *
160 * @param text A UText representing the text
161 * @param rangeStart The start of the range of dictionary characters
162 * @param rangeEnd The end of the range of dictionary characters
163 * @param foundBreaks Output of C array of int32_t break positions, or 0
164 * @param status Information on any errors encountered.
165 * @return The number of breaks found
166 */
167 virtual int32_t divideUpDictionaryRange( UText *text,
168 int32_t rangeStart,
169 int32_t rangeEnd,
170 UVector32 &foundBreaks,
171 UBool isPhraseBreaking,
172 UErrorCode& status) const override;
173
174};
175
176/*******************************************************************
177 * LaoBreakEngine
178 */
179
180/**
181 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
182 * dictionary and heuristics to determine Lao-specific breaks.</p>
183 *
184 * <p>After it is constructed a LaoBreakEngine may be shared between
185 * threads without synchronization.</p>
186 */
187class LaoBreakEngine : public DictionaryBreakEngine {
188 private:
189 /**
190 * The set of characters handled by this engine
191 * @internal
192 */
193
194 UnicodeSet fEndWordSet;
195 UnicodeSet fBeginWordSet;
196 UnicodeSet fMarkSet;
197 DictionaryMatcher *fDictionary;
198
199 public:
200
201 /**
202 * <p>Default constructor.</p>
203 *
204 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
205 * engine is deleted.
206 */
207 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
208
209 /**
210 * <p>Virtual destructor.</p>
211 */
212 virtual ~LaoBreakEngine();
213
214 protected:
215 /**
216 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
217 *
218 * @param text A UText representing the text
219 * @param rangeStart The start of the range of dictionary characters
220 * @param rangeEnd The end of the range of dictionary characters
221 * @param foundBreaks Output of C array of int32_t break positions, or 0
222 * @param status Information on any errors encountered.
223 * @return The number of breaks found
224 */
225 virtual int32_t divideUpDictionaryRange( UText *text,
226 int32_t rangeStart,
227 int32_t rangeEnd,
228 UVector32 &foundBreaks,
229 UBool isPhraseBreaking,
230 UErrorCode& status) const override;
231
232};
233
234/*******************************************************************
235 * BurmeseBreakEngine
236 */
237
238/**
239 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
240 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
241 *
242 * <p>After it is constructed a BurmeseBreakEngine may be shared between
243 * threads without synchronization.</p>
244 */
245class BurmeseBreakEngine : public DictionaryBreakEngine {
246 private:
247 /**
248 * The set of characters handled by this engine
249 * @internal
250 */
251
252 UnicodeSet fEndWordSet;
253 UnicodeSet fBeginWordSet;
254 UnicodeSet fMarkSet;
255 DictionaryMatcher *fDictionary;
256
257 public:
258
259 /**
260 * <p>Default constructor.</p>
261 *
262 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
263 * engine is deleted.
264 */
265 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
266
267 /**
268 * <p>Virtual destructor.</p>
269 */
270 virtual ~BurmeseBreakEngine();
271
272 protected:
273 /**
274 * <p>Divide up a range of known dictionary characters.</p>
275 *
276 * @param text A UText representing the text
277 * @param rangeStart The start of the range of dictionary characters
278 * @param rangeEnd The end of the range of dictionary characters
279 * @param foundBreaks Output of C array of int32_t break positions, or 0
280 * @param status Information on any errors encountered.
281 * @return The number of breaks found
282 */
283 virtual int32_t divideUpDictionaryRange( UText *text,
284 int32_t rangeStart,
285 int32_t rangeEnd,
286 UVector32 &foundBreaks,
287 UBool isPhraseBreaking,
288 UErrorCode& status) const override;
289
290};
291
292/*******************************************************************
293 * KhmerBreakEngine
294 */
295
296/**
297 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
298 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
299 *
300 * <p>After it is constructed a KhmerBreakEngine may be shared between
301 * threads without synchronization.</p>
302 */
303class KhmerBreakEngine : public DictionaryBreakEngine {
304 private:
305 /**
306 * The set of characters handled by this engine
307 * @internal
308 */
309
310 UnicodeSet fEndWordSet;
311 UnicodeSet fBeginWordSet;
312 UnicodeSet fMarkSet;
313 DictionaryMatcher *fDictionary;
314
315 public:
316
317 /**
318 * <p>Default constructor.</p>
319 *
320 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
321 * engine is deleted.
322 */
323 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
324
325 /**
326 * <p>Virtual destructor.</p>
327 */
328 virtual ~KhmerBreakEngine();
329
330 protected:
331 /**
332 * <p>Divide up a range of known dictionary characters.</p>
333 *
334 * @param text A UText representing the text
335 * @param rangeStart The start of the range of dictionary characters
336 * @param rangeEnd The end of the range of dictionary characters
337 * @param foundBreaks Output of C array of int32_t break positions, or 0
338 * @param status Information on any errors encountered.
339 * @return The number of breaks found
340 */
341 virtual int32_t divideUpDictionaryRange( UText *text,
342 int32_t rangeStart,
343 int32_t rangeEnd,
344 UVector32 &foundBreaks,
345 UBool isPhraseBreaking,
346 UErrorCode& status) const override;
347
348};
349
350#if !UCONFIG_NO_NORMALIZATION
351
352/*******************************************************************
353 * CjkBreakEngine
354 */
355
/**
 * Identifies the language/script that a CjkBreakEngine will handle.
 */
enum LanguageType {
    kKorean,          // Korean
    kChineseJapanese  // Chinese and Japanese
};
361
362/**
363 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
364 * dictionary with costs associated with each word and
365 * Viterbi decoding to determine CJK-specific breaks.</p>
366 */
367class CjkBreakEngine : public DictionaryBreakEngine {
368 protected:
369 /**
370 * The set of characters handled by this engine
371 * @internal
372 */
373 UnicodeSet fHangulWordSet;
374 UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
375 UnicodeSet fClosePunctuationSet;
376
377 DictionaryMatcher *fDictionary;
378 const Normalizer2 *nfkcNorm2;
379 MlBreakEngine *fMlBreakEngine;
380 bool isCj;
381
382 private:
383 // Load Japanese extensions.
384 void loadJapaneseExtensions(UErrorCode& error);
385 // Load Japanese Hiragana.
386 void loadHiragana(UErrorCode& error);
387 // Initialize fSkipSet by loading Japanese Hiragana and extensions.
388 void initJapanesePhraseParameter(UErrorCode& error);
389
390 Hashtable fSkipSet;
391
392 public:
393
394 /**
395 * <p>Default constructor.</p>
396 *
397 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
398 * engine is deleted. The DictionaryMatcher must contain costs for each word
399 * in order for the dictionary to work properly.
400 */
401 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
402
403 /**
404 * <p>Virtual destructor.</p>
405 */
406 virtual ~CjkBreakEngine();
407
408 protected:
409 /**
410 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
411 *
412 * @param text A UText representing the text
413 * @param rangeStart The start of the range of dictionary characters
414 * @param rangeEnd The end of the range of dictionary characters
415 * @param foundBreaks Output of C array of int32_t break positions, or 0
416 * @param status Information on any errors encountered.
417 * @return The number of breaks found
418 */
419 virtual int32_t divideUpDictionaryRange( UText *text,
420 int32_t rangeStart,
421 int32_t rangeEnd,
422 UVector32 &foundBreaks,
423 UBool isPhraseBreaking,
424 UErrorCode& status) const override;
425
426};
427
428#endif
429
430U_NAMESPACE_END
431
432 /* DICTBE_H */
433#endif
434