1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/**
4 ************************************************************************************
5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6 * All Rights Reserved. *
7 ************************************************************************************
8 */
9
10#ifndef BRKENG_H
11#define BRKENG_H
12
13#include "unicode/utypes.h"
14#include "unicode/uobject.h"
15#include "unicode/utext.h"
16#include "unicode/uscript.h"
17
18U_NAMESPACE_BEGIN
19
20class UnicodeSet;
21class UStack;
22class UVector32;
23class DictionaryMatcher;
24
25/*******************************************************************
26 * LanguageBreakEngine
27 */
28
29/**
30 * <p>LanguageBreakEngines implement language-specific knowledge for
31 * finding text boundaries within a run of characters belonging to a
32 * specific set. The boundaries will be of a specific kind, e.g. word,
33 * line, etc.</p>
34 *
35 * <p>LanguageBreakEngines should normally be implemented so as to
36 * be shared between threads without locking.</p>
37 */
38class LanguageBreakEngine : public UMemory {
39 public:
40
41 /**
42 * <p>Default constructor.</p>
43 *
44 */
45 LanguageBreakEngine();
46
47 /**
48 * <p>Virtual destructor.</p>
49 */
50 virtual ~LanguageBreakEngine();
51
52 /**
53 * <p>Indicate whether this engine handles a particular character for
54 * a particular kind of break.</p>
55 *
56 * @param c A character which begins a run that the engine might handle
57 * @return true if this engine handles the particular character and break
58 * type.
59 */
60 virtual UBool handles(UChar32 c) const = 0;
61
62 /**
63 * <p>Find any breaks within a run in the supplied text.</p>
64 *
65 * @param text A UText representing the text. The
66 * iterator is left at the end of the run of characters which the engine
67 * is capable of handling.
68 * @param startPos The start of the run within the supplied text.
69 * @param endPos The end of the run within the supplied text.
70 * @param foundBreaks A Vector of int32_t to receive the breaks.
71 * @param status Information on any errors encountered.
72 * @return The number of breaks found.
73 */
74 virtual int32_t findBreaks( UText *text,
75 int32_t startPos,
76 int32_t endPos,
77 UVector32 &foundBreaks,
78 UBool isPhraseBreaking,
79 UErrorCode &status) const = 0;
80
81};
82
83/*******************************************************************
84 * LanguageBreakFactory
85 */
86
87/**
88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
89 * that can determine breaks for characters in a specific set, if
90 * such an object can be found.</p>
91 *
92 * <p>If a LanguageBreakFactory is to be shared between threads,
93 * appropriate synchronization must be used; there is none internal
94 * to the factory.</p>
95 *
96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
97 * normally be shared between threads without synchronization, unless
98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
99 *
100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
101 * it returns when it itself is deleted, unless the specific subclass of
102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should
103 * not be deleted until the LanguageBreakEngines it has returned are no
104 * longer needed.</p>
105 */
106class LanguageBreakFactory : public UMemory {
107 public:
108
109 /**
110 * <p>Default constructor.</p>
111 *
112 */
113 LanguageBreakFactory();
114
115 /**
116 * <p>Virtual destructor.</p>
117 */
118 virtual ~LanguageBreakFactory();
119
120 /**
121 * <p>Find and return a LanguageBreakEngine that can find the desired
122 * kind of break for the set of characters to which the supplied
123 * character belongs. It is up to the set of available engines to
124 * determine what the sets of characters are.</p>
125 *
126 * @param c A character that begins a run for which a LanguageBreakEngine is
127 * sought.
128 * @return A LanguageBreakEngine with the desired characteristics, or 0.
129 */
130 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
131
132};
133
134/*******************************************************************
135 * UnhandledEngine
136 */
137
138/**
139 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
140 * handles characters that no other LanguageBreakEngine is available to
 * handle. It is told the character (see handleCharacter); at its
 * discretion it may handle more than the specified character (e.g.,
 * the entire script to which that character belongs).</p>
144 *
145 * <p>UnhandledEngines may not be shared between threads without
146 * external synchronization.</p>
147 */
148
149class UnhandledEngine : public LanguageBreakEngine {
150 private:
151
152 /**
153 * The sets of characters handled.
154 * @internal
155 */
156
157 UnicodeSet *fHandled;
158
159 public:
160
161 /**
162 * <p>Default constructor.</p>
163 *
164 */
165 UnhandledEngine(UErrorCode &status);
166
167 /**
168 * <p>Virtual destructor.</p>
169 */
170 virtual ~UnhandledEngine();
171
172 /**
173 * <p>Indicate whether this engine handles a particular character for
174 * a particular kind of break.</p>
175 *
176 * @param c A character which begins a run that the engine might handle
177 * @return true if this engine handles the particular character and break
178 * type.
179 */
180 virtual UBool handles(UChar32 c) const override;
181
182 /**
183 * <p>Find any breaks within a run in the supplied text.</p>
184 *
185 * @param text A UText representing the text (TODO: UText). The
186 * iterator is left at the end of the run of characters which the engine
187 * is capable of handling.
188 * @param startPos The start of the run within the supplied text.
189 * @param endPos The end of the run within the supplied text.
190 * @param foundBreaks An allocated C array of the breaks found, if any
191 * @param status Information on any errors encountered.
192 * @return The number of breaks found.
193 */
194 virtual int32_t findBreaks( UText *text,
195 int32_t startPos,
196 int32_t endPos,
197 UVector32 &foundBreaks,
198 UBool isPhraseBreaking,
199 UErrorCode &status) const override;
200
201 /**
202 * <p>Tell the engine to handle a particular character and break type.</p>
203 *
204 * @param c A character which the engine should handle
205 */
206 virtual void handleCharacter(UChar32 c);
207
208};
209
210/*******************************************************************
211 * ICULanguageBreakFactory
212 */
213
214/**
215 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
216 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
217 * data in the ICU data file.</p>
218 */
219class ICULanguageBreakFactory : public LanguageBreakFactory {
220 private:
221
222 /**
223 * The stack of break engines created by this factory
224 * @internal
225 */
226
227 UStack *fEngines;
228
229 public:
230
231 /**
232 * <p>Standard constructor.</p>
233 *
234 */
235 ICULanguageBreakFactory(UErrorCode &status);
236
237 /**
238 * <p>Virtual destructor.</p>
239 */
240 virtual ~ICULanguageBreakFactory();
241
242 /**
243 * <p>Find and return a LanguageBreakEngine that can find the desired
244 * kind of break for the set of characters to which the supplied
245 * character belongs. It is up to the set of available engines to
246 * determine what the sets of characters are.</p>
247 *
248 * @param c A character that begins a run for which a LanguageBreakEngine is
249 * sought.
250 * @return A LanguageBreakEngine with the desired characteristics, or 0.
251 */
252 virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
253
254protected:
255 /**
256 * <p>Create a LanguageBreakEngine for the set of characters to which
257 * the supplied character belongs, for the specified break type.</p>
258 *
259 * @param c A character that begins a run for which a LanguageBreakEngine is
260 * sought.
261 * @return A LanguageBreakEngine with the desired characteristics, or 0.
262 */
263 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
264
265 /**
266 * <p>Create a DictionaryMatcher for the specified script and break type.</p>
267 * @param script An ISO 15924 script code that identifies the dictionary to be
268 * created.
269 * @return A DictionaryMatcher with the desired characteristics, or nullptr.
270 */
271 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
272};
273
274U_NAMESPACE_END
275
276 /* BRKENG_H */
277#endif
278