1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /** |
4 | ************************************************************************************ |
5 | * Copyright (C) 2006-2012, International Business Machines Corporation and others. * |
6 | * All Rights Reserved. * |
7 | ************************************************************************************ |
8 | */ |
9 | |
10 | #ifndef BRKENG_H |
11 | #define BRKENG_H |
12 | |
13 | #include "unicode/utypes.h" |
14 | #include "unicode/uobject.h" |
15 | #include "unicode/utext.h" |
16 | #include "unicode/uscript.h" |
17 | |
18 | U_NAMESPACE_BEGIN |
19 | |
20 | class UnicodeSet; |
21 | class UStack; |
22 | class UVector32; |
23 | class DictionaryMatcher; |
24 | |
25 | /******************************************************************* |
26 | * LanguageBreakEngine |
27 | */ |
28 | |
29 | /** |
30 | * <p>LanguageBreakEngines implement language-specific knowledge for |
31 | * finding text boundaries within a run of characters belonging to a |
32 | * specific set. The boundaries will be of a specific kind, e.g. word, |
33 | * line, etc.</p> |
34 | * |
35 | * <p>LanguageBreakEngines should normally be implemented so as to |
36 | * be shared between threads without locking.</p> |
37 | */ |
38 | class LanguageBreakEngine : public UMemory { |
39 | public: |
40 | |
41 | /** |
42 | * <p>Default constructor.</p> |
43 | * |
44 | */ |
45 | LanguageBreakEngine(); |
46 | |
47 | /** |
48 | * <p>Virtual destructor.</p> |
49 | */ |
50 | virtual ~LanguageBreakEngine(); |
51 | |
52 | /** |
53 | * <p>Indicate whether this engine handles a particular character for |
54 | * a particular kind of break.</p> |
55 | * |
56 | * @param c A character which begins a run that the engine might handle |
57 | * @return true if this engine handles the particular character and break |
58 | * type. |
59 | */ |
60 | virtual UBool handles(UChar32 c) const = 0; |
61 | |
62 | /** |
63 | * <p>Find any breaks within a run in the supplied text.</p> |
64 | * |
65 | * @param text A UText representing the text. The |
66 | * iterator is left at the end of the run of characters which the engine |
67 | * is capable of handling. |
68 | * @param startPos The start of the run within the supplied text. |
69 | * @param endPos The end of the run within the supplied text. |
70 | * @param foundBreaks A Vector of int32_t to receive the breaks. |
71 | * @param status Information on any errors encountered. |
72 | * @return The number of breaks found. |
73 | */ |
74 | virtual int32_t findBreaks( UText *text, |
75 | int32_t startPos, |
76 | int32_t endPos, |
77 | UVector32 &foundBreaks, |
78 | UBool isPhraseBreaking, |
79 | UErrorCode &status) const = 0; |
80 | |
81 | }; |
82 | |
83 | /******************************************************************* |
84 | * LanguageBreakFactory |
85 | */ |
86 | |
87 | /** |
88 | * <p>LanguageBreakFactorys find and return a LanguageBreakEngine |
89 | * that can determine breaks for characters in a specific set, if |
90 | * such an object can be found.</p> |
91 | * |
92 | * <p>If a LanguageBreakFactory is to be shared between threads, |
93 | * appropriate synchronization must be used; there is none internal |
94 | * to the factory.</p> |
95 | * |
96 | * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can |
97 | * normally be shared between threads without synchronization, unless |
98 | * the specific subclass of LanguageBreakFactory indicates otherwise.</p> |
99 | * |
100 | * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine |
101 | * it returns when it itself is deleted, unless the specific subclass of |
102 | * LanguageBreakFactory indicates otherwise. Naturally, the factory should |
103 | * not be deleted until the LanguageBreakEngines it has returned are no |
104 | * longer needed.</p> |
105 | */ |
106 | class LanguageBreakFactory : public UMemory { |
107 | public: |
108 | |
109 | /** |
110 | * <p>Default constructor.</p> |
111 | * |
112 | */ |
113 | LanguageBreakFactory(); |
114 | |
115 | /** |
116 | * <p>Virtual destructor.</p> |
117 | */ |
118 | virtual ~LanguageBreakFactory(); |
119 | |
120 | /** |
121 | * <p>Find and return a LanguageBreakEngine that can find the desired |
122 | * kind of break for the set of characters to which the supplied |
123 | * character belongs. It is up to the set of available engines to |
124 | * determine what the sets of characters are.</p> |
125 | * |
126 | * @param c A character that begins a run for which a LanguageBreakEngine is |
127 | * sought. |
128 | * @return A LanguageBreakEngine with the desired characteristics, or 0. |
129 | */ |
130 | virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0; |
131 | |
132 | }; |
133 | |
134 | /******************************************************************* |
135 | * UnhandledEngine |
136 | */ |
137 | |
138 | /** |
139 | * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that |
140 | * handles characters that no other LanguageBreakEngine is available to |
141 | * handle. It is told the character and the type of break; at its |
142 | * discretion it may handle more than the specified character (e.g., |
143 | * the entire script to which that character belongs.</p> |
144 | * |
145 | * <p>UnhandledEngines may not be shared between threads without |
146 | * external synchronization.</p> |
147 | */ |
148 | |
149 | class UnhandledEngine : public LanguageBreakEngine { |
150 | private: |
151 | |
152 | /** |
153 | * The sets of characters handled. |
154 | * @internal |
155 | */ |
156 | |
157 | UnicodeSet *fHandled; |
158 | |
159 | public: |
160 | |
161 | /** |
162 | * <p>Default constructor.</p> |
163 | * |
164 | */ |
165 | UnhandledEngine(UErrorCode &status); |
166 | |
167 | /** |
168 | * <p>Virtual destructor.</p> |
169 | */ |
170 | virtual ~UnhandledEngine(); |
171 | |
172 | /** |
173 | * <p>Indicate whether this engine handles a particular character for |
174 | * a particular kind of break.</p> |
175 | * |
176 | * @param c A character which begins a run that the engine might handle |
177 | * @return true if this engine handles the particular character and break |
178 | * type. |
179 | */ |
180 | virtual UBool handles(UChar32 c) const override; |
181 | |
182 | /** |
183 | * <p>Find any breaks within a run in the supplied text.</p> |
184 | * |
185 | * @param text A UText representing the text (TODO: UText). The |
186 | * iterator is left at the end of the run of characters which the engine |
187 | * is capable of handling. |
188 | * @param startPos The start of the run within the supplied text. |
189 | * @param endPos The end of the run within the supplied text. |
190 | * @param foundBreaks An allocated C array of the breaks found, if any |
191 | * @param status Information on any errors encountered. |
192 | * @return The number of breaks found. |
193 | */ |
194 | virtual int32_t findBreaks( UText *text, |
195 | int32_t startPos, |
196 | int32_t endPos, |
197 | UVector32 &foundBreaks, |
198 | UBool isPhraseBreaking, |
199 | UErrorCode &status) const override; |
200 | |
201 | /** |
202 | * <p>Tell the engine to handle a particular character and break type.</p> |
203 | * |
204 | * @param c A character which the engine should handle |
205 | */ |
206 | virtual void handleCharacter(UChar32 c); |
207 | |
208 | }; |
209 | |
210 | /******************************************************************* |
211 | * ICULanguageBreakFactory |
212 | */ |
213 | |
214 | /** |
215 | * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for |
216 | * ICU. It creates dictionary-based LanguageBreakEngines from dictionary |
217 | * data in the ICU data file.</p> |
218 | */ |
219 | class ICULanguageBreakFactory : public LanguageBreakFactory { |
220 | private: |
221 | |
222 | /** |
223 | * The stack of break engines created by this factory |
224 | * @internal |
225 | */ |
226 | |
227 | UStack *fEngines; |
228 | |
229 | public: |
230 | |
231 | /** |
232 | * <p>Standard constructor.</p> |
233 | * |
234 | */ |
235 | ICULanguageBreakFactory(UErrorCode &status); |
236 | |
237 | /** |
238 | * <p>Virtual destructor.</p> |
239 | */ |
240 | virtual ~ICULanguageBreakFactory(); |
241 | |
242 | /** |
243 | * <p>Find and return a LanguageBreakEngine that can find the desired |
244 | * kind of break for the set of characters to which the supplied |
245 | * character belongs. It is up to the set of available engines to |
246 | * determine what the sets of characters are.</p> |
247 | * |
248 | * @param c A character that begins a run for which a LanguageBreakEngine is |
249 | * sought. |
250 | * @return A LanguageBreakEngine with the desired characteristics, or 0. |
251 | */ |
252 | virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override; |
253 | |
254 | protected: |
255 | /** |
256 | * <p>Create a LanguageBreakEngine for the set of characters to which |
257 | * the supplied character belongs, for the specified break type.</p> |
258 | * |
259 | * @param c A character that begins a run for which a LanguageBreakEngine is |
260 | * sought. |
261 | * @return A LanguageBreakEngine with the desired characteristics, or 0. |
262 | */ |
263 | virtual const LanguageBreakEngine *loadEngineFor(UChar32 c); |
264 | |
265 | /** |
266 | * <p>Create a DictionaryMatcher for the specified script and break type.</p> |
267 | * @param script An ISO 15924 script code that identifies the dictionary to be |
268 | * created. |
269 | * @return A DictionaryMatcher with the desired characteristics, or nullptr. |
270 | */ |
271 | virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script); |
272 | }; |
273 | |
274 | U_NAMESPACE_END |
275 | |
276 | /* BRKENG_H */ |
277 | #endif |
278 | |