1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /** |
4 | ************************************************************************************ |
5 | * Copyright (C) 2006-2012, International Business Machines Corporation and others. * |
6 | * All Rights Reserved. * |
7 | ************************************************************************************ |
8 | */ |
9 | |
10 | #ifndef BRKENG_H |
11 | #define BRKENG_H |
12 | |
13 | #include "unicode/utypes.h" |
14 | #include "unicode/uobject.h" |
15 | #include "unicode/utext.h" |
16 | #include "unicode/uscript.h" |
17 | |
18 | U_NAMESPACE_BEGIN |
19 | |
20 | class UnicodeSet; |
21 | class UStack; |
22 | class UVector32; |
23 | class DictionaryMatcher; |
24 | |
25 | /******************************************************************* |
26 | * LanguageBreakEngine |
27 | */ |
28 | |
29 | /** |
30 | * <p>LanguageBreakEngines implement language-specific knowledge for |
31 | * finding text boundaries within a run of characters belonging to a |
32 | * specific set. The boundaries will be of a specific kind, e.g. word, |
33 | * line, etc.</p> |
34 | * |
35 | * <p>LanguageBreakEngines should normally be implemented so as to |
36 | * be shared between threads without locking.</p> |
37 | */ |
38 | class LanguageBreakEngine : public UMemory { |
39 | public: |
40 | |
41 | /** |
42 | * <p>Default constructor.</p> |
43 | * |
44 | */ |
45 | LanguageBreakEngine(); |
46 | |
47 | /** |
48 | * <p>Virtual destructor.</p> |
49 | */ |
50 | virtual ~LanguageBreakEngine(); |
51 | |
52 | /** |
53 | * <p>Indicate whether this engine handles a particular character for |
54 | * a particular kind of break.</p> |
55 | * |
56 | * @param c A character which begins a run that the engine might handle |
57 | * @return TRUE if this engine handles the particular character and break |
58 | * type. |
59 | */ |
60 | virtual UBool handles(UChar32 c) const = 0; |
61 | |
62 | /** |
63 | * <p>Find any breaks within a run in the supplied text.</p> |
64 | * |
65 | * @param text A UText representing the text. The |
66 | * iterator is left at the end of the run of characters which the engine |
67 | * is capable of handling. |
68 | * @param startPos The start of the run within the supplied text. |
69 | * @param endPos The end of the run within the supplied text. |
70 | * @param foundBreaks A Vector of int32_t to receive the breaks. |
71 | * @return The number of breaks found. |
72 | */ |
73 | virtual int32_t findBreaks( UText *text, |
74 | int32_t startPos, |
75 | int32_t endPos, |
76 | UVector32 &foundBreaks ) const = 0; |
77 | |
78 | }; |
79 | |
80 | /******************************************************************* |
81 | * LanguageBreakFactory |
82 | */ |
83 | |
84 | /** |
85 | * <p>LanguageBreakFactorys find and return a LanguageBreakEngine |
86 | * that can determine breaks for characters in a specific set, if |
87 | * such an object can be found.</p> |
88 | * |
89 | * <p>If a LanguageBreakFactory is to be shared between threads, |
90 | * appropriate synchronization must be used; there is none internal |
91 | * to the factory.</p> |
92 | * |
93 | * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can |
94 | * normally be shared between threads without synchronization, unless |
95 | * the specific subclass of LanguageBreakFactory indicates otherwise.</p> |
96 | * |
97 | * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine |
98 | * it returns when it itself is deleted, unless the specific subclass of |
99 | * LanguageBreakFactory indicates otherwise. Naturally, the factory should |
100 | * not be deleted until the LanguageBreakEngines it has returned are no |
101 | * longer needed.</p> |
102 | */ |
103 | class LanguageBreakFactory : public UMemory { |
104 | public: |
105 | |
106 | /** |
107 | * <p>Default constructor.</p> |
108 | * |
109 | */ |
110 | LanguageBreakFactory(); |
111 | |
112 | /** |
113 | * <p>Virtual destructor.</p> |
114 | */ |
115 | virtual ~LanguageBreakFactory(); |
116 | |
117 | /** |
118 | * <p>Find and return a LanguageBreakEngine that can find the desired |
119 | * kind of break for the set of characters to which the supplied |
120 | * character belongs. It is up to the set of available engines to |
121 | * determine what the sets of characters are.</p> |
122 | * |
123 | * @param c A character that begins a run for which a LanguageBreakEngine is |
124 | * sought. |
125 | * @return A LanguageBreakEngine with the desired characteristics, or 0. |
126 | */ |
127 | virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0; |
128 | |
129 | }; |
130 | |
131 | /******************************************************************* |
132 | * UnhandledEngine |
133 | */ |
134 | |
135 | /** |
136 | * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that |
137 | * handles characters that no other LanguageBreakEngine is available to |
138 | * handle. It is told the character and the type of break; at its |
139 | * discretion it may handle more than the specified character (e.g., |
140 | * the entire script to which that character belongs.</p> |
141 | * |
142 | * <p>UnhandledEngines may not be shared between threads without |
143 | * external synchronization.</p> |
144 | */ |
145 | |
146 | class UnhandledEngine : public LanguageBreakEngine { |
147 | private: |
148 | |
149 | /** |
150 | * The sets of characters handled. |
151 | * @internal |
152 | */ |
153 | |
154 | UnicodeSet *fHandled; |
155 | |
156 | public: |
157 | |
158 | /** |
159 | * <p>Default constructor.</p> |
160 | * |
161 | */ |
162 | UnhandledEngine(UErrorCode &status); |
163 | |
164 | /** |
165 | * <p>Virtual destructor.</p> |
166 | */ |
167 | virtual ~UnhandledEngine(); |
168 | |
169 | /** |
170 | * <p>Indicate whether this engine handles a particular character for |
171 | * a particular kind of break.</p> |
172 | * |
173 | * @param c A character which begins a run that the engine might handle |
174 | * @return TRUE if this engine handles the particular character and break |
175 | * type. |
176 | */ |
177 | virtual UBool handles(UChar32 c) const; |
178 | |
179 | /** |
180 | * <p>Find any breaks within a run in the supplied text.</p> |
181 | * |
182 | * @param text A UText representing the text (TODO: UText). The |
183 | * iterator is left at the end of the run of characters which the engine |
184 | * is capable of handling. |
185 | * @param startPos The start of the run within the supplied text. |
186 | * @param endPos The end of the run within the supplied text. |
187 | * @param foundBreaks An allocated C array of the breaks found, if any |
188 | * @return The number of breaks found. |
189 | */ |
190 | virtual int32_t findBreaks( UText *text, |
191 | int32_t startPos, |
192 | int32_t endPos, |
193 | UVector32 &foundBreaks ) const; |
194 | |
195 | /** |
196 | * <p>Tell the engine to handle a particular character and break type.</p> |
197 | * |
198 | * @param c A character which the engine should handle |
199 | */ |
200 | virtual void handleCharacter(UChar32 c); |
201 | |
202 | }; |
203 | |
204 | /******************************************************************* |
205 | * ICULanguageBreakFactory |
206 | */ |
207 | |
208 | /** |
209 | * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for |
210 | * ICU. It creates dictionary-based LanguageBreakEngines from dictionary |
211 | * data in the ICU data file.</p> |
212 | */ |
213 | class ICULanguageBreakFactory : public LanguageBreakFactory { |
214 | private: |
215 | |
216 | /** |
217 | * The stack of break engines created by this factory |
218 | * @internal |
219 | */ |
220 | |
221 | UStack *fEngines; |
222 | |
223 | public: |
224 | |
225 | /** |
226 | * <p>Standard constructor.</p> |
227 | * |
228 | */ |
229 | ICULanguageBreakFactory(UErrorCode &status); |
230 | |
231 | /** |
232 | * <p>Virtual destructor.</p> |
233 | */ |
234 | virtual ~ICULanguageBreakFactory(); |
235 | |
236 | /** |
237 | * <p>Find and return a LanguageBreakEngine that can find the desired |
238 | * kind of break for the set of characters to which the supplied |
239 | * character belongs. It is up to the set of available engines to |
240 | * determine what the sets of characters are.</p> |
241 | * |
242 | * @param c A character that begins a run for which a LanguageBreakEngine is |
243 | * sought. |
244 | * @return A LanguageBreakEngine with the desired characteristics, or 0. |
245 | */ |
246 | virtual const LanguageBreakEngine *getEngineFor(UChar32 c); |
247 | |
248 | protected: |
249 | /** |
250 | * <p>Create a LanguageBreakEngine for the set of characters to which |
251 | * the supplied character belongs, for the specified break type.</p> |
252 | * |
253 | * @param c A character that begins a run for which a LanguageBreakEngine is |
254 | * sought. |
255 | * @return A LanguageBreakEngine with the desired characteristics, or 0. |
256 | */ |
257 | virtual const LanguageBreakEngine *loadEngineFor(UChar32 c); |
258 | |
259 | /** |
260 | * <p>Create a DictionaryMatcher for the specified script and break type.</p> |
261 | * @param script An ISO 15924 script code that identifies the dictionary to be |
262 | * created. |
263 | * @return A DictionaryMatcher with the desired characteristics, or NULL. |
264 | */ |
265 | virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script); |
266 | }; |
267 | |
268 | U_NAMESPACE_END |
269 | |
270 | /* BRKENG_H */ |
271 | #endif |
272 | |