1// © 2017 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4// casemap.h
5// created: 2017jan12 Markus W. Scherer
6
7#ifndef __CASEMAP_H__
8#define __CASEMAP_H__
9
10#include "unicode/utypes.h"
11
12#if U_SHOW_CPLUSPLUS_API
13
14#include "unicode/stringpiece.h"
15#include "unicode/uobject.h"
16
17/**
18 * \file
19 * \brief C++ API: Low-level C++ case mapping functions.
20 */
21
22U_NAMESPACE_BEGIN
23
24class BreakIterator;
25class ByteSink;
26class Edits;
27
28/**
29 * Low-level C++ case mapping functions.
30 *
31 * @stable ICU 59
32 */
33class U_COMMON_API CaseMap final : public UMemory {
34public:
35 /**
36 * Lowercases a UTF-16 string and optionally records edits.
37 * Casing is locale-dependent and context-sensitive.
38 * The result may be longer or shorter than the original.
39 * The source string and the destination buffer must not overlap.
40 *
41 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
42 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
43 * @param src The original string.
44 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
45 * @param dest A buffer for the result string. The result will be NUL-terminated if
46 * the buffer is large enough.
47 * The contents is undefined in case of failure.
48 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
49 * dest may be nullptr and the function will only return the length of the result
50 * without writing any of the result string.
51 * @param edits Records edits for index mapping, working with styled text,
52 * and getting only changes (if any).
53 * The Edits contents is undefined if any error occurs.
54 * This function calls edits->reset() first unless
55 * options includes U_EDITS_NO_RESET. edits can be nullptr.
56 * @param errorCode Reference to an in/out error code value
57 * which must not indicate a failure before the function call.
58 * @return The length of the result string, if successful.
59 * When the result would be longer than destCapacity,
60 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
61 *
62 * @see u_strToLower
63 * @stable ICU 59
64 */
65 static int32_t toLower(
66 const char *locale, uint32_t options,
67 const char16_t *src, int32_t srcLength,
68 char16_t *dest, int32_t destCapacity, Edits *edits,
69 UErrorCode &errorCode);
70
71 /**
72 * Uppercases a UTF-16 string and optionally records edits.
73 * Casing is locale-dependent and context-sensitive.
74 * The result may be longer or shorter than the original.
75 * The source string and the destination buffer must not overlap.
76 *
77 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
78 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
79 * @param src The original string.
80 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
81 * @param dest A buffer for the result string. The result will be NUL-terminated if
82 * the buffer is large enough.
83 * The contents is undefined in case of failure.
84 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
85 * dest may be nullptr and the function will only return the length of the result
86 * without writing any of the result string.
87 * @param edits Records edits for index mapping, working with styled text,
88 * and getting only changes (if any).
89 * The Edits contents is undefined if any error occurs.
90 * This function calls edits->reset() first unless
91 * options includes U_EDITS_NO_RESET. edits can be nullptr.
92 * @param errorCode Reference to an in/out error code value
93 * which must not indicate a failure before the function call.
94 * @return The length of the result string, if successful.
95 * When the result would be longer than destCapacity,
96 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
97 *
98 * @see u_strToUpper
99 * @stable ICU 59
100 */
101 static int32_t toUpper(
102 const char *locale, uint32_t options,
103 const char16_t *src, int32_t srcLength,
104 char16_t *dest, int32_t destCapacity, Edits *edits,
105 UErrorCode &errorCode);
106
107#if !UCONFIG_NO_BREAK_ITERATION
108
109 /**
110 * Titlecases a UTF-16 string and optionally records edits.
111 * Casing is locale-dependent and context-sensitive.
112 * The result may be longer or shorter than the original.
113 * The source string and the destination buffer must not overlap.
114 *
115 * Titlecasing uses a break iterator to find the first characters of words
116 * that are to be titlecased. It titlecases those characters and lowercases
117 * all others. (This can be modified with options bits.)
118 *
119 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
120 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
121 * U_TITLECASE_NO_LOWERCASE,
122 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
123 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
124 * @param iter A break iterator to find the first characters of words that are to be titlecased.
125 * It is set to the source string (setText())
126 * and used one or more times for iteration (first() and next()).
127 * If nullptr, then a word break iterator for the locale is used
128 * (or something equivalent).
129 * @param src The original string.
130 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
131 * @param dest A buffer for the result string. The result will be NUL-terminated if
132 * the buffer is large enough.
133 * The contents is undefined in case of failure.
134 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
135 * dest may be nullptr and the function will only return the length of the result
136 * without writing any of the result string.
137 * @param edits Records edits for index mapping, working with styled text,
138 * and getting only changes (if any).
139 * The Edits contents is undefined if any error occurs.
140 * This function calls edits->reset() first unless
141 * options includes U_EDITS_NO_RESET. edits can be nullptr.
142 * @param errorCode Reference to an in/out error code value
143 * which must not indicate a failure before the function call.
144 * @return The length of the result string, if successful.
145 * When the result would be longer than destCapacity,
146 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
147 *
148 * @see u_strToTitle
149 * @see ucasemap_toTitle
150 * @stable ICU 59
151 */
152 static int32_t toTitle(
153 const char *locale, uint32_t options, BreakIterator *iter,
154 const char16_t *src, int32_t srcLength,
155 char16_t *dest, int32_t destCapacity, Edits *edits,
156 UErrorCode &errorCode);
157
158#endif // UCONFIG_NO_BREAK_ITERATION
159
160 /**
161 * Case-folds a UTF-16 string and optionally records edits.
162 *
163 * Case folding is locale-independent and not context-sensitive,
164 * but there is an option for whether to include or exclude mappings for dotted I
165 * and dotless i that are marked with 'T' in CaseFolding.txt.
166 *
167 * The result may be longer or shorter than the original.
168 * The source string and the destination buffer must not overlap.
169 *
170 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
171 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
172 * @param src The original string.
173 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
174 * @param dest A buffer for the result string. The result will be NUL-terminated if
175 * the buffer is large enough.
176 * The contents is undefined in case of failure.
177 * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
178 * dest may be nullptr and the function will only return the length of the result
179 * without writing any of the result string.
180 * @param edits Records edits for index mapping, working with styled text,
181 * and getting only changes (if any).
182 * The Edits contents is undefined if any error occurs.
183 * This function calls edits->reset() first unless
184 * options includes U_EDITS_NO_RESET. edits can be nullptr.
185 * @param errorCode Reference to an in/out error code value
186 * which must not indicate a failure before the function call.
187 * @return The length of the result string, if successful.
188 * When the result would be longer than destCapacity,
189 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
190 *
191 * @see u_strFoldCase
192 * @stable ICU 59
193 */
194 static int32_t fold(
195 uint32_t options,
196 const char16_t *src, int32_t srcLength,
197 char16_t *dest, int32_t destCapacity, Edits *edits,
198 UErrorCode &errorCode);
199
200 /**
201 * Lowercases a UTF-8 string and optionally records edits.
202 * Casing is locale-dependent and context-sensitive.
203 * The result may be longer or shorter than the original.
204 *
205 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
206 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
207 * @param src The original string.
208 * @param sink A ByteSink to which the result string is written.
209 * sink.Flush() is called at the end.
210 * @param edits Records edits for index mapping, working with styled text,
211 * and getting only changes (if any).
212 * The Edits contents is undefined if any error occurs.
213 * This function calls edits->reset() first unless
214 * options includes U_EDITS_NO_RESET. edits can be nullptr.
215 * @param errorCode Reference to an in/out error code value
216 * which must not indicate a failure before the function call.
217 *
218 * @see ucasemap_utf8ToLower
219 * @stable ICU 60
220 */
221 static void utf8ToLower(
222 const char *locale, uint32_t options,
223 StringPiece src, ByteSink &sink, Edits *edits,
224 UErrorCode &errorCode);
225
226 /**
227 * Uppercases a UTF-8 string and optionally records edits.
228 * Casing is locale-dependent and context-sensitive.
229 * The result may be longer or shorter than the original.
230 *
231 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
232 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
233 * @param src The original string.
234 * @param sink A ByteSink to which the result string is written.
235 * sink.Flush() is called at the end.
236 * @param edits Records edits for index mapping, working with styled text,
237 * and getting only changes (if any).
238 * The Edits contents is undefined if any error occurs.
239 * This function calls edits->reset() first unless
240 * options includes U_EDITS_NO_RESET. edits can be nullptr.
241 * @param errorCode Reference to an in/out error code value
242 * which must not indicate a failure before the function call.
243 *
244 * @see ucasemap_utf8ToUpper
245 * @stable ICU 60
246 */
247 static void utf8ToUpper(
248 const char *locale, uint32_t options,
249 StringPiece src, ByteSink &sink, Edits *edits,
250 UErrorCode &errorCode);
251
252#if !UCONFIG_NO_BREAK_ITERATION
253
254 /**
255 * Titlecases a UTF-8 string and optionally records edits.
256 * Casing is locale-dependent and context-sensitive.
257 * The result may be longer or shorter than the original.
258 *
259 * Titlecasing uses a break iterator to find the first characters of words
260 * that are to be titlecased. It titlecases those characters and lowercases
261 * all others. (This can be modified with options bits.)
262 *
263 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
264 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
265 * U_TITLECASE_NO_LOWERCASE,
266 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
267 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
268 * @param iter A break iterator to find the first characters of words that are to be titlecased.
269 * It is set to the source string (setUText())
270 * and used one or more times for iteration (first() and next()).
271 * If nullptr, then a word break iterator for the locale is used
272 * (or something equivalent).
273 * @param src The original string.
274 * @param sink A ByteSink to which the result string is written.
275 * sink.Flush() is called at the end.
276 * @param edits Records edits for index mapping, working with styled text,
277 * and getting only changes (if any).
278 * The Edits contents is undefined if any error occurs.
279 * This function calls edits->reset() first unless
280 * options includes U_EDITS_NO_RESET. edits can be nullptr.
281 * @param errorCode Reference to an in/out error code value
282 * which must not indicate a failure before the function call.
283 *
284 * @see ucasemap_utf8ToTitle
285 * @stable ICU 60
286 */
287 static void utf8ToTitle(
288 const char *locale, uint32_t options, BreakIterator *iter,
289 StringPiece src, ByteSink &sink, Edits *edits,
290 UErrorCode &errorCode);
291
292#endif // UCONFIG_NO_BREAK_ITERATION
293
294 /**
295 * Case-folds a UTF-8 string and optionally records edits.
296 *
297 * Case folding is locale-independent and not context-sensitive,
298 * but there is an option for whether to include or exclude mappings for dotted I
299 * and dotless i that are marked with 'T' in CaseFolding.txt.
300 *
301 * The result may be longer or shorter than the original.
302 *
303 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
304 * @param src The original string.
305 * @param sink A ByteSink to which the result string is written.
306 * sink.Flush() is called at the end.
307 * @param edits Records edits for index mapping, working with styled text,
308 * and getting only changes (if any).
309 * The Edits contents is undefined if any error occurs.
310 * This function calls edits->reset() first unless
311 * options includes U_EDITS_NO_RESET. edits can be nullptr.
312 * @param errorCode Reference to an in/out error code value
313 * which must not indicate a failure before the function call.
314 *
315 * @see ucasemap_utf8FoldCase
316 * @stable ICU 60
317 */
318 static void utf8Fold(
319 uint32_t options,
320 StringPiece src, ByteSink &sink, Edits *edits,
321 UErrorCode &errorCode);
322
323 /**
324 * Lowercases a UTF-8 string and optionally records edits.
325 * Casing is locale-dependent and context-sensitive.
326 * The result may be longer or shorter than the original.
327 * The source string and the destination buffer must not overlap.
328 *
329 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
330 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
331 * @param src The original string.
332 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
333 * @param dest A buffer for the result string. The result will be NUL-terminated if
334 * the buffer is large enough.
335 * The contents is undefined in case of failure.
336 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
337 * dest may be nullptr and the function will only return the length of the result
338 * without writing any of the result string.
339 * @param edits Records edits for index mapping, working with styled text,
340 * and getting only changes (if any).
341 * The Edits contents is undefined if any error occurs.
342 * This function calls edits->reset() first unless
343 * options includes U_EDITS_NO_RESET. edits can be nullptr.
344 * @param errorCode Reference to an in/out error code value
345 * which must not indicate a failure before the function call.
346 * @return The length of the result string, if successful.
347 * When the result would be longer than destCapacity,
348 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
349 *
350 * @see ucasemap_utf8ToLower
351 * @stable ICU 59
352 */
353 static int32_t utf8ToLower(
354 const char *locale, uint32_t options,
355 const char *src, int32_t srcLength,
356 char *dest, int32_t destCapacity, Edits *edits,
357 UErrorCode &errorCode);
358
359 /**
360 * Uppercases a UTF-8 string and optionally records edits.
361 * Casing is locale-dependent and context-sensitive.
362 * The result may be longer or shorter than the original.
363 * The source string and the destination buffer must not overlap.
364 *
365 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
366 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
367 * @param src The original string.
368 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
369 * @param dest A buffer for the result string. The result will be NUL-terminated if
370 * the buffer is large enough.
371 * The contents is undefined in case of failure.
372 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
373 * dest may be nullptr and the function will only return the length of the result
374 * without writing any of the result string.
375 * @param edits Records edits for index mapping, working with styled text,
376 * and getting only changes (if any).
377 * The Edits contents is undefined if any error occurs.
378 * This function calls edits->reset() first unless
379 * options includes U_EDITS_NO_RESET. edits can be nullptr.
380 * @param errorCode Reference to an in/out error code value
381 * which must not indicate a failure before the function call.
382 * @return The length of the result string, if successful.
383 * When the result would be longer than destCapacity,
384 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
385 *
386 * @see ucasemap_utf8ToUpper
387 * @stable ICU 59
388 */
389 static int32_t utf8ToUpper(
390 const char *locale, uint32_t options,
391 const char *src, int32_t srcLength,
392 char *dest, int32_t destCapacity, Edits *edits,
393 UErrorCode &errorCode);
394
395#if !UCONFIG_NO_BREAK_ITERATION
396
397 /**
398 * Titlecases a UTF-8 string and optionally records edits.
399 * Casing is locale-dependent and context-sensitive.
400 * The result may be longer or shorter than the original.
401 * The source string and the destination buffer must not overlap.
402 *
403 * Titlecasing uses a break iterator to find the first characters of words
404 * that are to be titlecased. It titlecases those characters and lowercases
405 * all others. (This can be modified with options bits.)
406 *
407 * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
408 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
409 * U_TITLECASE_NO_LOWERCASE,
410 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
411 * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
412 * @param iter A break iterator to find the first characters of words that are to be titlecased.
413 * It is set to the source string (setUText())
414 * and used one or more times for iteration (first() and next()).
415 * If nullptr, then a word break iterator for the locale is used
416 * (or something equivalent).
417 * @param src The original string.
418 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
419 * @param dest A buffer for the result string. The result will be NUL-terminated if
420 * the buffer is large enough.
421 * The contents is undefined in case of failure.
422 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
423 * dest may be nullptr and the function will only return the length of the result
424 * without writing any of the result string.
425 * @param edits Records edits for index mapping, working with styled text,
426 * and getting only changes (if any).
427 * The Edits contents is undefined if any error occurs.
428 * This function calls edits->reset() first unless
429 * options includes U_EDITS_NO_RESET. edits can be nullptr.
430 * @param errorCode Reference to an in/out error code value
431 * which must not indicate a failure before the function call.
432 * @return The length of the result string, if successful.
433 * When the result would be longer than destCapacity,
434 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
435 *
436 * @see ucasemap_utf8ToTitle
437 * @stable ICU 59
438 */
439 static int32_t utf8ToTitle(
440 const char *locale, uint32_t options, BreakIterator *iter,
441 const char *src, int32_t srcLength,
442 char *dest, int32_t destCapacity, Edits *edits,
443 UErrorCode &errorCode);
444
445#endif // UCONFIG_NO_BREAK_ITERATION
446
447 /**
448 * Case-folds a UTF-8 string and optionally records edits.
449 *
450 * Case folding is locale-independent and not context-sensitive,
451 * but there is an option for whether to include or exclude mappings for dotted I
452 * and dotless i that are marked with 'T' in CaseFolding.txt.
453 *
454 * The result may be longer or shorter than the original.
455 * The source string and the destination buffer must not overlap.
456 *
457 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
458 * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
459 * @param src The original string.
460 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
461 * @param dest A buffer for the result string. The result will be NUL-terminated if
462 * the buffer is large enough.
463 * The contents is undefined in case of failure.
464 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
465 * dest may be nullptr and the function will only return the length of the result
466 * without writing any of the result string.
467 * @param edits Records edits for index mapping, working with styled text,
468 * and getting only changes (if any).
469 * The Edits contents is undefined if any error occurs.
470 * This function calls edits->reset() first unless
471 * options includes U_EDITS_NO_RESET. edits can be nullptr.
472 * @param errorCode Reference to an in/out error code value
473 * which must not indicate a failure before the function call.
474 * @return The length of the result string, if successful.
475 * When the result would be longer than destCapacity,
476 * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
477 *
478 * @see ucasemap_utf8FoldCase
479 * @stable ICU 59
480 */
481 static int32_t utf8Fold(
482 uint32_t options,
483 const char *src, int32_t srcLength,
484 char *dest, int32_t destCapacity, Edits *edits,
485 UErrorCode &errorCode);
486
487private:
488 CaseMap() = delete;
489 CaseMap(const CaseMap &other) = delete;
490 CaseMap &operator=(const CaseMap &other) = delete;
491};
492
493U_NAMESPACE_END
494
495#endif /* U_SHOW_CPLUSPLUS_API */
496
497#endif // __CASEMAP_H__
498