1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2004-2012, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: ucase.h |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2004aug30 |
16 | * created by: Markus W. Scherer |
17 | * |
18 | * Low-level Unicode character/string case mapping code. |
19 | */ |
20 | |
21 | #ifndef __UCASE_H__ |
22 | #define __UCASE_H__ |
23 | |
24 | #include "unicode/utypes.h" |
25 | #include "unicode/uset.h" |
26 | #include "putilimp.h" |
27 | #include "uset_imp.h" |
28 | #include "udataswp.h" |
29 | #include "utrie2.h" |
30 | |
/**
 * Forward declaration only: it lets the C++-only declarations further down
 * (FullCaseFoldingIterator::next takes a UnicodeString reference) name the type.
 * Guarded by __cplusplus so that C compilation skips it.
 */
31 | #ifdef __cplusplus |
32 | U_NAMESPACE_BEGIN |
33 | |
34 | class UnicodeString; |
35 | |
36 | U_NAMESPACE_END |
37 | #endif |
38 | |
39 | /* library API -------------------------------------------------------------- */ |
40 | |
/**
 * Takes a set adder and an error code pointer and returns nothing.
 * NOTE(review): judging by the name, this presumably adds to sa the start code points
 * of the ranges over which the case properties are constant -- confirm in the implementation.
 *
 * @param sa Set adder (USetAdder is presumably declared in uset_imp.h, which this header includes).
 * @param pErrorCode Pointer to a UErrorCode; presumably the usual in/out error code -- confirm.
 */
41 | U_CFUNC void U_EXPORT2 |
42 | ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
43 | |
44 | /** |
45 | * Requires non-NULL locale ID but otherwise does the equivalent of |
46 | * checking for language codes as if uloc_getLanguage() were called: |
47 | * Accepts both 2- and 3-letter codes and accepts case variants. |
 *
 * @param locale Locale ID string; must not be NULL.
 * @return A casing locale value; see the UCASE_LOC_ enum that follows this declaration.
48 | */ |
49 | U_CFUNC int32_t |
50 | ucase_getCaseLocale(const char *locale); |
51 | |
52 | /* Casing locale types for ucase_getCaseLocale */ |
/*
 * Anonymous enum with implicit values: the enumerators count up from 0 in
 * declaration order (UCASE_LOC_UNKNOWN is 0, UCASE_LOC_ARMENIAN is 6).
 * The toFull mapping functions in this header document their caseLocale
 * argument as a value from ucase_getCaseLocale(), i.e. one of these constants.
 */
53 | enum { |
54 | UCASE_LOC_UNKNOWN, |
55 | UCASE_LOC_ROOT, |
56 | UCASE_LOC_TURKISH, |
57 | UCASE_LOC_LITHUANIAN, |
58 | UCASE_LOC_GREEK, |
59 | UCASE_LOC_DUTCH, |
60 | UCASE_LOC_ARMENIAN |
61 | }; |
62 | |
63 | /** |
64 | * Bit mask for getting just the options from a string compare options word |
65 | * that are relevant for case-insensitive string comparison. |
66 | * See stringoptions.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. |
 * The value 0xffff keeps the low 16 bits of the options word.
67 | * @internal |
68 | */ |
69 | #define _STRCASECMP_OPTIONS_MASK 0xffff |
70 | |
71 | /** |
72 | * Bit mask for getting just the options from a string compare options word |
73 | * that are relevant for case folding (of a single string or code point). |
74 | * |
75 | * Currently only bit 0 for U_FOLD_CASE_EXCLUDE_SPECIAL_I. |
76 | * It is conceivable that at some point we might use one more bit for using uppercase sharp s. |
77 | * It is conceivable that at some point we might want the option to use only simple case foldings |
78 | * when operating on strings. |
 *
 * Note that the mask value 7 covers bits 0..2, so it is wider than the single bit
 * that is currently in use according to the paragraph above.
79 | * |
80 | * See stringoptions.h. |
81 | * @internal |
82 | */ |
83 | #define _FOLD_CASE_OPTIONS_MASK 7 |
84 | |
85 | /* single-code point functions */ |
86 | |
/**
 * Single-code point lowercase mapping: takes one code point and returns one code point.
 * NOTE(review): presumably returns c unchanged when it has no simple lowercase
 * mapping -- confirm in the implementation.
 *
 * @param c Code point to map.
 * @return The mapped code point.
 */
87 | U_CAPI UChar32 U_EXPORT2 |
88 | ucase_tolower(UChar32 c); |
89 | |
/**
 * Single-code point uppercase mapping: takes one code point and returns one code point.
 * NOTE(review): presumably returns c unchanged when it has no simple uppercase
 * mapping -- confirm in the implementation.
 *
 * @param c Code point to map.
 * @return The mapped code point.
 */
90 | U_CAPI UChar32 U_EXPORT2 |
91 | ucase_toupper(UChar32 c); |
92 | |
/**
 * Single-code point titlecase mapping: takes one code point and returns one code point.
 * NOTE(review): presumably returns c unchanged when it has no simple titlecase
 * mapping -- confirm in the implementation.
 *
 * @param c Code point to map.
 * @return The mapped code point.
 */
93 | U_CAPI UChar32 U_EXPORT2 |
94 | ucase_totitle(UChar32 c); |
95 | |
/**
 * Single-code point case folding: takes one code point plus an options word and
 * returns one code point.
 *
 * @param c Code point to fold.
 * @param options Case folding options word. NOTE(review): presumably the bits selected by
 *                _FOLD_CASE_OPTIONS_MASK (see stringoptions.h) -- confirm in the implementation.
 * @return The folded code point.
 */
96 | U_CAPI UChar32 U_EXPORT2 |
97 | ucase_fold(UChar32 c, uint32_t options); |
98 | |
99 | /** |
100 | * Adds all simple case mappings and the full case folding for c to sa, |
101 | * and also adds special case closure mappings. |
102 | * c itself is not added. |
103 | * For example, the mappings |
104 | * - for s include long s |
105 | * - for sharp s include ss |
106 | * - for k include the Kelvin sign |
 *
 * @param c Code point whose case closure is added.
 * @param sa Set adder that receives the mappings.
107 | */ |
108 | U_CFUNC void U_EXPORT2 |
109 | ucase_addCaseClosure(UChar32 c, const USetAdder *sa); |
110 | |
111 | /** Case closure with only scf=Simple_Case_Folding. */ |
/*
 * Same parameter list and return type as ucase_addCaseClosure().
 * @param c Code point whose closure is added.
 * @param sa Set adder that receives the closure items.
 */
112 | U_CFUNC void U_EXPORT2 |
113 | ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa); |
114 | |
115 | /** |
116 | * Maps the string to single code points and adds the associated case closure |
117 | * mappings. |
118 | * The string is mapped to code points if it is their full case folding string. |
119 | * In other words, this performs a reverse full case folding and then |
120 | * adds the case closure items of the resulting code points. |
121 | * If the string is found and its closure applied, then |
122 | * the string itself is added as well as part of its code points' closure. |
123 | * It must be length>=0. |
124 | * |
 * @param s Pointer to the string to look up.
 * @param length Length of s; must be >=0.
 * @param sa Set adder that receives the closure items.
125 | * @return true if the string was found |
126 | */ |
127 | U_CFUNC UBool U_EXPORT2 |
128 | ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa); |
129 | |
130 | #ifdef __cplusplus |
131 | U_NAMESPACE_BEGIN |
132 | |
133 | /** |
134 | * Iterator over characters with more than one code point in the full default Case_Folding. |
 *
 * Objects are default-constructed and cannot be copied or assigned
 * (both operations are declared deleted in the private section).
135 | */ |
136 | class U_COMMON_API FullCaseFoldingIterator { |
137 | public: |
138 | /** Constructor. */ |
139 | FullCaseFoldingIterator(); |
140 | /** |
141 | * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding. |
142 | * Returns a negative cp value at the end of the iteration. |
 * @param full Output: receives the full case folding string for the returned cp.
 * @return The next code point, or a negative value when the iteration is finished.
143 | */ |
144 | UChar32 next(UnicodeString &full); |
145 | private: |
146 | FullCaseFoldingIterator(const FullCaseFoldingIterator &) = delete; // no copy |
147 | FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &) = delete; // no assignment |
148 | |
/*
 * Iteration state.
 * NOTE(review): the names suggest that unfold points at the reverse case folding
 * (unfold) table, that unfoldRows, unfoldRowWidth and unfoldStringWidth cache the
 * values identified by the UCASE_UNFOLD_ constants at the end of this header, and
 * that currentRow and rowCpIndex record the position within that table.
 * None of this is visible in the header; confirm in the implementation.
 */
149 | const char16_t *unfold; |
150 | int32_t unfoldRows; |
151 | int32_t unfoldRowWidth; |
152 | int32_t unfoldStringWidth; |
153 | int32_t currentRow; |
154 | int32_t rowCpIndex; |
155 | }; |
156 | |
157 | /** |
158 | * Fast case mapping data for ASCII/Latin. |
159 | * Linear arrays of delta bytes: 0=no mapping; EXC=exception. |
160 | * Deltas must not cross the ASCII boundary, or else they cannot be easily used |
161 | * in simple UTF-8 code. |
 *
 * Each table below has LIMIT (0x180) entries of type int8_t, matching the code points
 * from U+0000 up to U+017F. The tables are only declared here (extern); their
 * contents are defined elsewhere.
162 | */ |
163 | namespace LatinCase { |
164 | |
165 | /** Case mapping/folding data for code points up to U+017F. */ |
166 | constexpr char16_t LIMIT = 0x180; |
167 | /** U+017F case-folds and uppercases crossing the ASCII boundary. */ |
168 | constexpr char16_t LONG_S = 0x17f; |
169 | /** Exception: Complex mapping, or too-large delta. */ |
/* -0x80 is -128, the smallest value an int8_t can hold. */
170 | constexpr int8_t EXC = -0x80; |
171 | |
172 | /** Deltas for lowercasing for most locales, and default case folding. */ |
173 | extern const int8_t TO_LOWER_NORMAL[LIMIT]; |
174 | /** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */ |
175 | extern const int8_t TO_LOWER_TR_LT[LIMIT]; |
176 | |
177 | /** Deltas for uppercasing for most locales. */ |
178 | extern const int8_t TO_UPPER_NORMAL[LIMIT]; |
179 | /** Deltas for uppercasing for tr/az. */ |
180 | extern const int8_t TO_UPPER_TR[LIMIT]; |
181 | |
182 | } // namespace LatinCase |
183 | |
184 | U_NAMESPACE_END |
185 | #endif |
186 | |
187 | /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ |
/*
 * The four return values are the 2-bit type constants (0..3) defined further down
 * in this header next to UCASE_TYPE_MASK.
 * @param c Code point to classify.
 */
188 | U_CAPI int32_t U_EXPORT2 |
189 | ucase_getType(UChar32 c); |
190 | |
191 | /** @return like ucase_getType() but also sets UCASE_IGNORABLE if c is case-ignorable */ |
/*
 * UCASE_IGNORABLE is the value 4 (defined further down), so the result combines a
 * 2-bit type with that flag bit.
 * NOTE(review): presumably this is the value that UCASE_GET_TYPE_AND_IGNORABLE extracts
 * from the properties word -- confirm in the implementation.
 * @param c Code point to classify.
 */
192 | U_CAPI int32_t U_EXPORT2 |
193 | ucase_getTypeOrIgnorable(UChar32 c); |
194 | |
/**
 * Boolean query on a single code point.
 * NOTE(review): by its name this presumably reports whether c is soft-dotted, i.e.
 * whether its dot type is UCASE_SOFT_DOTTED (see UCASE_DOT_MASK below) -- confirm
 * in the implementation.
 *
 * @param c Code point to test.
 * @return UBool result of the test.
 */
195 | U_CAPI UBool U_EXPORT2 |
196 | ucase_isSoftDotted(UChar32 c); |
197 | |
/**
 * Boolean query on a single code point.
 * NOTE(review): by its name this presumably reports the case-sensitive flag of c
 * (UCASE_SENSITIVE, or UCASE_EXC_SENSITIVE for exception entries) -- confirm in the
 * implementation.
 *
 * @param c Code point to test.
 * @return UBool result of the test.
 */
198 | U_CAPI UBool U_EXPORT2 |
199 | ucase_isCaseSensitive(UChar32 c); |
200 | |
201 | /* string case mapping functions */ |
202 | |
203 | U_CDECL_BEGIN |
204 | |
205 | /** |
206 | * Iterator function for string case mappings, which need to look at the |
207 | * context (surrounding text) of a given character for conditional mappings. |
208 | * |
209 | * The iterator only needs to go backward or forward away from the |
210 | * character in question. It does not use any indexes on this interface. |
211 | * It does not support random access or an arbitrary change of |
212 | * iteration direction. |
213 | * |
214 | * The code point being case-mapped itself is never returned by |
215 | * this iterator. |
 *
 * This typedef names a function type, not a pointer type; the mapping functions in
 * this header take it as UCaseContextIterator *iter together with a void *context.
216 | * |
217 | * @param context A pointer to the iterator's working data. |
218 | * @param dir If <0 then start iterating backward from the character; |
219 | * if >0 then start iterating forward from the character; |
220 | * if 0 then continue iterating in the current direction. |
221 | * @return Next code point, or <0 when the iteration is done. |
222 | */ |
223 | typedef UChar32 U_CALLCONV |
224 | UCaseContextIterator(void *context, int8_t dir); |
225 | |
226 | /** |
227 | * Sample struct which may be used by some implementations of |
228 | * UCaseContextIterator. |
 *
 * This header does not specify the meaning of the individual fields; that is up to
 * the iterator implementation that uses the struct.
 * NOTE(review): the names suggest a text pointer (p), text bounds and current
 * position (start, index, limit), the bounds of the code point being mapped
 * (cpStart, cpLimit) and the current iteration direction (dir), with b1..b3 as
 * spare bytes -- confirm with the code that uses the struct.
 * The member order matters: UCASECONTEXT_INITIALIZER supplies ten positional values.
229 | */ |
230 | struct UCaseContext { |
231 | void *p; |
232 | int32_t start, index, limit; |
233 | int32_t cpStart, cpLimit; |
234 | int8_t dir; |
235 | int8_t b1, b2, b3; |
236 | }; |
237 | typedef struct UCaseContext UCaseContext; |
238 | |
239 | U_CDECL_END |
240 | |
/**
 * Aggregate initializer for a UCaseContext: ten values, one per struct member in
 * declaration order, i.e. NULL for the pointer p and 0 for each of the nine
 * integer members.
 */
241 | #define UCASECONTEXT_INITIALIZER { NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0 } |
242 | |
243 | enum { |
244 | /** |
245 | * For string case mappings, a single character (a code point) is mapped |
246 | * either to itself (in which case in-place mapping functions do nothing), |
247 | * or to another single code point, or to a string. |
248 | * Aside from the string contents, these are indicated with a single int32_t |
249 | * value as follows: |
250 | * |
251 | * Mapping to self: Negative values (~self instead of -self to support U+0000) |
252 | * |
253 | * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH |
254 | * |
255 | * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is |
256 | * returned. Note that the string result may indeed have zero length. |
 *
 * UCASE_MAX_STRING_LENGTH is 0x1f, i.e. 31, so results 0..31 are string lengths
 * and results of 32 or more are code points.
257 | */ |
258 | UCASE_MAX_STRING_LENGTH=0x1f |
259 | }; |
260 | |
261 | /** |
262 | * Get the full lowercase mapping for c. |
263 | * |
265 | * @param c Character to be mapped. |
266 | * @param iter Character iterator, used for context-sensitive mappings. |
267 | * See UCaseContextIterator for details. |
268 | * If iter==NULL then a context-independent result is returned. |
269 | * @param context Pointer to be passed into iter. |
270 | * @param pString If the mapping result is a string, then the pointer is |
271 | * written to *pString. |
272 | * @param caseLocale Case locale value from ucase_getCaseLocale(). |
273 | * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH. |
274 | * |
275 | * @see UCaseContextIterator |
276 | * @see UCASE_MAX_STRING_LENGTH |
277 | * @internal |
278 | */ |
279 | U_CAPI int32_t U_EXPORT2 |
280 | ucase_toFullLower(UChar32 c, |
281 | UCaseContextIterator *iter, void *context, |
282 | const UChar **pString, |
283 | int32_t caseLocale); |
284 | |
/**
 * Has the same parameter list and return type as ucase_toFullLower().
 * NOTE(review): presumably the full uppercase counterpart, with the same parameter
 * and return conventions as documented on ucase_toFullLower() and
 * UCASE_MAX_STRING_LENGTH -- confirm in the implementation.
 */
285 | U_CAPI int32_t U_EXPORT2 |
286 | ucase_toFullUpper(UChar32 c, |
287 | UCaseContextIterator *iter, void *context, |
288 | const UChar **pString, |
289 | int32_t caseLocale); |
290 | |
/**
 * Has the same parameter list and return type as ucase_toFullLower().
 * NOTE(review): presumably the full titlecase counterpart, with the same parameter
 * and return conventions as documented on ucase_toFullLower() and
 * UCASE_MAX_STRING_LENGTH -- confirm in the implementation.
 */
291 | U_CAPI int32_t U_EXPORT2 |
292 | ucase_toFullTitle(UChar32 c, |
293 | UCaseContextIterator *iter, void *context, |
294 | const UChar **pString, |
295 | int32_t caseLocale); |
296 | |
/**
 * Unlike the toFullLower/Upper/Title functions this one takes no context iterator
 * and no case locale; it takes an options word instead.
 * NOTE(review): presumably the full case folding of c, with the result encoded as
 * described at UCASE_MAX_STRING_LENGTH and options holding the bits selected by
 * _FOLD_CASE_OPTIONS_MASK -- confirm in the implementation.
 *
 * @param c Code point to fold.
 * @param pString Output pointer for a string result.
 * @param options Case folding options word.
 */
297 | U_CAPI int32_t U_EXPORT2 |
298 | ucase_toFullFolding(UChar32 c, |
299 | const UChar **pString, |
300 | uint32_t options); |
301 | |
/**
 * Takes a code point and a UProperty selector. Note that the declared return type
 * is int32_t rather than UBool.
 * NOTE(review): presumably returns nonzero when c has the binary property which, for
 * the case-related properties handled by this module -- confirm in the implementation.
 *
 * @param c Code point to test.
 * @param which Property selector.
 */
302 | U_CFUNC int32_t U_EXPORT2 |
303 | ucase_hasBinaryProperty(UChar32 c, UProperty which); |
304 | |
305 | |
306 | U_CDECL_BEGIN |
307 | |
308 | /** |
 * Function type whose parameter list and return type match ucase_toFullLower(),
 * ucase_toFullUpper() and ucase_toFullTitle() as declared in this header
 * (ucase_toFullFolding() has a different parameter list).
 * Like UCaseContextIterator this names a function type, not a pointer type.
309 | * @internal |
310 | */ |
311 | typedef int32_t U_CALLCONV |
312 | UCaseMapFull(UChar32 c, |
313 | UCaseContextIterator *iter, void *context, |
314 | const UChar **pString, |
315 | int32_t caseLocale); |
316 | |
317 | U_CDECL_END |
318 | |
319 | /* for icuexportdata -------------------------------------------------------- */ |
320 | |
/**
 * Aggregate holding the pieces of the case properties data; a pointer to one
 * instance is returned by ucase_getSingleton().
 */
321 | struct UCaseProps { |
322 | void *mem; // TODO: was unused, and type UDataMemory -- remove |
/* Array of int32_t values; the UCASE_IX_ enum below provides the indexes into it. */
323 | const int32_t *indexes; |
/* NOTE(review): presumably the exceptions array that the exception index in a properties word refers to -- confirm. */
324 | const uint16_t *exceptions; |
/* NOTE(review): presumably the reverse case folding (unfold) data; see the UCASE_UNFOLD_ constants -- confirm. */
325 | const uint16_t *unfold; |
326 | |
/* Trie stored by value. NOTE(review): presumably maps code points to the 16-bit case properties word -- confirm. */
327 | UTrie2 trie; |
/* Four version bytes. NOTE(review): presumably the format version of the loaded data -- confirm. */
328 | uint8_t formatVersion[4]; |
329 | }; |
330 | |
/**
 * Returns a pointer to a const UCaseProps and takes two int32_t output pointers.
 * NOTE(review): presumably the two outputs receive the lengths of the exceptions and
 * unfold arrays of the returned struct; whether NULL is accepted for them is not
 * visible here -- confirm in the implementation.
 *
 * @param pExceptionsLength Output pointer (see the note above).
 * @param pUnfoldLength Output pointer (see the note above).
 * @return Pointer to the case properties struct.
 */
331 | U_CAPI const struct UCaseProps * U_EXPORT2 |
332 | ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength); |
333 | |
334 | /* file definitions --------------------------------------------------------- */ |
335 | |
/* Name and type strings of the case properties data item. */
336 | #define UCASE_DATA_NAME "ucase" |
337 | #define UCASE_DATA_TYPE "icu" |
338 | |
339 | /* format "cAsE" */ |
/* The four bytes are the ASCII codes of c (0x63), A (0x41), S (0x53) and E (0x45), in that order. */
340 | #define UCASE_FMT_0 0x63 |
341 | #define UCASE_FMT_1 0x41 |
342 | #define UCASE_FMT_2 0x53 |
343 | #define UCASE_FMT_3 0x45 |
344 | |
345 | /* indexes into indexes[] */ |
/*
 * The first five enumerators take the implicit values 0..4; the last two are set
 * explicitly to 15 and 16, which leaves the values 5..14 without a name here.
 * NOTE(review): UCASE_IX_TOP (16) is presumably the number of entries in indexes[] --
 * confirm against the data format description.
 */
346 | enum { |
347 | UCASE_IX_INDEX_TOP, |
348 | UCASE_IX_LENGTH, |
349 | UCASE_IX_TRIE_SIZE, |
350 | UCASE_IX_EXC_LENGTH, |
351 | UCASE_IX_UNFOLD_LENGTH, |
352 | |
353 | UCASE_IX_MAX_FULL_LENGTH=15, |
354 | UCASE_IX_TOP=16 |
355 | }; |
356 | |
357 | /* definitions for 16-bit case properties word ------------------------------ */ |
358 | |
/**
 * Takes no arguments and returns a pointer to a const UTrie2.
 * NOTE(review): presumably the trie member of the UCaseProps singleton, whose values
 * are the 16-bit case properties words decoded by the macros that follow -- confirm
 * in the implementation.
 */
359 | U_CFUNC const UTrie2 * U_EXPORT2 |
360 | ucase_getTrie(); |
361 | |
362 | /* 2-bit constants for types of cased characters */ |
/* The mask 3 selects bits 0..1 of the properties word. */
363 | #define UCASE_TYPE_MASK 3 |
/* Implicit values: UCASE_NONE is 0, UCASE_LOWER is 1, UCASE_UPPER is 2, UCASE_TITLE is 3. */
364 | enum { |
365 | UCASE_NONE, |
366 | UCASE_LOWER, |
367 | UCASE_UPPER, |
368 | UCASE_TITLE |
369 | }; |
370 | |
/* Extracts the 2-bit type (one of UCASE_NONE..UCASE_TITLE) from a properties word. */
371 | #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK) |
/* Extracts bits 0..2: the 2-bit type together with the UCASE_IGNORABLE bit (value 4). */
372 | #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) |
373 | |
/*
 * Tests bit 1 of the type, which is set for UCASE_UPPER (2) and UCASE_TITLE (3) only.
 * The result is 0 or 2, not normalized to 0 or 1.
 */
374 | #define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2) |
375 | |
/* Single-bit flags of the properties word: bit 2 (4), bit 3 (8) and bit 4 (0x10). */
376 | #define UCASE_IGNORABLE 4 |
377 | #define UCASE_EXCEPTION 8 |
378 | #define UCASE_SENSITIVE 0x10 |
379 | |
/* Nonzero (8) if the UCASE_EXCEPTION bit is set in props, otherwise 0. */
380 | #define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) |
381 | |
/* The mask 0x60 selects bits 5..6 of the properties word; the enum lists the four possible values of that field. */
382 | #define UCASE_DOT_MASK 0x60 |
383 | enum { |
384 | UCASE_NO_DOT=0, /* normal characters with cc=0 */ |
385 | UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */ |
386 | UCASE_ABOVE=0x40, /* "above" accents with cc=230 */ |
387 | UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */ |
388 | }; |
389 | |
390 | /* no exception: bits 15..7 are a 9-bit signed case mapping delta */ |
/* 0xff80 has exactly bits 15..7 set. A 9-bit signed value ranges from -256 (UCASE_MIN_DELTA) to 255 (UCASE_MAX_DELTA). */
391 | #define UCASE_DELTA_SHIFT 7 |
392 | #define UCASE_DELTA_MASK 0xff80 |
393 | #define UCASE_MAX_DELTA 0xff |
394 | #define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1) |
395 | |
/*
 * Extracts the signed delta from a properties word.
 * First variant: reinterprets props as int16_t and relies on an arithmetic
 * (sign-propagating) right shift.
 * Second variant, for compilers without that guarantee: if bit 15 (the sign bit) is set,
 * shifts and then ORs in 0xfe00 (bits 15..9) to sign-extend by hand; otherwise does a
 * plain unsigned shift.
 */
396 | #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC |
397 | # define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT) |
398 | #else |
399 | # define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT)) |
400 | #endif |
401 | |
402 | /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ |
/* 0xfff0 has exactly bits 15..4 set. UCASE_MAX_EXCEPTIONS evaluates to (0xfff0>>4)+1 = 0x1000 = 4096. */
403 | #define UCASE_EXC_SHIFT 4 |
404 | #define UCASE_EXC_MASK 0xfff0 |
405 | #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) |
406 | |
407 | /* definitions for 16-bit main exceptions word ------------------------------ */ |
408 | |
409 | /* first 8 bits indicate values in optional slots */ |
/*
 * Implicit values 0..8 in declaration order: UCASE_EXC_LOWER is 0 and
 * UCASE_EXC_ALL_SLOTS is 8.
 * NOTE(review): presumably each of the values 0..7 is both a slot number and the
 * number of the bit in the exceptions word that flags the presence of that slot --
 * confirm in the implementation.
 */
410 | enum { |
411 | UCASE_EXC_LOWER, |
412 | UCASE_EXC_FOLD, |
413 | UCASE_EXC_UPPER, |
414 | UCASE_EXC_TITLE, |
415 | UCASE_EXC_DELTA, |
416 | UCASE_EXC_5, /* reserved */ |
417 | UCASE_EXC_CLOSURE, |
418 | UCASE_EXC_FULL_MAPPINGS, |
419 | UCASE_EXC_ALL_SLOTS /* one past the last slot */ |
420 | }; |
421 | |
422 | /* each slot is 2 uint16_t instead of 1 */ |
/* 0x100 is bit 8 of the exceptions word. */
423 | #define UCASE_EXC_DOUBLE_SLOTS 0x100 |
424 | |
/*
 * Further single-bit flags of the exceptions word: bit 9 (0x200), bit 10 (0x400)
 * and bit 11 (0x800).
 * NOTE(review): UCASE_EXC_DELTA_IS_NEGATIVE presumably gives the sign of the value in
 * the UCASE_EXC_DELTA slot -- confirm in the implementation.
 */
425 | enum { |
426 | UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200, |
427 | UCASE_EXC_DELTA_IS_NEGATIVE=0x400, |
428 | UCASE_EXC_SENSITIVE=0x800 |
429 | }; |
430 | |
431 | /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */ |
/* With UCASE_DOT_MASK being 0x60, that is 0x60<<7 = 0x3000, i.e. bits 12..13 of the exceptions word. */
432 | #define UCASE_EXC_DOT_SHIFT 7 |
433 | |
434 | /* normally stored in the main word, but pushed out for larger exception indexes */ |
435 | #define UCASE_EXC_DOT_MASK 0x3000 |
/* Each value equals the corresponding main-word dot constant shifted left by UCASE_EXC_DOT_SHIFT (0x20, 0x40, 0x60 become 0x1000, 0x2000, 0x3000). */
436 | enum { |
437 | UCASE_EXC_NO_DOT=0, |
438 | UCASE_EXC_SOFT_DOTTED=0x1000, |
439 | UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */ |
440 | UCASE_EXC_OTHER_ACCENT=0x3000 /* other character (0<cc!=230) */ |
441 | }; |
442 | |
443 | /* complex/conditional mappings */ |
/* The two top bits of the 16-bit exceptions word: bit 14 (0x4000) and bit 15 (0x8000). */
444 | #define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000 |
445 | #define UCASE_EXC_CONDITIONAL_FOLD 0x8000 |
446 | |
447 | /* definitions for lengths word for full case mappings */ |
/*
 * Each mask selects one 4-bit field of the 16-bit lengths word (bits 0..3, 4..7,
 * 8..11 and 12..15), so a single length is at most 0xf, i.e. 15.
 */
448 | #define UCASE_FULL_LOWER 0xf |
449 | #define UCASE_FULL_FOLDING 0xf0 |
450 | #define UCASE_FULL_UPPER 0xf00 |
451 | #define UCASE_FULL_TITLE 0xf000 |
452 | |
453 | /* maximum lengths */ |
/* 4*0xf is 60: four full mapping strings of at most 15 units each. The closure maximum 0xf is 15. */
454 | #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf) |
455 | #define UCASE_CLOSURE_MAX_LENGTH 0xf |
456 | |
457 | /* constants for reverse case folding ("unfold") data */ |
/*
 * Implicit values 0, 1 and 2 in declaration order.
 * NOTE(review): presumably these are indexes of header values at the start of the
 * unfold array (number of rows, row width, string width) -- confirm in the
 * implementation.
 */
458 | enum { |
459 | UCASE_UNFOLD_ROWS, |
460 | UCASE_UNFOLD_ROW_WIDTH, |
461 | UCASE_UNFOLD_STRING_WIDTH |
462 | }; |
463 | |
464 | #endif |
465 | |