1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 2002-2016, International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: uprops.cpp |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * created on: 2002feb24 |
16 | * created by: Markus W. Scherer |
17 | * |
18 | * Implementations for mostly non-core Unicode character properties |
19 | * stored in uprops.icu. |
20 | * |
21 | * With the APIs implemented here, almost all properties files and |
22 | * their associated implementation files are used from this file, |
23 | * including those for normalization and case mappings. |
24 | */ |
25 | |
26 | #include "unicode/utypes.h" |
27 | #include "unicode/uchar.h" |
28 | #include "unicode/ucptrie.h" |
29 | #include "unicode/udata.h" |
30 | #include "unicode/unorm2.h" |
31 | #include "unicode/uscript.h" |
32 | #include "unicode/ustring.h" |
33 | #include "unicode/utf16.h" |
34 | #include "cstring.h" |
35 | #include "emojiprops.h" |
36 | #include "mutex.h" |
37 | #include "normalizer2impl.h" |
38 | #include "umutex.h" |
39 | #include "ubidi_props.h" |
40 | #include "uprops.h" |
41 | #include "ucase.h" |
42 | #include "ucln_cmn.h" |
43 | #include "ulayout_props.h" |
44 | #include "ustr_imp.h" |
45 | |
46 | U_NAMESPACE_USE |
47 | |
48 | // Unicode text layout properties data ----------------------------------------- |
49 | |
50 | namespace { |
51 | |
// File-local state for the layout properties data.
// All of it is filled in by ulayout_load() and released/reset by uprops_cleanup().

// Init-once guard passed to umtx_initOnce() in ulayout_ensureData().
icu::UInitOnce gLayoutInitOnce {};
// Handle returned by udata_openChoice() for the layout data item.
UDataMemory *gLayoutMemory = nullptr;

// Code point tries opened from the bytes of gLayoutMemory.
// A pointer stays nullptr if the corresponding trie was not opened.
UCPTrie *gInpcTrie = nullptr;  // Indic_Positional_Category
UCPTrie *gInscTrie = nullptr;  // Indic_Syllabic_Category
UCPTrie *gVoTrie = nullptr;  // Vertical_Orientation

// Maximum property values, unpacked from the ULAYOUT_IX_MAX_VALUES index.
int32_t gMaxInpcValue = 0;
int32_t gMaxInscValue = 0;
int32_t gMaxVoValue = 0;
62 | |
63 | UBool U_CALLCONV uprops_cleanup() { |
64 | udata_close(gLayoutMemory); |
65 | gLayoutMemory = nullptr; |
66 | |
67 | ucptrie_close(gInpcTrie); |
68 | gInpcTrie = nullptr; |
69 | ucptrie_close(gInscTrie); |
70 | gInscTrie = nullptr; |
71 | ucptrie_close(gVoTrie); |
72 | gVoTrie = nullptr; |
73 | |
74 | gMaxInpcValue = 0; |
75 | gMaxInscValue = 0; |
76 | gMaxVoValue = 0; |
77 | |
78 | gLayoutInitOnce.reset(); |
79 | return true; |
80 | } |
81 | |
82 | UBool U_CALLCONV |
83 | ulayout_isAcceptable(void * /*context*/, |
84 | const char * /* type */, const char * /*name*/, |
85 | const UDataInfo *pInfo) { |
86 | return pInfo->size >= 20 && |
87 | pInfo->isBigEndian == U_IS_BIG_ENDIAN && |
88 | pInfo->charsetFamily == U_CHARSET_FAMILY && |
89 | pInfo->dataFormat[0] == ULAYOUT_FMT_0 && |
90 | pInfo->dataFormat[1] == ULAYOUT_FMT_1 && |
91 | pInfo->dataFormat[2] == ULAYOUT_FMT_2 && |
92 | pInfo->dataFormat[3] == ULAYOUT_FMT_3 && |
93 | pInfo->formatVersion[0] == 1; |
94 | } |
95 | |
96 | // UInitOnce singleton initialization function |
97 | void U_CALLCONV ulayout_load(UErrorCode &errorCode) { |
98 | gLayoutMemory = udata_openChoice( |
99 | nullptr, ULAYOUT_DATA_TYPE, ULAYOUT_DATA_NAME, |
100 | ulayout_isAcceptable, nullptr, &errorCode); |
101 | if (U_FAILURE(errorCode)) { return; } |
102 | |
103 | const uint8_t *inBytes = (const uint8_t *)udata_getMemory(gLayoutMemory); |
104 | const int32_t *inIndexes = (const int32_t *)inBytes; |
105 | int32_t indexesLength = inIndexes[ULAYOUT_IX_INDEXES_LENGTH]; |
106 | if (indexesLength < 12) { |
107 | errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes. |
108 | return; |
109 | } |
110 | int32_t offset = indexesLength * 4; |
111 | int32_t top = inIndexes[ULAYOUT_IX_INPC_TRIE_TOP]; |
112 | int32_t trieSize = top - offset; |
113 | if (trieSize >= 16) { |
114 | gInpcTrie = ucptrie_openFromBinary( |
115 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, |
116 | inBytes + offset, trieSize, nullptr, &errorCode); |
117 | } |
118 | offset = top; |
119 | top = inIndexes[ULAYOUT_IX_INSC_TRIE_TOP]; |
120 | trieSize = top - offset; |
121 | if (trieSize >= 16) { |
122 | gInscTrie = ucptrie_openFromBinary( |
123 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, |
124 | inBytes + offset, trieSize, nullptr, &errorCode); |
125 | } |
126 | offset = top; |
127 | top = inIndexes[ULAYOUT_IX_VO_TRIE_TOP]; |
128 | trieSize = top - offset; |
129 | if (trieSize >= 16) { |
130 | gVoTrie = ucptrie_openFromBinary( |
131 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, |
132 | inBytes + offset, trieSize, nullptr, &errorCode); |
133 | } |
134 | |
135 | uint32_t maxValues = inIndexes[ULAYOUT_IX_MAX_VALUES]; |
136 | gMaxInpcValue = maxValues >> ULAYOUT_MAX_INPC_SHIFT; |
137 | gMaxInscValue = (maxValues >> ULAYOUT_MAX_INSC_SHIFT) & 0xff; |
138 | gMaxVoValue = (maxValues >> ULAYOUT_MAX_VO_SHIFT) & 0xff; |
139 | |
140 | ucln_common_registerCleanup(UCLN_COMMON_UPROPS, uprops_cleanup); |
141 | } |
142 | |
143 | UBool ulayout_ensureData(UErrorCode &errorCode) { |
144 | if (U_FAILURE(errorCode)) { return false; } |
145 | umtx_initOnce(gLayoutInitOnce, &ulayout_load, errorCode); |
146 | return U_SUCCESS(errorCode); |
147 | } |
148 | |
149 | UBool ulayout_ensureData() { |
150 | UErrorCode errorCode = U_ZERO_ERROR; |
151 | return ulayout_ensureData(errorCode); |
152 | } |
153 | |
154 | } // namespace |
155 | |
156 | /* general properties API functions ----------------------------------------- */ |
157 | |
158 | struct BinaryProperty; |
159 | |
160 | typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which); |
161 | |
162 | struct BinaryProperty { |
163 | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 |
164 | uint32_t mask; |
165 | BinaryPropertyContains *contains; |
166 | }; |
167 | |
168 | static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { |
169 | /* systematic, directly stored properties */ |
170 | return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0; |
171 | } |
172 | |
173 | static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
174 | return static_cast<UBool>(ucase_hasBinaryProperty(c, which)); |
175 | } |
176 | |
177 | static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
178 | return ubidi_isBidiControl(c); |
179 | } |
180 | |
181 | static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
182 | return ubidi_isMirrored(c); |
183 | } |
184 | |
185 | static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
186 | return ubidi_isJoinControl(c); |
187 | } |
188 | |
189 | #if UCONFIG_NO_NORMALIZATION |
190 | static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) { |
191 | return false; |
192 | } |
193 | #else |
194 | static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
195 | // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. |
196 | UErrorCode errorCode=U_ZERO_ERROR; |
197 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
198 | return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); |
199 | } |
200 | #endif |
201 | |
202 | // UCHAR_NF*_INERT properties |
203 | #if UCONFIG_NO_NORMALIZATION |
204 | static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) { |
205 | return false; |
206 | } |
207 | #else |
208 | static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
209 | UErrorCode errorCode=U_ZERO_ERROR; |
210 | const Normalizer2 *norm2=Normalizer2Factory::getInstance( |
211 | (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode); |
212 | return U_SUCCESS(errorCode) && norm2->isInert(c); |
213 | } |
214 | #endif |
215 | |
216 | #if UCONFIG_NO_NORMALIZATION |
217 | static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) { |
218 | return false; |
219 | } |
220 | #else |
221 | static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
222 | UnicodeString nfd; |
223 | UErrorCode errorCode=U_ZERO_ERROR; |
224 | const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); |
225 | if(U_FAILURE(errorCode)) { |
226 | return false; |
227 | } |
228 | if(nfcNorm2->getDecomposition(c, nfd)) { |
229 | /* c has a decomposition */ |
230 | if(nfd.length()==1) { |
231 | c=nfd[0]; /* single BMP code point */ |
232 | } else if(nfd.length()<=U16_MAX_LENGTH && |
233 | nfd.length()==U16_LENGTH(c=nfd.char32At(0)) |
234 | ) { |
235 | /* single supplementary code point */ |
236 | } else { |
237 | c=U_SENTINEL; |
238 | } |
239 | } else if(c<0) { |
240 | return false; /* protect against bad input */ |
241 | } |
242 | if(c>=0) { |
243 | /* single code point */ |
244 | const char16_t *resultString; |
245 | return (UBool)(ucase_toFullFolding(c, &resultString, U_FOLD_CASE_DEFAULT)>=0); |
246 | } else { |
247 | /* guess some large but stack-friendly capacity */ |
248 | char16_t dest[2*UCASE_MAX_STRING_LENGTH]; |
249 | int32_t destLength; |
250 | destLength=u_strFoldCase(dest, UPRV_LENGTHOF(dest), |
251 | nfd.getBuffer(), nfd.length(), |
252 | U_FOLD_CASE_DEFAULT, &errorCode); |
253 | return (UBool)(U_SUCCESS(errorCode) && |
254 | 0!=u_strCompare(nfd.getBuffer(), nfd.length(), |
255 | dest, destLength, false)); |
256 | } |
257 | } |
258 | #endif |
259 | |
260 | #if UCONFIG_NO_NORMALIZATION |
261 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) { |
262 | return false; |
263 | } |
264 | #else |
265 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
266 | UErrorCode errorCode=U_ZERO_ERROR; |
267 | const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode); |
268 | if(U_FAILURE(errorCode)) { |
269 | return false; |
270 | } |
271 | UnicodeString src(c); |
272 | UnicodeString dest; |
273 | { |
274 | // The ReorderingBuffer must be in a block because its destructor |
275 | // needs to release dest's buffer before we look at its contents. |
276 | ReorderingBuffer buffer(*kcf, dest); |
277 | // Small destCapacity for NFKC_CF(c). |
278 | if(buffer.init(5, errorCode)) { |
279 | const char16_t *srcArray=src.getBuffer(); |
280 | kcf->compose(srcArray, srcArray+src.length(), false, |
281 | true, buffer, errorCode); |
282 | } |
283 | } |
284 | return U_SUCCESS(errorCode) && dest!=src; |
285 | } |
286 | #endif |
287 | |
288 | #if UCONFIG_NO_NORMALIZATION |
289 | static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) { |
290 | return false; |
291 | } |
292 | #else |
293 | static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
294 | UErrorCode errorCode=U_ZERO_ERROR; |
295 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
296 | return |
297 | U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) && |
298 | impl->isCanonSegmentStarter(c); |
299 | } |
300 | #endif |
301 | |
302 | static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
303 | return u_isalnumPOSIX(c); |
304 | } |
305 | |
306 | static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
307 | return u_isblank(c); |
308 | } |
309 | |
310 | static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
311 | return u_isgraphPOSIX(c); |
312 | } |
313 | |
314 | static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
315 | return u_isprintPOSIX(c); |
316 | } |
317 | |
318 | static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
319 | return u_isxdigit(c); |
320 | } |
321 | |
322 | static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
323 | // Property starts are a subset of lb=RI etc. |
324 | return 0x1F1E6<=c && c<=0x1F1FF; |
325 | } |
326 | |
327 | static UBool hasEmojiProperty(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
328 | return EmojiProps::hasBinaryProperty(c, which); |
329 | } |
330 | |
// Dispatch table for binary properties, indexed directly by the UProperty value
// (see u_hasBinaryProperty(), u_getIntPropertyValue() and uprops_getSource()).
// Each row is { column-or-source, mask, contains-function }.
static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
    /*
     * column and mask values for binary properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per binary UProperty.
     *
     * Properties with mask==0 are handled in code.
     * For them, column is the UPropertySource value.
     */
    { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains },
    { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains },
    { UPROPS_SRC_BIDI, 0, isBidiControl },
    { UPROPS_SRC_BIDI, 0, isMirrored },
    { 1, U_MASK(UPROPS_DASH), defaultContains },
    { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains },
    { 1, U_MASK(UPROPS_DEPRECATED), defaultContains },
    { 1, U_MASK(UPROPS_DIACRITIC), defaultContains },
    { 1, U_MASK(UPROPS_EXTENDER), defaultContains },
    { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion },
    { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains },
    { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains },
    { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains },
    { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains },
    { 1, U_MASK(UPROPS_HYPHEN), defaultContains },
    { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains },
    { 1, U_MASK(UPROPS_ID_START), defaultContains },
    { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains },
    { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains },
    { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains },
    { UPROPS_SRC_BIDI, 0, isJoinControl },
    { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_LOWERCASE
    { 1, U_MASK(UPROPS_MATH), defaultContains },
    { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains },
    { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains },
    { 1, U_MASK(UPROPS_RADICAL), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_SOFT_DOTTED
    { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains },
    { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_UPPERCASE
    { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains },
    { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains },
    { 1, U_MASK(UPROPS_XID_START), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CASE_SENSITIVE
    { 1, U_MASK(UPROPS_S_TERM), defaultContains },
    { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains },
    { UPROPS_SRC_NFC, 0, isNormInert },  // UCHAR_NFD_INERT
    { UPROPS_SRC_NFKC, 0, isNormInert },  // UCHAR_NFKD_INERT
    { UPROPS_SRC_NFC, 0, isNormInert },  // UCHAR_NFC_INERT
    { UPROPS_SRC_NFKC, 0, isNormInert },  // UCHAR_NFKC_INERT
    { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter },
    { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains },
    { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains },
    { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum },
    { UPROPS_SRC_CHAR, 0, isPOSIX_blank },
    { UPROPS_SRC_CHAR, 0, isPOSIX_graph },
    { UPROPS_SRC_CHAR, 0, isPOSIX_print },
    { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CASED
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CASE_IGNORABLE
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_LOWERCASED
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_UPPERCASED
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_TITLECASED
    { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_CASEMAPPED
    { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded },
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_EMOJI
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_EMOJI_PRESENTATION
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_EMOJI_MODIFIER
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_EMOJI_MODIFIER_BASE
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_EMOJI_COMPONENT
    // mask==0, so the literal 2 here is a UPropertySource value, not a properties vector column.
    { 2, 0, isRegionalIndicator },
    { 1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains },
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_EXTENDED_PICTOGRAPHIC
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_BASIC_EMOJI
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_EMOJI_KEYCAP_SEQUENCE
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI_FLAG_SEQUENCE
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI_TAG_SEQUENCE
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE
    { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI
};
413 | |
414 | U_CAPI UBool U_EXPORT2 |
415 | u_hasBinaryProperty(UChar32 c, UProperty which) { |
416 | /* c is range-checked in the functions that are called from here */ |
417 | if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { |
418 | /* not a known binary property */ |
419 | return false; |
420 | } else { |
421 | const BinaryProperty &prop=binProps[which]; |
422 | return prop.contains(prop, c, which); |
423 | } |
424 | } |
425 | |
426 | /* Checks if the Unicode character can start a Unicode identifier.*/ |
427 | U_CAPI UBool U_EXPORT2 |
428 | u_isIDStart(UChar32 c) { |
429 | return u_hasBinaryProperty(c, UCHAR_ID_START); |
430 | } |
431 | |
432 | /* Checks if the Unicode character can be a Unicode identifier part other than starting the |
433 | identifier.*/ |
434 | U_CAPI UBool U_EXPORT2 |
435 | u_isIDPart(UChar32 c) { |
436 | return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE); |
437 | } |
438 | |
439 | U_CAPI UBool U_EXPORT2 |
440 | u_stringHasBinaryProperty(const char16_t *s, int32_t length, UProperty which) { |
441 | if (s == nullptr && length != 0) { return false; } |
442 | if (length == 1) { |
443 | return u_hasBinaryProperty(s[0], which); // single code point |
444 | } else if (length == 2 || (length < 0 && *s != 0)) { // not empty string |
445 | // first code point |
446 | int32_t i = 0; |
447 | UChar32 c; |
448 | U16_NEXT(s, i, length, c); |
449 | if (length > 0 ? i == length : s[i] == 0) { |
450 | return u_hasBinaryProperty(c, which); // single code point |
451 | } |
452 | } |
453 | // Only call into EmojiProps for a relevant property, |
454 | // so that we not unnecessarily try to load its data file. |
455 | return UCHAR_BASIC_EMOJI <= which && which <= UCHAR_RGI_EMOJI && |
456 | EmojiProps::hasBinaryProperty(s, length, which); |
457 | } |
458 | |
459 | struct IntProperty; |
460 | |
461 | typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which); |
462 | typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which); |
463 | |
464 | struct IntProperty { |
465 | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 |
466 | uint32_t mask; |
467 | int32_t shift; // =maxValue if getMaxValueFromShift() is used |
468 | IntPropertyGetValue *getValue; |
469 | IntPropertyGetMaxValue *getMaxValue; |
470 | }; |
471 | |
472 | static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) { |
473 | /* systematic, directly stored properties */ |
474 | return (int32_t)(u_getUnicodeProperties(c, prop.column)&prop.mask)>>prop.shift; |
475 | } |
476 | |
477 | static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) { |
478 | return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift; |
479 | } |
480 | |
481 | static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) { |
482 | return prop.shift; |
483 | } |
484 | |
485 | static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
486 | return (int32_t)u_charDirection(c); |
487 | } |
488 | |
489 | static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
490 | return (int32_t)ubidi_getPairedBracketType(c); |
491 | } |
492 | |
493 | static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
494 | return ubidi_getMaxValue(which); |
495 | } |
496 | |
497 | #if UCONFIG_NO_NORMALIZATION |
498 | static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) { |
499 | return 0; |
500 | } |
501 | #else |
502 | static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
503 | return u_getCombiningClass(c); |
504 | } |
505 | #endif |
506 | |
507 | static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
508 | return (int32_t)u_charType(c); |
509 | } |
510 | |
511 | static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
512 | return ubidi_getJoiningGroup(c); |
513 | } |
514 | |
515 | static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
516 | return ubidi_getJoiningType(c); |
517 | } |
518 | |
519 | static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
520 | int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c)); |
521 | return UPROPS_NTV_GET_TYPE(ntv); |
522 | } |
523 | |
524 | static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
525 | UErrorCode errorCode=U_ZERO_ERROR; |
526 | return (int32_t)uscript_getScript(c, &errorCode); |
527 | } |
528 | |
529 | static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) { |
530 | uint32_t scriptX=uprv_getMaxValues(0)&UPROPS_SCRIPT_X_MASK; |
531 | return uprops_mergeScriptCodeOrIndex(scriptX); |
532 | } |
533 | |
/*
 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
 *
 * The array is indexed by the Grapheme_Cluster_Break value; the comment on
 * each row names the value for that index. getHangulSyllableType() returns
 * U_HST_NOT_APPLICABLE for any value at or beyond the end of the array.
 */
static const UHangulSyllableType gcbToHst[]={
    U_HST_NOT_APPLICABLE,   /* U_GCB_OTHER */
    U_HST_NOT_APPLICABLE,   /* U_GCB_CONTROL */
    U_HST_NOT_APPLICABLE,   /* U_GCB_CR */
    U_HST_NOT_APPLICABLE,   /* U_GCB_EXTEND */
    U_HST_LEADING_JAMO,     /* U_GCB_L */
    U_HST_NOT_APPLICABLE,   /* U_GCB_LF */
    U_HST_LV_SYLLABLE,      /* U_GCB_LV */
    U_HST_LVT_SYLLABLE,     /* U_GCB_LVT */
    U_HST_TRAILING_JAMO,    /* U_GCB_T */
    U_HST_VOWEL_JAMO        /* U_GCB_V */
    /*
     * Omit GCB values beyond what we need for hst.
     * The code below checks for the array length.
     */
};
554 | |
555 | static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
556 | /* see comments on gcbToHst[] above */ |
557 | int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT; |
558 | if(gcb<UPRV_LENGTHOF(gcbToHst)) { |
559 | return gcbToHst[gcb]; |
560 | } else { |
561 | return U_HST_NOT_APPLICABLE; |
562 | } |
563 | } |
564 | |
565 | #if UCONFIG_NO_NORMALIZATION |
566 | static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) { |
567 | return 0; |
568 | } |
569 | #else |
570 | static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) { |
571 | return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD)); |
572 | } |
573 | #endif |
574 | |
575 | #if UCONFIG_NO_NORMALIZATION |
576 | static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) { |
577 | return 0; |
578 | } |
579 | #else |
580 | static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
581 | return unorm_getFCD16(c)>>8; |
582 | } |
583 | #endif |
584 | |
585 | #if UCONFIG_NO_NORMALIZATION |
586 | static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { |
587 | return 0; |
588 | } |
589 | #else |
590 | static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
591 | return unorm_getFCD16(c)&0xff; |
592 | } |
593 | #endif |
594 | |
595 | static int32_t getInPC(const IntProperty &, UChar32 c, UProperty) { |
596 | return ulayout_ensureData() && gInpcTrie != nullptr ? ucptrie_get(gInpcTrie, c) : 0; |
597 | } |
598 | |
599 | static int32_t getInSC(const IntProperty &, UChar32 c, UProperty) { |
600 | return ulayout_ensureData() && gInscTrie != nullptr ? ucptrie_get(gInscTrie, c) : 0; |
601 | } |
602 | |
603 | static int32_t getVo(const IntProperty &, UChar32 c, UProperty) { |
604 | return ulayout_ensureData() && gVoTrie != nullptr ? ucptrie_get(gVoTrie, c) : 0; |
605 | } |
606 | |
607 | static int32_t layoutGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
608 | if (!ulayout_ensureData()) { return 0; } |
609 | switch (which) { |
610 | case UCHAR_INDIC_POSITIONAL_CATEGORY: |
611 | return gMaxInpcValue; |
612 | case UCHAR_INDIC_SYLLABIC_CATEGORY: |
613 | return gMaxInscValue; |
614 | case UCHAR_VERTICAL_ORIENTATION: |
615 | return gMaxVoValue; |
616 | default: |
617 | return 0; |
618 | } |
619 | } |
620 | |
// Dispatch table for int-valued properties, indexed by which-UCHAR_INT_START
// (see u_getIntPropertyValue(), u_getIntPropertyMaxValue() and uprops_getSource()).
// Each row is { column-or-source, mask, shift-or-max, getValue, getMaxValue }.
static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
    /*
     * column, mask and shift values for int-value properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per int UProperty.
     *
     * Properties with mask==0 are handled in code.
     * For them, column is the UPropertySource value.
     *
     * Rows that use getMaxValueFromShift carry the maximum value
     * in the shift field.
     */
    { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
    { 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },
    { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },
    { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_CHAR, 0, (int32_t)U_CHAR_CATEGORY_COUNT-1,getGeneralCategory, getMaxValueFromShift },
    { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue },
    { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue },
    { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_CHAR, 0, (int32_t)U_NT_COUNT-1, getNumericType, getMaxValueFromShift },
    { UPROPS_SRC_PROPSVEC, 0, 0, getScript, scriptGetMaxValue },
    { UPROPS_SRC_PROPSVEC, 0, (int32_t)U_HST_COUNT-1, getHangulSyllableType, getMaxValueFromShift },
    // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
    { UPROPS_SRC_NFC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
    { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE
    { UPROPS_SRC_NFC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE
    { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift },
    { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift },
    { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift },
    { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue },
    { UPROPS_SRC_INPC, 0, 0, getInPC, layoutGetMaxValue },
    { UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue },
    { UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue },
};
660 | |
661 | U_CAPI int32_t U_EXPORT2 |
662 | u_getIntPropertyValue(UChar32 c, UProperty which) { |
663 | if(which<UCHAR_INT_START) { |
664 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
665 | const BinaryProperty &prop=binProps[which]; |
666 | return prop.contains(prop, c, which); |
667 | } |
668 | } else if(which<UCHAR_INT_LIMIT) { |
669 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
670 | return prop.getValue(prop, c, which); |
671 | } else if(which==UCHAR_GENERAL_CATEGORY_MASK) { |
672 | return U_MASK(u_charType(c)); |
673 | } |
674 | return 0; // undefined |
675 | } |
676 | |
677 | U_CAPI int32_t U_EXPORT2 |
678 | u_getIntPropertyMinValue(UProperty /*which*/) { |
679 | return 0; /* all binary/enum/int properties have a minimum value of 0 */ |
680 | } |
681 | |
682 | U_CAPI int32_t U_EXPORT2 |
683 | u_getIntPropertyMaxValue(UProperty which) { |
684 | if(which<UCHAR_INT_START) { |
685 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
686 | return 1; // maximum true for all binary properties |
687 | } |
688 | } else if(which<UCHAR_INT_LIMIT) { |
689 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
690 | return prop.getMaxValue(prop, which); |
691 | } |
692 | return -1; // undefined |
693 | } |
694 | |
695 | U_CFUNC UPropertySource U_EXPORT2 |
696 | uprops_getSource(UProperty which) { |
697 | if(which<UCHAR_BINARY_START) { |
698 | return UPROPS_SRC_NONE; /* undefined */ |
699 | } else if(which<UCHAR_BINARY_LIMIT) { |
700 | const BinaryProperty &prop=binProps[which]; |
701 | if(prop.mask!=0) { |
702 | return UPROPS_SRC_PROPSVEC; |
703 | } else { |
704 | return (UPropertySource)prop.column; |
705 | } |
706 | } else if(which<UCHAR_INT_START) { |
707 | return UPROPS_SRC_NONE; /* undefined */ |
708 | } else if(which<UCHAR_INT_LIMIT) { |
709 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
710 | if(prop.mask!=0) { |
711 | return UPROPS_SRC_PROPSVEC; |
712 | } else { |
713 | return (UPropertySource)prop.column; |
714 | } |
715 | } else if(which<UCHAR_STRING_START) { |
716 | switch(which) { |
717 | case UCHAR_GENERAL_CATEGORY_MASK: |
718 | case UCHAR_NUMERIC_VALUE: |
719 | return UPROPS_SRC_CHAR; |
720 | |
721 | default: |
722 | return UPROPS_SRC_NONE; |
723 | } |
724 | } else if(which<UCHAR_STRING_LIMIT) { |
725 | switch(which) { |
726 | case UCHAR_AGE: |
727 | return UPROPS_SRC_PROPSVEC; |
728 | |
729 | case UCHAR_BIDI_MIRRORING_GLYPH: |
730 | return UPROPS_SRC_BIDI; |
731 | |
732 | case UCHAR_CASE_FOLDING: |
733 | case UCHAR_LOWERCASE_MAPPING: |
734 | case UCHAR_SIMPLE_CASE_FOLDING: |
735 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: |
736 | case UCHAR_SIMPLE_TITLECASE_MAPPING: |
737 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: |
738 | case UCHAR_TITLECASE_MAPPING: |
739 | case UCHAR_UPPERCASE_MAPPING: |
740 | return UPROPS_SRC_CASE; |
741 | |
742 | case UCHAR_ISO_COMMENT: |
743 | case UCHAR_NAME: |
744 | case UCHAR_UNICODE_1_NAME: |
745 | return UPROPS_SRC_NAMES; |
746 | |
747 | default: |
748 | return UPROPS_SRC_NONE; |
749 | } |
750 | } else { |
751 | switch(which) { |
752 | case UCHAR_SCRIPT_EXTENSIONS: |
753 | return UPROPS_SRC_PROPSVEC; |
754 | default: |
755 | return UPROPS_SRC_NONE; /* undefined */ |
756 | } |
757 | } |
758 | } |
759 | |
760 | U_CFUNC void U_EXPORT2 |
761 | uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode) { |
762 | if (!ulayout_ensureData(*pErrorCode)) { return; } |
763 | const UCPTrie *trie; |
764 | switch (src) { |
765 | case UPROPS_SRC_INPC: |
766 | trie = gInpcTrie; |
767 | break; |
768 | case UPROPS_SRC_INSC: |
769 | trie = gInscTrie; |
770 | break; |
771 | case UPROPS_SRC_VO: |
772 | trie = gVoTrie; |
773 | break; |
774 | default: |
775 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
776 | return; |
777 | } |
778 | |
779 | if (trie == nullptr) { |
780 | *pErrorCode = U_MISSING_RESOURCE_ERROR; |
781 | return; |
782 | } |
783 | |
784 | // Add the start code point of each same-value range of the trie. |
785 | UChar32 start = 0, end; |
786 | while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, |
787 | nullptr, nullptr, nullptr)) >= 0) { |
788 | sa->add(sa->set, start); |
789 | start = end + 1; |
790 | } |
791 | } |
792 | |
793 | #if !UCONFIG_NO_NORMALIZATION |
794 | |
795 | U_CAPI int32_t U_EXPORT2 |
796 | u_getFC_NFKC_Closure(UChar32 c, char16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) { |
797 | if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { |
798 | return 0; |
799 | } |
800 | if(destCapacity<0 || (dest==nullptr && destCapacity>0)) { |
801 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
802 | return 0; |
803 | } |
804 | // Compute the FC_NFKC_Closure on the fly: |
805 | // We have the API for complete coverage of Unicode properties, although |
806 | // this value by itself is not useful via API. |
807 | // (What could be useful is a custom normalization table that combines |
808 | // case folding and NFKC.) |
809 | // For the derivation, see Unicode's DerivedNormalizationProps.txt. |
810 | const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode); |
811 | if(U_FAILURE(*pErrorCode)) { |
812 | return 0; |
813 | } |
814 | // first: b = NFKC(Fold(a)) |
815 | UnicodeString folded1String; |
816 | const char16_t *folded1; |
817 | int32_t folded1Length=ucase_toFullFolding(c, &folded1, U_FOLD_CASE_DEFAULT); |
818 | if(folded1Length<0) { |
819 | const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc); |
820 | if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) { |
821 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); // c does not change at all under CaseFolding+NFKC |
822 | } |
823 | folded1String.setTo(c); |
824 | } else { |
825 | if(folded1Length>UCASE_MAX_STRING_LENGTH) { |
826 | folded1String.setTo(folded1Length); |
827 | } else { |
828 | folded1String.setTo(false, folded1, folded1Length); |
829 | } |
830 | } |
831 | UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode); |
832 | // second: c = NFKC(Fold(b)) |
833 | UnicodeString folded2String(kc1); |
834 | UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode); |
835 | // if (c != b) add the mapping from a to c |
836 | if(U_FAILURE(*pErrorCode) || kc1==kc2) { |
837 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); |
838 | } else { |
839 | return kc2.extract(dest, destCapacity, *pErrorCode); |
840 | } |
841 | } |
842 | |
843 | #endif |
844 | |