// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// characterproperties.cpp
// created: 2018sep03 Markus W. Scherer
6
7#include "unicode/utypes.h"
8#include "unicode/localpointer.h"
9#include "unicode/uchar.h"
10#include "unicode/ucpmap.h"
11#include "unicode/ucptrie.h"
12#include "unicode/umutablecptrie.h"
13#include "unicode/uniset.h"
14#include "unicode/uscript.h"
15#include "unicode/uset.h"
16#include "cmemory.h"
17#include "emojiprops.h"
18#include "mutex.h"
19#include "normalizer2impl.h"
20#include "uassert.h"
21#include "ubidi_props.h"
22#include "ucase.h"
23#include "ucln_cmn.h"
24#include "umutex.h"
25#include "uprops.h"
26
27using icu::LocalPointer;
28#if !UCONFIG_NO_NORMALIZATION
29using icu::Normalizer2Factory;
30using icu::Normalizer2Impl;
31#endif
32using icu::UInitOnce;
33using icu::UnicodeSet;
34
35namespace {
36
37UBool U_CALLCONV characterproperties_cleanup();
38
39constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
40
41struct Inclusion {
42 UnicodeSet *fSet = nullptr;
43 UInitOnce fInitOnce {};
44};
45Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
46
47UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
48
49UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
50
51icu::UMutex cpMutex;
52
//----------------------------------------------------------------
// Inclusions list
//----------------------------------------------------------------

// USetAdder implementation
// Does not use uset.h to reduce code dependencies
59void U_CALLCONV
60_set_add(USet *set, UChar32 c) {
61 ((UnicodeSet *)set)->add(c);
62}
63
64void U_CALLCONV
65_set_addRange(USet *set, UChar32 start, UChar32 end) {
66 ((UnicodeSet *)set)->add(start, end);
67}
68
69void U_CALLCONV
70_set_addString(USet *set, const char16_t *str, int32_t length) {
71 ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
72}
73
74UBool U_CALLCONV characterproperties_cleanup() {
75 for (Inclusion &in: gInclusions) {
76 delete in.fSet;
77 in.fSet = nullptr;
78 in.fInitOnce.reset();
79 }
80 for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
81 delete sets[i];
82 sets[i] = nullptr;
83 }
84 for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
85 ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
86 maps[i] = nullptr;
87 }
88 return true;
89}
90
91void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
92 // This function is invoked only via umtx_initOnce().
93 U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
94 if (src == UPROPS_SRC_NONE) {
95 errorCode = U_INTERNAL_PROGRAM_ERROR;
96 return;
97 }
98 U_ASSERT(gInclusions[src].fSet == nullptr);
99
100 LocalPointer<UnicodeSet> incl(new UnicodeSet());
101 if (incl.isNull()) {
102 errorCode = U_MEMORY_ALLOCATION_ERROR;
103 return;
104 }
105 USetAdder sa = {
106 (USet *)incl.getAlias(),
107 _set_add,
108 _set_addRange,
109 _set_addString,
110 nullptr, // don't need remove()
111 nullptr // don't need removeRange()
112 };
113
114 switch(src) {
115 case UPROPS_SRC_CHAR:
116 uchar_addPropertyStarts(&sa, &errorCode);
117 break;
118 case UPROPS_SRC_PROPSVEC:
119 upropsvec_addPropertyStarts(&sa, &errorCode);
120 break;
121 case UPROPS_SRC_CHAR_AND_PROPSVEC:
122 uchar_addPropertyStarts(&sa, &errorCode);
123 upropsvec_addPropertyStarts(&sa, &errorCode);
124 break;
125#if !UCONFIG_NO_NORMALIZATION
126 case UPROPS_SRC_CASE_AND_NORM: {
127 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
128 if(U_SUCCESS(errorCode)) {
129 impl->addPropertyStarts(&sa, errorCode);
130 }
131 ucase_addPropertyStarts(&sa, &errorCode);
132 break;
133 }
134 case UPROPS_SRC_NFC: {
135 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
136 if(U_SUCCESS(errorCode)) {
137 impl->addPropertyStarts(&sa, errorCode);
138 }
139 break;
140 }
141 case UPROPS_SRC_NFKC: {
142 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
143 if(U_SUCCESS(errorCode)) {
144 impl->addPropertyStarts(&sa, errorCode);
145 }
146 break;
147 }
148 case UPROPS_SRC_NFKC_CF: {
149 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
150 if(U_SUCCESS(errorCode)) {
151 impl->addPropertyStarts(&sa, errorCode);
152 }
153 break;
154 }
155 case UPROPS_SRC_NFC_CANON_ITER: {
156 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
157 if(U_SUCCESS(errorCode)) {
158 impl->addCanonIterPropertyStarts(&sa, errorCode);
159 }
160 break;
161 }
162#endif
163 case UPROPS_SRC_CASE:
164 ucase_addPropertyStarts(&sa, &errorCode);
165 break;
166 case UPROPS_SRC_BIDI:
167 ubidi_addPropertyStarts(&sa, &errorCode);
168 break;
169 case UPROPS_SRC_INPC:
170 case UPROPS_SRC_INSC:
171 case UPROPS_SRC_VO:
172 uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
173 break;
174 case UPROPS_SRC_EMOJI: {
175 const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
176 if (U_SUCCESS(errorCode)) {
177 ep->addPropertyStarts(&sa, errorCode);
178 }
179 break;
180 }
181 default:
182 errorCode = U_INTERNAL_PROGRAM_ERROR;
183 break;
184 }
185
186 if (U_FAILURE(errorCode)) {
187 return;
188 }
189 if (incl->isBogus()) {
190 errorCode = U_MEMORY_ALLOCATION_ERROR;
191 return;
192 }
193 // Compact for caching.
194 incl->compact();
195 gInclusions[src].fSet = incl.orphan();
196 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
197}
198
199const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
200 if (U_FAILURE(errorCode)) { return nullptr; }
201 if (src < 0 || UPROPS_SRC_COUNT <= src) {
202 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
203 return nullptr;
204 }
205 Inclusion &i = gInclusions[src];
206 umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
207 return i.fSet;
208}
209
210void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
211 // This function is invoked only via umtx_initOnce().
212 U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
213 int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
214 U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
215 UPropertySource src = uprops_getSource(prop);
216 const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
217 if (U_FAILURE(errorCode)) {
218 return;
219 }
220
221 LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
222 if (intPropIncl.isNull()) {
223 errorCode = U_MEMORY_ALLOCATION_ERROR;
224 return;
225 }
226 int32_t numRanges = incl->getRangeCount();
227 int32_t prevValue = 0;
228 for (int32_t i = 0; i < numRanges; ++i) {
229 UChar32 rangeEnd = incl->getRangeEnd(i);
230 for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
231 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
232 int32_t value = u_getIntPropertyValue(c, prop);
233 if (value != prevValue) {
234 intPropIncl->add(c);
235 prevValue = value;
236 }
237 }
238 }
239
240 if (intPropIncl->isBogus()) {
241 errorCode = U_MEMORY_ALLOCATION_ERROR;
242 return;
243 }
244 // Compact for caching.
245 intPropIncl->compact();
246 gInclusions[inclIndex].fSet = intPropIncl.orphan();
247 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
248}
249
250} // namespace
251
252U_NAMESPACE_BEGIN
253
254const UnicodeSet *CharacterProperties::getInclusionsForProperty(
255 UProperty prop, UErrorCode &errorCode) {
256 if (U_FAILURE(errorCode)) { return nullptr; }
257 if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
258 int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
259 Inclusion &i = gInclusions[inclIndex];
260 umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
261 return i.fSet;
262 } else {
263 UPropertySource src = uprops_getSource(prop);
264 return getInclusionsForSource(src, errorCode);
265 }
266}
267
268U_NAMESPACE_END
269
270namespace {
271
272UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
273 if (U_FAILURE(errorCode)) { return nullptr; }
274 LocalPointer<UnicodeSet> set(new UnicodeSet());
275 if (set.isNull()) {
276 errorCode = U_MEMORY_ALLOCATION_ERROR;
277 return nullptr;
278 }
279 if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
280 // property of strings
281 const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
282 if (U_FAILURE(errorCode)) { return nullptr; }
283 USetAdder sa = {
284 (USet *)set.getAlias(),
285 _set_add,
286 _set_addRange,
287 _set_addString,
288 nullptr, // don't need remove()
289 nullptr // don't need removeRange()
290 };
291 ep->addStrings(&sa, property, errorCode);
292 if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
293 // property of _only_ strings
294 set->freeze();
295 return set.orphan();
296 }
297 }
298
299 const UnicodeSet *inclusions =
300 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
301 if (U_FAILURE(errorCode)) { return nullptr; }
302 int32_t numRanges = inclusions->getRangeCount();
303 UChar32 startHasProperty = -1;
304
305 for (int32_t i = 0; i < numRanges; ++i) {
306 UChar32 rangeEnd = inclusions->getRangeEnd(i);
307 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
308 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
309 if (u_hasBinaryProperty(c, property)) {
310 if (startHasProperty < 0) {
311 // Transition from false to true.
312 startHasProperty = c;
313 }
314 } else if (startHasProperty >= 0) {
315 // Transition from true to false.
316 set->add(startHasProperty, c - 1);
317 startHasProperty = -1;
318 }
319 }
320 }
321 if (startHasProperty >= 0) {
322 set->add(startHasProperty, 0x10FFFF);
323 }
324 set->freeze();
325 return set.orphan();
326}
327
328UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
329 if (U_FAILURE(errorCode)) { return nullptr; }
330 uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
331 icu::LocalUMutableCPTriePointer mutableTrie(
332 umutablecptrie_open(nullValue, nullValue, &errorCode));
333 const UnicodeSet *inclusions =
334 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
335 if (U_FAILURE(errorCode)) { return nullptr; }
336 int32_t numRanges = inclusions->getRangeCount();
337 UChar32 start = 0;
338 uint32_t value = nullValue;
339
340 for (int32_t i = 0; i < numRanges; ++i) {
341 UChar32 rangeEnd = inclusions->getRangeEnd(i);
342 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
343 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
344 uint32_t nextValue = u_getIntPropertyValue(c, property);
345 if (value != nextValue) {
346 if (value != nullValue) {
347 umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
348 }
349 start = c;
350 value = nextValue;
351 }
352 }
353 }
354 if (value != 0) {
355 umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
356 }
357
358 UCPTrieType type;
359 if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
360 type = UCPTRIE_TYPE_FAST;
361 } else {
362 type = UCPTRIE_TYPE_SMALL;
363 }
364 UCPTrieValueWidth valueWidth;
365 // TODO: UCharacterProperty.IntProperty
366 int32_t max = u_getIntPropertyMaxValue(property);
367 if (max <= 0xff) {
368 valueWidth = UCPTRIE_VALUE_BITS_8;
369 } else if (max <= 0xffff) {
370 valueWidth = UCPTRIE_VALUE_BITS_16;
371 } else {
372 valueWidth = UCPTRIE_VALUE_BITS_32;
373 }
374 return reinterpret_cast<UCPMap *>(
375 umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
376}
377
378} // namespace
379
380U_NAMESPACE_BEGIN
381
382const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
383 if (U_FAILURE(errorCode)) { return nullptr; }
384 if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
385 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
386 return nullptr;
387 }
388 Mutex m(&cpMutex);
389 UnicodeSet *set = sets[property];
390 if (set == nullptr) {
391 sets[property] = set = makeSet(property, errorCode);
392 }
393 return set;
394}
395
396U_NAMESPACE_END
397
398U_NAMESPACE_USE
399
400U_CAPI const USet * U_EXPORT2
401u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
402 const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
403 return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
404}
405
406U_CAPI const UCPMap * U_EXPORT2
407u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
408 if (U_FAILURE(*pErrorCode)) { return nullptr; }
409 if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
410 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
411 return nullptr;
412 }
413 Mutex m(&cpMutex);
414 UCPMap *map = maps[property - UCHAR_INT_START];
415 if (map == nullptr) {
416 maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
417 }
418 return map;
419}
420