1// © 2018 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4// characterproperties.cpp
5// created: 2018sep03 Markus W. Scherer
6
7#include "unicode/utypes.h"
8#include "unicode/localpointer.h"
9#include "unicode/uchar.h"
10#include "unicode/ucpmap.h"
11#include "unicode/ucptrie.h"
12#include "unicode/umutablecptrie.h"
13#include "unicode/uniset.h"
14#include "unicode/uscript.h"
15#include "unicode/uset.h"
16#include "cmemory.h"
17#include "mutex.h"
18#include "normalizer2impl.h"
19#include "uassert.h"
20#include "ubidi_props.h"
21#include "ucase.h"
22#include "ucln_cmn.h"
23#include "umutex.h"
24#include "uprops.h"
25
26using icu::LocalPointer;
27#if !UCONFIG_NO_NORMALIZATION
28using icu::Normalizer2Factory;
29using icu::Normalizer2Impl;
30#endif
31using icu::UInitOnce;
32using icu::UnicodeSet;
33
34namespace {
35
36UBool U_CALLCONV characterproperties_cleanup();
37
38constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
39
40struct Inclusion {
41 UnicodeSet *fSet = nullptr;
42 UInitOnce fInitOnce = U_INITONCE_INITIALIZER;
43};
44Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
45
46UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
47
48UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
49
50icu::UMutex cpMutex;
51
52//----------------------------------------------------------------
53// Inclusions list
54//----------------------------------------------------------------
55
56// USetAdder implementation
57// Does not use uset.h to reduce code dependencies
58void U_CALLCONV
59_set_add(USet *set, UChar32 c) {
60 ((UnicodeSet *)set)->add(c);
61}
62
63void U_CALLCONV
64_set_addRange(USet *set, UChar32 start, UChar32 end) {
65 ((UnicodeSet *)set)->add(start, end);
66}
67
68void U_CALLCONV
69_set_addString(USet *set, const UChar *str, int32_t length) {
70 ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
71}
72
73UBool U_CALLCONV characterproperties_cleanup() {
74 for (Inclusion &in: gInclusions) {
75 delete in.fSet;
76 in.fSet = nullptr;
77 in.fInitOnce.reset();
78 }
79 for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
80 delete sets[i];
81 sets[i] = nullptr;
82 }
83 for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
84 ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
85 maps[i] = nullptr;
86 }
87 return TRUE;
88}
89
90void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
91 // This function is invoked only via umtx_initOnce().
92 U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
93 if (src == UPROPS_SRC_NONE) {
94 errorCode = U_INTERNAL_PROGRAM_ERROR;
95 return;
96 }
97 U_ASSERT(gInclusions[src].fSet == nullptr);
98
99 LocalPointer<UnicodeSet> incl(new UnicodeSet());
100 if (incl.isNull()) {
101 errorCode = U_MEMORY_ALLOCATION_ERROR;
102 return;
103 }
104 USetAdder sa = {
105 (USet *)incl.getAlias(),
106 _set_add,
107 _set_addRange,
108 _set_addString,
109 nullptr, // don't need remove()
110 nullptr // don't need removeRange()
111 };
112
113 switch(src) {
114 case UPROPS_SRC_CHAR:
115 uchar_addPropertyStarts(&sa, &errorCode);
116 break;
117 case UPROPS_SRC_PROPSVEC:
118 upropsvec_addPropertyStarts(&sa, &errorCode);
119 break;
120 case UPROPS_SRC_CHAR_AND_PROPSVEC:
121 uchar_addPropertyStarts(&sa, &errorCode);
122 upropsvec_addPropertyStarts(&sa, &errorCode);
123 break;
124#if !UCONFIG_NO_NORMALIZATION
125 case UPROPS_SRC_CASE_AND_NORM: {
126 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
127 if(U_SUCCESS(errorCode)) {
128 impl->addPropertyStarts(&sa, errorCode);
129 }
130 ucase_addPropertyStarts(&sa, &errorCode);
131 break;
132 }
133 case UPROPS_SRC_NFC: {
134 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
135 if(U_SUCCESS(errorCode)) {
136 impl->addPropertyStarts(&sa, errorCode);
137 }
138 break;
139 }
140 case UPROPS_SRC_NFKC: {
141 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
142 if(U_SUCCESS(errorCode)) {
143 impl->addPropertyStarts(&sa, errorCode);
144 }
145 break;
146 }
147 case UPROPS_SRC_NFKC_CF: {
148 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
149 if(U_SUCCESS(errorCode)) {
150 impl->addPropertyStarts(&sa, errorCode);
151 }
152 break;
153 }
154 case UPROPS_SRC_NFC_CANON_ITER: {
155 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
156 if(U_SUCCESS(errorCode)) {
157 impl->addCanonIterPropertyStarts(&sa, errorCode);
158 }
159 break;
160 }
161#endif
162 case UPROPS_SRC_CASE:
163 ucase_addPropertyStarts(&sa, &errorCode);
164 break;
165 case UPROPS_SRC_BIDI:
166 ubidi_addPropertyStarts(&sa, &errorCode);
167 break;
168 case UPROPS_SRC_INPC:
169 case UPROPS_SRC_INSC:
170 case UPROPS_SRC_VO:
171 uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
172 break;
173 default:
174 errorCode = U_INTERNAL_PROGRAM_ERROR;
175 break;
176 }
177
178 if (U_FAILURE(errorCode)) {
179 return;
180 }
181 if (incl->isBogus()) {
182 errorCode = U_MEMORY_ALLOCATION_ERROR;
183 return;
184 }
185 // Compact for caching.
186 incl->compact();
187 gInclusions[src].fSet = incl.orphan();
188 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
189}
190
191const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
192 if (U_FAILURE(errorCode)) { return nullptr; }
193 if (src < 0 || UPROPS_SRC_COUNT <= src) {
194 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
195 return nullptr;
196 }
197 Inclusion &i = gInclusions[src];
198 umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
199 return i.fSet;
200}
201
202void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
203 // This function is invoked only via umtx_initOnce().
204 U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
205 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
206 U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
207 UPropertySource src = uprops_getSource(prop);
208 const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
209 if (U_FAILURE(errorCode)) {
210 return;
211 }
212
213 LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
214 if (intPropIncl.isNull()) {
215 errorCode = U_MEMORY_ALLOCATION_ERROR;
216 return;
217 }
218 int32_t numRanges = incl->getRangeCount();
219 int32_t prevValue = 0;
220 for (int32_t i = 0; i < numRanges; ++i) {
221 UChar32 rangeEnd = incl->getRangeEnd(i);
222 for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
223 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
224 int32_t value = u_getIntPropertyValue(c, prop);
225 if (value != prevValue) {
226 intPropIncl->add(c);
227 prevValue = value;
228 }
229 }
230 }
231
232 if (intPropIncl->isBogus()) {
233 errorCode = U_MEMORY_ALLOCATION_ERROR;
234 return;
235 }
236 // Compact for caching.
237 intPropIncl->compact();
238 gInclusions[inclIndex].fSet = intPropIncl.orphan();
239 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
240}
241
242} // namespace
243
244U_NAMESPACE_BEGIN
245
246const UnicodeSet *CharacterProperties::getInclusionsForProperty(
247 UProperty prop, UErrorCode &errorCode) {
248 if (U_FAILURE(errorCode)) { return nullptr; }
249 if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
250 int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
251 Inclusion &i = gInclusions[inclIndex];
252 umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
253 return i.fSet;
254 } else {
255 UPropertySource src = uprops_getSource(prop);
256 return getInclusionsForSource(src, errorCode);
257 }
258}
259
260U_NAMESPACE_END
261
262namespace {
263
264UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
265 if (U_FAILURE(errorCode)) { return nullptr; }
266 LocalPointer<UnicodeSet> set(new UnicodeSet());
267 if (set.isNull()) {
268 errorCode = U_MEMORY_ALLOCATION_ERROR;
269 return nullptr;
270 }
271 const UnicodeSet *inclusions =
272 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
273 if (U_FAILURE(errorCode)) { return nullptr; }
274 int32_t numRanges = inclusions->getRangeCount();
275 UChar32 startHasProperty = -1;
276
277 for (int32_t i = 0; i < numRanges; ++i) {
278 UChar32 rangeEnd = inclusions->getRangeEnd(i);
279 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
280 // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
281 if (u_hasBinaryProperty(c, property)) {
282 if (startHasProperty < 0) {
283 // Transition from false to true.
284 startHasProperty = c;
285 }
286 } else if (startHasProperty >= 0) {
287 // Transition from true to false.
288 set->add(startHasProperty, c - 1);
289 startHasProperty = -1;
290 }
291 }
292 }
293 if (startHasProperty >= 0) {
294 set->add(startHasProperty, 0x10FFFF);
295 }
296 set->freeze();
297 return set.orphan();
298}
299
300UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
301 if (U_FAILURE(errorCode)) { return nullptr; }
302 uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
303 icu::LocalUMutableCPTriePointer mutableTrie(
304 umutablecptrie_open(nullValue, nullValue, &errorCode));
305 const UnicodeSet *inclusions =
306 icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
307 if (U_FAILURE(errorCode)) { return nullptr; }
308 int32_t numRanges = inclusions->getRangeCount();
309 UChar32 start = 0;
310 uint32_t value = nullValue;
311
312 for (int32_t i = 0; i < numRanges; ++i) {
313 UChar32 rangeEnd = inclusions->getRangeEnd(i);
314 for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
315 // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
316 uint32_t nextValue = u_getIntPropertyValue(c, property);
317 if (value != nextValue) {
318 if (value != nullValue) {
319 umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
320 }
321 start = c;
322 value = nextValue;
323 }
324 }
325 }
326 if (value != 0) {
327 umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
328 }
329
330 UCPTrieType type;
331 if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
332 type = UCPTRIE_TYPE_FAST;
333 } else {
334 type = UCPTRIE_TYPE_SMALL;
335 }
336 UCPTrieValueWidth valueWidth;
337 // TODO: UCharacterProperty.IntProperty
338 int32_t max = u_getIntPropertyMaxValue(property);
339 if (max <= 0xff) {
340 valueWidth = UCPTRIE_VALUE_BITS_8;
341 } else if (max <= 0xffff) {
342 valueWidth = UCPTRIE_VALUE_BITS_16;
343 } else {
344 valueWidth = UCPTRIE_VALUE_BITS_32;
345 }
346 return reinterpret_cast<UCPMap *>(
347 umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
348}
349
350} // namespace
351
352U_NAMESPACE_USE
353
354U_CAPI const USet * U_EXPORT2
355u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
356 if (U_FAILURE(*pErrorCode)) { return nullptr; }
357 if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
358 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
359 return nullptr;
360 }
361 Mutex m(&cpMutex);
362 UnicodeSet *set = sets[property];
363 if (set == nullptr) {
364 sets[property] = set = makeSet(property, *pErrorCode);
365 }
366 if (U_FAILURE(*pErrorCode)) { return nullptr; }
367 return set->toUSet();
368}
369
370U_CAPI const UCPMap * U_EXPORT2
371u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
372 if (U_FAILURE(*pErrorCode)) { return nullptr; }
373 if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
374 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
375 return nullptr;
376 }
377 Mutex m(&cpMutex);
378 UCPMap *map = maps[property - UCHAR_INT_START];
379 if (map == nullptr) {
380 maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
381 }
382 return map;
383}
384