static_unicode_sets.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/static_unicode_sets.cpp]

1	// © 2018 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3
4	#include "unicode/utypes.h"
5
6	#if !UCONFIG_NO_FORMATTING
7
8	// Allow implicit conversion from char16_t* to UnicodeString for this file:
9	// Helpful in toString methods and elsewhere.
10	#define UNISTR_FROM_STRING_EXPLICIT
11
12	#include "static_unicode_sets.h"
13	#include "umutex.h"
14	#include "ucln_cmn.h"
15	#include "unicode/uniset.h"
16	#include "uresimp.h"
17	#include "cstring.h"
18	#include "uassert.h"
19
20	using namespace icu;
21	using namespace icu::unisets;
22
23
24	namespace {
25
26	UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};
27
28	// Save the empty instance in static memory to have well-defined behavior if a
29	// regular UnicodeSet cannot be allocated.
30	alignas(UnicodeSet)
31	char gEmptyUnicodeSet[sizeof(UnicodeSet)];
32
33	// Whether the gEmptyUnicodeSet is initialized and ready to use.
34	UBool gEmptyUnicodeSetInitialized = FALSE;
35
36	inline UnicodeSet* getImpl(Key key) {
37	UnicodeSet* candidate = gUnicodeSets[key];
38	if (candidate == nullptr) {
39	return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
40	}
41	return candidate;
42	}
43
44	UnicodeSet* computeUnion(Key k1, Key k2) {
45	UnicodeSet* result = new UnicodeSet ();
46	if (result == nullptr) {
47	return nullptr;
48	}
49	result->addAll(*getImpl(k1));
50	result->addAll(*getImpl(k2));
51	result->freeze();
52	return result;
53	}
54
55	UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
56	UnicodeSet* result = new UnicodeSet ();
57	if (result == nullptr) {
58	return nullptr;
59	}
60	result->addAll(*getImpl(k1));
61	result->addAll(*getImpl(k2));
62	result->addAll(*getImpl(k3));
63	result->freeze();
64	return result;
65	}
66
67
68	void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
69	// assert unicodeSets.get(key) == null;
70	gUnicodeSets[key] = new UnicodeSet (unicodeSetPattern, status);
71	}
72
73	class ParseDataSink : public ResourceSink {
74	public:
75	void put(const char* key, ResourceValue& value, UBool /noFallback/, UErrorCode& status) U_OVERRIDE {
76	ResourceTable contextsTable = value.getTable(status);
77	if (U_FAILURE(status)) { return; }
78	for (int i = `0`; contextsTable.getKeyAndValue(i, key, value); i++) {
79	if (uprv_strcmp(key, "date") == `0`) {
80	// ignore
81	} else {
82	ResourceTable strictnessTable = value.getTable(status);
83	if (U_FAILURE(status)) { return; }
84	for (int j = `0`; strictnessTable.getKeyAndValue(j, key, value); j++) {
85	bool isLenient = (uprv_strcmp(key, "lenient") == `0`);
86	ResourceArray array = value.getArray(status);
87	if (U_FAILURE(status)) { return; }
88	for (int k = `0`; k < array.getSize(); k++) {
89	array.getValue(k, value);
90	UnicodeString str = value.getUnicodeString(status);
91	if (U_FAILURE(status)) { return; }
92	// There is both lenient and strict data for comma/period,
93	// but not for any of the other symbols.
94	if (str.indexOf(u`'.'`) != -`1`) {
95	saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
96	} else if (str.indexOf(u`','`) != -`1`) {
97	saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
98	} else if (str.indexOf(u`'+'`) != -`1`) {
99	saveSet(PLUS_SIGN, str, status);
100	} else if (str.indexOf(u`'-'`) != -`1`) {
101	saveSet(MINUS_SIGN, str, status);
102	} else if (str.indexOf(u`'$'`) != -`1`) {
103	saveSet(DOLLAR_SIGN, str, status);
104	} else if (str.indexOf(u`'£'`) != -`1`) {
105	saveSet(POUND_SIGN, str, status);
106	} else if (str.indexOf(u`'₹'`) != -`1`) {
107	saveSet(RUPEE_SIGN, str, status);
108	} else if (str.indexOf(u`'¥'`) != -`1`) {
109	saveSet(YEN_SIGN, str, status);
110	} else if (str.indexOf(u`'₩'`) != -`1`) {
111	saveSet(WON_SIGN, str, status);
112	} else if (str.indexOf(u`'%'`) != -`1`) {
113	saveSet(PERCENT_SIGN, str, status);
114	} else if (str.indexOf(u`'‰'`) != -`1`) {
115	saveSet(PERMILLE_SIGN, str, status);
116	} else if (str.indexOf(u`'’'`) != -`1`) {
117	saveSet(APOSTROPHE_SIGN, str, status);
118	} else {
119	// Unknown class of parse lenients
120	// TODO(ICU-20428): Make ICU automatically accept new classes?
121	U_ASSERT(FALSE);
122	}
123	if (U_FAILURE(status)) { return; }
124	}
125	}
126	}
127	}
128	}
129	};
130
131
132	icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
133
134	UBool U_CALLCONV cleanupNumberParseUniSets() {
135	if (gEmptyUnicodeSetInitialized) {
136	reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
137	gEmptyUnicodeSetInitialized = FALSE;
138	}
139	for (int32_t i = `0`; i < UNISETS_KEY_COUNT; i++) {
140	delete gUnicodeSets[i];
141	gUnicodeSets[i] = nullptr;
142	}
143	gNumberParseUniSetsInitOnce.reset();
144	return TRUE;
145	}
146
147	void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
148	ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
149
150	// Initialize the empty instance for well-defined fallback behavior
151	new(gEmptyUnicodeSet) UnicodeSet ();
152	reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
153	gEmptyUnicodeSetInitialized = TRUE;
154
155	// These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
156	// Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
157	gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet (
158	u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
159	gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet (u"[[:Bidi_Control:]]", status);
160
161	LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
162	if (U_FAILURE(status)) { return; }
163	ParseDataSink sink;
164	ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
165	if (U_FAILURE(status)) { return; }
166
167	// NOTE: It is OK for these assertions to fail if there was a no-data build.
168	U_ASSERT(gUnicodeSets[COMMA] != nullptr);
169	U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
170	U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
171	U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
172	U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
173
174	LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet (
175	u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
176	status
177	), status);
178	if (U_FAILURE(status)) { return; }
179	otherGrouping ->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
180	gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
181	gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
182	gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
183	STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
184
185	U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
186	U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
187	U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
188	U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
189
190	gUnicodeSets[INFINITY_SIGN] = new UnicodeSet (u"[∞]", status);
191	if (U_FAILURE(status)) { return; }
192
193	U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
194	U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
195	U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
196	U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
197	U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
198
199	gUnicodeSets[DIGITS] = new UnicodeSet (u"[:digit:]", status);
200	if (U_FAILURE(status)) { return; }
201	gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
202	gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
203
204	for (auto* uniset : gUnicodeSets) {
205	if (uniset != nullptr) {
206	uniset->freeze();
207	}
208	}
209	}
210
211	}
212
213	const UnicodeSet* unisets::get(Key key) {
214	UErrorCode localStatus = U_ZERO_ERROR;
215	umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
216	if (U_FAILURE(localStatus)) {
217	return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
218	}
219	return getImpl(key);
220	}
221
222	Key unisets::chooseFrom(UnicodeString str, Key key1) {
223	return get(key1)->contains(str) ? key1 : NONE;
224	}
225
226	Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
227	return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
228	}
229
230	//Key unisets::chooseCurrency(UnicodeString str) {
231	// if (get(DOLLAR_SIGN)->contains(str)) {
232	// return DOLLAR_SIGN;
233	// } else if (get(POUND_SIGN)->contains(str)) {
234	// return POUND_SIGN;
235	// } else if (get(RUPEE_SIGN)->contains(str)) {
236	// return RUPEE_SIGN;
237	// } else if (get(YEN_SIGN)->contains(str)) {
238	// return YEN_SIGN;
239	// } else {
240	// return NONE;
241	// }
242	//}
243
244
245	#endif /* #if !UCONFIG_NO_FORMATTING */
246

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/static_unicode_sets.cpp