1// © 2018 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4#include "unicode/utypes.h"
5
6#if !UCONFIG_NO_FORMATTING
7
8// Allow implicit conversion from char16_t* to UnicodeString for this file:
9// Helpful in toString methods and elsewhere.
10#define UNISTR_FROM_STRING_EXPLICIT
11
12#include "static_unicode_sets.h"
13#include "umutex.h"
14#include "ucln_cmn.h"
15#include "unicode/uniset.h"
16#include "uresimp.h"
17#include "cstring.h"
18#include "uassert.h"
19
20using namespace icu;
21using namespace icu::unisets;
22
23
24namespace {
25
26UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};
27
28// Save the empty instance in static memory to have well-defined behavior if a
29// regular UnicodeSet cannot be allocated.
30alignas(UnicodeSet)
31char gEmptyUnicodeSet[sizeof(UnicodeSet)];
32
33// Whether the gEmptyUnicodeSet is initialized and ready to use.
34UBool gEmptyUnicodeSetInitialized = false;
35
36inline UnicodeSet* getImpl(Key key) {
37 UnicodeSet* candidate = gUnicodeSets[key];
38 if (candidate == nullptr) {
39 return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
40 }
41 return candidate;
42}
43
44UnicodeSet* computeUnion(Key k1, Key k2) {
45 UnicodeSet* result = new UnicodeSet();
46 if (result == nullptr) {
47 return nullptr;
48 }
49 result->addAll(*getImpl(k1));
50 result->addAll(*getImpl(k2));
51 result->freeze();
52 return result;
53}
54
55UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
56 UnicodeSet* result = new UnicodeSet();
57 if (result == nullptr) {
58 return nullptr;
59 }
60 result->addAll(*getImpl(k1));
61 result->addAll(*getImpl(k2));
62 result->addAll(*getImpl(k3));
63 result->freeze();
64 return result;
65}
66
67
68void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
69 // assert unicodeSets.get(key) == null;
70 gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
71}
72
73class ParseDataSink : public ResourceSink {
74 public:
75 void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override {
76 ResourceTable contextsTable = value.getTable(status);
77 if (U_FAILURE(status)) { return; }
78 for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
79 if (uprv_strcmp(key, "date") == 0) {
80 // ignore
81 } else {
82 ResourceTable strictnessTable = value.getTable(status);
83 if (U_FAILURE(status)) { return; }
84 for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
85 bool isLenient = (uprv_strcmp(key, "lenient") == 0);
86 ResourceArray array = value.getArray(status);
87 if (U_FAILURE(status)) { return; }
88 for (int k = 0; k < array.getSize(); k++) {
89 array.getValue(k, value);
90 UnicodeString str = value.getUnicodeString(status);
91 if (U_FAILURE(status)) { return; }
92 // There is both lenient and strict data for comma/period,
93 // but not for any of the other symbols.
94 if (str.indexOf(u'.') != -1) {
95 saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
96 } else if (str.indexOf(u',') != -1) {
97 saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
98 } else if (str.indexOf(u'+') != -1) {
99 saveSet(PLUS_SIGN, str, status);
100 } else if (str.indexOf(u'-') != -1) {
101 saveSet(MINUS_SIGN, str, status);
102 } else if (str.indexOf(u'$') != -1) {
103 saveSet(DOLLAR_SIGN, str, status);
104 } else if (str.indexOf(u'£') != -1) {
105 saveSet(POUND_SIGN, str, status);
106 } else if (str.indexOf(u'₹') != -1) {
107 saveSet(RUPEE_SIGN, str, status);
108 } else if (str.indexOf(u'¥') != -1) {
109 saveSet(YEN_SIGN, str, status);
110 } else if (str.indexOf(u'₩') != -1) {
111 saveSet(WON_SIGN, str, status);
112 } else if (str.indexOf(u'%') != -1) {
113 saveSet(PERCENT_SIGN, str, status);
114 } else if (str.indexOf(u'‰') != -1) {
115 saveSet(PERMILLE_SIGN, str, status);
116 } else if (str.indexOf(u'’') != -1) {
117 saveSet(APOSTROPHE_SIGN, str, status);
118 } else {
119 // Unknown class of parse lenients
120 // TODO(ICU-20428): Make ICU automatically accept new classes?
121 U_ASSERT(false);
122 }
123 if (U_FAILURE(status)) { return; }
124 }
125 }
126 }
127 }
128 }
129};
130
131
132icu::UInitOnce gNumberParseUniSetsInitOnce {};
133
134UBool U_CALLCONV cleanupNumberParseUniSets() {
135 if (gEmptyUnicodeSetInitialized) {
136 reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
137 gEmptyUnicodeSetInitialized = false;
138 }
139 for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
140 delete gUnicodeSets[i];
141 gUnicodeSets[i] = nullptr;
142 }
143 gNumberParseUniSetsInitOnce.reset();
144 return true;
145}
146
147void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
148 ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
149
150 // Initialize the empty instance for well-defined fallback behavior
151 new(gEmptyUnicodeSet) UnicodeSet();
152 reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
153 gEmptyUnicodeSetInitialized = true;
154
155 // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
156 // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
157 gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
158 u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
159 gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
160
161 LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
162 if (U_FAILURE(status)) { return; }
163 ParseDataSink sink;
164 ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
165 if (U_FAILURE(status)) { return; }
166
167 // NOTE: It is OK for these assertions to fail if there was a no-data build.
168 U_ASSERT(gUnicodeSets[COMMA] != nullptr);
169 U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
170 U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
171 U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
172 U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
173
174 LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
175 u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
176 status
177 ), status);
178 if (U_FAILURE(status)) { return; }
179 otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
180 gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
181 gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
182 gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
183 STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
184
185 U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
186 U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
187 U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
188 U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
189
190 gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
191 if (U_FAILURE(status)) { return; }
192
193 U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
194 U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
195 U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
196 U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
197 U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
198
199 gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
200 if (U_FAILURE(status)) { return; }
201 gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
202 gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
203
204 for (auto* uniset : gUnicodeSets) {
205 if (uniset != nullptr) {
206 uniset->freeze();
207 }
208 }
209}
210
211}
212
213const UnicodeSet* unisets::get(Key key) {
214 UErrorCode localStatus = U_ZERO_ERROR;
215 umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
216 if (U_FAILURE(localStatus)) {
217 return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
218 }
219 return getImpl(key);
220}
221
222Key unisets::chooseFrom(UnicodeString str, Key key1) {
223 return get(key1)->contains(str) ? key1 : NONE;
224}
225
226Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
227 return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
228}
229
230//Key unisets::chooseCurrency(UnicodeString str) {
231// if (get(DOLLAR_SIGN)->contains(str)) {
232// return DOLLAR_SIGN;
233// } else if (get(POUND_SIGN)->contains(str)) {
234// return POUND_SIGN;
235// } else if (get(RUPEE_SIGN)->contains(str)) {
236// return RUPEE_SIGN;
237// } else if (get(YEN_SIGN)->contains(str)) {
238// return YEN_SIGN;
239// } else {
240// return NONE;
241// }
242//}
243
244
245#endif /* #if !UCONFIG_NO_FORMATTING */
246