| 1 | // © 2018 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 |  | 
|---|
| 4 | #include "unicode/utypes.h" | 
|---|
| 5 |  | 
|---|
| 6 | #if !UCONFIG_NO_FORMATTING | 
|---|
| 7 |  | 
|---|
| 8 | // Allow implicit conversion from char16_t* to UnicodeString for this file: | 
|---|
| 9 | // Helpful in toString methods and elsewhere. | 
|---|
| 10 | #define UNISTR_FROM_STRING_EXPLICIT | 
|---|
| 11 |  | 
|---|
| 12 | #include "static_unicode_sets.h" | 
|---|
| 13 | #include "umutex.h" | 
|---|
| 14 | #include "ucln_cmn.h" | 
|---|
| 15 | #include "unicode/uniset.h" | 
|---|
| 16 | #include "uresimp.h" | 
|---|
| 17 | #include "cstring.h" | 
|---|
| 18 | #include "uassert.h" | 
|---|
| 19 |  | 
|---|
| 20 | using namespace icu; | 
|---|
| 21 | using namespace icu::unisets; | 
|---|
| 22 |  | 
|---|
| 23 |  | 
|---|
| 24 | namespace { | 
|---|
| 25 |  | 
|---|
| 26 | UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {}; | 
|---|
| 27 |  | 
|---|
| 28 | // Save the empty instance in static memory to have well-defined behavior if a | 
|---|
| 29 | // regular UnicodeSet cannot be allocated. | 
|---|
| 30 | alignas(UnicodeSet) | 
|---|
| 31 | char gEmptyUnicodeSet[sizeof(UnicodeSet)]; | 
|---|
| 32 |  | 
|---|
| 33 | // Whether the gEmptyUnicodeSet is initialized and ready to use. | 
|---|
| 34 | UBool gEmptyUnicodeSetInitialized = false; | 
|---|
| 35 |  | 
|---|
| 36 | inline UnicodeSet* getImpl(Key key) { | 
|---|
| 37 | UnicodeSet* candidate = gUnicodeSets[key]; | 
|---|
| 38 | if (candidate == nullptr) { | 
|---|
| 39 | return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); | 
|---|
| 40 | } | 
|---|
| 41 | return candidate; | 
|---|
| 42 | } | 
|---|
| 43 |  | 
|---|
| 44 | UnicodeSet* computeUnion(Key k1, Key k2) { | 
|---|
| 45 | UnicodeSet* result = new UnicodeSet(); | 
|---|
| 46 | if (result == nullptr) { | 
|---|
| 47 | return nullptr; | 
|---|
| 48 | } | 
|---|
| 49 | result->addAll(*getImpl(k1)); | 
|---|
| 50 | result->addAll(*getImpl(k2)); | 
|---|
| 51 | result->freeze(); | 
|---|
| 52 | return result; | 
|---|
| 53 | } | 
|---|
| 54 |  | 
|---|
| 55 | UnicodeSet* computeUnion(Key k1, Key k2, Key k3) { | 
|---|
| 56 | UnicodeSet* result = new UnicodeSet(); | 
|---|
| 57 | if (result == nullptr) { | 
|---|
| 58 | return nullptr; | 
|---|
| 59 | } | 
|---|
| 60 | result->addAll(*getImpl(k1)); | 
|---|
| 61 | result->addAll(*getImpl(k2)); | 
|---|
| 62 | result->addAll(*getImpl(k3)); | 
|---|
| 63 | result->freeze(); | 
|---|
| 64 | return result; | 
|---|
| 65 | } | 
|---|
| 66 |  | 
|---|
| 67 |  | 
|---|
| 68 | void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) { | 
|---|
| 69 | // assert unicodeSets.get(key) == null; | 
|---|
| 70 | gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status); | 
|---|
| 71 | } | 
|---|
| 72 |  | 
|---|
| 73 | class ParseDataSink : public ResourceSink { | 
|---|
| 74 | public: | 
|---|
| 75 | void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override { | 
|---|
| 76 | ResourceTable contextsTable = value.getTable(status); | 
|---|
| 77 | if (U_FAILURE(status)) { return; } | 
|---|
| 78 | for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) { | 
|---|
| 79 | if (uprv_strcmp(key, "date") == 0) { | 
|---|
| 80 | // ignore | 
|---|
| 81 | } else { | 
|---|
| 82 | ResourceTable strictnessTable = value.getTable(status); | 
|---|
| 83 | if (U_FAILURE(status)) { return; } | 
|---|
| 84 | for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) { | 
|---|
| 85 | bool isLenient = (uprv_strcmp(key, "lenient") == 0); | 
|---|
| 86 | ResourceArray array = value.getArray(status); | 
|---|
| 87 | if (U_FAILURE(status)) { return; } | 
|---|
| 88 | for (int k = 0; k < array.getSize(); k++) { | 
|---|
| 89 | array.getValue(k, value); | 
|---|
| 90 | UnicodeString str = value.getUnicodeString(status); | 
|---|
| 91 | if (U_FAILURE(status)) { return; } | 
|---|
| 92 | // There is both lenient and strict data for comma/period, | 
|---|
| 93 | // but not for any of the other symbols. | 
|---|
| 94 | if (str.indexOf(u'.') != -1) { | 
|---|
| 95 | saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status); | 
|---|
| 96 | } else if (str.indexOf(u',') != -1) { | 
|---|
| 97 | saveSet(isLenient ? COMMA : STRICT_COMMA, str, status); | 
|---|
| 98 | } else if (str.indexOf(u'+') != -1) { | 
|---|
| 99 | saveSet(PLUS_SIGN, str, status); | 
|---|
| 100 | } else if (str.indexOf(u'-') != -1) { | 
|---|
| 101 | saveSet(MINUS_SIGN, str, status); | 
|---|
| 102 | } else if (str.indexOf(u'$') != -1) { | 
|---|
| 103 | saveSet(DOLLAR_SIGN, str, status); | 
|---|
| 104 | } else if (str.indexOf(u'£') != -1) { | 
|---|
| 105 | saveSet(POUND_SIGN, str, status); | 
|---|
| 106 | } else if (str.indexOf(u'₹') != -1) { | 
|---|
| 107 | saveSet(RUPEE_SIGN, str, status); | 
|---|
| 108 | } else if (str.indexOf(u'¥') != -1) { | 
|---|
| 109 | saveSet(YEN_SIGN, str, status); | 
|---|
| 110 | } else if (str.indexOf(u'₩') != -1) { | 
|---|
| 111 | saveSet(WON_SIGN, str, status); | 
|---|
| 112 | } else if (str.indexOf(u'%') != -1) { | 
|---|
| 113 | saveSet(PERCENT_SIGN, str, status); | 
|---|
| 114 | } else if (str.indexOf(u'‰') != -1) { | 
|---|
| 115 | saveSet(PERMILLE_SIGN, str, status); | 
|---|
| 116 | } else if (str.indexOf(u'’') != -1) { | 
|---|
| 117 | saveSet(APOSTROPHE_SIGN, str, status); | 
|---|
| 118 | } else { | 
|---|
| 119 | // Unknown class of parse lenients | 
|---|
| 120 | // TODO(ICU-20428): Make ICU automatically accept new classes? | 
|---|
| 121 | U_ASSERT(false); | 
|---|
| 122 | } | 
|---|
| 123 | if (U_FAILURE(status)) { return; } | 
|---|
| 124 | } | 
|---|
| 125 | } | 
|---|
| 126 | } | 
|---|
| 127 | } | 
|---|
| 128 | } | 
|---|
| 129 | }; | 
|---|
| 130 |  | 
|---|
| 131 |  | 
|---|
| 132 | icu::UInitOnce gNumberParseUniSetsInitOnce {}; | 
|---|
| 133 |  | 
|---|
| 134 | UBool U_CALLCONV cleanupNumberParseUniSets() { | 
|---|
| 135 | if (gEmptyUnicodeSetInitialized) { | 
|---|
| 136 | reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet(); | 
|---|
| 137 | gEmptyUnicodeSetInitialized = false; | 
|---|
| 138 | } | 
|---|
| 139 | for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { | 
|---|
| 140 | delete gUnicodeSets[i]; | 
|---|
| 141 | gUnicodeSets[i] = nullptr; | 
|---|
| 142 | } | 
|---|
| 143 | gNumberParseUniSetsInitOnce.reset(); | 
|---|
| 144 | return true; | 
|---|
| 145 | } | 
|---|
| 146 |  | 
|---|
| 147 | void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { | 
|---|
| 148 | ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets); | 
|---|
| 149 |  | 
|---|
| 150 | // Initialize the empty instance for well-defined fallback behavior | 
|---|
| 151 | new(gEmptyUnicodeSet) UnicodeSet(); | 
|---|
| 152 | reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze(); | 
|---|
| 153 | gEmptyUnicodeSetInitialized = true; | 
|---|
| 154 |  | 
|---|
| 155 | // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309. | 
|---|
| 156 | // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). | 
|---|
| 157 | gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet( | 
|---|
| 158 | u "[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status); | 
|---|
| 159 | gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u "[[:Bidi_Control:]]", status); | 
|---|
| 160 |  | 
|---|
| 161 | LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status)); | 
|---|
| 162 | if (U_FAILURE(status)) { return; } | 
|---|
| 163 | ParseDataSink sink; | 
|---|
| 164 | ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status); | 
|---|
| 165 | if (U_FAILURE(status)) { return; } | 
|---|
| 166 |  | 
|---|
| 167 | // NOTE: It is OK for these assertions to fail if there was a no-data build. | 
|---|
| 168 | U_ASSERT(gUnicodeSets[COMMA] != nullptr); | 
|---|
| 169 | U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr); | 
|---|
| 170 | U_ASSERT(gUnicodeSets[PERIOD] != nullptr); | 
|---|
| 171 | U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr); | 
|---|
| 172 | U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr); | 
|---|
| 173 |  | 
|---|
| 174 | LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet( | 
|---|
| 175 | u "[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", | 
|---|
| 176 | status | 
|---|
| 177 | ), status); | 
|---|
| 178 | if (U_FAILURE(status)) { return; } | 
|---|
| 179 | otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]); | 
|---|
| 180 | gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan(); | 
|---|
| 181 | gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS); | 
|---|
| 182 | gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion( | 
|---|
| 183 | STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS); | 
|---|
| 184 |  | 
|---|
| 185 | U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr); | 
|---|
| 186 | U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr); | 
|---|
| 187 | U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr); | 
|---|
| 188 | U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr); | 
|---|
| 189 |  | 
|---|
| 190 | gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u "[∞]", status); | 
|---|
| 191 | if (U_FAILURE(status)) { return; } | 
|---|
| 192 |  | 
|---|
| 193 | U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr); | 
|---|
| 194 | U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr); | 
|---|
| 195 | U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr); | 
|---|
| 196 | U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr); | 
|---|
| 197 | U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr); | 
|---|
| 198 |  | 
|---|
| 199 | gUnicodeSets[DIGITS] = new UnicodeSet(u "[:digit:]", status); | 
|---|
| 200 | if (U_FAILURE(status)) { return; } | 
|---|
| 201 | gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); | 
|---|
| 202 | gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS); | 
|---|
| 203 |  | 
|---|
| 204 | for (auto* uniset : gUnicodeSets) { | 
|---|
| 205 | if (uniset != nullptr) { | 
|---|
| 206 | uniset->freeze(); | 
|---|
| 207 | } | 
|---|
| 208 | } | 
|---|
| 209 | } | 
|---|
| 210 |  | 
|---|
| 211 | } | 
|---|
| 212 |  | 
|---|
| 213 | const UnicodeSet* unisets::get(Key key) { | 
|---|
| 214 | UErrorCode localStatus = U_ZERO_ERROR; | 
|---|
| 215 | umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus); | 
|---|
| 216 | if (U_FAILURE(localStatus)) { | 
|---|
| 217 | return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); | 
|---|
| 218 | } | 
|---|
| 219 | return getImpl(key); | 
|---|
| 220 | } | 
|---|
| 221 |  | 
|---|
| 222 | Key unisets::chooseFrom(UnicodeString str, Key key1) { | 
|---|
| 223 | return get(key1)->contains(str) ? key1 : NONE; | 
|---|
| 224 | } | 
|---|
| 225 |  | 
|---|
| 226 | Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) { | 
|---|
| 227 | return get(key1)->contains(str) ? key1 : chooseFrom(str, key2); | 
|---|
| 228 | } | 
|---|
| 229 |  | 
|---|
| 230 | //Key unisets::chooseCurrency(UnicodeString str) { | 
|---|
| 231 | //    if (get(DOLLAR_SIGN)->contains(str)) { | 
|---|
| 232 | //        return DOLLAR_SIGN; | 
|---|
| 233 | //    } else if (get(POUND_SIGN)->contains(str)) { | 
|---|
| 234 | //        return POUND_SIGN; | 
|---|
| 235 | //    } else if (get(RUPEE_SIGN)->contains(str)) { | 
|---|
| 236 | //        return RUPEE_SIGN; | 
|---|
| 237 | //    } else if (get(YEN_SIGN)->contains(str)) { | 
|---|
| 238 | //        return YEN_SIGN; | 
|---|
| 239 | //    } else { | 
|---|
| 240 | //        return NONE; | 
|---|
| 241 | //    } | 
|---|
| 242 | //} | 
|---|
| 243 |  | 
|---|
| 244 |  | 
|---|
| 245 | #endif /* #if !UCONFIG_NO_FORMATTING */ | 
|---|
| 246 |  | 
|---|