1 | // © 2018 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | #include "unicode/utypes.h" |
5 | |
6 | #if !UCONFIG_NO_FORMATTING |
7 | |
8 | // Allow implicit conversion from char16_t* to UnicodeString for this file: |
9 | // Helpful in toString methods and elsewhere. |
10 | #define UNISTR_FROM_STRING_EXPLICIT |
11 | |
12 | #include "numparse_types.h" |
13 | #include "numparse_affixes.h" |
14 | #include "numparse_utils.h" |
15 | #include "number_utils.h" |
16 | #include "string_segment.h" |
17 | |
18 | using namespace icu; |
19 | using namespace icu::numparse; |
20 | using namespace icu::numparse::impl; |
21 | using namespace icu::number; |
22 | using namespace icu::number::impl; |
23 | |
24 | |
25 | namespace { |
26 | |
27 | /** |
28 | * Helper method to return whether the given AffixPatternMatcher equals the given pattern string. |
29 | * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal |
30 | * the given pattern string. |
31 | */ |
32 | static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) { |
33 | return (affix == nullptr && patternString.isBogus()) || |
34 | (affix != nullptr && affix->getPattern() == patternString); |
35 | } |
36 | |
37 | /** |
38 | * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null. |
39 | */ |
40 | static int32_t length(const AffixPatternMatcher* matcher) { |
41 | return matcher == nullptr ? 0 : matcher->getPattern().length(); |
42 | } |
43 | |
44 | /** |
45 | * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both |
46 | * valid, whether they are equal according to operator==. Similar to Java Objects.equals() |
47 | */ |
48 | static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) { |
49 | if (lhs == nullptr && rhs == nullptr) { |
50 | return true; |
51 | } |
52 | if (lhs == nullptr || rhs == nullptr) { |
53 | return false; |
54 | } |
55 | return *lhs == *rhs; |
56 | } |
57 | |
58 | } |
59 | |
60 | |
61 | AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern, |
62 | AffixTokenMatcherWarehouse& warehouse, |
63 | IgnorablesMatcher* ignorables) |
64 | : fMatchersLen(0), |
65 | fLastTypeOrCp(0), |
66 | fPattern(pattern), |
67 | fWarehouse(warehouse), |
68 | fIgnorables(ignorables) {} |
69 | |
70 | void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) { |
71 | // This is called by AffixUtils.iterateWithConsumer() for each token. |
72 | |
73 | // Add an ignorables matcher between tokens except between two literals, and don't put two |
74 | // ignorables matchers in a row. |
75 | if (fIgnorables != nullptr && fMatchersLen > 0 && |
76 | (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) { |
77 | addMatcher(*fIgnorables); |
78 | } |
79 | |
80 | if (type != TYPE_CODEPOINT) { |
81 | // Case 1: the token is a symbol. |
82 | switch (type) { |
83 | case TYPE_MINUS_SIGN: |
84 | addMatcher(fWarehouse.minusSign()); |
85 | break; |
86 | case TYPE_PLUS_SIGN: |
87 | addMatcher(fWarehouse.plusSign()); |
88 | break; |
89 | case TYPE_PERCENT: |
90 | addMatcher(fWarehouse.percent()); |
91 | break; |
92 | case TYPE_PERMILLE: |
93 | addMatcher(fWarehouse.permille()); |
94 | break; |
95 | case TYPE_CURRENCY_SINGLE: |
96 | case TYPE_CURRENCY_DOUBLE: |
97 | case TYPE_CURRENCY_TRIPLE: |
98 | case TYPE_CURRENCY_QUAD: |
99 | case TYPE_CURRENCY_QUINT: |
100 | // All currency symbols use the same matcher |
101 | addMatcher(fWarehouse.currency(status)); |
102 | break; |
103 | default: |
104 | UPRV_UNREACHABLE; |
105 | } |
106 | |
107 | } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) { |
108 | // Case 2: the token is an ignorable literal. |
109 | // No action necessary: the ignorables matcher has already been added. |
110 | |
111 | } else { |
112 | // Case 3: the token is a non-ignorable literal. |
113 | if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) { |
114 | addMatcher(*ptr); |
115 | } else { |
116 | // OOM; unwind the stack |
117 | return; |
118 | } |
119 | } |
120 | fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp; |
121 | } |
122 | |
123 | void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) { |
124 | if (fMatchersLen >= fMatchers.getCapacity()) { |
125 | fMatchers.resize(fMatchersLen * 2, fMatchersLen); |
126 | } |
127 | fMatchers[fMatchersLen++] = &matcher; |
128 | } |
129 | |
130 | AffixPatternMatcher AffixPatternMatcherBuilder::build() { |
131 | return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern); |
132 | } |
133 | |
134 | AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData) |
135 | : fSetupData(setupData) {} |
136 | |
137 | NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() { |
138 | return fMinusSign = {fSetupData->dfs, true}; |
139 | } |
140 | |
141 | NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() { |
142 | return fPlusSign = {fSetupData->dfs, true}; |
143 | } |
144 | |
145 | NumberParseMatcher& AffixTokenMatcherWarehouse::percent() { |
146 | return fPercent = {fSetupData->dfs}; |
147 | } |
148 | |
149 | NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { |
150 | return fPermille = {fSetupData->dfs}; |
151 | } |
152 | |
153 | NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { |
154 | return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status}; |
155 | } |
156 | |
157 | IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() { |
158 | return fSetupData->ignorables; |
159 | } |
160 | |
161 | NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) { |
162 | if (U_FAILURE(status)) { |
163 | return nullptr; |
164 | } |
165 | auto* result = fCodePoints.create(cp); |
166 | if (result == nullptr) { |
167 | status = U_MEMORY_ALLOCATION_ERROR; |
168 | } |
169 | return result; |
170 | } |
171 | |
172 | |
173 | CodePointMatcher::CodePointMatcher(UChar32 cp) |
174 | : fCp(cp) {} |
175 | |
176 | bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const { |
177 | if (segment.startsWith(fCp)) { |
178 | segment.adjustOffsetByCodePoint(); |
179 | result.setCharsConsumed(segment); |
180 | } |
181 | return false; |
182 | } |
183 | |
184 | bool CodePointMatcher::smokeTest(const StringSegment& segment) const { |
185 | return segment.startsWith(fCp); |
186 | } |
187 | |
188 | UnicodeString CodePointMatcher::toString() const { |
189 | return u"<CodePoint>" ; |
190 | } |
191 | |
192 | |
193 | AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern, |
194 | AffixTokenMatcherWarehouse& tokenWarehouse, |
195 | parse_flags_t parseFlags, bool* success, |
196 | UErrorCode& status) { |
197 | if (affixPattern.isEmpty()) { |
198 | *success = false; |
199 | return {}; |
200 | } |
201 | *success = true; |
202 | |
203 | IgnorablesMatcher* ignorables; |
204 | if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) { |
205 | ignorables = nullptr; |
206 | } else { |
207 | ignorables = &tokenWarehouse.ignorables(); |
208 | } |
209 | |
210 | AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables); |
211 | AffixUtils::iterateWithConsumer(affixPattern, builder, status); |
212 | return builder.build(); |
213 | } |
214 | |
215 | AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen, |
216 | const UnicodeString& pattern) |
217 | : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern) {} |
218 | |
219 | UnicodeString AffixPatternMatcher::getPattern() const { |
220 | return fPattern.toAliasedUnicodeString(); |
221 | } |
222 | |
223 | bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const { |
224 | return fPattern == other.fPattern; |
225 | } |
226 | |
227 | |
228 | AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse) |
229 | : fTokenWarehouse(tokenWarehouse) { |
230 | } |
231 | |
232 | bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo, |
233 | const IgnorablesMatcher& ignorables, parse_flags_t parseFlags, |
234 | UErrorCode& status) { |
235 | UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX); |
236 | UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX); |
237 | UnicodeString negPrefixString; |
238 | UnicodeString negSuffixString; |
239 | if (patternInfo.hasNegativeSubpattern()) { |
240 | negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX); |
241 | negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX); |
242 | } |
243 | |
244 | if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) && |
245 | AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) && |
246 | AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) && |
247 | AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) && |
248 | AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status) |
249 | // HACK: Plus and minus sign are a special case: we accept them trailing only if they are |
250 | // trailing in the pattern string. |
251 | && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) && |
252 | !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) && |
253 | !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) && |
254 | !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) { |
255 | // The affixes contain only symbols and ignorables. |
256 | // No need to generate affix matchers. |
257 | return false; |
258 | } |
259 | return true; |
260 | } |
261 | |
262 | void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo, |
263 | MutableMatcherCollection& output, |
264 | const IgnorablesMatcher& ignorables, |
265 | parse_flags_t parseFlags, UErrorCode& status) { |
266 | if (!isInteresting(patternInfo, ignorables, parseFlags, status)) { |
267 | return; |
268 | } |
269 | |
270 | // The affixes have interesting characters, or we are in strict mode. |
271 | // Use initial capacity of 6, the highest possible number of AffixMatchers. |
272 | UnicodeString sb; |
273 | bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES); |
274 | |
275 | int32_t numAffixMatchers = 0; |
276 | int32_t numAffixPatternMatchers = 0; |
277 | |
278 | AffixPatternMatcher* posPrefix = nullptr; |
279 | AffixPatternMatcher* posSuffix = nullptr; |
280 | |
281 | // Pre-process the affix strings to resolve LDML rules like sign display. |
282 | for (int8_t typeInt = 0; typeInt < PATTERN_SIGN_TYPE_COUNT; typeInt++) { |
283 | auto type = static_cast<PatternSignType>(typeInt); |
284 | |
285 | // Skip affixes in some cases |
286 | if (type == PATTERN_SIGN_TYPE_POS |
287 | && 0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) { |
288 | continue; |
289 | } |
290 | if (type == PATTERN_SIGN_TYPE_POS_SIGN |
291 | && 0 == (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) { |
292 | continue; |
293 | } |
294 | |
295 | // Generate Prefix |
296 | bool hasPrefix = false; |
297 | PatternStringUtils::patternInfoToStringBuilder( |
298 | patternInfo, true, type, StandardPlural::OTHER, false, sb); |
299 | fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( |
300 | sb, *fTokenWarehouse, parseFlags, &hasPrefix, status); |
301 | AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++] |
302 | : nullptr; |
303 | |
304 | // Generate Suffix |
305 | bool hasSuffix = false; |
306 | PatternStringUtils::patternInfoToStringBuilder( |
307 | patternInfo, false, type, StandardPlural::OTHER, false, sb); |
308 | fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( |
309 | sb, *fTokenWarehouse, parseFlags, &hasSuffix, status); |
310 | AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++] |
311 | : nullptr; |
312 | |
313 | if (type == PATTERN_SIGN_TYPE_POS) { |
314 | posPrefix = prefix; |
315 | posSuffix = suffix; |
316 | } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) { |
317 | // Skip adding these matchers (we already have equivalents) |
318 | continue; |
319 | } |
320 | |
321 | // Flags for setting in the ParsedNumber; the token matchers may add more. |
322 | int flags = (type == PATTERN_SIGN_TYPE_NEG) ? FLAG_NEGATIVE : 0; |
323 | |
324 | // Note: it is indeed possible for posPrefix and posSuffix to both be null. |
325 | // We still need to add that matcher for strict mode to work. |
326 | fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags}; |
327 | if (includeUnpaired && prefix != nullptr && suffix != nullptr) { |
328 | // The following if statements are designed to prevent adding two identical matchers. |
329 | if (type == PATTERN_SIGN_TYPE_POS || !equals(prefix, posPrefix)) { |
330 | fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags}; |
331 | } |
332 | if (type == PATTERN_SIGN_TYPE_POS || !equals(suffix, posSuffix)) { |
333 | fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags}; |
334 | } |
335 | } |
336 | } |
337 | |
338 | // Put the AffixMatchers in order, and then add them to the output. |
339 | // Since there are at most 9 elements, do a simple-to-implement bubble sort. |
340 | bool madeChanges; |
341 | do { |
342 | madeChanges = false; |
343 | for (int32_t i = 1; i < numAffixMatchers; i++) { |
344 | if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) { |
345 | madeChanges = true; |
346 | AffixMatcher temp = std::move(fAffixMatchers[i - 1]); |
347 | fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]); |
348 | fAffixMatchers[i] = std::move(temp); |
349 | } |
350 | } |
351 | } while (madeChanges); |
352 | |
353 | for (int32_t i = 0; i < numAffixMatchers; i++) { |
354 | // Enable the following line to debug affixes |
355 | //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl; |
356 | output.addMatcher(fAffixMatchers[i]); |
357 | } |
358 | } |
359 | |
360 | |
361 | AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags) |
362 | : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {} |
363 | |
364 | bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { |
365 | if (!result.seenNumber()) { |
366 | // Prefix |
367 | // Do not match if: |
368 | // 1. We have already seen a prefix (result.prefix != null) |
369 | // 2. The prefix in this AffixMatcher is empty (prefix == null) |
370 | if (!result.prefix.isBogus() || fPrefix == nullptr) { |
371 | return false; |
372 | } |
373 | |
374 | // Attempt to match the prefix. |
375 | int initialOffset = segment.getOffset(); |
376 | bool maybeMore = fPrefix->match(segment, result, status); |
377 | if (initialOffset != segment.getOffset()) { |
378 | result.prefix = fPrefix->getPattern(); |
379 | } |
380 | return maybeMore; |
381 | |
382 | } else { |
383 | // Suffix |
384 | // Do not match if: |
385 | // 1. We have already seen a suffix (result.suffix != null) |
386 | // 2. The suffix in this AffixMatcher is empty (suffix == null) |
387 | // 3. The matched prefix does not equal this AffixMatcher's prefix |
388 | if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) { |
389 | return false; |
390 | } |
391 | |
392 | // Attempt to match the suffix. |
393 | int initialOffset = segment.getOffset(); |
394 | bool maybeMore = fSuffix->match(segment, result, status); |
395 | if (initialOffset != segment.getOffset()) { |
396 | result.suffix = fSuffix->getPattern(); |
397 | } |
398 | return maybeMore; |
399 | } |
400 | } |
401 | |
402 | bool AffixMatcher::smokeTest(const StringSegment& segment) const { |
403 | return (fPrefix != nullptr && fPrefix->smokeTest(segment)) || |
404 | (fSuffix != nullptr && fSuffix->smokeTest(segment)); |
405 | } |
406 | |
407 | void AffixMatcher::postProcess(ParsedNumber& result) const { |
408 | // Check to see if our affix is the one that was matched. If so, set the flags in the result. |
409 | if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) { |
410 | // Fill in the result prefix and suffix with non-null values (empty string). |
411 | // Used by strict mode to determine whether an entire affix pair was matched. |
412 | if (result.prefix.isBogus()) { |
413 | result.prefix = UnicodeString(); |
414 | } |
415 | if (result.suffix.isBogus()) { |
416 | result.suffix = UnicodeString(); |
417 | } |
418 | result.flags |= fFlags; |
419 | if (fPrefix != nullptr) { |
420 | fPrefix->postProcess(result); |
421 | } |
422 | if (fSuffix != nullptr) { |
423 | fSuffix->postProcess(result); |
424 | } |
425 | } |
426 | } |
427 | |
428 | int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const { |
429 | const AffixMatcher& lhs = *this; |
430 | if (length(lhs.fPrefix) != length(rhs.fPrefix)) { |
431 | return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1; |
432 | } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) { |
433 | return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1; |
434 | } else { |
435 | return 0; |
436 | } |
437 | } |
438 | |
439 | UnicodeString AffixMatcher::toString() const { |
440 | bool isNegative = 0 != (fFlags & FLAG_NEGATIVE); |
441 | return UnicodeString(u"<Affix" ) + (isNegative ? u":negative " : u" " ) + |
442 | (fPrefix ? fPrefix->getPattern() : u"null" ) + u"#" + |
443 | (fSuffix ? fSuffix->getPattern() : u"null" ) + u">" ; |
444 | |
445 | } |
446 | |
447 | |
448 | #endif /* #if !UCONFIG_NO_FORMATTING */ |
449 | |
450 | |
451 | |
452 | |
453 | |
454 | |
455 | |
456 | |
457 | |
458 | |
459 | |
460 | |
461 | |
462 | |
463 | |
464 | |
465 | |
466 | |
467 | |
468 | |
469 | |
470 | |
471 | |
472 | |
473 | |
474 | |