1 | // © 2018 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | #include "unicode/utypes.h" |
5 | |
6 | #if !UCONFIG_NO_FORMATTING |
7 | |
8 | // Allow implicit conversion from char16_t* to UnicodeString for this file: |
9 | // Helpful in toString methods and elsewhere. |
10 | #define UNISTR_FROM_STRING_EXPLICIT |
11 | |
12 | #include "numparse_types.h" |
13 | #include "numparse_affixes.h" |
14 | #include "numparse_utils.h" |
15 | #include "number_utils.h" |
16 | #include "string_segment.h" |
17 | |
18 | using namespace icu; |
19 | using namespace icu::numparse; |
20 | using namespace icu::numparse::impl; |
21 | using namespace icu::number; |
22 | using namespace icu::number::impl; |
23 | |
24 | |
25 | namespace { |
26 | |
27 | /** |
28 | * Helper method to return whether the given AffixPatternMatcher equals the given pattern string. |
29 | * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal |
30 | * the given pattern string. |
31 | */ |
32 | static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) { |
33 | return (affix == nullptr && patternString.isBogus()) || |
34 | (affix != nullptr && affix->getPattern() == patternString); |
35 | } |
36 | |
37 | /** |
38 | * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null. |
39 | */ |
40 | static int32_t length(const AffixPatternMatcher* matcher) { |
41 | return matcher == nullptr ? 0 : matcher->getPattern().length(); |
42 | } |
43 | |
44 | /** |
45 | * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both |
46 | * valid, whether they are equal according to operator==. Similar to Java Objects.equals() |
47 | */ |
48 | static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) { |
49 | if (lhs == nullptr && rhs == nullptr) { |
50 | return true; |
51 | } |
52 | if (lhs == nullptr || rhs == nullptr) { |
53 | return false; |
54 | } |
55 | return *lhs == *rhs; |
56 | } |
57 | |
58 | } |
59 | |
60 | |
61 | AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern, |
62 | AffixTokenMatcherWarehouse& warehouse, |
63 | IgnorablesMatcher* ignorables) |
64 | : fMatchersLen(0), |
65 | fLastTypeOrCp(0), |
66 | fPattern(pattern), |
67 | fWarehouse(warehouse), |
68 | fIgnorables(ignorables) {} |
69 | |
70 | void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) { |
71 | // This is called by AffixUtils.iterateWithConsumer() for each token. |
72 | |
73 | // Add an ignorables matcher between tokens except between two literals, and don't put two |
74 | // ignorables matchers in a row. |
75 | if (fIgnorables != nullptr && fMatchersLen > 0 && |
76 | (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) { |
77 | addMatcher(*fIgnorables); |
78 | } |
79 | |
80 | if (type != TYPE_CODEPOINT) { |
81 | // Case 1: the token is a symbol. |
82 | switch (type) { |
83 | case TYPE_MINUS_SIGN: |
84 | addMatcher(fWarehouse.minusSign()); |
85 | break; |
86 | case TYPE_PLUS_SIGN: |
87 | addMatcher(fWarehouse.plusSign()); |
88 | break; |
89 | case TYPE_PERCENT: |
90 | addMatcher(fWarehouse.percent()); |
91 | break; |
92 | case TYPE_PERMILLE: |
93 | addMatcher(fWarehouse.permille()); |
94 | break; |
95 | case TYPE_CURRENCY_SINGLE: |
96 | case TYPE_CURRENCY_DOUBLE: |
97 | case TYPE_CURRENCY_TRIPLE: |
98 | case TYPE_CURRENCY_QUAD: |
99 | case TYPE_CURRENCY_QUINT: |
100 | // All currency symbols use the same matcher |
101 | addMatcher(fWarehouse.currency(status)); |
102 | break; |
103 | default: |
104 | UPRV_UNREACHABLE; |
105 | } |
106 | |
107 | } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) { |
108 | // Case 2: the token is an ignorable literal. |
109 | // No action necessary: the ignorables matcher has already been added. |
110 | |
111 | } else { |
112 | // Case 3: the token is a non-ignorable literal. |
113 | if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) { |
114 | addMatcher(*ptr); |
115 | } else { |
116 | // OOM; unwind the stack |
117 | return; |
118 | } |
119 | } |
120 | fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp; |
121 | } |
122 | |
123 | void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) { |
124 | if (fMatchersLen >= fMatchers.getCapacity()) { |
125 | fMatchers.resize(fMatchersLen * 2, fMatchersLen); |
126 | } |
127 | fMatchers[fMatchersLen++] = &matcher; |
128 | } |
129 | |
130 | AffixPatternMatcher AffixPatternMatcherBuilder::build() { |
131 | return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern); |
132 | } |
133 | |
134 | AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData) |
135 | : fSetupData(setupData) {} |
136 | |
137 | NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() { |
138 | return fMinusSign = {fSetupData->dfs, true}; |
139 | } |
140 | |
141 | NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() { |
142 | return fPlusSign = {fSetupData->dfs, true}; |
143 | } |
144 | |
145 | NumberParseMatcher& AffixTokenMatcherWarehouse::percent() { |
146 | return fPercent = {fSetupData->dfs}; |
147 | } |
148 | |
149 | NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { |
150 | return fPermille = {fSetupData->dfs}; |
151 | } |
152 | |
153 | NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { |
154 | return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status}; |
155 | } |
156 | |
157 | IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() { |
158 | return fSetupData->ignorables; |
159 | } |
160 | |
161 | NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) { |
162 | if (U_FAILURE(status)) { |
163 | return nullptr; |
164 | } |
165 | auto* result = fCodePoints.create(cp); |
166 | if (result == nullptr) { |
167 | status = U_MEMORY_ALLOCATION_ERROR; |
168 | } |
169 | return result; |
170 | } |
171 | |
172 | |
173 | CodePointMatcher::CodePointMatcher(UChar32 cp) |
174 | : fCp(cp) {} |
175 | |
176 | bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const { |
177 | if (segment.startsWith(fCp)) { |
178 | segment.adjustOffsetByCodePoint(); |
179 | result.setCharsConsumed(segment); |
180 | } |
181 | return false; |
182 | } |
183 | |
184 | bool CodePointMatcher::smokeTest(const StringSegment& segment) const { |
185 | return segment.startsWith(fCp); |
186 | } |
187 | |
188 | UnicodeString CodePointMatcher::toString() const { |
189 | return u"<CodePoint>" ; |
190 | } |
191 | |
192 | |
193 | AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern, |
194 | AffixTokenMatcherWarehouse& tokenWarehouse, |
195 | parse_flags_t parseFlags, bool* success, |
196 | UErrorCode& status) { |
197 | if (affixPattern.isEmpty()) { |
198 | *success = false; |
199 | return {}; |
200 | } |
201 | *success = true; |
202 | |
203 | IgnorablesMatcher* ignorables; |
204 | if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) { |
205 | ignorables = nullptr; |
206 | } else { |
207 | ignorables = &tokenWarehouse.ignorables(); |
208 | } |
209 | |
210 | AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables); |
211 | AffixUtils::iterateWithConsumer(affixPattern, builder, status); |
212 | return builder.build(); |
213 | } |
214 | |
215 | AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen, |
216 | const UnicodeString& pattern) |
217 | : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern) {} |
218 | |
219 | UnicodeString AffixPatternMatcher::getPattern() const { |
220 | return fPattern.toAliasedUnicodeString(); |
221 | } |
222 | |
223 | bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const { |
224 | return fPattern == other.fPattern; |
225 | } |
226 | |
227 | |
228 | AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse) |
229 | : fTokenWarehouse(tokenWarehouse) { |
230 | } |
231 | |
232 | bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo, |
233 | const IgnorablesMatcher& ignorables, parse_flags_t parseFlags, |
234 | UErrorCode& status) { |
235 | UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX); |
236 | UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX); |
237 | UnicodeString negPrefixString; |
238 | UnicodeString negSuffixString; |
239 | if (patternInfo.hasNegativeSubpattern()) { |
240 | negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX); |
241 | negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX); |
242 | } |
243 | |
244 | if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) && |
245 | AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) && |
246 | AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) && |
247 | AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) && |
248 | AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status) |
249 | // HACK: Plus and minus sign are a special case: we accept them trailing only if they are |
250 | // trailing in the pattern string. |
251 | && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) && |
252 | !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) && |
253 | !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) && |
254 | !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) { |
255 | // The affixes contain only symbols and ignorables. |
256 | // No need to generate affix matchers. |
257 | return false; |
258 | } |
259 | return true; |
260 | } |
261 | |
262 | void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo, |
263 | MutableMatcherCollection& output, |
264 | const IgnorablesMatcher& ignorables, |
265 | parse_flags_t parseFlags, UErrorCode& status) { |
266 | if (!isInteresting(patternInfo, ignorables, parseFlags, status)) { |
267 | return; |
268 | } |
269 | |
270 | // The affixes have interesting characters, or we are in strict mode. |
271 | // Use initial capacity of 6, the highest possible number of AffixMatchers. |
272 | UnicodeString sb; |
273 | bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES); |
274 | UNumberSignDisplay signDisplay = (0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) ? UNUM_SIGN_ALWAYS |
275 | : UNUM_SIGN_AUTO; |
276 | |
277 | int32_t numAffixMatchers = 0; |
278 | int32_t numAffixPatternMatchers = 0; |
279 | |
280 | AffixPatternMatcher* posPrefix = nullptr; |
281 | AffixPatternMatcher* posSuffix = nullptr; |
282 | |
283 | // Pre-process the affix strings to resolve LDML rules like sign display. |
284 | for (int8_t signumInt = 1; signumInt >= -1; signumInt--) { |
285 | auto signum = static_cast<Signum>(signumInt); |
286 | |
287 | // Generate Prefix |
288 | bool hasPrefix = false; |
289 | PatternStringUtils::patternInfoToStringBuilder( |
290 | patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb); |
291 | fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( |
292 | sb, *fTokenWarehouse, parseFlags, &hasPrefix, status); |
293 | AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++] |
294 | : nullptr; |
295 | |
296 | // Generate Suffix |
297 | bool hasSuffix = false; |
298 | PatternStringUtils::patternInfoToStringBuilder( |
299 | patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb); |
300 | fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( |
301 | sb, *fTokenWarehouse, parseFlags, &hasSuffix, status); |
302 | AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++] |
303 | : nullptr; |
304 | |
305 | if (signum == 1) { |
306 | posPrefix = prefix; |
307 | posSuffix = suffix; |
308 | } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) { |
309 | // Skip adding these matchers (we already have equivalents) |
310 | continue; |
311 | } |
312 | |
313 | // Flags for setting in the ParsedNumber; the token matchers may add more. |
314 | int flags = (signum == -1) ? FLAG_NEGATIVE : 0; |
315 | |
316 | // Note: it is indeed possible for posPrefix and posSuffix to both be null. |
317 | // We still need to add that matcher for strict mode to work. |
318 | fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags}; |
319 | if (includeUnpaired && prefix != nullptr && suffix != nullptr) { |
320 | // The following if statements are designed to prevent adding two identical matchers. |
321 | if (signum == 1 || !equals(prefix, posPrefix)) { |
322 | fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags}; |
323 | } |
324 | if (signum == 1 || !equals(suffix, posSuffix)) { |
325 | fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags}; |
326 | } |
327 | } |
328 | } |
329 | |
330 | // Put the AffixMatchers in order, and then add them to the output. |
331 | // Since there are at most 9 elements, do a simple-to-implement bubble sort. |
332 | bool madeChanges; |
333 | do { |
334 | madeChanges = false; |
335 | for (int32_t i = 1; i < numAffixMatchers; i++) { |
336 | if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) { |
337 | madeChanges = true; |
338 | AffixMatcher temp = std::move(fAffixMatchers[i - 1]); |
339 | fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]); |
340 | fAffixMatchers[i] = std::move(temp); |
341 | } |
342 | } |
343 | } while (madeChanges); |
344 | |
345 | for (int32_t i = 0; i < numAffixMatchers; i++) { |
346 | // Enable the following line to debug affixes |
347 | //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl; |
348 | output.addMatcher(fAffixMatchers[i]); |
349 | } |
350 | } |
351 | |
352 | |
353 | AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags) |
354 | : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {} |
355 | |
356 | bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { |
357 | if (!result.seenNumber()) { |
358 | // Prefix |
359 | // Do not match if: |
360 | // 1. We have already seen a prefix (result.prefix != null) |
361 | // 2. The prefix in this AffixMatcher is empty (prefix == null) |
362 | if (!result.prefix.isBogus() || fPrefix == nullptr) { |
363 | return false; |
364 | } |
365 | |
366 | // Attempt to match the prefix. |
367 | int initialOffset = segment.getOffset(); |
368 | bool maybeMore = fPrefix->match(segment, result, status); |
369 | if (initialOffset != segment.getOffset()) { |
370 | result.prefix = fPrefix->getPattern(); |
371 | } |
372 | return maybeMore; |
373 | |
374 | } else { |
375 | // Suffix |
376 | // Do not match if: |
377 | // 1. We have already seen a suffix (result.suffix != null) |
378 | // 2. The suffix in this AffixMatcher is empty (suffix == null) |
379 | // 3. The matched prefix does not equal this AffixMatcher's prefix |
380 | if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) { |
381 | return false; |
382 | } |
383 | |
384 | // Attempt to match the suffix. |
385 | int initialOffset = segment.getOffset(); |
386 | bool maybeMore = fSuffix->match(segment, result, status); |
387 | if (initialOffset != segment.getOffset()) { |
388 | result.suffix = fSuffix->getPattern(); |
389 | } |
390 | return maybeMore; |
391 | } |
392 | } |
393 | |
394 | bool AffixMatcher::smokeTest(const StringSegment& segment) const { |
395 | return (fPrefix != nullptr && fPrefix->smokeTest(segment)) || |
396 | (fSuffix != nullptr && fSuffix->smokeTest(segment)); |
397 | } |
398 | |
399 | void AffixMatcher::postProcess(ParsedNumber& result) const { |
400 | // Check to see if our affix is the one that was matched. If so, set the flags in the result. |
401 | if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) { |
402 | // Fill in the result prefix and suffix with non-null values (empty string). |
403 | // Used by strict mode to determine whether an entire affix pair was matched. |
404 | if (result.prefix.isBogus()) { |
405 | result.prefix = UnicodeString(); |
406 | } |
407 | if (result.suffix.isBogus()) { |
408 | result.suffix = UnicodeString(); |
409 | } |
410 | result.flags |= fFlags; |
411 | if (fPrefix != nullptr) { |
412 | fPrefix->postProcess(result); |
413 | } |
414 | if (fSuffix != nullptr) { |
415 | fSuffix->postProcess(result); |
416 | } |
417 | } |
418 | } |
419 | |
420 | int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const { |
421 | const AffixMatcher& lhs = *this; |
422 | if (length(lhs.fPrefix) != length(rhs.fPrefix)) { |
423 | return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1; |
424 | } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) { |
425 | return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1; |
426 | } else { |
427 | return 0; |
428 | } |
429 | } |
430 | |
431 | UnicodeString AffixMatcher::toString() const { |
432 | bool isNegative = 0 != (fFlags & FLAG_NEGATIVE); |
433 | return UnicodeString(u"<Affix" ) + (isNegative ? u":negative " : u" " ) + |
434 | (fPrefix ? fPrefix->getPattern() : u"null" ) + u"#" + |
435 | (fSuffix ? fSuffix->getPattern() : u"null" ) + u">" ; |
436 | |
437 | } |
438 | |
439 | |
440 | #endif /* #if !UCONFIG_NO_FORMATTING */ |
441 | |
442 | |
443 | |
444 | |
445 | |
446 | |
447 | |
448 | |
449 | |
450 | |
451 | |
452 | |
453 | |
454 | |
455 | |
456 | |
457 | |
458 | |
459 | |
460 | |
461 | |
462 | |
463 | |
464 | |
465 | |
466 | |