1// © 2018 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4#include "unicode/utypes.h"
5
6#if !UCONFIG_NO_FORMATTING
7
8// Allow implicit conversion from char16_t* to UnicodeString for this file:
9// Helpful in toString methods and elsewhere.
10#define UNISTR_FROM_STRING_EXPLICIT
11
12#include "numparse_types.h"
13#include "numparse_affixes.h"
14#include "numparse_utils.h"
15#include "number_utils.h"
16#include "string_segment.h"
17
18using namespace icu;
19using namespace icu::numparse;
20using namespace icu::numparse::impl;
21using namespace icu::number;
22using namespace icu::number::impl;
23
24
25namespace {
26
27/**
28 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
29 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
30 * the given pattern string.
31 */
32static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
33 return (affix == nullptr && patternString.isBogus()) ||
34 (affix != nullptr && affix->getPattern() == patternString);
35}
36
37/**
38 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
39 */
40static int32_t length(const AffixPatternMatcher* matcher) {
41 return matcher == nullptr ? 0 : matcher->getPattern().length();
42}
43
44/**
45 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
46 * valid, whether they are equal according to operator==. Similar to Java Objects.equals()
47 */
48static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
49 if (lhs == nullptr && rhs == nullptr) {
50 return true;
51 }
52 if (lhs == nullptr || rhs == nullptr) {
53 return false;
54 }
55 return *lhs == *rhs;
56}
57
58}
59
60
/**
 * Creates an empty builder for the given affix pattern.
 *
 * @param pattern    The affix pattern string; stored and later handed to the built matcher.
 * @param warehouse  Source of the token matchers added by consumeToken().
 * @param ignorables Matcher inserted between tokens; may be nullptr, in which case no
 *                   ignorables matchers are inserted.
 */
AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern,
                                                       AffixTokenMatcherWarehouse& warehouse,
                                                       IgnorablesMatcher* ignorables)
        : fMatchersLen(0),
          fLastTypeOrCp(0),
          fPattern(pattern),
          fWarehouse(warehouse),
          fIgnorables(ignorables) {}
69
70void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
71 // This is called by AffixUtils.iterateWithConsumer() for each token.
72
73 // Add an ignorables matcher between tokens except between two literals, and don't put two
74 // ignorables matchers in a row.
75 if (fIgnorables != nullptr && fMatchersLen > 0 &&
76 (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) {
77 addMatcher(*fIgnorables);
78 }
79
80 if (type != TYPE_CODEPOINT) {
81 // Case 1: the token is a symbol.
82 switch (type) {
83 case TYPE_MINUS_SIGN:
84 addMatcher(fWarehouse.minusSign());
85 break;
86 case TYPE_PLUS_SIGN:
87 addMatcher(fWarehouse.plusSign());
88 break;
89 case TYPE_PERCENT:
90 addMatcher(fWarehouse.percent());
91 break;
92 case TYPE_PERMILLE:
93 addMatcher(fWarehouse.permille());
94 break;
95 case TYPE_CURRENCY_SINGLE:
96 case TYPE_CURRENCY_DOUBLE:
97 case TYPE_CURRENCY_TRIPLE:
98 case TYPE_CURRENCY_QUAD:
99 case TYPE_CURRENCY_QUINT:
100 // All currency symbols use the same matcher
101 addMatcher(fWarehouse.currency(status));
102 break;
103 default:
104 UPRV_UNREACHABLE;
105 }
106
107 } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) {
108 // Case 2: the token is an ignorable literal.
109 // No action necessary: the ignorables matcher has already been added.
110
111 } else {
112 // Case 3: the token is a non-ignorable literal.
113 if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) {
114 addMatcher(*ptr);
115 } else {
116 // OOM; unwind the stack
117 return;
118 }
119 }
120 fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp;
121}
122
123void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
124 if (fMatchersLen >= fMatchers.getCapacity()) {
125 fMatchers.resize(fMatchersLen * 2, fMatchersLen);
126 }
127 fMatchers[fMatchersLen++] = &matcher;
128}
129
/**
 * Produces an AffixPatternMatcher from the matchers collected so far, their count, and the
 * pattern string this builder was created with.
 */
AffixPatternMatcher AffixPatternMatcherBuilder::build() {
    return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern);
}
133
/**
 * Creates a warehouse that reads its configuration from the given setup data.
 * Only the pointer is stored; the accessors below dereference it on each call.
 */
AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
        : fSetupData(setupData) {}
136
137NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
138 return fMinusSign = {fSetupData->dfs, true};
139}
140
141NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
142 return fPlusSign = {fSetupData->dfs, true};
143}
144
145NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
146 return fPercent = {fSetupData->dfs};
147}
148
149NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
150 return fPermille = {fSetupData->dfs};
151}
152
153NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
154 return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status};
155}
156
/**
 * Returns the ignorables matcher held by the setup data. Unlike the symbol accessors,
 * nothing is re-initialized here.
 */
IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
    return fSetupData->ignorables;
}
160
161NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) {
162 if (U_FAILURE(status)) {
163 return nullptr;
164 }
165 auto* result = fCodePoints.create(cp);
166 if (result == nullptr) {
167 status = U_MEMORY_ALLOCATION_ERROR;
168 }
169 return result;
170}
171
172
/** Creates a matcher for exactly one code point. */
CodePointMatcher::CodePointMatcher(UChar32 cp)
        : fCp(cp) {}
175
176bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
177 if (segment.startsWith(fCp)) {
178 segment.adjustOffsetByCodePoint();
179 result.setCharsConsumed(segment);
180 }
181 return false;
182}
183
/** Returns whether the segment starts with this matcher's code point; the segment is not modified. */
bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
    return segment.startsWith(fCp);
}
187
/**
 * Returns a fixed debug label; the code point itself is not included.
 * Relies on the implicit char16_t* to UnicodeString conversion enabled at the top of this file.
 */
UnicodeString CodePointMatcher::toString() const {
    return u"<CodePoint>";
}
191
192
193AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
194 AffixTokenMatcherWarehouse& tokenWarehouse,
195 parse_flags_t parseFlags, bool* success,
196 UErrorCode& status) {
197 if (affixPattern.isEmpty()) {
198 *success = false;
199 return {};
200 }
201 *success = true;
202
203 IgnorablesMatcher* ignorables;
204 if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) {
205 ignorables = nullptr;
206 } else {
207 ignorables = &tokenWarehouse.ignorables();
208 }
209
210 AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
211 AffixUtils::iterateWithConsumer(affixPattern, builder, status);
212 return builder.build();
213}
214
/**
 * Creates a matcher from an already-assembled series of matchers.
 *
 * @param matchers    The matcher array, forwarded to the ArraySeriesMatcher base.
 * @param matchersLen Number of entries in use, forwarded to the ArraySeriesMatcher base.
 * @param pattern     The affix pattern string this series was built from; kept for
 *                    getPattern() and operator==.
 */
AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
                                         const UnicodeString& pattern)
        : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern) {}
218
/**
 * Returns the affix pattern this matcher was built from.
 * NOTE(review): the result comes from toAliasedUnicodeString(), so it presumably aliases
 * storage owned by this matcher rather than copying it — confirm before letting the returned
 * string outlive the matcher.
 */
UnicodeString AffixPatternMatcher::getPattern() const {
    return fPattern.toAliasedUnicodeString();
}
222
/** Two matchers are equal when their pattern strings are equal; the matcher series is not compared. */
bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
    return fPattern == other.fPattern;
}
226
227
/**
 * Creates a warehouse that obtains token matchers from the given token warehouse.
 * Only the pointer is stored; it is dereferenced in createAffixMatchers().
 */
AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
        : fTokenWarehouse(tokenWarehouse) {
}
231
232bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
233 const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
234 UErrorCode& status) {
235 UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
236 UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
237 UnicodeString negPrefixString;
238 UnicodeString negSuffixString;
239 if (patternInfo.hasNegativeSubpattern()) {
240 negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
241 negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
242 }
243
244 if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
245 AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
246 AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
247 AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
248 AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
249 // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
250 // trailing in the pattern string.
251 && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
252 !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
253 !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
254 !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
255 // The affixes contain only symbols and ignorables.
256 // No need to generate affix matchers.
257 return false;
258 }
259 return true;
260}
261
/**
 * Generates the AffixMatchers for the given pattern and adds them, sorted, to the output.
 *
 * Nothing is generated when isInteresting() returns false. Otherwise, for each sign type that
 * is not skipped, a prefix and a suffix AffixPatternMatcher are built and combined into up to
 * three AffixMatchers (paired, prefix-only, suffix-only). All matchers are stored in this
 * object's fAffixPatternMatchers / fAffixMatchers members, and the output receives references
 * to them in compareTo() order.
 *
 * @param patternInfo Provides the affix strings.
 * @param output      Collection that receives the generated matchers.
 * @param ignorables  Forwarded to isInteresting().
 * @param parseFlags  Consulted for PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES and
 *                    PARSE_FLAG_PLUS_SIGN_ALLOWED; also forwarded to the helpers.
 * @param status      Passed to isInteresting() and to fromAffixPattern().
 */
void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
                                                MutableMatcherCollection& output,
                                                const IgnorablesMatcher& ignorables,
                                                parse_flags_t parseFlags, UErrorCode& status) {
    if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
        return;
    }

    // The affixes have interesting characters, or we are in strict mode.
    // NOTE(review): an earlier comment here mentioned an "initial capacity of 6", but no
    // capacity is set in this function; results go straight into the fAffixPatternMatchers and
    // fAffixMatchers members. Confirm those members are sized for the counts produced below.
    UnicodeString sb;
    bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);

    // Next free slot in fAffixMatchers and fAffixPatternMatchers, respectively.
    int32_t numAffixMatchers = 0;
    int32_t numAffixPatternMatchers = 0;

    // Remembered from the PATTERN_SIGN_TYPE_POS iteration so later iterations can detect
    // duplicates. They stay null if that iteration is skipped.
    AffixPatternMatcher* posPrefix = nullptr;
    AffixPatternMatcher* posSuffix = nullptr;

    // Pre-process the affix strings to resolve LDML rules like sign display.
    for (int8_t typeInt = 0; typeInt < PATTERN_SIGN_TYPE_COUNT; typeInt++) {
        auto type = static_cast<PatternSignType>(typeInt);

        // Skip affixes in some cases: POS is used only when the plus sign is not allowed,
        // and POS_SIGN only when it is.
        if (type == PATTERN_SIGN_TYPE_POS
                && 0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
            continue;
        }
        if (type == PATTERN_SIGN_TYPE_POS_SIGN
                && 0 == (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
            continue;
        }

        // Generate Prefix
        // The slot is always assigned, but the index only advances when a prefix exists;
        // otherwise the slot is overwritten by the next assignment.
        bool hasPrefix = false;
        PatternStringUtils::patternInfoToStringBuilder(
                patternInfo, true, type, StandardPlural::OTHER, false, sb);
        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
                sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
        AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
                                                : nullptr;

        // Generate Suffix (same slot-reuse scheme as the prefix)
        bool hasSuffix = false;
        PatternStringUtils::patternInfoToStringBuilder(
                patternInfo, false, type, StandardPlural::OTHER, false, sb);
        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
                sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
        AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
                                                : nullptr;

        if (type == PATTERN_SIGN_TYPE_POS) {
            posPrefix = prefix;
            posSuffix = suffix;
        } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
            // Skip adding these matchers (we already have equivalents)
            // Note that any pattern-matcher slots consumed above remain consumed.
            continue;
        }

        // Flags for setting in the ParsedNumber; the token matchers may add more.
        int flags = (type == PATTERN_SIGN_TYPE_NEG) ? FLAG_NEGATIVE : 0;

        // Note: it is indeed possible for posPrefix and posSuffix to both be null.
        // We still need to add that matcher for strict mode to work.
        fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
        if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
            // The following if statements are designed to prevent adding two identical matchers.
            if (type == PATTERN_SIGN_TYPE_POS || !equals(prefix, posPrefix)) {
                fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
            }
            if (type == PATTERN_SIGN_TYPE_POS || !equals(suffix, posSuffix)) {
                fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
            }
        }
    }

    // Put the AffixMatchers in order, and then add them to the output.
    // Since there are at most 9 elements, do a simple-to-implement bubble sort.
    // Adjacent elements are swapped only when compareTo() is strictly positive, so elements
    // that compare equal keep their relative order.
    bool madeChanges;
    do {
        madeChanges = false;
        for (int32_t i = 1; i < numAffixMatchers; i++) {
            if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
                madeChanges = true;
                AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
                fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
                fAffixMatchers[i] = std::move(temp);
            }
        }
    } while (madeChanges);

    for (int32_t i = 0; i < numAffixMatchers; i++) {
        // Enable the following line to debug affixes
        //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
        output.addMatcher(fAffixMatchers[i]);
    }
}
359
360
/**
 * Creates a matcher for one prefix/suffix combination.
 *
 * @param prefix Prefix matcher, or nullptr for no prefix. Only the pointer is stored.
 * @param suffix Suffix matcher, or nullptr for no suffix. Only the pointer is stored.
 * @param flags  Result flags OR-ed into the ParsedNumber by postProcess() when this affix
 *               is the one that matched.
 */
AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
        : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {}
363
364bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
365 if (!result.seenNumber()) {
366 // Prefix
367 // Do not match if:
368 // 1. We have already seen a prefix (result.prefix != null)
369 // 2. The prefix in this AffixMatcher is empty (prefix == null)
370 if (!result.prefix.isBogus() || fPrefix == nullptr) {
371 return false;
372 }
373
374 // Attempt to match the prefix.
375 int initialOffset = segment.getOffset();
376 bool maybeMore = fPrefix->match(segment, result, status);
377 if (initialOffset != segment.getOffset()) {
378 result.prefix = fPrefix->getPattern();
379 }
380 return maybeMore;
381
382 } else {
383 // Suffix
384 // Do not match if:
385 // 1. We have already seen a suffix (result.suffix != null)
386 // 2. The suffix in this AffixMatcher is empty (suffix == null)
387 // 3. The matched prefix does not equal this AffixMatcher's prefix
388 if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
389 return false;
390 }
391
392 // Attempt to match the suffix.
393 int initialOffset = segment.getOffset();
394 bool maybeMore = fSuffix->match(segment, result, status);
395 if (initialOffset != segment.getOffset()) {
396 result.suffix = fSuffix->getPattern();
397 }
398 return maybeMore;
399 }
400}
401
402bool AffixMatcher::smokeTest(const StringSegment& segment) const {
403 return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
404 (fSuffix != nullptr && fSuffix->smokeTest(segment));
405}
406
407void AffixMatcher::postProcess(ParsedNumber& result) const {
408 // Check to see if our affix is the one that was matched. If so, set the flags in the result.
409 if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) {
410 // Fill in the result prefix and suffix with non-null values (empty string).
411 // Used by strict mode to determine whether an entire affix pair was matched.
412 if (result.prefix.isBogus()) {
413 result.prefix = UnicodeString();
414 }
415 if (result.suffix.isBogus()) {
416 result.suffix = UnicodeString();
417 }
418 result.flags |= fFlags;
419 if (fPrefix != nullptr) {
420 fPrefix->postProcess(result);
421 }
422 if (fSuffix != nullptr) {
423 fSuffix->postProcess(result);
424 }
425 }
426}
427
428int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
429 const AffixMatcher& lhs = *this;
430 if (length(lhs.fPrefix) != length(rhs.fPrefix)) {
431 return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1;
432 } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) {
433 return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1;
434 } else {
435 return 0;
436 }
437}
438
439UnicodeString AffixMatcher::toString() const {
440 bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
441 return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") +
442 (fPrefix ? fPrefix->getPattern() : u"null") + u"#" +
443 (fSuffix ? fSuffix->getPattern() : u"null") + u">";
444
445}
446
447
448#endif /* #if !UCONFIG_NO_FORMATTING */
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474