// numparse_decimal.cpp (ClickHouse/contrib/icu/icu4c/source/i18n/numparse_decimal.cpp)

1	// © 2018 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3
4	#include "unicode/utypes.h"
5
6	#if !UCONFIG_NO_FORMATTING
7
8	// Allow implicit conversion from char16_t* to UnicodeString for this file:
9	// Helpful in toString methods and elsewhere.
10	#define UNISTR_FROM_STRING_EXPLICIT
11
12	#include "numparse_types.h"
13	#include "numparse_decimal.h"
14	#include "static_unicode_sets.h"
15	#include "numparse_utils.h"
16	#include "unicode/uchar.h"
17	#include "putilimp.h"
18	#include "number_decimalquantity.h"
19	#include "string_segment.h"
20
21	using namespace icu;
22	using namespace icu::numparse;
23	using namespace icu::numparse::impl;
24
25
26	DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
27	parse_flags_t parseFlags) {
28	if (`0` != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
29	groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
30	decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
31	} else {
32	groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
33	decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
34	}
35	bool strictSeparators = `0` != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
36	unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
37	: unisets::ALL_SEPARATORS;
38
39	// Attempt to find separators in the static cache
40
41	groupingUniSet = unisets::get(groupingKey);
42	unisets::Key decimalKey = unisets::chooseFrom(
43	decimalSeparator,
44	strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
45	strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
46	if (decimalKey >= `0`) {
47	decimalUniSet = unisets::get(decimalKey);
48	} else if (!decimalSeparator.isEmpty()) {
49	auto* set = new UnicodeSet ();
50	set->add(decimalSeparator.char32At(`0`));
51	set->freeze();
52	decimalUniSet = set;
53	fLocalDecimalUniSet.adoptInstead(set);
54	} else {
55	decimalUniSet = unisets::get(unisets::EMPTY);
56	}
57
58	if (groupingKey >= `0` && decimalKey >= `0`) {
59	// Everything is available in the static cache
60	separatorSet = groupingUniSet;
61	leadSet = unisets::get(
62	strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
63	: unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
64	} else {
65	auto* set = new UnicodeSet ();
66	set->addAll(*groupingUniSet);
67	set->addAll(*decimalUniSet);
68	set->freeze();
69	separatorSet = set;
70	fLocalSeparatorSet.adoptInstead(set);
71	leadSet = nullptr;
72	}
73
74	UChar32 cpZero = symbols.getCodePointZero();
75	if (cpZero == -`1` \|\| !u_isdigit(cpZero) \|\| u_digit(cpZero, `10`) != `0`) {
76	// Uncommon case: okay to allocate.
77	auto digitStrings = new UnicodeString[`10`];
78	fLocalDigitStrings.adoptInstead(digitStrings);
79	for (int32_t i = `0`; i <= `9`; i++) {
80	digitStrings[i] = symbols.getConstDigitSymbol(i);
81	}
82	}
83
84	requireGroupingMatch = `0` != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
85	groupingDisabled = `0` != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
86	integerOnly = `0` != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
87	grouping1 = grouper.getPrimary();
88	grouping2 = grouper.getSecondary();
89
90	// Fraction grouping parsing is disabled for now but could be enabled later.
91	// See http://bugs.icu-project.org/trac/ticket/10794
92	// fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
93	}
94
95	bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
96	return match(segment, result, `0`, status);
97	}
98
99	bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
100	UErrorCode&) const {
101	if (result.seenNumber() && exponentSign == `0`) {
102	// A number has already been consumed.
103	return false;
104	} else if (exponentSign != `0`) {
105	// scientific notation always comes after the number
106	U_ASSERT(!result.quantity.bogus);
107	}
108
109	// Initial offset before any character consumption.
110	int32_t initialOffset = segment.getOffset();
111
112	// Return value: whether to ask for more characters.
113	bool maybeMore = false;
114
115	// All digits consumed so far.
116	number::impl::DecimalQuantity digitsConsumed;
117	digitsConsumed.bogus = true;
118
119	// The total number of digits after the decimal place, used for scaling the result.
120	int32_t digitsAfterDecimalPlace = `0`;
121
122	// The actual grouping and decimal separators used in the string.
123	// If non-null, we have seen that token.
124	UnicodeString actualGroupingString;
125	UnicodeString actualDecimalString;
126	actualGroupingString.setToBogus();
127	actualDecimalString.setToBogus();
128
129	// Information for two groups: the previous group and the current group.
130	//
131	// Each group has three pieces of information:
132	//
133	// Offset: the string position of the beginning of the group, including a leading separator
134	// if there was a leading separator. This is needed in case we need to rewind the parse to
135	// that position.
136	//
137	// Separator type:
138	// 0 => beginning of string
139	// 1 => lead separator is a grouping separator
140	// 2 => lead separator is a decimal separator
141	//
142	// Count: the number of digits in the group. If -1, the group has been validated.
143	int32_t currGroupOffset = `0`;
144	int32_t currGroupSepType = `0`;
145	int32_t currGroupCount = `0`;
146	int32_t prevGroupOffset = -`1`;
147	int32_t prevGroupSepType = -`1`;
148	int32_t prevGroupCount = -`1`;
149
150	while (segment.length() > `0`) {
151	maybeMore = false;
152
153	// Attempt to match a digit.
154	int8_t digit = -`1`;
155
156	// Try by code point digit value.
157	UChar32 cp = segment.getCodePoint();
158	if (u_isdigit(cp)) {
159	segment.adjustOffset(U16_LENGTH(cp));
160	digit = static_cast<int8_t>(u_digit(cp, `10`));
161	}
162
163	// Try by digit string.
164	if (digit == -`1` && !fLocalDigitStrings.isNull()) {
165	for (int32_t i = `0`; i < `10`; i++) {
166	const UnicodeString& str = fLocalDigitStrings [i];
167	if (str.isEmpty()) {
168	continue;
169	}
170	int32_t overlap = segment.getCommonPrefixLength(str);
171	if (overlap == str.length()) {
172	segment.adjustOffset(overlap);
173	digit = static_cast<int8_t>(i);
174	break;
175	}
176	maybeMore = maybeMore \|\| (overlap == segment.length());
177	}
178	}
179
180	if (digit >= `0`) {
181	// Digit was found.
182	if (digitsConsumed.bogus) {
183	digitsConsumed.bogus = false;
184	digitsConsumed.clear();
185	}
186	digitsConsumed.appendDigit(digit, `0`, true);
187	currGroupCount++;
188	if (!actualDecimalString.isBogus()) {
189	digitsAfterDecimalPlace++;
190	}
191	continue;
192	}
193
194	// Attempt to match a literal grouping or decimal separator.
195	bool isDecimal = false;
196	bool isGrouping = false;
197
198	// 1) Attempt the decimal separator string literal.
199	// if (we have not seen a decimal separator yet) { ... }
200	if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
201	int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
202	maybeMore = maybeMore \|\| (overlap == segment.length());
203	if (overlap == decimalSeparator.length()) {
204	isDecimal = true;
205	actualDecimalString = decimalSeparator;
206	}
207	}
208
209	// 2) Attempt to match the actual grouping string literal.
210	if (!actualGroupingString.isBogus()) {
211	int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
212	maybeMore = maybeMore \|\| (overlap == segment.length());
213	if (overlap == actualGroupingString.length()) {
214	isGrouping = true;
215	}
216	}
217
218	// 2.5) Attempt to match a new the grouping separator string literal.
219	// if (we have not seen a grouping or decimal separator yet) { ... }
220	if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
221	!groupingSeparator.isEmpty()) {
222	int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
223	maybeMore = maybeMore \|\| (overlap == segment.length());
224	if (overlap == groupingSeparator.length()) {
225	isGrouping = true;
226	actualGroupingString = groupingSeparator;
227	}
228	}
229
230	// 3) Attempt to match a decimal separator from the equivalence set.
231	// if (we have not seen a decimal separator yet) { ... }
232	// The !isGrouping is to confirm that we haven't yet matched the current character.
233	if (!isGrouping && actualDecimalString.isBogus()) {
234	if (decimalUniSet->contains(cp)) {
235	isDecimal = true;
236	actualDecimalString = UnicodeString (cp);
237	}
238	}
239
240	// 4) Attempt to match a grouping separator from the equivalence set.
241	// if (we have not seen a grouping or decimal separator yet) { ... }
242	if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
243	if (groupingUniSet->contains(cp)) {
244	isGrouping = true;
245	actualGroupingString = UnicodeString (cp);
246	}
247	}
248
249	// Leave if we failed to match this as a separator.
250	if (!isDecimal && !isGrouping) {
251	break;
252	}
253
254	// Check for conditions when we don't want to accept the separator.
255	if (isDecimal && integerOnly) {
256	break;
257	} else if (currGroupSepType == `2` && isGrouping) {
258	// Fraction grouping
259	break;
260	}
261
262	// Validate intermediate grouping sizes.
263	bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
264	bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
265	if (!prevValidSecondary \|\| (isDecimal && !currValidPrimary)) {
266	// Invalid grouping sizes.
267	if (isGrouping && currGroupCount == `0`) {
268	// Trailing grouping separators: these are taken care of below
269	U_ASSERT(currGroupSepType == `1`);
270	} else if (requireGroupingMatch) {
271	// Strict mode: reject the parse
272	digitsConsumed.clear();
273	digitsConsumed.bogus = true;
274	}
275	break;
276	} else if (requireGroupingMatch && currGroupCount == `0` && currGroupSepType == `1`) {
277	break;
278	} else {
279	// Grouping sizes OK so far.
280	prevGroupOffset = currGroupOffset;
281	prevGroupCount = currGroupCount;
282	if (isDecimal) {
283	// Do not validate this group any more.
284	prevGroupSepType = -`1`;
285	} else {
286	prevGroupSepType = currGroupSepType;
287	}
288	}
289
290	// OK to accept the separator.
291	// Special case: don't update currGroup if it is empty; this allows two grouping
292	// separators in a row in lenient mode.
293	if (currGroupCount != `0`) {
294	currGroupOffset = segment.getOffset();
295	}
296	currGroupSepType = isGrouping ? `1` : `2`;
297	currGroupCount = `0`;
298	if (isGrouping) {
299	segment.adjustOffset(actualGroupingString.length());
300	} else {
301	segment.adjustOffset(actualDecimalString.length());
302	}
303	}
304
305	// End of main loop.
306	// Back up if there was a trailing grouping separator.
307	// Shift prev -> curr so we can check it as a final group.
308	if (currGroupSepType != `2` && currGroupCount == `0`) {
309	maybeMore = true;
310	segment.setOffset(currGroupOffset);
311	currGroupOffset = prevGroupOffset;
312	currGroupSepType = prevGroupSepType;
313	currGroupCount = prevGroupCount;
314	prevGroupOffset = -`1`;
315	prevGroupSepType = `0`;
316	prevGroupCount = `1`;
317	}
318
319	// Validate final grouping sizes.
320	bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
321	bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
322	if (!requireGroupingMatch) {
323	// The cases we need to handle here are lone digits.
324	// Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
325	// See more examples in numberformattestspecification.txt
326	int32_t digitsToRemove = `0`;
327	if (!prevValidSecondary) {
328	segment.setOffset(prevGroupOffset);
329	digitsToRemove += prevGroupCount;
330	digitsToRemove += currGroupCount;
331	} else if (!currValidPrimary && (prevGroupSepType != `0` \|\| prevGroupCount != `0`)) {
332	maybeMore = true;
333	segment.setOffset(currGroupOffset);
334	digitsToRemove += currGroupCount;
335	}
336	if (digitsToRemove != `0`) {
337	digitsConsumed.adjustMagnitude(-digitsToRemove);
338	digitsConsumed.truncate();
339	}
340	prevValidSecondary = true;
341	currValidPrimary = true;
342	}
343	if (currGroupSepType != `2` && (!prevValidSecondary \|\| !currValidPrimary)) {
344	// Grouping failure.
345	digitsConsumed.bogus = true;
346	}
347
348	// Strings that start with a separator but have no digits,
349	// or strings that failed a grouping size check.
350	if (digitsConsumed.bogus) {
351	maybeMore = maybeMore \|\| (segment.length() == `0`);
352	segment.setOffset(initialOffset);
353	return maybeMore;
354	}
355
356	// We passed all inspections. Start post-processing.
357
358	// Adjust for fraction part.
359	digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
360
361	// Set the digits, either normal or exponent.
362	if (exponentSign != `0` && segment.getOffset() != initialOffset) {
363	bool overflow = false;
364	if (digitsConsumed.fitsInLong()) {
365	int64_t exponentLong = digitsConsumed.toLong(false);
366	U_ASSERT(exponentLong >= `0`);
367	if (exponentLong <= INT32_MAX) {
368	auto exponentInt = static_cast<int32_t>(exponentLong);
369	if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
370	overflow = true;
371	}
372	} else {
373	overflow = true;
374	}
375	} else {
376	overflow = true;
377	}
378	if (overflow) {
379	if (exponentSign == -`1`) {
380	// Set to zero
381	result.quantity.clear();
382	} else {
383	// Set to infinity
384	result.quantity.bogus = true;
385	result.flags \|= FLAG_INFINITY;
386	}
387	}
388	} else {
389	result.quantity = digitsConsumed;
390	}
391
392	// Set other information into the result and return.
393	if (!actualDecimalString.isBogus()) {
394	result.flags \|= FLAG_HAS_DECIMAL_SEPARATOR;
395	}
396	result.setCharsConsumed(segment);
397	return segment.length() == `0` \|\| maybeMore;
398	}
399
400	bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
401	if (requireGroupingMatch) {
402	if (sepType == -`1`) {
403	// No such group (prevGroup before first shift).
404	return true;
405	} else if (sepType == `0`) {
406	// First group.
407	if (isPrimary) {
408	// No grouping separators is OK.
409	return true;
410	} else {
411	return count != `0` && count <= grouping2;
412	}
413	} else if (sepType == `1`) {
414	// Middle group.
415	if (isPrimary) {
416	return count == grouping1;
417	} else {
418	return count == grouping2;
419	}
420	} else {
421	U_ASSERT(sepType == `2`);
422	// After the decimal separator.
423	return true;
424	}
425	} else {
426	if (sepType == `1`) {
427	// #11230: don't accept middle groups with only 1 digit.
428	return count != `1`;
429	} else {
430	return true;
431	}
432	}
433	}
434
435	bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
436	// The common case uses a static leadSet for efficiency.
437	if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
438	return segment.startsWith(*leadSet);
439	}
440	if (segment.startsWith(*separatorSet) \|\| u_isdigit(segment.getCodePoint())) {
441	return true;
442	}
443	if (fLocalDigitStrings.isNull()) {
444	return false;
445	}
446	for (int32_t i = `0`; i < `10`; i++) {
447	if (segment.startsWith(fLocalDigitStrings [i])) {
448	return true;
449	}
450	}
451	return false;
452	}
453
454	UnicodeString DecimalMatcher::toString() const {
455	return u"<Decimal>";
456	}
457
458
459	#endif /* #if !UCONFIG_NO_FORMATTING */
460

// Source: ClickHouse/contrib/icu/icu4c/source/i18n/numparse_decimal.cpp