charconv_parse.cc source code [Abseil/strings/internal/charconv_parse.cc]

1	// Copyright 2018 The Abseil Authors.
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// https://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	#include "absl/strings/internal/charconv_parse.h"
16	#include "absl/strings/charconv.h"
17
18	#include <cassert>
19	#include <cstdint>
20	#include <limits>
21
22	#include "absl/strings/internal/memutil.h"
23
24	namespace absl {
25	namespace {
26
27	// ParseFloat<10> will read the first 19 significant digits of the mantissa.
28	// This number was chosen for multiple reasons.
29	//
30	// (a) First, for whatever integer type we choose to represent the mantissa, we
31	// want to choose the largest possible number of decimal digits for that integer
32	// type. We are using uint64_t, which can express any 19-digit unsigned
33	// integer.
34	//
35	// (b) Second, we need to parse enough digits that the binary value of any
36	// mantissa we capture has more bits of resolution than the mantissa
37	// representation in the target float. Our algorithm requires at least 3 bits
38	// of headway, but 19 decimal digits give a little more than that.
39	//
40	// The following static assertions verify the above comments:
// Maximum number of significant decimal mantissa digits ParseFloat<10> reads.
constexpr int kDecimalMantissaDigitsMax = 19;

static_assert(std::numeric_limits<uint64_t>::digits10 ==
                  kDecimalMantissaDigitsMax,
              "(a) above");

// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");

// The lowest valued 19-digit decimal mantissa we can read still contains
// sufficient information to reconstruct a binary mantissa.
static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above");
55
56	// ParseFloat<16> will read the first 15 significant digits of the mantissa.
57	//
58	// Because a base-16-to-base-2 conversion can be done exactly, we do not need
59	// to maximize the number of scanned hex digits to improve our conversion. What
60	// is required is to scan two more bits than the mantissa can represent, so that
61	// we always round correctly.
62	//
63	// (One extra bit does not suffice to perform correct rounding, since a number
64	// exactly halfway between two representable floats has unique rounding rules,
65	// so we need to differentiate between a "halfway between" number and a "closer
66	// to the larger value" number.)
// Maximum number of significant hexadecimal mantissa digits ParseFloat<16>
// reads.
constexpr int kHexadecimalMantissaDigitsMax = 15;

// The minimum number of significant bits that will be read from
// kHexadecimalMantissaDigitsMax hex digits.  We must subtract by three, since
// the most significant digit can be a "1", which only contributes a single
// significant bit.
constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
    4 * kHexadecimalMantissaDigitsMax - 3;

static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
                  std::numeric_limits<double>::digits + 2,
              "kHexadecimalMantissaDigitsMax too small");
79
80	// We also impose a limit on the number of significant digits we will read from
81	// an exponent, to avoid having to deal with integer overflow. We use 9 for
82	// this purpose.
83	//
84	// If we read a 9 digit exponent, the end result of the conversion will
85	// necessarily be infinity or zero, depending on the sign of the exponent.
86	// Therefore we can just drop extra digits on the floor without any extra
87	// logic.
// Maximum number of significant digits read from a literal exponent.
constexpr int kDecimalExponentDigitsMax = 9;
static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
              "int type too small");
91
92	// To avoid incredibly large inputs causing integer overflow for our exponent,
93	// we impose an arbitrary but very large limit on the number of significant
94	// digits we will accept. The implementation refuses to match a string with
95	// more consecutive significant mantissa digits than this.
// Upper bound on the length of a consecutive run of decimal mantissa digits
// that will be accepted.
constexpr int kDecimalDigitLimit = 50000000;

// Corresponding limit for hexadecimal digit inputs.  This is one fourth the
// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
// a binary exponent adjustment of 4.
constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;

// The largest exponent we can read is 999999999 (per
// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
// comfortably fits in an integer.
//
// We count kDecimalDigitLimit twice because there are independent limits for
// numbers before and after the decimal point.  (In the case where there are no
// significant digits before the decimal point, there are independent limits for
// post-decimal-point leading zeroes and for significant digits.)
static_assert(999999999 + 2 * kDecimalDigitLimit <
                  std::numeric_limits<int>::max(),
              "int type too small");
static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
                  std::numeric_limits<int>::max(),
              "int type too small");
118
119	// Returns true if the provided bitfield allows parsing an exponent value
120	// (e.g., "1.5e100").
121	bool AllowExponent(chars_format flags) {
122	bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
123	bool scientific =
124	(flags & chars_format::scientific) == chars_format::scientific;
125	return scientific \|\| !fixed;
126	}
127
128	// Returns true if the provided bitfield requires an exponent value be present.
129	bool RequireExponent(chars_format flags) {
130	bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
131	bool scientific =
132	(flags & chars_format::scientific) == chars_format::scientific;
133	return scientific && !fixed;
134	}
135
// Maps each possible unsigned char value to its hexadecimal digit value
// ('0'-'9' -> 0-9, 'A'-'F' and 'a'-'f' -> 10-15), or to -1 when the character
// is not a hexadecimal digit.  Laid out as 16 rows of 16 entries; the row
// comments give the index range covered.
constexpr int8_t kAsciiToInt[256] = {
    // 0x00 - 0x0F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x10 - 0x1F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x20 - 0x2F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x30 - 0x3F: '0'..'9'
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
    // 0x40 - 0x4F: 'A'..'F' at 0x41..0x46
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x50 - 0x5F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x60 - 0x6F: 'a'..'f' at 0x61..0x66
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x70 - 0x7F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x80 - 0x8F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0x90 - 0x9F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xA0 - 0xAF
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xB0 - 0xBF
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xC0 - 0xCF
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xD0 - 0xDF
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xE0 - 0xEF
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xF0 - 0xFF
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
151
152	// Returns true if `ch` is a digit in the given base
153	template <int base>
154	bool IsDigit(char ch);
155
156	// Converts a valid `ch` to its digit value in the given base.
157	template <int base>
158	unsigned ToDigit(char ch);
159
160	// Returns true if `ch` is the exponent delimiter for the given base.
161	template <int base>
162	bool IsExponentCharacter(char ch);
163
164	// Returns the maximum number of significant digits we will read for a float
165	// in the given base.
166	template <int base>
167	constexpr int MantissaDigitsMax();
168
169	// Returns the largest consecutive run of digits we will accept when parsing a
170	// number in the given base.
171	template <int base>
172	constexpr int DigitLimit();
173
174	// Returns the amount the exponent must be adjusted by for each dropped digit.
175	// (For decimal this is 1, since the digits are in base 10 and the exponent base
176	// is also 10, but for hexadecimal this is 4, since the digits are base 16 but
177	// the exponent base is 2.)
178	template <int base>
179	constexpr int DigitMagnitude();
180
181	template <>
182	bool IsDigit<`10`>(char ch) {
183	return ch >= `'0'` && ch <= `'9'`;
184	}
185	template <>
186	bool IsDigit<`16`>(char ch) {
187	return kAsciiToInt[static_cast<unsigned char>(ch)] >= `0`;
188	}
189
190	template <>
191	unsigned ToDigit<`10`>(char ch) {
192	return ch - `'0'`;
193	}
194	template <>
195	unsigned ToDigit<`16`>(char ch) {
196	return kAsciiToInt[static_cast<unsigned char>(ch)];
197	}
198
199	template <>
200	bool IsExponentCharacter<`10`>(char ch) {
201	return ch == `'e'` \|\| ch == `'E'`;
202	}
203
204	template <>
205	bool IsExponentCharacter<`16`>(char ch) {
206	return ch == `'p'` \|\| ch == `'P'`;
207	}
208
209	template <>
210	constexpr int MantissaDigitsMax<`10`>() {
211	return kDecimalMantissaDigitsMax;
212	}
213	template <>
214	constexpr int MantissaDigitsMax<`16`>() {
215	return kHexadecimalMantissaDigitsMax;
216	}
217
218	template <>
219	constexpr int DigitLimit<`10`>() {
220	return kDecimalDigitLimit;
221	}
222	template <>
223	constexpr int DigitLimit<`16`>() {
224	return kHexadecimalDigitLimit;
225	}
226
227	template <>
228	constexpr int DigitMagnitude<`10`>() {
229	return `1`;
230	}
231	template <>
232	constexpr int DigitMagnitude<`16`>() {
233	return `4`;
234	}
235
236	// Reads decimal digits from [begin, end) into out. Returns the number of*
237	// digits consumed.
238	//
239	// After max_digits has been read, keeps consuming characters, but no longer
240	// adjusts out. If a nonzero digit is dropped this way, dropped_nonzero_digit
241	// is set; otherwise, it is left unmodified.
242	//
243	// If no digits are matched, returns 0 and leaves out unchanged.*
244	//
245	// ConsumeDigits does not protect against overflow on out; max_digits must*
246	// be chosen with respect to type T to avoid the possibility of overflow.
247	template <int base, typename T>
248	std::size_t ConsumeDigits(const char* begin, const char* end, int max_digits,
249	T* out, bool* dropped_nonzero_digit) {
250	if (base == `10`) {
251	assert(max_digits <= std::numeric_limits<T>::digits10);
252	} else if (base == `16`) {
253	assert(max_digits * `4` <= std::numeric_limits<T>::digits);
254	}
255	const char* const original_begin = begin;
256	T accumulator = *out;
257	const char* significant_digits_end =
258	(end - begin > max_digits) ? begin + max_digits : end;
259	while (begin < significant_digits_end && IsDigit<base>(*begin)) {
260	// Do not guard against out overflow; max_digits was chosen to avoid this.*
261	// Do assert against it, to detect problems in debug builds.
262	auto digit = static_cast<T>(ToDigit<base>(*begin));
263	assert(accumulator * base >= accumulator);
264	accumulator *= base;
265	assert(accumulator + digit >= accumulator);
266	accumulator += digit;
267	++begin;
268	}
269	bool dropped_nonzero = false;
270	while (begin < end && IsDigit<base>(*begin)) {
271	dropped_nonzero = dropped_nonzero \|\| (*begin != `'0'`);
272	++begin;
273	}
274	if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
275	dropped_nonzero_digit = true*;
276	}
277	*out = accumulator;
278	return begin - original_begin;
279	}
280
// Returns true if `v` is one of the chars allowed inside parentheses following
// a NaN, i.e. an underscore, a decimal digit, or an ASCII letter.
bool IsNanChar(char v) {
  return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
         (v >= 'A' && v <= 'Z');
}
287
288	// Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If
289	// one is found, sets `out` appropriately and returns true.
290	bool ParseInfinityOrNan(const char* begin, const char* end,
291	strings_internal::ParsedFloat* out) {
292	if (end - begin < `3`) {
293	return false;
294	}
295	switch (*begin) {
296	case `'i'`:
297	case `'I'`: {
298	// An infinity std::string consists of the characters "inf" or "infinity",
299	// case insensitive.
300	if (strings_internal::memcasecmp(begin + `1`, "nf", `2`) != `0`) {
301	return false;
302	}
303	out->type = strings_internal::FloatType::kInfinity;
304	if (end - begin >= `8` &&
305	strings_internal::memcasecmp(begin + `3`, "inity", `5`) == `0`) {
306	out->end = begin + `8`;
307	} else {
308	out->end = begin + `3`;
309	}
310	return true;
311	}
312	case `'n'`:
313	case `'N'`: {
314	// A NaN consists of the characters "nan", case insensitive, optionally
315	// followed by a parenthesized sequence of zero or more alphanumeric
316	// characters and/or underscores.
317	if (strings_internal::memcasecmp(begin + `1`, "an", `2`) != `0`) {
318	return false;
319	}
320	out->type = strings_internal::FloatType::kNan;
321	out->end = begin + `3`;
322	// NaN is allowed to be followed by a parenthesized std::string, consisting of
323	// only the characters [a-zA-Z0-9_]. Match that if it's present.
324	begin += `3`;
325	if (begin < end && *begin == `'('`) {
326	const char* nan_begin = begin + `1`;
327	while (nan_begin < end && IsNanChar(*nan_begin)) {
328	++nan_begin;
329	}
330	if (nan_begin < end && *nan_begin == `')'`) {
331	// We found an extra NaN specifier range
332	out->subrange_begin = begin + `1`;
333	out->subrange_end = nan_begin;
334	out->end = nan_begin + `1`;
335	}
336	}
337	return true;
338	}
339	default:
340	return false;
341	}
342	}
343	} // namespace
344
345	namespace strings_internal {
346
347	template <int base>
348	strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
349	chars_format format_flags) {
350	strings_internal::ParsedFloat result;
351
352	// Exit early if we're given an empty range.
353	if (begin == end) return result;
354
355	// Handle the infinity and NaN cases.
356	if (ParseInfinityOrNan(begin, end, &result)) {
357	return result;
358	}
359
360	const char* const mantissa_begin = begin;
361	while (begin < end && *begin == `'0'`) {
362	++begin; // skip leading zeros
363	}
364	uint64_t mantissa = `0`;
365
366	int exponent_adjustment = `0`;
367	bool mantissa_is_inexact = false;
368	std::size_t pre_decimal_digits = ConsumeDigits<base>(
369	begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
370	begin += pre_decimal_digits;
371	int digits_left;
372	if (pre_decimal_digits >= DigitLimit<base>()) {
373	// refuse to parse pathological inputs
374	return result;
375	} else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
376	// We dropped some non-fraction digits on the floor. Adjust our exponent
377	// to compensate.
378	exponent_adjustment =
379	static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
380	digits_left = `0`;
381	} else {
382	digits_left =
383	static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
384	}
385	if (begin < end && *begin == `'.'`) {
386	++begin;
387	if (mantissa == `0`) {
388	// If we haven't seen any nonzero digits yet, keep skipping zeros. We
389	// have to adjust the exponent to reflect the changed place value.
390	const char* begin_zeros = begin;
391	while (begin < end && *begin == `'0'`) {
392	++begin;
393	}
394	std::size_t zeros_skipped = begin - begin_zeros;
395	if (zeros_skipped >= DigitLimit<base>()) {
396	// refuse to parse pathological inputs
397	return result;
398	}
399	exponent_adjustment -= static_cast<int>(zeros_skipped);
400	}
401	std::size_t post_decimal_digits = ConsumeDigits<base>(
402	begin, end, digits_left, &mantissa, &mantissa_is_inexact);
403	begin += post_decimal_digits;
404
405	// Since `mantissa` is an integer, each significant digit we read after
406	// the decimal point requires an adjustment to the exponent. "1.23e0" will
407	// be stored as `mantissa` == 123 and `exponent` == -2 (that is,
408	// "123e-2").
409	if (post_decimal_digits >= DigitLimit<base>()) {
410	// refuse to parse pathological inputs
411	return result;
412	} else if (post_decimal_digits > digits_left) {
413	exponent_adjustment -= digits_left;
414	} else {
415	exponent_adjustment -= post_decimal_digits;
416	}
417	}
418	// If we've found no mantissa whatsoever, this isn't a number.
419	if (mantissa_begin == begin) {
420	return result;
421	}
422	// A bare "." doesn't count as a mantissa either.
423	if (begin - mantissa_begin == `1` && *mantissa_begin == `'.'`) {
424	return result;
425	}
426
427	if (mantissa_is_inexact) {
428	// We dropped significant digits on the floor. Handle this appropriately.
429	if (base == `10`) {
430	// If we truncated significant decimal digits, store the full range of the
431	// mantissa for future big integer math for exact rounding.
432	result.subrange_begin = mantissa_begin;
433	result.subrange_end = begin;
434	} else if (base == `16`) {
435	// If we truncated hex digits, reflect this fact by setting the low
436	// ("sticky") bit. This allows for correct rounding in all cases.
437	mantissa \|= `1`;
438	}
439	}
440	result.mantissa = mantissa;
441
442	const char* const exponent_begin = begin;
443	result.literal_exponent = `0`;
444	bool found_exponent = false;
445	if (AllowExponent(format_flags) && begin < end &&
446	IsExponentCharacter<base>(*begin)) {
447	bool negative_exponent = false;
448	++begin;
449	if (begin < end && *begin == `'-'`) {
450	negative_exponent = true;
451	++begin;
452	} else if (begin < end && *begin == `'+'`) {
453	++begin;
454	}
455	const char* const exponent_digits_begin = begin;
456	// Exponent is always expressed in decimal, even for hexadecimal floats.
457	begin += ConsumeDigits<`10`>(begin, end, kDecimalExponentDigitsMax,
458	&result.literal_exponent, nullptr);
459	if (begin == exponent_digits_begin) {
460	// there were no digits where we expected an exponent. We failed to read
461	// an exponent and should not consume the 'e' after all. Rewind 'begin'.
462	found_exponent = false;
463	begin = exponent_begin;
464	} else {
465	found_exponent = true;
466	if (negative_exponent) {
467	result.literal_exponent = -result.literal_exponent;
468	}
469	}
470	}
471
472	if (!found_exponent && RequireExponent(format_flags)) {
473	// Provided flags required an exponent, but none was found. This results
474	// in a failure to scan.
475	return result;
476	}
477
478	// Success!
479	result.type = strings_internal::FloatType::kNumber;
480	if (result.mantissa > `0`) {
481	result.exponent = result.literal_exponent +
482	(DigitMagnitude<base>() * exponent_adjustment);
483	} else {
484	result.exponent = `0`;
485	}
486	result.end = begin;
487	return result;
488	}
489
490	template ParsedFloat ParseFloat<`10`>(const char* begin, const char* end,
491	chars_format format_flags);
492	template ParsedFloat ParseFloat<`16`>(const char* begin, const char* end,
493	chars_format format_flags);
494
495	} // namespace strings_internal
496	} // namespace absl
497

Browse the source code of Abseil/strings/internal/charconv_parse.cc