1// Copyright 2018 The Abseil Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "absl/strings/internal/charconv_parse.h"
16#include "absl/strings/charconv.h"
17
18#include <cassert>
19#include <cstdint>
20#include <limits>
21
22#include "absl/strings/internal/memutil.h"
23
24namespace absl {
25namespace {
26
27// ParseFloat<10> will read the first 19 significant digits of the mantissa.
28// This number was chosen for multiple reasons.
29//
30// (a) First, for whatever integer type we choose to represent the mantissa, we
31// want to choose the largest possible number of decimal digits for that integer
32// type. We are using uint64_t, which can express any 19-digit unsigned
33// integer.
34//
35// (b) Second, we need to parse enough digits that the binary value of any
36// mantissa we capture has more bits of resolution than the mantissa
37// representation in the target float. Our algorithm requires at least 3 bits
38// of headway, but 19 decimal digits give a little more than that.
39//
40// The following static assertions verify the above comments:
41constexpr int kDecimalMantissaDigitsMax = 19;
42
43static_assert(std::numeric_limits<uint64_t>::digits10 ==
44 kDecimalMantissaDigitsMax,
45 "(a) above");
46
47// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
48static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
49static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
50static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
51
52// The lowest valued 19-digit decimal mantissa we can read still contains
53// sufficient information to reconstruct a binary mantissa.
54static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above");
55
56// ParseFloat<16> will read the first 15 significant digits of the mantissa.
57//
58// Because a base-16-to-base-2 conversion can be done exactly, we do not need
59// to maximize the number of scanned hex digits to improve our conversion. What
60// is required is to scan two more bits than the mantissa can represent, so that
61// we always round correctly.
62//
63// (One extra bit does not suffice to perform correct rounding, since a number
64// exactly halfway between two representable floats has unique rounding rules,
65// so we need to differentiate between a "halfway between" number and a "closer
66// to the larger value" number.)
67constexpr int kHexadecimalMantissaDigitsMax = 15;
68
69// The minimum number of significant bits that will be read from
70// kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since
71// the most significant digit can be a "1", which only contributes a single
72// significant bit.
73constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
74 4 * kHexadecimalMantissaDigitsMax - 3;
75
76static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
77 std::numeric_limits<double>::digits + 2,
78 "kHexadecimalMantissaDigitsMax too small");
79
80// We also impose a limit on the number of significant digits we will read from
81// an exponent, to avoid having to deal with integer overflow. We use 9 for
82// this purpose.
83//
84// If we read a 9 digit exponent, the end result of the conversion will
85// necessarily be infinity or zero, depending on the sign of the exponent.
86// Therefore we can just drop extra digits on the floor without any extra
87// logic.
88constexpr int kDecimalExponentDigitsMax = 9;
89static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
90 "int type too small");
91
92// To avoid incredibly large inputs causing integer overflow for our exponent,
93// we impose an arbitrary but very large limit on the number of significant
94// digits we will accept. The implementation refuses to match a string with
95// more consecutive significant mantissa digits than this.
96constexpr int kDecimalDigitLimit = 50000000;
97
98// Corresponding limit for hexadecimal digit inputs. This is one fourth the
99// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
100// a binary exponent adjustment of 4.
101constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
102
103// The largest exponent we can read is 999999999 (per
104// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
105// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
106// comfortably fits in an integer.
107//
108// We count kDecimalDigitLimit twice because there are independent limits for
109// numbers before and after the decimal point. (In the case where there are no
110// significant digits before the decimal point, there are independent limits for
111// post-decimal-point leading zeroes and for significant digits.)
112static_assert(999999999 + 2 * kDecimalDigitLimit <
113 std::numeric_limits<int>::max(),
114 "int type too small");
115static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
116 std::numeric_limits<int>::max(),
117 "int type too small");
118
119// Returns true if the provided bitfield allows parsing an exponent value
120// (e.g., "1.5e100").
121bool AllowExponent(chars_format flags) {
122 bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
123 bool scientific =
124 (flags & chars_format::scientific) == chars_format::scientific;
125 return scientific || !fixed;
126}
127
128// Returns true if the provided bitfield requires an exponent value be present.
129bool RequireExponent(chars_format flags) {
130 bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
131 bool scientific =
132 (flags & chars_format::scientific) == chars_format::scientific;
133 return scientific && !fixed;
134}
135
136const int8_t kAsciiToInt[256] = {
137 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
138 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
139 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
140 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
141 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
142 -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
143 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
144 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
145 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
146 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
147 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
148 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
149 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
150 -1, -1, -1, -1, -1, -1, -1, -1, -1};
151
152// Returns true if `ch` is a digit in the given base
153template <int base>
154bool IsDigit(char ch);
155
156// Converts a valid `ch` to its digit value in the given base.
157template <int base>
158unsigned ToDigit(char ch);
159
160// Returns true if `ch` is the exponent delimiter for the given base.
161template <int base>
162bool IsExponentCharacter(char ch);
163
164// Returns the maximum number of significant digits we will read for a float
165// in the given base.
166template <int base>
167constexpr int MantissaDigitsMax();
168
169// Returns the largest consecutive run of digits we will accept when parsing a
170// number in the given base.
171template <int base>
172constexpr int DigitLimit();
173
174// Returns the amount the exponent must be adjusted by for each dropped digit.
175// (For decimal this is 1, since the digits are in base 10 and the exponent base
176// is also 10, but for hexadecimal this is 4, since the digits are base 16 but
177// the exponent base is 2.)
178template <int base>
179constexpr int DigitMagnitude();
180
181template <>
182bool IsDigit<10>(char ch) {
183 return ch >= '0' && ch <= '9';
184}
185template <>
186bool IsDigit<16>(char ch) {
187 return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
188}
189
190template <>
191unsigned ToDigit<10>(char ch) {
192 return ch - '0';
193}
194template <>
195unsigned ToDigit<16>(char ch) {
196 return kAsciiToInt[static_cast<unsigned char>(ch)];
197}
198
199template <>
200bool IsExponentCharacter<10>(char ch) {
201 return ch == 'e' || ch == 'E';
202}
203
204template <>
205bool IsExponentCharacter<16>(char ch) {
206 return ch == 'p' || ch == 'P';
207}
208
209template <>
210constexpr int MantissaDigitsMax<10>() {
211 return kDecimalMantissaDigitsMax;
212}
213template <>
214constexpr int MantissaDigitsMax<16>() {
215 return kHexadecimalMantissaDigitsMax;
216}
217
218template <>
219constexpr int DigitLimit<10>() {
220 return kDecimalDigitLimit;
221}
222template <>
223constexpr int DigitLimit<16>() {
224 return kHexadecimalDigitLimit;
225}
226
227template <>
228constexpr int DigitMagnitude<10>() {
229 return 1;
230}
231template <>
232constexpr int DigitMagnitude<16>() {
233 return 4;
234}
235
236// Reads decimal digits from [begin, end) into *out. Returns the number of
237// digits consumed.
238//
239// After max_digits has been read, keeps consuming characters, but no longer
240// adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit
241// is set; otherwise, it is left unmodified.
242//
243// If no digits are matched, returns 0 and leaves *out unchanged.
244//
245// ConsumeDigits does not protect against overflow on *out; max_digits must
246// be chosen with respect to type T to avoid the possibility of overflow.
247template <int base, typename T>
248std::size_t ConsumeDigits(const char* begin, const char* end, int max_digits,
249 T* out, bool* dropped_nonzero_digit) {
250 if (base == 10) {
251 assert(max_digits <= std::numeric_limits<T>::digits10);
252 } else if (base == 16) {
253 assert(max_digits * 4 <= std::numeric_limits<T>::digits);
254 }
255 const char* const original_begin = begin;
256 T accumulator = *out;
257 const char* significant_digits_end =
258 (end - begin > max_digits) ? begin + max_digits : end;
259 while (begin < significant_digits_end && IsDigit<base>(*begin)) {
260 // Do not guard against *out overflow; max_digits was chosen to avoid this.
261 // Do assert against it, to detect problems in debug builds.
262 auto digit = static_cast<T>(ToDigit<base>(*begin));
263 assert(accumulator * base >= accumulator);
264 accumulator *= base;
265 assert(accumulator + digit >= accumulator);
266 accumulator += digit;
267 ++begin;
268 }
269 bool dropped_nonzero = false;
270 while (begin < end && IsDigit<base>(*begin)) {
271 dropped_nonzero = dropped_nonzero || (*begin != '0');
272 ++begin;
273 }
274 if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
275 *dropped_nonzero_digit = true;
276 }
277 *out = accumulator;
278 return begin - original_begin;
279}
280
281// Returns true if `v` is one of the chars allowed inside parentheses following
282// a NaN.
283bool IsNanChar(char v) {
284 return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
285 (v >= 'A' && v <= 'Z');
286}
287
288// Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If
289// one is found, sets `out` appropriately and returns true.
290bool ParseInfinityOrNan(const char* begin, const char* end,
291 strings_internal::ParsedFloat* out) {
292 if (end - begin < 3) {
293 return false;
294 }
295 switch (*begin) {
296 case 'i':
297 case 'I': {
298 // An infinity std::string consists of the characters "inf" or "infinity",
299 // case insensitive.
300 if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
301 return false;
302 }
303 out->type = strings_internal::FloatType::kInfinity;
304 if (end - begin >= 8 &&
305 strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
306 out->end = begin + 8;
307 } else {
308 out->end = begin + 3;
309 }
310 return true;
311 }
312 case 'n':
313 case 'N': {
314 // A NaN consists of the characters "nan", case insensitive, optionally
315 // followed by a parenthesized sequence of zero or more alphanumeric
316 // characters and/or underscores.
317 if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
318 return false;
319 }
320 out->type = strings_internal::FloatType::kNan;
321 out->end = begin + 3;
322 // NaN is allowed to be followed by a parenthesized std::string, consisting of
323 // only the characters [a-zA-Z0-9_]. Match that if it's present.
324 begin += 3;
325 if (begin < end && *begin == '(') {
326 const char* nan_begin = begin + 1;
327 while (nan_begin < end && IsNanChar(*nan_begin)) {
328 ++nan_begin;
329 }
330 if (nan_begin < end && *nan_begin == ')') {
331 // We found an extra NaN specifier range
332 out->subrange_begin = begin + 1;
333 out->subrange_end = nan_begin;
334 out->end = nan_begin + 1;
335 }
336 }
337 return true;
338 }
339 default:
340 return false;
341 }
342}
343} // namespace
344
345namespace strings_internal {
346
347template <int base>
348strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
349 chars_format format_flags) {
350 strings_internal::ParsedFloat result;
351
352 // Exit early if we're given an empty range.
353 if (begin == end) return result;
354
355 // Handle the infinity and NaN cases.
356 if (ParseInfinityOrNan(begin, end, &result)) {
357 return result;
358 }
359
360 const char* const mantissa_begin = begin;
361 while (begin < end && *begin == '0') {
362 ++begin; // skip leading zeros
363 }
364 uint64_t mantissa = 0;
365
366 int exponent_adjustment = 0;
367 bool mantissa_is_inexact = false;
368 std::size_t pre_decimal_digits = ConsumeDigits<base>(
369 begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
370 begin += pre_decimal_digits;
371 int digits_left;
372 if (pre_decimal_digits >= DigitLimit<base>()) {
373 // refuse to parse pathological inputs
374 return result;
375 } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
376 // We dropped some non-fraction digits on the floor. Adjust our exponent
377 // to compensate.
378 exponent_adjustment =
379 static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
380 digits_left = 0;
381 } else {
382 digits_left =
383 static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
384 }
385 if (begin < end && *begin == '.') {
386 ++begin;
387 if (mantissa == 0) {
388 // If we haven't seen any nonzero digits yet, keep skipping zeros. We
389 // have to adjust the exponent to reflect the changed place value.
390 const char* begin_zeros = begin;
391 while (begin < end && *begin == '0') {
392 ++begin;
393 }
394 std::size_t zeros_skipped = begin - begin_zeros;
395 if (zeros_skipped >= DigitLimit<base>()) {
396 // refuse to parse pathological inputs
397 return result;
398 }
399 exponent_adjustment -= static_cast<int>(zeros_skipped);
400 }
401 std::size_t post_decimal_digits = ConsumeDigits<base>(
402 begin, end, digits_left, &mantissa, &mantissa_is_inexact);
403 begin += post_decimal_digits;
404
405 // Since `mantissa` is an integer, each significant digit we read after
406 // the decimal point requires an adjustment to the exponent. "1.23e0" will
407 // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
408 // "123e-2").
409 if (post_decimal_digits >= DigitLimit<base>()) {
410 // refuse to parse pathological inputs
411 return result;
412 } else if (post_decimal_digits > digits_left) {
413 exponent_adjustment -= digits_left;
414 } else {
415 exponent_adjustment -= post_decimal_digits;
416 }
417 }
418 // If we've found no mantissa whatsoever, this isn't a number.
419 if (mantissa_begin == begin) {
420 return result;
421 }
422 // A bare "." doesn't count as a mantissa either.
423 if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
424 return result;
425 }
426
427 if (mantissa_is_inexact) {
428 // We dropped significant digits on the floor. Handle this appropriately.
429 if (base == 10) {
430 // If we truncated significant decimal digits, store the full range of the
431 // mantissa for future big integer math for exact rounding.
432 result.subrange_begin = mantissa_begin;
433 result.subrange_end = begin;
434 } else if (base == 16) {
435 // If we truncated hex digits, reflect this fact by setting the low
436 // ("sticky") bit. This allows for correct rounding in all cases.
437 mantissa |= 1;
438 }
439 }
440 result.mantissa = mantissa;
441
442 const char* const exponent_begin = begin;
443 result.literal_exponent = 0;
444 bool found_exponent = false;
445 if (AllowExponent(format_flags) && begin < end &&
446 IsExponentCharacter<base>(*begin)) {
447 bool negative_exponent = false;
448 ++begin;
449 if (begin < end && *begin == '-') {
450 negative_exponent = true;
451 ++begin;
452 } else if (begin < end && *begin == '+') {
453 ++begin;
454 }
455 const char* const exponent_digits_begin = begin;
456 // Exponent is always expressed in decimal, even for hexadecimal floats.
457 begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
458 &result.literal_exponent, nullptr);
459 if (begin == exponent_digits_begin) {
460 // there were no digits where we expected an exponent. We failed to read
461 // an exponent and should not consume the 'e' after all. Rewind 'begin'.
462 found_exponent = false;
463 begin = exponent_begin;
464 } else {
465 found_exponent = true;
466 if (negative_exponent) {
467 result.literal_exponent = -result.literal_exponent;
468 }
469 }
470 }
471
472 if (!found_exponent && RequireExponent(format_flags)) {
473 // Provided flags required an exponent, but none was found. This results
474 // in a failure to scan.
475 return result;
476 }
477
478 // Success!
479 result.type = strings_internal::FloatType::kNumber;
480 if (result.mantissa > 0) {
481 result.exponent = result.literal_exponent +
482 (DigitMagnitude<base>() * exponent_adjustment);
483 } else {
484 result.exponent = 0;
485 }
486 result.end = begin;
487 return result;
488}
489
490template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
491 chars_format format_flags);
492template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
493 chars_format format_flags);
494
495} // namespace strings_internal
496} // namespace absl
497