1 | // Copyright 2018 The Abseil Authors. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | #include "absl/strings/internal/charconv_parse.h" |
16 | #include "absl/strings/charconv.h" |
17 | |
18 | #include <cassert> |
19 | #include <cstdint> |
20 | #include <limits> |
21 | |
22 | #include "absl/strings/internal/memutil.h" |
23 | |
24 | namespace absl { |
25 | namespace { |
26 | |
27 | // ParseFloat<10> will read the first 19 significant digits of the mantissa. |
28 | // This number was chosen for multiple reasons. |
29 | // |
30 | // (a) First, for whatever integer type we choose to represent the mantissa, we |
31 | // want to choose the largest possible number of decimal digits for that integer |
32 | // type. We are using uint64_t, which can express any 19-digit unsigned |
33 | // integer. |
34 | // |
35 | // (b) Second, we need to parse enough digits that the binary value of any |
36 | // mantissa we capture has more bits of resolution than the mantissa |
37 | // representation in the target float. Our algorithm requires at least 3 bits |
38 | // of headway, but 19 decimal digits give a little more than that. |
39 | // |
40 | // The following static assertions verify the above comments: |
41 | constexpr int kDecimalMantissaDigitsMax = 19; |
42 | |
43 | static_assert(std::numeric_limits<uint64_t>::digits10 == |
44 | kDecimalMantissaDigitsMax, |
45 | "(a) above" ); |
46 | |
47 | // IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa. |
48 | static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed" ); |
49 | static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact" ); |
50 | static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact" ); |
51 | |
52 | // The lowest valued 19-digit decimal mantissa we can read still contains |
53 | // sufficient information to reconstruct a binary mantissa. |
54 | static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above" ); |
55 | |
56 | // ParseFloat<16> will read the first 15 significant digits of the mantissa. |
57 | // |
58 | // Because a base-16-to-base-2 conversion can be done exactly, we do not need |
59 | // to maximize the number of scanned hex digits to improve our conversion. What |
60 | // is required is to scan two more bits than the mantissa can represent, so that |
61 | // we always round correctly. |
62 | // |
63 | // (One extra bit does not suffice to perform correct rounding, since a number |
64 | // exactly halfway between two representable floats has unique rounding rules, |
65 | // so we need to differentiate between a "halfway between" number and a "closer |
66 | // to the larger value" number.) |
67 | constexpr int kHexadecimalMantissaDigitsMax = 15; |
68 | |
69 | // The minimum number of significant bits that will be read from |
70 | // kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since |
71 | // the most significant digit can be a "1", which only contributes a single |
72 | // significant bit. |
73 | constexpr int kGuaranteedHexadecimalMantissaBitPrecision = |
74 | 4 * kHexadecimalMantissaDigitsMax - 3; |
75 | |
76 | static_assert(kGuaranteedHexadecimalMantissaBitPrecision > |
77 | std::numeric_limits<double>::digits + 2, |
78 | "kHexadecimalMantissaDigitsMax too small" ); |
79 | |
80 | // We also impose a limit on the number of significant digits we will read from |
81 | // an exponent, to avoid having to deal with integer overflow. We use 9 for |
82 | // this purpose. |
83 | // |
84 | // If we read a 9 digit exponent, the end result of the conversion will |
85 | // necessarily be infinity or zero, depending on the sign of the exponent. |
86 | // Therefore we can just drop extra digits on the floor without any extra |
87 | // logic. |
88 | constexpr int kDecimalExponentDigitsMax = 9; |
89 | static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax, |
90 | "int type too small" ); |
91 | |
92 | // To avoid incredibly large inputs causing integer overflow for our exponent, |
93 | // we impose an arbitrary but very large limit on the number of significant |
94 | // digits we will accept. The implementation refuses to match a string with |
95 | // more consecutive significant mantissa digits than this. |
96 | constexpr int kDecimalDigitLimit = 50000000; |
97 | |
98 | // Corresponding limit for hexadecimal digit inputs. This is one fourth the |
99 | // amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires |
100 | // a binary exponent adjustment of 4. |
101 | constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4; |
102 | |
103 | // The largest exponent we can read is 999999999 (per |
104 | // kDecimalExponentDigitsMax), and the largest exponent adjustment we can get |
105 | // from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these |
106 | // comfortably fits in an integer. |
107 | // |
108 | // We count kDecimalDigitLimit twice because there are independent limits for |
109 | // numbers before and after the decimal point. (In the case where there are no |
110 | // significant digits before the decimal point, there are independent limits for |
111 | // post-decimal-point leading zeroes and for significant digits.) |
112 | static_assert(999999999 + 2 * kDecimalDigitLimit < |
113 | std::numeric_limits<int>::max(), |
114 | "int type too small" ); |
115 | static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) < |
116 | std::numeric_limits<int>::max(), |
117 | "int type too small" ); |
118 | |
119 | // Returns true if the provided bitfield allows parsing an exponent value |
120 | // (e.g., "1.5e100"). |
121 | bool AllowExponent(chars_format flags) { |
122 | bool fixed = (flags & chars_format::fixed) == chars_format::fixed; |
123 | bool scientific = |
124 | (flags & chars_format::scientific) == chars_format::scientific; |
125 | return scientific || !fixed; |
126 | } |
127 | |
128 | // Returns true if the provided bitfield requires an exponent value be present. |
129 | bool RequireExponent(chars_format flags) { |
130 | bool fixed = (flags & chars_format::fixed) == chars_format::fixed; |
131 | bool scientific = |
132 | (flags & chars_format::scientific) == chars_format::scientific; |
133 | return scientific && !fixed; |
134 | } |
135 | |
136 | const int8_t kAsciiToInt[256] = { |
137 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
138 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
139 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, |
140 | 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, |
141 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
142 | -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
143 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
144 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
145 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
146 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
147 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
148 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
149 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
150 | -1, -1, -1, -1, -1, -1, -1, -1, -1}; |
151 | |
152 | // Returns true if `ch` is a digit in the given base |
153 | template <int base> |
154 | bool IsDigit(char ch); |
155 | |
156 | // Converts a valid `ch` to its digit value in the given base. |
157 | template <int base> |
158 | unsigned ToDigit(char ch); |
159 | |
160 | // Returns true if `ch` is the exponent delimiter for the given base. |
161 | template <int base> |
162 | bool IsExponentCharacter(char ch); |
163 | |
164 | // Returns the maximum number of significant digits we will read for a float |
165 | // in the given base. |
166 | template <int base> |
167 | constexpr int MantissaDigitsMax(); |
168 | |
169 | // Returns the largest consecutive run of digits we will accept when parsing a |
170 | // number in the given base. |
171 | template <int base> |
172 | constexpr int DigitLimit(); |
173 | |
174 | // Returns the amount the exponent must be adjusted by for each dropped digit. |
175 | // (For decimal this is 1, since the digits are in base 10 and the exponent base |
176 | // is also 10, but for hexadecimal this is 4, since the digits are base 16 but |
177 | // the exponent base is 2.) |
178 | template <int base> |
179 | constexpr int DigitMagnitude(); |
180 | |
181 | template <> |
182 | bool IsDigit<10>(char ch) { |
183 | return ch >= '0' && ch <= '9'; |
184 | } |
185 | template <> |
186 | bool IsDigit<16>(char ch) { |
187 | return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0; |
188 | } |
189 | |
190 | template <> |
191 | unsigned ToDigit<10>(char ch) { |
192 | return ch - '0'; |
193 | } |
194 | template <> |
195 | unsigned ToDigit<16>(char ch) { |
196 | return kAsciiToInt[static_cast<unsigned char>(ch)]; |
197 | } |
198 | |
199 | template <> |
200 | bool IsExponentCharacter<10>(char ch) { |
201 | return ch == 'e' || ch == 'E'; |
202 | } |
203 | |
204 | template <> |
205 | bool IsExponentCharacter<16>(char ch) { |
206 | return ch == 'p' || ch == 'P'; |
207 | } |
208 | |
209 | template <> |
210 | constexpr int MantissaDigitsMax<10>() { |
211 | return kDecimalMantissaDigitsMax; |
212 | } |
213 | template <> |
214 | constexpr int MantissaDigitsMax<16>() { |
215 | return kHexadecimalMantissaDigitsMax; |
216 | } |
217 | |
218 | template <> |
219 | constexpr int DigitLimit<10>() { |
220 | return kDecimalDigitLimit; |
221 | } |
222 | template <> |
223 | constexpr int DigitLimit<16>() { |
224 | return kHexadecimalDigitLimit; |
225 | } |
226 | |
227 | template <> |
228 | constexpr int DigitMagnitude<10>() { |
229 | return 1; |
230 | } |
231 | template <> |
232 | constexpr int DigitMagnitude<16>() { |
233 | return 4; |
234 | } |
235 | |
236 | // Reads decimal digits from [begin, end) into *out. Returns the number of |
237 | // digits consumed. |
238 | // |
239 | // After max_digits has been read, keeps consuming characters, but no longer |
240 | // adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit |
241 | // is set; otherwise, it is left unmodified. |
242 | // |
243 | // If no digits are matched, returns 0 and leaves *out unchanged. |
244 | // |
245 | // ConsumeDigits does not protect against overflow on *out; max_digits must |
246 | // be chosen with respect to type T to avoid the possibility of overflow. |
247 | template <int base, typename T> |
248 | std::size_t ConsumeDigits(const char* begin, const char* end, int max_digits, |
249 | T* out, bool* dropped_nonzero_digit) { |
250 | if (base == 10) { |
251 | assert(max_digits <= std::numeric_limits<T>::digits10); |
252 | } else if (base == 16) { |
253 | assert(max_digits * 4 <= std::numeric_limits<T>::digits); |
254 | } |
255 | const char* const original_begin = begin; |
256 | T accumulator = *out; |
257 | const char* significant_digits_end = |
258 | (end - begin > max_digits) ? begin + max_digits : end; |
259 | while (begin < significant_digits_end && IsDigit<base>(*begin)) { |
260 | // Do not guard against *out overflow; max_digits was chosen to avoid this. |
261 | // Do assert against it, to detect problems in debug builds. |
262 | auto digit = static_cast<T>(ToDigit<base>(*begin)); |
263 | assert(accumulator * base >= accumulator); |
264 | accumulator *= base; |
265 | assert(accumulator + digit >= accumulator); |
266 | accumulator += digit; |
267 | ++begin; |
268 | } |
269 | bool dropped_nonzero = false; |
270 | while (begin < end && IsDigit<base>(*begin)) { |
271 | dropped_nonzero = dropped_nonzero || (*begin != '0'); |
272 | ++begin; |
273 | } |
274 | if (dropped_nonzero && dropped_nonzero_digit != nullptr) { |
275 | *dropped_nonzero_digit = true; |
276 | } |
277 | *out = accumulator; |
278 | return begin - original_begin; |
279 | } |
280 | |
281 | // Returns true if `v` is one of the chars allowed inside parentheses following |
282 | // a NaN. |
283 | bool IsNanChar(char v) { |
284 | return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') || |
285 | (v >= 'A' && v <= 'Z'); |
286 | } |
287 | |
288 | // Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If |
289 | // one is found, sets `out` appropriately and returns true. |
290 | bool ParseInfinityOrNan(const char* begin, const char* end, |
291 | strings_internal::ParsedFloat* out) { |
292 | if (end - begin < 3) { |
293 | return false; |
294 | } |
295 | switch (*begin) { |
296 | case 'i': |
297 | case 'I': { |
298 | // An infinity std::string consists of the characters "inf" or "infinity", |
299 | // case insensitive. |
300 | if (strings_internal::memcasecmp(begin + 1, "nf" , 2) != 0) { |
301 | return false; |
302 | } |
303 | out->type = strings_internal::FloatType::kInfinity; |
304 | if (end - begin >= 8 && |
305 | strings_internal::memcasecmp(begin + 3, "inity" , 5) == 0) { |
306 | out->end = begin + 8; |
307 | } else { |
308 | out->end = begin + 3; |
309 | } |
310 | return true; |
311 | } |
312 | case 'n': |
313 | case 'N': { |
314 | // A NaN consists of the characters "nan", case insensitive, optionally |
315 | // followed by a parenthesized sequence of zero or more alphanumeric |
316 | // characters and/or underscores. |
317 | if (strings_internal::memcasecmp(begin + 1, "an" , 2) != 0) { |
318 | return false; |
319 | } |
320 | out->type = strings_internal::FloatType::kNan; |
321 | out->end = begin + 3; |
322 | // NaN is allowed to be followed by a parenthesized std::string, consisting of |
323 | // only the characters [a-zA-Z0-9_]. Match that if it's present. |
324 | begin += 3; |
325 | if (begin < end && *begin == '(') { |
326 | const char* nan_begin = begin + 1; |
327 | while (nan_begin < end && IsNanChar(*nan_begin)) { |
328 | ++nan_begin; |
329 | } |
330 | if (nan_begin < end && *nan_begin == ')') { |
331 | // We found an extra NaN specifier range |
332 | out->subrange_begin = begin + 1; |
333 | out->subrange_end = nan_begin; |
334 | out->end = nan_begin + 1; |
335 | } |
336 | } |
337 | return true; |
338 | } |
339 | default: |
340 | return false; |
341 | } |
342 | } |
343 | } // namespace |
344 | |
345 | namespace strings_internal { |
346 | |
347 | template <int base> |
348 | strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end, |
349 | chars_format format_flags) { |
350 | strings_internal::ParsedFloat result; |
351 | |
352 | // Exit early if we're given an empty range. |
353 | if (begin == end) return result; |
354 | |
355 | // Handle the infinity and NaN cases. |
356 | if (ParseInfinityOrNan(begin, end, &result)) { |
357 | return result; |
358 | } |
359 | |
360 | const char* const mantissa_begin = begin; |
361 | while (begin < end && *begin == '0') { |
362 | ++begin; // skip leading zeros |
363 | } |
364 | uint64_t mantissa = 0; |
365 | |
366 | int exponent_adjustment = 0; |
367 | bool mantissa_is_inexact = false; |
368 | std::size_t pre_decimal_digits = ConsumeDigits<base>( |
369 | begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact); |
370 | begin += pre_decimal_digits; |
371 | int digits_left; |
372 | if (pre_decimal_digits >= DigitLimit<base>()) { |
373 | // refuse to parse pathological inputs |
374 | return result; |
375 | } else if (pre_decimal_digits > MantissaDigitsMax<base>()) { |
376 | // We dropped some non-fraction digits on the floor. Adjust our exponent |
377 | // to compensate. |
378 | exponent_adjustment = |
379 | static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>()); |
380 | digits_left = 0; |
381 | } else { |
382 | digits_left = |
383 | static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits); |
384 | } |
385 | if (begin < end && *begin == '.') { |
386 | ++begin; |
387 | if (mantissa == 0) { |
388 | // If we haven't seen any nonzero digits yet, keep skipping zeros. We |
389 | // have to adjust the exponent to reflect the changed place value. |
390 | const char* begin_zeros = begin; |
391 | while (begin < end && *begin == '0') { |
392 | ++begin; |
393 | } |
394 | std::size_t zeros_skipped = begin - begin_zeros; |
395 | if (zeros_skipped >= DigitLimit<base>()) { |
396 | // refuse to parse pathological inputs |
397 | return result; |
398 | } |
399 | exponent_adjustment -= static_cast<int>(zeros_skipped); |
400 | } |
401 | std::size_t post_decimal_digits = ConsumeDigits<base>( |
402 | begin, end, digits_left, &mantissa, &mantissa_is_inexact); |
403 | begin += post_decimal_digits; |
404 | |
405 | // Since `mantissa` is an integer, each significant digit we read after |
406 | // the decimal point requires an adjustment to the exponent. "1.23e0" will |
407 | // be stored as `mantissa` == 123 and `exponent` == -2 (that is, |
408 | // "123e-2"). |
409 | if (post_decimal_digits >= DigitLimit<base>()) { |
410 | // refuse to parse pathological inputs |
411 | return result; |
412 | } else if (post_decimal_digits > digits_left) { |
413 | exponent_adjustment -= digits_left; |
414 | } else { |
415 | exponent_adjustment -= post_decimal_digits; |
416 | } |
417 | } |
418 | // If we've found no mantissa whatsoever, this isn't a number. |
419 | if (mantissa_begin == begin) { |
420 | return result; |
421 | } |
422 | // A bare "." doesn't count as a mantissa either. |
423 | if (begin - mantissa_begin == 1 && *mantissa_begin == '.') { |
424 | return result; |
425 | } |
426 | |
427 | if (mantissa_is_inexact) { |
428 | // We dropped significant digits on the floor. Handle this appropriately. |
429 | if (base == 10) { |
430 | // If we truncated significant decimal digits, store the full range of the |
431 | // mantissa for future big integer math for exact rounding. |
432 | result.subrange_begin = mantissa_begin; |
433 | result.subrange_end = begin; |
434 | } else if (base == 16) { |
435 | // If we truncated hex digits, reflect this fact by setting the low |
436 | // ("sticky") bit. This allows for correct rounding in all cases. |
437 | mantissa |= 1; |
438 | } |
439 | } |
440 | result.mantissa = mantissa; |
441 | |
442 | const char* const exponent_begin = begin; |
443 | result.literal_exponent = 0; |
444 | bool found_exponent = false; |
445 | if (AllowExponent(format_flags) && begin < end && |
446 | IsExponentCharacter<base>(*begin)) { |
447 | bool negative_exponent = false; |
448 | ++begin; |
449 | if (begin < end && *begin == '-') { |
450 | negative_exponent = true; |
451 | ++begin; |
452 | } else if (begin < end && *begin == '+') { |
453 | ++begin; |
454 | } |
455 | const char* const exponent_digits_begin = begin; |
456 | // Exponent is always expressed in decimal, even for hexadecimal floats. |
457 | begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax, |
458 | &result.literal_exponent, nullptr); |
459 | if (begin == exponent_digits_begin) { |
460 | // there were no digits where we expected an exponent. We failed to read |
461 | // an exponent and should not consume the 'e' after all. Rewind 'begin'. |
462 | found_exponent = false; |
463 | begin = exponent_begin; |
464 | } else { |
465 | found_exponent = true; |
466 | if (negative_exponent) { |
467 | result.literal_exponent = -result.literal_exponent; |
468 | } |
469 | } |
470 | } |
471 | |
472 | if (!found_exponent && RequireExponent(format_flags)) { |
473 | // Provided flags required an exponent, but none was found. This results |
474 | // in a failure to scan. |
475 | return result; |
476 | } |
477 | |
478 | // Success! |
479 | result.type = strings_internal::FloatType::kNumber; |
480 | if (result.mantissa > 0) { |
481 | result.exponent = result.literal_exponent + |
482 | (DigitMagnitude<base>() * exponent_adjustment); |
483 | } else { |
484 | result.exponent = 0; |
485 | } |
486 | result.end = begin; |
487 | return result; |
488 | } |
489 | |
490 | template ParsedFloat ParseFloat<10>(const char* begin, const char* end, |
491 | chars_format format_flags); |
492 | template ParsedFloat ParseFloat<16>(const char* begin, const char* end, |
493 | chars_format format_flags); |
494 | |
495 | } // namespace strings_internal |
496 | } // namespace absl |
497 | |