1 | #include <type_traits> |
2 | #include <IO/ReadHelpers.h> |
3 | #include <Core/Defines.h> |
4 | #include <common/shift10.h> |
5 | #include <common/likely.h> |
6 | #include <Common/StringUtils/StringUtils.h> |
7 | #include <double-conversion/double-conversion.h> |
8 | |
9 | |
10 | /** Methods for reading floating point numbers from text with decimal representation. |
11 | * There are "precise", "fast" and "simple" implementations. |
12 | * |
 * None of the methods supports hexadecimal numbers (0xABC), binary exponent (1p100), leading plus sign.
14 | * |
15 | * Precise method always returns a number that is the closest machine representable number to the input. |
16 | * |
 * Fast method is faster (up to 3 times) and usually returns the same value,
 * but in rare cases the result may differ by the least significant bit (for Float32)
19 | * and by up to two least significant bits (for Float64) from precise method. |
20 | * Also fast method may parse some garbage as some other unspecified garbage. |
21 | * |
 * Simple method is a little faster for parsing short (few-digit) integers, but less precise and slower in other cases.
23 | * It's not recommended to use simple method and it is left only for reference. |
24 | * |
25 | * For performance test, look at 'read_float_perf' test. |
26 | * |
27 | * For precision test. |
28 | * Parse all existing Float32 numbers: |
29 | |
30 | CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000); |
31 | |
32 | WITH |
33 | toFloat32(toString(x)) AS y, |
34 | reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x, |
35 | reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y, |
36 | abs(bin_x - bin_y) AS diff |
37 | SELECT |
38 | diff, |
39 | count() |
40 | FROM test.floats |
41 | WHERE NOT isNaN(x) |
42 | GROUP BY diff |
43 | ORDER BY diff ASC |
44 | LIMIT 100 |
45 | |
46 | * Here are the results: |
47 | * |
48 | Precise: |
49 | ┌─diff─┬────count()─┐ |
50 | │ 0 │ 4278190082 │ |
51 | └──────┴────────────┘ |
52 | (100% roundtrip property) |
53 | |
54 | Fast: |
55 | ┌─diff─┬────count()─┐ |
56 | │ 0 │ 3685260580 │ |
57 | │ 1 │ 592929502 │ |
58 | └──────┴────────────┘ |
59 | (The difference is 1 in least significant bit in 13.8% of numbers.) |
60 | |
61 | Simple: |
62 | ┌─diff─┬────count()─┐ |
63 | │ 0 │ 2169879994 │ |
64 | │ 1 │ 1807178292 │ |
65 | │ 2 │ 269505944 │ |
66 | │ 3 │ 28826966 │ |
67 | │ 4 │ 2566488 │ |
68 | │ 5 │ 212878 │ |
69 | │ 6 │ 18276 │ |
70 | │ 7 │ 1214 │ |
71 | │ 8 │ 30 │ |
72 | └──────┴────────────┘ |
73 | |
74 | * Parse random Float64 numbers: |
75 | |
76 | WITH |
77 | rand64() AS bin_x, |
78 | reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x, |
79 | toFloat64(toString(x)) AS y, |
80 | reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y, |
81 | abs(bin_x - bin_y) AS diff |
82 | SELECT |
83 | diff, |
84 | count() |
85 | FROM numbers(100000000) |
86 | WHERE NOT isNaN(x) |
87 | GROUP BY diff |
88 | ORDER BY diff ASC |
89 | LIMIT 100 |
90 | |
91 | */ |
92 | |
93 | |
94 | namespace DB |
95 | { |
96 | |
/// Error codes used by this header. They are only declared here (extern), so the definitions
/// must come from another translation unit.
/// CANNOT_PARSE_NUMBER is the code the parsers in this file attach to their exceptions;
/// ARGUMENT_OUT_OF_BOUND is not referenced in the visible part of the file.
namespace ErrorCodes
{
    extern const int CANNOT_PARSE_NUMBER;
    extern const int ARGUMENT_OUT_OF_BOUND;
}
102 | |
103 | |
/// Return true iff the special value was parsed from the buffer.
/// NOTE(review): the definitions are not in this header; which spellings are accepted
/// and how far the buffer position moves on a mismatch should be confirmed there.
bool parseInfinity(ReadBuffer & buf);
bool parseNaN(ReadBuffer & buf);

/// Variants with no result to inspect.
/// NOTE(review): presumably they throw when the input does not match - confirm against the definitions.
void assertInfinity(ReadBuffer & buf);
void assertNaN(ReadBuffer & buf);
110 | |
111 | |
112 | template <bool throw_exception> |
113 | bool assertOrParseInfinity(ReadBuffer & buf) |
114 | { |
115 | if constexpr (throw_exception) |
116 | { |
117 | assertInfinity(buf); |
118 | return true; |
119 | } |
120 | else |
121 | return parseInfinity(buf); |
122 | } |
123 | |
124 | template <bool throw_exception> |
125 | bool assertOrParseNaN(ReadBuffer & buf) |
126 | { |
127 | if constexpr (throw_exception) |
128 | { |
129 | assertNaN(buf); |
130 | return true; |
131 | } |
132 | else |
133 | return parseNaN(buf); |
134 | } |
135 | |
136 | |
137 | /// Some garbage may be successfully parsed, examples: '--1' parsed as '1'. |
138 | template <typename T, typename ReturnType> |
139 | ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf) |
140 | { |
141 | static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double" ); |
142 | static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; |
143 | |
144 | if (buf.eof()) |
145 | { |
146 | if constexpr (throw_exception) |
147 | throw Exception("Cannot read floating point value" , ErrorCodes::CANNOT_PARSE_NUMBER); |
148 | else |
149 | return ReturnType(false); |
150 | } |
151 | |
152 | /// We use special code to read denormals (inf, nan), because we support slightly more variants that double-conversion library does: |
153 | /// Example: inf and Infinity. |
154 | |
155 | bool negative = false; |
156 | |
157 | while (true) |
158 | { |
159 | switch (*buf.position()) |
160 | { |
161 | case '-': |
162 | { |
163 | negative = true; |
164 | ++buf.position(); |
165 | continue; |
166 | } |
167 | |
168 | case 'i': [[fallthrough]]; |
169 | case 'I': |
170 | { |
171 | if (assertOrParseInfinity<throw_exception>(buf)) |
172 | { |
173 | x = std::numeric_limits<T>::infinity(); |
174 | if (negative) |
175 | x = -x; |
176 | return ReturnType(true); |
177 | } |
178 | return ReturnType(false); |
179 | } |
180 | |
181 | case 'n': [[fallthrough]]; |
182 | case 'N': |
183 | { |
184 | if (assertOrParseNaN<throw_exception>(buf)) |
185 | { |
186 | x = std::numeric_limits<T>::quiet_NaN(); |
187 | if (negative) |
188 | x = -x; |
189 | return ReturnType(true); |
190 | } |
191 | return ReturnType(false); |
192 | } |
193 | |
194 | default: |
195 | break; |
196 | } |
197 | break; |
198 | } |
199 | |
200 | static const double_conversion::StringToDoubleConverter converter( |
201 | double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK, |
202 | 0, 0, nullptr, nullptr); |
203 | |
204 | /// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes. |
205 | static constexpr int MAX_LENGTH = 316; |
206 | |
207 | if (buf.position() + MAX_LENGTH <= buf.buffer().end()) |
208 | { |
209 | int num_processed_characters = 0; |
210 | |
211 | if constexpr (std::is_same_v<T, double>) |
212 | x = converter.StringToDouble(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters); |
213 | else |
214 | x = converter.StringToFloat(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters); |
215 | |
216 | if (num_processed_characters < 0) |
217 | { |
218 | if constexpr (throw_exception) |
219 | throw Exception("Cannot read floating point value" , ErrorCodes::CANNOT_PARSE_NUMBER); |
220 | else |
221 | return ReturnType(false); |
222 | } |
223 | |
224 | buf.position() += num_processed_characters; |
225 | |
226 | if (negative) |
227 | x = -x; |
228 | return ReturnType(true); |
229 | } |
230 | else |
231 | { |
232 | /// Slow path. Copy characters that may be present in floating point number to temporary buffer. |
233 | |
234 | char tmp_buf[MAX_LENGTH]; |
235 | int num_copied_chars = 0; |
236 | |
237 | while (!buf.eof() && num_copied_chars < MAX_LENGTH) |
238 | { |
239 | char c = *buf.position(); |
240 | if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E')) |
241 | break; |
242 | |
243 | tmp_buf[num_copied_chars] = c; |
244 | ++buf.position(); |
245 | ++num_copied_chars; |
246 | } |
247 | |
248 | int num_processed_characters = 0; |
249 | |
250 | if constexpr (std::is_same_v<T, double>) |
251 | x = converter.StringToDouble(tmp_buf, num_copied_chars, &num_processed_characters); |
252 | else |
253 | x = converter.StringToFloat(tmp_buf, num_copied_chars, &num_processed_characters); |
254 | |
255 | if (num_processed_characters < num_copied_chars) |
256 | { |
257 | if constexpr (throw_exception) |
258 | throw Exception("Cannot read floating point value" , ErrorCodes::CANNOT_PARSE_NUMBER); |
259 | else |
260 | return ReturnType(false); |
261 | } |
262 | |
263 | if (negative) |
264 | x = -x; |
265 | return ReturnType(true); |
266 | } |
267 | } |
268 | |
269 | |
270 | template <size_t N, typename T> |
271 | static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf) |
272 | { |
273 | /// In optimistic case we can skip bound checking for first loop. |
274 | if (buf.position() + N <= buf.buffer().end()) |
275 | { |
276 | for (size_t i = 0; i < N; ++i) |
277 | { |
278 | if (isNumericASCII(*buf.position())) |
279 | { |
280 | x *= 10; |
281 | x += *buf.position() & 0x0F; |
282 | ++buf.position(); |
283 | } |
284 | else |
285 | return; |
286 | } |
287 | |
288 | while (!buf.eof() && isNumericASCII(*buf.position())) |
289 | ++buf.position(); |
290 | } |
291 | else |
292 | { |
293 | for (size_t i = 0; i < N; ++i) |
294 | { |
295 | if (!buf.eof() && isNumericASCII(*buf.position())) |
296 | { |
297 | x *= 10; |
298 | x += *buf.position() & 0x0F; |
299 | ++buf.position(); |
300 | } |
301 | else |
302 | return; |
303 | } |
304 | |
305 | while (!buf.eof() && isNumericASCII(*buf.position())) |
306 | ++buf.position(); |
307 | } |
308 | } |
309 | |
310 | |
311 | template <typename T, typename ReturnType> |
312 | ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) |
313 | { |
314 | static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double" ); |
315 | static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII" ); //-V590 |
316 | |
317 | static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; |
318 | |
319 | bool negative = false; |
320 | x = 0; |
321 | UInt64 before_point = 0; |
322 | UInt64 after_point = 0; |
323 | int after_point_exponent = 0; |
324 | int exponent = 0; |
325 | |
326 | if (in.eof()) |
327 | { |
328 | if constexpr (throw_exception) |
329 | throw Exception("Cannot read floating point value" , ErrorCodes::CANNOT_PARSE_NUMBER); |
330 | else |
331 | return false; |
332 | } |
333 | |
334 | if (*in.position() == '-') |
335 | { |
336 | negative = true; |
337 | ++in.position(); |
338 | } |
339 | |
340 | auto count_after_sign = in.count(); |
341 | |
342 | constexpr int significant_digits = std::numeric_limits<UInt64>::digits10; |
343 | readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in); |
344 | |
345 | int read_digits = in.count() - count_after_sign; |
346 | |
347 | if (unlikely(read_digits > significant_digits)) |
348 | { |
349 | int before_point_additional_exponent = read_digits - significant_digits; |
350 | x = shift10(before_point, before_point_additional_exponent); |
351 | } |
352 | else |
353 | { |
354 | x = before_point; |
355 | |
356 | /// Shortcut for the common case when there is an integer that fit in Int64. |
357 | if (read_digits && (in.eof() || *in.position() < '.')) |
358 | { |
359 | if (negative) |
360 | x = -x; |
361 | return ReturnType(true); |
362 | } |
363 | } |
364 | |
365 | if (checkChar('.', in)) |
366 | { |
367 | auto after_point_count = in.count(); |
368 | |
369 | while (!in.eof() && *in.position() == '0') |
370 | ++in.position(); |
371 | |
372 | auto after_leading_zeros_count = in.count(); |
373 | auto after_point_num_leading_zeros = after_leading_zeros_count - after_point_count; |
374 | |
375 | readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in); |
376 | read_digits = in.count() - after_leading_zeros_count; |
377 | after_point_exponent = (read_digits > significant_digits ? -significant_digits : -read_digits) - after_point_num_leading_zeros; |
378 | } |
379 | |
380 | if (checkChar('e', in) || checkChar('E', in)) |
381 | { |
382 | if (in.eof()) |
383 | { |
384 | if constexpr (throw_exception) |
385 | throw Exception("Cannot read floating point value" , ErrorCodes::CANNOT_PARSE_NUMBER); |
386 | else |
387 | return false; |
388 | } |
389 | |
390 | bool exponent_negative = false; |
391 | if (*in.position() == '-') |
392 | { |
393 | exponent_negative = true; |
394 | ++in.position(); |
395 | } |
396 | else if (*in.position() == '+') |
397 | { |
398 | ++in.position(); |
399 | } |
400 | |
401 | readUIntTextUpToNSignificantDigits<4>(exponent, in); |
402 | if (exponent_negative) |
403 | exponent = -exponent; |
404 | } |
405 | |
406 | if (after_point) |
407 | x += shift10(after_point, after_point_exponent); |
408 | |
409 | if (exponent) |
410 | x = shift10(x, exponent); |
411 | |
412 | if (negative) |
413 | x = -x; |
414 | |
415 | auto num_characters_without_sign = in.count() - count_after_sign; |
416 | |
417 | /// Denormals. At most one character is read before denormal and it is '-'. |
418 | if (num_characters_without_sign == 0) |
419 | { |
420 | if (in.eof()) |
421 | { |
422 | if constexpr (throw_exception) |
423 | throw Exception("Cannot read floating point value" , ErrorCodes::CANNOT_PARSE_NUMBER); |
424 | else |
425 | return false; |
426 | } |
427 | |
428 | if (*in.position() == 'i' || *in.position() == 'I') |
429 | { |
430 | if (assertOrParseInfinity<throw_exception>(in)) |
431 | { |
432 | x = std::numeric_limits<T>::infinity(); |
433 | if (negative) |
434 | x = -x; |
435 | return ReturnType(true); |
436 | } |
437 | return ReturnType(false); |
438 | } |
439 | else if (*in.position() == 'n' || *in.position() == 'N') |
440 | { |
441 | if (assertOrParseNaN<throw_exception>(in)) |
442 | { |
443 | x = std::numeric_limits<T>::quiet_NaN(); |
444 | if (negative) |
445 | x = -x; |
446 | return ReturnType(true); |
447 | } |
448 | return ReturnType(false); |
449 | } |
450 | } |
451 | |
452 | return ReturnType(true); |
453 | } |
454 | |
455 | |
456 | template <typename T, typename ReturnType> |
457 | ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf) |
458 | { |
459 | static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; |
460 | |
461 | bool negative = false; |
462 | x = 0; |
463 | bool after_point = false; |
464 | double power_of_ten = 1; |
465 | |
466 | if (buf.eof()) |
467 | throwReadAfterEOF(); |
468 | |
469 | while (!buf.eof()) |
470 | { |
471 | switch (*buf.position()) |
472 | { |
473 | case '+': |
474 | break; |
475 | case '-': |
476 | negative = true; |
477 | break; |
478 | case '.': |
479 | after_point = true; |
480 | break; |
481 | case '0': [[fallthrough]]; |
482 | case '1': [[fallthrough]]; |
483 | case '2': [[fallthrough]]; |
484 | case '3': [[fallthrough]]; |
485 | case '4': [[fallthrough]]; |
486 | case '5': [[fallthrough]]; |
487 | case '6': [[fallthrough]]; |
488 | case '7': [[fallthrough]]; |
489 | case '8': [[fallthrough]]; |
490 | case '9': |
491 | if (after_point) |
492 | { |
493 | power_of_ten /= 10; |
494 | x += (*buf.position() - '0') * power_of_ten; |
495 | } |
496 | else |
497 | { |
498 | x *= 10; |
499 | x += *buf.position() - '0'; |
500 | } |
501 | break; |
502 | case 'e': [[fallthrough]]; |
503 | case 'E': |
504 | { |
505 | ++buf.position(); |
506 | Int32 exponent = 0; |
507 | readIntText(exponent, buf); |
508 | x = shift10(x, exponent); |
509 | if (negative) |
510 | x = -x; |
511 | return ReturnType(true); |
512 | } |
513 | |
514 | case 'i': [[fallthrough]]; |
515 | case 'I': |
516 | { |
517 | if (assertOrParseInfinity<throw_exception>(buf)) |
518 | { |
519 | x = std::numeric_limits<T>::infinity(); |
520 | if (negative) |
521 | x = -x; |
522 | return ReturnType(true); |
523 | } |
524 | return ReturnType(false); |
525 | } |
526 | |
527 | case 'n': [[fallthrough]]; |
528 | case 'N': |
529 | { |
530 | if (assertOrParseNaN<throw_exception>(buf)) |
531 | { |
532 | x = std::numeric_limits<T>::quiet_NaN(); |
533 | if (negative) |
534 | x = -x; |
535 | return ReturnType(true); |
536 | } |
537 | return ReturnType(false); |
538 | } |
539 | |
540 | default: |
541 | { |
542 | if (negative) |
543 | x = -x; |
544 | return ReturnType(true); |
545 | } |
546 | } |
547 | ++buf.position(); |
548 | } |
549 | |
550 | if (negative) |
551 | x = -x; |
552 | |
553 | return ReturnType(true); |
554 | } |
555 | |
556 | |
557 | template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); } |
558 | template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); } |
559 | |
560 | template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); } |
561 | template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); } |
562 | |
563 | template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); } |
564 | template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); } |
565 | |
566 | |
567 | /// Implementation that is selected as default. |
568 | |
569 | template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); } |
570 | template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } |
571 | |
572 | |
573 | } |
574 | |