readFloatText.h source code [ClickHouse/dbms/src/IO/readFloatText.h]

1	#include <type_traits>
2	#include <IO/ReadHelpers.h>
3	#include <Core/Defines.h>
4	#include <common/shift10.h>
5	#include <common/likely.h>
6	#include <Common/StringUtils/StringUtils.h>
7	#include <double-conversion/double-conversion.h>
8
9
10	/* Methods for reading floating point numbers from text with decimal representation.
11	* There are "precise", "fast" and "simple" implementations.
12	*
13	* None of the methods support hexadecimal numbers (0xABC), binary exponents (1p100) or a leading plus sign.
14	*
15	* Precise method always returns a number that is the closest machine representable number to the input.
16	*
17	* Fast method is faster (up to 3 times) and usually returns the same value,
18	* but in rare cases the result may differ by the least significant bit (for Float32)
19	* and by up to two least significant bits (for Float64) from precise method.
20	* Also fast method may parse some garbage as some other unspecified garbage.
21	*
22	* Simple method is a little faster for parsing short (few-digit) integers, but it is less precise and slower in other cases.
23	* It's not recommended to use simple method and it is left only for reference.
24	*
25	* For performance test, look at 'read_float_perf' test.
26	*
27	* For precision test.
28	* Parse all existing Float32 numbers:
29
30	CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000);
31
32	WITH
33	toFloat32(toString(x)) AS y,
34	reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x,
35	reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y,
36	abs(bin_x - bin_y) AS diff
37	SELECT
38	diff,
39	count()
40	FROM test.floats
41	WHERE NOT isNaN(x)
42	GROUP BY diff
43	ORDER BY diff ASC
44	LIMIT 100
45
46	* Here are the results:
47	*
48	Precise:
49	┌─diff─┬────count()─┐
50	│ 0 │ 4278190082 │
51	└──────┴────────────┘
52	(100% roundtrip property)
53
54	Fast:
55	┌─diff─┬────count()─┐
56	│ 0 │ 3685260580 │
57	│ 1 │ 592929502 │
58	└──────┴────────────┘
59	(The difference is 1 in least significant bit in 13.8% of numbers.)
60
61	Simple:
62	┌─diff─┬────count()─┐
63	│ 0 │ 2169879994 │
64	│ 1 │ 1807178292 │
65	│ 2 │ 269505944 │
66	│ 3 │ 28826966 │
67	│ 4 │ 2566488 │
68	│ 5 │ 212878 │
69	│ 6 │ 18276 │
70	│ 7 │ 1214 │
71	│ 8 │ 30 │
72	└──────┴────────────┘
73
74	* Parse random Float64 numbers:
75
76	WITH
77	rand64() AS bin_x,
78	reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x,
79	toFloat64(toString(x)) AS y,
80	reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y,
81	abs(bin_x - bin_y) AS diff
82	SELECT
83	diff,
84	count()
85	FROM numbers(100000000)
86	WHERE NOT isNaN(x)
87	GROUP BY diff
88	ORDER BY diff ASC
89	LIMIT 100
90
91	*/
92
93
94	namespace DB
95	{
96
/// Error codes referenced by the parsers in this header. They are only declared here (extern);
/// the definitions are not visible in this file.
97	namespace ErrorCodes
98	{
/// Thrown (in throwing mode) by the parsers below when the input is not a floating point number.
99	extern const int CANNOT_PARSE_NUMBER;
/// NOTE(review): not referenced anywhere in the visible part of this header - confirm it is still needed.
100	extern const int ARGUMENT_OUT_OF_BOUND;
101	}
102
103
/// Non-throwing parsers for the special values; they are what assertOrParseInfinity<false> /
/// assertOrParseNaN<false> forward to. Only declared here, the definitions are not in this header.
104	/// Returns true, iff parsed.
105	bool parseInfinity(ReadBuffer & buf);
106	bool parseNaN(ReadBuffer & buf);
107
/// Counterparts of parseInfinity / parseNaN used in throwing mode: the callers in this header
/// treat a normal return as success.
/// NOTE(review): presumably these throw when the input does not match - the definitions are not visible here, confirm.
108	void assertInfinity(ReadBuffer & buf);
109	void assertNaN(ReadBuffer & buf);
110
111
112	template <bool throw_exception>
113	bool assertOrParseInfinity(ReadBuffer & buf)
114	{
115	if constexpr (throw_exception)
116	{
117	assertInfinity(buf);
118	return true;
119	}
120	else
121	return parseInfinity(buf);
122	}
123
124	template <bool throw_exception>
125	bool assertOrParseNaN(ReadBuffer & buf)
126	{
127	if constexpr (throw_exception)
128	{
129	assertNaN(buf);
130	return true;
131	}
132	else
133	return parseNaN(buf);
134	}
135
136
137	/// Some garbage may be successfully parsed, examples: '--1' parsed as '1'.
138	template <typename T, typename ReturnType>
139	ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
140	{
141	static_assert(std::is_same_v<T, double> \|\| std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
142	static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
143
144	if (buf.eof())
145	{
146	if constexpr (throw_exception)
147	throw Exception ("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
148	else
149	return ReturnType(false);
150	}
151
152	/// We use special code to read denormals (inf, nan), because we support slightly more variants that double-conversion library does:
153	/// Example: inf and Infinity.
154
155	bool negative = false;
156
157	while (true)
158	{
159	switch (*buf.position())
160	{
161	case `'-'`:
162	{
163	negative = true;
164	++buf.position();
165	continue;
166	}
167
168	case `'i'`: [[fallthrough]];
169	case `'I'`:
170	{
171	if (assertOrParseInfinity<throw_exception>(buf))
172	{
173	x = std::numeric_limits<T>::infinity();
174	if (negative)
175	x = -x;
176	return ReturnType(true);
177	}
178	return ReturnType(false);
179	}
180
181	case `'n'`: [[fallthrough]];
182	case `'N'`:
183	{
184	if (assertOrParseNaN<throw_exception>(buf))
185	{
186	x = std::numeric_limits<T>::quiet_NaN();
187	if (negative)
188	x = -x;
189	return ReturnType(true);
190	}
191	return ReturnType(false);
192	}
193
194	default:
195	break;
196	}
197	break;
198	}
199
200	static const double_conversion::StringToDoubleConverter converter(
201	double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK,
202	`0`, `0`, nullptr, nullptr);
203
204	/// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes.
205	static constexpr int MAX_LENGTH = `316`;
206
207	if (buf.position() + MAX_LENGTH <= buf.buffer().end())
208	{
209	int num_processed_characters = `0`;
210
211	if constexpr (std::is_same_v<T, double>)
212	x = converter.StringToDouble(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters);
213	else
214	x = converter.StringToFloat(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters);
215
216	if (num_processed_characters < `0`)
217	{
218	if constexpr (throw_exception)
219	throw Exception ("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
220	else
221	return ReturnType(false);
222	}
223
224	buf.position() += num_processed_characters;
225
226	if (negative)
227	x = -x;
228	return ReturnType(true);
229	}
230	else
231	{
232	/// Slow path. Copy characters that may be present in floating point number to temporary buffer.
233
234	char tmp_buf[MAX_LENGTH];
235	int num_copied_chars = `0`;
236
237	while (!buf.eof() && num_copied_chars < MAX_LENGTH)
238	{
239	char c = *buf.position();
240	if (!(isNumericASCII(c) \|\| c == `'-'` \|\| c == `'+'` \|\| c == `'.'` \|\| c == `'e'` \|\| c == `'E'`))
241	break;
242
243	tmp_buf[num_copied_chars] = c;
244	++buf.position();
245	++num_copied_chars;
246	}
247
248	int num_processed_characters = `0`;
249
250	if constexpr (std::is_same_v<T, double>)
251	x = converter.StringToDouble(tmp_buf, num_copied_chars, &num_processed_characters);
252	else
253	x = converter.StringToFloat(tmp_buf, num_copied_chars, &num_processed_characters);
254
255	if (num_processed_characters < num_copied_chars)
256	{
257	if constexpr (throw_exception)
258	throw Exception ("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
259	else
260	return ReturnType(false);
261	}
262
263	if (negative)
264	x = -x;
265	return ReturnType(true);
266	}
267	}
268
269
270	template <size_t N, typename T>
271	static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
272	{
273	/// In optimistic case we can skip bound checking for first loop.
274	if (buf.position() + N <= buf.buffer().end())
275	{
276	for (size_t i = `0`; i < N; ++i)
277	{
278	if (isNumericASCII(*buf.position()))
279	{
280	x *= `10`;
281	x += *buf.position() & `0x0F`;
282	++buf.position();
283	}
284	else
285	return;
286	}
287
288	while (!buf.eof() && isNumericASCII(*buf.position()))
289	++buf.position();
290	}
291	else
292	{
293	for (size_t i = `0`; i < N; ++i)
294	{
295	if (!buf.eof() && isNumericASCII(*buf.position()))
296	{
297	x *= `10`;
298	x += *buf.position() & `0x0F`;
299	++buf.position();
300	}
301	else
302	return;
303	}
304
305	while (!buf.eof() && isNumericASCII(*buf.position()))
306	++buf.position();
307	}
308	}
309
310
311	template <typename T, typename ReturnType>
312	ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
313	{
314	static_assert(std::is_same_v<T, double> \|\| std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
315	static_assert(`'a'` > `'.'` && `'A'` > `'.'` && `'\n'` < `'.'` && `'\t'` < `'.'` && `'\''` < `'.'` && `'"'` < `'.'`, "Layout of char is not like ASCII"); //-V590
316
317	static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
318
319	bool negative = false;
320	x = `0`;
321	UInt64 before_point = `0`;
322	UInt64 after_point = `0`;
323	int after_point_exponent = `0`;
324	int exponent = `0`;
325
326	if (in.eof())
327	{
328	if constexpr (throw_exception)
329	throw Exception ("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
330	else
331	return false;
332	}
333
334	if (*in.position() == `'-'`)
335	{
336	negative = true;
337	++in.position();
338	}
339
340	auto count_after_sign = in.count();
341
342	constexpr int significant_digits = std::numeric_limits<UInt64>::digits10;
343	readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in);
344
345	int read_digits = in.count() - count_after_sign;
346
347	if (unlikely(read_digits > significant_digits))
348	{
349	int before_point_additional_exponent = read_digits - significant_digits;
350	x = shift10(before_point, before_point_additional_exponent);
351	}
352	else
353	{
354	x = before_point;
355
356	/// Shortcut for the common case when there is an integer that fit in Int64.
357	if (read_digits && (in.eof() \|\| *in.position() < `'.'`))
358	{
359	if (negative)
360	x = -x;
361	return ReturnType(true);
362	}
363	}
364
365	if (checkChar(`'.'`, in))
366	{
367	auto after_point_count = in.count();
368
369	while (!in.eof() && *in.position() == `'0'`)
370	++in.position();
371
372	auto after_leading_zeros_count = in.count();
373	auto after_point_num_leading_zeros = after_leading_zeros_count - after_point_count;
374
375	readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in);
376	read_digits = in.count() - after_leading_zeros_count;
377	after_point_exponent = (read_digits > significant_digits ? -significant_digits : -read_digits) - after_point_num_leading_zeros;
378	}
379
380	if (checkChar(`'e'`, in) \|\| checkChar(`'E'`, in))
381	{
382	if (in.eof())
383	{
384	if constexpr (throw_exception)
385	throw Exception ("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
386	else
387	return false;
388	}
389
390	bool exponent_negative = false;
391	if (*in.position() == `'-'`)
392	{
393	exponent_negative = true;
394	++in.position();
395	}
396	else if (*in.position() == `'+'`)
397	{
398	++in.position();
399	}
400
401	readUIntTextUpToNSignificantDigits<`4`>(exponent, in);
402	if (exponent_negative)
403	exponent = -exponent;
404	}
405
406	if (after_point)
407	x += shift10(after_point, after_point_exponent);
408
409	if (exponent)
410	x = shift10(x, exponent);
411
412	if (negative)
413	x = -x;
414
415	auto num_characters_without_sign = in.count() - count_after_sign;
416
417	/// Denormals. At most one character is read before denormal and it is '-'.
418	if (num_characters_without_sign == `0`)
419	{
420	if (in.eof())
421	{
422	if constexpr (throw_exception)
423	throw Exception ("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
424	else
425	return false;
426	}
427
428	if (in.position() == `'i'` \|\| in.position() == `'I'`)
429	{
430	if (assertOrParseInfinity<throw_exception>(in))
431	{
432	x = std::numeric_limits<T>::infinity();
433	if (negative)
434	x = -x;
435	return ReturnType(true);
436	}
437	return ReturnType(false);
438	}
439	else if (in.position() == `'n'` \|\| in.position() == `'N'`)
440	{
441	if (assertOrParseNaN<throw_exception>(in))
442	{
443	x = std::numeric_limits<T>::quiet_NaN();
444	if (negative)
445	x = -x;
446	return ReturnType(true);
447	}
448	return ReturnType(false);
449	}
450	}
451
452	return ReturnType(true);
453	}
454
455
456	template <typename T, typename ReturnType>
457	ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
458	{
459	static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
460
461	bool negative = false;
462	x = `0`;
463	bool after_point = false;
464	double power_of_ten = `1`;
465
466	if (buf.eof())
467	throwReadAfterEOF();
468
469	while (!buf.eof())
470	{
471	switch (*buf.position())
472	{
473	case `'+'`:
474	break;
475	case `'-'`:
476	negative = true;
477	break;
478	case `'.'`:
479	after_point = true;
480	break;
481	case `'0'`: [[fallthrough]];
482	case `'1'`: [[fallthrough]];
483	case `'2'`: [[fallthrough]];
484	case `'3'`: [[fallthrough]];
485	case `'4'`: [[fallthrough]];
486	case `'5'`: [[fallthrough]];
487	case `'6'`: [[fallthrough]];
488	case `'7'`: [[fallthrough]];
489	case `'8'`: [[fallthrough]];
490	case `'9'`:
491	if (after_point)
492	{
493	power_of_ten /= `10`;
494	x += (buf.position() - `'0'`) power_of_ten;
495	}
496	else
497	{
498	x *= `10`;
499	x += *buf.position() - `'0'`;
500	}
501	break;
502	case `'e'`: [[fallthrough]];
503	case `'E'`:
504	{
505	++buf.position();
506	Int32 exponent = `0`;
507	readIntText(exponent, buf);
508	x = shift10(x, exponent);
509	if (negative)
510	x = -x;
511	return ReturnType(true);
512	}
513
514	case `'i'`: [[fallthrough]];
515	case `'I'`:
516	{
517	if (assertOrParseInfinity<throw_exception>(buf))
518	{
519	x = std::numeric_limits<T>::infinity();
520	if (negative)
521	x = -x;
522	return ReturnType(true);
523	}
524	return ReturnType(false);
525	}
526
527	case `'n'`: [[fallthrough]];
528	case `'N'`:
529	{
530	if (assertOrParseNaN<throw_exception>(buf))
531	{
532	x = std::numeric_limits<T>::quiet_NaN();
533	if (negative)
534	x = -x;
535	return ReturnType(true);
536	}
537	return ReturnType(false);
538	}
539
540	default:
541	{
542	if (negative)
543	x = -x;
544	return ReturnType(true);
545	}
546	}
547	++buf.position();
548	}
549
550	if (negative)
551	x = -x;
552
553	return ReturnType(true);
554	}
555
556
557	template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
558	template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }
559
560	template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
561	template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
562
563	template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
564	template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
565
566
567	/// Implementation that is selected as default.
568
569	template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); }
570	template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
571
572
573	}
574

Browse the source code of ClickHouse/dbms/src/IO/readFloatText.h