numberparsing.h source code [ClickHouse/contrib/simdjson/src/generic/numberparsing.h]

1
// Exact table of powers of ten, indexed by (exponent + 308), so that
// power_of_ten[0] == 1e-308, power_of_ten[308] == 1e0 and
// power_of_ten[616] == 1e308 (617 entries in total).
//
// Allowable floating-point values range from
// std::numeric_limits<double>::lowest() to std::numeric_limits<double>::max(),
// so from -1.7976e308 all the way to 1.7976e308 in binary64. The lowest
// non-zero normal value is std::numeric_limits<double>::min() or
// about 2.225074e-308.
static const double power_of_ten[] = {
    1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300,
    1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291,
    1e-290, 1e-289, 1e-288, 1e-287, 1e-286, 1e-285, 1e-284, 1e-283, 1e-282,
    1e-281, 1e-280, 1e-279, 1e-278, 1e-277, 1e-276, 1e-275, 1e-274, 1e-273,
    1e-272, 1e-271, 1e-270, 1e-269, 1e-268, 1e-267, 1e-266, 1e-265, 1e-264,
    1e-263, 1e-262, 1e-261, 1e-260, 1e-259, 1e-258, 1e-257, 1e-256, 1e-255,
    1e-254, 1e-253, 1e-252, 1e-251, 1e-250, 1e-249, 1e-248, 1e-247, 1e-246,
    1e-245, 1e-244, 1e-243, 1e-242, 1e-241, 1e-240, 1e-239, 1e-238, 1e-237,
    1e-236, 1e-235, 1e-234, 1e-233, 1e-232, 1e-231, 1e-230, 1e-229, 1e-228,
    1e-227, 1e-226, 1e-225, 1e-224, 1e-223, 1e-222, 1e-221, 1e-220, 1e-219,
    1e-218, 1e-217, 1e-216, 1e-215, 1e-214, 1e-213, 1e-212, 1e-211, 1e-210,
    1e-209, 1e-208, 1e-207, 1e-206, 1e-205, 1e-204, 1e-203, 1e-202, 1e-201,
    1e-200, 1e-199, 1e-198, 1e-197, 1e-196, 1e-195, 1e-194, 1e-193, 1e-192,
    1e-191, 1e-190, 1e-189, 1e-188, 1e-187, 1e-186, 1e-185, 1e-184, 1e-183,
    1e-182, 1e-181, 1e-180, 1e-179, 1e-178, 1e-177, 1e-176, 1e-175, 1e-174,
    1e-173, 1e-172, 1e-171, 1e-170, 1e-169, 1e-168, 1e-167, 1e-166, 1e-165,
    1e-164, 1e-163, 1e-162, 1e-161, 1e-160, 1e-159, 1e-158, 1e-157, 1e-156,
    1e-155, 1e-154, 1e-153, 1e-152, 1e-151, 1e-150, 1e-149, 1e-148, 1e-147,
    1e-146, 1e-145, 1e-144, 1e-143, 1e-142, 1e-141, 1e-140, 1e-139, 1e-138,
    1e-137, 1e-136, 1e-135, 1e-134, 1e-133, 1e-132, 1e-131, 1e-130, 1e-129,
    1e-128, 1e-127, 1e-126, 1e-125, 1e-124, 1e-123, 1e-122, 1e-121, 1e-120,
    1e-119, 1e-118, 1e-117, 1e-116, 1e-115, 1e-114, 1e-113, 1e-112, 1e-111,
    1e-110, 1e-109, 1e-108, 1e-107, 1e-106, 1e-105, 1e-104, 1e-103, 1e-102,
    1e-101, 1e-100, 1e-99,  1e-98,  1e-97,  1e-96,  1e-95,  1e-94,  1e-93,
    1e-92,  1e-91,  1e-90,  1e-89,  1e-88,  1e-87,  1e-86,  1e-85,  1e-84,
    1e-83,  1e-82,  1e-81,  1e-80,  1e-79,  1e-78,  1e-77,  1e-76,  1e-75,
    1e-74,  1e-73,  1e-72,  1e-71,  1e-70,  1e-69,  1e-68,  1e-67,  1e-66,
    1e-65,  1e-64,  1e-63,  1e-62,  1e-61,  1e-60,  1e-59,  1e-58,  1e-57,
    1e-56,  1e-55,  1e-54,  1e-53,  1e-52,  1e-51,  1e-50,  1e-49,  1e-48,
    1e-47,  1e-46,  1e-45,  1e-44,  1e-43,  1e-42,  1e-41,  1e-40,  1e-39,
    1e-38,  1e-37,  1e-36,  1e-35,  1e-34,  1e-33,  1e-32,  1e-31,  1e-30,
    1e-29,  1e-28,  1e-27,  1e-26,  1e-25,  1e-24,  1e-23,  1e-22,  1e-21,
    1e-20,  1e-19,  1e-18,  1e-17,  1e-16,  1e-15,  1e-14,  1e-13,  1e-12,
    1e-11,  1e-10,  1e-9,   1e-8,   1e-7,   1e-6,   1e-5,   1e-4,   1e-3,
    1e-2,   1e-1,   1e0,    1e1,    1e2,    1e3,    1e4,    1e5,    1e6,
    1e7,    1e8,    1e9,    1e10,   1e11,   1e12,   1e13,   1e14,   1e15,
    1e16,   1e17,   1e18,   1e19,   1e20,   1e21,   1e22,   1e23,   1e24,
    1e25,   1e26,   1e27,   1e28,   1e29,   1e30,   1e31,   1e32,   1e33,
    1e34,   1e35,   1e36,   1e37,   1e38,   1e39,   1e40,   1e41,   1e42,
    1e43,   1e44,   1e45,   1e46,   1e47,   1e48,   1e49,   1e50,   1e51,
    1e52,   1e53,   1e54,   1e55,   1e56,   1e57,   1e58,   1e59,   1e60,
    1e61,   1e62,   1e63,   1e64,   1e65,   1e66,   1e67,   1e68,   1e69,
    1e70,   1e71,   1e72,   1e73,   1e74,   1e75,   1e76,   1e77,   1e78,
    1e79,   1e80,   1e81,   1e82,   1e83,   1e84,   1e85,   1e86,   1e87,
    1e88,   1e89,   1e90,   1e91,   1e92,   1e93,   1e94,   1e95,   1e96,
    1e97,   1e98,   1e99,   1e100,  1e101,  1e102,  1e103,  1e104,  1e105,
    1e106,  1e107,  1e108,  1e109,  1e110,  1e111,  1e112,  1e113,  1e114,
    1e115,  1e116,  1e117,  1e118,  1e119,  1e120,  1e121,  1e122,  1e123,
    1e124,  1e125,  1e126,  1e127,  1e128,  1e129,  1e130,  1e131,  1e132,
    1e133,  1e134,  1e135,  1e136,  1e137,  1e138,  1e139,  1e140,  1e141,
    1e142,  1e143,  1e144,  1e145,  1e146,  1e147,  1e148,  1e149,  1e150,
    1e151,  1e152,  1e153,  1e154,  1e155,  1e156,  1e157,  1e158,  1e159,
    1e160,  1e161,  1e162,  1e163,  1e164,  1e165,  1e166,  1e167,  1e168,
    1e169,  1e170,  1e171,  1e172,  1e173,  1e174,  1e175,  1e176,  1e177,
    1e178,  1e179,  1e180,  1e181,  1e182,  1e183,  1e184,  1e185,  1e186,
    1e187,  1e188,  1e189,  1e190,  1e191,  1e192,  1e193,  1e194,  1e195,
    1e196,  1e197,  1e198,  1e199,  1e200,  1e201,  1e202,  1e203,  1e204,
    1e205,  1e206,  1e207,  1e208,  1e209,  1e210,  1e211,  1e212,  1e213,
    1e214,  1e215,  1e216,  1e217,  1e218,  1e219,  1e220,  1e221,  1e222,
    1e223,  1e224,  1e225,  1e226,  1e227,  1e228,  1e229,  1e230,  1e231,
    1e232,  1e233,  1e234,  1e235,  1e236,  1e237,  1e238,  1e239,  1e240,
    1e241,  1e242,  1e243,  1e244,  1e245,  1e246,  1e247,  1e248,  1e249,
    1e250,  1e251,  1e252,  1e253,  1e254,  1e255,  1e256,  1e257,  1e258,
    1e259,  1e260,  1e261,  1e262,  1e263,  1e264,  1e265,  1e266,  1e267,
    1e268,  1e269,  1e270,  1e271,  1e272,  1e273,  1e274,  1e275,  1e276,
    1e277,  1e278,  1e279,  1e280,  1e281,  1e282,  1e283,  1e284,  1e285,
    1e286,  1e287,  1e288,  1e289,  1e290,  1e291,  1e292,  1e293,  1e294,
    1e295,  1e296,  1e297,  1e298,  1e299,  1e300,  1e301,  1e302,  1e303,
    1e304,  1e305,  1e306,  1e307,  1e308};
77
// Returns true when c is an ASCII decimal digit ('0' through '9').
// Despite the name, this tests a single character, not a whole number.
static inline bool is_integer(char c) {
  // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
  return (c >= '0' && c <= '9');
}
82
// We need to check that the character following a zero is valid. This is
// probably frequent and it is harder than it looks. We are building all of
// this just to differentiate between 0x1 (invalid), 0,1 (valid), 0e1 (valid)...
//
// Lookup table indexed by byte value. An entry is 0 for the bytes that may
// legally follow a leading zero -- tab, line feed, carriage return, space,
// ',', '.', ':', 'E', '[', ']', 'e', '{', '}' -- and 1 for every other byte.
const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
    // 0x00: '\t' (9), '\n' (10), '\r' (13)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
    // 0x10
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x20: ' ' (32), ',' (44), '.' (46)
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    // 0x30: ':' (58)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
    // 0x40: 'E' (69)
    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x50: '[' (91), ']' (93)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
    // 0x60: 'e' (101)
    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x70: '{' (123), '}' (125)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
    // 0x80 - 0xFF: never valid after a zero
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
98
99	really_inline bool
100	is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
101	return structural_or_whitespace_or_exponent_or_decimal_negated[c];
102	}
103
104	// check quickly whether the next 8 chars are made of digits
105	// at a glance, it looks better than Mula's
106	// http://0x80.pl/articles/swar-digits-validate.html
107	static inline bool is_made_of_eight_digits_fast(const char *chars) {
108	uint64_t val;
109	// this can read up to 7 bytes beyond the buffer size, but we require
110	// SIMDJSON_PADDING of padding
111	static_assert(`7` <= SIMDJSON_PADDING);
112	memcpy(&val, chars, `8`);
113	// a branchy method might be faster:
114	// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
115	// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
116	// 0x3030303030303030);
117	return (((val & `0xF0F0F0F0F0F0F0F0`) \|
118	(((val + `0x0606060606060606`) & `0xF0F0F0F0F0F0F0F0`) >> `4`)) ==
119	`0x3333333333333333`);
120	}
121
122
//
// This function computes base * 10 ^ (- negative_exponent ).
// It is only ever going to be used when negative_exponent is tiny.
//
// base: the significand to scale.
// negative_exponent: the (negative) decimal exponent; the caller passes the
//   negated magnitude of an exponent that exceeded 308.
// Returns the scaled value, or 0 when negative_exponent < -1000.
static double subnormal_power10(double base, int64_t negative_exponent) {
  // avoid integer overflows in the pow expression, those values would
  // become zero anyway.
  if (negative_exponent < -1000) {
    return 0;
  }

  // this is probably not going to be fast
  // The scaling is split in two steps (1e-308 first, then the remainder)
  // rather than computing 10^negative_exponent in a single pow call.
  return base * 1e-308 * pow(10.0, (double)(negative_exponent + 308));
}
136
137	// called by parse_number when we know that the output is a float,
138	// but where there might be some integer overflow. The trick here is to
139	// parse using floats from the start.
140	// Do not call this function directly as it skips some of the checks from
141	// parse_number
142	//
143	// This function will almost never be called!!!
144	//
145	// Note: a redesign could avoid this function entirely.
146	//
147	static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
148	const uint32_t offset, bool found_minus) {
149	const char p = reinterpret_cast<const* char *>(buf + offset);
150	bool negative = false;
151	if (found_minus) {
152	++p;
153	negative = true;
154	}
155	long double i;
156	if (p == `'0'`) { // 0 cannot be followed by an integer*
157	++p;
158	i = `0`;
159	} else {
160	unsigned char digit = *p - `'0'`;
161	i = digit;
162	p++;
163	while (is_integer(*p)) {
164	digit = *p - `'0'`;
165	i = `10` * i + digit;
166	++p;
167	}
168	}
169	if (`'.'` == *p) {
170	++p;
171	int fractional_weight = `308`;
172	if (is_integer(*p)) {
173	unsigned char digit = *p - `'0'`;
174	++p;
175
176	fractional_weight--;
177	i = i + digit * (fractional_weight >= `0` ? power_of_ten[fractional_weight]
178	: `0`);
179	} else {
180	#ifdef JSON_TEST_NUMBERS // for unit testing
181	found_invalid_number(buf + offset);
182	#endif
183	return false;
184	}
185	while (is_integer(*p)) {
186	unsigned char digit = *p - `'0'`;
187	++p;
188	fractional_weight--;
189	i = i + digit * (fractional_weight >= `0` ? power_of_ten[fractional_weight]
190	: `0`);
191	}
192	}
193	if ((`'e'` == p) \|\| (`'E'` == p)) {
194	++p;
195	bool neg_exp = false;
196	if (`'-'` == *p) {
197	neg_exp = true;
198	++p;
199	} else if (`'+'` == *p) {
200	++p;
201	}
202	if (!is_integer(*p)) {
203	#ifdef JSON_TEST_NUMBERS // for unit testing
204	found_invalid_number(buf + offset);
205	#endif
206	return false;
207	}
208	unsigned char digit = *p - `'0'`;
209	int64_t exp_number = digit; // exponential part
210	p++;
211	if (is_integer(*p)) {
212	digit = *p - `'0'`;
213	exp_number = `10` * exp_number + digit;
214	++p;
215	}
216	if (is_integer(*p)) {
217	digit = *p - `'0'`;
218	exp_number = `10` * exp_number + digit;
219	++p;
220	}
221	if (is_integer(*p)) {
222	digit = *p - `'0'`;
223	exp_number = `10` * exp_number + digit;
224	++p;
225	}
226	while (is_integer(*p)) {
227	if (exp_number > `0x100000000`) { // we need to check for overflows
228	// we refuse to parse this
229	#ifdef JSON_TEST_NUMBERS // for unit testing
230	found_invalid_number(buf + offset);
231	#endif
232	return false;
233	}
234	digit = *p - `'0'`;
235	exp_number = `10` * exp_number + digit;
236	++p;
237	}
238	if (unlikely(exp_number > `308`)) {
239	// this path is unlikely
240	if (neg_exp) {
241	// We either have zero or a subnormal.
242	// We expect this to be uncommon so we go through a slow path.
243	i = subnormal_power10(i, -exp_number);
244	} else {
245	// We know for sure that we have a number that is too large,
246	// we refuse to parse this
247	#ifdef JSON_TEST_NUMBERS // for unit testing
248	found_invalid_number(buf + offset);
249	#endif
250	return false;
251	}
252	} else {
253	int exponent = (neg_exp ? -exp_number : exp_number);
254	// we have that exp_number is [0,308] so that
255	// exponent is [-308,308] so that
256	// 308 + exponent is in [0, 2 308]*
257	i *= power_of_ten[`308` + exponent];
258	}
259	}
260	if (is_not_structural_or_whitespace(*p)) {
261	return false;
262	}
263	// check that we can go from long double to double safely.
264	if(i > std::numeric_limits<double>::max()) {
265	#ifdef JSON_TEST_NUMBERS // for unit testing
266	found_invalid_number(buf + offset);
267	#endif
268	return false;
269	}
270	double d = negative ? -i : i;
271	pj.write_tape_double(d);
272	#ifdef JSON_TEST_NUMBERS // for unit testing
273	found_float(d, buf + offset);
274	#endif
275	return is_structural_or_whitespace(*p);
276	}
277
278	// called by parse_number when we know that the output is an integer,
279	// but where there might be some integer overflow.
280	// we want to catch overflows!
281	// Do not call this function directly as it skips some of the checks from
282	// parse_number
283	//
284	// This function will almost never be called!!!
285	//
286	static never_inline bool parse_large_integer(const uint8_t *const buf,
287	ParsedJson &pj,
288	const uint32_t offset,
289	bool found_minus) {
290	const char p = reinterpret_cast<const* char *>(buf + offset);
291
292	bool negative = false;
293	if (found_minus) {
294	++p;
295	negative = true;
296	}
297	uint64_t i;
298	if (p == `'0'`) { // 0 cannot be followed by an integer*
299	++p;
300	i = `0`;
301	} else {
302	unsigned char digit = *p - `'0'`;
303	i = digit;
304	p++;
305	// the is_made_of_eight_digits_fast routine is unlikely to help here because
306	// we rarely see large integer parts like 123456789
307	while (is_integer(*p)) {
308	digit = *p - `'0'`;
309	if (mul_overflow(i, `10`, &i)) {
310	#ifdef JSON_TEST_NUMBERS // for unit testing
311	found_invalid_number(buf + offset);
312	#endif
313	return false; // overflow
314	}
315	if (add_overflow(i, digit, &i)) {
316	#ifdef JSON_TEST_NUMBERS // for unit testing
317	found_invalid_number(buf + offset);
318	#endif
319	return false; // overflow
320	}
321	++p;
322	}
323	}
324	if (negative) {
325	if (i > `0x8000000000000000`) {
326	// overflows!
327	#ifdef JSON_TEST_NUMBERS // for unit testing
328	found_invalid_number(buf + offset);
329	#endif
330	return false; // overflow
331	} else if (i == `0x8000000000000000`) {
332	// In two's complement, we cannot represent 0x8000000000000000
333	// as a positive signed integer, but the negative version is
334	// possible.
335	constexpr int64_t signed_answer = INT64_MIN;
336	pj.write_tape_s64(signed_answer);
337	#ifdef JSON_TEST_NUMBERS // for unit testing
338	found_integer(signed_answer, buf + offset);
339	#endif
340	} else {
341	// we can negate safely
342	int64_t signed_answer = -static_cast<int64_t>(i);
343	pj.write_tape_s64(signed_answer);
344	#ifdef JSON_TEST_NUMBERS // for unit testing
345	found_integer(signed_answer, buf + offset);
346	#endif
347	}
348	} else {
349	// we have a positive integer, the contract is that
350	// we try to represent it as a signed integer and only
351	// fallback on unsigned integers if absolutely necessary.
352	if(i < `0x8000000000000000`) {
353	#ifdef JSON_TEST_NUMBERS // for unit testing
354	found_integer(i, buf + offset);
355	#endif
356	pj.write_tape_s64(i);
357	} else {
358	#ifdef JSON_TEST_NUMBERS // for unit testing
359	found_unsigned_integer(i, buf + offset);
360	#endif
361	pj.write_tape_u64(i);
362	}
363	}
364	return is_structural_or_whitespace(*p);
365	}
366
367	// parse the number at buf + offset
368	// define JSON_TEST_NUMBERS for unit testing
369	//
370	// It is assumed that the number is followed by a structural ({,},],[) character
371	// or a white space character. If that is not the case (e.g., when the JSON
372	// document is made of a single number), then it is necessary to copy the
373	// content and append a space before calling this function.
374	//
375	// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
376	static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
377	const uint32_t offset,
378	bool found_minus) {
379	#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
380	// useful to skip parsing
381	pj.write_tape_s64(`0`); // always write zero
382	return true; // always succeeds
383	#else
384	const char p = reinterpret_cast<const* char *>(buf + offset);
385	bool negative = false;
386	if (found_minus) {
387	++p;
388	negative = true;
389	if (!is_integer(p)) { // a negative sign must be followed by an integer*
390	#ifdef JSON_TEST_NUMBERS // for unit testing
391	found_invalid_number(buf + offset);
392	#endif
393	return false;
394	}
395	}
396	const char *const start_digits = p;
397
398	uint64_t i; // an unsigned int avoids signed overflows (which are bad)
399	if (p == `'0'`) { // 0 cannot be followed by an integer*
400	++p;
401	if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
402	#ifdef JSON_TEST_NUMBERS // for unit testing
403	found_invalid_number(buf + offset);
404	#endif
405	return false;
406	}
407	i = `0`;
408	} else {
409	if (!(is_integer(p))) { // must start with an integer*
410	#ifdef JSON_TEST_NUMBERS // for unit testing
411	found_invalid_number(buf + offset);
412	#endif
413	return false;
414	}
415	unsigned char digit = *p - `'0'`;
416	i = digit;
417	p++;
418	// the is_made_of_eight_digits_fast routine is unlikely to help here because
419	// we rarely see large integer parts like 123456789
420	while (is_integer(*p)) {
421	digit = *p - `'0'`;
422	// a multiplication by 10 is cheaper than an arbitrary integer
423	// multiplication
424	i = `10` * i + digit; // might overflow, we will handle the overflow later
425	++p;
426	}
427	}
428	int64_t exponent = `0`;
429	bool is_float = false;
430	if (`'.'` == *p) {
431	is_float = true; // At this point we know that we have a float
432	// we continue with the fiction that we have an integer. If the
433	// floating point number is representable as x 10^z for some integer*
434	// z that fits in 53 bits, then we will be able to convert back the
435	// the integer into a float in a lossless manner.
436	++p;
437	const char *const first_after_period = p;
438	if (is_integer(*p)) {
439	unsigned char digit = *p - `'0'`;
440	++p;
441	i = i * `10` + digit; // might overflow + multiplication by 10 is likely
442	// cheaper than arbitrary mult.
443	// we will handle the overflow later
444	} else {
445	#ifdef JSON_TEST_NUMBERS // for unit testing
446	found_invalid_number(buf + offset);
447	#endif
448	return false;
449	}
450	#ifdef SWAR_NUMBER_PARSING
451	// this helps if we have lots of decimals!
452	// this turns out to be frequent enough.
453	if (is_made_of_eight_digits_fast(p)) {
454	i = i * `100000000` + parse_eight_digits_unrolled(p);
455	p += `8`;
456	}
457	#endif
458	while (is_integer(*p)) {
459	unsigned char digit = *p - `'0'`;
460	++p;
461	i = i * `10` + digit; // in rare cases, this will overflow, but that's ok
462	// because we have parse_highprecision_float later.
463	}
464	exponent = first_after_period - p;
465	}
466	int digit_count =
467	p - start_digits - `1`; // used later to guard against overflows
468	int64_t exp_number = `0`; // exponential part
469	if ((`'e'` == p) \|\| (`'E'` == p)) {
470	is_float = true;
471	++p;
472	bool neg_exp = false;
473	if (`'-'` == *p) {
474	neg_exp = true;
475	++p;
476	} else if (`'+'` == *p) {
477	++p;
478	}
479	if (!is_integer(*p)) {
480	#ifdef JSON_TEST_NUMBERS // for unit testing
481	found_invalid_number(buf + offset);
482	#endif
483	return false;
484	}
485	unsigned char digit = *p - `'0'`;
486	exp_number = digit;
487	p++;
488	if (is_integer(*p)) {
489	digit = *p - `'0'`;
490	exp_number = `10` * exp_number + digit;
491	++p;
492	}
493	if (is_integer(*p)) {
494	digit = *p - `'0'`;
495	exp_number = `10` * exp_number + digit;
496	++p;
497	}
498	while (is_integer(*p)) {
499	if (exp_number > `0x100000000`) { // we need to check for overflows
500	// we refuse to parse this
501	#ifdef JSON_TEST_NUMBERS // for unit testing
502	found_invalid_number(buf + offset);
503	#endif
504	return false;
505	}
506	digit = *p - `'0'`;
507	exp_number = `10` * exp_number + digit;
508	++p;
509	}
510	exponent += (neg_exp ? -exp_number : exp_number);
511	}
512	if (is_float) {
513	uint64_t power_index = `308` + exponent;
514	if (unlikely((digit_count >= `19`))) { // this is uncommon
515	// It is possible that the integer had an overflow.
516	// We have to handle the case where we have 0.0000somenumber.
517	const char *start = start_digits;
518	while ((start == `'0'`) \|\| (start == `'.'`)) {
519	start++;
520	}
521	// we over-decrement by one when there is a '.'
522	digit_count -= (start - start_digits);
523	if (digit_count >= `19`) {
524	// Ok, chances are good that we had an overflow!
525	// this is almost never going to get called!!!
526	// we start anew, going slowly!!!
527	return parse_float(buf, pj, offset, found_minus);
528	}
529	}
530	if (unlikely((power_index > `2` * `308`))) { // this is uncommon!!!
531	// this is almost never going to get called!!!
532	// we start anew, going slowly!!!
533	return parse_float(buf, pj, offset, found_minus);
534	}
535	double factor = power_of_ten[power_index];
536	factor = negative ? -factor : factor;
537	double d = i * factor;
538	pj.write_tape_double(d);
539	#ifdef JSON_TEST_NUMBERS // for unit testing
540	found_float(d, buf + offset);
541	#endif
542	} else {
543	if (unlikely(digit_count >= `18`)) { // this is uncommon!!!
544	// there is a good chance that we had an overflow, so we need
545	// need to recover: we parse the whole thing again.
546	return parse_large_integer(buf, pj, offset, found_minus);
547	}
548	i = negative ? `0` - i : i;
549	pj.write_tape_s64(i);
550	#ifdef JSON_TEST_NUMBERS // for unit testing
551	found_integer(i, buf + offset);
552	#endif
553	}
554	return is_structural_or_whitespace(*p);
555	#endif // SIMDJSON_SKIPNUMBERPARSING
556	}
557
558

Browse the source code of ClickHouse/contrib/simdjson/src/generic/numberparsing.h