1 | #include "simdjson/internal/numberparsing_tables.h" |
2 | #include <limits> |
3 | |
4 | namespace simdjson { |
5 | namespace SIMDJSON_IMPLEMENTATION { |
6 | |
7 | namespace ondemand { |
8 | /** |
9 | * The type of a JSON number |
10 | */ |
11 | enum class number_type { |
12 | floating_point_number=1, /// a binary64 number |
13 | signed_integer, /// a signed integer that fits in a 64-bit word using two's complement |
14 | unsigned_integer /// a positive integer larger or equal to 1<<63 |
15 | }; |
16 | } |
17 | |
18 | namespace { |
19 | /// @private |
20 | namespace numberparsing { |
21 | |
22 | |
23 | |
24 | #ifdef JSON_TEST_NUMBERS |
25 | #define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) |
26 | #define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) |
27 | #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) |
28 | #define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE))) |
29 | #else |
30 | #define INVALID_NUMBER(SRC) (NUMBER_ERROR) |
31 | #define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE)) |
32 | #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE)) |
33 | #define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE)) |
34 | #endif |
35 | |
36 | namespace { |
37 | // Convert a mantissa, an exponent and a sign bit into an ieee64 double. |
38 | // The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable). |
39 | // The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed. |
40 | simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) { |
41 | double d; |
42 | mantissa &= ~(1ULL << 52); |
43 | mantissa |= real_exponent << 52; |
44 | mantissa |= ((static_cast<uint64_t>(negative)) << 63); |
45 | std::memcpy(dest: &d, src: &mantissa, n: sizeof(d)); |
46 | return d; |
47 | } |
48 | } |
49 | // Attempts to compute i * 10^(power) exactly; and if "negative" is |
50 | // true, negate the result. |
51 | // This function will only work in some cases, when it does not work, success is |
52 | // set to false. This should work *most of the time* (like 99% of the time). |
53 | // We assume that power is in the [smallest_power, |
54 | // largest_power] interval: the caller is responsible for this check. |
55 | simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) { |
56 | // we start with a fast path |
57 | // It was described in |
58 | // Clinger WD. How to read floating point numbers accurately. |
59 | // ACM SIGPLAN Notices. 1990 |
60 | #ifndef FLT_EVAL_METHOD |
61 | #error "FLT_EVAL_METHOD should be defined, please include cfloat." |
62 | #endif |
63 | #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) |
64 | // We cannot be certain that x/y is rounded to nearest. |
65 | if (0 <= power && power <= 22 && i <= 9007199254740991) { |
66 | #else |
67 | if (-22 <= power && power <= 22 && i <= 9007199254740991) { |
68 | #endif |
69 | // convert the integer into a double. This is lossless since |
70 | // 0 <= i <= 2^53 - 1. |
71 | d = double(i); |
72 | // |
73 | // The general idea is as follows. |
74 | // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then |
75 | // 1) Both s and p can be represented exactly as 64-bit floating-point |
76 | // values |
77 | // (binary64). |
78 | // 2) Because s and p can be represented exactly as floating-point values, |
79 | // then s * p |
80 | // and s / p will produce correctly rounded values. |
81 | // |
82 | if (power < 0) { |
83 | d = d / simdjson::internal::power_of_ten[-power]; |
84 | } else { |
85 | d = d * simdjson::internal::power_of_ten[power]; |
86 | } |
87 | if (negative) { |
88 | d = -d; |
89 | } |
90 | return true; |
91 | } |
92 | // When 22 < power && power < 22 + 16, we could |
93 | // hope for another, secondary fast path. It was |
94 | // described by David M. Gay in "Correctly rounded |
95 | // binary-decimal and decimal-binary conversions." (1990) |
96 | // If you need to compute i * 10^(22 + x) for x < 16, |
97 | // first compute i * 10^x, if you know that result is exact |
98 | // (e.g., when i * 10^x < 2^53), |
99 | // then you can still proceed and do (i * 10^x) * 10^22. |
100 | // Is this worth your time? |
101 | // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) |
102 | // for this second fast path to work. |
103 | // If you you have 22 < power *and* power < 22 + 16, and then you |
104 | // optimistically compute "i * 10^(x-22)", there is still a chance that you |
105 | // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of |
106 | // this optimization maybe less common than we would like. Source: |
107 | // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ |
108 | // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html |
109 | |
110 | // The fast path has now failed, so we are failing back on the slower path. |
111 | |
112 | // In the slow path, we need to adjust i so that it is > 1<<63 which is always |
113 | // possible, except if i == 0, so we handle i == 0 separately. |
114 | if(i == 0) { |
115 | d = negative ? -0.0 : 0.0; |
116 | return true; |
117 | } |
118 | |
119 | |
120 | // The exponent is 1024 + 63 + power |
121 | // + floor(log(5**power)/log(2)). |
122 | // The 1024 comes from the ieee64 standard. |
123 | // The 63 comes from the fact that we use a 64-bit word. |
124 | // |
125 | // Computing floor(log(5**power)/log(2)) could be |
126 | // slow. Instead we use a fast function. |
127 | // |
128 | // For power in (-400,350), we have that |
129 | // (((152170 + 65536) * power ) >> 16); |
130 | // is equal to |
131 | // floor(log(5**power)/log(2)) + power when power >= 0 |
132 | // and it is equal to |
133 | // ceil(log(5**-power)/log(2)) + power when power < 0 |
134 | // |
135 | // The 65536 is (1<<16) and corresponds to |
136 | // (65536 * power) >> 16 ---> power |
137 | // |
138 | // ((152170 * power ) >> 16) is equal to |
139 | // floor(log(5**power)/log(2)) |
140 | // |
141 | // Note that this is not magic: 152170/(1<<16) is |
142 | // approximatively equal to log(5)/log(2). |
143 | // The 1<<16 value is a power of two; we could use a |
144 | // larger power of 2 if we wanted to. |
145 | // |
146 | int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; |
147 | |
148 | |
149 | // We want the most significant bit of i to be 1. Shift if needed. |
150 | int lz = leading_zeroes(input_num: i); |
151 | i <<= lz; |
152 | |
153 | |
154 | // We are going to need to do some 64-bit arithmetic to get a precise product. |
155 | // We use a table lookup approach. |
156 | // It is safe because |
157 | // power >= smallest_power |
158 | // and power <= largest_power |
159 | // We recover the mantissa of the power, it has a leading 1. It is always |
160 | // rounded down. |
161 | // |
162 | // We want the most significant 64 bits of the product. We know |
163 | // this will be non-zero because the most significant bit of i is |
164 | // 1. |
165 | const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); |
166 | // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) |
167 | // |
168 | // The full_multiplication function computes the 128-bit product of two 64-bit words |
169 | // with a returned value of type value128 with a "low component" corresponding to the |
170 | // 64-bit least significant bits of the product and with a "high component" corresponding |
171 | // to the 64-bit most significant bits of the product. |
172 | simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(value1: i, value2: simdjson::internal::power_of_five_128[index]); |
173 | // Both i and power_of_five_128[index] have their most significant bit set to 1 which |
174 | // implies that the either the most or the second most significant bit of the product |
175 | // is 1. We pack values in this manner for efficiency reasons: it maximizes the use |
176 | // we make of the product. It also makes it easy to reason about the product: there |
177 | // is 0 or 1 leading zero in the product. |
178 | |
179 | // Unless the least significant 9 bits of the high (64-bit) part of the full |
180 | // product are all 1s, then we know that the most significant 55 bits are |
181 | // exact and no further work is needed. Having 55 bits is necessary because |
182 | // we need 53 bits for the mantissa but we have to have one rounding bit and |
183 | // we can waste a bit if the most significant bit of the product is zero. |
184 | if((firstproduct.high & 0x1FF) == 0x1FF) { |
185 | // We want to compute i * 5^q, but only care about the top 55 bits at most. |
186 | // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing |
187 | // the full computation is wasteful. So we do what is called a "truncated |
188 | // multiplication". |
189 | // We take the most significant 64-bits, and we put them in |
190 | // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q |
191 | // to the desired approximation using one multiplication. Sometimes it does not suffice. |
192 | // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and |
193 | // then we get a better approximation to i * 5^q. In very rare cases, even that |
194 | // will not suffice, though it is seemingly very hard to find such a scenario. |
195 | // |
196 | // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat |
197 | // more complicated. |
198 | // |
199 | // There is an extra layer of complexity in that we need more than 55 bits of |
200 | // accuracy in the round-to-even scenario. |
201 | // |
202 | // The full_multiplication function computes the 128-bit product of two 64-bit words |
203 | // with a returned value of type value128 with a "low component" corresponding to the |
204 | // 64-bit least significant bits of the product and with a "high component" corresponding |
205 | // to the 64-bit most significant bits of the product. |
206 | simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(value1: i, value2: simdjson::internal::power_of_five_128[index + 1]); |
207 | firstproduct.low += secondproduct.high; |
208 | if(secondproduct.high > firstproduct.low) { firstproduct.high++; } |
209 | // At this point, we might need to add at most one to firstproduct, but this |
210 | // can only change the value of firstproduct.high if firstproduct.low is maximal. |
211 | if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { |
212 | // This is very unlikely, but if so, we need to do much more work! |
213 | return false; |
214 | } |
215 | } |
216 | uint64_t lower = firstproduct.low; |
217 | uint64_t upper = firstproduct.high; |
218 | // The final mantissa should be 53 bits with a leading 1. |
219 | // We shift it so that it occupies 54 bits with a leading 1. |
220 | /////// |
221 | uint64_t upperbit = upper >> 63; |
222 | uint64_t mantissa = upper >> (upperbit + 9); |
223 | lz += int(1 ^ upperbit); |
224 | |
225 | // Here we have mantissa < (1<<54). |
226 | int64_t real_exponent = exponent - lz; |
227 | if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal? |
228 | // Here have that real_exponent <= 0 so -real_exponent >= 0 |
229 | if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. |
230 | d = negative ? -0.0 : 0.0; |
231 | return true; |
232 | } |
233 | // next line is safe because -real_exponent + 1 < 0 |
234 | mantissa >>= -real_exponent + 1; |
235 | // Thankfully, we can't have both "round-to-even" and subnormals because |
236 | // "round-to-even" only occurs for powers close to 0. |
237 | mantissa += (mantissa & 1); // round up |
238 | mantissa >>= 1; |
239 | // There is a weird scenario where we don't have a subnormal but just. |
240 | // Suppose we start with 2.2250738585072013e-308, we end up |
241 | // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal |
242 | // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round |
243 | // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer |
244 | // subnormal, but we can only know this after rounding. |
245 | // So we only declare a subnormal if we are smaller than the threshold. |
246 | real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1; |
247 | d = to_double(mantissa, real_exponent, negative); |
248 | return true; |
249 | } |
250 | // We have to round to even. The "to even" part |
251 | // is only a problem when we are right in between two floats |
252 | // which we guard against. |
253 | // If we have lots of trailing zeros, we may fall right between two |
254 | // floating-point values. |
255 | // |
256 | // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54] |
257 | // times a power of two. That is, it is right between a number with binary significand |
258 | // m and another number with binary significand m+1; and it must be the case |
259 | // that it cannot be represented by a float itself. |
260 | // |
261 | // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p. |
262 | // Recall that 10^q = 5^q * 2^q. |
263 | // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that |
264 | // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23. |
265 | // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so |
266 | // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have |
267 | // 2^{53} x 5^{-q} < 2^{64}. |
268 | // Hence we have 5^{-q} < 2^{11}$ or q>= -4. |
269 | // |
270 | // We require lower <= 1 and not lower == 0 because we could not prove that |
271 | // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test. |
272 | if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) { |
273 | if((mantissa << (upperbit + 64 - 53 - 2)) == upper) { |
274 | mantissa &= ~1; // flip it so that we do not round up |
275 | } |
276 | } |
277 | |
278 | mantissa += mantissa & 1; |
279 | mantissa >>= 1; |
280 | |
281 | // Here we have mantissa < (1<<53), unless there was an overflow |
282 | if (mantissa >= (1ULL << 53)) { |
283 | ////////// |
284 | // This will happen when parsing values such as 7.2057594037927933e+16 |
285 | //////// |
286 | mantissa = (1ULL << 52); |
287 | real_exponent++; |
288 | } |
289 | mantissa &= ~(1ULL << 52); |
290 | // we have to check that real_exponent is in range, otherwise we bail out |
291 | if (simdjson_unlikely(real_exponent > 2046)) { |
292 | // We have an infinite value!!! We could actually throw an error here if we could. |
293 | return false; |
294 | } |
295 | d = to_double(mantissa, real_exponent, negative); |
296 | return true; |
297 | } |
298 | |
299 | // We call a fallback floating-point parser that might be slow. Note |
300 | // it will accept JSON numbers, but the JSON spec. is more restrictive so |
301 | // before you call parse_float_fallback, you need to have validated the input |
302 | // string with the JSON grammar. |
303 | // It will return an error (false) if the parsed number is infinite. |
304 | // The string parsing itself always succeeds. We know that there is at least |
305 | // one digit. |
306 | static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { |
307 | *outDouble = simdjson::internal::from_chars(first: reinterpret_cast<const char *>(ptr)); |
308 | // We do not accept infinite values. |
309 | |
310 | // Detecting finite values in a portable manner is ridiculously hard, ideally |
311 | // we would want to do: |
312 | // return !std::isfinite(*outDouble); |
313 | // but that mysteriously fails under legacy/old libc++ libraries, see |
314 | // https://github.com/simdjson/simdjson/issues/1286 |
315 | // |
316 | // Therefore, fall back to this solution (the extra parens are there |
317 | // to handle that max may be a macro on windows). |
318 | return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest()); |
319 | } |
320 | static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { |
321 | *outDouble = simdjson::internal::from_chars(first: reinterpret_cast<const char *>(ptr), end: reinterpret_cast<const char *>(end_ptr)); |
322 | // We do not accept infinite values. |
323 | |
324 | // Detecting finite values in a portable manner is ridiculously hard, ideally |
325 | // we would want to do: |
326 | // return !std::isfinite(*outDouble); |
327 | // but that mysteriously fails under legacy/old libc++ libraries, see |
328 | // https://github.com/simdjson/simdjson/issues/1286 |
329 | // |
330 | // Therefore, fall back to this solution (the extra parens are there |
331 | // to handle that max may be a macro on windows). |
332 | return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest()); |
333 | } |
334 | |
335 | // check quickly whether the next 8 chars are made of digits |
336 | // at a glance, it looks better than Mula's |
337 | // http://0x80.pl/articles/swar-digits-validate.html |
338 | simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { |
339 | uint64_t val; |
340 | // this can read up to 7 bytes beyond the buffer size, but we require |
341 | // SIMDJSON_PADDING of padding |
342 | static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7" ); |
343 | std::memcpy(dest: &val, src: chars, n: 8); |
344 | // a branchy method might be faster: |
345 | // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) |
346 | // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == |
347 | // 0x3030303030303030); |
348 | return (((val & 0xF0F0F0F0F0F0F0F0) | |
349 | (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == |
350 | 0x3333333333333333); |
351 | } |
352 | |
353 | template<typename W> |
354 | error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) { |
355 | double d; |
356 | if (parse_float_fallback(ptr: src, outDouble: &d)) { |
357 | writer.append_double(d); |
358 | return SUCCESS; |
359 | } |
360 | return INVALID_NUMBER(src); |
361 | } |
362 | |
363 | template<typename I> |
364 | SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later |
365 | simdjson_inline bool parse_digit(const uint8_t c, I &i) { |
366 | const uint8_t digit = static_cast<uint8_t>(c - '0'); |
367 | if (digit > 9) { |
368 | return false; |
369 | } |
370 | // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication |
371 | i = 10 * i + digit; // might overflow, we will handle the overflow later |
372 | return true; |
373 | } |
374 | |
375 | simdjson_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { |
376 | // we continue with the fiction that we have an integer. If the |
377 | // floating point number is representable as x * 10^z for some integer |
378 | // z that fits in 53 bits, then we will be able to convert back the |
379 | // the integer into a float in a lossless manner. |
380 | const uint8_t *const first_after_period = p; |
381 | |
382 | #ifdef SIMDJSON_SWAR_NUMBER_PARSING |
383 | #if SIMDJSON_SWAR_NUMBER_PARSING |
384 | // this helps if we have lots of decimals! |
385 | // this turns out to be frequent enough. |
386 | if (is_made_of_eight_digits_fast(chars: p)) { |
387 | i = i * 100000000 + parse_eight_digits_unrolled(chars: p); |
388 | p += 8; |
389 | } |
390 | #endif // SIMDJSON_SWAR_NUMBER_PARSING |
391 | #endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING |
392 | // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) |
393 | if (parse_digit(c: *p, i)) { ++p; } |
394 | while (parse_digit(c: *p, i)) { p++; } |
395 | exponent = first_after_period - p; |
396 | // Decimal without digits (123.) is illegal |
397 | if (exponent == 0) { |
398 | return INVALID_NUMBER(src); |
399 | } |
400 | return SUCCESS; |
401 | } |
402 | |
403 | simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { |
404 | // Exp Sign: -123.456e[-]78 |
405 | bool neg_exp = ('-' == *p); |
406 | if (neg_exp || '+' == *p) { p++; } // Skip + as well |
407 | |
408 | // Exponent: -123.456e-[78] |
409 | auto start_exp = p; |
410 | int64_t exp_number = 0; |
411 | while (parse_digit(c: *p, i&: exp_number)) { ++p; } |
412 | // It is possible for parse_digit to overflow. |
413 | // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. |
414 | // Thus we *must* check for possible overflow before we negate exp_number. |
415 | |
416 | // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into |
417 | // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may |
418 | // not oblige and may, in fact, generate two distinct paths in any case. It might be |
419 | // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off |
420 | // instructions for a simdjson_likely branch, an unconclusive gain. |
421 | |
422 | // If there were no digits, it's an error. |
423 | if (simdjson_unlikely(p == start_exp)) { |
424 | return INVALID_NUMBER(src); |
425 | } |
426 | // We have a valid positive exponent in exp_number at this point, except that |
427 | // it may have overflowed. |
428 | |
429 | // If there were more than 18 digits, we may have overflowed the integer. We have to do |
430 | // something!!!! |
431 | if (simdjson_unlikely(p > start_exp+18)) { |
432 | // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow |
433 | while (*start_exp == '0') { start_exp++; } |
434 | // 19 digits could overflow int64_t and is kind of absurd anyway. We don't |
435 | // support exponents smaller than -999,999,999,999,999,999 and bigger |
436 | // than 999,999,999,999,999,999. |
437 | // We can truncate. |
438 | // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before |
439 | // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could |
440 | // truncate at 324. |
441 | // Note that there is no reason to fail per se at this point in time. |
442 | // E.g., 0e999999999999999999999 is a fine number. |
443 | if (p > start_exp+18) { exp_number = 999999999999999999; } |
444 | } |
445 | // At this point, we know that exp_number is a sane, positive, signed integer. |
446 | // It is <= 999,999,999,999,999,999. As long as 'exponent' is in |
447 | // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' |
448 | // is bounded in magnitude by the size of the JSON input, we are fine in this universe. |
449 | // To sum it up: the next line should never overflow. |
450 | exponent += (neg_exp ? -exp_number : exp_number); |
451 | return SUCCESS; |
452 | } |
453 | |
454 | simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { |
455 | // It is possible that the integer had an overflow. |
456 | // We have to handle the case where we have 0.0000somenumber. |
457 | const uint8_t *start = start_digits; |
458 | while ((*start == '0') || (*start == '.')) { ++start; } |
459 | // we over-decrement by one when there is a '.' |
460 | return digit_count - size_t(start - start_digits); |
461 | } |
462 | |
463 | template<typename W> |
464 | simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { |
465 | // If we frequently had to deal with long strings of digits, |
466 | // we could extend our code by using a 128-bit integer instead |
467 | // of a 64-bit integer. However, this is uncommon in practice. |
468 | // |
469 | // 9999999999999999999 < 2**64 so we can accommodate 19 digits. |
470 | // If we have a decimal separator, then digit_count - 1 is the number of digits, but we |
471 | // may not have a decimal separator! |
472 | if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { |
473 | // Ok, chances are good that we had an overflow! |
474 | // this is almost never going to get called!!! |
475 | // we start anew, going slowly!!! |
476 | // This will happen in the following examples: |
477 | // 10000000000000000000000000000000000000000000e+308 |
478 | // 3.1415926535897932384626433832795028841971693993751 |
479 | // |
480 | // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens |
481 | // because slow_float_parsing is a non-inlined function. If we passed our writer reference to |
482 | // it, it would force it to be stored in memory, preventing the compiler from picking it apart |
483 | // and putting into registers. i.e. if we pass it as reference, it gets slow. |
484 | // This is what forces the skip_double, as well. |
485 | error_code error = slow_float_parsing(src, writer); |
486 | writer.skip_double(); |
487 | return error; |
488 | } |
489 | // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other |
490 | // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 |
491 | // To future reader: we'd love if someone found a better way, or at least could explain this result! |
492 | if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { |
493 | // |
494 | // Important: smallest_power is such that it leads to a zero value. |
495 | // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero |
496 | // so something x 10^-343 goes to zero, but not so with something x 10^-342. |
497 | static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough" ); |
498 | // |
499 | if((exponent < simdjson::internal::smallest_power) || (i == 0)) { |
500 | // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero |
501 | WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); |
502 | return SUCCESS; |
503 | } else { // (exponent > largest_power) and (i != 0) |
504 | // We have, for sure, an infinite value and simdjson refuses to parse infinite values. |
505 | return INVALID_NUMBER(src); |
506 | } |
507 | } |
508 | double d; |
509 | if (!compute_float_64(power: exponent, i, negative, d)) { |
510 | // we are almost never going to get here. |
511 | if (!parse_float_fallback(ptr: src, outDouble: &d)) { return INVALID_NUMBER(src); } |
512 | } |
513 | WRITE_DOUBLE(d, src, writer); |
514 | return SUCCESS; |
515 | } |
516 | |
517 | // for performance analysis, it is sometimes useful to skip parsing |
518 | #ifdef SIMDJSON_SKIPNUMBERPARSING |
519 | |
520 | template<typename W> |
521 | simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { |
522 | writer.append_s64(0); // always write zero |
523 | return SUCCESS; // always succeeds |
524 | } |
525 | |
526 | simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { return 0; } |
527 | simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept { return 0; } |
528 | simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept { return 0; } |
529 | simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } |
530 | simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } |
531 | simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept { return 0; } |
532 | simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } |
533 | simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept { return false; } |
534 | simdjson_unused simdjson_inline simdjson_result<ondemand::number_type> get_number_type(const uint8_t * src) noexcept { return ondemand::number_type::signed_integer; } |
535 | #else |
536 | |
537 | // parse the number at src |
538 | // define JSON_TEST_NUMBERS for unit testing |
539 | // |
540 | // It is assumed that the number is followed by a structural ({,},],[) character |
541 | // or a white space character. If that is not the case (e.g., when the JSON |
542 | // document is made of a single number), then it is necessary to copy the |
543 | // content and append a space before calling this function. |
544 | // |
545 | // Our objective is accurate parsing (ULP of 0) at high speed. |
546 | template<typename W> |
547 | simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { |
548 | |
549 | // |
550 | // Check for minus sign |
551 | // |
552 | bool negative = (*src == '-'); |
553 | const uint8_t *p = src + uint8_t(negative); |
554 | |
555 | // |
556 | // Parse the integer part. |
557 | // |
558 | // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare |
559 | const uint8_t *const start_digits = p; |
560 | uint64_t i = 0; |
561 | while (parse_digit(c: *p, i)) { p++; } |
562 | |
563 | // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. |
564 | // Optimization note: size_t is expected to be unsigned. |
565 | size_t digit_count = size_t(p - start_digits); |
566 | if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } |
567 | |
568 | // |
569 | // Handle floats if there is a . or e (or both) |
570 | // |
571 | int64_t exponent = 0; |
572 | bool is_float = false; |
573 | if ('.' == *p) { |
574 | is_float = true; |
575 | ++p; |
576 | SIMDJSON_TRY( parse_decimal(src, p, i, exponent) ); |
577 | digit_count = int(p - start_digits); // used later to guard against overflows |
578 | } |
579 | if (('e' == *p) || ('E' == *p)) { |
580 | is_float = true; |
581 | ++p; |
582 | SIMDJSON_TRY( parse_exponent(src, p, exponent) ); |
583 | } |
584 | if (is_float) { |
585 | const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(c: *p); |
586 | SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); |
587 | if (dirty_end) { return INVALID_NUMBER(src); } |
588 | return SUCCESS; |
589 | } |
590 | |
591 | // The longest negative 64-bit number is 19 digits. |
592 | // The longest positive 64-bit number is 20 digits. |
593 | // We do it this way so we don't trigger this branch unless we must. |
594 | size_t longest_digit_count = negative ? 19 : 20; |
595 | if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); } |
596 | if (digit_count == longest_digit_count) { |
597 | if (negative) { |
598 | // Anything negative above INT64_MAX+1 is invalid |
599 | if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); } |
600 | WRITE_INTEGER(~i+1, src, writer); |
601 | if (jsoncharutils::is_not_structural_or_whitespace(c: *p)) { return INVALID_NUMBER(src); } |
602 | return SUCCESS; |
603 | // Positive overflow check: |
604 | // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the |
605 | // biggest uint64_t. |
606 | // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. |
607 | // If we got here, it's a 20 digit number starting with the digit "1". |
608 | // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller |
609 | // than 1,553,255,926,290,448,384. |
610 | // - That is smaller than the smallest possible 20-digit number the user could write: |
611 | // 10,000,000,000,000,000,000. |
612 | // - Therefore, if the number is positive and lower than that, it's overflow. |
613 | // - The value we are looking at is less than or equal to INT64_MAX. |
614 | // |
615 | } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } |
616 | } |
617 | |
618 | // Write unsigned if it doesn't fit in a signed integer. |
619 | if (i > uint64_t(INT64_MAX)) { |
620 | WRITE_UNSIGNED(i, src, writer); |
621 | } else { |
622 | WRITE_INTEGER(negative ? (~i+1) : i, src, writer); |
623 | } |
624 | if (jsoncharutils::is_not_structural_or_whitespace(c: *p)) { return INVALID_NUMBER(src); } |
625 | return SUCCESS; |
626 | } |
627 | |
628 | // Inlineable functions |
629 | namespace { |
630 | |
631 | // This table can be used to characterize the final character of an integer |
632 | // string. For JSON structural character and allowable white space characters, |
633 | // we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise |
634 | // we return NUMBER_ERROR. |
635 | // Optimization note: we could easily reduce the size of the table by half (to 128) |
636 | // at the cost of an extra branch. |
637 | // Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): |
638 | static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast" ); |
639 | static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast" ); |
640 | static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast" ); |
641 | |
642 | const uint8_t integer_string_finisher[256] = { |
643 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
644 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, |
645 | SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, |
646 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
647 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
648 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
649 | NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, |
650 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
651 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, |
652 | NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
653 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
654 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, |
655 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
656 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, |
657 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
658 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
659 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
660 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
661 | NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, |
662 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
663 | NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
664 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
665 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
666 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
667 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, |
668 | SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
669 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
670 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
671 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
672 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
673 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
674 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
675 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
676 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
677 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
678 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
679 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
680 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
681 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
682 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
683 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
684 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
685 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
686 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
687 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
688 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
689 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
690 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
691 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
692 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
693 | NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, |
694 | NUMBER_ERROR}; |
695 | |
696 | // Parse any number from 0 to 18,446,744,073,709,551,615 |
697 | simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { |
698 | const uint8_t *p = src; |
699 | // |
700 | // Parse the integer part. |
701 | // |
702 | // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare |
703 | const uint8_t *const start_digits = p; |
704 | uint64_t i = 0; |
705 | while (parse_digit(c: *p, i)) { p++; } |
706 | |
707 | // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. |
708 | // Optimization note: size_t is expected to be unsigned. |
709 | size_t digit_count = size_t(p - start_digits); |
710 | // The longest positive 64-bit number is 20 digits. |
711 | // We do it this way so we don't trigger this branch unless we must. |
712 | // Optimization note: the compiler can probably merge |
713 | // ((digit_count == 0) || (digit_count > 20)) |
714 | // into a single branch since digit_count is unsigned. |
715 | if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } |
716 | // Here digit_count > 0. |
717 | if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } |
718 | // We can do the following... |
719 | // if (!jsoncharutils::is_structural_or_whitespace(*p)) { |
720 | // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; |
721 | // } |
722 | // as a single table lookup: |
723 | if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } |
724 | |
725 | if (digit_count == 20) { |
726 | // Positive overflow check: |
727 | // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the |
728 | // biggest uint64_t. |
729 | // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. |
730 | // If we got here, it's a 20 digit number starting with the digit "1". |
731 | // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller |
732 | // than 1,553,255,926,290,448,384. |
733 | // - That is smaller than the smallest possible 20-digit number the user could write: |
734 | // 10,000,000,000,000,000,000. |
735 | // - Therefore, if the number is positive and lower than that, it's overflow. |
736 | // - The value we are looking at is less than or equal to INT64_MAX. |
737 | // |
738 | if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } |
739 | } |
740 | |
741 | return i; |
742 | } |
743 | |
744 | |
745 | // Parse any number from 0 to 18,446,744,073,709,551,615 |
746 | // Never read at src_end or beyond |
747 | simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { |
748 | const uint8_t *p = src; |
749 | // |
750 | // Parse the integer part. |
751 | // |
752 | // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare |
753 | const uint8_t *const start_digits = p; |
754 | uint64_t i = 0; |
755 | while ((p != src_end) && parse_digit(c: *p, i)) { p++; } |
756 | |
757 | // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. |
758 | // Optimization note: size_t is expected to be unsigned. |
759 | size_t digit_count = size_t(p - start_digits); |
760 | // The longest positive 64-bit number is 20 digits. |
761 | // We do it this way so we don't trigger this branch unless we must. |
762 | // Optimization note: the compiler can probably merge |
763 | // ((digit_count == 0) || (digit_count > 20)) |
764 | // into a single branch since digit_count is unsigned. |
765 | if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } |
766 | // Here digit_count > 0. |
767 | if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } |
768 | // We can do the following... |
769 | // if (!jsoncharutils::is_structural_or_whitespace(*p)) { |
770 | // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; |
771 | // } |
772 | // as a single table lookup: |
773 | if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } |
774 | |
775 | if (digit_count == 20) { |
776 | // Positive overflow check: |
777 | // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the |
778 | // biggest uint64_t. |
779 | // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. |
780 | // If we got here, it's a 20 digit number starting with the digit "1". |
781 | // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller |
782 | // than 1,553,255,926,290,448,384. |
783 | // - That is smaller than the smallest possible 20-digit number the user could write: |
784 | // 10,000,000,000,000,000,000. |
785 | // - Therefore, if the number is positive and lower than that, it's overflow. |
786 | // - The value we are looking at is less than or equal to INT64_MAX. |
787 | // |
788 | if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } |
789 | } |
790 | |
791 | return i; |
792 | } |
793 | |
794 | // Parse any number from 0 to 18,446,744,073,709,551,615 |
795 | simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept { |
796 | const uint8_t *p = src + 1; |
797 | // |
798 | // Parse the integer part. |
799 | // |
800 | // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare |
801 | const uint8_t *const start_digits = p; |
802 | uint64_t i = 0; |
803 | while (parse_digit(c: *p, i)) { p++; } |
804 | |
805 | // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. |
806 | // Optimization note: size_t is expected to be unsigned. |
807 | size_t digit_count = size_t(p - start_digits); |
808 | // The longest positive 64-bit number is 20 digits. |
809 | // We do it this way so we don't trigger this branch unless we must. |
810 | // Optimization note: the compiler can probably merge |
811 | // ((digit_count == 0) || (digit_count > 20)) |
812 | // into a single branch since digit_count is unsigned. |
813 | if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } |
814 | // Here digit_count > 0. |
815 | if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } |
816 | // We can do the following... |
817 | // if (!jsoncharutils::is_structural_or_whitespace(*p)) { |
818 | // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; |
819 | // } |
820 | // as a single table lookup: |
821 | if (*p != '"') { return NUMBER_ERROR; } |
822 | |
823 | if (digit_count == 20) { |
824 | // Positive overflow check: |
825 | // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the |
826 | // biggest uint64_t. |
827 | // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. |
828 | // If we got here, it's a 20 digit number starting with the digit "1". |
829 | // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller |
830 | // than 1,553,255,926,290,448,384. |
831 | // - That is smaller than the smallest possible 20-digit number the user could write: |
832 | // 10,000,000,000,000,000,000. |
833 | // - Therefore, if the number is positive and lower than that, it's overflow. |
834 | // - The value we are looking at is less than or equal to INT64_MAX. |
835 | // |
836 | // Note: we use src[1] and not src[0] because src[0] is the quote character in this |
837 | // instance. |
838 | if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } |
839 | } |
840 | |
841 | return i; |
842 | } |
843 | |
844 | // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 |
845 | simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept { |
846 | // |
847 | // Check for minus sign |
848 | // |
849 | bool negative = (*src == '-'); |
850 | const uint8_t *p = src + uint8_t(negative); |
851 | |
852 | // |
853 | // Parse the integer part. |
854 | // |
855 | // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare |
856 | const uint8_t *const start_digits = p; |
857 | uint64_t i = 0; |
858 | while (parse_digit(c: *p, i)) { p++; } |
859 | |
860 | // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. |
861 | // Optimization note: size_t is expected to be unsigned. |
862 | size_t digit_count = size_t(p - start_digits); |
863 | // We go from |
864 | // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 |
865 | // so we can never represent numbers that have more than 19 digits. |
866 | size_t longest_digit_count = 19; |
867 | // Optimization note: the compiler can probably merge |
868 | // ((digit_count == 0) || (digit_count > longest_digit_count)) |
869 | // into a single branch since digit_count is unsigned. |
870 | if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } |
871 | // Here digit_count > 0. |
872 | if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } |
873 | // We can do the following... |
874 | // if (!jsoncharutils::is_structural_or_whitespace(*p)) { |
875 | // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; |
876 | // } |
877 | // as a single table lookup: |
878 | if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } |
879 | // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. |
880 | // Performance note: This check is only needed when digit_count == longest_digit_count but it is |
881 | // so cheap that we might as well always make it. |
882 | if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } |
883 | return negative ? (~i+1) : i; |
884 | } |
885 | |
886 | // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 |
887 | // Never read at src_end or beyond |
888 | simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { |
889 | // |
890 | // Check for minus sign |
891 | // |
892 | if(src == src_end) { return NUMBER_ERROR; } |
893 | bool negative = (*src == '-'); |
894 | const uint8_t *p = src + uint8_t(negative); |
895 | |
896 | // |
897 | // Parse the integer part. |
898 | // |
899 | // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare |
900 | const uint8_t *const start_digits = p; |
901 | uint64_t i = 0; |
902 | while ((p != src_end) && parse_digit(c: *p, i)) { p++; } |
903 | |
904 | // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. |
905 | // Optimization note: size_t is expected to be unsigned. |
906 | size_t digit_count = size_t(p - start_digits); |
907 | // We go from |
908 | // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 |
909 | // so we can never represent numbers that have more than 19 digits. |
910 | size_t longest_digit_count = 19; |
911 | // Optimization note: the compiler can probably merge |
912 | // ((digit_count == 0) || (digit_count > longest_digit_count)) |
913 | // into a single branch since digit_count is unsigned. |
914 | if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } |
915 | // Here digit_count > 0. |
916 | if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } |
917 | // We can do the following... |
918 | // if (!jsoncharutils::is_structural_or_whitespace(*p)) { |
919 | // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; |
920 | // } |
921 | // as a single table lookup: |
922 | if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } |
923 | // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. |
924 | // Performance note: This check is only needed when digit_count == longest_digit_count but it is |
925 | // so cheap that we might as well always make it. |
926 | if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } |
927 | return negative ? (~i+1) : i; |
928 | } |
929 | |
930 | // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 |
931 | simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept { |
932 | // |
933 | // Check for minus sign |
934 | // |
935 | bool negative = (*(src + 1) == '-'); |
936 | src += uint8_t(negative) + 1; |
937 | |
938 | // |
939 | // Parse the integer part. |
940 | // |
941 | // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare |
942 | const uint8_t *const start_digits = src; |
943 | uint64_t i = 0; |
944 | while (parse_digit(c: *src, i)) { src++; } |
945 | |
946 | // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. |
947 | // Optimization note: size_t is expected to be unsigned. |
948 | size_t digit_count = size_t(src - start_digits); |
949 | // We go from |
950 | // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 |
951 | // so we can never represent numbers that have more than 19 digits. |
952 | size_t longest_digit_count = 19; |
953 | // Optimization note: the compiler can probably merge |
954 | // ((digit_count == 0) || (digit_count > longest_digit_count)) |
955 | // into a single branch since digit_count is unsigned. |
956 | if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } |
957 | // Here digit_count > 0. |
958 | if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } |
959 | // We can do the following... |
960 | // if (!jsoncharutils::is_structural_or_whitespace(*src)) { |
961 | // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; |
962 | // } |
963 | // as a single table lookup: |
964 | if(*src != '"') { return NUMBER_ERROR; } |
965 | // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. |
966 | // Performance note: This check is only needed when digit_count == longest_digit_count but it is |
967 | // so cheap that we might as well always make it. |
968 | if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } |
969 | return negative ? (~i+1) : i; |
970 | } |
971 | |
972 | simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept { |
973 | // |
974 | // Check for minus sign |
975 | // |
976 | bool negative = (*src == '-'); |
977 | src += uint8_t(negative); |
978 | |
979 | // |
980 | // Parse the integer part. |
981 | // |
982 | uint64_t i = 0; |
983 | const uint8_t *p = src; |
984 | p += parse_digit(c: *p, i); |
985 | bool leading_zero = (i == 0); |
986 | while (parse_digit(c: *p, i)) { p++; } |
987 | // no integer digits, or 0123 (zero must be solo) |
988 | if ( p == src ) { return INCORRECT_TYPE; } |
989 | if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } |
990 | |
991 | // |
992 | // Parse the decimal part. |
993 | // |
994 | int64_t exponent = 0; |
995 | bool overflow; |
996 | if (simdjson_likely(*p == '.')) { |
997 | p++; |
998 | const uint8_t *start_decimal_digits = p; |
999 | if (!parse_digit(c: *p, i)) { return NUMBER_ERROR; } // no decimal digits |
1000 | p++; |
1001 | while (parse_digit(c: *p, i)) { p++; } |
1002 | exponent = -(p - start_decimal_digits); |
1003 | |
1004 | // Overflow check. More than 19 digits (minus the decimal) may be overflow. |
1005 | overflow = p-src-1 > 19; |
1006 | if (simdjson_unlikely(overflow && leading_zero)) { |
1007 | // Skip leading 0.00000 and see if it still overflows |
1008 | const uint8_t *start_digits = src + 2; |
1009 | while (*start_digits == '0') { start_digits++; } |
1010 | overflow = start_digits-src > 19; |
1011 | } |
1012 | } else { |
1013 | overflow = p-src > 19; |
1014 | } |
1015 | |
1016 | // |
1017 | // Parse the exponent |
1018 | // |
1019 | if (*p == 'e' || *p == 'E') { |
1020 | p++; |
1021 | bool exp_neg = *p == '-'; |
1022 | p += exp_neg || *p == '+'; |
1023 | |
1024 | uint64_t exp = 0; |
1025 | const uint8_t *start_exp_digits = p; |
1026 | while (parse_digit(c: *p, i&: exp)) { p++; } |
1027 | // no exp digits, or 20+ exp digits |
1028 | if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } |
1029 | |
1030 | exponent += exp_neg ? 0-exp : exp; |
1031 | } |
1032 | |
1033 | if (jsoncharutils::is_not_structural_or_whitespace(c: *p)) { return NUMBER_ERROR; } |
1034 | |
1035 | overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; |
1036 | |
1037 | // |
1038 | // Assemble (or slow-parse) the float |
1039 | // |
1040 | double d; |
1041 | if (simdjson_likely(!overflow)) { |
1042 | if (compute_float_64(power: exponent, i, negative, d)) { return d; } |
1043 | } |
1044 | if (!parse_float_fallback(ptr: src - uint8_t(negative), outDouble: &d)) { |
1045 | return NUMBER_ERROR; |
1046 | } |
1047 | return d; |
1048 | } |
1049 | |
1050 | simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { |
1051 | return (*src == '-'); |
1052 | } |
1053 | |
1054 | simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept { |
1055 | bool negative = (*src == '-'); |
1056 | src += uint8_t(negative); |
1057 | const uint8_t *p = src; |
1058 | while(static_cast<uint8_t>(*p - '0') <= 9) { p++; } |
1059 | if ( p == src ) { return NUMBER_ERROR; } |
1060 | if (jsoncharutils::is_structural_or_whitespace(c: *p)) { return true; } |
1061 | return false; |
1062 | } |
1063 | |
1064 | simdjson_unused simdjson_inline simdjson_result<ondemand::number_type> get_number_type(const uint8_t * src) noexcept { |
1065 | bool negative = (*src == '-'); |
1066 | src += uint8_t(negative); |
1067 | const uint8_t *p = src; |
1068 | while(static_cast<uint8_t>(*p - '0') <= 9) { p++; } |
1069 | if ( p == src ) { return NUMBER_ERROR; } |
1070 | if (jsoncharutils::is_structural_or_whitespace(c: *p)) { |
1071 | // We have an integer. |
1072 | // If the number is negative and valid, it must be a signed integer. |
1073 | if(negative) { return ondemand::number_type::signed_integer; } |
1074 | // We want values larger or equal to 9223372036854775808 to be unsigned |
1075 | // integers, and the other values to be signed integers. |
1076 | int digit_count = int(p - src); |
1077 | if(digit_count >= 19) { |
1078 | const uint8_t * smaller_big_integer = reinterpret_cast<const uint8_t *>("9223372036854775808" ); |
1079 | if((digit_count >= 20) || (memcmp(s1: src, s2: smaller_big_integer, n: 19) >= 0)) { |
1080 | return ondemand::number_type::unsigned_integer; |
1081 | } |
1082 | } |
1083 | return ondemand::number_type::signed_integer; |
1084 | } |
1085 | // Hopefully, we have 'e' or 'E' or '.'. |
1086 | return ondemand::number_type::floating_point_number; |
1087 | } |
1088 | |
1089 | // Never read at src_end or beyond |
1090 | simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { |
1091 | if(src == src_end) { return NUMBER_ERROR; } |
1092 | // |
1093 | // Check for minus sign |
1094 | // |
1095 | bool negative = (*src == '-'); |
1096 | src += uint8_t(negative); |
1097 | |
1098 | // |
1099 | // Parse the integer part. |
1100 | // |
1101 | uint64_t i = 0; |
1102 | const uint8_t *p = src; |
1103 | if(p == src_end) { return NUMBER_ERROR; } |
1104 | p += parse_digit(c: *p, i); |
1105 | bool leading_zero = (i == 0); |
1106 | while ((p != src_end) && parse_digit(c: *p, i)) { p++; } |
1107 | // no integer digits, or 0123 (zero must be solo) |
1108 | if ( p == src ) { return INCORRECT_TYPE; } |
1109 | if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } |
1110 | |
1111 | // |
1112 | // Parse the decimal part. |
1113 | // |
1114 | int64_t exponent = 0; |
1115 | bool overflow; |
1116 | if (simdjson_likely((p != src_end) && (*p == '.'))) { |
1117 | p++; |
1118 | const uint8_t *start_decimal_digits = p; |
1119 | if ((p == src_end) || !parse_digit(c: *p, i)) { return NUMBER_ERROR; } // no decimal digits |
1120 | p++; |
1121 | while ((p != src_end) && parse_digit(c: *p, i)) { p++; } |
1122 | exponent = -(p - start_decimal_digits); |
1123 | |
1124 | // Overflow check. More than 19 digits (minus the decimal) may be overflow. |
1125 | overflow = p-src-1 > 19; |
1126 | if (simdjson_unlikely(overflow && leading_zero)) { |
1127 | // Skip leading 0.00000 and see if it still overflows |
1128 | const uint8_t *start_digits = src + 2; |
1129 | while (*start_digits == '0') { start_digits++; } |
1130 | overflow = start_digits-src > 19; |
1131 | } |
1132 | } else { |
1133 | overflow = p-src > 19; |
1134 | } |
1135 | |
1136 | // |
1137 | // Parse the exponent |
1138 | // |
1139 | if ((p != src_end) && (*p == 'e' || *p == 'E')) { |
1140 | p++; |
1141 | if(p == src_end) { return NUMBER_ERROR; } |
1142 | bool exp_neg = *p == '-'; |
1143 | p += exp_neg || *p == '+'; |
1144 | |
1145 | uint64_t exp = 0; |
1146 | const uint8_t *start_exp_digits = p; |
1147 | while ((p != src_end) && parse_digit(c: *p, i&: exp)) { p++; } |
1148 | // no exp digits, or 20+ exp digits |
1149 | if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } |
1150 | |
1151 | exponent += exp_neg ? 0-exp : exp; |
1152 | } |
1153 | |
1154 | if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(c: *p)) { return NUMBER_ERROR; } |
1155 | |
1156 | overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; |
1157 | |
1158 | // |
1159 | // Assemble (or slow-parse) the float |
1160 | // |
1161 | double d; |
1162 | if (simdjson_likely(!overflow)) { |
1163 | if (compute_float_64(power: exponent, i, negative, d)) { return d; } |
1164 | } |
1165 | if (!parse_float_fallback(ptr: src - uint8_t(negative), end_ptr: src_end, outDouble: &d)) { |
1166 | return NUMBER_ERROR; |
1167 | } |
1168 | return d; |
1169 | } |
1170 | |
1171 | simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept { |
1172 | // |
1173 | // Check for minus sign |
1174 | // |
1175 | bool negative = (*(src + 1) == '-'); |
1176 | src += uint8_t(negative) + 1; |
1177 | |
1178 | // |
1179 | // Parse the integer part. |
1180 | // |
1181 | uint64_t i = 0; |
1182 | const uint8_t *p = src; |
1183 | p += parse_digit(c: *p, i); |
1184 | bool leading_zero = (i == 0); |
1185 | while (parse_digit(c: *p, i)) { p++; } |
1186 | // no integer digits, or 0123 (zero must be solo) |
1187 | if ( p == src ) { return INCORRECT_TYPE; } |
1188 | if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } |
1189 | |
1190 | // |
1191 | // Parse the decimal part. |
1192 | // |
1193 | int64_t exponent = 0; |
1194 | bool overflow; |
1195 | if (simdjson_likely(*p == '.')) { |
1196 | p++; |
1197 | const uint8_t *start_decimal_digits = p; |
1198 | if (!parse_digit(c: *p, i)) { return NUMBER_ERROR; } // no decimal digits |
1199 | p++; |
1200 | while (parse_digit(c: *p, i)) { p++; } |
1201 | exponent = -(p - start_decimal_digits); |
1202 | |
1203 | // Overflow check. More than 19 digits (minus the decimal) may be overflow. |
1204 | overflow = p-src-1 > 19; |
1205 | if (simdjson_unlikely(overflow && leading_zero)) { |
1206 | // Skip leading 0.00000 and see if it still overflows |
1207 | const uint8_t *start_digits = src + 2; |
1208 | while (*start_digits == '0') { start_digits++; } |
1209 | overflow = start_digits-src > 19; |
1210 | } |
1211 | } else { |
1212 | overflow = p-src > 19; |
1213 | } |
1214 | |
1215 | // |
1216 | // Parse the exponent |
1217 | // |
1218 | if (*p == 'e' || *p == 'E') { |
1219 | p++; |
1220 | bool exp_neg = *p == '-'; |
1221 | p += exp_neg || *p == '+'; |
1222 | |
1223 | uint64_t exp = 0; |
1224 | const uint8_t *start_exp_digits = p; |
1225 | while (parse_digit(c: *p, i&: exp)) { p++; } |
1226 | // no exp digits, or 20+ exp digits |
1227 | if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } |
1228 | |
1229 | exponent += exp_neg ? 0-exp : exp; |
1230 | } |
1231 | |
1232 | if (*p != '"') { return NUMBER_ERROR; } |
1233 | |
1234 | overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; |
1235 | |
1236 | // |
1237 | // Assemble (or slow-parse) the float |
1238 | // |
1239 | double d; |
1240 | if (simdjson_likely(!overflow)) { |
1241 | if (compute_float_64(power: exponent, i, negative, d)) { return d; } |
1242 | } |
1243 | if (!parse_float_fallback(ptr: src - uint8_t(negative), outDouble: &d)) { |
1244 | return NUMBER_ERROR; |
1245 | } |
1246 | return d; |
1247 | } |
1248 | } //namespace {} |
1249 | #endif // SIMDJSON_SKIPNUMBERPARSING |
1250 | |
1251 | } // namespace numberparsing |
1252 | } // unnamed namespace |
1253 | } // namespace SIMDJSON_IMPLEMENTATION |
1254 | } // namespace simdjson |
1255 | |