lj_strscan.c source code [Aerospike/modules/luajit/src/lj_strscan.c]

1	/*
2	** String scanning.
3	** Copyright (C) 2005-2014 Mike Pall. See Copyright Notice in luajit.h
4	*/
5
6	#include <math.h>
7
8	#define lj_strscan_c
9	#define LUA_CORE
10
11	#include "lj_obj.h"
12	#include "lj_char.h"
13	#include "lj_strscan.h"
14
15	/ -- Scanning numbers ---------------------------------------------------- /
16
17	/*
18	** Rationale for the builtin string to number conversion library:
19	**
20	** It removes a dependency on libc's strtod(), which is a true portability
21	** nightmare. Mainly due to the plethora of supported OS and toolchain
22	** combinations. Sadly, the various implementations
23	** a) are often buggy, incomplete (no hex floats) and/or imprecise,
24	** b) sometimes crash or hang on certain inputs,
25	** c) return non-standard NaNs that need to be filtered out, and
26	** d) fail if the locale-specific decimal separator is not a dot,
27	** which can only be fixed with atrocious workarounds.
28	**
29	** Also, most of the strtod() implementations are hopelessly bloated,
30	** which is not just an I-cache hog, but a problem for static linkage
31	** on embedded systems, too.
32	**
33	** OTOH the builtin conversion function is very compact. Even though it
34	** does a lot more, like parsing long longs, octal or imaginary numbers
35	** and returning the result in different formats:
36	** a) It needs less than 3 KB (!) of machine code (on x64 with -Os),
37	** b) it doesn't perform any dynamic allocation and,
38	** c) it needs only around 600 bytes of stack space.
39	**
40	** The builtin function is faster than strtod() for typical inputs, e.g.
41	** "123", "1.5" or "1e6". Arguably, it's slower for very large exponents,
42	** which are not very common (this could be fixed, if needed).
43	**
44	** And most importantly, the builtin function is equally precise on all
45	** platforms. It correctly converts and rounds any input to a double.
46	** If this is not the case, please send a bug report -- but PLEASE verify
47	** that the implementation you're comparing to is not the culprit!
48	**
49	** The implementation quickly pre-scans the entire string first and
50	** handles simple integers on-the-fly. Otherwise, it dispatches to the
51	** base-specific parser. Hex and octal is straightforward.
52	**
53	** Decimal to binary conversion uses a fixed-length circular buffer in
54	** base 100. Some simple cases are handled directly. For other cases, the
55	** number in the buffer is up-scaled or down-scaled until the integer part
56	** is in the proper range. Then the integer part is rounded and converted
57	** to a double which is finally rescaled to the result. Denormals need
58	** special treatment to prevent incorrect 'double rounding'.
59	*/
60
61	/ Definitions for circular decimal digit buffer (base 100 = 2 digits/byte). /
62	#define STRSCAN_DIG 1024
63	#define STRSCAN_MAXDIG 800 /* 772 + extra are sufficient. */
64	#define STRSCAN_DDIG (STRSCAN_DIG/2)
65	#define STRSCAN_DMASK (STRSCAN_DDIG-1)
66
67	/ Helpers for circular buffer. /
68	#define DNEXT(a) (((a)+1) & STRSCAN_DMASK)
69	#define DPREV(a) (((a)-1) & STRSCAN_DMASK)
70	#define DLEN(lo, hi) ((int32_t)(((lo)-(hi)) & STRSCAN_DMASK))
71
72	#define casecmp(c, k) (((c) \| 0x20) == k)
73
74	/ Final conversion to double. /
75	static void strscan_double(uint64_t x, TValue *o, int32_t ex2, int32_t neg)
76	{
77	double n;
78
79	/ Avoid double rounding for denormals. /
80	if (LJ_UNLIKELY(ex2 <= -`1075` && x != `0`)) {
81	/ NYI: all of this generates way too much code on 32 bit CPUs. /
82	#if defined(__GNUC__) && LJ_64
83	int32_t b = (int32_t)(__builtin_clzll(x)^`63`);
84	#else
85	int32_t b = (x>>`32`) ? `32`+(int32_t)lj_fls((uint32_t)(x>>`32`)) :
86	(int32_t)lj_fls((uint32_t)x);
87	#endif
88	if ((int32_t)b + ex2 <= -`1023` && (int32_t)b + ex2 >= -`1075`) {
89	uint64_t rb = (uint64_t)`1` << (-`1075`-ex2);
90	if ((x & rb) && ((x & (rb+rb+rb-`1`)))) x += rb+rb;
91	x = (x & ~(rb+rb-`1`));
92	}
93	}
94
95	/ Convert to double using a signed int64_t conversion, then rescale. /
96	lua_assert((int64_t)x >= `0`);
97	n = (double)(int64_t)x;
98	if (neg) n = -n;
99	if (ex2) n = ldexp(n, ex2);
100	o->n = n;
101	}
102
103	/ Parse hexadecimal number. /
104	static StrScanFmt strscan_hex(const uint8_t p, TValue o,
105	StrScanFmt fmt, uint32_t opt,
106	int32_t ex2, int32_t neg, uint32_t dig)
107	{
108	uint64_t x = `0`;
109	uint32_t i;
110
111	/ Scan hex digits. /
112	for (i = dig > `16` ? `16` : dig ; i; i--, p++) {
113	uint32_t d = (p != `'.'` ? p : *++p); if (d > `'9'`) d += `9`;
114	x = (x << `4`) + (d & `15`);
115	}
116
117	/ Summarize rounding-effect of excess digits. /
118	for (i = `16`; i < dig; i++, p++)
119	x \|= ((p != `'.'` ? p : *++p) != `'0'`), ex2 += `4`;
120
121	/ Format-specific handling. /
122	switch (fmt) {
123	case STRSCAN_INT:
124	if (!(opt & STRSCAN_OPT_TONUM) && x < `0x80000000u`+neg) {
125	o->i = neg ? -(int32_t)x : (int32_t)x;
126	return STRSCAN_INT; / Fast path for 32 bit integers. /
127	}
128	if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; break; }
129	/ fallthrough /
130	case STRSCAN_U32:
131	if (dig > `8`) return STRSCAN_ERROR;
132	o->i = neg ? -(int32_t)x : (int32_t)x;
133	return STRSCAN_U32;
134	case STRSCAN_I64:
135	case STRSCAN_U64:
136	if (dig > `16`) return STRSCAN_ERROR;
137	o->u64 = neg ? (uint64_t)-(int64_t)x : x;
138	return fmt;
139	default:
140	break;
141	}
142
143	/ Reduce range then convert to double. /
144	if ((x & U64x(c0000000,`0000000`))) { x = (x >> `2`) \| (x & `3`); ex2 += `2`; }
145	strscan_double(x, o, ex2, neg);
146	return fmt;
147	}
148
149	/ Parse octal number. /
150	static StrScanFmt strscan_oct(const uint8_t p, TValue o,
151	StrScanFmt fmt, int32_t neg, uint32_t dig)
152	{
153	uint64_t x = `0`;
154
155	/ Scan octal digits. /
156	if (dig > `22` \|\| (dig == `22` && p > `'1'`)) return* STRSCAN_ERROR;
157	while (dig-- > `0`) {
158	if (!(p >= `'0'` && p <= `'7'`)) return STRSCAN_ERROR;
159	x = (x << `3`) + (*p++ & `7`);
160	}
161
162	/ Format-specific handling. /
163	switch (fmt) {
164	case STRSCAN_INT:
165	if (x >= `0x80000000u`+neg) fmt = STRSCAN_U32;
166	/ fallthrough /
167	case STRSCAN_U32:
168	if ((x >> `32`)) return STRSCAN_ERROR;
169	o->i = neg ? -(int32_t)x : (int32_t)x;
170	break;
171	default:
172	case STRSCAN_I64:
173	case STRSCAN_U64:
174	o->u64 = neg ? (uint64_t)-(int64_t)x : x;
175	break;
176	}
177	return fmt;
178	}
179
180	/ Parse decimal number. /
181	static StrScanFmt strscan_dec(const uint8_t p, TValue o,
182	StrScanFmt fmt, uint32_t opt,
183	int32_t ex10, int32_t neg, uint32_t dig)
184	{
185	uint8_t xi[STRSCAN_DDIG], *xip = xi;
186
187	if (dig) {
188	uint32_t i = dig;
189	if (i > STRSCAN_MAXDIG) {
190	ex10 += (int32_t)(i - STRSCAN_MAXDIG);
191	i = STRSCAN_MAXDIG;
192	}
193	/ Scan unaligned leading digit. /
194	if (((ex10^i) & `1`))
195	xip++ = ((p != `'.'` ? p : ++p) & `15`), i--, p++;
196	/ Scan aligned double-digits. /
197	for ( ; i > `1`; i -= `2`) {
198	uint32_t d = `10` * ((p != `'.'` ? p : *++p) & `15`); p++;
199	xip++ = d + ((p != `'.'` ? p : ++p) & `15`); p++;
200	}
201	/ Scan and realign trailing digit. /
202	if (i) xip++ = `10` ((p != `'.'` ? p : *++p) & `15`), ex10--, p++;
203
204	/ Summarize rounding-effect of excess digits. /
205	if (dig > STRSCAN_MAXDIG) {
206	do {
207	if ((p != `'.'` ? p : ++p) != `'0'`) { xip[-`1`] \|= `1`; break*; }
208	p++;
209	} while (--dig > STRSCAN_MAXDIG);
210	dig = STRSCAN_MAXDIG;
211	} else { / Simplify exponent. /
212	while (ex10 > `0` && dig <= `18`) *xip++ = `0`, ex10 -= `2`, dig += `2`;
213	}
214	} else { / Only got zeros. /
215	ex10 = `0`;
216	xi[`0`] = `0`;
217	}
218
219	/ Fast path for numbers in integer format (but handles e.g. 1e6, too). /
220	if (dig <= `20` && ex10 == `0`) {
221	uint8_t *xis;
222	uint64_t x = xi[`0`];
223	double n;
224	for (xis = xi+`1`; xis < xip; xis++) x = x * `100` + *xis;
225	if (!(dig == `20` && (xi[`0`] > `18` \|\| (int64_t)x >= `0`))) { / No overflow? /
226	/ Format-specific handling. /
227	switch (fmt) {
228	case STRSCAN_INT:
229	if (!(opt & STRSCAN_OPT_TONUM) && x < `0x80000000u`+neg) {
230	o->i = neg ? -(int32_t)x : (int32_t)x;
231	return STRSCAN_INT; / Fast path for 32 bit integers. /
232	}
233	if (!(opt & STRSCAN_OPT_C)) { fmt = STRSCAN_NUM; goto plainnumber; }
234	/ fallthrough /
235	case STRSCAN_U32:
236	if ((x >> `32`) != `0`) return STRSCAN_ERROR;
237	o->i = neg ? -(int32_t)x : (int32_t)x;
238	return STRSCAN_U32;
239	case STRSCAN_I64:
240	case STRSCAN_U64:
241	o->u64 = neg ? (uint64_t)-(int64_t)x : x;
242	return fmt;
243	default:
244	plainnumber: / Fast path for plain numbers < 2^63. /
245	if ((int64_t)x < `0`) break;
246	n = (double)(int64_t)x;
247	if (neg) n = -n;
248	o->n = n;
249	return fmt;
250	}
251	}
252	}
253
254	/ Slow non-integer path. /
255	if (fmt == STRSCAN_INT) {
256	if ((opt & STRSCAN_OPT_C)) return STRSCAN_ERROR;
257	fmt = STRSCAN_NUM;
258	} else if (fmt > STRSCAN_INT) {
259	return STRSCAN_ERROR;
260	}
261	{
262	uint32_t hi = `0`, lo = (uint32_t)(xip-xi);
263	int32_t ex2 = `0`, idig = (int32_t)lo + (ex10 >> `1`);
264
265	lua_assert(lo > `0` && (ex10 & `1`) == `0`);
266
267	/ Handle simple overflow/underflow. /
268	if (idig > `310`/`2`) { if (neg) setminfV(o); else setpinfV(o); return fmt; }
269	else if (idig < -`326`/`2`) { o->n = neg ? -`0.0` : `0.0`; return fmt; }
270
271	/ Scale up until we have at least 17 or 18 integer part digits. /
272	while (idig < `9` && idig < DLEN(lo, hi)) {
273	uint32_t i, cy = `0`;
274	ex2 -= `6`;
275	for (i = DPREV(lo); ; i = DPREV(i)) {
276	uint32_t d = (xi[i] << `6`) + cy;
277	cy = (((d >> `2`) * `5243`) >> `17`); d = d - cy * `100`; / Div/mod 100. /
278	xi[i] = (uint8_t)d;
279	if (i == hi) break;
280	if (d == `0` && i == DPREV(lo)) lo = i;
281	}
282	if (cy) {
283	hi = DPREV(hi);
284	if (xi[DPREV(lo)] == `0`) lo = DPREV(lo);
285	else if (hi == lo) { lo = DPREV(lo); xi[DPREV(lo)] \|= xi[lo]; }
286	xi[hi] = (uint8_t)cy; idig++;
287	}
288	}
289
290	/ Scale down until no more than 17 or 18 integer part digits remain. /
291	while (idig > `9`) {
292	uint32_t i, cy = `0`;
293	ex2 += `6`;
294	for (i = hi; i != lo; i = DNEXT(i)) {
295	cy += xi[i];
296	xi[i] = (cy >> `6`);
297	cy = `100` * (cy & `0x3f`);
298	if (xi[i] == `0` && i == hi) hi = DNEXT(hi), idig--;
299	}
300	while (cy) {
301	if (hi == lo) { xi[DPREV(lo)] \|= `1`; break; }
302	xi[lo] = (cy >> `6`); lo = DNEXT(lo);
303	cy = `100` * (cy & `0x3f`);
304	}
305	}
306
307	/ Collect integer part digits and convert to rescaled double. /
308	{
309	uint64_t x = xi[hi];
310	uint32_t i;
311	for (i = DNEXT(hi); --idig > `0` && i != lo; i = DNEXT(i))
312	x = x * `100` + xi[i];
313	if (i == lo) {
314	while (--idig >= `0`) x = x * `100`;
315	} else { / Gather round bit from remaining digits. /
316	x <<= `1`; ex2--;
317	do {
318	if (xi[i]) { x \|= `1`; break; }
319	i = DNEXT(i);
320	} while (i != lo);
321	}
322	strscan_double(x, o, ex2, neg);
323	}
324	}
325	return fmt;
326	}
327
328	/ Scan string containing a number. Returns format. Returns value in o. /
329	StrScanFmt lj_strscan_scan(const uint8_t p, TValue o, uint32_t opt)
330	{
331	int32_t neg = `0`;
332
333	/ Remove leading space, parse sign and non-numbers. /
334	if (LJ_UNLIKELY(!lj_char_isdigit(*p))) {
335	while (lj_char_isspace(*p)) p++;
336	if (p == `'+'` \|\| p == `'-'`) neg = (*p++ == `'-'`);
337	if (LJ_UNLIKELY(p >= `'A'`)) { /* Parse "inf", "infinity" or "nan". /
338	TValue tmp;
339	setnanV(&tmp);
340	if (casecmp(p[`0`],`'i'`) && casecmp(p[`1`],`'n'`) && casecmp(p[`2`],`'f'`)) {
341	if (neg) setminfV(&tmp); else setpinfV(&tmp);
342	p += `3`;
343	if (casecmp(p[`0`],`'i'`) && casecmp(p[`1`],`'n'`) && casecmp(p[`2`],`'i'`) &&
344	casecmp(p[`3`],`'t'`) && casecmp(p[`4`],`'y'`)) p += `5`;
345	} else if (casecmp(p[`0`],`'n'`) && casecmp(p[`1`],`'a'`) && casecmp(p[`2`],`'n'`)) {
346	p += `3`;
347	}
348	while (lj_char_isspace(*p)) p++;
349	if (p) return* STRSCAN_ERROR;
350	o->u64 = tmp.u64;
351	return STRSCAN_NUM;
352	}
353	}
354
355	/ Parse regular number. /
356	{
357	StrScanFmt fmt = STRSCAN_INT;
358	int cmask = LJ_CHAR_DIGIT;
359	int base = (opt & STRSCAN_OPT_C) && *p == `'0'` ? `0` : `10`;
360	const uint8_t sp, dp = NULL;
361	uint32_t dig = `0`, hasdig = `0`, x = `0`;
362	int32_t ex = `0`;
363
364	/ Determine base and skip leading zeros. /
365	if (LJ_UNLIKELY(*p <= `'0'`)) {
366	if (*p == `'0'` && casecmp(p[`1`], `'x'`))
367	base = `16`, cmask = LJ_CHAR_XDIGIT, p += `2`;
368	for ( ; ; p++) {
369	if (*p == `'0'`) {
370	hasdig = `1`;
371	} else if (*p == `'.'`) {
372	if (dp) return STRSCAN_ERROR;
373	dp = p;
374	} else {
375	break;
376	}
377	}
378	}
379
380	/ Preliminary digit and decimal point scan. /
381	for (sp = p; ; p++) {
382	if (LJ_LIKELY(lj_char_isa(*p, cmask))) {
383	x = x * `10` + (p & `15`); /* For fast path below. /
384	dig++;
385	} else if (*p == `'.'`) {
386	if (dp) return STRSCAN_ERROR;
387	dp = p;
388	} else {
389	break;
390	}
391	}
392	if (!(hasdig \| dig)) return STRSCAN_ERROR;
393
394	/ Handle decimal point. /
395	if (dp) {
396	fmt = STRSCAN_NUM;
397	if (dig) {
398	ex = (int32_t)(dp-(p-`1`)); dp = p-`1`;
399	while (ex < `0` && dp-- == `'0'`) ex++, dig--; /* Skip trailing zeros. /
400	if (base == `16`) ex *= `4`;
401	}
402	}
403
404	/ Parse exponent. /
405	if (casecmp(*p, (uint32_t)(base == `16` ? `'p'` : `'e'`))) {
406	uint32_t xx;
407	int negx = `0`;
408	fmt = STRSCAN_NUM; p++;
409	if (p == `'+'` \|\| p == `'-'`) negx = (*p++ == `'-'`);
410	if (!lj_char_isdigit(p)) return* STRSCAN_ERROR;
411	xx = (*p++ & `15`);
412	while (lj_char_isdigit(*p)) {
413	if (xx < `65536`) xx = xx * `10` + (*p & `15`);
414	p++;
415	}
416	ex += negx ? -(int32_t)xx : (int32_t)xx;
417	}
418
419	/ Parse suffix. /
420	if (*p) {
421	/ I (IMAG), U (U32), LL (I64), ULL/LLU (U64), L (long), UL/LU (ulong). /
422	/ NYI: f (float). Not needed until cp_number() handles non-integers. /
423	if (casecmp(*p, `'i'`)) {
424	if (!(opt & STRSCAN_OPT_IMAG)) return STRSCAN_ERROR;
425	p++; fmt = STRSCAN_IMAG;
426	} else if (fmt == STRSCAN_INT) {
427	if (casecmp(*p, `'u'`)) p++, fmt = STRSCAN_U32;
428	if (casecmp(*p, `'l'`)) {
429	p++;
430	if (casecmp(*p, `'l'`)) p++, fmt += STRSCAN_I64 - STRSCAN_INT;
431	else if (!(opt & STRSCAN_OPT_C)) return STRSCAN_ERROR;
432	else if (sizeof(long) == `8`) fmt += STRSCAN_I64 - STRSCAN_INT;
433	}
434	if (casecmp(*p, `'u'`) && (fmt == STRSCAN_INT \|\| fmt == STRSCAN_I64))
435	p++, fmt += STRSCAN_U32 - STRSCAN_INT;
436	if ((fmt == STRSCAN_U32 && !(opt & STRSCAN_OPT_C)) \|\|
437	(fmt >= STRSCAN_I64 && !(opt & STRSCAN_OPT_LL)))
438	return STRSCAN_ERROR;
439	}
440	while (lj_char_isspace(*p)) p++;
441	if (p) return* STRSCAN_ERROR;
442	}
443
444	/ Fast path for decimal 32 bit integers. /
445	if (fmt == STRSCAN_INT && base == `10` &&
446	(dig < `10` \|\| (dig == `10` && *sp <= `'2'` && x < `0x80000000u`+neg))) {
447	int32_t y = neg ? -(int32_t)x : (int32_t)x;
448	if ((opt & STRSCAN_OPT_TONUM)) {
449	o->n = (double)y;
450	return STRSCAN_NUM;
451	} else {
452	o->i = y;
453	return STRSCAN_INT;
454	}
455	}
456
457	/ Dispatch to base-specific parser. /
458	if (base == `0` && !(fmt == STRSCAN_NUM \|\| fmt == STRSCAN_IMAG))
459	return strscan_oct(sp, o, fmt, neg, dig);
460	if (base == `16`)
461	fmt = strscan_hex(sp, o, fmt, opt, ex, neg, dig);
462	else
463	fmt = strscan_dec(sp, o, fmt, opt, ex, neg, dig);
464
465	/ Try to convert number to integer, if requested. /
466	if (fmt == STRSCAN_NUM && (opt & STRSCAN_OPT_TOINT)) {
467	double n = o->n;
468	int32_t i = lj_num2int(n);
469	if (n == (lua_Number)i) { o->i = i; return STRSCAN_INT; }
470	}
471	return fmt;
472	}
473	}
474
475	int LJ_FASTCALL lj_strscan_num(GCstr str, TValue o)
476	{
477	StrScanFmt fmt = lj_strscan_scan((const uint8_t *)strdata(str), o,
478	STRSCAN_OPT_TONUM);
479	lua_assert(fmt == STRSCAN_ERROR \|\| fmt == STRSCAN_NUM);
480	return (fmt != STRSCAN_ERROR);
481	}
482
483	#if LJ_DUALNUM
484	int LJ_FASTCALL lj_strscan_number(GCstr str, TValue o)
485	{
486	StrScanFmt fmt = lj_strscan_scan((const uint8_t *)strdata(str), o,
487	STRSCAN_OPT_TOINT);
488	lua_assert(fmt == STRSCAN_ERROR \|\| fmt == STRSCAN_NUM \|\| fmt == STRSCAN_INT);
489	if (fmt == STRSCAN_INT) setitype(o, LJ_TISNUM);
490	return (fmt != STRSCAN_ERROR);
491	}
492	#endif
493
494	#undef DNEXT
495	#undef DPREV
496	#undef DLEN
497
498

Browse the source code of Aerospike/modules/luajit/src/lj_strscan.c