pd_json.c source code [ClickHouse/contrib/poco/JSON/src/pd_json.c]

1	#include <stdio.h>
2	#include <stdbool.h>
3	#include <stdlib.h>
4	#include <string.h>
5	#include <ctype.h>
6	#include <errno.h>
7	#include "pd_json.h"
8
/*
 * Record the first error seen on a stream.
 *
 * Sets json->error and formats "error: <lineno>: <message>" into
 * json->errmsg. Once json->error is set, later invocations do nothing, so
 * the earliest failure is the one that is kept. At least one argument must
 * follow the format string.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and can be used in an unbraced if/else.
 */
#define json_error(json, format, ...)                            \
    do {                                                         \
        if (!(json)->error) {                                    \
            (json)->error = 1;                                   \
            snprintf((json)->errmsg, sizeof((json)->errmsg),     \
                     "error: %lu: " format,                      \
                     (unsigned long) (json)->lineno,             \
                     __VA_ARGS__);                               \
        }                                                        \
    } while (0)
17
/* Number of slots added to the nesting stack each time it has to grow. */
#define STACK_INC 4
19
20	static enum json_type
21	push(json_stream json, enum* json_type type)
22	{
23	json->stack_top++;
24
25	if (json->stack_top >= json->stack_size) {
26	struct json_stack *stack;
27	stack = json->alloc.realloc(json->stack,
28	(json->stack_size + STACK_INC) * sizeof(*json->stack));
29	if (stack == NULL) {
30	json_error(json, "%s", strerror(errno));
31	return JSON_ERROR;
32	}
33
34	json->stack_size += STACK_INC;
35	json->stack = stack;
36	}
37
38	json->stack[json->stack_top].type = type;
39	json->stack[json->stack_top].count = `0`;
40
41	return type;
42	}
43
44	static enum json_type
45	pop(json_stream json, int* c, enum json_type expected)
46	{
47	if (json->stack == NULL \|\| json->stack[json->stack_top].type != expected) {
48	json_error(json, "unexpected byte, '%c'", c);
49	json->alloc.free(json->stack);
50	return JSON_ERROR;
51	}
52	json->stack_top--;
53	return expected == JSON_ARRAY ? JSON_ARRAY_END : JSON_OBJECT_END;
54	}
55
56	static void pop_all(json_stream *json)
57	{
58	json->alloc.free(json->stack);
59	}
60
61	static int buffer_peek(struct json_source *source)
62	{
63	if (source->position < source->source.buffer.length)
64	return source->source.buffer.buffer[source->position];
65	else
66	return EOF;
67	}
68
69	static int buffer_get(struct json_source *source)
70	{
71	int c = source->peek(source);
72	source->position++;
73	return c;
74	}
75
76	static int stream_get(struct json_source *source)
77	{
78	source->position++;
79	return fgetc(source->source.stream.stream);
80	}
81
82	static int stream_peek(struct json_source *source)
83	{
84	int c = fgetc(source->source.stream.stream);
85	ungetc(c, source->source.stream.stream);
86	return c;
87	}
88
89	static void init(json_stream *json)
90	{
91	json->lineno = `1`;
92	json->error = `0`;
93	json->errmsg[`0`] = `'\0'`;
94	json->ntokens = `0`;
95	json->next = `0`;
96	json->streaming = true;
97
98	json->stack = NULL;
99	json->stack_top = -`1`;
100	json->stack_size = `0`;
101
102	json->data.string = NULL;
103	json->data.string_size = `0`;
104	json->data.string_fill = `0`;
105	json->source.position = `0`;
106
107	json->alloc.malloc = malloc;
108	json->alloc.realloc = realloc;
109	json->alloc.free = free;
110	}
111
112	static enum json_type
113	is_match(json_stream json, const* char pattern, enum* json_type type)
114	{
115	for (const char p = pattern; p; p++)
116	if (*p != json->source.get(&json->source))
117	return JSON_ERROR;
118	return type;
119	}
120
121	static int pushchar(json_stream json, int* c)
122	{
123	if (json->data.string_fill == json->data.string_size) {
124	size_t size = json->data.string_size * `2`;
125	char *buffer = json->alloc.realloc(json->data.string, size);
126	if (buffer == NULL) {
127	json_error(json, "%s", strerror(errno));
128	return -`1`;
129	} else {
130	json->data.string_size = size;
131	json->data.string = buffer;
132	}
133	}
134	json->data.string[json->data.string_fill++] = c;
135	return `0`;
136	}
137
138	static int init_string(json_stream *json)
139	{
140	json->data.string_fill = `0`;
141	if (json->data.string == NULL) {
142	json->data.string_size = `1024`;
143	json->data.string = json->alloc.malloc(json->data.string_size);
144	if (json->data.string == NULL) {
145	json_error(json, "%s", strerror(errno));
146	return -`1`;
147	}
148	}
149	json->data.string[`0`] = `'\0'`;
150	return `0`;
151	}
152
153	static int encode_utf8(json_stream json, unsigned* long c)
154	{
155	if (c < `0x80UL`) {
156	return pushchar(json, c);
157	} else if (c < `0x0800UL`) {
158	return !((pushchar(json, (c >> `6` & `0x1F`) \| `0xC0`) == `0`) &&
159	(pushchar(json, (c >> `0` & `0x3F`) \| `0x80`) == `0`));
160	} else if (c < `0x010000UL`) {
161	if (c >= `0xd800` && c <= `0xdfff`) {
162	json_error(json, "invalid codepoint %06lx", c);
163	return -`1`;
164	}
165	return !((pushchar(json, (c >> `12` & `0x0F`) \| `0xE0`) == `0`) &&
166	(pushchar(json, (c >> `6` & `0x3F`) \| `0x80`) == `0`) &&
167	(pushchar(json, (c >> `0` & `0x3F`) \| `0x80`) == `0`));
168	} else if (c < `0x110000UL`) {
169	return !((pushchar(json, (c >> `18` & `0x07`) \| `0xF0`) == `0`) &&
170	(pushchar(json, (c >> `12` & `0x3F`) \| `0x80`) == `0`) &&
171	(pushchar(json, (c >> `6` & `0x3F`) \| `0x80`) == `0`) &&
172	(pushchar(json, (c >> `0` & `0x3F`) \| `0x80`) == `0`));
173	} else {
174	json_error(json, "can't encode UTF-8 for %06lx", c);
175	return -`1`;
176	}
177	}
178
/*
 * Convert one hexadecimal digit character (0-9, a-f, A-F) to its value.
 * Returns 0..15, or -1 when `c` is not a hexadecimal digit.
 */
static int hexchar(int c)
{
    switch (c) {
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        /* The C standard guarantees the decimal digits are contiguous. */
        return c - '0';
    case 'a':
    case 'A': return 10;
    case 'b':
    case 'B': return 11;
    case 'c':
    case 'C': return 12;
    case 'd':
    case 'D': return 13;
    case 'e':
    case 'E': return 14;
    case 'f':
    case 'F': return 15;
    default:
        return -1;
    }
}
208
209	static long
210	read_unicode_cp(json_stream *json)
211	{
212	long cp = `0`;
213	int shift = `12`;
214
215	for (size_t i = `0`; i < `4`; i++) {
216	int c = json->source.get(&json->source);
217	int hc;
218
219	if (c == EOF) {
220	json_error(json, "%s", "unterminated string literal in unicode");
221	return -`1`;
222	} else if ((hc = hexchar(c)) == -`1`) {
223	json_error(json, "bad escape unicode byte, '%c'", c);
224	return -`1`;
225	}
226
227	cp += hc * (`1` << shift);
228	shift -= `4`;
229	}
230
231
232	return cp;
233	}
234
235	static int read_unicode(json_stream *json)
236	{
237	long cp, h, l;
238
239	if ((cp = read_unicode_cp(json)) == -`1`) {
240	return -`1`;
241	}
242
243	if (cp >= `0xd800` && cp <= `0xdbff`) {
244	/ This is the high portion of a surrogate pair; we need to read the*
245	* lower portion to get the codepoint
246	*/
247	h = cp;
248
249	int c = json->source.get(&json->source);
250	if (c == EOF) {
251	json_error(json, "%s", "unterminated string literal in unicode");
252	return -`1`;
253	} else if (c != `'\\'`) {
254	json_error(json, "invalid continuation for surrogate pair: '%c', "
255	"expected '\\'", c);
256	return -`1`;
257	}
258
259	c = json->source.get(&json->source);
260	if (c == EOF) {
261	json_error(json, "%s", "unterminated string literal in unicode");
262	return -`1`;
263	} else if (c != `'u'`) {
264	json_error(json, "invalid continuation for surrogate pair: '%c', "
265	"expected 'u'", c);
266	return -`1`;
267	}
268
269	if ((l = read_unicode_cp(json)) == -`1`) {
270	return -`1`;
271	}
272
273	if (l < `0xdc00` \|\| l > `0xdfff`) {
274	json_error(json, "invalid surrogate pair continuation \\u%04lx out "
275	"of range (dc00-dfff)", l);
276	return -`1`;
277	}
278
279	cp = ((h - `0xd800`) * `0x400`) + ((l - `0xdc00`) + `0x10000`);
280	} else if (cp >= `0xdc00` && cp <= `0xdfff`) {
281	json_error(json, "dangling surrogate \\u%04lx", cp);
282	return -`1`;
283	}
284
285	return encode_utf8(json, cp);
286	}
287
288	int read_escaped(json_stream *json)
289	{
290	int c = json->source.get(&json->source);
291	if (c == EOF) {
292	json_error(json, "%s", "unterminated string literal in escape");
293	return -`1`;
294	} else if (c == `'u'`) {
295	if (read_unicode(json) != `0`)
296	return -`1`;
297	} else {
298	switch (c) {
299	case `'\\'`:
300	case `'b'`:
301	case `'f'`:
302	case `'n'`:
303	case `'r'`:
304	case `'t'`:
305	case `'/'`:
306	case `'"'`:
307	{
308	const char *codes = "\\bfnrt/\"";
309	char *p = strchr(codes, c);
310	if (pushchar(json, "\\\b\f\n\r\t/\""[p - codes]) != `0`)
311	return -`1`;
312	}
313	break;
314	default:
315	json_error(json, "bad escaped byte, '%c'", c);
316	return -`1`;
317	}
318	}
319	return `0`;
320	}
321
/*
 * Tell whether byte `c` may not appear unescaped inside a JSON string:
 * control characters below 0x20, the double quote (0x22) and the backslash
 * (0x5c). Returns 1 for those, 0 for everything else, including negative
 * values.
 */
static int
char_needs_escaping(int c)
{
    if ((c >= 0) && (c < 0x20 || c == 0x22 || c == 0x5c)) {
        return 1;
    }

    return 0;
}
331
/*
 * Given the first byte of a UTF-8 sequence, return the total number of
 * bytes in that sequence (1 to 4). Returns 0 when the byte cannot start a
 * valid sequence: a continuation byte (0x80..0xBF), an overlong lead
 * (0xC0, 0xC1), or anything from 0xF5 upwards.
 */
static int
utf8_seq_length(char byte)
{
    unsigned char u = (unsigned char) byte;

    if (u < 0x80) {
        return 1;               /* ASCII */
    }
    if (u <= 0xBF) {
        return 0;               /* continuation byte, not a lead */
    }
    if (u == 0xC0 || u == 0xC1) {
        return 0;               /* overlong encoding of an ASCII byte */
    }
    if (u <= 0xDF) {
        return 2;               /* 0xC2..0xDF */
    }
    if (u <= 0xEF) {
        return 3;               /* 0xE0..0xEF */
    }
    if (u <= 0xF4) {
        return 4;               /* 0xF0..0xF4 */
    }
    /* 0xF5 and above: restricted or invalid in UTF-8 */
    return 0;
}
371
/*
 * Check that bytes[0..length-1] form one well-formed UTF-8 sequence.
 *
 * `length` must be 1..4. Trailing bytes must be continuation bytes
 * (0x80..0xBF); the second byte is additionally restricted for the lead
 * bytes 0xE0, 0xED, 0xF0 and 0xF4, which rejects overlong forms, surrogates
 * and values above U+10FFFF. The lead byte itself must not be a
 * continuation/overlong lead (0x80..0xC1) nor exceed 0xF4.
 *
 * Returns 1 when the sequence is legal, 0 otherwise (including a NULL
 * pointer or an out-of-range length). The lead byte is not checked against
 * `length`; the caller derives the length from the lead byte.
 */
static int
is_legal_utf8(const unsigned char *bytes, int length)
{
    if (bytes == NULL || length < 1 || length > 4) {
        return 0;
    }

    const unsigned char lead = bytes[0];

    /* Third and fourth bytes: plain continuation bytes. */
    for (int i = length - 1; i >= 2; i--) {
        if (bytes[i] < 0x80 || bytes[i] > 0xBF) {
            return 0;
        }
    }

    /* Second byte: its allowed range depends on the lead byte. */
    if (length >= 2) {
        unsigned char low = 0x80;
        unsigned char high = 0xBF;

        switch (lead) {
        case 0xE0: low = 0xA0; break;   /* no overlong 3-byte forms */
        case 0xED: high = 0x9F; break;  /* no UTF-16 surrogates */
        case 0xF0: low = 0x90; break;   /* no overlong 4-byte forms */
        case 0xF4: high = 0x8F; break;  /* nothing above U+10FFFF */
        default: break;
        }
        if (bytes[1] < low || bytes[1] > high) {
            return 0;
        }
    }

    if (lead >= 0x80 && lead < 0xC2) {
        return 0;
    }
    return lead <= 0xF4;
}
412
413	static int
414	read_utf8(json_stream* json, int next_char)
415	{
416	int count = utf8_seq_length(next_char);
417	if (!count)
418	{
419	json_error(json, "%s", "Bad character.");
420	return -`1`;
421	}
422
423	char buffer[`4`];
424	buffer[`0`] = next_char;
425	for (int i = `1`; i < count; ++i)
426	{
427	buffer[i] = json->source.get(&json->source);;
428	}
429
430	if (!is_legal_utf8((unsigned char*) buffer, count))
431	{
432	json_error(json, "%s", "No legal UTF8 found");
433	return -`1`;
434	}
435
436	for (int i = `0`; i < count; ++i)
437	{
438	if (pushchar(json, buffer[i]) != `0`)
439	return -`1`;
440	}
441	return `0`;
442	}
443
444	static enum json_type
445	read_string(json_stream *json)
446	{
447	if (init_string(json) != `0`)
448	return JSON_ERROR;
449	while (`1`) {
450	int c = json->source.get(&json->source);
451	if (c == EOF) {
452	json_error(json, "%s", "unterminated string literal");
453	return JSON_ERROR;
454	} else if (c == `'"'`) {
455	if (pushchar(json, `'\0'`) == `0`)
456	return JSON_STRING;
457	else
458	return JSON_ERROR;
459	} else if (c == `'\\'`) {
460	if (read_escaped(json) != `0`)
461	return JSON_ERROR;
462	} else if ((unsigned) c >= `0x80`) {
463	if (read_utf8(json, c) != `0`)
464	return JSON_ERROR;
465	} else {
466	if (char_needs_escaping(c)) {
467	json_error(json, "%s:%u", "unescaped control character in string", (unsigned)c);
468	return JSON_ERROR;
469	}
470
471	if (pushchar(json, c) != `0`)
472	return JSON_ERROR;
473	}
474	}
475	return JSON_ERROR;
476	}
477
/*
 * Return non-zero when `c` is one of the ASCII decimal digits '0'..'9'
 * (codes 48..57), independent of the current locale.
 */
static int
is_digit(int c)
{
    return c >= 48 /* '0' */ && c <= 57 /* '9' */;
}
483
484	static int
485	read_digits(json_stream *json)
486	{
487	unsigned nread = `0`;
488	while (is_digit(json->source.peek(&json->source))) {
489	if (pushchar(json, json->source.get(&json->source)) != `0`)
490	return -`1`;
491
492	nread++;
493	}
494
495	if (nread == `0`) {
496	return -`1`;
497	}
498
499	return `0`;
500	}
501
502	static enum json_type
503	read_number(json_stream json, int* c)
504	{
505	if (pushchar(json, c) != `0`)
506	return JSON_ERROR;
507	if (c == `'-'`) {
508	c = json->source.get(&json->source);
509	if (is_digit(c)) {
510	return read_number(json, c);
511	} else {
512	json_error(json, "unexpected byte, '%c'", c);
513	}
514	} else if (strchr("123456789", c) != NULL) {
515	c = json->source.peek(&json->source);
516	if (is_digit(c)) {
517	if (read_digits(json) != `0`)
518	return JSON_ERROR;
519	}
520	}
521	/ Up to decimal or exponent has been read. /
522	c = json->source.peek(&json->source);
523	if (strchr(".eE", c) == NULL) {
524	if (pushchar(json, `'\0'`) != `0`)
525	return JSON_ERROR;
526	else
527	return JSON_NUMBER;
528	}
529	if (c == `'.'`) {
530	json->source.get(&json->source); // consume .
531	if (pushchar(json, c) != `0`)
532	return JSON_ERROR;
533	if (read_digits(json) != `0`)
534	return JSON_ERROR;
535	}
536	/ Check for exponent. /
537	c = json->source.peek(&json->source);
538	if (c == `'e'` \|\| c == `'E'`) {
539	json->source.get(&json->source); // consume e/E
540	if (pushchar(json, c) != `0`)
541	return JSON_ERROR;
542	c = json->source.peek(&json->source);
543	if (c == `'+'` \|\| c == `'-'`) {
544	json->source.get(&json->source); // consume
545	if (pushchar(json, c) != `0`)
546	return JSON_ERROR;
547	if (read_digits(json) != `0`)
548	return JSON_ERROR;
549	} else if (is_digit(c)) {
550	if (read_digits(json) != `0`)
551	return JSON_ERROR;
552	} else {
553	json_error(json, "unexpected byte in number, '%c'", c);
554	return JSON_ERROR;
555	}
556	}
557	if (pushchar(json, `'\0'`) != `0`)
558	return JSON_ERROR;
559	else
560	return JSON_NUMBER;
561	}
562
/*
 * Return 1 when `c` is one of the four whitespace bytes this parser skips
 * between tokens: tab (0x09), line feed (0x0a), carriage return (0x0d) or
 * space (0x20). Returns 0 for everything else.
 */
static int
json_isspace(int c)
{
    switch (c) {
    case 0x09:
    case 0x0a:
    case 0x0d:
    case 0x20:
        return 1;
    default:
        return 0;
    }
}
576
577	/ Returns the next non-whitespace character in the stream. /
578	static int next(json_stream *json)
579	{
580	int c;
581	while (json_isspace(c = json->source.get(&json->source)))
582	if (c == `'\n'`)
583	json->lineno++;
584	return c;
585	}
586
587	static enum json_type
588	read_value(json_stream json, int* c)
589	{
590	json->ntokens++;
591	switch (c) {
592	case EOF:
593	json_error(json, "%s", "unexpected end of data");
594	return JSON_ERROR;
595	case `'{'`:
596	return push(json, JSON_OBJECT);
597	case `'['`:
598	return push(json, JSON_ARRAY);
599	case `'"'`:
600	return read_string(json);
601	case `'n'`:
602	return is_match(json, "ull", JSON_NULL);
603	case `'f'`:
604	return is_match(json, "alse", JSON_FALSE);
605	case `'t'`:
606	return is_match(json, "rue", JSON_TRUE);
607	case `'0'`:
608	case `'1'`:
609	case `'2'`:
610	case `'3'`:
611	case `'4'`:
612	case `'5'`:
613	case `'6'`:
614	case `'7'`:
615	case `'8'`:
616	case `'9'`:
617	case `'-'`:
618	if (init_string(json) != `0`)
619	return JSON_ERROR;
620	return read_number(json, c);
621	default:
622	json_error(json, "unexpected byte, '%c'", c);
623	return JSON_ERROR;
624	}
625	}
626
627	enum json_type json_peek(json_stream *json)
628	{
629	enum json_type next = json_next(json);
630	json->next = next;
631	return next;
632	}
633
634	enum json_type json_next(json_stream *json)
635	{
636	if (json->error)
637	return JSON_ERROR;
638	if (json->next != `0`) {
639	enum json_type next = json->next;
640	json->next = `0`;
641	return next;
642	}
643	if (json->ntokens > `0` && json->stack_top == (size_t)-`1`) {
644	int c;
645
646	do {
647	c = json->source.peek(&json->source);
648	if (json_isspace(c)) {
649	c = json->source.get(&json->source);
650	}
651	} while (json_isspace(c));
652	if (!json->streaming && c != EOF) {
653	return JSON_ERROR;
654	}
655	return JSON_DONE;
656	}
657	int c = next(json);
658	if (json->stack == NULL)
659	return read_value(json, c);
660	if (json->stack[json->stack_top].type == JSON_ARRAY) {
661	if (json->stack[json->stack_top].count == `0`) {
662	if (c == `']'`) {
663	return pop(json, c, JSON_ARRAY);
664	}
665	json->stack[json->stack_top].count++;
666	return read_value(json, c);
667	} else if (c == `','`) {
668	json->stack[json->stack_top].count++;
669	return read_value(json, next(json));
670	} else if (c == `']'`) {
671	return pop(json, c, JSON_ARRAY);
672	} else {
673	json_error(json, "unexpected byte, '%c'", c);
674	return JSON_ERROR;
675	}
676	} else if (json->stack[json->stack_top].type == JSON_OBJECT) {
677	if (json->stack[json->stack_top].count == `0`) {
678	if (c == `'}'`) {
679	return pop(json, c, JSON_OBJECT);
680	}
681
682	/ No property value pairs yet. /
683	enum json_type value = read_value(json, c);
684	if (value != JSON_STRING) {
685	json_error(json, "%s", "expected property name or '}'");
686	return JSON_ERROR;
687	} else {
688	json->stack[json->stack_top].count++;
689	return value;
690	}
691	} else if ((json->stack[json->stack_top].count % `2`) == `0`) {
692	/ Expecting comma followed by property name. /
693	if (c != `','` && c != `'}'`) {
694	json_error(json, "%s", "expected ',' or '}'");
695	return JSON_ERROR;
696	} else if (c == `'}'`) {
697	return pop(json, c, JSON_OBJECT);
698	} else {
699	enum json_type value = read_value(json, next(json));
700	if (value != JSON_STRING) {
701	json_error(json, "%s", "expected property name");
702	return JSON_ERROR;
703	} else {
704	json->stack[json->stack_top].count++;
705	return value;
706	}
707	}
708	} else if ((json->stack[json->stack_top].count % `2`) == `1`) {
709	/ Expecting colon followed by value. /
710	if (c != `':'`) {
711	json_error(json, "%s", "expected ':' after property name");
712	return JSON_ERROR;
713	} else {
714	json->stack[json->stack_top].count++;
715	return read_value(json, next(json));
716	}
717	}
718	}
719	json_error(json, "%s", "invalid parser state");
720	return JSON_ERROR;
721	}
722
723	void json_reset(json_stream *json)
724	{
725	pop_all(json);
726	json->ntokens = `0`;
727	json->error = `0`;
728	json->errmsg[`0`] = `'\0'`;
729	}
730
731	const char json_get_string(json_stream json, size_t *length)
732	{
733	if (length != NULL)
734	*length = json->data.string_fill;
735	if (json->data.string == NULL)
736	return "";
737	else
738	return json->data.string;
739	}
740
741	double json_get_number(json_stream *json)
742	{
743	char *p = json->data.string;
744	return p == NULL ? `0` : strtod(p, NULL);
745	}
746
747	const char json_get_error(json_stream json)
748	{
749	return json->error ? json->errmsg : NULL;
750	}
751
752	size_t json_get_lineno(json_stream *json)
753	{
754	return json->lineno;
755	}
756
757	size_t json_get_position(json_stream *json)
758	{
759	return json->source.position;
760	}
761
762	size_t json_get_depth(json_stream *json)
763	{
764	return json->stack_top + `1`;
765	}
766
767	void json_open_buffer(json_stream json, const* void *buffer, size_t size)
768	{
769	init(json);
770	json->source.get = buffer_get;
771	json->source.peek = buffer_peek;
772	json->source.source.buffer.buffer = buffer;
773	json->source.source.buffer.length = size;
774	}
775
776	void json_open_string(json_stream json, const* char *string)
777	{
778	json_open_buffer(json, string, strlen(string));
779	}
780
781	void json_open_stream(json_stream json, FILE stream)
782	{
783	init(json);
784	json->source.get = stream_get;
785	json->source.peek = stream_peek;
786	json->source.source.stream.stream = stream;
787	}
788
789	void json_set_allocator(json_stream json, json_allocator a)
790	{
791	json->alloc = *a;
792	}
793
794	void json_set_streaming(json_stream *json, bool streaming)
795	{
796	json->streaming = streaming;
797	}
798
799	void json_close(json_stream *json)
800	{
801	pop_all(json);
802	json->alloc.free(json->data.string);
803	}
804

Browse the source code of ClickHouse/contrib/poco/JSON/src/pd_json.c