stringparsing.h source code [ClickHouse/contrib/simdjson/src/generic/stringparsing.h]

// This file contains the common code every implementation uses.
// It is intended to be included multiple times and compiled multiple times.
// We assume the file in which it is included already includes
// "stringparsing.h" (this simplifies amalgamation).
5
// begin copypasta
// Lookup table for the single-character JSON escapes, indexed by the byte
// that follows the backslash. The entry is the byte to emit for that escape;
// a zero entry marks a byte that is not a valid simple escape.
// These chars yield themselves: " \ /
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
// u is not handled in this table as it is more complex.
static const uint8_t escape_map[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x1.
    0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, // 0x2. '"' and '/'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x3.

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. '\\'
    0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. b, f, n
    0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. r, t

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x8.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x9.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb.

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf.
};
31
32	// handle a unicode codepoint
33	// write appropriate values into dest
34	// src will advance 6 bytes or 12 bytes
35	// dest will advance a variable amount (return via pointer)
36	// return true if the unicode codepoint was valid
37	// We work in little-endian then swap at write time
38	WARN_UNUSED
39	really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
40	uint8_t **dst_ptr) {
41	// hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
42	// conversion isn't valid; we defer the check for this to inside the
43	// multilingual plane check
44	uint32_t code_point = hex_to_u32_nocheck(*src_ptr + `2`);
45	*src_ptr += `6`;
46	// check for low surrogate for characters outside the Basic
47	// Multilingual Plane.
48	if (code_point >= `0xd800` && code_point < `0xdc00`) {
49	if (((src_ptr)[`0`] != `'\\'`) \|\| (src_ptr)[`1`] != `'u'`) {
50	return false;
51	}
52	uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + `2`);
53
54	// if the first code point is invalid we will get here, as we will go past
55	// the check for being outside the Basic Multilingual plane. If we don't
56	// find a \u immediately afterwards we fail out anyhow, but if we do,
57	// this check catches both the case of the first code point being invalid
58	// or the second code point being invalid.
59	if ((code_point \| code_point_2) >> `16`) {
60	return false;
61	}
62
63	code_point =
64	(((code_point - `0xd800`) << `10`) \| (code_point_2 - `0xdc00`)) + `0x10000`;
65	*src_ptr += `6`;
66	}
67	size_t offset = codepoint_to_utf8(code_point, *dst_ptr);
68	*dst_ptr += offset;
69	return offset > `0`;
70	}
71
72	WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
73	UNUSED size_t len, ParsedJson &pj,
74	UNUSED const uint32_t depth,
75	UNUSED uint32_t offset) {
76	pj.write_tape(pj.current_string_buf_loc - pj.string_buf, `'"'`);
77	const uint8_t src = &buf[offset + `1`]; /* we know that buf at offset is a " /
78	uint8_t dst = pj.current_string_buf_loc + sizeof*(uint32_t);
79	const uint8_t *const start_of_string = dst;
80	while (`1`) {
81	parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst);
82	if (((helper.bs_bits - `1`) & helper.quote_bits) != `0`) {
83	/ we encountered quotes first. Move dst to point to quotes and exit*
84	*/
85
86	/ find out where the quote is... /
87	auto quote_dist = trailing_zeroes(helper.quote_bits);
88
89	/ NULL termination is still handy if you expect all your strings to*
90	* be NULL terminated? */
91	/ It comes at a small cost /
92	dst[quote_dist] = `0`;
93
94	uint32_t str_length = (dst - start_of_string) + quote_dist;
95	memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length));
96	/*****************************
97	* Above, check for overflow in case someone has a crazy string
98	* (>=4GB?) _
99	* But only add the overflow check when the document itself exceeds
100	* 4GB
101	* Currently unneeded because we refuse to parse docs larger or equal
102	* to 4GB.
103	****************************/
104
105	/ we advance the point, accounting for the fact that we have a NULL*
106	* termination */
107	pj.current_string_buf_loc = dst + quote_dist + `1`;
108	return true;
109	}
110	if (((helper.quote_bits - `1`) & helper.bs_bits) != `0`) {
111	/ find out where the backspace is /
112	auto bs_dist = trailing_zeroes(helper.bs_bits);
113	uint8_t escape_char = src[bs_dist + `1`];
114	/ we encountered backslash first. Handle backslash /
115	if (escape_char == `'u'`) {
116	/ move src/dst up to the start; they will be further adjusted*
117	within the unicode codepoint handling code. /*
118	src += bs_dist;
119	dst += bs_dist;
120	if (!handle_unicode_codepoint(&src, &dst)) {
121	return false;
122	}
123	} else {
124	/ simple 1:1 conversion. Will eat bs_dist+2 characters in input and*
125	* write bs_dist+1 characters to output
126	* note this may reach beyond the part of the buffer we've actually
127	* seen. I think this is ok */
128	uint8_t escape_result = escape_map[escape_char];
129	if (escape_result == `0u`) {
130	return false; / bogus escape value is an error /
131	}
132	dst[bs_dist] = escape_result;
133	src += bs_dist + `2`;
134	dst += bs_dist + `1`;
135	}
136	} else {
137	/ they are the same. Since they can't co-occur, it means we*
138	* encountered neither. */
139	src += parse_string_helper::BYTES_PROCESSED;
140	dst += parse_string_helper::BYTES_PROCESSED;
141	}
142	}
143	/ can't be reached /
144	return true;
145	}
146

Browse the source code of ClickHouse/contrib/simdjson/src/generic/stringparsing.h