1// This file contains the common code every implementation uses
2// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is included already includes
// "stringparsing.h" (this simplifies amalgamation)
5
6// begin copypasta
7// These chars yield themselves: " \ /
8// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
9// u not handled in this table as it's complex
10static const uint8_t escape_map[256] = {
11 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
12 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
13 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
14 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
15
16 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
17 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
18 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
19 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
20
21 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
25
26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30};
31
32// handle a unicode codepoint
33// write appropriate values into dest
34// src will advance 6 bytes or 12 bytes
35// dest will advance a variable amount (return via pointer)
36// return true if the unicode codepoint was valid
37// We work in little-endian then swap at write time
38WARN_UNUSED
39really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
40 uint8_t **dst_ptr) {
41 // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
42 // conversion isn't valid; we defer the check for this to inside the
43 // multilingual plane check
44 uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2);
45 *src_ptr += 6;
46 // check for low surrogate for characters outside the Basic
47 // Multilingual Plane.
48 if (code_point >= 0xd800 && code_point < 0xdc00) {
49 if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
50 return false;
51 }
52 uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2);
53
54 // if the first code point is invalid we will get here, as we will go past
55 // the check for being outside the Basic Multilingual plane. If we don't
56 // find a \u immediately afterwards we fail out anyhow, but if we do,
57 // this check catches both the case of the first code point being invalid
58 // or the second code point being invalid.
59 if ((code_point | code_point_2) >> 16) {
60 return false;
61 }
62
63 code_point =
64 (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
65 *src_ptr += 6;
66 }
67 size_t offset = codepoint_to_utf8(code_point, *dst_ptr);
68 *dst_ptr += offset;
69 return offset > 0;
70}
71
72WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
73 UNUSED size_t len, ParsedJson &pj,
74 UNUSED const uint32_t depth,
75 UNUSED uint32_t offset) {
76 pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
77 const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */
78 uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
79 const uint8_t *const start_of_string = dst;
80 while (1) {
81 parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst);
82 if (((helper.bs_bits - 1) & helper.quote_bits) != 0) {
83 /* we encountered quotes first. Move dst to point to quotes and exit
84 */
85
86 /* find out where the quote is... */
87 auto quote_dist = trailing_zeroes(helper.quote_bits);
88
89 /* NULL termination is still handy if you expect all your strings to
90 * be NULL terminated? */
91 /* It comes at a small cost */
92 dst[quote_dist] = 0;
93
94 uint32_t str_length = (dst - start_of_string) + quote_dist;
95 memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length));
96 /*****************************
97 * Above, check for overflow in case someone has a crazy string
98 * (>=4GB?) _
99 * But only add the overflow check when the document itself exceeds
100 * 4GB
101 * Currently unneeded because we refuse to parse docs larger or equal
102 * to 4GB.
103 ****************************/
104
105 /* we advance the point, accounting for the fact that we have a NULL
106 * termination */
107 pj.current_string_buf_loc = dst + quote_dist + 1;
108 return true;
109 }
110 if (((helper.quote_bits - 1) & helper.bs_bits) != 0) {
111 /* find out where the backspace is */
112 auto bs_dist = trailing_zeroes(helper.bs_bits);
113 uint8_t escape_char = src[bs_dist + 1];
114 /* we encountered backslash first. Handle backslash */
115 if (escape_char == 'u') {
116 /* move src/dst up to the start; they will be further adjusted
117 within the unicode codepoint handling code. */
118 src += bs_dist;
119 dst += bs_dist;
120 if (!handle_unicode_codepoint(&src, &dst)) {
121 return false;
122 }
123 } else {
124 /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
125 * write bs_dist+1 characters to output
126 * note this may reach beyond the part of the buffer we've actually
127 * seen. I think this is ok */
128 uint8_t escape_result = escape_map[escape_char];
129 if (escape_result == 0u) {
130 return false; /* bogus escape value is an error */
131 }
132 dst[bs_dist] = escape_result;
133 src += bs_dist + 2;
134 dst += bs_dist + 1;
135 }
136 } else {
137 /* they are the same. Since they can't co-occur, it means we
138 * encountered neither. */
139 src += parse_string_helper::BYTES_PROCESSED;
140 dst += parse_string_helper::BYTES_PROCESSED;
141 }
142 }
143 /* can't be reached */
144 return true;
145}
146