1 | // This file contains the common code every implementation uses |
2 | // It is intended to be included multiple times and compiled multiple times |
3 | // We assume the file in which it is included already includes |
4 | // "stringparsing.h" (this simplifies amalgamation) |
5 | |
6 | // begin copypasta |
// Lookup table for JSON single-character escapes, indexed by the byte that
// follows the backslash. A non-zero entry is the byte the escape decodes to;
// a zero entry means the byte has no single-character escape in this table.
//   "  \  /  decode to themselves
//   b -> backspace, f -> form feed, n -> newline, r -> carriage return,
//   t -> horizontal tab
// 'u' has a zero entry on purpose: \uXXXX escapes are too complex for a
// one-byte table and are handled separately.
static const uint8_t escape_map[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x1.
    // 0x2. : '"' (0x22) and '/' (0x2f)
    0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x3.

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
    // 0x5. : '\' (0x5c)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0,
    // 0x6. : 'b' (0x62), 'f' (0x66), 'n' (0x6e)
    0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0,
    // 0x7. : 'r' (0x72), 't' (0x74)
    0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x8.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x9.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb.

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf.
};
31 | |
32 | // handle a unicode codepoint |
33 | // write appropriate values into dest |
34 | // src will advance 6 bytes or 12 bytes |
35 | // dest will advance a variable amount (return via pointer) |
36 | // return true if the unicode codepoint was valid |
37 | // We work in little-endian then swap at write time |
38 | WARN_UNUSED |
39 | really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, |
40 | uint8_t **dst_ptr) { |
41 | // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the |
42 | // conversion isn't valid; we defer the check for this to inside the |
43 | // multilingual plane check |
44 | uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); |
45 | *src_ptr += 6; |
46 | // check for low surrogate for characters outside the Basic |
47 | // Multilingual Plane. |
48 | if (code_point >= 0xd800 && code_point < 0xdc00) { |
49 | if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') { |
50 | return false; |
51 | } |
52 | uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); |
53 | |
54 | // if the first code point is invalid we will get here, as we will go past |
55 | // the check for being outside the Basic Multilingual plane. If we don't |
56 | // find a \u immediately afterwards we fail out anyhow, but if we do, |
57 | // this check catches both the case of the first code point being invalid |
58 | // or the second code point being invalid. |
59 | if ((code_point | code_point_2) >> 16) { |
60 | return false; |
61 | } |
62 | |
63 | code_point = |
64 | (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; |
65 | *src_ptr += 6; |
66 | } |
67 | size_t offset = codepoint_to_utf8(code_point, *dst_ptr); |
68 | *dst_ptr += offset; |
69 | return offset > 0; |
70 | } |
71 | |
72 | WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, |
73 | UNUSED size_t len, ParsedJson &pj, |
74 | UNUSED const uint32_t depth, |
75 | UNUSED uint32_t offset) { |
76 | pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); |
77 | const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ |
78 | uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); |
79 | const uint8_t *const start_of_string = dst; |
80 | while (1) { |
81 | parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst); |
82 | if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { |
83 | /* we encountered quotes first. Move dst to point to quotes and exit |
84 | */ |
85 | |
86 | /* find out where the quote is... */ |
87 | auto quote_dist = trailing_zeroes(helper.quote_bits); |
88 | |
89 | /* NULL termination is still handy if you expect all your strings to |
90 | * be NULL terminated? */ |
91 | /* It comes at a small cost */ |
92 | dst[quote_dist] = 0; |
93 | |
94 | uint32_t str_length = (dst - start_of_string) + quote_dist; |
95 | memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length)); |
96 | /***************************** |
97 | * Above, check for overflow in case someone has a crazy string |
98 | * (>=4GB?) _ |
99 | * But only add the overflow check when the document itself exceeds |
100 | * 4GB |
101 | * Currently unneeded because we refuse to parse docs larger or equal |
102 | * to 4GB. |
103 | ****************************/ |
104 | |
105 | /* we advance the point, accounting for the fact that we have a NULL |
106 | * termination */ |
107 | pj.current_string_buf_loc = dst + quote_dist + 1; |
108 | return true; |
109 | } |
110 | if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { |
111 | /* find out where the backspace is */ |
112 | auto bs_dist = trailing_zeroes(helper.bs_bits); |
113 | uint8_t escape_char = src[bs_dist + 1]; |
114 | /* we encountered backslash first. Handle backslash */ |
115 | if (escape_char == 'u') { |
116 | /* move src/dst up to the start; they will be further adjusted |
117 | within the unicode codepoint handling code. */ |
118 | src += bs_dist; |
119 | dst += bs_dist; |
120 | if (!handle_unicode_codepoint(&src, &dst)) { |
121 | return false; |
122 | } |
123 | } else { |
124 | /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and |
125 | * write bs_dist+1 characters to output |
126 | * note this may reach beyond the part of the buffer we've actually |
127 | * seen. I think this is ok */ |
128 | uint8_t escape_result = escape_map[escape_char]; |
129 | if (escape_result == 0u) { |
130 | return false; /* bogus escape value is an error */ |
131 | } |
132 | dst[bs_dist] = escape_result; |
133 | src += bs_dist + 2; |
134 | dst += bs_dist + 1; |
135 | } |
136 | } else { |
137 | /* they are the same. Since they can't co-occur, it means we |
138 | * encountered neither. */ |
139 | src += parse_string_helper::BYTES_PROCESSED; |
140 | dst += parse_string_helper::BYTES_PROCESSED; |
141 | } |
142 | } |
143 | /* can't be reached */ |
144 | return true; |
145 | } |
146 | |