| 1 | // This file contains the common code every implementation uses |
| 2 | // It is intended to be included multiple times and compiled multiple times |
// We assume the file in which it is included already includes
// "stringparsing.h" (this simplifies amalgamation)
| 5 | |
| 6 | // begin copypasta |
// Lookup table for single-character JSON escapes, indexed by the byte that
// follows the backslash. The entry is the byte the escape decodes to:
//   \" -> 0x22   \/ -> 0x2f   \\ -> 0x5c
//   \b -> 0x08 (backspace)    \f -> 0x0c (form feed)
//   \n -> 0x0a (newline)      \r -> 0x0d (carriage return)
//   \t -> 0x09 (horizontal tab)
// Every other entry is 0; parse_string treats a 0 entry as an invalid escape.
// 'u' is deliberately 0 here: \uXXXX escapes are decoded separately because
// they need more than a table lookup.
static const uint8_t escape_map[256] = {
    /* 0x0. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x1. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x2. */ 0, 0, 0x22 /* " */, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0x2f /* / */,
    /* 0x3. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x4. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x5. */ 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0x5c /* \ */, 0, 0, 0,
    /* 0x6. */ 0, 0, 0x08 /* b */, 0, 0, 0, 0x0c /* f */, 0,
               0, 0, 0, 0, 0, 0, 0x0a /* n */, 0,
    /* 0x7. */ 0, 0, 0x0d /* r */, 0, 0x09 /* t */, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x8. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x9. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xc. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf. */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
| 31 | |
| 32 | // handle a unicode codepoint |
| 33 | // write appropriate values into dest |
| 34 | // src will advance 6 bytes or 12 bytes |
| 35 | // dest will advance a variable amount (return via pointer) |
| 36 | // return true if the unicode codepoint was valid |
| 37 | // We work in little-endian then swap at write time |
| 38 | WARN_UNUSED |
| 39 | really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, |
| 40 | uint8_t **dst_ptr) { |
| 41 | // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the |
| 42 | // conversion isn't valid; we defer the check for this to inside the |
| 43 | // multilingual plane check |
| 44 | uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2); |
| 45 | *src_ptr += 6; |
| 46 | // check for low surrogate for characters outside the Basic |
| 47 | // Multilingual Plane. |
| 48 | if (code_point >= 0xd800 && code_point < 0xdc00) { |
| 49 | if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') { |
| 50 | return false; |
| 51 | } |
| 52 | uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2); |
| 53 | |
| 54 | // if the first code point is invalid we will get here, as we will go past |
| 55 | // the check for being outside the Basic Multilingual plane. If we don't |
| 56 | // find a \u immediately afterwards we fail out anyhow, but if we do, |
| 57 | // this check catches both the case of the first code point being invalid |
| 58 | // or the second code point being invalid. |
| 59 | if ((code_point | code_point_2) >> 16) { |
| 60 | return false; |
| 61 | } |
| 62 | |
| 63 | code_point = |
| 64 | (((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000; |
| 65 | *src_ptr += 6; |
| 66 | } |
| 67 | size_t offset = codepoint_to_utf8(code_point, *dst_ptr); |
| 68 | *dst_ptr += offset; |
| 69 | return offset > 0; |
| 70 | } |
| 71 | |
| 72 | WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf, |
| 73 | UNUSED size_t len, ParsedJson &pj, |
| 74 | UNUSED const uint32_t depth, |
| 75 | UNUSED uint32_t offset) { |
| 76 | pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"'); |
| 77 | const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */ |
| 78 | uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t); |
| 79 | const uint8_t *const start_of_string = dst; |
| 80 | while (1) { |
| 81 | parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst); |
| 82 | if (((helper.bs_bits - 1) & helper.quote_bits) != 0) { |
| 83 | /* we encountered quotes first. Move dst to point to quotes and exit |
| 84 | */ |
| 85 | |
| 86 | /* find out where the quote is... */ |
| 87 | auto quote_dist = trailing_zeroes(helper.quote_bits); |
| 88 | |
| 89 | /* NULL termination is still handy if you expect all your strings to |
| 90 | * be NULL terminated? */ |
| 91 | /* It comes at a small cost */ |
| 92 | dst[quote_dist] = 0; |
| 93 | |
| 94 | uint32_t str_length = (dst - start_of_string) + quote_dist; |
| 95 | memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length)); |
| 96 | /***************************** |
| 97 | * Above, check for overflow in case someone has a crazy string |
| 98 | * (>=4GB?) _ |
| 99 | * But only add the overflow check when the document itself exceeds |
| 100 | * 4GB |
| 101 | * Currently unneeded because we refuse to parse docs larger or equal |
| 102 | * to 4GB. |
| 103 | ****************************/ |
| 104 | |
| 105 | /* we advance the point, accounting for the fact that we have a NULL |
| 106 | * termination */ |
| 107 | pj.current_string_buf_loc = dst + quote_dist + 1; |
| 108 | return true; |
| 109 | } |
| 110 | if (((helper.quote_bits - 1) & helper.bs_bits) != 0) { |
| 111 | /* find out where the backspace is */ |
| 112 | auto bs_dist = trailing_zeroes(helper.bs_bits); |
| 113 | uint8_t escape_char = src[bs_dist + 1]; |
| 114 | /* we encountered backslash first. Handle backslash */ |
| 115 | if (escape_char == 'u') { |
| 116 | /* move src/dst up to the start; they will be further adjusted |
| 117 | within the unicode codepoint handling code. */ |
| 118 | src += bs_dist; |
| 119 | dst += bs_dist; |
| 120 | if (!handle_unicode_codepoint(&src, &dst)) { |
| 121 | return false; |
| 122 | } |
| 123 | } else { |
| 124 | /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and |
| 125 | * write bs_dist+1 characters to output |
| 126 | * note this may reach beyond the part of the buffer we've actually |
| 127 | * seen. I think this is ok */ |
| 128 | uint8_t escape_result = escape_map[escape_char]; |
| 129 | if (escape_result == 0u) { |
| 130 | return false; /* bogus escape value is an error */ |
| 131 | } |
| 132 | dst[bs_dist] = escape_result; |
| 133 | src += bs_dist + 2; |
| 134 | dst += bs_dist + 1; |
| 135 | } |
| 136 | } else { |
| 137 | /* they are the same. Since they can't co-occur, it means we |
| 138 | * encountered neither. */ |
| 139 | src += parse_string_helper::BYTES_PROCESSED; |
| 140 | dst += parse_string_helper::BYTES_PROCESSED; |
| 141 | } |
| 142 | } |
| 143 | /* can't be reached */ |
| 144 | return true; |
| 145 | } |
| 146 | |