| 1 | // Protocol Buffers - Google's data interchange format |
| 2 | // Copyright 2008 Google Inc. All rights reserved. |
| 3 | // https://developers.google.com/protocol-buffers/ |
| 4 | // |
| 5 | // Redistribution and use in source and binary forms, with or without |
| 6 | // modification, are permitted provided that the following conditions are |
| 7 | // met: |
| 8 | // |
| 9 | // * Redistributions of source code must retain the above copyright |
| 10 | // notice, this list of conditions and the following disclaimer. |
| 11 | // * Redistributions in binary form must reproduce the above |
| 12 | // copyright notice, this list of conditions and the following disclaimer |
| 13 | // in the documentation and/or other materials provided with the |
| 14 | // distribution. |
| 15 | // * Neither the name of Google Inc. nor the names of its |
| 16 | // contributors may be used to endorse or promote products derived from |
| 17 | // this software without specific prior written permission. |
| 18 | // |
| 19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 30 | |
| 31 | #include <google/protobuf/util/internal/json_escaping.h> |
| 32 | |
| 33 | #include <cstdint> |
| 34 | |
| 35 | #include <google/protobuf/stubs/logging.h> |
| 36 | #include <google/protobuf/stubs/common.h> |
| 37 | |
| 38 | namespace google { |
| 39 | namespace protobuf { |
| 40 | namespace util { |
| 41 | namespace converter { |
| 42 | |
| 43 | namespace { |
| 44 | |
// Lower-case hexadecimal digits, indexed by nibble value (0..15).
constexpr char kHex[] = "0123456789abcdef";
| 47 | |
// Lookup table for the very common code points 0x00..0x9f.
//
// For a unicode code point ch < 0xa0, kCommonEscapes[ch] is:
//   - the escaped form of ch, if ch has to be escaped; or
//   - the empty string, if ch can be emitted unchanged.
// Every entry holds at most 6 characters plus the terminating NUL.
static const char kCommonEscapes[160][7] = {
    // C0 (ASCII and derivatives) control characters.
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x00 - 0x07
    "\\b", "\\t", "\\n", "\\u000b",
    "\\f", "\\r", "\\u000e", "\\u000f",  // 0x08 - 0x0f
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x10 - 0x17
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x18 - 0x1f
    // Printable ASCII.
    // Escaping of " and \ are required by www.json.org string definition.
    // Escaping of < and > are required for HTML security.
    "", "", "\\\"", "", "", "", "", "",            // 0x20 - 0x27
    "", "", "", "", "", "", "", "",                // 0x28 - 0x2f
    "", "", "", "", "", "", "", "",                // 0x30 - 0x37
    "", "", "", "", "\\u003c", "", "\\u003e", "",  // 0x38 - 0x3f
    "", "", "", "", "", "", "", "",                // 0x40 - 0x47
    "", "", "", "", "", "", "", "",                // 0x48 - 0x4f
    "", "", "", "", "", "", "", "",                // 0x50 - 0x57
    "", "", "", "", "\\\\", "", "", "",            // 0x58 - 0x5f
    "", "", "", "", "", "", "", "",                // 0x60 - 0x67
    "", "", "", "", "", "", "", "",                // 0x68 - 0x6f
    "", "", "", "", "", "", "", "",                // 0x70 - 0x77
    "", "", "", "", "", "", "", "\\u007f",         // 0x78 - 0x7f
    // C1 (ISO 8859 and Unicode) extended control characters.
    "\\u0080", "\\u0081", "\\u0082", "\\u0083",
    "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x80 - 0x87
    "\\u0088", "\\u0089", "\\u008a", "\\u008b",
    "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x88 - 0x8f
    "\\u0090", "\\u0091", "\\u0092", "\\u0093",
    "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x90 - 0x97
    "\\u0098", "\\u0099", "\\u009a", "\\u009b",
    "\\u009c", "\\u009d", "\\u009e", "\\u009f"};  // 0x98 - 0x9f
| 79 | |
| 80 | // Determines if the given char value is a unicode surrogate code unit (either |
| 81 | // high-surrogate or low-surrogate). |
| 82 | inline bool IsSurrogate(uint32_t c) { |
| 83 | // Optimized form of: |
| 84 | // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate; |
| 85 | // (Reduced from 3 ALU instructions to 2 ALU instructions) |
| 86 | return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate; |
| 87 | } |
| 88 | |
| 89 | // Returns true if the given unicode code point cp is a valid |
| 90 | // unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint). |
| 91 | inline bool IsValidCodePoint(uint32_t cp) { |
| 92 | return cp <= JsonEscaping::kMaxCodePoint; |
| 93 | } |
| 94 | |
| 95 | // Returns the low surrogate for the given unicode code point. The result is |
| 96 | // meaningless if the given code point is not a supplementary character. |
| 97 | inline uint16_t ToLowSurrogate(uint32_t cp) { |
| 98 | return (cp & |
| 99 | (JsonEscaping::kMaxLowSurrogate - JsonEscaping::kMinLowSurrogate)) + |
| 100 | JsonEscaping::kMinLowSurrogate; |
| 101 | } |
| 102 | |
| 103 | // Returns the high surrogate for the given unicode code point. The result is |
| 104 | // meaningless if the given code point is not a supplementary character. |
| 105 | inline uint16_t ToHighSurrogate(uint32_t cp) { |
| 106 | return (cp >> 10) + (JsonEscaping::kMinHighSurrogate - |
| 107 | (JsonEscaping::kMinSupplementaryCodePoint >> 10)); |
| 108 | } |
| 109 | |
| 110 | // Input str is encoded in UTF-8. A unicode code point could be encoded in |
| 111 | // UTF-8 using anywhere from 1 to 4 characters, and it could span multiple |
| 112 | // reads of the ByteSource. |
| 113 | // |
| 114 | // This function reads the next unicode code point from the input (str) at |
| 115 | // the given position (index), taking into account any left-over partial |
| 116 | // code point from the previous iteration (cp), together with the number |
| 117 | // of characters left to read to complete this code point (num_left). |
| 118 | // |
| 119 | // This function assumes that the input (str) is valid at the given position |
| 120 | // (index). In order words, at least one character could be read successfully. |
| 121 | // |
| 122 | // The code point read (partial or complete) is stored in (cp). Upon return, |
| 123 | // (num_left) stores the number of characters that has yet to be read in |
| 124 | // order to complete the current unicode code point. If the read is complete, |
| 125 | // then (num_left) is 0. Also, (num_read) is the number of characters read. |
| 126 | // |
| 127 | // Returns false if we encounter an invalid UTF-8 string. Returns true |
| 128 | // otherwise, including the case when we reach the end of the input (str) |
| 129 | // before a complete unicode code point is read. |
| 130 | bool ReadCodePoint(StringPiece str, int index, uint32_t* cp, |
| 131 | int* num_left, int* num_read) { |
| 132 | if (*num_left == 0) { |
| 133 | // Last read was complete. Start reading a new unicode code point. |
| 134 | *cp = static_cast<uint8_t>(str[index++]); |
| 135 | *num_read = 1; |
| 136 | // The length of the code point is determined from reading the first byte. |
| 137 | // |
| 138 | // If the first byte is between: |
| 139 | // 0..0x7f: that's the value of the code point. |
| 140 | // 0x80..0xbf: <invalid> |
| 141 | // 0xc0..0xdf: 11-bit code point encoded in 2 bytes. |
| 142 | // bit 10-6, bit 5-0 |
| 143 | // 0xe0..0xef: 16-bit code point encoded in 3 bytes. |
| 144 | // bit 15-12, bit 11-6, bit 5-0 |
| 145 | // 0xf0..0xf7: 21-bit code point encoded in 4 bytes. |
| 146 | // bit 20-18, bit 17-12, bit 11-6, bit 5-0 |
| 147 | // 0xf8..0xff: <invalid> |
| 148 | // |
| 149 | // Meaning of each bit: |
| 150 | // <msb> bit 7: 0 - single byte code point: bits 6-0 are values. |
| 151 | // 1 - multibyte code point |
| 152 | // bit 6: 0 - subsequent bytes of multibyte code point: |
| 153 | // bits 5-0 are values. |
| 154 | // 1 - first byte of multibyte code point |
| 155 | // bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values. |
| 156 | // 1 - first byte of code point with >= 3 bytes. |
| 157 | // bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values. |
| 158 | // 1 - first byte of code point with >= 4 bytes. |
| 159 | // bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values. |
| 160 | // 1 - reserved for future expansion. |
| 161 | if (*cp <= 0x7f) { |
| 162 | return true; |
| 163 | } else if (*cp <= 0xbf) { |
| 164 | return false; |
| 165 | } else if (*cp <= 0xdf) { |
| 166 | *cp &= 0x1f; |
| 167 | *num_left = 1; |
| 168 | } else if (*cp <= 0xef) { |
| 169 | *cp &= 0x0f; |
| 170 | *num_left = 2; |
| 171 | } else if (*cp <= 0xf7) { |
| 172 | *cp &= 0x07; |
| 173 | *num_left = 3; |
| 174 | } else { |
| 175 | return false; |
| 176 | } |
| 177 | } else { |
| 178 | // Last read was partial. Initialize num_read to 0 and continue reading |
| 179 | // the last unicode code point. |
| 180 | *num_read = 0; |
| 181 | } |
| 182 | while (*num_left > 0 && index < str.size()) { |
| 183 | uint32_t ch = static_cast<uint8_t>(str[index++]); |
| 184 | --(*num_left); |
| 185 | ++(*num_read); |
| 186 | *cp = (*cp << 6) | (ch & 0x3f); |
| 187 | if (ch < 0x80 || ch > 0xbf) return false; |
| 188 | } |
| 189 | return *num_left > 0 || (!IsSurrogate(c: *cp) && IsValidCodePoint(cp: *cp)); |
| 190 | } |
| 191 | |
| 192 | // Stores the 16-bit unicode code point as its hexadecimal digits in buffer |
| 193 | // and returns a StringPiece that points to this buffer. The input buffer needs |
| 194 | // to be at least 6 bytes long. |
| 195 | StringPiece ToHex(uint16_t cp, char* buffer) { |
| 196 | buffer[5] = kHex[cp & 0x0f]; |
| 197 | cp >>= 4; |
| 198 | buffer[4] = kHex[cp & 0x0f]; |
| 199 | cp >>= 4; |
| 200 | buffer[3] = kHex[cp & 0x0f]; |
| 201 | cp >>= 4; |
| 202 | buffer[2] = kHex[cp & 0x0f]; |
| 203 | return StringPiece(buffer, 6); |
| 204 | } |
| 205 | |
| 206 | // Stores the 32-bit unicode code point as its hexadecimal digits in buffer |
| 207 | // and returns a StringPiece that points to this buffer. The input buffer needs |
| 208 | // to be at least 12 bytes long. |
| 209 | StringPiece ToSurrogateHex(uint32_t cp, char* buffer) { |
| 210 | uint16_t low = ToLowSurrogate(cp); |
| 211 | uint16_t high = ToHighSurrogate(cp); |
| 212 | |
| 213 | buffer[11] = kHex[low & 0x0f]; |
| 214 | low >>= 4; |
| 215 | buffer[10] = kHex[low & 0x0f]; |
| 216 | low >>= 4; |
| 217 | buffer[9] = kHex[low & 0x0f]; |
| 218 | low >>= 4; |
| 219 | buffer[8] = kHex[low & 0x0f]; |
| 220 | |
| 221 | buffer[5] = kHex[high & 0x0f]; |
| 222 | high >>= 4; |
| 223 | buffer[4] = kHex[high & 0x0f]; |
| 224 | high >>= 4; |
| 225 | buffer[3] = kHex[high & 0x0f]; |
| 226 | high >>= 4; |
| 227 | buffer[2] = kHex[high & 0x0f]; |
| 228 | |
| 229 | return StringPiece(buffer, 12); |
| 230 | } |
| 231 | |
| 232 | // If the given unicode code point needs escaping, then returns the |
| 233 | // escaped form. The returned StringPiece either points to statically |
| 234 | // pre-allocated char[] or to the given buffer. The input buffer needs |
| 235 | // to be at least 12 bytes long. |
| 236 | // |
| 237 | // If the given unicode code point does not need escaping, an empty |
| 238 | // StringPiece is returned. |
| 239 | StringPiece EscapeCodePoint(uint32_t cp, char* buffer) { |
| 240 | if (cp < 0xa0) return kCommonEscapes[cp]; |
| 241 | switch (cp) { |
| 242 | // These are not required by json spec |
| 243 | // but used to prevent security bugs in javascript. |
| 244 | case 0xfeff: // Zero width no-break space |
| 245 | case 0xfff9: // Interlinear annotation anchor |
| 246 | case 0xfffa: // Interlinear annotation separator |
| 247 | case 0xfffb: // Interlinear annotation terminator |
| 248 | |
| 249 | case 0x00ad: // Soft-hyphen |
| 250 | case 0x06dd: // Arabic end of ayah |
| 251 | case 0x070f: // Syriac abbreviation mark |
| 252 | case 0x17b4: // Khmer vowel inherent Aq |
| 253 | case 0x17b5: // Khmer vowel inherent Aa |
| 254 | return ToHex(cp, buffer); |
| 255 | |
| 256 | default: |
| 257 | if ((cp >= 0x0600 && cp <= 0x0603) || // Arabic signs |
| 258 | (cp >= 0x200b && cp <= 0x200f) || // Zero width etc. |
| 259 | (cp >= 0x2028 && cp <= 0x202e) || // Separators etc. |
| 260 | (cp >= 0x2060 && cp <= 0x2064) || // Invisible etc. |
| 261 | (cp >= 0x206a && cp <= 0x206f)) { // Shaping etc. |
| 262 | return ToHex(cp, buffer); |
| 263 | } |
| 264 | |
| 265 | if (cp == 0x000e0001 || // Language tag |
| 266 | (cp >= 0x0001d173 && cp <= 0x0001d17a) || // Music formatting |
| 267 | (cp >= 0x000e0020 && cp <= 0x000e007f)) { // TAG symbols |
| 268 | return ToSurrogateHex(cp, buffer); |
| 269 | } |
| 270 | } |
| 271 | return StringPiece(); |
| 272 | } |
| 273 | |
| 274 | // Tries to escape the given code point first. If the given code point |
| 275 | // does not need to be escaped, but force_output is true, then render |
| 276 | // the given multi-byte code point in UTF8 in the buffer and returns it. |
| 277 | StringPiece EscapeCodePoint(uint32_t cp, char* buffer, |
| 278 | bool force_output) { |
| 279 | StringPiece sp = EscapeCodePoint(cp, buffer); |
| 280 | if (force_output && sp.empty()) { |
| 281 | buffer[5] = (cp & 0x3f) | 0x80; |
| 282 | cp >>= 6; |
| 283 | if (cp <= 0x1f) { |
| 284 | buffer[4] = cp | 0xc0; |
| 285 | sp = StringPiece(buffer + 4, 2); |
| 286 | return sp; |
| 287 | } |
| 288 | buffer[4] = (cp & 0x3f) | 0x80; |
| 289 | cp >>= 6; |
| 290 | if (cp <= 0x0f) { |
| 291 | buffer[3] = cp | 0xe0; |
| 292 | sp = StringPiece(buffer + 3, 3); |
| 293 | return sp; |
| 294 | } |
| 295 | buffer[3] = (cp & 0x3f) | 0x80; |
| 296 | buffer[2] = ((cp >> 6) & 0x07) | 0xf0; |
| 297 | sp = StringPiece(buffer + 2, 4); |
| 298 | } |
| 299 | return sp; |
| 300 | } |
| 301 | |
| 302 | } // namespace |
| 303 | |
| 304 | void JsonEscaping::Escape(strings::ByteSource* input, |
| 305 | strings::ByteSink* output) { |
| 306 | char buffer[12] = "\\udead\\ubee" ; |
| 307 | uint32_t cp = 0; // Current unicode code point. |
| 308 | int num_left = 0; // Num of chars to read to complete the code point. |
| 309 | while (input->Available() > 0) { |
| 310 | StringPiece str = input->Peek(); |
| 311 | StringPiece escaped; |
| 312 | int i = 0; |
| 313 | int num_read; |
| 314 | bool ok; |
| 315 | bool cp_was_split = num_left > 0; |
| 316 | // Loop until we encounter either |
| 317 | // i) a code point that needs to be escaped; or |
| 318 | // ii) a split code point is completely read; or |
| 319 | // iii) a character that is not a valid utf8; or |
| 320 | // iv) end of the StringPiece str is reached. |
| 321 | do { |
| 322 | ok = ReadCodePoint(str, index: i, cp: &cp, num_left: &num_left, num_read: &num_read); |
| 323 | if (num_left > 0 || !ok) break; // case iii or iv |
| 324 | escaped = EscapeCodePoint(cp, buffer, force_output: cp_was_split); |
| 325 | if (!escaped.empty()) break; // case i or ii |
| 326 | i += num_read; |
| 327 | num_read = 0; |
| 328 | } while (i < str.length()); // case iv |
| 329 | // First copy the un-escaped prefix, if any, to the output ByteSink. |
| 330 | if (i > 0) input->CopyTo(sink: output, n: i); |
| 331 | if (num_read > 0) input->Skip(n: num_read); |
| 332 | if (!ok) { |
| 333 | // Case iii: Report error. |
| 334 | // TODO(wpoon): Add error reporting. |
| 335 | num_left = 0; |
| 336 | } else if (num_left == 0 && !escaped.empty()) { |
| 337 | // Case i or ii: Append the escaped code point to the output ByteSink. |
| 338 | output->Append(bytes: escaped.data(), n: escaped.size()); |
| 339 | } |
| 340 | } |
| 341 | if (num_left > 0) { |
| 342 | // Treat as case iii: report error. |
| 343 | // TODO(wpoon): Add error reporting. |
| 344 | } |
| 345 | } |
| 346 | |
| 347 | void JsonEscaping::Escape(StringPiece input, strings::ByteSink* output) { |
| 348 | const size_t len = input.length(); |
| 349 | const char* p = input.data(); |
| 350 | |
| 351 | bool can_skip_escaping = true; |
| 352 | for (int i = 0; i < len; i++) { |
| 353 | char c = p[i]; |
| 354 | if (c < 0x20 || c >= 0x7F || c == '"' || c == '<' || c == '>' || |
| 355 | c == '\\') { |
| 356 | can_skip_escaping = false; |
| 357 | break; |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | if (can_skip_escaping) { |
| 362 | output->Append(bytes: input.data(), n: input.length()); |
| 363 | } else { |
| 364 | strings::ArrayByteSource source(input); |
| 365 | Escape(input: &source, output); |
| 366 | } |
| 367 | } |
| 368 | |
| 369 | } // namespace converter |
| 370 | } // namespace util |
| 371 | } // namespace protobuf |
| 372 | } // namespace google |
| 373 | |