1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc. All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9// * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11// * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15// * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31#include <google/protobuf/util/internal/json_escaping.h>
32
33#include <cstdint>
34
35#include <google/protobuf/stubs/logging.h>
36#include <google/protobuf/stubs/common.h>
37
38namespace google {
39namespace protobuf {
40namespace util {
41namespace converter {
42
43namespace {
44
// Lowercase hexadecimal digits, indexed by nibble value (0-15). Used when
// rendering the hex digits of a "\uXXXX" escape sequence.
constexpr char kHex[] = "0123456789abcdef";
47
// Lookup table of JSON escapes for the most frequently seen code points,
// i.e. every unicode code point ch < 0xa0.
//
// kCommonEscapes[ch] holds the NUL-terminated escape sequence for ch when ch
// must be escaped, and the empty string when ch may be emitted verbatim. Each
// slot is 7 bytes: the longest entry is a 6-character "\uXXXX" sequence plus
// the terminating NUL.
//
// What gets escaped:
//   * C0 control characters (0x00-0x1f), using the short forms \b \t \n \f \r
//     where JSON defines them;
//   * '"' and '\', as required by the www.json.org string definition;
//   * '<' and '>', for HTML security;
//   * DEL (0x7f) and the C1 control characters (0x80-0x9f).
constexpr char kCommonEscapes[160][7] = {
    // C0 (ASCII and derivatives) control characters.
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",  // 0x00-0x03
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x04-0x07
    "\\b",     "\\t",     "\\n",     "\\u000b",  // 0x08-0x0b
    "\\f",     "\\r",     "\\u000e", "\\u000f",  // 0x0c-0x0f
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",  // 0x10-0x13
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x14-0x17
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",  // 0x18-0x1b
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x1c-0x1f

    // Printable ASCII; only a handful of characters need escaping.
    "", "", "\\\"", "", "", "", "", "",           // 0x20-0x27 (0x22 is '"')
    "", "", "", "", "", "", "", "",               // 0x28-0x2f
    "", "", "", "", "", "", "", "",               // 0x30-0x37
    "", "", "", "", "\\u003c", "", "\\u003e", "", // 0x38-0x3f ('<' and '>')
    "", "", "", "", "", "", "", "",               // 0x40-0x47
    "", "", "", "", "", "", "", "",               // 0x48-0x4f
    "", "", "", "", "", "", "", "",               // 0x50-0x57
    "", "", "", "", "\\\\", "", "", "",           // 0x58-0x5f (0x5c is backslash)
    "", "", "", "", "", "", "", "",               // 0x60-0x67
    "", "", "", "", "", "", "", "",               // 0x68-0x6f
    "", "", "", "", "", "", "", "",               // 0x70-0x77
    "", "", "", "", "", "", "", "\\u007f",        // 0x78-0x7f (0x7f is DEL)

    // C1 (ISO 8859 and Unicode) extended control characters.
    "\\u0080", "\\u0081", "\\u0082", "\\u0083",  // 0x80-0x83
    "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x84-0x87
    "\\u0088", "\\u0089", "\\u008a", "\\u008b",  // 0x88-0x8b
    "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x8c-0x8f
    "\\u0090", "\\u0091", "\\u0092", "\\u0093",  // 0x90-0x93
    "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x94-0x97
    "\\u0098", "\\u0099", "\\u009a", "\\u009b",  // 0x98-0x9b
    "\\u009c", "\\u009d", "\\u009e", "\\u009f",  // 0x9c-0x9f
};
79
80// Determines if the given char value is a unicode surrogate code unit (either
81// high-surrogate or low-surrogate).
82inline bool IsSurrogate(uint32_t c) {
83 // Optimized form of:
84 // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
85 // (Reduced from 3 ALU instructions to 2 ALU instructions)
86 return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate;
87}
88
89// Returns true if the given unicode code point cp is a valid
90// unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
91inline bool IsValidCodePoint(uint32_t cp) {
92 return cp <= JsonEscaping::kMaxCodePoint;
93}
94
95// Returns the low surrogate for the given unicode code point. The result is
96// meaningless if the given code point is not a supplementary character.
97inline uint16_t ToLowSurrogate(uint32_t cp) {
98 return (cp &
99 (JsonEscaping::kMaxLowSurrogate - JsonEscaping::kMinLowSurrogate)) +
100 JsonEscaping::kMinLowSurrogate;
101}
102
103// Returns the high surrogate for the given unicode code point. The result is
104// meaningless if the given code point is not a supplementary character.
105inline uint16_t ToHighSurrogate(uint32_t cp) {
106 return (cp >> 10) + (JsonEscaping::kMinHighSurrogate -
107 (JsonEscaping::kMinSupplementaryCodePoint >> 10));
108}
109
110// Input str is encoded in UTF-8. A unicode code point could be encoded in
111// UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
112// reads of the ByteSource.
113//
114// This function reads the next unicode code point from the input (str) at
115// the given position (index), taking into account any left-over partial
116// code point from the previous iteration (cp), together with the number
117// of characters left to read to complete this code point (num_left).
118//
119// This function assumes that the input (str) is valid at the given position
120// (index). In order words, at least one character could be read successfully.
121//
122// The code point read (partial or complete) is stored in (cp). Upon return,
123// (num_left) stores the number of characters that has yet to be read in
124// order to complete the current unicode code point. If the read is complete,
125// then (num_left) is 0. Also, (num_read) is the number of characters read.
126//
127// Returns false if we encounter an invalid UTF-8 string. Returns true
128// otherwise, including the case when we reach the end of the input (str)
129// before a complete unicode code point is read.
130bool ReadCodePoint(StringPiece str, int index, uint32_t* cp,
131 int* num_left, int* num_read) {
132 if (*num_left == 0) {
133 // Last read was complete. Start reading a new unicode code point.
134 *cp = static_cast<uint8_t>(str[index++]);
135 *num_read = 1;
136 // The length of the code point is determined from reading the first byte.
137 //
138 // If the first byte is between:
139 // 0..0x7f: that's the value of the code point.
140 // 0x80..0xbf: <invalid>
141 // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
142 // bit 10-6, bit 5-0
143 // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
144 // bit 15-12, bit 11-6, bit 5-0
145 // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
146 // bit 20-18, bit 17-12, bit 11-6, bit 5-0
147 // 0xf8..0xff: <invalid>
148 //
149 // Meaning of each bit:
150 // <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
151 // 1 - multibyte code point
152 // bit 6: 0 - subsequent bytes of multibyte code point:
153 // bits 5-0 are values.
154 // 1 - first byte of multibyte code point
155 // bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
156 // 1 - first byte of code point with >= 3 bytes.
157 // bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
158 // 1 - first byte of code point with >= 4 bytes.
159 // bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
160 // 1 - reserved for future expansion.
161 if (*cp <= 0x7f) {
162 return true;
163 } else if (*cp <= 0xbf) {
164 return false;
165 } else if (*cp <= 0xdf) {
166 *cp &= 0x1f;
167 *num_left = 1;
168 } else if (*cp <= 0xef) {
169 *cp &= 0x0f;
170 *num_left = 2;
171 } else if (*cp <= 0xf7) {
172 *cp &= 0x07;
173 *num_left = 3;
174 } else {
175 return false;
176 }
177 } else {
178 // Last read was partial. Initialize num_read to 0 and continue reading
179 // the last unicode code point.
180 *num_read = 0;
181 }
182 while (*num_left > 0 && index < str.size()) {
183 uint32_t ch = static_cast<uint8_t>(str[index++]);
184 --(*num_left);
185 ++(*num_read);
186 *cp = (*cp << 6) | (ch & 0x3f);
187 if (ch < 0x80 || ch > 0xbf) return false;
188 }
189 return *num_left > 0 || (!IsSurrogate(c: *cp) && IsValidCodePoint(cp: *cp));
190}
191
192// Stores the 16-bit unicode code point as its hexadecimal digits in buffer
193// and returns a StringPiece that points to this buffer. The input buffer needs
194// to be at least 6 bytes long.
195StringPiece ToHex(uint16_t cp, char* buffer) {
196 buffer[5] = kHex[cp & 0x0f];
197 cp >>= 4;
198 buffer[4] = kHex[cp & 0x0f];
199 cp >>= 4;
200 buffer[3] = kHex[cp & 0x0f];
201 cp >>= 4;
202 buffer[2] = kHex[cp & 0x0f];
203 return StringPiece(buffer, 6);
204}
205
206// Stores the 32-bit unicode code point as its hexadecimal digits in buffer
207// and returns a StringPiece that points to this buffer. The input buffer needs
208// to be at least 12 bytes long.
209StringPiece ToSurrogateHex(uint32_t cp, char* buffer) {
210 uint16_t low = ToLowSurrogate(cp);
211 uint16_t high = ToHighSurrogate(cp);
212
213 buffer[11] = kHex[low & 0x0f];
214 low >>= 4;
215 buffer[10] = kHex[low & 0x0f];
216 low >>= 4;
217 buffer[9] = kHex[low & 0x0f];
218 low >>= 4;
219 buffer[8] = kHex[low & 0x0f];
220
221 buffer[5] = kHex[high & 0x0f];
222 high >>= 4;
223 buffer[4] = kHex[high & 0x0f];
224 high >>= 4;
225 buffer[3] = kHex[high & 0x0f];
226 high >>= 4;
227 buffer[2] = kHex[high & 0x0f];
228
229 return StringPiece(buffer, 12);
230}
231
232// If the given unicode code point needs escaping, then returns the
233// escaped form. The returned StringPiece either points to statically
234// pre-allocated char[] or to the given buffer. The input buffer needs
235// to be at least 12 bytes long.
236//
237// If the given unicode code point does not need escaping, an empty
238// StringPiece is returned.
239StringPiece EscapeCodePoint(uint32_t cp, char* buffer) {
240 if (cp < 0xa0) return kCommonEscapes[cp];
241 switch (cp) {
242 // These are not required by json spec
243 // but used to prevent security bugs in javascript.
244 case 0xfeff: // Zero width no-break space
245 case 0xfff9: // Interlinear annotation anchor
246 case 0xfffa: // Interlinear annotation separator
247 case 0xfffb: // Interlinear annotation terminator
248
249 case 0x00ad: // Soft-hyphen
250 case 0x06dd: // Arabic end of ayah
251 case 0x070f: // Syriac abbreviation mark
252 case 0x17b4: // Khmer vowel inherent Aq
253 case 0x17b5: // Khmer vowel inherent Aa
254 return ToHex(cp, buffer);
255
256 default:
257 if ((cp >= 0x0600 && cp <= 0x0603) || // Arabic signs
258 (cp >= 0x200b && cp <= 0x200f) || // Zero width etc.
259 (cp >= 0x2028 && cp <= 0x202e) || // Separators etc.
260 (cp >= 0x2060 && cp <= 0x2064) || // Invisible etc.
261 (cp >= 0x206a && cp <= 0x206f)) { // Shaping etc.
262 return ToHex(cp, buffer);
263 }
264
265 if (cp == 0x000e0001 || // Language tag
266 (cp >= 0x0001d173 && cp <= 0x0001d17a) || // Music formatting
267 (cp >= 0x000e0020 && cp <= 0x000e007f)) { // TAG symbols
268 return ToSurrogateHex(cp, buffer);
269 }
270 }
271 return StringPiece();
272}
273
274// Tries to escape the given code point first. If the given code point
275// does not need to be escaped, but force_output is true, then render
276// the given multi-byte code point in UTF8 in the buffer and returns it.
277StringPiece EscapeCodePoint(uint32_t cp, char* buffer,
278 bool force_output) {
279 StringPiece sp = EscapeCodePoint(cp, buffer);
280 if (force_output && sp.empty()) {
281 buffer[5] = (cp & 0x3f) | 0x80;
282 cp >>= 6;
283 if (cp <= 0x1f) {
284 buffer[4] = cp | 0xc0;
285 sp = StringPiece(buffer + 4, 2);
286 return sp;
287 }
288 buffer[4] = (cp & 0x3f) | 0x80;
289 cp >>= 6;
290 if (cp <= 0x0f) {
291 buffer[3] = cp | 0xe0;
292 sp = StringPiece(buffer + 3, 3);
293 return sp;
294 }
295 buffer[3] = (cp & 0x3f) | 0x80;
296 buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
297 sp = StringPiece(buffer + 2, 4);
298 }
299 return sp;
300}
301
302} // namespace
303
304void JsonEscaping::Escape(strings::ByteSource* input,
305 strings::ByteSink* output) {
306 char buffer[12] = "\\udead\\ubee";
307 uint32_t cp = 0; // Current unicode code point.
308 int num_left = 0; // Num of chars to read to complete the code point.
309 while (input->Available() > 0) {
310 StringPiece str = input->Peek();
311 StringPiece escaped;
312 int i = 0;
313 int num_read;
314 bool ok;
315 bool cp_was_split = num_left > 0;
316 // Loop until we encounter either
317 // i) a code point that needs to be escaped; or
318 // ii) a split code point is completely read; or
319 // iii) a character that is not a valid utf8; or
320 // iv) end of the StringPiece str is reached.
321 do {
322 ok = ReadCodePoint(str, index: i, cp: &cp, num_left: &num_left, num_read: &num_read);
323 if (num_left > 0 || !ok) break; // case iii or iv
324 escaped = EscapeCodePoint(cp, buffer, force_output: cp_was_split);
325 if (!escaped.empty()) break; // case i or ii
326 i += num_read;
327 num_read = 0;
328 } while (i < str.length()); // case iv
329 // First copy the un-escaped prefix, if any, to the output ByteSink.
330 if (i > 0) input->CopyTo(sink: output, n: i);
331 if (num_read > 0) input->Skip(n: num_read);
332 if (!ok) {
333 // Case iii: Report error.
334 // TODO(wpoon): Add error reporting.
335 num_left = 0;
336 } else if (num_left == 0 && !escaped.empty()) {
337 // Case i or ii: Append the escaped code point to the output ByteSink.
338 output->Append(bytes: escaped.data(), n: escaped.size());
339 }
340 }
341 if (num_left > 0) {
342 // Treat as case iii: report error.
343 // TODO(wpoon): Add error reporting.
344 }
345}
346
347void JsonEscaping::Escape(StringPiece input, strings::ByteSink* output) {
348 const size_t len = input.length();
349 const char* p = input.data();
350
351 bool can_skip_escaping = true;
352 for (int i = 0; i < len; i++) {
353 char c = p[i];
354 if (c < 0x20 || c >= 0x7F || c == '"' || c == '<' || c == '>' ||
355 c == '\\') {
356 can_skip_escaping = false;
357 break;
358 }
359 }
360
361 if (can_skip_escaping) {
362 output->Append(bytes: input.data(), n: input.length());
363 } else {
364 strings::ArrayByteSource source(input);
365 Escape(input: &source, output);
366 }
367}
368
369} // namespace converter
370} // namespace util
371} // namespace protobuf
372} // namespace google
373