json_escaping.cc source code [Velox/build/_deps/protobuf-src/src/google/protobuf/util/internal/json_escaping.cc]

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31	#include <google/protobuf/util/internal/json_escaping.h>
32
33	#include <cstdint>
34
35	#include <google/protobuf/stubs/logging.h>
36	#include <google/protobuf/stubs/common.h>
37
38	namespace google {
39	namespace protobuf {
40	namespace util {
41	namespace converter {
42
43	namespace {
44
// Lowercase hexadecimal digits indexed by nibble value (0..15). Used to
// render the hex digits of a "\uXXXX" escape sequence.
static const char kHex[] = "0123456789abcdef";
47
// Characters 0x00 to 0x9f are very commonly used, so we provide a special
// table lookup.
//
// For unicode code point ch < 0xa0:
//   kCommonEscapes[ch] is the escaped string of ch, if escaping is needed;
//                      or an empty string, if escaping is not needed.
//
// The longest entry is a 6-character "\uXXXX" sequence, so each row is 7
// bytes wide to leave room for the terminating NUL. Rows below are laid out
// eight entries per line; the trailing comment gives the first code point of
// that line.
static const char kCommonEscapes[160][7] = {
    // C0 (ASCII and derivatives) control characters
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x00
    "\\b", "\\t", "\\n", "\\u000b",
    "\\f", "\\r", "\\u000e", "\\u000f",  // 0x08
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x10
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x18
    // Escaping of " and \ are required by www.json.org string definition.
    // Escaping of < and > are required for HTML security.
    "", "", "\\\"", "", "", "", "", "",          // 0x20
    "", "", "", "", "", "", "", "",              // 0x28
    "", "", "", "", "", "", "", "",              // 0x30
    "", "", "", "", "\\u003c", "", "\\u003e", "",  // 0x38
    "", "", "", "", "", "", "", "",              // 0x40
    "", "", "", "", "", "", "", "",              // 0x48
    "", "", "", "", "", "", "", "",              // 0x50
    "", "", "", "", "\\\\", "", "", "",          // 0x58
    "", "", "", "", "", "", "", "",              // 0x60
    "", "", "", "", "", "", "", "",              // 0x68
    "", "", "", "", "", "", "", "",              // 0x70
    "", "", "", "", "", "", "", "\\u007f",       // 0x78
    // C1 (ISO 8859 and Unicode) extended control characters
    "\\u0080", "\\u0081", "\\u0082", "\\u0083",
    "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x80
    "\\u0088", "\\u0089", "\\u008a", "\\u008b",
    "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x88
    "\\u0090", "\\u0091", "\\u0092", "\\u0093",
    "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x90
    "\\u0098", "\\u0099", "\\u009a", "\\u009b",
    "\\u009c", "\\u009d", "\\u009e", "\\u009f"};  // 0x98
79
80	// Determines if the given char value is a unicode surrogate code unit (either
81	// high-surrogate or low-surrogate).
82	inline bool IsSurrogate(uint32_t c) {
83	// Optimized form of:
84	// return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
85	// (Reduced from 3 ALU instructions to 2 ALU instructions)
86	return (c & `0xfffff800`) == JsonEscaping::kMinHighSurrogate;
87	}
88
89	// Returns true if the given unicode code point cp is a valid
90	// unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
91	inline bool IsValidCodePoint(uint32_t cp) {
92	return cp <= JsonEscaping::kMaxCodePoint;
93	}
94
95	// Returns the low surrogate for the given unicode code point. The result is
96	// meaningless if the given code point is not a supplementary character.
97	inline uint16_t ToLowSurrogate(uint32_t cp) {
98	return (cp &
99	(JsonEscaping::kMaxLowSurrogate - JsonEscaping::kMinLowSurrogate)) +
100	JsonEscaping::kMinLowSurrogate;
101	}
102
103	// Returns the high surrogate for the given unicode code point. The result is
104	// meaningless if the given code point is not a supplementary character.
105	inline uint16_t ToHighSurrogate(uint32_t cp) {
106	return (cp >> `10`) + (JsonEscaping::kMinHighSurrogate -
107	(JsonEscaping::kMinSupplementaryCodePoint >> `10`));
108	}
109
110	// Input str is encoded in UTF-8. A unicode code point could be encoded in
111	// UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
112	// reads of the ByteSource.
113	//
114	// This function reads the next unicode code point from the input (str) at
115	// the given position (index), taking into account any left-over partial
116	// code point from the previous iteration (cp), together with the number
117	// of characters left to read to complete this code point (num_left).
118	//
119	// This function assumes that the input (str) is valid at the given position
120	// (index). In order words, at least one character could be read successfully.
121	//
122	// The code point read (partial or complete) is stored in (cp). Upon return,
123	// (num_left) stores the number of characters that has yet to be read in
124	// order to complete the current unicode code point. If the read is complete,
125	// then (num_left) is 0. Also, (num_read) is the number of characters read.
126	//
127	// Returns false if we encounter an invalid UTF-8 string. Returns true
128	// otherwise, including the case when we reach the end of the input (str)
129	// before a complete unicode code point is read.
130	bool ReadCodePoint(StringPiece str, int index, uint32_t* cp,
131	int* num_left, int* num_read) {
132	if (*num_left == `0`) {
133	// Last read was complete. Start reading a new unicode code point.
134	cp = static_cast*<uint8_t>(str [index++]);
135	*num_read = `1`;
136	// The length of the code point is determined from reading the first byte.
137	//
138	// If the first byte is between:
139	// 0..0x7f: that's the value of the code point.
140	// 0x80..0xbf: <invalid>
141	// 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
142	// bit 10-6, bit 5-0
143	// 0xe0..0xef: 16-bit code point encoded in 3 bytes.
144	// bit 15-12, bit 11-6, bit 5-0
145	// 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
146	// bit 20-18, bit 17-12, bit 11-6, bit 5-0
147	// 0xf8..0xff: <invalid>
148	//
149	// Meaning of each bit:
150	// <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
151	// 1 - multibyte code point
152	// bit 6: 0 - subsequent bytes of multibyte code point:
153	// bits 5-0 are values.
154	// 1 - first byte of multibyte code point
155	// bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
156	// 1 - first byte of code point with >= 3 bytes.
157	// bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
158	// 1 - first byte of code point with >= 4 bytes.
159	// bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
160	// 1 - reserved for future expansion.
161	if (*cp <= `0x7f`) {
162	return true;
163	} else if (*cp <= `0xbf`) {
164	return false;
165	} else if (*cp <= `0xdf`) {
166	*cp &= `0x1f`;
167	*num_left = `1`;
168	} else if (*cp <= `0xef`) {
169	*cp &= `0x0f`;
170	*num_left = `2`;
171	} else if (*cp <= `0xf7`) {
172	*cp &= `0x07`;
173	*num_left = `3`;
174	} else {
175	return false;
176	}
177	} else {
178	// Last read was partial. Initialize num_read to 0 and continue reading
179	// the last unicode code point.
180	*num_read = `0`;
181	}
182	while (*num_left > `0` && index < str.size()) {
183	uint32_t ch = static_cast<uint8_t>(str [index++]);
184	--(*num_left);
185	++(*num_read);
186	cp = (cp << `6`) \| (ch & `0x3f`);
187	if (ch < `0x80` \|\| ch > `0xbf`) return false;
188	}
189	return num_left > `0` \|\| (!IsSurrogate(c: cp) && IsValidCodePoint(cp: *cp));
190	}
191
192	// Stores the 16-bit unicode code point as its hexadecimal digits in buffer
193	// and returns a StringPiece that points to this buffer. The input buffer needs
194	// to be at least 6 bytes long.
195	StringPiece ToHex(uint16_t cp, char* buffer) {
196	buffer[`5`] = kHex[cp & `0x0f`];
197	cp >>= `4`;
198	buffer[`4`] = kHex[cp & `0x0f`];
199	cp >>= `4`;
200	buffer[`3`] = kHex[cp & `0x0f`];
201	cp >>= `4`;
202	buffer[`2`] = kHex[cp & `0x0f`];
203	return StringPiece(buffer, `6`);
204	}
205
206	// Stores the 32-bit unicode code point as its hexadecimal digits in buffer
207	// and returns a StringPiece that points to this buffer. The input buffer needs
208	// to be at least 12 bytes long.
209	StringPiece ToSurrogateHex(uint32_t cp, char* buffer) {
210	uint16_t low = ToLowSurrogate(cp);
211	uint16_t high = ToHighSurrogate(cp);
212
213	buffer[`11`] = kHex[low & `0x0f`];
214	low >>= `4`;
215	buffer[`10`] = kHex[low & `0x0f`];
216	low >>= `4`;
217	buffer[`9`] = kHex[low & `0x0f`];
218	low >>= `4`;
219	buffer[`8`] = kHex[low & `0x0f`];
220
221	buffer[`5`] = kHex[high & `0x0f`];
222	high >>= `4`;
223	buffer[`4`] = kHex[high & `0x0f`];
224	high >>= `4`;
225	buffer[`3`] = kHex[high & `0x0f`];
226	high >>= `4`;
227	buffer[`2`] = kHex[high & `0x0f`];
228
229	return StringPiece(buffer, `12`);
230	}
231
232	// If the given unicode code point needs escaping, then returns the
233	// escaped form. The returned StringPiece either points to statically
234	// pre-allocated char[] or to the given buffer. The input buffer needs
235	// to be at least 12 bytes long.
236	//
237	// If the given unicode code point does not need escaping, an empty
238	// StringPiece is returned.
239	StringPiece EscapeCodePoint(uint32_t cp, char* buffer) {
240	if (cp < `0xa0`) return kCommonEscapes[cp];
241	switch (cp) {
242	// These are not required by json spec
243	// but used to prevent security bugs in javascript.
244	case `0xfeff`: // Zero width no-break space
245	case `0xfff9`: // Interlinear annotation anchor
246	case `0xfffa`: // Interlinear annotation separator
247	case `0xfffb`: // Interlinear annotation terminator
248
249	case `0x00ad`: // Soft-hyphen
250	case `0x06dd`: // Arabic end of ayah
251	case `0x070f`: // Syriac abbreviation mark
252	case `0x17b4`: // Khmer vowel inherent Aq
253	case `0x17b5`: // Khmer vowel inherent Aa
254	return ToHex(cp, buffer);
255
256	default:
257	if ((cp >= `0x0600` && cp <= `0x0603`) \|\| // Arabic signs
258	(cp >= `0x200b` && cp <= `0x200f`) \|\| // Zero width etc.
259	(cp >= `0x2028` && cp <= `0x202e`) \|\| // Separators etc.
260	(cp >= `0x2060` && cp <= `0x2064`) \|\| // Invisible etc.
261	(cp >= `0x206a` && cp <= `0x206f`)) { // Shaping etc.
262	return ToHex(cp, buffer);
263	}
264
265	if (cp == `0x000e0001` \|\| // Language tag
266	(cp >= `0x0001d173` && cp <= `0x0001d17a`) \|\| // Music formatting
267	(cp >= `0x000e0020` && cp <= `0x000e007f`)) { // TAG symbols
268	return ToSurrogateHex(cp, buffer);
269	}
270	}
271	return StringPiece();
272	}
273
274	// Tries to escape the given code point first. If the given code point
275	// does not need to be escaped, but force_output is true, then render
276	// the given multi-byte code point in UTF8 in the buffer and returns it.
277	StringPiece EscapeCodePoint(uint32_t cp, char* buffer,
278	bool force_output) {
279	StringPiece sp = EscapeCodePoint(cp, buffer);
280	if (force_output && sp.empty()) {
281	buffer[`5`] = (cp & `0x3f`) \| `0x80`;
282	cp >>= `6`;
283	if (cp <= `0x1f`) {
284	buffer[`4`] = cp \| `0xc0`;
285	sp = StringPiece(buffer + `4`, `2`);
286	return sp;
287	}
288	buffer[`4`] = (cp & `0x3f`) \| `0x80`;
289	cp >>= `6`;
290	if (cp <= `0x0f`) {
291	buffer[`3`] = cp \| `0xe0`;
292	sp = StringPiece(buffer + `3`, `3`);
293	return sp;
294	}
295	buffer[`3`] = (cp & `0x3f`) \| `0x80`;
296	buffer[`2`] = ((cp >> `6`) & `0x07`) \| `0xf0`;
297	sp = StringPiece(buffer + `2`, `4`);
298	}
299	return sp;
300	}
301
302	} // namespace
303
304	void JsonEscaping::Escape(strings::ByteSource* input,
305	strings::ByteSink* output) {
306	char buffer[`12`] = "\\udead\\ubee";
307	uint32_t cp = `0`; // Current unicode code point.
308	int num_left = `0`; // Num of chars to read to complete the code point.
309	while (input->Available() > `0`) {
310	StringPiece str = input->Peek();
311	StringPiece escaped;
312	int i = `0`;
313	int num_read;
314	bool ok;
315	bool cp_was_split = num_left > `0`;
316	// Loop until we encounter either
317	// i) a code point that needs to be escaped; or
318	// ii) a split code point is completely read; or
319	// iii) a character that is not a valid utf8; or
320	// iv) end of the StringPiece str is reached.
321	do {
322	ok = ReadCodePoint(str, index: i, cp: &cp, num_left: &num_left, num_read: &num_read);
323	if (num_left > `0` \|\| !ok) break; // case iii or iv
324	escaped = EscapeCodePoint(cp, buffer, force_output: cp_was_split);
325	if (!escaped.empty()) break; // case i or ii
326	i += num_read;
327	num_read = `0`;
328	} while (i < str.length()); // case iv
329	// First copy the un-escaped prefix, if any, to the output ByteSink.
330	if (i > `0`) input->CopyTo(sink: output, n: i);
331	if (num_read > `0`) input->Skip(n: num_read);
332	if (!ok) {
333	// Case iii: Report error.
334	// TODO(wpoon): Add error reporting.
335	num_left = `0`;
336	} else if (num_left == `0` && !escaped.empty()) {
337	// Case i or ii: Append the escaped code point to the output ByteSink.
338	output->Append(bytes: escaped.data(), n: escaped.size());
339	}
340	}
341	if (num_left > `0`) {
342	// Treat as case iii: report error.
343	// TODO(wpoon): Add error reporting.
344	}
345	}
346
347	void JsonEscaping::Escape(StringPiece input, strings::ByteSink* output) {
348	const size_t len = input.length();
349	const char* p = input.data();
350
351	bool can_skip_escaping = true;
352	for (int i = `0`; i < len; i++) {
353	char c = p[i];
354	if (c < `0x20` \|\| c >= `0x7F` \|\| c == `'"'` \|\| c == `'<'` \|\| c == `'>'` \|\|
355	c == `'\\'`) {
356	can_skip_escaping = false;
357	break;
358	}
359	}
360
361	if (can_skip_escaping) {
362	output->Append(bytes: input.data(), n: input.length());
363	} else {
364	strings::ArrayByteSource source(input);
365	Escape(input: &source, output);
366	}
367	}
368
369	} // namespace converter
370	} // namespace util
371	} // namespace protobuf
372	} // namespace google
373

Browse the source code of Velox/build/_deps/protobuf-src/src/google/protobuf/util/internal/json_escaping.cc