1 | /* |
2 | * Copyright 2011-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #include <folly/json.h> |
17 | |
18 | #include <algorithm> |
19 | #include <functional> |
20 | #include <iterator> |
21 | #include <type_traits> |
22 | |
23 | #include <boost/algorithm/string.hpp> |
24 | |
25 | #include <folly/Conv.h> |
26 | #include <folly/Portability.h> |
27 | #include <folly/Range.h> |
28 | #include <folly/String.h> |
29 | #include <folly/Unicode.h> |
30 | #include <folly/lang/Bits.h> |
31 | #include <folly/portability/Constexpr.h> |
32 | |
33 | namespace folly { |
34 | |
35 | ////////////////////////////////////////////////////////////////////// |
36 | |
37 | namespace json { |
38 | namespace { |
39 | |
40 | struct Printer { |
41 | explicit Printer( |
42 | std::string& out, |
43 | unsigned* indentLevel, |
44 | serialization_opts const* opts) |
45 | : out_(out), indentLevel_(indentLevel), opts_(*opts) {} |
46 | |
47 | void operator()(dynamic const& v) const { |
48 | switch (v.type()) { |
49 | case dynamic::DOUBLE: |
50 | if (!opts_.allow_nan_inf && |
51 | (std::isnan(v.asDouble()) || std::isinf(v.asDouble()))) { |
52 | throw std::runtime_error( |
53 | "folly::toJson: JSON object value was a " |
54 | "NaN or INF" ); |
55 | } |
56 | toAppend( |
57 | v.asDouble(), &out_, opts_.double_mode, opts_.double_num_digits); |
58 | break; |
59 | case dynamic::INT64: { |
60 | auto intval = v.asInt(); |
61 | if (opts_.javascript_safe) { |
62 | // Use folly::to to check that this integer can be represented |
63 | // as a double without loss of precision. |
64 | intval = int64_t(to<double>(intval)); |
65 | } |
66 | toAppend(intval, &out_); |
67 | break; |
68 | } |
69 | case dynamic::BOOL: |
70 | out_ += v.asBool() ? "true" : "false" ; |
71 | break; |
72 | case dynamic::NULLT: |
73 | out_ += "null" ; |
74 | break; |
75 | case dynamic::STRING: |
76 | escapeString(v.asString(), out_, opts_); |
77 | break; |
78 | case dynamic::OBJECT: |
79 | printObject(v); |
80 | break; |
81 | case dynamic::ARRAY: |
82 | printArray(v); |
83 | break; |
84 | default: |
85 | CHECK(0) << "Bad type " << v.type(); |
86 | } |
87 | } |
88 | |
89 | private: |
90 | void printKV(const std::pair<const dynamic, dynamic>& p) const { |
91 | if (!opts_.allow_non_string_keys && !p.first.isString()) { |
92 | throw std::runtime_error( |
93 | "folly::toJson: JSON object key was not a " |
94 | "string" ); |
95 | } |
96 | (*this)(p.first); |
97 | mapColon(); |
98 | (*this)(p.second); |
99 | } |
100 | |
101 | template <typename Iterator> |
102 | void printKVPairs(Iterator begin, Iterator end) const { |
103 | printKV(*begin); |
104 | for (++begin; begin != end; ++begin) { |
105 | out_ += ','; |
106 | newline(); |
107 | printKV(*begin); |
108 | } |
109 | } |
110 | |
111 | void printObject(dynamic const& o) const { |
112 | if (o.empty()) { |
113 | out_ += "{}" ; |
114 | return; |
115 | } |
116 | |
117 | out_ += '{'; |
118 | indent(); |
119 | newline(); |
120 | if (opts_.sort_keys || opts_.sort_keys_by) { |
121 | using ref = std::reference_wrapper<decltype(o.items())::value_type const>; |
122 | std::vector<ref> refs(o.items().begin(), o.items().end()); |
123 | |
124 | using SortByRef = FunctionRef<bool(dynamic const&, dynamic const&)>; |
125 | auto const& sort_keys_by = opts_.sort_keys_by |
126 | ? SortByRef(opts_.sort_keys_by) |
127 | : SortByRef(std::less<dynamic>()); |
128 | std::sort(refs.begin(), refs.end(), [&](ref a, ref b) { |
129 | // Only compare keys. No ordering among identical keys. |
130 | return sort_keys_by(a.get().first, b.get().first); |
131 | }); |
132 | printKVPairs(refs.cbegin(), refs.cend()); |
133 | } else { |
134 | printKVPairs(o.items().begin(), o.items().end()); |
135 | } |
136 | outdent(); |
137 | newline(); |
138 | out_ += '}'; |
139 | } |
140 | |
141 | void printArray(dynamic const& a) const { |
142 | if (a.empty()) { |
143 | out_ += "[]" ; |
144 | return; |
145 | } |
146 | |
147 | out_ += '['; |
148 | indent(); |
149 | newline(); |
150 | (*this)(a[0]); |
151 | for (auto& val : range(std::next(a.begin()), a.end())) { |
152 | out_ += ','; |
153 | newline(); |
154 | (*this)(val); |
155 | } |
156 | outdent(); |
157 | newline(); |
158 | out_ += ']'; |
159 | } |
160 | |
161 | private: |
162 | void outdent() const { |
163 | if (indentLevel_) { |
164 | --*indentLevel_; |
165 | } |
166 | } |
167 | |
168 | void indent() const { |
169 | if (indentLevel_) { |
170 | ++*indentLevel_; |
171 | } |
172 | } |
173 | |
174 | void newline() const { |
175 | if (indentLevel_) { |
176 | out_ += to<std::string>('\n', std::string(*indentLevel_ * 2, ' ')); |
177 | } |
178 | } |
179 | |
180 | void mapColon() const { |
181 | out_ += indentLevel_ ? ": " : ":" ; |
182 | } |
183 | |
184 | private: |
185 | std::string& out_; |
186 | unsigned* const indentLevel_; |
187 | serialization_opts const& opts_; |
188 | }; |
189 | |
190 | ////////////////////////////////////////////////////////////////////// |
191 | |
192 | struct FOLLY_EXPORT ParseError : std::runtime_error { |
193 | explicit ParseError( |
194 | unsigned int line, |
195 | std::string const& context, |
196 | std::string const& expected) |
197 | : std::runtime_error(to<std::string>( |
198 | "json parse error on line " , |
199 | line, |
200 | !context.empty() ? to<std::string>(" near `" , context, '\'') : "" , |
201 | ": " , |
202 | expected)) {} |
203 | }; |
204 | |
205 | // Wraps our input buffer with some helper functions. |
206 | struct Input { |
207 | explicit Input(StringPiece range, json::serialization_opts const* opts) |
208 | : range_(range), opts_(*opts), lineNum_(0) { |
209 | storeCurrent(); |
210 | } |
211 | |
212 | Input(Input const&) = delete; |
213 | Input& operator=(Input const&) = delete; |
214 | |
215 | char const* begin() const { |
216 | return range_.begin(); |
217 | } |
218 | |
219 | // Parse ahead for as long as the supplied predicate is satisfied, |
220 | // returning a range of what was skipped. |
221 | template <class Predicate> |
222 | StringPiece skipWhile(const Predicate& p) { |
223 | std::size_t skipped = 0; |
224 | for (; skipped < range_.size(); ++skipped) { |
225 | if (!p(range_[skipped])) { |
226 | break; |
227 | } |
228 | if (range_[skipped] == '\n') { |
229 | ++lineNum_; |
230 | } |
231 | } |
232 | auto ret = range_.subpiece(0, skipped); |
233 | range_.advance(skipped); |
234 | storeCurrent(); |
235 | return ret; |
236 | } |
237 | |
238 | StringPiece skipDigits() { |
239 | return skipWhile([](char c) { return c >= '0' && c <= '9'; }); |
240 | } |
241 | |
242 | StringPiece skipMinusAndDigits() { |
243 | bool firstChar = true; |
244 | return skipWhile([&firstChar](char c) { |
245 | bool result = (c >= '0' && c <= '9') || (firstChar && c == '-'); |
246 | firstChar = false; |
247 | return result; |
248 | }); |
249 | } |
250 | |
251 | void skipWhitespace() { |
252 | unsigned index = 0; |
253 | while (true) { |
254 | while (index < range_.size() && range_[index] == ' ') { |
255 | index++; |
256 | } |
257 | if (index < range_.size()) { |
258 | if (range_[index] == '\n') { |
259 | index++; |
260 | ++lineNum_; |
261 | continue; |
262 | } |
263 | if (range_[index] == '\t' || range_[index] == '\r') { |
264 | index++; |
265 | continue; |
266 | } |
267 | } |
268 | break; |
269 | } |
270 | range_.advance(index); |
271 | storeCurrent(); |
272 | } |
273 | |
274 | void expect(char c) { |
275 | if (**this != c) { |
276 | throw ParseError( |
277 | lineNum_, context(), to<std::string>("expected '" , c, '\'')); |
278 | } |
279 | ++*this; |
280 | } |
281 | |
282 | std::size_t size() const { |
283 | return range_.size(); |
284 | } |
285 | |
286 | int operator*() const { |
287 | return current_; |
288 | } |
289 | |
290 | void operator++() { |
291 | range_.pop_front(); |
292 | storeCurrent(); |
293 | } |
294 | |
295 | template <class T> |
296 | T () { |
297 | try { |
298 | return to<T>(&range_); |
299 | } catch (std::exception const& e) { |
300 | error(e.what()); |
301 | } |
302 | } |
303 | |
304 | bool consume(StringPiece str) { |
305 | if (boost::starts_with(range_, str)) { |
306 | range_.advance(str.size()); |
307 | storeCurrent(); |
308 | return true; |
309 | } |
310 | return false; |
311 | } |
312 | |
313 | std::string context() const { |
314 | return range_.subpiece(0, 16 /* arbitrary */).toString(); |
315 | } |
316 | |
317 | dynamic error(char const* what) const { |
318 | throw ParseError(lineNum_, context(), what); |
319 | } |
320 | |
321 | json::serialization_opts const& getOpts() { |
322 | return opts_; |
323 | } |
324 | |
325 | void incrementRecursionLevel() { |
326 | if (currentRecursionLevel_ > opts_.recursion_limit) { |
327 | error("recursion limit exceeded" ); |
328 | } |
329 | currentRecursionLevel_++; |
330 | } |
331 | |
332 | void decrementRecursionLevel() { |
333 | currentRecursionLevel_--; |
334 | } |
335 | |
336 | private: |
337 | void storeCurrent() { |
338 | current_ = range_.empty() ? EOF : range_.front(); |
339 | } |
340 | |
341 | private: |
342 | StringPiece range_; |
343 | json::serialization_opts const& opts_; |
344 | unsigned lineNum_; |
345 | int current_; |
346 | unsigned int currentRecursionLevel_{0}; |
347 | }; |
348 | |
349 | class RecursionGuard { |
350 | public: |
351 | explicit RecursionGuard(Input& in) : in_(in) { |
352 | in_.incrementRecursionLevel(); |
353 | } |
354 | |
355 | ~RecursionGuard() { |
356 | in_.decrementRecursionLevel(); |
357 | } |
358 | |
359 | private: |
360 | Input& in_; |
361 | }; |
362 | |
// Forward declarations for the mutually recursive parsers: parseObject and
// parseArray below call parseValue (and parseObject calls parseString) before
// those functions are defined.
dynamic parseValue(Input& in);
std::string parseString(Input& in);
dynamic parseNumber(Input& in);
366 | |
367 | dynamic parseObject(Input& in) { |
368 | DCHECK_EQ(*in, '{'); |
369 | ++in; |
370 | |
371 | dynamic ret = dynamic::object; |
372 | |
373 | in.skipWhitespace(); |
374 | if (*in == '}') { |
375 | ++in; |
376 | return ret; |
377 | } |
378 | |
379 | for (;;) { |
380 | if (in.getOpts().allow_trailing_comma && *in == '}') { |
381 | break; |
382 | } |
383 | if (*in == '\"') { // string |
384 | auto key = parseString(in); |
385 | in.skipWhitespace(); |
386 | in.expect(':'); |
387 | in.skipWhitespace(); |
388 | ret.insert(std::move(key), parseValue(in)); |
389 | } else if (!in.getOpts().allow_non_string_keys) { |
390 | in.error("expected string for object key name" ); |
391 | } else { |
392 | auto key = parseValue(in); |
393 | in.skipWhitespace(); |
394 | in.expect(':'); |
395 | in.skipWhitespace(); |
396 | ret.insert(std::move(key), parseValue(in)); |
397 | } |
398 | |
399 | in.skipWhitespace(); |
400 | if (*in != ',') { |
401 | break; |
402 | } |
403 | ++in; |
404 | in.skipWhitespace(); |
405 | } |
406 | in.expect('}'); |
407 | |
408 | return ret; |
409 | } |
410 | |
411 | dynamic parseArray(Input& in) { |
412 | DCHECK_EQ(*in, '['); |
413 | ++in; |
414 | |
415 | dynamic ret = dynamic::array; |
416 | |
417 | in.skipWhitespace(); |
418 | if (*in == ']') { |
419 | ++in; |
420 | return ret; |
421 | } |
422 | |
423 | for (;;) { |
424 | if (in.getOpts().allow_trailing_comma && *in == ']') { |
425 | break; |
426 | } |
427 | ret.push_back(parseValue(in)); |
428 | in.skipWhitespace(); |
429 | if (*in != ',') { |
430 | break; |
431 | } |
432 | ++in; |
433 | in.skipWhitespace(); |
434 | } |
435 | in.expect(']'); |
436 | |
437 | return ret; |
438 | } |
439 | |
440 | dynamic parseNumber(Input& in) { |
441 | bool const negative = (*in == '-'); |
442 | if (negative && in.consume("-Infinity" )) { |
443 | if (in.getOpts().parse_numbers_as_strings) { |
444 | return "-Infinity" ; |
445 | } else { |
446 | return -std::numeric_limits<double>::infinity(); |
447 | } |
448 | } |
449 | |
450 | auto integral = in.skipMinusAndDigits(); |
451 | if (negative && integral.size() < 2) { |
452 | in.error("expected digits after `-'" ); |
453 | } |
454 | |
455 | auto const wasE = *in == 'e' || *in == 'E'; |
456 | |
457 | constexpr const char* maxInt = "9223372036854775807" ; |
458 | constexpr const char* minInt = "-9223372036854775808" ; |
459 | constexpr auto maxIntLen = constexpr_strlen(maxInt); |
460 | constexpr auto minIntLen = constexpr_strlen(minInt); |
461 | |
462 | if (*in != '.' && !wasE && in.getOpts().parse_numbers_as_strings) { |
463 | return integral; |
464 | } |
465 | |
466 | if (*in != '.' && !wasE) { |
467 | if (LIKELY(!in.getOpts().double_fallback || integral.size() < maxIntLen) || |
468 | (!negative && integral.size() == maxIntLen && integral <= maxInt) || |
469 | (negative && integral.size() == minIntLen && integral <= minInt)) { |
470 | auto val = to<int64_t>(integral); |
471 | in.skipWhitespace(); |
472 | return val; |
473 | } else { |
474 | auto val = to<double>(integral); |
475 | in.skipWhitespace(); |
476 | return val; |
477 | } |
478 | } |
479 | |
480 | auto end = !wasE ? (++in, in.skipDigits().end()) : in.begin(); |
481 | if (*in == 'e' || *in == 'E') { |
482 | ++in; |
483 | if (*in == '+' || *in == '-') { |
484 | ++in; |
485 | } |
486 | auto expPart = in.skipDigits(); |
487 | end = expPart.end(); |
488 | } |
489 | auto fullNum = range(integral.begin(), end); |
490 | if (in.getOpts().parse_numbers_as_strings) { |
491 | return fullNum; |
492 | } |
493 | auto val = to<double>(fullNum); |
494 | return val; |
495 | } |
496 | |
497 | std::string decodeUnicodeEscape(Input& in) { |
498 | auto hexVal = [&](int c) -> uint16_t { |
499 | // clang-format off |
500 | return uint16_t( |
501 | c >= '0' && c <= '9' ? c - '0' : |
502 | c >= 'a' && c <= 'f' ? c - 'a' + 10 : |
503 | c >= 'A' && c <= 'F' ? c - 'A' + 10 : |
504 | (in.error("invalid hex digit" ), 0)); |
505 | // clang-format on |
506 | }; |
507 | |
508 | auto readHex = [&]() -> uint16_t { |
509 | if (in.size() < 4) { |
510 | in.error("expected 4 hex digits" ); |
511 | } |
512 | |
513 | uint16_t ret = uint16_t(hexVal(*in) * 4096); |
514 | ++in; |
515 | ret += hexVal(*in) * 256; |
516 | ++in; |
517 | ret += hexVal(*in) * 16; |
518 | ++in; |
519 | ret += hexVal(*in); |
520 | ++in; |
521 | return ret; |
522 | }; |
523 | |
524 | /* |
525 | * If the value encoded is in the surrogate pair range, we need to |
526 | * make sure there is another escape that we can use also. |
527 | */ |
528 | uint32_t codePoint = readHex(); |
529 | if (codePoint >= 0xd800 && codePoint <= 0xdbff) { |
530 | if (!in.consume("\\u" )) { |
531 | in.error( |
532 | "expected another unicode escape for second half of " |
533 | "surrogate pair" ); |
534 | } |
535 | uint16_t second = readHex(); |
536 | if (second >= 0xdc00 && second <= 0xdfff) { |
537 | codePoint = 0x10000 + ((codePoint & 0x3ff) << 10) + (second & 0x3ff); |
538 | } else { |
539 | in.error("second character in surrogate pair is invalid" ); |
540 | } |
541 | } else if (codePoint >= 0xdc00 && codePoint <= 0xdfff) { |
542 | in.error("invalid unicode code point (in range [0xdc00,0xdfff])" ); |
543 | } |
544 | |
545 | return codePointToUtf8(codePoint); |
546 | } |
547 | |
548 | std::string parseString(Input& in) { |
549 | DCHECK_EQ(*in, '\"'); |
550 | ++in; |
551 | |
552 | std::string ret; |
553 | for (;;) { |
554 | auto range = in.skipWhile([](char c) { return c != '\"' && c != '\\'; }); |
555 | ret.append(range.begin(), range.end()); |
556 | |
557 | if (*in == '\"') { |
558 | ++in; |
559 | break; |
560 | } |
561 | if (*in == '\\') { |
562 | ++in; |
563 | switch (*in) { |
564 | // clang-format off |
565 | case '\"': ret.push_back('\"'); ++in; break; |
566 | case '\\': ret.push_back('\\'); ++in; break; |
567 | case '/': ret.push_back('/'); ++in; break; |
568 | case 'b': ret.push_back('\b'); ++in; break; |
569 | case 'f': ret.push_back('\f'); ++in; break; |
570 | case 'n': ret.push_back('\n'); ++in; break; |
571 | case 'r': ret.push_back('\r'); ++in; break; |
572 | case 't': ret.push_back('\t'); ++in; break; |
573 | case 'u': ++in; ret += decodeUnicodeEscape(in); break; |
574 | // clang-format on |
575 | default: |
576 | in.error( |
577 | to<std::string>("unknown escape " , *in, " in string" ).c_str()); |
578 | } |
579 | continue; |
580 | } |
581 | if (*in == EOF) { |
582 | in.error("unterminated string" ); |
583 | } |
584 | if (!*in) { |
585 | /* |
586 | * Apparently we're actually supposed to ban all control |
587 | * characters from strings. This seems unnecessarily |
588 | * restrictive, so we're only banning zero bytes. (Since the |
589 | * string is presumed to be UTF-8 encoded it's fine to just |
590 | * check this way.) |
591 | */ |
592 | in.error("null byte in string" ); |
593 | } |
594 | |
595 | ret.push_back(char(*in)); |
596 | ++in; |
597 | } |
598 | |
599 | return ret; |
600 | } |
601 | |
602 | dynamic parseValue(Input& in) { |
603 | RecursionGuard guard(in); |
604 | |
605 | in.skipWhitespace(); |
606 | // clang-format off |
607 | return |
608 | *in == '[' ? parseArray(in) : |
609 | *in == '{' ? parseObject(in) : |
610 | *in == '\"' ? parseString(in) : |
611 | (*in == '-' || (*in >= '0' && *in <= '9')) ? parseNumber(in) : |
612 | in.consume("true" ) ? true : |
613 | in.consume("false" ) ? false : |
614 | in.consume("null" ) ? nullptr : |
615 | in.consume("Infinity" ) ? |
616 | (in.getOpts().parse_numbers_as_strings ? (dynamic)"Infinity" : |
617 | (dynamic)std::numeric_limits<double>::infinity()) : |
618 | in.consume("NaN" ) ? |
619 | (in.getOpts().parse_numbers_as_strings ? (dynamic)"NaN" : |
620 | (dynamic)std::numeric_limits<double>::quiet_NaN()) : |
621 | in.error("expected json value" ); |
622 | // clang-format on |
623 | } |
624 | |
625 | } // namespace |
626 | |
627 | ////////////////////////////////////////////////////////////////////// |
628 | |
629 | std::array<uint64_t, 2> (StringPiece chars) { |
630 | std::array<uint64_t, 2> escapes{{0, 0}}; |
631 | for (auto b : ByteRange(chars)) { |
632 | if (b >= 0x20 && b < 0x80) { |
633 | escapes[b / 64] |= uint64_t(1) << (b % 64); |
634 | } |
635 | } |
636 | return escapes; |
637 | } |
638 | |
639 | std::string serialize(dynamic const& dyn, serialization_opts const& opts) { |
640 | std::string ret; |
641 | unsigned indentLevel = 0; |
642 | Printer p(ret, opts.pretty_formatting ? &indentLevel : nullptr, &opts); |
643 | p(dyn); |
644 | return ret; |
645 | } |
646 | |
647 | // Fast path to determine the longest prefix that can be left |
648 | // unescaped in a string of sizeof(T) bytes packed in an integer of |
649 | // type T. |
650 | template <bool EnableExtraAsciiEscapes, class T> |
651 | size_t firstEscapableInWord(T s, const serialization_opts& opts) { |
652 | static_assert(std::is_unsigned<T>::value, "Unsigned integer required" ); |
653 | static constexpr T kOnes = ~T() / 255; // 0x...0101 |
654 | static constexpr T kMsbs = kOnes * 0x80; // 0x...8080 |
655 | |
656 | // Sets the MSB of bytes < b. Precondition: b < 128. |
657 | auto isLess = [](T w, uint8_t b) { |
658 | // A byte is < b iff subtracting b underflows, so we check that |
659 | // the MSB wasn't set before and it's set after the subtraction. |
660 | return (w - kOnes * b) & ~w & kMsbs; |
661 | }; |
662 | |
663 | auto isChar = [&](uint8_t c) { |
664 | // A byte is == c iff it is 0 if xored with c. |
665 | return isLess(s ^ (kOnes * c), 1); |
666 | }; |
667 | |
668 | // The following masks have the MSB set for each byte of the word |
669 | // that satisfies the corresponding condition. |
670 | auto isHigh = s & kMsbs; // >= 128 |
671 | auto isLow = isLess(s, 0x20); // <= 0x1f |
672 | auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"'); |
673 | |
674 | if /* constexpr */ (EnableExtraAsciiEscapes) { |
675 | // Deal with optional bitmap for unicode escapes. Escapes can optionally be |
676 | // set for ascii characters 32 - 127, so the inner loop may run up to 96 |
677 | // times. However, for the case where 0 or a handful of bits are set, |
678 | // looping will be minimal through use of findFirstSet. |
679 | for (size_t i = 0; i < opts.extra_ascii_to_escape_bitmap.size(); ++i) { |
680 | const auto offset = i * 64; |
681 | // Clear first 32 characters if this is the first index, since those are |
682 | // always escaped. |
683 | auto bitmap = opts.extra_ascii_to_escape_bitmap[i] & |
684 | (i == 0 ? uint64_t(-1) << 32 : ~0UL); |
685 | while (bitmap) { |
686 | auto bit = folly::findFirstSet(bitmap); |
687 | needsEscape |= isChar(offset + bit - 1); |
688 | bitmap &= bitmap - 1; |
689 | } |
690 | } |
691 | } |
692 | |
693 | if (!needsEscape) { |
694 | return sizeof(T); |
695 | } |
696 | |
697 | if (folly::kIsLittleEndian) { |
698 | return folly::findFirstSet(needsEscape) / 8 - 1; |
699 | } else { |
700 | return sizeof(T) - folly::findLastSet(needsEscape) / 8; |
701 | } |
702 | } |
703 | |
704 | // Escape a string so that it is legal to print it in JSON text. |
705 | template <bool EnableExtraAsciiEscapes> |
706 | void escapeStringImpl( |
707 | StringPiece input, |
708 | std::string& out, |
709 | const serialization_opts& opts) { |
710 | auto hexDigit = [](uint8_t c) -> char { |
711 | return c < 10 ? c + '0' : c - 10 + 'a'; |
712 | }; |
713 | |
714 | out.push_back('\"'); |
715 | |
716 | auto* p = reinterpret_cast<const unsigned char*>(input.begin()); |
717 | auto* q = reinterpret_cast<const unsigned char*>(input.begin()); |
718 | auto* e = reinterpret_cast<const unsigned char*>(input.end()); |
719 | |
720 | while (p < e) { |
721 | // Find the longest prefix that does not need escaping, and copy |
722 | // it literally into the output string. |
723 | auto firstEsc = p; |
724 | while (firstEsc < e) { |
725 | auto avail = e - firstEsc; |
726 | uint64_t word = 0; |
727 | if (avail >= 8) { |
728 | word = folly::loadUnaligned<uint64_t>(firstEsc); |
729 | } else { |
730 | word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail); |
731 | } |
732 | auto prefix = firstEscapableInWord<EnableExtraAsciiEscapes>(word, opts); |
733 | DCHECK_LE(prefix, avail); |
734 | firstEsc += prefix; |
735 | if (prefix < 8) { |
736 | break; |
737 | } |
738 | } |
739 | if (firstEsc > p) { |
740 | out.append(reinterpret_cast<const char*>(p), firstEsc - p); |
741 | p = firstEsc; |
742 | // We can't be in the middle of a multibyte sequence, so we can reset q. |
743 | q = p; |
744 | if (p == e) { |
745 | break; |
746 | } |
747 | } |
748 | |
749 | // Handle the next byte that may need escaping. |
750 | |
751 | // Since non-ascii encoding inherently does utf8 validation |
752 | // we explicitly validate utf8 only if non-ascii encoding is disabled. |
753 | if ((opts.validate_utf8 || opts.skip_invalid_utf8) && |
754 | !opts.encode_non_ascii) { |
755 | // To achieve better spatial and temporal coherence |
756 | // we do utf8 validation progressively along with the |
757 | // string-escaping instead of two separate passes. |
758 | |
759 | // As the encoding progresses, q will stay at or ahead of p. |
760 | CHECK_GE(q, p); |
761 | |
762 | // As p catches up with q, move q forward. |
763 | if (q == p) { |
764 | // calling utf8_decode has the side effect of |
765 | // checking that utf8 encodings are valid |
766 | char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8); |
767 | if (opts.skip_invalid_utf8 && v == U'\ufffd') { |
768 | out.append(u8"\ufffd" ); |
769 | p = q; |
770 | continue; |
771 | } |
772 | } |
773 | } |
774 | |
775 | auto encodeUnicode = opts.encode_non_ascii && (*p & 0x80); |
776 | if /* constexpr */ (EnableExtraAsciiEscapes) { |
777 | encodeUnicode = encodeUnicode || |
778 | (*p >= 0x20 && *p < 0x80 && |
779 | (opts.extra_ascii_to_escape_bitmap[*p / 64] & |
780 | (uint64_t(1) << (*p % 64)))); |
781 | } |
782 | |
783 | if (encodeUnicode) { |
784 | // note that this if condition captures utf8 chars |
785 | // with value > 127, so size > 1 byte (or they are whitelisted for |
786 | // Unicode encoding). |
787 | // NOTE: char32_t / char16_t are both unsigned. |
788 | char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8); |
789 | auto writeHex = [&](char16_t v) { |
790 | char buf[] = "\\u\0\0\0\0" ; |
791 | buf[2] = hexDigit((v >> 12) & 0x0f); |
792 | buf[3] = hexDigit((v >> 8) & 0x0f); |
793 | buf[4] = hexDigit((v >> 4) & 0x0f); |
794 | buf[5] = hexDigit(v & 0x0f); |
795 | out.append(buf, 6); |
796 | }; |
797 | // From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017 |
798 | if (cp < 0x10000u) { |
799 | // If the code point is in the Basic Multilingual Plane (U+0000 through |
800 | // U+FFFF), then it may be represented as a six-character sequence: |
801 | // a reverse solidus, followed by the lowercase letter u, followed by |
802 | // four hexadecimal digits that encode the code point. |
803 | writeHex(static_cast<char16_t>(cp)); |
804 | } else { |
805 | // To escape a code point that is not in the Basic Multilingual Plane, |
806 | // the character may be represented as a twelve-character sequence, |
807 | // encoding the UTF-16 surrogate pair corresponding to the code point. |
808 | writeHex(static_cast<char16_t>( |
809 | 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu))); |
810 | writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu))); |
811 | } |
812 | } else if (*p == '\\' || *p == '\"') { |
813 | char buf[] = "\\\0" ; |
814 | buf[1] = char(*p++); |
815 | out.append(buf, 2); |
816 | } else if (*p <= 0x1f) { |
817 | switch (*p) { |
818 | // clang-format off |
819 | case '\b': out.append("\\b" ); p++; break; |
820 | case '\f': out.append("\\f" ); p++; break; |
821 | case '\n': out.append("\\n" ); p++; break; |
822 | case '\r': out.append("\\r" ); p++; break; |
823 | case '\t': out.append("\\t" ); p++; break; |
824 | // clang-format on |
825 | default: |
826 | // Note that this if condition captures non readable chars |
827 | // with value < 32, so size = 1 byte (e.g control chars). |
828 | char buf[] = "\\u00\0\0" ; |
829 | buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4)); |
830 | buf[5] = hexDigit(uint8_t(*p & 0xf)); |
831 | out.append(buf, 6); |
832 | p++; |
833 | } |
834 | } else { |
835 | out.push_back(char(*p++)); |
836 | } |
837 | } |
838 | |
839 | out.push_back('\"'); |
840 | } |
841 | |
842 | void escapeString( |
843 | StringPiece input, |
844 | std::string& out, |
845 | const serialization_opts& opts) { |
846 | if (FOLLY_UNLIKELY( |
847 | opts.extra_ascii_to_escape_bitmap[0] || |
848 | opts.extra_ascii_to_escape_bitmap[1])) { |
849 | escapeStringImpl<true>(input, out, opts); |
850 | } else { |
851 | escapeStringImpl<false>(input, out, opts); |
852 | } |
853 | } |
854 | |
855 | std::string (StringPiece jsonC) { |
856 | std::string result; |
857 | enum class State { |
858 | None, |
859 | InString, |
860 | , |
861 | |
862 | } state = State::None; |
863 | |
864 | for (size_t i = 0; i < jsonC.size(); ++i) { |
865 | auto s = jsonC.subpiece(i); |
866 | switch (state) { |
867 | case State::None: |
868 | if (s.startsWith("/*" )) { |
869 | state = State::InlineComment; |
870 | ++i; |
871 | continue; |
872 | } else if (s.startsWith("//" )) { |
873 | state = State::LineComment; |
874 | ++i; |
875 | continue; |
876 | } else if (s[0] == '\"') { |
877 | state = State::InString; |
878 | } |
879 | result.push_back(s[0]); |
880 | break; |
881 | case State::InString: |
882 | if (s[0] == '\\') { |
883 | if (UNLIKELY(s.size() == 1)) { |
884 | throw std::logic_error("Invalid JSONC: string is not terminated" ); |
885 | } |
886 | result.push_back(s[0]); |
887 | result.push_back(s[1]); |
888 | ++i; |
889 | continue; |
890 | } else if (s[0] == '\"') { |
891 | state = State::None; |
892 | } |
893 | result.push_back(s[0]); |
894 | break; |
895 | case State::InlineComment: |
896 | if (s.startsWith("*/" )) { |
897 | state = State::None; |
898 | ++i; |
899 | } |
900 | break; |
901 | case State::LineComment: |
902 | if (s[0] == '\n') { |
903 | // skip the line break. It doesn't matter. |
904 | state = State::None; |
905 | } |
906 | break; |
907 | default: |
908 | throw std::logic_error("Unknown comment state" ); |
909 | } |
910 | } |
911 | return result; |
912 | } |
913 | |
914 | } // namespace json |
915 | |
916 | ////////////////////////////////////////////////////////////////////// |
917 | |
918 | dynamic parseJson(StringPiece range) { |
919 | return parseJson(range, json::serialization_opts()); |
920 | } |
921 | |
922 | dynamic parseJson(StringPiece range, json::serialization_opts const& opts) { |
923 | json::Input in(range, &opts); |
924 | |
925 | auto ret = parseValue(in); |
926 | in.skipWhitespace(); |
927 | if (in.size() && *in != '\0') { |
928 | in.error("parsing didn't consume all input" ); |
929 | } |
930 | return ret; |
931 | } |
932 | |
933 | std::string toJson(dynamic const& dyn) { |
934 | return json::serialize(dyn, json::serialization_opts()); |
935 | } |
936 | |
937 | std::string toPrettyJson(dynamic const& dyn) { |
938 | json::serialization_opts opts; |
939 | opts.pretty_formatting = true; |
940 | return json::serialize(dyn, opts); |
941 | } |
942 | |
943 | ////////////////////////////////////////////////////////////////////// |
944 | // dynamic::print_as_pseudo_json() is implemented here for header |
945 | // ordering reasons (most of the dynamic implementation is in |
946 | // dynamic-inl.h, which we don't want to include json.h). |
947 | |
948 | void dynamic::print_as_pseudo_json(std::ostream& out) const { |
949 | json::serialization_opts opts; |
950 | opts.allow_non_string_keys = true; |
951 | opts.allow_nan_inf = true; |
952 | out << json::serialize(*this, opts); |
953 | } |
954 | |
955 | void PrintTo(const dynamic& dyn, std::ostream* os) { |
956 | json::serialization_opts opts; |
957 | opts.allow_nan_inf = true; |
958 | opts.allow_non_string_keys = true; |
959 | opts.pretty_formatting = true; |
960 | opts.sort_keys = true; |
961 | *os << json::serialize(dyn, opts); |
962 | } |
963 | |
964 | ////////////////////////////////////////////////////////////////////// |
965 | |
966 | } // namespace folly |
967 | |