1/*
2 * Copyright 2012-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <iterator>
20#include <stdexcept>
21
22#include <folly/CppAttributes.h>
23
24#ifndef FOLLY_STRING_H_
25#error This file may only be included from String.h
26#endif
27
28namespace folly {
29
30namespace detail {
31// Map from character code to value of one-character escape sequence
32// ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
33// an octal escape sequence, or 'P' if the character is printable and
34// should be printed as is.
35extern const std::array<char, 256> cEscapeTable;
36} // namespace detail
37
38template <class String>
39void cEscape(StringPiece str, String& out) {
40 char esc[4];
41 esc[0] = '\\';
42 out.reserve(out.size() + str.size());
43 auto p = str.begin();
44 auto last = p; // last regular character
45 // We advance over runs of regular characters (printable, not double-quote or
46 // backslash) and copy them in one go; this is faster than calling push_back
47 // repeatedly.
48 while (p != str.end()) {
49 char c = *p;
50 unsigned char v = static_cast<unsigned char>(c);
51 char e = detail::cEscapeTable[v];
52 if (e == 'P') { // printable
53 ++p;
54 } else if (e == 'O') { // octal
55 out.append(&*last, size_t(p - last));
56 esc[1] = '0' + ((v >> 6) & 7);
57 esc[2] = '0' + ((v >> 3) & 7);
58 esc[3] = '0' + (v & 7);
59 out.append(esc, 4);
60 ++p;
61 last = p;
62 } else { // special 1-character escape
63 out.append(&*last, size_t(p - last));
64 esc[1] = e;
65 out.append(esc, 2);
66 ++p;
67 last = p;
68 }
69 }
70 out.append(&*last, size_t(p - last));
71}
72
73namespace detail {
74// Map from the character code of the character following a backslash to
75// the unescaped character if a valid one-character escape sequence
76// ('n' maps to 10 = '\n'), 'O' if this is the first character of an
77// octal escape sequence, 'X' if this is the first character of a
78// hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
79extern const std::array<char, 256> cUnescapeTable;
80
81// Map from the character code to the hex value, or 16 if invalid hex char.
82extern const std::array<unsigned char, 256> hexTable;
83} // namespace detail
84
85template <class String>
86void cUnescape(StringPiece str, String& out, bool strict) {
87 out.reserve(out.size() + str.size());
88 auto p = str.begin();
89 auto last = p; // last regular character (not part of an escape sequence)
90 // We advance over runs of regular characters (not backslash) and copy them
91 // in one go; this is faster than calling push_back repeatedly.
92 while (p != str.end()) {
93 char c = *p;
94 if (c != '\\') { // normal case
95 ++p;
96 continue;
97 }
98 out.append(&*last, p - last);
99 ++p;
100 if (p == str.end()) { // backslash at end of string
101 if (strict) {
102 throw_exception<std::invalid_argument>("incomplete escape sequence");
103 }
104 out.push_back('\\');
105 last = p;
106 continue;
107 }
108 char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
109 if (e == 'O') { // octal
110 unsigned char val = 0;
111 for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
112 ++i, ++p) {
113 val <<= 3;
114 val |= (*p - '0');
115 }
116 out.push_back(val);
117 last = p;
118 } else if (e == 'X') { // hex
119 ++p;
120 if (p == str.end()) { // \x at end of string
121 if (strict) {
122 throw_exception<std::invalid_argument>(
123 "incomplete hex escape sequence");
124 }
125 out.append("\\x");
126 last = p;
127 continue;
128 }
129 unsigned char val = 0;
130 unsigned char h;
131 for (; (p != str.end() &&
132 (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
133 ++p) {
134 val <<= 4;
135 val |= h;
136 }
137 out.push_back(val);
138 last = p;
139 } else if (e == 'I') { // invalid
140 if (strict) {
141 throw_exception<std::invalid_argument>("invalid escape sequence");
142 }
143 out.push_back('\\');
144 out.push_back(*p);
145 ++p;
146 last = p;
147 } else { // standard escape sequence, \' etc
148 out.push_back(e);
149 ++p;
150 last = p;
151 }
152 }
153 out.append(&*last, p - last);
154}
155
156namespace detail {
157// Map from character code to escape mode:
158// 0 = pass through
159// 1 = unused
160// 2 = pass through in PATH mode
161// 3 = space, replace with '+' in QUERY mode
162// 4 = percent-encode
163extern const std::array<unsigned char, 256> uriEscapeTable;
164} // namespace detail
165
166template <class String>
167void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
168 static const char hexValues[] = "0123456789abcdef";
169 char esc[3];
170 esc[0] = '%';
171 // Preallocate assuming that 25% of the input string will be escaped
172 out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
173 auto p = str.begin();
174 auto last = p; // last regular character
175 // We advance over runs of passthrough characters and copy them in one go;
176 // this is faster than calling push_back repeatedly.
177 unsigned char minEncode = static_cast<unsigned char>(mode);
178 while (p != str.end()) {
179 char c = *p;
180 unsigned char v = static_cast<unsigned char>(c);
181 unsigned char discriminator = detail::uriEscapeTable[v];
182 if (LIKELY(discriminator <= minEncode)) {
183 ++p;
184 } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
185 out.append(&*last, size_t(p - last));
186 out.push_back('+');
187 ++p;
188 last = p;
189 } else {
190 out.append(&*last, size_t(p - last));
191 esc[1] = hexValues[v >> 4];
192 esc[2] = hexValues[v & 0x0f];
193 out.append(esc, 3);
194 ++p;
195 last = p;
196 }
197 }
198 out.append(&*last, size_t(p - last));
199}
200
201template <class String>
202void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
203 out.reserve(out.size() + str.size());
204 auto p = str.begin();
205 auto last = p;
206 // We advance over runs of passthrough characters and copy them in one go;
207 // this is faster than calling push_back repeatedly.
208 while (p != str.end()) {
209 char c = *p;
210 switch (c) {
211 case '%': {
212 if (UNLIKELY(std::distance(p, str.end()) < 3)) {
213 throw_exception<std::invalid_argument>(
214 "incomplete percent encode sequence");
215 }
216 auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
217 auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
218 if (UNLIKELY(h1 == 16 || h2 == 16)) {
219 throw_exception<std::invalid_argument>(
220 "invalid percent encode sequence");
221 }
222 out.append(&*last, size_t(p - last));
223 out.push_back((h1 << 4) | h2);
224 p += 3;
225 last = p;
226 break;
227 }
228 case '+':
229 if (mode == UriEscapeMode::QUERY) {
230 out.append(&*last, size_t(p - last));
231 out.push_back(' ');
232 ++p;
233 last = p;
234 break;
235 }
236 // else fallthrough
237 FOLLY_FALLTHROUGH;
238 default:
239 ++p;
240 break;
241 }
242 }
243 out.append(&*last, size_t(p - last));
244}
245
246namespace detail {
247
248/*
249 * The following functions are type-overloaded helpers for
250 * internalSplit().
251 */
252inline size_t delimSize(char) {
253 return 1;
254}
255inline size_t delimSize(StringPiece s) {
256 return s.size();
257}
258inline bool atDelim(const char* s, char c) {
259 return *s == c;
260}
261inline bool atDelim(const char* s, StringPiece sp) {
262 return !std::memcmp(s, sp.start(), sp.size());
263}
264
265// These are used to short-circuit internalSplit() in the case of
266// 1-character strings.
267inline char delimFront(char c) {
268 // This one exists only for compile-time; it should never be called.
269 std::abort();
270 return c;
271}
272inline char delimFront(StringPiece s) {
273 assert(!s.empty() && s.start() != nullptr);
274 return *s.start();
275}
276
277/*
278 * Shared implementation for all the split() overloads.
279 *
280 * This uses some external helpers that are overloaded to let this
281 * algorithm be more performant if the deliminator is a single
282 * character instead of a whole string.
283 *
284 * @param ignoreEmpty iff true, don't copy empty segments to output
285 */
286template <class OutStringT, class DelimT, class OutputIterator>
287void internalSplit(
288 DelimT delim,
289 StringPiece sp,
290 OutputIterator out,
291 bool ignoreEmpty) {
292 assert(sp.empty() || sp.start() != nullptr);
293
294 const char* s = sp.start();
295 const size_t strSize = sp.size();
296 const size_t dSize = delimSize(delim);
297
298 if (dSize > strSize || dSize == 0) {
299 if (!ignoreEmpty || strSize > 0) {
300 *out++ = to<OutStringT>(sp);
301 }
302 return;
303 }
304 if (std::is_same<DelimT, StringPiece>::value && dSize == 1) {
305 // Call the char version because it is significantly faster.
306 return internalSplit<OutStringT>(delimFront(delim), sp, out, ignoreEmpty);
307 }
308
309 size_t tokenStartPos = 0;
310 size_t tokenSize = 0;
311 for (size_t i = 0; i <= strSize - dSize; ++i) {
312 if (atDelim(&s[i], delim)) {
313 if (!ignoreEmpty || tokenSize > 0) {
314 *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
315 }
316
317 tokenStartPos = i + dSize;
318 tokenSize = 0;
319 i += dSize - 1;
320 } else {
321 ++tokenSize;
322 }
323 }
324 tokenSize = strSize - tokenStartPos;
325 if (!ignoreEmpty || tokenSize > 0) {
326 *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
327 }
328}
329
330template <class String>
331StringPiece prepareDelim(const String& s) {
332 return StringPiece(s);
333}
334inline char prepareDelim(char c) {
335 return c;
336}
337
338template <class OutputType>
339void toOrIgnore(StringPiece input, OutputType& output) {
340 output = folly::to<OutputType>(input);
341}
342
343inline void toOrIgnore(StringPiece, decltype(std::ignore)&) {}
344
345template <bool exact, class Delim, class OutputType>
346bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
347 static_assert(
348 exact || std::is_same<OutputType, StringPiece>::value ||
349 IsSomeString<OutputType>::value ||
350 std::is_same<OutputType, decltype(std::ignore)>::value,
351 "split<false>() requires that the last argument be a string type "
352 "or std::ignore");
353 if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
354 return false;
355 }
356 toOrIgnore(input, output);
357 return true;
358}
359
360template <bool exact, class Delim, class OutputType, class... OutputTypes>
361bool splitFixed(
362 const Delim& delimiter,
363 StringPiece input,
364 OutputType& outHead,
365 OutputTypes&... outTail) {
366 size_t cut = input.find(delimiter);
367 if (UNLIKELY(cut == std::string::npos)) {
368 return false;
369 }
370 StringPiece head(input.begin(), input.begin() + cut);
371 StringPiece tail(
372 input.begin() + cut + detail::delimSize(delimiter), input.end());
373 if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
374 toOrIgnore(head, outHead);
375 return true;
376 }
377 return false;
378}
379
380} // namespace detail
381
382//////////////////////////////////////////////////////////////////////
383
384template <class Delim, class String, class OutputType>
385void split(
386 const Delim& delimiter,
387 const String& input,
388 std::vector<OutputType>& out,
389 bool ignoreEmpty) {
390 detail::internalSplit<OutputType>(
391 detail::prepareDelim(delimiter),
392 StringPiece(input),
393 std::back_inserter(out),
394 ignoreEmpty);
395}
396
397template <class Delim, class String, class OutputType>
398void split(
399 const Delim& delimiter,
400 const String& input,
401 fbvector<OutputType, std::allocator<OutputType>>& out,
402 bool ignoreEmpty) {
403 detail::internalSplit<OutputType>(
404 detail::prepareDelim(delimiter),
405 StringPiece(input),
406 std::back_inserter(out),
407 ignoreEmpty);
408}
409
410template <
411 class OutputValueType,
412 class Delim,
413 class String,
414 class OutputIterator>
415void splitTo(
416 const Delim& delimiter,
417 const String& input,
418 OutputIterator out,
419 bool ignoreEmpty) {
420 detail::internalSplit<OutputValueType>(
421 detail::prepareDelim(delimiter), StringPiece(input), out, ignoreEmpty);
422}
423
424template <bool exact, class Delim, class... OutputTypes>
425typename std::enable_if<
426 StrictConjunction<IsConvertible<OutputTypes>...>::value &&
427 sizeof...(OutputTypes) >= 1,
428 bool>::type
429split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
430 return detail::splitFixed<exact>(
431 detail::prepareDelim(delimiter), input, outputs...);
432}
433
434namespace detail {
435
436/*
437 * If a type can have its string size determined cheaply, we can more
438 * efficiently append it in a loop (see internalJoinAppend). Note that the
439 * struct need not conform to the std::string api completely (ex. does not need
440 * to implement append()).
441 */
442template <class T>
443struct IsSizableString {
444 enum {
445 value = IsSomeString<T>::value || std::is_same<T, StringPiece>::value
446 };
447};
448
449template <class Iterator>
450struct IsSizableStringContainerIterator
451 : IsSizableString<typename std::iterator_traits<Iterator>::value_type> {};
452
453template <class Delim, class Iterator, class String>
454void internalJoinAppend(
455 Delim delimiter,
456 Iterator begin,
457 Iterator end,
458 String& output) {
459 assert(begin != end);
460 if (std::is_same<Delim, StringPiece>::value && delimSize(delimiter) == 1) {
461 internalJoinAppend(delimFront(delimiter), begin, end, output);
462 return;
463 }
464 toAppend(*begin, &output);
465 while (++begin != end) {
466 toAppend(delimiter, *begin, &output);
467 }
468}
469
470template <class Delim, class Iterator, class String>
471typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
472internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
473 output.clear();
474 if (begin == end) {
475 return;
476 }
477 const size_t dsize = delimSize(delimiter);
478 Iterator it = begin;
479 size_t size = it->size();
480 while (++it != end) {
481 size += dsize + it->size();
482 }
483 output.reserve(size);
484 internalJoinAppend(delimiter, begin, end, output);
485}
486
487template <class Delim, class Iterator, class String>
488typename std::enable_if<
489 !IsSizableStringContainerIterator<Iterator>::value>::type
490internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
491 output.clear();
492 if (begin == end) {
493 return;
494 }
495 internalJoinAppend(delimiter, begin, end, output);
496}
497
498} // namespace detail
499
500template <class Delim, class Iterator, class String>
501void join(
502 const Delim& delimiter,
503 Iterator begin,
504 Iterator end,
505 String& output) {
506 detail::internalJoin(detail::prepareDelim(delimiter), begin, end, output);
507}
508
509template <class OutputString>
510void backslashify(
511 folly::StringPiece input,
512 OutputString& output,
513 bool hex_style) {
514 static const char hexValues[] = "0123456789abcdef";
515 output.clear();
516 output.reserve(3 * input.size());
517 for (unsigned char c : input) {
518 // less than space or greater than '~' are considered unprintable
519 if (c < 0x20 || c > 0x7e || c == '\\') {
520 bool hex_append = false;
521 output.push_back('\\');
522 if (hex_style) {
523 hex_append = true;
524 } else {
525 if (c == '\r') {
526 output += 'r';
527 } else if (c == '\n') {
528 output += 'n';
529 } else if (c == '\t') {
530 output += 't';
531 } else if (c == '\a') {
532 output += 'a';
533 } else if (c == '\b') {
534 output += 'b';
535 } else if (c == '\0') {
536 output += '0';
537 } else if (c == '\\') {
538 output += '\\';
539 } else {
540 hex_append = true;
541 }
542 }
543 if (hex_append) {
544 output.push_back('x');
545 output.push_back(hexValues[(c >> 4) & 0xf]);
546 output.push_back(hexValues[c & 0xf]);
547 }
548 } else {
549 output += c;
550 }
551 }
552}
553
554template <class String1, class String2>
555void humanify(const String1& input, String2& output) {
556 size_t numUnprintable = 0;
557 size_t numPrintablePrefix = 0;
558 for (unsigned char c : input) {
559 if (c < 0x20 || c > 0x7e || c == '\\') {
560 ++numUnprintable;
561 }
562 if (numUnprintable == 0) {
563 ++numPrintablePrefix;
564 }
565 }
566
567 // hexlify doubles a string's size; backslashify can potentially
568 // explode it by 4x. Now, the printable range of the ascii
569 // "spectrum" is around 95 out of 256 values, so a "random" binary
570 // string should be around 60% unprintable. We use a 50% hueristic
571 // here, so if a string is 60% unprintable, then we just use hex
572 // output. Otherwise we backslash.
573 //
574 // UTF8 is completely ignored; as a result, utf8 characters will
575 // likely be \x escaped (since most common glyphs fit in two bytes).
576 // This is a tradeoff of complexity/speed instead of a convenience
577 // that likely would rarely matter. Moreover, this function is more
578 // about displaying underlying bytes, not about displaying glyphs
579 // from languages.
580 if (numUnprintable == 0) {
581 output = input;
582 } else if (5 * numUnprintable >= 3 * input.size()) {
583 // However! If we have a "meaningful" prefix of printable
584 // characters, say 20% of the string, we backslashify under the
585 // assumption viewing the prefix as ascii is worth blowing the
586 // output size up a bit.
587 if (5 * numPrintablePrefix >= input.size()) {
588 backslashify(input, output);
589 } else {
590 output = "0x";
591 hexlify(input, output, true /* append output */);
592 }
593 } else {
594 backslashify(input, output);
595 }
596}
597
598template <class InputString, class OutputString>
599bool hexlify(
600 const InputString& input,
601 OutputString& output,
602 bool append_output) {
603 if (!append_output) {
604 output.clear();
605 }
606
607 static char hexValues[] = "0123456789abcdef";
608 auto j = output.size();
609 output.resize(2 * input.size() + output.size());
610 for (size_t i = 0; i < input.size(); ++i) {
611 int ch = input[i];
612 output[j++] = hexValues[(ch >> 4) & 0xf];
613 output[j++] = hexValues[ch & 0xf];
614 }
615 return true;
616}
617
618template <class InputString, class OutputString>
619bool unhexlify(const InputString& input, OutputString& output) {
620 if (input.size() % 2 != 0) {
621 return false;
622 }
623 output.resize(input.size() / 2);
624 int j = 0;
625
626 for (size_t i = 0; i < input.size(); i += 2) {
627 int highBits = detail::hexTable[static_cast<uint8_t>(input[i])];
628 int lowBits = detail::hexTable[static_cast<uint8_t>(input[i + 1])];
629 if ((highBits | lowBits) & 0x10) {
630 // One of the characters wasn't a hex digit
631 return false;
632 }
633 output[j++] = (highBits << 4) + lowBits;
634 }
635 return true;
636}
637
638namespace detail {
639/**
640 * Hex-dump at most 16 bytes starting at offset from a memory area of size
641 * bytes. Return the number of bytes actually dumped.
642 */
643size_t
644hexDumpLine(const void* ptr, size_t offset, size_t size, std::string& line);
645} // namespace detail
646
647template <class OutIt>
648void hexDump(const void* ptr, size_t size, OutIt out) {
649 size_t offset = 0;
650 std::string line;
651 while (offset < size) {
652 offset += detail::hexDumpLine(ptr, offset, size, line);
653 *out++ = line;
654 }
655}
656
657} // namespace folly
658