String-inl.h source code [folly/String-inl.h]

1	/*
2	* Copyright 2012-present Facebook, Inc.
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	#pragma once
18
19	#include <iterator>
20	#include <stdexcept>
21
22	#include <folly/CppAttributes.h>
23
24	#ifndef FOLLY_STRING_H_
25	#error This file may only be included from String.h
26	#endif
27
28	namespace folly {
29
30	namespace detail {
31	// Map from character code to value of one-character escape sequence
32	// ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
33	// an octal escape sequence, or 'P' if the character is printable and
34	// should be printed as is.
35	extern const std::array<char, `256`> cEscapeTable;
36	} // namespace detail
37
38	template <class String>
39	void cEscape(StringPiece str, String& out) {
40	char esc[`4`];
41	esc[`0`] = `'\\'`;
42	out.reserve(out.size() + str.size());
43	auto p = str.begin();
44	auto last = p; // last regular character
45	// We advance over runs of regular characters (printable, not double-quote or
46	// backslash) and copy them in one go; this is faster than calling push_back
47	// repeatedly.
48	while (p != str.end()) {
49	char c = *p;
50	unsigned char v = static_cast<unsigned char>(c);
51	char e = detail::cEscapeTable [v];
52	if (e == `'P'`) { // printable
53	++p;
54	} else if (e == `'O'`) { // octal
55	out.append(&*last, size_t(p - last));
56	esc[`1`] = `'0'` + ((v >> `6`) & `7`);
57	esc[`2`] = `'0'` + ((v >> `3`) & `7`);
58	esc[`3`] = `'0'` + (v & `7`);
59	out.append(esc, `4`);
60	++p;
61	last = p;
62	} else { // special 1-character escape
63	out.append(&*last, size_t(p - last));
64	esc[`1`] = e;
65	out.append(esc, `2`);
66	++p;
67	last = p;
68	}
69	}
70	out.append(&*last, size_t(p - last));
71	}
72
73	namespace detail {
74	// Map from the character code of the character following a backslash to
75	// the unescaped character if a valid one-character escape sequence
76	// ('n' maps to 10 = '\n'), 'O' if this is the first character of an
77	// octal escape sequence, 'X' if this is the first character of a
78	// hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
79	extern const std::array<char, `256`> cUnescapeTable;
80
81	// Map from the character code to the hex value, or 16 if invalid hex char.
82	extern const std::array<unsigned char, `256`> hexTable;
83	} // namespace detail
84
85	template <class String>
86	void cUnescape(StringPiece str, String& out, bool strict) {
87	out.reserve(out.size() + str.size());
88	auto p = str.begin();
89	auto last = p; // last regular character (not part of an escape sequence)
90	// We advance over runs of regular characters (not backslash) and copy them
91	// in one go; this is faster than calling push_back repeatedly.
92	while (p != str.end()) {
93	char c = *p;
94	if (c != `'\\'`) { // normal case
95	++p;
96	continue;
97	}
98	out.append(&*last, p - last);
99	++p;
100	if (p == str.end()) { // backslash at end of string
101	if (strict) {
102	throw_exception<std::invalid_argument>("incomplete escape sequence");
103	}
104	out.push_back(`'\\'`);
105	last = p;
106	continue;
107	}
108	char e = detail::cUnescapeTable [static_cast<unsigned char>(*p)];
109	if (e == `'O'`) { // octal
110	unsigned char val = `0`;
111	for (int i = `0`; i < `3` && p != str.end() && p >= `'0'` && p <= `'7'`;
112	++i, ++p) {
113	val <<= `3`;
114	val \|= (*p - `'0'`);
115	}
116	out.push_back(val);
117	last = p;
118	} else if (e == `'X'`) { // hex
119	++p;
120	if (p == str.end()) { // \x at end of string
121	if (strict) {
122	throw_exception<std::invalid_argument>(
123	"incomplete hex escape sequence");
124	}
125	out.append("\\x");
126	last = p;
127	continue;
128	}
129	unsigned char val = `0`;
130	unsigned char h;
131	for (; (p != str.end() &&
132	(h = detail::hexTable [static_cast<unsigned char>(*p)]) < `16`);
133	++p) {
134	val <<= `4`;
135	val \|= h;
136	}
137	out.push_back(val);
138	last = p;
139	} else if (e == `'I'`) { // invalid
140	if (strict) {
141	throw_exception<std::invalid_argument>("invalid escape sequence");
142	}
143	out.push_back(`'\\'`);
144	out.push_back(*p);
145	++p;
146	last = p;
147	} else { // standard escape sequence, \' etc
148	out.push_back(e);
149	++p;
150	last = p;
151	}
152	}
153	out.append(&*last, p - last);
154	}
155
156	namespace detail {
157	// Map from character code to escape mode:
158	// 0 = pass through
159	// 1 = unused
160	// 2 = pass through in PATH mode
161	// 3 = space, replace with '+' in QUERY mode
162	// 4 = percent-encode
163	extern const std::array<unsigned char, `256`> uriEscapeTable;
164	} // namespace detail
165
166	template <class String>
167	void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
168	static const char hexValues[] = "0123456789abcdef";
169	char esc[`3`];
170	esc[`0`] = `'%'`;
171	// Preallocate assuming that 25% of the input string will be escaped
172	out.reserve(out.size() + str.size() + `3` * (str.size() / `4`));
173	auto p = str.begin();
174	auto last = p; // last regular character
175	// We advance over runs of passthrough characters and copy them in one go;
176	// this is faster than calling push_back repeatedly.
177	unsigned char minEncode = static_cast<unsigned char>(mode);
178	while (p != str.end()) {
179	char c = *p;
180	unsigned char v = static_cast<unsigned char>(c);
181	unsigned char discriminator = detail::uriEscapeTable [v];
182	if (LIKELY(discriminator <= minEncode)) {
183	++p;
184	} else if (mode == UriEscapeMode::QUERY && discriminator == `3`) {
185	out.append(&*last, size_t(p - last));
186	out.push_back(`'+'`);
187	++p;
188	last = p;
189	} else {
190	out.append(&*last, size_t(p - last));
191	esc[`1`] = hexValues[v >> `4`];
192	esc[`2`] = hexValues[v & `0x0f`];
193	out.append(esc, `3`);
194	++p;
195	last = p;
196	}
197	}
198	out.append(&*last, size_t(p - last));
199	}
200
201	template <class String>
202	void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
203	out.reserve(out.size() + str.size());
204	auto p = str.begin();
205	auto last = p;
206	// We advance over runs of passthrough characters and copy them in one go;
207	// this is faster than calling push_back repeatedly.
208	while (p != str.end()) {
209	char c = *p;
210	switch (c) {
211	case `'%'`: {
212	if (UNLIKELY(std::distance(p, str.end()) < `3`)) {
213	throw_exception<std::invalid_argument>(
214	"incomplete percent encode sequence");
215	}
216	auto h1 = detail::hexTable [static_cast<unsigned char>(p[`1`])];
217	auto h2 = detail::hexTable [static_cast<unsigned char>(p[`2`])];
218	if (UNLIKELY(h1 == `16` \|\| h2 == `16`)) {
219	throw_exception<std::invalid_argument>(
220	"invalid percent encode sequence");
221	}
222	out.append(&*last, size_t(p - last));
223	out.push_back((h1 << `4`) \| h2);
224	p += `3`;
225	last = p;
226	break;
227	}
228	case `'+'`:
229	if (mode == UriEscapeMode::QUERY) {
230	out.append(&*last, size_t(p - last));
231	out.push_back(`' '`);
232	++p;
233	last = p;
234	break;
235	}
236	// else fallthrough
237	FOLLY_FALLTHROUGH;
238	default:
239	++p;
240	break;
241	}
242	}
243	out.append(&*last, size_t(p - last));
244	}
245
246	namespace detail {
247
248	/*
249	* The following functions are type-overloaded helpers for
250	* internalSplit().
251	*/
252	inline size_t delimSize(char) {
253	return `1`;
254	}
255	inline size_t delimSize(StringPiece s) {
256	return s.size();
257	}
258	inline bool atDelim(const char* s, char c) {
259	return *s == c;
260	}
261	inline bool atDelim(const char* s, StringPiece sp) {
262	return !std::memcmp(s, sp.start(), sp.size());
263	}
264
265	// These are used to short-circuit internalSplit() in the case of
266	// 1-character strings.
267	inline char delimFront(char c) {
268	// This one exists only for compile-time; it should never be called.
269	std::abort();
270	return c;
271	}
272	inline char delimFront(StringPiece s) {
273	assert(!s.empty() && s.start() != nullptr);
274	return *s.start();
275	}
276
277	/*
278	* Shared implementation for all the split() overloads.
279	*
280	* This uses some external helpers that are overloaded to let this
281	* algorithm be more performant if the deliminator is a single
282	* character instead of a whole string.
283	*
284	* @param ignoreEmpty iff true, don't copy empty segments to output
285	*/
286	template <class OutStringT, class DelimT, class OutputIterator>
287	void internalSplit(
288	DelimT delim,
289	StringPiece sp,
290	OutputIterator out,
291	bool ignoreEmpty) {
292	assert(sp.empty() \|\| sp.start() != nullptr);
293
294	const char* s = sp.start();
295	const size_t strSize = sp.size();
296	const size_t dSize = delimSize(delim);
297
298	if (dSize > strSize \|\| dSize == `0`) {
299	if (!ignoreEmpty \|\| strSize > `0`) {
300	*out++ = to<OutStringT>(sp);
301	}
302	return;
303	}
304	if (std::is_same<DelimT, StringPiece>::value && dSize == `1`) {
305	// Call the char version because it is significantly faster.
306	return internalSplit<OutStringT>(delimFront(delim), sp, out, ignoreEmpty);
307	}
308
309	size_t tokenStartPos = `0`;
310	size_t tokenSize = `0`;
311	for (size_t i = `0`; i <= strSize - dSize; ++i) {
312	if (atDelim(&s[i], delim)) {
313	if (!ignoreEmpty \|\| tokenSize > `0`) {
314	*out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
315	}
316
317	tokenStartPos = i + dSize;
318	tokenSize = `0`;
319	i += dSize - `1`;
320	} else {
321	++tokenSize;
322	}
323	}
324	tokenSize = strSize - tokenStartPos;
325	if (!ignoreEmpty \|\| tokenSize > `0`) {
326	*out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
327	}
328	}
329
330	template <class String>
331	StringPiece prepareDelim(const String& s) {
332	return StringPiece(s);
333	}
334	inline char prepareDelim(char c) {
335	return c;
336	}
337
338	template <class OutputType>
339	void toOrIgnore(StringPiece input, OutputType& output) {
340	output = folly::to<OutputType>(input);
341	}
342
343	inline void toOrIgnore(StringPiece, decltype(std::ignore)&) {}
344
345	template <bool exact, class Delim, class OutputType>
346	bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
347	static_assert(
348	exact \|\| std::is_same<OutputType, StringPiece>::value \|\|
349	IsSomeString<OutputType>::value \|\|
350	std::is_same<OutputType, decltype(std::ignore)>::value,
351	"split<false>() requires that the last argument be a string type "
352	"or std::ignore");
353	if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
354	return false;
355	}
356	toOrIgnore(input, output);
357	return true;
358	}
359
360	template <bool exact, class Delim, class OutputType, class... OutputTypes>
361	bool splitFixed(
362	const Delim& delimiter,
363	StringPiece input,
364	OutputType& outHead,
365	OutputTypes&... outTail) {
366	size_t cut = input.find(delimiter);
367	if (UNLIKELY(cut == std::string::npos)) {
368	return false;
369	}
370	StringPiece head(input.begin(), input.begin() + cut);
371	StringPiece tail(
372	input.begin() + cut + detail::delimSize(delimiter), input.end());
373	if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
374	toOrIgnore(head, outHead);
375	return true;
376	}
377	return false;
378	}
379
380	} // namespace detail
381
382	//////////////////////////////////////////////////////////////////////
383
384	template <class Delim, class String, class OutputType>
385	void split(
386	const Delim& delimiter,
387	const String& input,
388	std::vector<OutputType>& out,
389	bool ignoreEmpty) {
390	detail::internalSplit<OutputType>(
391	detail::prepareDelim(delimiter),
392	StringPiece(input),
393	std::back_inserter(out),
394	ignoreEmpty);
395	}
396
397	template <class Delim, class String, class OutputType>
398	void split(
399	const Delim& delimiter,
400	const String& input,
401	fbvector<OutputType, std::allocator<OutputType>>& out,
402	bool ignoreEmpty) {
403	detail::internalSplit<OutputType>(
404	detail::prepareDelim(delimiter),
405	StringPiece(input),
406	std::back_inserter(out),
407	ignoreEmpty);
408	}
409
410	template <
411	class OutputValueType,
412	class Delim,
413	class String,
414	class OutputIterator>
415	void splitTo(
416	const Delim& delimiter,
417	const String& input,
418	OutputIterator out,
419	bool ignoreEmpty) {
420	detail::internalSplit<OutputValueType>(
421	detail::prepareDelim(delimiter), StringPiece(input), out, ignoreEmpty);
422	}
423
424	template <bool exact, class Delim, class... OutputTypes>
425	typename std::enable_if<
426	StrictConjunction<IsConvertible<OutputTypes>...>::value &&
427	sizeof...(OutputTypes) >= `1`,
428	bool>::type
429	split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
430	return detail::splitFixed<exact>(
431	detail::prepareDelim(delimiter), input, outputs...);
432	}
433
434	namespace detail {
435
436	/*
437	* If a type can have its string size determined cheaply, we can more
438	* efficiently append it in a loop (see internalJoinAppend). Note that the
439	* struct need not conform to the std::string api completely (ex. does not need
440	* to implement append()).
441	*/
442	template <class T>
443	struct IsSizableString {
444	enum {
445	value = IsSomeString<T>::value \|\| std::is_same<T, StringPiece>::value
446	};
447	};
448
449	template <class Iterator>
450	struct IsSizableStringContainerIterator
451	: IsSizableString<typename std::iterator_traits<Iterator>::value_type> {};
452
453	template <class Delim, class Iterator, class String>
454	void internalJoinAppend(
455	Delim delimiter,
456	Iterator begin,
457	Iterator end,
458	String& output) {
459	assert(begin != end);
460	if (std::is_same<Delim, StringPiece>::value && delimSize(delimiter) == `1`) {
461	internalJoinAppend(delimFront(delimiter), begin, end, output);
462	return;
463	}
464	toAppend(*begin, &output);
465	while (++begin != end) {
466	toAppend(delimiter, *begin, &output);
467	}
468	}
469
470	template <class Delim, class Iterator, class String>
471	typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
472	internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
473	output.clear();
474	if (begin == end) {
475	return;
476	}
477	const size_t dsize = delimSize(delimiter);
478	Iterator it = begin;
479	size_t size = it->size();
480	while (++it != end) {
481	size += dsize + it->size();
482	}
483	output.reserve(size);
484	internalJoinAppend(delimiter, begin, end, output);
485	}
486
487	template <class Delim, class Iterator, class String>
488	typename std::enable_if<
489	!IsSizableStringContainerIterator<Iterator>::value>::type
490	internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
491	output.clear();
492	if (begin == end) {
493	return;
494	}
495	internalJoinAppend(delimiter, begin, end, output);
496	}
497
498	} // namespace detail
499
500	template <class Delim, class Iterator, class String>
501	void join(
502	const Delim& delimiter,
503	Iterator begin,
504	Iterator end,
505	String& output) {
506	detail::internalJoin(detail::prepareDelim(delimiter), begin, end, output);
507	}
508
509	template <class OutputString>
510	void backslashify(
511	folly::StringPiece input,
512	OutputString& output,
513	bool hex_style) {
514	static const char hexValues[] = "0123456789abcdef";
515	output.clear();
516	output.reserve(`3` * input.size());
517	for (unsigned char c : input) {
518	// less than space or greater than '~' are considered unprintable
519	if (c < `0x20` \|\| c > `0x7e` \|\| c == `'\\'`) {
520	bool hex_append = false;
521	output.push_back(`'\\'`);
522	if (hex_style) {
523	hex_append = true;
524	} else {
525	if (c == `'\r'`) {
526	output += `'r'`;
527	} else if (c == `'\n'`) {
528	output += `'n'`;
529	} else if (c == `'\t'`) {
530	output += `'t'`;
531	} else if (c == `'\a'`) {
532	output += `'a'`;
533	} else if (c == `'\b'`) {
534	output += `'b'`;
535	} else if (c == `'\0'`) {
536	output += `'0'`;
537	} else if (c == `'\\'`) {
538	output += `'\\'`;
539	} else {
540	hex_append = true;
541	}
542	}
543	if (hex_append) {
544	output.push_back(`'x'`);
545	output.push_back(hexValues[(c >> `4`) & `0xf`]);
546	output.push_back(hexValues[c & `0xf`]);
547	}
548	} else {
549	output += c;
550	}
551	}
552	}
553
554	template <class String1, class String2>
555	void humanify(const String1& input, String2& output) {
556	size_t numUnprintable = `0`;
557	size_t numPrintablePrefix = `0`;
558	for (unsigned char c : input) {
559	if (c < `0x20` \|\| c > `0x7e` \|\| c == `'\\'`) {
560	++numUnprintable;
561	}
562	if (numUnprintable == `0`) {
563	++numPrintablePrefix;
564	}
565	}
566
567	// hexlify doubles a string's size; backslashify can potentially
568	// explode it by 4x. Now, the printable range of the ascii
569	// "spectrum" is around 95 out of 256 values, so a "random" binary
570	// string should be around 60% unprintable. We use a 50% hueristic
571	// here, so if a string is 60% unprintable, then we just use hex
572	// output. Otherwise we backslash.
573	//
574	// UTF8 is completely ignored; as a result, utf8 characters will
575	// likely be \x escaped (since most common glyphs fit in two bytes).
576	// This is a tradeoff of complexity/speed instead of a convenience
577	// that likely would rarely matter. Moreover, this function is more
578	// about displaying underlying bytes, not about displaying glyphs
579	// from languages.
580	if (numUnprintable == `0`) {
581	output = input;
582	} else if (`5` * numUnprintable >= `3` * input.size()) {
583	// However! If we have a "meaningful" prefix of printable
584	// characters, say 20% of the string, we backslashify under the
585	// assumption viewing the prefix as ascii is worth blowing the
586	// output size up a bit.
587	if (`5` * numPrintablePrefix >= input.size()) {
588	backslashify(input, output);
589	} else {
590	output = "0x";
591	hexlify(input, output, true / append output /);
592	}
593	} else {
594	backslashify(input, output);
595	}
596	}
597
598	template <class InputString, class OutputString>
599	bool hexlify(
600	const InputString& input,
601	OutputString& output,
602	bool append_output) {
603	if (!append_output) {
604	output.clear();
605	}
606
607	static char hexValues[] = "0123456789abcdef";
608	auto j = output.size();
609	output.resize(`2` * input.size() + output.size());
610	for (size_t i = `0`; i < input.size(); ++i) {
611	int ch = input[i];
612	output[j++] = hexValues[(ch >> `4`) & `0xf`];
613	output[j++] = hexValues[ch & `0xf`];
614	}
615	return true;
616	}
617
618	template <class InputString, class OutputString>
619	bool unhexlify(const InputString& input, OutputString& output) {
620	if (input.size() % `2` != `0`) {
621	return false;
622	}
623	output.resize(input.size() / `2`);
624	int j = `0`;
625
626	for (size_t i = `0`; i < input.size(); i += `2`) {
627	int highBits = detail::hexTable [static_cast<uint8_t>(input[i])];
628	int lowBits = detail::hexTable [static_cast<uint8_t>(input[i + `1`])];
629	if ((highBits \| lowBits) & `0x10`) {
630	// One of the characters wasn't a hex digit
631	return false;
632	}
633	output[j++] = (highBits << `4`) + lowBits;
634	}
635	return true;
636	}
637
638	namespace detail {
639	/**
640	* Hex-dump at most 16 bytes starting at offset from a memory area of size
641	* bytes. Return the number of bytes actually dumped.
642	*/
643	size_t
644	hexDumpLine(const void* ptr, size_t offset, size_t size, std::string& line);
645	} // namespace detail
646
647	template <class OutIt>
648	void hexDump(const void* ptr, size_t size, OutIt out) {
649	size_t offset = `0`;
650	std::string line;
651	while (offset < size) {
652	offset += detail::hexDumpLine(ptr, offset, size, line);
653	*out++ = line;
654	}
655	}
656
657	} // namespace folly
658

Browse the source code of folly/String-inl.h