encoding.c++ source code [ClickHouse/contrib/capnproto/c++/src/kj/encoding.c++]

1	// Copyright (c) 2017 Cloudflare, Inc.; Sandstorm Development Group, Inc.; and contributors
2	// Licensed under the MIT License:
3	//
4	// Permission is hereby granted, free of charge, to any person obtaining a copy
5	// of this software and associated documentation files (the "Software"), to deal
6	// in the Software without restriction, including without limitation the rights
7	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8	// copies of the Software, and to permit persons to whom the Software is
9	// furnished to do so, subject to the following conditions:
10	//
11	// The above copyright notice and this permission notice shall be included in
12	// all copies or substantial portions of the Software.
13	//
14	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20	// THE SOFTWARE.
21
22	#include "encoding.h"
23	#include "vector.h"
24	#include "debug.h"
25
26	namespace kj {
27
28	namespace {
29
30	#define GOTO_ERROR_IF(cond) if (KJ_UNLIKELY(cond)) goto error
31
32	inline void addChar32(Vector<char16_t>& vec, char32_t u) {
33	// Encode as surrogate pair.
34	u -= `0x10000`;
35	vec.add(`0xd800` \| (u >> `10`));
36	vec.add(`0xdc00` \| (u & `0x03ff`));
37	}
38
39	inline void addChar32(Vector<char32_t>& vec, char32_t u) {
40	vec.add(u);
41	}
42
43	template <typename T>
44	EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
45	Vector<T> result(text.size() + nulTerminate);
46	bool hadErrors = false;
47
48	size_t i = `0`;
49	while (i < text.size()) {
50	byte c = text [i++];
51	if (c < `0x80`) {
52	// 0xxxxxxx -- ASCII
53	result.add(c);
54	continue;
55	} else if (KJ_UNLIKELY(c < `0xc0`)) {
56	// 10xxxxxx -- malformed continuation byte
57	goto error;
58	} else if (c < `0xe0`) {
59	// 110xxxxx -- 2-byte
60	byte c2;
61	GOTO_ERROR_IF(i == text.size() \|\| ((c2 = text[i]) & `0xc0`) != `0x80`); ++i;
62	char16_t u = (static_cast<char16_t>(c & `0x1f`) << `6`)
63	\| (static_cast<char16_t>(c2 & `0x3f`) );
64
65	// Disallow overlong sequence.
66	GOTO_ERROR_IF(u < `0x80`);
67
68	result.add(u);
69	continue;
70	} else if (c < `0xf0`) {
71	// 1110xxxx -- 3-byte
72	byte c2, c3;
73	GOTO_ERROR_IF(i == text.size() \|\| ((c2 = text[i]) & `0xc0`) != `0x80`); ++i;
74	GOTO_ERROR_IF(i == text.size() \|\| ((c3 = text[i]) & `0xc0`) != `0x80`); ++i;
75	char16_t u = (static_cast<char16_t>(c & `0x0f`) << `12`)
76	\| (static_cast<char16_t>(c2 & `0x3f`) << `6`)
77	\| (static_cast<char16_t>(c3 & `0x3f`) );
78
79	// Disallow overlong sequence.
80	GOTO_ERROR_IF(u < `0x0800`);
81
82	// Flag surrogate pair code points as errors, but allow them through.
83	if (KJ_UNLIKELY((u & `0xf800`) == `0xd800`)) {
84	if (result.size() > `0` &&
85	(u & `0xfc00`) == `0xdc00` &&
86	(result.back() & `0xfc00`) == `0xd800`) {
87	// Whoops, the previous* character was also an invalid surrogate, and if we add this*
88	// one too, they'll form a valid surrogate pair. If we allowed this, then it would mean
89	// invalid UTF-8 round-tripped to UTF-16 and back could actually change meaning entirely.
90	// OTOH, the reason we allow dangling surrogates is to allow invalid UTF-16 to round-trip
91	// to UTF-8 without loss, but if the original UTF-16 had a valid surrogate pair, it would
92	// have been encoded as a valid single UTF-8 codepoint, not as separate UTF-8 codepoints
93	// for each surrogate.
94	goto error;
95	}
96
97	hadErrors = true;
98	}
99
100	result.add(u);
101	continue;
102	} else if (c < `0xf8`) {
103	// 11110xxx -- 4-byte
104	byte c2, c3, c4;
105	GOTO_ERROR_IF(i == text.size() \|\| ((c2 = text[i]) & `0xc0`) != `0x80`); ++i;
106	GOTO_ERROR_IF(i == text.size() \|\| ((c3 = text[i]) & `0xc0`) != `0x80`); ++i;
107	GOTO_ERROR_IF(i == text.size() \|\| ((c4 = text[i]) & `0xc0`) != `0x80`); ++i;
108	char32_t u = (static_cast<char32_t>(c & `0x07`) << `18`)
109	\| (static_cast<char32_t>(c2 & `0x3f`) << `12`)
110	\| (static_cast<char32_t>(c3 & `0x3f`) << `6`)
111	\| (static_cast<char32_t>(c4 & `0x3f`) );
112
113	// Disallow overlong sequence.
114	GOTO_ERROR_IF(u < `0x10000`);
115
116	// Unicode ends at U+10FFFF
117	GOTO_ERROR_IF(u >= `0x110000`);
118
119	addChar32(result, u);
120	continue;
121	} else {
122	// 5-byte and 6-byte sequences are not legal as they'd result in codepoints outside the
123	// range of Unicode.
124	goto error;
125	}
126
127	error:
128	result.add(`0xfffd`);
129	hadErrors = true;
130	// Ignore all continuation bytes.
131	while (i < text.size() && (text [i] & `0xc0`) == `0x80`) {
132	++i;
133	}
134	}
135
136	if (nulTerminate) result.add(`0`);
137
138	return { result.releaseAsArray(), hadErrors };
139	}
140
141	} // namespace
142
143	EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
144	return encodeUtf<char16_t>(text, nulTerminate);
145	}
146
147	EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
148	return encodeUtf<char32_t>(text, nulTerminate);
149	}
150
151	EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
152	Vector<char> result(utf16.size() + `1`);
153	bool hadErrors = false;
154
155	size_t i = `0`;
156	while (i < utf16.size()) {
157	char16_t u = utf16 [i++];
158
159	if (u < `0x80`) {
160	result.add(u);
161	continue;
162	} else if (u < `0x0800`) {
163	result.addAll<std::initializer_list<char>>({
164	static_cast<char>(((u >> `6`) ) \| `0xc0`),
165	static_cast<char>(((u ) & `0x3f`) \| `0x80`)
166	});
167	continue;
168	} else if ((u & `0xf800`) == `0xd800`) {
169	// surrogate pair
170	char16_t u2;
171	if (KJ_UNLIKELY(i == utf16.size() // missing second half
172	\|\| (u & `0x0400`) != `0` // first half in wrong range
173	\|\| ((u2 = utf16[i]) & `0xfc00`) != `0xdc00`)) { // second half in wrong range
174	hadErrors = true;
175	goto threeByte;
176	}
177	++i;
178
179	char32_t u32 = (((u & `0x03ff`) << `10`) \| (u2 & `0x03ff`)) + `0x10000`;
180	result.addAll<std::initializer_list<char>>({
181	static_cast<char>(((u32 >> `18`) ) \| `0xf0`),
182	static_cast<char>(((u32 >> `12`) & `0x3f`) \| `0x80`),
183	static_cast<char>(((u32 >> `6`) & `0x3f`) \| `0x80`),
184	static_cast<char>(((u32 ) & `0x3f`) \| `0x80`)
185	});
186	continue;
187	} else {
188	threeByte:
189	result.addAll<std::initializer_list<char>>({
190	static_cast<char>(((u >> `12`) ) \| `0xe0`),
191	static_cast<char>(((u >> `6`) & `0x3f`) \| `0x80`),
192	static_cast<char>(((u ) & `0x3f`) \| `0x80`)
193	});
194	continue;
195	}
196	}
197
198	result.add(`0`);
199	return { String (result.releaseAsArray()), hadErrors };
200	}
201
202	EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
203	Vector<char> result(utf16.size() + `1`);
204	bool hadErrors = false;
205
206	size_t i = `0`;
207	while (i < utf16.size()) {
208	char32_t u = utf16 [i++];
209
210	if (u < `0x80`) {
211	result.add(u);
212	continue;
213	} else if (u < `0x0800`) {
214	result.addAll<std::initializer_list<char>>({
215	static_cast<char>(((u >> `6`) ) \| `0xc0`),
216	static_cast<char>(((u ) & `0x3f`) \| `0x80`)
217	});
218	continue;
219	} else if (u < `0x10000`) {
220	if (KJ_UNLIKELY((u & `0xfffff800`) == `0xd800`)) {
221	// no surrogates allowed in utf-32
222	hadErrors = true;
223	}
224	result.addAll<std::initializer_list<char>>({
225	static_cast<char>(((u >> `12`) ) \| `0xe0`),
226	static_cast<char>(((u >> `6`) & `0x3f`) \| `0x80`),
227	static_cast<char>(((u ) & `0x3f`) \| `0x80`)
228	});
229	continue;
230	} else {
231	GOTO_ERROR_IF(u >= `0x110000`); // outside Unicode range
232	result.addAll<std::initializer_list<char>>({
233	static_cast<char>(((u >> `18`) ) \| `0xf0`),
234	static_cast<char>(((u >> `12`) & `0x3f`) \| `0x80`),
235	static_cast<char>(((u >> `6`) & `0x3f`) \| `0x80`),
236	static_cast<char>(((u ) & `0x3f`) \| `0x80`)
237	});
238	continue;
239	}
240
241	error:
242	result.addAll(StringPtr (u8"\ufffd"));
243	hadErrors = true;
244	}
245
246	result.add(`0`);
247	return { String (result.releaseAsArray()), hadErrors };
248	}
249
250	namespace {
251
252	#if __GNUC__ >= 8 && !__clang__
253	// GCC 8's new class-memaccess warning rightly dislikes the following hacks, but we're really sure
254	// we want to allow them so disable the warning.
255	#pragma GCC diagnostic push
256	#pragma GCC diagnostic ignored "-Wclass-memaccess"
257	#endif
258
259	template <typename To, typename From>
260	Array<To> coerceTo(Array<From>&& array) {
261	static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
262	Array<wchar_t> result;
263	memcpy(&result, &array, sizeof(array));
264	memset(&array, `0`, sizeof(array));
265	return result;
266	}
267
268	template <typename To, typename From>
269	ArrayPtr<To> coerceTo(ArrayPtr<From> array) {
270	static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
271	return arrayPtr(reinterpret_cast<To*>(array.begin()), array.size());
272	}
273
274	template <typename To, typename From>
275	EncodingResult<Array<To>> coerceTo(EncodingResult<Array<From>>&& result) {
276	return { coerceTo<To>(Array<From>(kj::mv(result))), result.hadErrors };
277	}
278
279	#if __GNUC__ >= 8 && !__clang__
280	#pragma GCC diagnostic pop
281	#endif
282
283	template <size_t s>
284	struct WideConverter;
285
286	template <>
287	struct WideConverter<sizeof(char)> {
288	typedef char Type;
289
290	static EncodingResult<Array<char>> encode(ArrayPtr<const char> text, bool nulTerminate) {
291	auto result = heapArray<char>(text.size() + nulTerminate);
292	memcpy(result.begin(), text.begin(), text.size());
293	if (nulTerminate) result.back() = `0`;
294	return { kj::mv(result), false };
295	}
296
297	static EncodingResult<kj::String> decode(ArrayPtr<const char> text) {
298	return { kj::heapString(text), false };
299	}
300	};
301
302	template <>
303	struct WideConverter<sizeof(char16_t)> {
304	typedef char16_t Type;
305
306	static inline EncodingResult<Array<char16_t>> encode(
307	ArrayPtr<const char> text, bool nulTerminate) {
308	return encodeUtf16(text, nulTerminate);
309	}
310
311	static inline EncodingResult<kj::String> decode(ArrayPtr<const char16_t> text) {
312	return decodeUtf16(text);
313	}
314	};
315
316	template <>
317	struct WideConverter<sizeof(char32_t)> {
318	typedef char32_t Type;
319
320	static inline EncodingResult<Array<char32_t>> encode(
321	ArrayPtr<const char> text, bool nulTerminate) {
322	return encodeUtf32(text, nulTerminate);
323	}
324
325	static inline EncodingResult<kj::String> decode(ArrayPtr<const char32_t> text) {
326	return decodeUtf32(text);
327	}
328	};
329
330	} // namespace
331
332	EncodingResult<Array<wchar_t>> encodeWideString(ArrayPtr<const char> text, bool nulTerminate) {
333	return coerceTo<wchar_t>(WideConverter<sizeof(wchar_t)>::encode(text, nulTerminate));
334	}
335	EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide) {
336	using Converter = WideConverter<sizeof(wchar_t)>;
337	return Converter::decode(coerceTo<const Converter::Type>(wide));
338	}
339
340	// =======================================================================================
341
342	namespace {
343
344	const char HEX_DIGITS[] = "0123456789abcdef";
345	// Maps integer in the range [0,16) to a hex digit.
346
347	const char HEX_DIGITS_URI[] = "0123456789ABCDEF";
348	// RFC 3986 section 2.1 says "For consistency, URI producers and normalizers should use uppercase
349	// hexadecimal digits for all percent-encodings.
350
351	static Maybe<uint> tryFromHexDigit(char c) {
352	if (`'0'` <= c && c <= `'9'`) {
353	return c - `'0'`;
354	} else if (`'a'` <= c && c <= `'f'`) {
355	return c - (`'a'` - `10`);
356	} else if (`'A'` <= c && c <= `'F'`) {
357	return c - (`'A'` - `10`);
358	} else {
359	return nullptr;
360	}
361	}
362
363	static Maybe<uint> tryFromOctDigit(char c) {
364	if (`'0'` <= c && c <= `'7'`) {
365	return c - `'0'`;
366	} else {
367	return nullptr;
368	}
369	}
370
371	} // namespace
372
373	String encodeHex(ArrayPtr<const byte> input) {
374	return strArray(KJ_MAP(b, input) {
375	return heapArray<char>({HEX_DIGITS[b/`16`], HEX_DIGITS[b%`16`]});
376	}, "");
377	}
378
379	EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) {
380	auto result = heapArray<byte>(text.size() / `2`);
381	bool hadErrors = text.size() % `2`;
382
383	for (auto i: kj::indices(result)) {
384	byte b = `0`;
385	KJ_IF_MAYBE(d1, tryFromHexDigit(text[i*`2`])) {
386	b = *d1 << `4`;
387	} else {
388	hadErrors = true;
389	}
390	KJ_IF_MAYBE(d2, tryFromHexDigit(text[i*`2`+`1`])) {
391	b \|= *d2;
392	} else {
393	hadErrors = true;
394	}
395	result [i] = b;
396	}
397
398	return { kj::mv(result), hadErrors };
399	}
400
401	String encodeUriComponent(ArrayPtr<const byte> bytes) {
402	Vector<char> result(bytes.size() + `1`);
403	for (byte b: bytes) {
404	if ((`'A'` <= b && b <= `'Z'`) \|\|
405	(`'a'` <= b && b <= `'z'`) \|\|
406	(`'0'` <= b && b <= `'9'`) \|\|
407	b == `'-'` \|\| b == `'_'` \|\| b == `'.'` \|\| b == `'!'` \|\| b == `'~'` \|\| b == `'*'` \|\| b == `'\''` \|\|
408	b == `'('` \|\| b == `')'`) {
409	result.add(b);
410	} else {
411	result.add(`'%'`);
412	result.add(HEX_DIGITS_URI[b/`16`]);
413	result.add(HEX_DIGITS_URI[b%`16`]);
414	}
415	}
416	result.add(`'\0'`);
417	return String (result.releaseAsArray());
418	}
419
420	String encodeUriFragment(ArrayPtr<const byte> bytes) {
421	Vector<char> result(bytes.size() + `1`);
422	for (byte b: bytes) {
423	if ((`'?'` <= b && b <= `'_'`) \|\| // covers A-Z
424	(`'a'` <= b && b <= `'~'`) \|\| // covers a-z
425	(`'&'` <= b && b <= `';'`) \|\| // covers 0-9
426	b == `'!'` \|\| b == `'='` \|\| b == `'#'` \|\| b == `'$'`) {
427	result.add(b);
428	} else {
429	result.add(`'%'`);
430	result.add(HEX_DIGITS_URI[b/`16`]);
431	result.add(HEX_DIGITS_URI[b%`16`]);
432	}
433	}
434	result.add(`'\0'`);
435	return String (result.releaseAsArray());
436	}
437
438	String encodeUriPath(ArrayPtr<const byte> bytes) {
439	Vector<char> result(bytes.size() + `1`);
440	for (byte b: bytes) {
441	if ((`'@'` <= b && b <= `'['`) \|\| // covers A-Z
442	(`'a'` <= b && b <= `'z'`) \|\|
443	(`'0'` <= b && b <= `';'`) \|\| // covers 0-9
444	(`'&'` <= b && b <= `'.'`) \|\|
445	b == `'_'` \|\| b == `'!'` \|\| b == `'='` \|\| b == `']'` \|\|
446	b == `'^'` \|\| b == `'\|'` \|\| b == `'~'` \|\| b == `'$'`) {
447	result.add(b);
448	} else {
449	result.add(`'%'`);
450	result.add(HEX_DIGITS_URI[b/`16`]);
451	result.add(HEX_DIGITS_URI[b%`16`]);
452	}
453	}
454	result.add(`'\0'`);
455	return String (result.releaseAsArray());
456	}
457
458	String encodeUriUserInfo(ArrayPtr<const byte> bytes) {
459	Vector<char> result(bytes.size() + `1`);
460	for (byte b: bytes) {
461	if ((`'A'` <= b && b <= `'Z'`) \|\|
462	(`'a'` <= b && b <= `'z'`) \|\|
463	(`'0'` <= b && b <= `'9'`) \|\|
464	(`'&'` <= b && b <= `'.'`) \|\|
465	b == `'_'` \|\| b == `'!'` \|\| b == `'~'` \|\| b == `'$'`) {
466	result.add(b);
467	} else {
468	result.add(`'%'`);
469	result.add(HEX_DIGITS_URI[b/`16`]);
470	result.add(HEX_DIGITS_URI[b%`16`]);
471	}
472	}
473	result.add(`'\0'`);
474	return String (result.releaseAsArray());
475	}
476
477	String encodeWwwForm(ArrayPtr<const byte> bytes) {
478	Vector<char> result(bytes.size() + `1`);
479	for (byte b: bytes) {
480	if ((`'A'` <= b && b <= `'Z'`) \|\|
481	(`'a'` <= b && b <= `'z'`) \|\|
482	(`'0'` <= b && b <= `'9'`) \|\|
483	b == `'-'` \|\| b == `'_'` \|\| b == `'.'` \|\| b == `'*'`) {
484	result.add(b);
485	} else if (b == `' '`) {
486	result.add(`'+'`);
487	} else {
488	result.add(`'%'`);
489	result.add(HEX_DIGITS_URI[b/`16`]);
490	result.add(HEX_DIGITS_URI[b%`16`]);
491	}
492	}
493	result.add(`'\0'`);
494	return String (result.releaseAsArray());
495	}
496
497	EncodingResult<Array<byte>> decodeBinaryUriComponent(
498	ArrayPtr<const char> text, DecodeUriOptions options) {
499	Vector<byte> result(text.size() + options.nulTerminate);
500	bool hadErrors = false;
501
502	const char* ptr = text.begin();
503	const char* end = text.end();
504	while (ptr < end) {
505	if (*ptr == `'%'`) {
506	++ptr;
507
508	if (ptr == end) {
509	hadErrors = true;
510	} else KJ_IF_MAYBE(d1, tryFromHexDigit(*ptr)) {
511	byte b = *d1;
512	++ptr;
513	if (ptr == end) {
514	hadErrors = true;
515	} else KJ_IF_MAYBE(d2, tryFromHexDigit(*ptr)) {
516	b = (b << `4`) \| *d2;
517	++ptr;
518	} else {
519	hadErrors = true;
520	}
521	result.add(b);
522	} else {
523	hadErrors = true;
524	}
525	} else if (options.plusToSpace && *ptr == `'+'`) {
526	++ptr;
527	result.add(`' '`);
528	} else {
529	result.add(*ptr++);
530	}
531	}
532
533	if (options.nulTerminate) result.add(`0`);
534	return { result.releaseAsArray(), hadErrors };
535	}
536
537	// =======================================================================================
538
539	String encodeCEscape(ArrayPtr<const byte> bytes) {
540	Vector<char> escaped(bytes.size());
541
542	for (byte b: bytes) {
543	switch (b) {
544	case `'\a'`: escaped.addAll(StringPtr ("\\a")); break;
545	case `'\b'`: escaped.addAll(StringPtr ("\\b")); break;
546	case `'\f'`: escaped.addAll(StringPtr ("\\f")); break;
547	case `'\n'`: escaped.addAll(StringPtr ("\\n")); break;
548	case `'\r'`: escaped.addAll(StringPtr ("\\r")); break;
549	case `'\t'`: escaped.addAll(StringPtr ("\\t")); break;
550	case `'\v'`: escaped.addAll(StringPtr ("\\v")); break;
551	case `'\''`: escaped.addAll(StringPtr ("\\\'")); break;
552	case `'\"'`: escaped.addAll(StringPtr ("\\\"")); break;
553	case `'\\'`: escaped.addAll(StringPtr ("\\\\")); break;
554	default:
555	if (b < `0x20` \|\| b == `0x7f`) {
556	// Use octal escape, not hex, because hex escapes technically have no length limit and
557	// so can create ambiguity with subsequent characters.
558	escaped.add(`'\\'`);
559	escaped.add(HEX_DIGITS[b / `64`]);
560	escaped.add(HEX_DIGITS[(b / `8`) % `8`]);
561	escaped.add(HEX_DIGITS[b % `8`]);
562	} else {
563	escaped.add(b);
564	}
565	break;
566	}
567	}
568
569	escaped.add(`0`);
570	return String (escaped.releaseAsArray());
571	}
572
573	EncodingResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) {
574	Vector<byte> result(text.size() + nulTerminate);
575	bool hadErrors = false;
576
577	size_t i = `0`;
578	while (i < text.size()) {
579	char c = text [i++];
580	if (c == `'\\'`) {
581	if (i == text.size()) {
582	hadErrors = true;
583	continue;
584	}
585	char c2 = text [i++];
586	switch (c2) {
587	case `'a'` : result.add(`'\a'`); break;
588	case `'b'` : result.add(`'\b'`); break;
589	case `'f'` : result.add(`'\f'`); break;
590	case `'n'` : result.add(`'\n'`); break;
591	case `'r'` : result.add(`'\r'`); break;
592	case `'t'` : result.add(`'\t'`); break;
593	case `'v'` : result.add(`'\v'`); break;
594	case `'\''`: result.add(`'\''`); break;
595	case `'\"'`: result.add(`'\"'`); break;
596	case `'\\'`: result.add(`'\\'`); break;
597
598	case `'0'`:
599	case `'1'`:
600	case `'2'`:
601	case `'3'`:
602	case `'4'`:
603	case `'5'`:
604	case `'6'`:
605	case `'7'`: {
606	uint value = c2 - `'0'`;
607	for (uint j = `0`; j < `2` && i < text.size(); j++) {
608	KJ_IF_MAYBE(d, tryFromOctDigit(text[i])) {
609	++i;
610	value = (value << `3`) \| *d;
611	} else {
612	break;
613	}
614	}
615	if (value >= `0x100`) hadErrors = true;
616	result.add(value);
617	break;
618	}
619
620	case `'x'`: {
621	uint value = `0`;
622	while (i < text.size()) {
623	KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
624	++i;
625	value = (value << `4`) \| *d;
626	} else {
627	break;
628	}
629	}
630	if (value >= `0x100`) hadErrors = true;
631	result.add(value);
632	break;
633	}
634
635	case `'u'`: {
636	char16_t value = `0`;
637	for (uint j = `0`; j < `4`; j++) {
638	if (i == text.size()) {
639	hadErrors = true;
640	break;
641	} else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
642	++i;
643	value = (value << `4`) \| *d;
644	} else {
645	hadErrors = true;
646	break;
647	}
648	}
649	auto utf = decodeUtf16(arrayPtr(&value, `1`));
650	if (utf.hadErrors) hadErrors = true;
651	result.addAll(utf.asBytes());
652	break;
653	}
654
655	case `'U'`: {
656	char32_t value = `0`;
657	for (uint j = `0`; j < `8`; j++) {
658	if (i == text.size()) {
659	hadErrors = true;
660	break;
661	} else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
662	++i;
663	value = (value << `4`) \| *d;
664	} else {
665	hadErrors = true;
666	break;
667	}
668	}
669	auto utf = decodeUtf32(arrayPtr(&value, `1`));
670	if (utf.hadErrors) hadErrors = true;
671	result.addAll(utf.asBytes());
672	break;
673	}
674
675	default:
676	result.add(c2);
677	}
678	} else {
679	result.add(c);
680	}
681	}
682
683	if (nulTerminate) result.add(`0`);
684	return { result.releaseAsArray(), hadErrors };
685	}
686
687	// =======================================================================================
688	// This code is derived from libb64 which has been placed in the public domain.
689	// For details, see http://sourceforge.net/projects/libb64
690
691	// -------------------------------------------------------------------
692	// Encoder
693
694	namespace {
695
696	typedef enum {
697	step_A, step_B, step_C
698	} base64_encodestep;
699
700	typedef struct {
701	base64_encodestep step;
702	char result;
703	int stepcount;
704	} base64_encodestate;
705
706	const int CHARS_PER_LINE = `72`;
707
708	void base64_init_encodestate(base64_encodestate* state_in) {
709	state_in->step = step_A;
710	state_in->result = `0`;
711	state_in->stepcount = `0`;
712	}
713
714	char base64_encode_value(char value_in) {
715	static const char* encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
716	if (value_in > `63`) return `'='`;
717	return encoding[(int)value_in];
718	}
719
720	int base64_encode_block(const char* plaintext_in, int length_in,
721	char* code_out, base64_encodestate* state_in, bool breakLines) {
722	const char* plainchar = plaintext_in;
723	const char* const plaintextend = plaintext_in + length_in;
724	char* codechar = code_out;
725	char result;
726	char fragment;
727
728	result = state_in->result;
729
730	switch (state_in->step) {
731	while (`1`) {
732	case step_A:
733	if (plainchar == plaintextend) {
734	state_in->result = result;
735	state_in->step = step_A;
736	return codechar - code_out;
737	}
738	fragment = *plainchar++;
739	result = (fragment & `0x0fc`) >> `2`;
740	*codechar++ = base64_encode_value(result);
741	result = (fragment & `0x003`) << `4`;
742	// fallthrough
743	case step_B:
744	if (plainchar == plaintextend) {
745	state_in->result = result;
746	state_in->step = step_B;
747	return codechar - code_out;
748	}
749	fragment = *plainchar++;
750	result \|= (fragment & `0x0f0`) >> `4`;
751	*codechar++ = base64_encode_value(result);
752	result = (fragment & `0x00f`) << `2`;
753	// fallthrough
754	case step_C:
755	if (plainchar == plaintextend) {
756	state_in->result = result;
757	state_in->step = step_C;
758	return codechar - code_out;
759	}
760	fragment = *plainchar++;
761	result \|= (fragment & `0x0c0`) >> `6`;
762	*codechar++ = base64_encode_value(result);
763	result = (fragment & `0x03f`) >> `0`;
764	*codechar++ = base64_encode_value(result);
765
766	++(state_in->stepcount);
767	if (breakLines && state_in->stepcount == CHARS_PER_LINE/`4`) {
768	*codechar++ = `'\n'`;
769	state_in->stepcount = `0`;
770	}
771	}
772	}
773	/ control should not reach here /
774	return codechar - code_out;
775	}
776
777	int base64_encode_blockend(char* code_out, base64_encodestate* state_in, bool breakLines) {
778	char* codechar = code_out;
779
780	switch (state_in->step) {
781	case step_B:
782	*codechar++ = base64_encode_value(state_in->result);
783	*codechar++ = `'='`;
784	*codechar++ = `'='`;
785	++state_in->stepcount;
786	break;
787	case step_C:
788	*codechar++ = base64_encode_value(state_in->result);
789	*codechar++ = `'='`;
790	++state_in->stepcount;
791	break;
792	case step_A:
793	break;
794	}
795	if (breakLines && state_in->stepcount > `0`) {
796	*codechar++ = `'\n'`;
797	}
798
799	return codechar - code_out;
800	}
801
802	} // namespace
803
804	String encodeBase64(ArrayPtr<const byte> input, bool breakLines) {
805	/ set up a destination buffer large enough to hold the encoded data /
806	// equivalent to ceil(input.size() / 3) 4*
807	auto numChars = (input.size() + `2`) / `3` * `4`;
808	if (breakLines) {
809	// Add space for newline characters.
810	uint lineCount = numChars / CHARS_PER_LINE;
811	if (numChars % CHARS_PER_LINE > `0`) {
812	// Partial line.
813	++lineCount;
814	}
815	numChars = numChars + lineCount;
816	}
817	auto output = heapString(numChars);
818	/ keep track of our encoded position /
819	char* c = output.begin();
820	/ store the number of bytes encoded by a single call /
821	int cnt = `0`;
822	size_t total = `0`;
823	/ we need an encoder state /
824	base64_encodestate s;
825
826	/---------- START ENCODING ----------/
827	/ initialise the encoder state /
828	base64_init_encodestate(&s);
829	/ gather data from the input and send it to the output /
830	cnt = base64_encode_block((const char *)input.begin(), input.size(), c, &s, breakLines);
831	c += cnt;
832	total += cnt;
833
834	/ since we have encoded the entire input string, we know that*
835	there is no more input data; finalise the encoding /*
836	cnt = base64_encode_blockend(c, &s, breakLines);
837	c += cnt;
838	total += cnt;
839	/---------- STOP ENCODING ----------/
840
841	KJ_ASSERT(total == output.size(), total, output.size());
842
843	return output;
844	}
845
846	// -------------------------------------------------------------------
847	// Decoder
848
849	namespace {
850
851	typedef enum {
852	step_a, step_b, step_c, step_d
853	} base64_decodestep;
854
855	typedef struct {
856	bool hadErrors = false;
857	size_t nPaddingBytesSeen = `0`;
858	// Output state. `nPaddingBytesSeen` is not guaranteed to be correct if `hadErrors` is true. It is
859	// included in the state purely to preserve the streaming capability of the algorithm while still
860	// checking for errors correctly (consider chunk 1 = "abc=", chunk 2 = "d").
861
862	base64_decodestep step = step_a;
863	char plainchar = `0`;
864	} base64_decodestate;
865
866	int base64_decode_value(char value_in) {
867	// Returns either the fragment value or: -1 on whitespace, -2 on padding, -3 on invalid input.
868	//
869	// Note that the original libb64 implementation used -1 for invalid input, -2 on padding -- this
870	// new scheme allows for some simpler error checks in steps A and B.
871
872	static const signed char decoding[] = {
873	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`1`,-`1`,-`3`,-`1`,-`1`,-`3`,-`3`,
874	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
875	-`1`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,`62`,-`3`,-`3`,-`3`,`63`,
876	`52`,`53`,`54`,`55`,`56`,`57`,`58`,`59`, `60`,`61`,-`3`,-`3`,-`3`,-`2`,-`3`,-`3`,
877	-`3`, `0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`,`10`,`11`,`12`,`13`,`14`,
878	`15`,`16`,`17`,`18`,`19`,`20`,`21`,`22`, `23`,`24`,`25`,-`3`,-`3`,-`3`,-`3`,-`3`,
879	-`3`,`26`,`27`,`28`,`29`,`30`,`31`,`32`, `33`,`34`,`35`,`36`,`37`,`38`,`39`,`40`,
880	`41`,`42`,`43`,`44`,`45`,`46`,`47`,`48`, `49`,`50`,`51`,-`3`,-`3`,-`3`,-`3`,-`3`,
881
882	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
883	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
884	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
885	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
886	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
887	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
888	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
889	-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`, -`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,-`3`,
890	};
891	static_assert(sizeof(decoding) == `256`, "base64 decoding table size error");
892	return decoding[(unsigned char)value_in];
893	}
894
895	int base64_decode_block(const char* code_in, const int length_in,
896	char* plaintext_out, base64_decodestate* state_in) {
897	const char* codechar = code_in;
898	char* plainchar = plaintext_out;
899	signed char fragment;
900
901	if (state_in->step != step_a) {
902	*plainchar = state_in->plainchar;
903	}
904
905	#define ERROR_IF(predicate) state_in->hadErrors = state_in->hadErrors \|\| (predicate)
906
907	switch (state_in->step)
908	{
909	while (`1`)
910	{
911	case step_a:
912	do {
913	if (codechar == code_in+length_in) {
914	state_in->step = step_a;
915	state_in->plainchar = `'\0'`;
916	return plainchar - plaintext_out;
917	}
918	fragment = (signed char)base64_decode_value(*codechar++);
919	// It is an error to see invalid or padding bytes in step A.
920	ERROR_IF(fragment < -`1`);
921	} while (fragment < `0`);
922	*plainchar = (fragment & `0x03f`) << `2`;
923	// fallthrough
924	case step_b:
925	do {
926	if (codechar == code_in+length_in) {
927	state_in->step = step_b;
928	state_in->plainchar = *plainchar;
929	// It is always an error to suspend from step B, because we don't have enough bits yet.
930	// TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
931	// to be called multiple times. We'll fix it if we ever care to support streaming.
932	state_in->hadErrors = true;
933	return plainchar - plaintext_out;
934	}
935	fragment = (signed char)base64_decode_value(*codechar++);
936	// It is an error to see invalid or padding bytes in step B.
937	ERROR_IF(fragment < -`1`);
938	} while (fragment < `0`);
939	*plainchar++ \|= (fragment & `0x030`) >> `4`;
940	*plainchar = (fragment & `0x00f`) << `4`;
941	// fallthrough
942	case step_c:
943	do {
944	if (codechar == code_in+length_in) {
945	state_in->step = step_c;
946	state_in->plainchar = *plainchar;
947	// It is an error to complete from step C if we have seen incomplete padding.
948	// TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
949	// to be called multiple times. We'll fix it if we ever care to support streaming.
950	ERROR_IF(state_in->nPaddingBytesSeen == `1`);
951	return plainchar - plaintext_out;
952	}
953	fragment = (signed char)base64_decode_value(*codechar++);
954	// It is an error to see invalid bytes or more than two padding bytes in step C.
955	ERROR_IF(fragment < -`2` \|\| (fragment == -`2` && ++state_in->nPaddingBytesSeen > `2`));
956	} while (fragment < `0`);
957	// It is an error to continue from step C after having seen any padding.
958	ERROR_IF(state_in->nPaddingBytesSeen > `0`);
959	*plainchar++ \|= (fragment & `0x03c`) >> `2`;
960	*plainchar = (fragment & `0x003`) << `6`;
961	// fallthrough
962	case step_d:
963	do {
964	if (codechar == code_in+length_in) {
965	state_in->step = step_d;
966	state_in->plainchar = *plainchar;
967	return plainchar - plaintext_out;
968	}
969	fragment = (signed char)base64_decode_value(*codechar++);
970	// It is an error to see invalid bytes or more than one padding byte in step D.
971	ERROR_IF(fragment < -`2` \|\| (fragment == -`2` && ++state_in->nPaddingBytesSeen > `1`));
972	} while (fragment < `0`);
973	// It is an error to continue from step D after having seen padding bytes.
974	ERROR_IF(state_in->nPaddingBytesSeen > `0`);
975	*plainchar++ \|= (fragment & `0x03f`);
976	}
977	}
978
979	#undef ERROR_IF
980
981	/ control should not reach here /
982	return plainchar - plaintext_out;
983	}
984
985	} // namespace
986
987	EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> input) {
988	base64_decodestate state;
989
990	auto output = heapArray<byte>((input.size() * `6` + `7`) / `8`);
991
992	size_t n = base64_decode_block(input.begin(), input.size(),
993	reinterpret_cast<char*>(output.begin()), &state);
994
995	if (n < output.size()) {
996	auto copy = heapArray<byte>(n);
997	memcpy(copy.begin(), output.begin(), n);
998	output = kj::mv(copy);
999	}
1000
1001	return EncodingResult<Array<byte>>(kj::mv(output), state.hadErrors);
1002	}
1003
1004	} // namespace kj
1005

Browse the source code of ClickHouse/contrib/capnproto/c++/src/kj/encoding.c++