encoding.h source code [ClickHouse/contrib/capnproto/c++/src/kj/encoding.h]

1	// Copyright (c) 2017 Cloudflare, Inc. and contributors
2	// Licensed under the MIT License:
3	//
4	// Permission is hereby granted, free of charge, to any person obtaining a copy
5	// of this software and associated documentation files (the "Software"), to deal
6	// in the Software without restriction, including without limitation the rights
7	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8	// copies of the Software, and to permit persons to whom the Software is
9	// furnished to do so, subject to the following conditions:
10	//
11	// The above copyright notice and this permission notice shall be included in
12	// all copies or substantial portions of the Software.
13	//
14	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20	// THE SOFTWARE.
21
22	#pragma once
23	// Functions for encoding/decoding bytes and text in common formats, including:
24	// - UTF-{8,16,32}
25	// - Hex
26	// - URI encoding
27	// - Base64
28
29	#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
30	#pragma GCC system_header
31	#endif
32
33	#include "string.h"
34
35	namespace kj {
36
37	template <typename ResultType>
38	struct EncodingResult: public ResultType {
39	// Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
40	// that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
41	// Each encoding/decoding function that returns this type will "work around" errors in some way,
42	// so an application doesn't strictly have to check for errors. E.g. the Unicode functions
43	// replace errors with U+FFFD in the output.
44	//
45	// Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
46	// exactly if it were a Maybe<T> that is null in case of errors.
47
48	inline EncodingResult(ResultType&& result, bool hadErrors)
49	: ResultType(kj::mv(result)), hadErrors(hadErrors) {}
50
51	const bool hadErrors;
52	};
53
54	template <typename T>
55	inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
56	-> decltype(toCharSequence(implicitCast<const T&>(value))) {
57	return toCharSequence(implicitCast<const T&>(value));
58	}
59
60	EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
61	EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
62	// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
63	//
64	// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
65	//
66	// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
67	// char16_t / char32_t).
68	//
69	// Note that the KJ Unicode encoding and decoding functions actually implement
70	// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
71	// handled. See comments on decodeUtf16() for more info.
72
73	EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
74	EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
75	// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
76	//
77	// The input should NOT include a NUL terminator; any NUL characters in the input array will be
78	// preserved in the output.
79	//
80	// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
81	//
82	// Note that the KJ Unicode encoding and decoding functions actually implement
83	// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
84	// of char16_t and you pass it through any number of conversions to other Unicode encodings,
85	// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
86	// exactly the same char16_t array you started with, even if the array is not valid UTF-16. This
87	// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
88	// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
89	// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
90	// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
91	// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
92	//
93	// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
94	// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
95	// result), but will NOT be replaced with the Unicode replacement character as other erroneous
96	// sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding.
97	//
98	// KJ makes the following guarantees about invalid input:
99	// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
100	// with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
101	// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
102	// the original input, or will have replaced some invalid sequences with the Unicode replacement
103	// character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
104	// and no code units will ever be added except to encode U+FFFD. If the original input was not
105	// valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
106	// raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
107	// all, is a valid code point).
108
109	EncodingResult<Array<wchar_t>> encodeWideString(
110	ArrayPtr<const char> text, bool nulTerminate = false);
111	EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
112	// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
113	// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
114	// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
115	// encoding UTF-8 (e.g. BeOS did this).
116	//
117	// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
118	// the target platform. So, these functions are simple aliases for encodeUtf/decodeUtf, above
119	// (or simply make a copy if wchar_t is 8 bits).
120
121	String encodeHex(ArrayPtr<const byte> bytes);
122	EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
123	// Encode/decode bytes as hex strings.
124
125	String encodeUriComponent(ArrayPtr<const byte> bytes);
126	String encodeUriComponent(ArrayPtr<const char> bytes);
127	EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
128	// Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396.
129	// This is the same behavior as JavaScript's `encodeURIComponent()`.
130	//
131	// See https://tools.ietf.org/html/rfc2396#section-2.3
132
133	String encodeUriFragment(ArrayPtr<const byte> bytes);
134	String encodeUriFragment(ArrayPtr<const char> bytes);
135	// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
136	// specification. Use decodeUriComponent() to decode.
137	//
138	// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
139	// decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
140	//
141	// See https://url.spec.whatwg.org/#fragment-percent-encode-set
142
143	String encodeUriPath(ArrayPtr<const byte> bytes);
144	String encodeUriPath(ArrayPtr<const char> bytes);
145	// Encode URL path components (not entire paths!) using the path percent encode set defined by the
146	// WHATWG URL specification. Use decodeUriComponent() to decode.
147	//
148	// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
149	// decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
150	//
151	// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
152	// defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
153	// function on individual path components, and never entire paths, augmenting the character set to
154	// include these separators allows this function to be used to implement a URL class that stores
155	// its path components in percent-decoded form.
156	//
157	// See https://url.spec.whatwg.org/#path-percent-encode-set
158
159	String encodeUriUserInfo(ArrayPtr<const byte> bytes);
160	String encodeUriUserInfo(ArrayPtr<const char> bytes);
161	// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
162	// specification. Use decodeUriComponent() to decode.
163	//
164	// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
165	// decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
166	//
167	// See https://url.spec.whatwg.org/#userinfo-percent-encode-set
168
169	String encodeWwwForm(ArrayPtr<const byte> bytes);
170	String encodeWwwForm(ArrayPtr<const char> bytes);
171	EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
172	// Encode/decode URI components using % escapes and '+' (for spaces) according to the
173	// application/x-www-form-urlencoded format defined by the WHATWG URL specification.
174	//
175	// Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is
176	// not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens
177	// to agree with us!
178	//
179	// See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
180
struct DecodeUriOptions {
  // Parameter to `decodeBinaryUriComponent()`.

  // This struct is intentionally convertible from bool, in order to maintain backwards
  // compatibility with code written when `decodeBinaryUriComponent()` took a boolean second
  // parameter. The constructor is therefore deliberately NOT `explicit`.
  //
  // It is `constexpr` so that option sets can also be built in constant expressions; run-time
  // callers are unaffected.
  constexpr DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false)
      : nulTerminate(nulTerminate), plusToSpace(plusToSpace) {}

  bool nulTerminate;
  // Append a terminal NUL byte.

  bool plusToSpace;
  // Convert '+' to ' ' characters before percent decoding. Used to decode
  // application/x-www-form-urlencoded text, such as query strings.
};
197	EncodingResult<Array<byte>> decodeBinaryUriComponent(
198	ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions ());
199	// Decode URI components using % escapes. This is a lower-level interface used to implement both
200	// `decodeUriComponent()` and `decodeWwwForm()`
201
202	String encodeCEscape(ArrayPtr<const byte> bytes);
203	String encodeCEscape(ArrayPtr<const char> bytes);
204	EncodingResult<Array<byte>> decodeBinaryCEscape(
205	ArrayPtr<const char> text, bool nulTerminate = false);
206	EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
207
208	String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
209	// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
210	// into the output every 72 characters (e.g. for encoding e-mail bodies).
211
212	EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
213	// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
214	// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.
215
216	// =======================================================================================
217	// inline implementation details
218
219	namespace _ { // private
220
221	template <typename T>
222	NullableValue<T> readMaybe(EncodingResult<T>&& value) {
223	if (value.hadErrors) {
224	return nullptr;
225	} else {
226	return kj::mv(value);
227	}
228	}
229
230	template <typename T>
231	T* readMaybe(EncodingResult<T>& value) {
232	if (value.hadErrors) {
233	return nullptr;
234	} else {
235	return &value;
236	}
237	}
238
239	template <typename T>
240	const T* readMaybe(const EncodingResult<T>& value) {
241	if (value.hadErrors) {
242	return nullptr;
243	} else {
244	return &value;
245	}
246	}
247
248	} // namespace _ (private)
249
250	inline String encodeUriComponent(ArrayPtr<const char> text) {
251	return encodeUriComponent(text.asBytes());
252	}
253	inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
254	auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /.nulTerminate=/true });
255	return { String (result.releaseAsChars()), result.hadErrors };
256	}
257
258	inline String encodeUriFragment(ArrayPtr<const char> text) {
259	return encodeUriFragment(text.asBytes());
260	}
261	inline String encodeUriPath(ArrayPtr<const char> text) {
262	return encodeUriPath(text.asBytes());
263	}
264	inline String encodeUriUserInfo(ArrayPtr<const char> text) {
265	return encodeUriUserInfo(text.asBytes());
266	}
267
268	inline String encodeWwwForm(ArrayPtr<const char> text) {
269	return encodeWwwForm(text.asBytes());
270	}
271	inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) {
272	auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /.nulTerminate=/true,
273	/.plusToSpace=/true });
274	return { String (result.releaseAsChars()), result.hadErrors };
275	}
276
277	inline String encodeCEscape(ArrayPtr<const char> text) {
278	return encodeCEscape(text.asBytes());
279	}
280	inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
281	auto result = decodeBinaryCEscape(text, true);
282	return { String (result.releaseAsChars()), result.hadErrors };
283	}
284
285	// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
286	// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
287	// only even matters for encoding-test.c++.
288
289	template <size_t s>
290	inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
291	return encodeUtf16(arrayPtr(text, s - `1`), nulTerminate);
292	}
293	template <size_t s>
294	inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
295	return encodeUtf32(arrayPtr(text, s - `1`), nulTerminate);
296	}
297	template <size_t s>
298	inline EncodingResult<Array<wchar_t>> encodeWideString(
299	const char (&text)[s], bool nulTerminate=false) {
300	return encodeWideString(arrayPtr(text, s - `1`), nulTerminate);
301	}
302	template <size_t s>
303	inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
304	return decodeUtf16(arrayPtr(utf16, s - `1`));
305	}
306	template <size_t s>
307	inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
308	return decodeUtf32(arrayPtr(utf32, s - `1`));
309	}
310	template <size_t s>
311	inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) {
312	return decodeWideString(arrayPtr(utf32, s - `1`));
313	}
314	template <size_t s>
315	inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
316	return decodeHex(arrayPtr(text, s - `1`));
317	}
318	template <size_t s>
319	inline String encodeUriComponent(const char (&text)[s]) {
320	return encodeUriComponent(arrayPtr(text, s - `1`));
321	}
322	template <size_t s>
323	inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
324	return decodeBinaryUriComponent(arrayPtr(text, s - `1`));
325	}
326	template <size_t s>
327	inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
328	return decodeUriComponent(arrayPtr(text, s-`1`));
329	}
330	template <size_t s>
331	inline String encodeUriFragment(const char (&text)[s]) {
332	return encodeUriFragment(arrayPtr(text, s - `1`));
333	}
334	template <size_t s>
335	inline String encodeUriPath(const char (&text)[s]) {
336	return encodeUriPath(arrayPtr(text, s - `1`));
337	}
338	template <size_t s>
339	inline String encodeUriUserInfo(const char (&text)[s]) {
340	return encodeUriUserInfo(arrayPtr(text, s - `1`));
341	}
342	template <size_t s>
343	inline String encodeWwwForm(const char (&text)[s]) {
344	return encodeWwwForm(arrayPtr(text, s - `1`));
345	}
346	template <size_t s>
347	inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) {
348	return decodeWwwForm(arrayPtr(text, s-`1`));
349	}
350	template <size_t s>
351	inline String encodeCEscape(const char (&text)[s]) {
352	return encodeCEscape(arrayPtr(text, s - `1`));
353	}
354	template <size_t s>
355	inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
356	return decodeBinaryCEscape(arrayPtr(text, s - `1`));
357	}
358	template <size_t s>
359	inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
360	return decodeCEscape(arrayPtr(text, s-`1`));
361	}
362	template <size_t s>
363	EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) {
364	return decodeBase64(arrayPtr(text, s - `1`));
365	}
366
367	} // namespace kj
368

Browse the source code of ClickHouse/contrib/capnproto/c++/src/kj/encoding.h