1// Copyright (c) 2017 Cloudflare, Inc. and contributors
2// Licensed under the MIT License:
3//
4// Permission is hereby granted, free of charge, to any person obtaining a copy
5// of this software and associated documentation files (the "Software"), to deal
6// in the Software without restriction, including without limitation the rights
7// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8// copies of the Software, and to permit persons to whom the Software is
9// furnished to do so, subject to the following conditions:
10//
11// The above copyright notice and this permission notice shall be included in
12// all copies or substantial portions of the Software.
13//
14// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20// THE SOFTWARE.
21
22#pragma once
23// Functions for encoding/decoding bytes and text in common formats, including:
24// - UTF-{8,16,32}
25// - Hex
26// - URI encoding
27// - Base64
28
29#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
30#pragma GCC system_header
31#endif
32
33#include "string.h"
34
35namespace kj {
36
37template <typename ResultType>
38struct EncodingResult: public ResultType {
39 // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
40 // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
41 // Each encoding/decoding function that returns this type will "work around" errors in some way,
42 // so an application doesn't strictly have to check for errors. E.g. the Unicode functions
43 // replace errors with U+FFFD in the output.
44 //
45 // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
46 // exactly if it were a Maybe<T> that is null in case of errors.
47
48 inline EncodingResult(ResultType&& result, bool hadErrors)
49 : ResultType(kj::mv(result)), hadErrors(hadErrors) {}
50
51 const bool hadErrors;
52};
53
54template <typename T>
55inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
56 -> decltype(toCharSequence(implicitCast<const T&>(value))) {
57 return toCharSequence(implicitCast<const T&>(value));
58}
59
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
//
// The result's `hadErrors` flag can be inspected to see whether any errors were encountered in
// the input; see EncodingResult.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
// handled. See comments on decodeUtf16() for more info.
72
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output.
//
// The input must be in platform-native endianness. Byte-order marks (BOMs) are NOT recognized by
// these functions.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but will NOT be replaced with the Unicode replacement character as other erroneous
// sequences would be; instead they are encoded as an invalid surrogate codepoint in the target
// encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
//   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
//   the original input, or will have replaced some invalid sequences with the Unicode replacement
//   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
//   and no code units will ever be added except to encode U+FFFD. If the original input was not
//   valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
//   raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
//   all, is a valid code point).
108
EncodingResult<Array<wchar_t>> encodeWideString(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
// encoding UTF-8 (e.g. BeOS did this).
//
// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
// (or simply make a copy if wchar_t is 8 bits).
//
// Since these are aliases, `nulTerminate` presumably behaves as documented for encodeUtf16() /
// encodeUtf32(): when true, an extra NUL character is appended to the output.
120
String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings.
//
// decodeHex() returns an EncodingResult, so its `hadErrors` flag can be inspected to see whether
// any errors were encountered in the input text.
124
String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396.
// This is the same behavior as JavaScript's `encodeURIComponent()`.
//
// The `ArrayPtr<const char>` encoding overload forwards to the byte overload via asBytes().
// decodeUriComponent() is implemented on top of decodeBinaryUriComponent() with NUL termination
// enabled, and reports invalid input through the result's `hadErrors` flag.
//
// NOTE(review): the linked section (2.3) appears to define the *unreserved* character set, and
// JavaScript's encodeURIComponent() leaves exactly that set unescaped -- confirm whether
// "reserved" above should read "not unreserved".
//
// See https://tools.ietf.org/html/rfc2396#section-2.3
132
String encodeUriFragment(ArrayPtr<const byte> bytes);
String encodeUriFragment(ArrayPtr<const char> bytes);
// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload forwards to the byte overload via asBytes().
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
// decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#fragment-percent-encode-set
142
String encodeUriPath(ArrayPtr<const byte> bytes);
String encodeUriPath(ArrayPtr<const char> bytes);
// Encode URL path components (not entire paths!) using the path percent encode set defined by the
// WHATWG URL specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload forwards to the byte overload via asBytes().
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
// decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
// defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
// function on individual path components, and never entire paths, augmenting the character set to
// include these separators allows this function to be used to implement a URL class that stores
// its path components in percent-decoded form.
//
// See https://url.spec.whatwg.org/#path-percent-encode-set
158
String encodeUriUserInfo(ArrayPtr<const byte> bytes);
String encodeUriUserInfo(ArrayPtr<const char> bytes);
// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload forwards to the byte overload via asBytes().
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
// decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#userinfo-percent-encode-set
168
String encodeWwwForm(ArrayPtr<const byte> bytes);
String encodeWwwForm(ArrayPtr<const char> bytes);
EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes and '+' (for spaces) according to the
// application/x-www-form-urlencoded format defined by the WHATWG URL specification.
//
// The `ArrayPtr<const char>` encoding overload forwards to the byte overload via asBytes().
// decodeWwwForm() is implemented on top of decodeBinaryUriComponent() with both the
// `nulTerminate` and `plusToSpace` options enabled.
//
// Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is
// not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens
// to agree with us!
//
// See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
180
struct DecodeUriOptions {
  // Options bundle accepted by `decodeBinaryUriComponent()`.

  bool nulTerminate;
  // Append a terminal NUL byte.

  bool plusToSpace;
  // Convert '+' to ' ' characters before percent decoding. Used to decode
  // application/x-www-form-urlencoded text, such as query strings.

  DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false)
      : nulTerminate(nulTerminate),
        plusToSpace(plusToSpace) {}
  // Deliberately not `explicit`: the struct must stay implicitly convertible from bool so that
  // code written back when `decodeBinaryUriComponent()` took a boolean second parameter keeps
  // compiling. A lone bool is interpreted as `nulTerminate`.
};
EncodingResult<Array<byte>> decodeBinaryUriComponent(
    ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions());
// Decode URI components using % escapes. This is a lower-level interface used to implement both
// `decodeUriComponent()` and `decodeWwwForm()`.
//
// `options` controls NUL termination and '+'-to-space conversion; see DecodeUriOptions. Because
// DecodeUriOptions is implicitly constructible from bool, a plain bool may be passed here and is
// interpreted as `nulTerminate`.
201
String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryCEscape(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
// Encode/decode bytes using C-style escapes. NOTE(review): presumably these are the backslash
// escape sequences of C string literals; the implementation is not visible in this header, so
// confirm the exact escape set there.
//
// The `ArrayPtr<const char>` overload of encodeCEscape() forwards to the byte overload via
// asBytes(). decodeCEscape() is implemented on top of decodeBinaryCEscape() with `nulTerminate`
// set to true, and converts the resulting bytes to a String.
//
// The decoding functions return an EncodingResult, so `hadErrors` can be inspected to see whether
// any errors were encountered in the input.
207
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies). By default
// (`breakLines` false) no such line breaks are requested.
211
EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.
//
// Errors are reported through the `hadErrors` flag of the returned EncodingResult.
215
216// =======================================================================================
217// inline implementation details
218
219namespace _ { // private
220
221template <typename T>
222NullableValue<T> readMaybe(EncodingResult<T>&& value) {
223 if (value.hadErrors) {
224 return nullptr;
225 } else {
226 return kj::mv(value);
227 }
228}
229
230template <typename T>
231T* readMaybe(EncodingResult<T>& value) {
232 if (value.hadErrors) {
233 return nullptr;
234 } else {
235 return &value;
236 }
237}
238
239template <typename T>
240const T* readMaybe(const EncodingResult<T>& value) {
241 if (value.hadErrors) {
242 return nullptr;
243 } else {
244 return &value;
245 }
246}
247
248} // namespace _ (private)
249
250inline String encodeUriComponent(ArrayPtr<const char> text) {
251 return encodeUriComponent(text.asBytes());
252}
253inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
254 auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true });
255 return { String(result.releaseAsChars()), result.hadErrors };
256}
257
258inline String encodeUriFragment(ArrayPtr<const char> text) {
259 return encodeUriFragment(text.asBytes());
260}
261inline String encodeUriPath(ArrayPtr<const char> text) {
262 return encodeUriPath(text.asBytes());
263}
264inline String encodeUriUserInfo(ArrayPtr<const char> text) {
265 return encodeUriUserInfo(text.asBytes());
266}
267
268inline String encodeWwwForm(ArrayPtr<const char> text) {
269 return encodeWwwForm(text.asBytes());
270}
271inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) {
272 auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true,
273 /*.plusToSpace=*/true });
274 return { String(result.releaseAsChars()), result.hadErrors };
275}
276
277inline String encodeCEscape(ArrayPtr<const char> text) {
278 return encodeCEscape(text.asBytes());
279}
280inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
281 auto result = decodeBinaryCEscape(text, true);
282 return { String(result.releaseAsChars()), result.hadErrors };
283}
284
// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
// only even matters for encoding-test.c++.
288
289template <size_t s>
290inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
291 return encodeUtf16(arrayPtr(text, s - 1), nulTerminate);
292}
293template <size_t s>
294inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
295 return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
296}
297template <size_t s>
298inline EncodingResult<Array<wchar_t>> encodeWideString(
299 const char (&text)[s], bool nulTerminate=false) {
300 return encodeWideString(arrayPtr(text, s - 1), nulTerminate);
301}
302template <size_t s>
303inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
304 return decodeUtf16(arrayPtr(utf16, s - 1));
305}
306template <size_t s>
307inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
308 return decodeUtf32(arrayPtr(utf32, s - 1));
309}
310template <size_t s>
311inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) {
312 return decodeWideString(arrayPtr(utf32, s - 1));
313}
314template <size_t s>
315inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
316 return decodeHex(arrayPtr(text, s - 1));
317}
318template <size_t s>
319inline String encodeUriComponent(const char (&text)[s]) {
320 return encodeUriComponent(arrayPtr(text, s - 1));
321}
322template <size_t s>
323inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
324 return decodeBinaryUriComponent(arrayPtr(text, s - 1));
325}
326template <size_t s>
327inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
328 return decodeUriComponent(arrayPtr(text, s-1));
329}
330template <size_t s>
331inline String encodeUriFragment(const char (&text)[s]) {
332 return encodeUriFragment(arrayPtr(text, s - 1));
333}
334template <size_t s>
335inline String encodeUriPath(const char (&text)[s]) {
336 return encodeUriPath(arrayPtr(text, s - 1));
337}
338template <size_t s>
339inline String encodeUriUserInfo(const char (&text)[s]) {
340 return encodeUriUserInfo(arrayPtr(text, s - 1));
341}
342template <size_t s>
343inline String encodeWwwForm(const char (&text)[s]) {
344 return encodeWwwForm(arrayPtr(text, s - 1));
345}
346template <size_t s>
347inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) {
348 return decodeWwwForm(arrayPtr(text, s-1));
349}
350template <size_t s>
351inline String encodeCEscape(const char (&text)[s]) {
352 return encodeCEscape(arrayPtr(text, s - 1));
353}
354template <size_t s>
355inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
356 return decodeBinaryCEscape(arrayPtr(text, s - 1));
357}
358template <size_t s>
359inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
360 return decodeCEscape(arrayPtr(text, s-1));
361}
362template <size_t s>
363EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) {
364 return decodeBase64(arrayPtr(text, s - 1));
365}
366
367} // namespace kj
368