1 | // Copyright (c) 2017 Cloudflare, Inc. and contributors |
2 | // Licensed under the MIT License: |
3 | // |
4 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
5 | // of this software and associated documentation files (the "Software"), to deal |
6 | // in the Software without restriction, including without limitation the rights |
7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
8 | // copies of the Software, and to permit persons to whom the Software is |
9 | // furnished to do so, subject to the following conditions: |
10 | // |
11 | // The above copyright notice and this permission notice shall be included in |
12 | // all copies or substantial portions of the Software. |
13 | // |
14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
20 | // THE SOFTWARE. |
21 | |
22 | #pragma once |
23 | // Functions for encoding/decoding bytes and text in common formats, including: |
24 | // - UTF-{8,16,32} |
25 | // - Hex |
26 | // - URI encoding |
27 | // - Base64 |
28 | |
29 | #if defined(__GNUC__) && !KJ_HEADER_WARNINGS |
30 | #pragma GCC system_header |
31 | #endif |
32 | |
33 | #include "string.h" |
34 | |
35 | namespace kj { |
36 | |
37 | template <typename ResultType> |
38 | struct EncodingResult: public ResultType { |
39 | // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except |
40 | // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input. |
41 | // Each encoding/decoding function that returns this type will "work around" errors in some way, |
42 | // so an application doesn't strictly have to check for errors. E.g. the Unicode functions |
43 | // replace errors with U+FFFD in the output. |
44 | // |
45 | // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T> |
46 | // exactly if it were a Maybe<T> that is null in case of errors. |
47 | |
48 | inline EncodingResult(ResultType&& result, bool hadErrors) |
49 | : ResultType(kj::mv(result)), hadErrors(hadErrors) {} |
50 | |
51 | const bool hadErrors; |
52 | }; |
53 | |
54 | template <typename T> |
55 | inline auto KJ_STRINGIFY(const EncodingResult<T>& value) |
56 | -> decltype(toCharSequence(implicitCast<const T&>(value))) { |
57 | return toCharSequence(implicitCast<const T&>(value)); |
58 | } |
59 | |
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
//
// Invalid input does not make these functions fail. The `hadErrors` flag on the result reports
// whether any errors were encountered in the input; callers are free to ignore it.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
// handled. See comments on decodeUtf16() for more info.
72 | |
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output. (Overloads taking a char16_t / char32_t array by reference are defined
// later in this header; they drop the array's final element, which for a literal is the NUL
// terminator.)
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but will NOT be replaced with the Unicode replacement character as other erroneous
// sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
//   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
//   the original input, or will have replaced some invalid sequences with the Unicode replacement
//   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
//   and no code units will ever be added except to encode U+FFFD. If the original input was not
//   valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
//   raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
//   all, is a valid code point).
108 | |
EncodingResult<Array<wchar_t>> encodeWideString(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
// encoding UTF-8 (e.g. BeOS did this).
//
// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
// (or simply make a copy if wchar_t is 8 bits). Consequently `nulTerminate` and the `hadErrors`
// flag on the result should behave as documented for those functions.
120 | |
String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings.
//
// decodeHex() returns an EncodingResult; inspect its `hadErrors` flag to find out whether any
// errors were encountered in the input text.
124 | |
String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396.
// This is the same behavior as JavaScript's `encodeURIComponent()`.
//
// The `ArrayPtr<const char>` overload of encodeUriComponent() simply forwards to the byte
// overload. decodeUriComponent() is implemented on top of decodeBinaryUriComponent() (with
// NUL termination requested) and passes that function's `hadErrors` flag through.
//
// See https://tools.ietf.org/html/rfc2396#section-2.3
132 | |
String encodeUriFragment(ArrayPtr<const byte> bytes);
String encodeUriFragment(ArrayPtr<const char> bytes);
// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload simply forwards to the byte overload.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#fragment-percent-encode-set
142 | |
String encodeUriPath(ArrayPtr<const byte> bytes);
String encodeUriPath(ArrayPtr<const char> bytes);
// Encode URL path components (not entire paths!) using the path percent encode set defined by the
// WHATWG URL specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload simply forwards to the byte overload.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
//   defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
//   function on individual path components, and never entire paths, augmenting the character set to
//   include these separators allows this function to be used to implement a URL class that stores
//   its path components in percent-decoded form.
//
// See https://url.spec.whatwg.org/#path-percent-encode-set
158 | |
String encodeUriUserInfo(ArrayPtr<const byte> bytes);
String encodeUriUserInfo(ArrayPtr<const char> bytes);
// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload simply forwards to the byte overload.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#userinfo-percent-encode-set
168 | |
String encodeWwwForm(ArrayPtr<const byte> bytes);
String encodeWwwForm(ArrayPtr<const char> bytes);
EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes and '+' (for spaces) according to the
// application/x-www-form-urlencoded format defined by the WHATWG URL specification.
//
// The `ArrayPtr<const char>` overload of encodeWwwForm() simply forwards to the byte overload.
// decodeWwwForm() is implemented on top of decodeBinaryUriComponent() with both `nulTerminate`
// and `plusToSpace` enabled, and passes that function's `hadErrors` flag through.
//
// Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is
//   not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens
//   to agree with us!
//
// See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
180 | |
struct DecodeUriOptions {
  // Options accepted by `decodeBinaryUriComponent()`.

  bool nulTerminate;
  // When true, a terminal NUL byte is appended to the decoded output.

  bool plusToSpace;
  // When true, '+' characters are converted to ' ' before percent decoding. Used to decode
  // application/x-www-form-urlencoded text, such as query strings.

  DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false)
      : nulTerminate(nulTerminate), plusToSpace(plusToSpace) {}
  // Deliberately NOT explicit: a plain bool must keep converting to DecodeUriOptions so that code
  // written when `decodeBinaryUriComponent()` took a boolean second parameter still compiles.
};
EncodingResult<Array<byte>> decodeBinaryUriComponent(
    ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions());
// Decode URI components using % escapes. This is a lower-level interface used to implement both
// `decodeUriComponent()` and `decodeWwwForm()`.
//
// The decoded data is returned as a byte array rather than a String. `options` selects whether a
// NUL byte is appended and whether '+' is treated as a space (see DecodeUriOptions); by default
// neither is enabled. Inspect `hadErrors` on the result to find out whether any errors were
// encountered in the input.
201 | |
String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryCEscape(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
// Encode/decode bytes using C-style escape sequences (judging by the names -- the exact set of
// escapes handled is determined by the implementation, which is not part of this header).
//
// The `ArrayPtr<const char>` overload of encodeCEscape() simply forwards to the byte overload.
// decodeCEscape() calls decodeBinaryCEscape() with `nulTerminate = true` and returns the decoded
// bytes as a String, passing the `hadErrors` flag through.
//
// NOTE(review): `nulTerminate` presumably appends a NUL byte to the output, as it does for the
// other functions in this header -- confirm against the implementation.
207 | |
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies). By default no line
// breaks are inserted.

EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details. Errors are reported
// through the `hadErrors` flag of the returned EncodingResult.
215 | |
216 | // ======================================================================================= |
217 | // inline implementation details |
218 | |
219 | namespace _ { // private |
220 | |
221 | template <typename T> |
222 | NullableValue<T> readMaybe(EncodingResult<T>&& value) { |
223 | if (value.hadErrors) { |
224 | return nullptr; |
225 | } else { |
226 | return kj::mv(value); |
227 | } |
228 | } |
229 | |
230 | template <typename T> |
231 | T* readMaybe(EncodingResult<T>& value) { |
232 | if (value.hadErrors) { |
233 | return nullptr; |
234 | } else { |
235 | return &value; |
236 | } |
237 | } |
238 | |
239 | template <typename T> |
240 | const T* readMaybe(const EncodingResult<T>& value) { |
241 | if (value.hadErrors) { |
242 | return nullptr; |
243 | } else { |
244 | return &value; |
245 | } |
246 | } |
247 | |
248 | } // namespace _ (private) |
249 | |
250 | inline String encodeUriComponent(ArrayPtr<const char> text) { |
251 | return encodeUriComponent(text.asBytes()); |
252 | } |
253 | inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) { |
254 | auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true }); |
255 | return { String(result.releaseAsChars()), result.hadErrors }; |
256 | } |
257 | |
258 | inline String encodeUriFragment(ArrayPtr<const char> text) { |
259 | return encodeUriFragment(text.asBytes()); |
260 | } |
261 | inline String encodeUriPath(ArrayPtr<const char> text) { |
262 | return encodeUriPath(text.asBytes()); |
263 | } |
264 | inline String encodeUriUserInfo(ArrayPtr<const char> text) { |
265 | return encodeUriUserInfo(text.asBytes()); |
266 | } |
267 | |
268 | inline String encodeWwwForm(ArrayPtr<const char> text) { |
269 | return encodeWwwForm(text.asBytes()); |
270 | } |
271 | inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) { |
272 | auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true, |
273 | /*.plusToSpace=*/true }); |
274 | return { String(result.releaseAsChars()), result.hadErrors }; |
275 | } |
276 | |
277 | inline String encodeCEscape(ArrayPtr<const char> text) { |
278 | return encodeCEscape(text.asBytes()); |
279 | } |
280 | inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) { |
281 | auto result = decodeBinaryCEscape(text, true); |
282 | return { String(result.releaseAsChars()), result.hadErrors }; |
283 | } |
284 | |
// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
// only even matters for encoding-test.c++.
288 | |
289 | template <size_t s> |
290 | inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) { |
291 | return encodeUtf16(arrayPtr(text, s - 1), nulTerminate); |
292 | } |
293 | template <size_t s> |
294 | inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) { |
295 | return encodeUtf32(arrayPtr(text, s - 1), nulTerminate); |
296 | } |
297 | template <size_t s> |
298 | inline EncodingResult<Array<wchar_t>> encodeWideString( |
299 | const char (&text)[s], bool nulTerminate=false) { |
300 | return encodeWideString(arrayPtr(text, s - 1), nulTerminate); |
301 | } |
302 | template <size_t s> |
303 | inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) { |
304 | return decodeUtf16(arrayPtr(utf16, s - 1)); |
305 | } |
306 | template <size_t s> |
307 | inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) { |
308 | return decodeUtf32(arrayPtr(utf32, s - 1)); |
309 | } |
310 | template <size_t s> |
311 | inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) { |
312 | return decodeWideString(arrayPtr(utf32, s - 1)); |
313 | } |
314 | template <size_t s> |
315 | inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) { |
316 | return decodeHex(arrayPtr(text, s - 1)); |
317 | } |
318 | template <size_t s> |
319 | inline String encodeUriComponent(const char (&text)[s]) { |
320 | return encodeUriComponent(arrayPtr(text, s - 1)); |
321 | } |
322 | template <size_t s> |
323 | inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) { |
324 | return decodeBinaryUriComponent(arrayPtr(text, s - 1)); |
325 | } |
326 | template <size_t s> |
327 | inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) { |
328 | return decodeUriComponent(arrayPtr(text, s-1)); |
329 | } |
330 | template <size_t s> |
331 | inline String encodeUriFragment(const char (&text)[s]) { |
332 | return encodeUriFragment(arrayPtr(text, s - 1)); |
333 | } |
334 | template <size_t s> |
335 | inline String encodeUriPath(const char (&text)[s]) { |
336 | return encodeUriPath(arrayPtr(text, s - 1)); |
337 | } |
338 | template <size_t s> |
339 | inline String encodeUriUserInfo(const char (&text)[s]) { |
340 | return encodeUriUserInfo(arrayPtr(text, s - 1)); |
341 | } |
342 | template <size_t s> |
343 | inline String encodeWwwForm(const char (&text)[s]) { |
344 | return encodeWwwForm(arrayPtr(text, s - 1)); |
345 | } |
346 | template <size_t s> |
347 | inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) { |
348 | return decodeWwwForm(arrayPtr(text, s-1)); |
349 | } |
350 | template <size_t s> |
351 | inline String encodeCEscape(const char (&text)[s]) { |
352 | return encodeCEscape(arrayPtr(text, s - 1)); |
353 | } |
354 | template <size_t s> |
355 | inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) { |
356 | return decodeBinaryCEscape(arrayPtr(text, s - 1)); |
357 | } |
358 | template <size_t s> |
359 | inline EncodingResult<String> decodeCEscape(const char (&text)[s]) { |
360 | return decodeCEscape(arrayPtr(text, s-1)); |
361 | } |
362 | template <size_t s> |
363 | EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) { |
364 | return decodeBase64(arrayPtr(text, s - 1)); |
365 | } |
366 | |
367 | } // namespace kj |
368 | |