| 1 | // Copyright (c) 2017 Cloudflare, Inc. and contributors |
| 2 | // Licensed under the MIT License: |
| 3 | // |
| 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 5 | // of this software and associated documentation files (the "Software"), to deal |
| 6 | // in the Software without restriction, including without limitation the rights |
| 7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 8 | // copies of the Software, and to permit persons to whom the Software is |
| 9 | // furnished to do so, subject to the following conditions: |
| 10 | // |
| 11 | // The above copyright notice and this permission notice shall be included in |
| 12 | // all copies or substantial portions of the Software. |
| 13 | // |
| 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 20 | // THE SOFTWARE. |
| 21 | |
| 22 | #pragma once |
| 23 | // Functions for encoding/decoding bytes and text in common formats, including: |
| 24 | // - UTF-{8,16,32} |
| 25 | // - Hex |
| 26 | // - URI encoding |
| 27 | // - Base64 |
| 28 | |
| 29 | #if defined(__GNUC__) && !KJ_HEADER_WARNINGS |
| 30 | #pragma GCC system_header |
| 31 | #endif |
| 32 | |
| 33 | #include "string.h" |
| 34 | |
| 35 | namespace kj { |
| 36 | |
| 37 | template <typename ResultType> |
| 38 | struct EncodingResult: public ResultType { |
| 39 | // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except |
| 40 | // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input. |
| 41 | // Each encoding/decoding function that returns this type will "work around" errors in some way, |
| 42 | // so an application doesn't strictly have to check for errors. E.g. the Unicode functions |
| 43 | // replace errors with U+FFFD in the output. |
| 44 | // |
| 45 | // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T> |
| 46 | // exactly if it were a Maybe<T> that is null in case of errors. |
| 47 | |
| 48 | inline EncodingResult(ResultType&& result, bool hadErrors) |
| 49 | : ResultType(kj::mv(result)), hadErrors(hadErrors) {} |
| 50 | |
| 51 | const bool hadErrors; |
| 52 | }; |
| 53 | |
| 54 | template <typename T> |
| 55 | inline auto KJ_STRINGIFY(const EncodingResult<T>& value) |
| 56 | -> decltype(toCharSequence(implicitCast<const T&>(value))) { |
| 57 | return toCharSequence(implicitCast<const T&>(value)); |
| 58 | } |
| 59 | |
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// `text` is the UTF-8 input. The result holds the converted code units; its `hadErrors` flag
// indicates whether any errors were encountered in the input.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
// handled. See comments on decodeUtf16() for more info.
| 72 | |
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// `utf16` / `utf32` are the input code units. The result is the UTF-8 text; its `hadErrors` flag
// indicates whether the input contained errors (how they are handled is described below).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output.
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but will NOT be replaced with the Unicode replacement character as other erroneous
// sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
//   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
//   the original input, or will have replaced some invalid sequences with the Unicode replacement
//   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
//   and no code units will ever be added except to encode U+FFFD. If the original input was not
//   valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
//   raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
//   all, is a valid code point).
| 108 | |
EncodingResult<Array<wchar_t>> encodeWideString(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
// encoding UTF-8 (e.g. BeOS did this).
//
// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
// (or simply make a copy if wchar_t is 8 bits).
//
// NOTE(review): `nulTerminate` presumably has the same meaning as for encodeUtf16()/encodeUtf32()
// (append an extra NUL to the output) -- confirm against the implementation.
| 120 | |
String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings.
//
// decodeHex() returns an EncodingResult, so callers can inspect `hadErrors` to learn whether
// errors were encountered in `text`. NOTE(review): exactly which inputs count as errors (e.g.
// non-hex characters, odd length) is decided by the implementation, which is not in this header.
| 124 | |
String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396.
// This is the same behavior as JavaScript's `encodeURIComponent()`.
//
// The `ArrayPtr<const char>` encoder is an inline wrapper that forwards to the byte-based one.
// decodeUriComponent() is an inline wrapper around decodeBinaryUriComponent() that returns the
// decoded bytes as a String and passes `hadErrors` through.
//
// See https://tools.ietf.org/html/rfc2396#section-2.3
| 132 | |
String encodeUriFragment(ArrayPtr<const byte> bytes);
String encodeUriFragment(ArrayPtr<const char> bytes);
// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload is an inline wrapper that forwards to the byte-based one.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#fragment-percent-encode-set
| 142 | |
String encodeUriPath(ArrayPtr<const byte> bytes);
String encodeUriPath(ArrayPtr<const char> bytes);
// Encode URL path components (not entire paths!) using the path percent encode set defined by the
// WHATWG URL specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload is an inline wrapper that forwards to the byte-based one.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
//   defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
//   function on individual path components, and never entire paths, augmenting the character set to
//   include these separators allows this function to be used to implement a URL class that stores
//   its path components in percent-decoded form.
//
// See https://url.spec.whatwg.org/#path-percent-encode-set
| 158 | |
String encodeUriUserInfo(ArrayPtr<const byte> bytes);
String encodeUriUserInfo(ArrayPtr<const char> bytes);
// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// The `ArrayPtr<const char>` overload is an inline wrapper that forwards to the byte-based one.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#userinfo-percent-encode-set
| 168 | |
String encodeWwwForm(ArrayPtr<const byte> bytes);
String encodeWwwForm(ArrayPtr<const char> bytes);
EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes and '+' (for spaces) according to the
// application/x-www-form-urlencoded format defined by the WHATWG URL specification.
//
// The `ArrayPtr<const char>` encoder is an inline wrapper that forwards to the byte-based one.
// decodeWwwForm() is an inline wrapper around decodeBinaryUriComponent() with `plusToSpace`
// enabled; it returns the decoded bytes as a String and passes `hadErrors` through.
//
// Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is
//   not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens
//   to agree with us!
//
// See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
| 180 | |
struct DecodeUriOptions {
  // Options accepted by `decodeBinaryUriComponent()`.

  bool nulTerminate;
  // If true, append a terminal NUL byte.

  bool plusToSpace;
  // If true, convert '+' to ' ' characters before percent decoding. Used to decode
  // application/x-www-form-urlencoded text, such as query strings.

  DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false)
      : nulTerminate(nulTerminate),
        plusToSpace(plusToSpace) {}
  // Deliberately NOT `explicit`: the struct must stay implicitly convertible from bool so that
  // code written when `decodeBinaryUriComponent()` took a boolean second parameter keeps working.
};
EncodingResult<Array<byte>> decodeBinaryUriComponent(
    ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions());
// Decode URI components using % escapes. This is a lower-level interface used to implement both
// `decodeUriComponent()` and `decodeWwwForm()`.
//
// `options` selects NUL termination and '+'-to-space conversion (see DecodeUriOptions); by default
// both are off. The result holds the decoded bytes, and its `hadErrors` flag indicates whether
// errors were encountered in `text`.
| 201 | |
String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryCEscape(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
// Encode/decode bytes using C-style escapes. NOTE(review): presumably these are the backslash
// escape sequences of C string literals; the exact set is defined by the implementation, which is
// not in this header -- confirm there.
//
// The `ArrayPtr<const char>` encoder is an inline wrapper that forwards to the byte-based one.
// decodeCEscape() is an inline wrapper that calls decodeBinaryCEscape() with `nulTerminate` set
// to true, returns the decoded bytes as a String and passes `hadErrors` through.
//
// NOTE(review): `nulTerminate` presumably appends a terminal NUL byte to the decoded output, as
// with the other decoders in this file -- confirm against the implementation.
| 207 | |
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text, returned as a String. If `breakLines` is true, line
// breaks will be inserted into the output every 72 characters (e.g. for encoding e-mail bodies).
// By default no line breaks are inserted.
| 211 | |
EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.
//
// The decoded bytes are returned as an EncodingResult, so callers can inspect its `hadErrors`
// flag to learn whether errors were encountered.
| 215 | |
| 216 | // ======================================================================================= |
| 217 | // inline implementation details |
| 218 | |
| 219 | namespace _ { // private |
| 220 | |
| 221 | template <typename T> |
| 222 | NullableValue<T> readMaybe(EncodingResult<T>&& value) { |
| 223 | if (value.hadErrors) { |
| 224 | return nullptr; |
| 225 | } else { |
| 226 | return kj::mv(value); |
| 227 | } |
| 228 | } |
| 229 | |
| 230 | template <typename T> |
| 231 | T* readMaybe(EncodingResult<T>& value) { |
| 232 | if (value.hadErrors) { |
| 233 | return nullptr; |
| 234 | } else { |
| 235 | return &value; |
| 236 | } |
| 237 | } |
| 238 | |
| 239 | template <typename T> |
| 240 | const T* readMaybe(const EncodingResult<T>& value) { |
| 241 | if (value.hadErrors) { |
| 242 | return nullptr; |
| 243 | } else { |
| 244 | return &value; |
| 245 | } |
| 246 | } |
| 247 | |
| 248 | } // namespace _ (private) |
| 249 | |
| 250 | inline String encodeUriComponent(ArrayPtr<const char> text) { |
| 251 | return encodeUriComponent(text.asBytes()); |
| 252 | } |
| 253 | inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) { |
| 254 | auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true }); |
| 255 | return { String(result.releaseAsChars()), result.hadErrors }; |
| 256 | } |
| 257 | |
| 258 | inline String encodeUriFragment(ArrayPtr<const char> text) { |
| 259 | return encodeUriFragment(text.asBytes()); |
| 260 | } |
| 261 | inline String encodeUriPath(ArrayPtr<const char> text) { |
| 262 | return encodeUriPath(text.asBytes()); |
| 263 | } |
| 264 | inline String encodeUriUserInfo(ArrayPtr<const char> text) { |
| 265 | return encodeUriUserInfo(text.asBytes()); |
| 266 | } |
| 267 | |
| 268 | inline String encodeWwwForm(ArrayPtr<const char> text) { |
| 269 | return encodeWwwForm(text.asBytes()); |
| 270 | } |
| 271 | inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) { |
| 272 | auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true, |
| 273 | /*.plusToSpace=*/true }); |
| 274 | return { String(result.releaseAsChars()), result.hadErrors }; |
| 275 | } |
| 276 | |
| 277 | inline String encodeCEscape(ArrayPtr<const char> text) { |
| 278 | return encodeCEscape(text.asBytes()); |
| 279 | } |
| 280 | inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) { |
| 281 | auto result = decodeBinaryCEscape(text, true); |
| 282 | return { String(result.releaseAsChars()), result.hadErrors }; |
| 283 | } |
| 284 | |
// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
// only matters for encoding-test.c++.
| 288 | |
| 289 | template <size_t s> |
| 290 | inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) { |
| 291 | return encodeUtf16(arrayPtr(text, s - 1), nulTerminate); |
| 292 | } |
| 293 | template <size_t s> |
| 294 | inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) { |
| 295 | return encodeUtf32(arrayPtr(text, s - 1), nulTerminate); |
| 296 | } |
| 297 | template <size_t s> |
| 298 | inline EncodingResult<Array<wchar_t>> encodeWideString( |
| 299 | const char (&text)[s], bool nulTerminate=false) { |
| 300 | return encodeWideString(arrayPtr(text, s - 1), nulTerminate); |
| 301 | } |
| 302 | template <size_t s> |
| 303 | inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) { |
| 304 | return decodeUtf16(arrayPtr(utf16, s - 1)); |
| 305 | } |
| 306 | template <size_t s> |
| 307 | inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) { |
| 308 | return decodeUtf32(arrayPtr(utf32, s - 1)); |
| 309 | } |
| 310 | template <size_t s> |
| 311 | inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) { |
| 312 | return decodeWideString(arrayPtr(utf32, s - 1)); |
| 313 | } |
| 314 | template <size_t s> |
| 315 | inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) { |
| 316 | return decodeHex(arrayPtr(text, s - 1)); |
| 317 | } |
| 318 | template <size_t s> |
| 319 | inline String encodeUriComponent(const char (&text)[s]) { |
| 320 | return encodeUriComponent(arrayPtr(text, s - 1)); |
| 321 | } |
| 322 | template <size_t s> |
| 323 | inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) { |
| 324 | return decodeBinaryUriComponent(arrayPtr(text, s - 1)); |
| 325 | } |
| 326 | template <size_t s> |
| 327 | inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) { |
| 328 | return decodeUriComponent(arrayPtr(text, s-1)); |
| 329 | } |
| 330 | template <size_t s> |
| 331 | inline String encodeUriFragment(const char (&text)[s]) { |
| 332 | return encodeUriFragment(arrayPtr(text, s - 1)); |
| 333 | } |
| 334 | template <size_t s> |
| 335 | inline String encodeUriPath(const char (&text)[s]) { |
| 336 | return encodeUriPath(arrayPtr(text, s - 1)); |
| 337 | } |
| 338 | template <size_t s> |
| 339 | inline String encodeUriUserInfo(const char (&text)[s]) { |
| 340 | return encodeUriUserInfo(arrayPtr(text, s - 1)); |
| 341 | } |
| 342 | template <size_t s> |
| 343 | inline String encodeWwwForm(const char (&text)[s]) { |
| 344 | return encodeWwwForm(arrayPtr(text, s - 1)); |
| 345 | } |
| 346 | template <size_t s> |
| 347 | inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) { |
| 348 | return decodeWwwForm(arrayPtr(text, s-1)); |
| 349 | } |
| 350 | template <size_t s> |
| 351 | inline String encodeCEscape(const char (&text)[s]) { |
| 352 | return encodeCEscape(arrayPtr(text, s - 1)); |
| 353 | } |
| 354 | template <size_t s> |
| 355 | inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) { |
| 356 | return decodeBinaryCEscape(arrayPtr(text, s - 1)); |
| 357 | } |
| 358 | template <size_t s> |
| 359 | inline EncodingResult<String> decodeCEscape(const char (&text)[s]) { |
| 360 | return decodeCEscape(arrayPtr(text, s-1)); |
| 361 | } |
| 362 | template <size_t s> |
| 363 | EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) { |
| 364 | return decodeBase64(arrayPtr(text, s - 1)); |
| 365 | } |
| 366 | |
| 367 | } // namespace kj |
| 368 | |