| 1 | /**************************************************************************** | 
|---|
| 2 | ** | 
|---|
| 3 | ** Copyright (C) 2016 Intel Corporation. | 
|---|
| 4 | ** Contact: https://www.qt.io/licensing/ | 
|---|
| 5 | ** | 
|---|
| 6 | ** This file is part of the QtCore module of the Qt Toolkit. | 
|---|
| 7 | ** | 
|---|
| 8 | ** $QT_BEGIN_LICENSE:LGPL$ | 
|---|
| 9 | ** Commercial License Usage | 
|---|
| 10 | ** Licensees holding valid commercial Qt licenses may use this file in | 
|---|
| 11 | ** accordance with the commercial license agreement provided with the | 
|---|
| 12 | ** Software or, alternatively, in accordance with the terms contained in | 
|---|
| 13 | ** a written agreement between you and The Qt Company. For licensing terms | 
|---|
| 14 | ** and conditions see https://www.qt.io/terms-conditions. For further | 
|---|
| 15 | ** information use the contact form at https://www.qt.io/contact-us. | 
|---|
| 16 | ** | 
|---|
| 17 | ** GNU Lesser General Public License Usage | 
|---|
| 18 | ** Alternatively, this file may be used under the terms of the GNU Lesser | 
|---|
| 19 | ** General Public License version 3 as published by the Free Software | 
|---|
| 20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the | 
|---|
| 21 | ** packaging of this file. Please review the following information to | 
|---|
| 22 | ** ensure the GNU Lesser General Public License version 3 requirements | 
|---|
| 23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. | 
|---|
| 24 | ** | 
|---|
| 25 | ** GNU General Public License Usage | 
|---|
| 26 | ** Alternatively, this file may be used under the terms of the GNU | 
|---|
| 27 | ** General Public License version 2.0 or (at your option) the GNU General | 
|---|
| 28 | ** Public license version 3 or any later version approved by the KDE Free | 
|---|
| 29 | ** Qt Foundation. The licenses are as published by the Free Software | 
|---|
| 30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 | 
|---|
| 31 | ** included in the packaging of this file. Please review the following | 
|---|
| 32 | ** information to ensure the GNU General Public License requirements will | 
|---|
| 33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and | 
|---|
| 34 | ** https://www.gnu.org/licenses/gpl-3.0.html. | 
|---|
| 35 | ** | 
|---|
| 36 | ** $QT_END_LICENSE$ | 
|---|
| 37 | ** | 
|---|
| 38 | ****************************************************************************/ | 
|---|
| 39 |  | 
|---|
| 40 | #include "qurl.h" | 
|---|
| 41 | #include "private/qstringconverter_p.h" | 
|---|
| 42 | #include "private/qtools_p.h" | 
|---|
| 43 | #include "private/qsimd_p.h" | 
|---|
| 44 |  | 
|---|
| 45 | QT_BEGIN_NAMESPACE | 
|---|
| 46 |  | 
|---|
// ### move to qurl_p.h
// What recode() does with a given ASCII character. The numeric values are
// significant: the action tables below are written with the raw numbers and
// maskTable() combines them with 0x00/0xff masks.
enum EncodingAction {
    DecodeCharacter = 0, // an encoded (%XX) occurrence is decoded; a literal one is left as is
    LeaveCharacter = 1,  // left in whichever form (literal or %XX) it already has
    EncodeCharacter = 2  // a literal occurrence is percent-encoded; an encoded one is left as is
};
| 53 |  | 
|---|
// From RFC 3986, Appendix A Collected ABNF for URI
//    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
//    reserved      = gen-delims / sub-delims
//    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
//    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
//                  / "*" / "+" / "," / ";" / "="
//
// Default action for each character from U+0020 (space) to U+007F (DEL);
// index it with (c - ' '). The values are EncodingAction constants:
// 0 = DecodeCharacter, 1 = LeaveCharacter, 2 = EncodeCharacter.
static const uchar defaultActionTable[96] = {
    2, // space
    1, // '!' (sub-delim)
    2, // '"'
    1, // '#' (gen-delim)
    1, // '$' (sub-delim)
    2, // '%' (percent)
    1, // '&' (sub-delim)
    1, // "'" (sub-delim)
    1, // '(' (sub-delim)
    1, // ')' (sub-delim)
    1, // '*' (sub-delim)
    1, // '+' (sub-delim)
    1, // ',' (sub-delim)
    0, // '-' (unreserved)
    0, // '.' (unreserved)
    1, // '/' (gen-delim)

    0, 0, 0, 0, 0,  // '0' to '4' (unreserved)
    0, 0, 0, 0, 0,  // '5' to '9' (unreserved)
    1, // ':' (gen-delim)
    1, // ';' (sub-delim)
    2, // '<'
    1, // '=' (sub-delim)
    2, // '>'
    1, // '?' (gen-delim)

    1, // '@' (gen-delim)
    0, 0, 0, 0, 0,  // 'A' to 'E' (unreserved)
    0, 0, 0, 0, 0,  // 'F' to 'J' (unreserved)
    0, 0, 0, 0, 0,  // 'K' to 'O' (unreserved)
    0, 0, 0, 0, 0,  // 'P' to 'T' (unreserved)
    0, 0, 0, 0, 0, 0,  // 'U' to 'Z' (unreserved)
    1, // '[' (gen-delim)
    2, // '\'
    1, // ']' (gen-delim)
    2, // '^'
    0, // '_' (unreserved)

    2, // '`'
    0, 0, 0, 0, 0,  // 'a' to 'e' (unreserved)
    0, 0, 0, 0, 0,  // 'f' to 'j' (unreserved)
    0, 0, 0, 0, 0,  // 'k' to 'o' (unreserved)
    0, 0, 0, 0, 0,  // 'p' to 't' (unreserved)
    0, 0, 0, 0, 0, 0,  // 'u' to 'z' (unreserved)
    2, // '{'
    2, // '|'
    2, // '}'
    0, // '~' (unreserved)

    2  // DEL (U+007F)
};
| 112 |  | 
|---|
// mask tables, in negative polarity
// 0x00 if it belongs to this category
// 0xff if it doesn't
//
// A mask is ANDed entry by entry onto a copy of the action table (see
// maskTable()): 0xff keeps the entry unchanged, 0x00 turns it into 0, that is
// DecodeCharacter. Like the action table, it is indexed with (c - ' ').

// Applied by qt_urlRecode() when QUrl::DecodeReserved is requested.
static const uchar reservedMask[96] = {
    0xff, // space
    0xff, // '!' (sub-delim)
    0x00, // '"'
    0xff, // '#' (gen-delim)
    0xff, // '$' (sub-delim)
    0xff, // '%' (percent)
    0xff, // '&' (sub-delim)
    0xff, // "'" (sub-delim)
    0xff, // '(' (sub-delim)
    0xff, // ')' (sub-delim)
    0xff, // '*' (sub-delim)
    0xff, // '+' (sub-delim)
    0xff, // ',' (sub-delim)
    0xff, // '-' (unreserved)
    0xff, // '.' (unreserved)
    0xff, // '/' (gen-delim)

    0xff, 0xff, 0xff, 0xff, 0xff,  // '0' to '4' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // '5' to '9' (unreserved)
    0xff, // ':' (gen-delim)
    0xff, // ';' (sub-delim)
    0x00, // '<'
    0xff, // '=' (sub-delim)
    0x00, // '>'
    0xff, // '?' (gen-delim)

    0xff, // '@' (gen-delim)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'A' to 'E' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'F' to 'J' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'K' to 'O' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'P' to 'T' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  // 'U' to 'Z' (unreserved)
    0xff, // '[' (gen-delim)
    0x00, // '\'
    0xff, // ']' (gen-delim)
    0x00, // '^'
    0xff, // '_' (unreserved)

    0x00, // '`'
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'a' to 'e' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'f' to 'j' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'k' to 'o' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff,  // 'p' to 't' (unreserved)
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  // 'u' to 'z' (unreserved)
    0x00, // '{'
    0x00, // '|'
    0x00, // '}'
    0xff, // '~' (unreserved)

    0xff  // DEL (U+007F)
};
| 169 |  | 
|---|
// Returns true if \a c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
static inline bool isHex(ushort c)
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'A' && c <= 'F')
        return true;
    return c >= 'a' && c <= 'f';
}
| 176 |  | 
|---|
// Tells whether the hex digit \a c needs no uppercasing: decimal digits and
// 'A'-'F' all sit below 0x60 in ASCII, while 'a'-'f' sit above it.
// The answer is only meaningful if c is a hex digit (see isHex()).
static inline bool isUpperHex(ushort c)
{
    const ushort firstLowercaseColumn = 0x60;
    return c < firstLowercaseColumn;
}
| 182 |  | 
|---|
| 183 | static inline ushort toUpperHex(ushort c) | 
|---|
| 184 | { | 
|---|
| 185 | return isUpperHex(c) ? c : c - 0x20; | 
|---|
| 186 | } | 
|---|
| 187 |  | 
|---|
// Returns the value (0-15) of the hex digit \a c. The caller must have checked
// that c is a hex digit; no validation is done here.
static inline ushort decodeNibble(ushort c)
{
    if (c >= 'a')
        return c - 'a' + 0xA;
    if (c >= 'A')
        return c - 'A' + 0xA;
    return c - '0';
}
| 193 |  | 
|---|
| 194 | // if the sequence at input is 2*HEXDIG, returns its decoding | 
|---|
| 195 | // returns -1 if it isn't. | 
|---|
| 196 | // assumes that the range has been checked already | 
|---|
| 197 | static inline ushort decodePercentEncoding(const ushort *input) | 
|---|
| 198 | { | 
|---|
| 199 | ushort c1 = input[1]; | 
|---|
| 200 | ushort c2 = input[2]; | 
|---|
| 201 | if (!isHex(c1) || !isHex(c2)) | 
|---|
| 202 | return ushort(-1); | 
|---|
| 203 | return decodeNibble(c1) << 4 | decodeNibble(c2); | 
|---|
| 204 | } | 
|---|
| 205 |  | 
|---|
| 206 | static inline ushort encodeNibble(ushort c) | 
|---|
| 207 | { | 
|---|
| 208 | return ushort(QtMiscUtils::toHexUpper(c)); | 
|---|
| 209 | } | 
|---|
| 210 |  | 
|---|
| 211 | static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end, | 
|---|
| 212 | int add = 0) | 
|---|
| 213 | { | 
|---|
| 214 | if (!output) { | 
|---|
| 215 | // now detach | 
|---|
| 216 | // create enough space if the rest of the string needed to be percent-encoded | 
|---|
| 217 | int charsProcessed = input - begin; | 
|---|
| 218 | int charsRemaining = end - input; | 
|---|
| 219 | int spaceNeeded = end - begin + 2 * charsRemaining + add; | 
|---|
| 220 | int origSize = result.size(); | 
|---|
| 221 | result.resize(origSize + spaceNeeded); | 
|---|
| 222 |  | 
|---|
| 223 | // we know that resize() above detached, so we bypass the reference count check | 
|---|
| 224 | output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())) | 
|---|
| 225 | + origSize; | 
|---|
| 226 |  | 
|---|
| 227 | // copy the chars we've already processed | 
|---|
| 228 | int i; | 
|---|
| 229 | for (i = 0; i < charsProcessed; ++i) | 
|---|
| 230 | output[i] = begin[i]; | 
|---|
| 231 | output += i; | 
|---|
| 232 | } | 
|---|
| 233 | } | 
|---|
| 234 |  | 
|---|
| 235 | namespace { | 
|---|
| 236 | struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii | 
|---|
| 237 | { | 
|---|
| 238 | // From RFC 3987: | 
|---|
| 239 | //    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar | 
|---|
| 240 | // | 
|---|
| 241 | //    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF | 
|---|
| 242 | //                   / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD | 
|---|
| 243 | //                   / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD | 
|---|
| 244 | //                   / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD | 
|---|
| 245 | //                   / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD | 
|---|
| 246 | //                   / %xD0000-DFFFD / %xE1000-EFFFD | 
|---|
| 247 | // | 
|---|
| 248 | //    iprivate       = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD | 
|---|
| 249 | // | 
|---|
| 250 | // That RFC allows iprivate only as part of iquery, but we don't know here | 
|---|
| 251 | // whether we're looking at a query or another part of an URI, so we accept | 
|---|
| 252 | // them too. The definition above excludes U+FFF0 to U+FFFD from appearing | 
|---|
| 253 | // unencoded, but we see no reason for its exclusion, so we allow them to | 
|---|
| 254 | // be decoded (and we need U+FFFD the replacement character to indicate | 
|---|
| 255 | // failure to decode). | 
|---|
| 256 | // | 
|---|
| 257 | // That means we must disallow: | 
|---|
| 258 | //  * unpaired surrogates (QUtf8Functions takes care of that for us) | 
|---|
| 259 | //  * non-characters | 
|---|
| 260 | static const bool allowNonCharacters = false; | 
|---|
| 261 |  | 
|---|
| 262 | // override: our "bytes" are three percent-encoded UTF-16 characters | 
|---|
| 263 | static void appendByte(ushort *&ptr, uchar b) | 
|---|
| 264 | { | 
|---|
| 265 | // b >= 0x80, by construction, so percent-encode | 
|---|
| 266 | *ptr++ = '%'; | 
|---|
| 267 | *ptr++ = encodeNibble(b >> 4); | 
|---|
| 268 | *ptr++ = encodeNibble(b & 0xf); | 
|---|
| 269 | } | 
|---|
| 270 |  | 
|---|
| 271 | static uchar peekByte(const ushort *ptr, qsizetype n = 0) | 
|---|
| 272 | { | 
|---|
| 273 | // decodePercentEncoding returns ushort(-1) if it can't decode, | 
|---|
| 274 | // which means we return 0xff, which is not a valid continuation byte. | 
|---|
| 275 | // If ptr[i * 3] is not '%', we'll multiply by zero and return 0, | 
|---|
| 276 | // also not a valid continuation byte (if it's '%', we multiply by 1). | 
|---|
| 277 | return uchar(decodePercentEncoding(ptr + n * 3)) | 
|---|
| 278 | * uchar(ptr[n * 3] == '%'); | 
|---|
| 279 | } | 
|---|
| 280 |  | 
|---|
| 281 | static qptrdiff availableBytes(const ushort *ptr, const ushort *end) | 
|---|
| 282 | { | 
|---|
| 283 | return (end - ptr) / 3; | 
|---|
| 284 | } | 
|---|
| 285 |  | 
|---|
| 286 | static void advanceByte(const ushort *&ptr, int n = 1) | 
|---|
| 287 | { | 
|---|
| 288 | ptr += n * 3; | 
|---|
| 289 | } | 
|---|
| 290 | }; | 
|---|
| 291 | } | 
|---|
| 292 |  | 
|---|
| 293 | // returns true if we performed an UTF-8 decoding | 
|---|
| 294 | static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input, | 
|---|
| 295 | const ushort *end, ushort decoded) | 
|---|
| 296 | { | 
|---|
| 297 | uint ucs4 = 0, *dst = &ucs4; | 
|---|
| 298 | const ushort *src = input + 3;// skip the %XX that yielded \a decoded | 
|---|
| 299 | int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(decoded, dst, src, end); | 
|---|
| 300 | if (charsNeeded < 0) | 
|---|
| 301 | return false; | 
|---|
| 302 |  | 
|---|
| 303 | if (!QChar::requiresSurrogates(ucs4)) { | 
|---|
| 304 | // UTF-8 decoded and no surrogates are required | 
|---|
| 305 | // detach if necessary | 
|---|
| 306 | // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char | 
|---|
| 307 | ensureDetached(result, output, begin, input, end, -3 * charsNeeded + 1); | 
|---|
| 308 | *output++ = ucs4; | 
|---|
| 309 | } else { | 
|---|
| 310 | // UTF-8 decoded to something that requires a surrogate pair | 
|---|
| 311 | // compressing from %XX%XX%XX%XX (12 chars) to two | 
|---|
| 312 | ensureDetached(result, output, begin, input, end, -10); | 
|---|
| 313 | *output++ = QChar::highSurrogate(ucs4); | 
|---|
| 314 | *output++ = QChar::lowSurrogate(ucs4); | 
|---|
| 315 | } | 
|---|
| 316 |  | 
|---|
| 317 | input = src - 1; | 
|---|
| 318 | return true; | 
|---|
| 319 | } | 
|---|
| 320 |  | 
|---|
| 321 | static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin, | 
|---|
| 322 | const ushort *&input, const ushort *end, ushort decoded) | 
|---|
| 323 | { | 
|---|
| 324 | // calculate the utf8 length and ensure enough space is available | 
|---|
| 325 | int utf8len = QChar::isHighSurrogate(decoded) ? 4 : decoded >= 0x800 ? 3 : 2; | 
|---|
| 326 |  | 
|---|
| 327 | // detach | 
|---|
| 328 | if (!output) { | 
|---|
| 329 | // we need 3 * utf8len for the encoded UTF-8 sequence | 
|---|
| 330 | // but ensureDetached already adds 3 for the char we're processing | 
|---|
| 331 | ensureDetached(result, output, begin, input, end, 3*utf8len - 3); | 
|---|
| 332 | } else { | 
|---|
| 333 | // verify that there's enough space or expand | 
|---|
| 334 | int charsRemaining = end - input - 1; // not including this one | 
|---|
| 335 | int pos = output - reinterpret_cast<const ushort *>(result.constData()); | 
|---|
| 336 | int spaceRemaining = result.size() - pos; | 
|---|
| 337 | if (spaceRemaining < 3*charsRemaining + 3*utf8len) { | 
|---|
| 338 | // must resize | 
|---|
| 339 | result.resize(result.size() + 3*utf8len); | 
|---|
| 340 |  | 
|---|
| 341 | // we know that resize() above detached, so we bypass the reference count check | 
|---|
| 342 | output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())); | 
|---|
| 343 | output += pos; | 
|---|
| 344 | } | 
|---|
| 345 | } | 
|---|
| 346 |  | 
|---|
| 347 | ++input; | 
|---|
| 348 | int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(decoded, output, input, end); | 
|---|
| 349 | --input; | 
|---|
| 350 | if (res < 0) { | 
|---|
| 351 | // bad surrogate pair sequence | 
|---|
| 352 | // we will encode bad UTF-16 to UTF-8 | 
|---|
| 353 | // but they don't get decoded back | 
|---|
| 354 |  | 
|---|
| 355 | // first of three bytes | 
|---|
| 356 | uchar c = 0xe0 | uchar(decoded >> 12); | 
|---|
| 357 | *output++ = '%'; | 
|---|
| 358 | *output++ = 'E'; | 
|---|
| 359 | *output++ = encodeNibble(c & 0xf); | 
|---|
| 360 |  | 
|---|
| 361 | // second byte | 
|---|
| 362 | c = 0x80 | (uchar(decoded >> 6) & 0x3f); | 
|---|
| 363 | *output++ = '%'; | 
|---|
| 364 | *output++ = encodeNibble(c >> 4); | 
|---|
| 365 | *output++ = encodeNibble(c & 0xf); | 
|---|
| 366 |  | 
|---|
| 367 | // third byte | 
|---|
| 368 | c = 0x80 | (decoded & 0x3f); | 
|---|
| 369 | *output++ = '%'; | 
|---|
| 370 | *output++ = encodeNibble(c >> 4); | 
|---|
| 371 | *output++ = encodeNibble(c & 0xf); | 
|---|
| 372 | } | 
|---|
| 373 | } | 
|---|
| 374 |  | 
|---|
| 375 | static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding, | 
|---|
| 376 | const uchar *actionTable, bool retryBadEncoding) | 
|---|
| 377 | { | 
|---|
| 378 | const int origSize = result.size(); | 
|---|
| 379 | const ushort *input = begin; | 
|---|
| 380 | ushort *output = nullptr; | 
|---|
| 381 |  | 
|---|
| 382 | EncodingAction action = EncodeCharacter; | 
|---|
| 383 | for ( ; input != end; ++input) { | 
|---|
| 384 | ushort c; | 
|---|
| 385 | // try a run where no change is necessary | 
|---|
| 386 | for ( ; input != end; ++input) { | 
|---|
| 387 | c = *input; | 
|---|
| 388 | if (c < 0x20U) | 
|---|
| 389 | action = EncodeCharacter; | 
|---|
| 390 | if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U) | 
|---|
| 391 | goto non_trivial; | 
|---|
| 392 | action = EncodingAction(actionTable[c - ' ']); | 
|---|
| 393 | if (action == EncodeCharacter) | 
|---|
| 394 | goto non_trivial; | 
|---|
| 395 | if (output) | 
|---|
| 396 | *output++ = c; | 
|---|
| 397 | } | 
|---|
| 398 | break; | 
|---|
| 399 |  | 
|---|
| 400 | non_trivial: | 
|---|
| 401 | uint decoded; | 
|---|
| 402 | if (c == '%' && retryBadEncoding) { | 
|---|
| 403 | // always write "%25" | 
|---|
| 404 | ensureDetached(result, output, begin, input, end); | 
|---|
| 405 | *output++ = '%'; | 
|---|
| 406 | *output++ = '2'; | 
|---|
| 407 | *output++ = '5'; | 
|---|
| 408 | continue; | 
|---|
| 409 | } else if (c == '%') { | 
|---|
| 410 | // check if the input is valid | 
|---|
| 411 | if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) { | 
|---|
| 412 | // not valid, retry | 
|---|
| 413 | result.resize(origSize); | 
|---|
| 414 | return recode(result, begin, end, encoding, actionTable, true); | 
|---|
| 415 | } | 
|---|
| 416 |  | 
|---|
| 417 | if (decoded >= 0x80) { | 
|---|
| 418 | // decode the UTF-8 sequence | 
|---|
| 419 | if (!(encoding & QUrl::EncodeUnicode) && | 
|---|
| 420 | encodedUtf8ToUtf16(result, output, begin, input, end, decoded)) | 
|---|
| 421 | continue; | 
|---|
| 422 |  | 
|---|
| 423 | // decoding the encoded UTF-8 failed | 
|---|
| 424 | action = LeaveCharacter; | 
|---|
| 425 | } else if (decoded >= 0x20) { | 
|---|
| 426 | action = EncodingAction(actionTable[decoded - ' ']); | 
|---|
| 427 | } | 
|---|
| 428 | } else { | 
|---|
| 429 | decoded = c; | 
|---|
| 430 | if (decoded >= 0x80 && encoding & QUrl::EncodeUnicode) { | 
|---|
| 431 | // encode the UTF-8 sequence | 
|---|
| 432 | unicodeToEncodedUtf8(result, output, begin, input, end, decoded); | 
|---|
| 433 | continue; | 
|---|
| 434 | } else if (decoded >= 0x80) { | 
|---|
| 435 | if (output) | 
|---|
| 436 | *output++ = c; | 
|---|
| 437 | continue; | 
|---|
| 438 | } | 
|---|
| 439 | } | 
|---|
| 440 |  | 
|---|
| 441 | // there are six possibilities: | 
|---|
| 442 | //  current \ action  | DecodeCharacter | LeaveCharacter | EncodeCharacter | 
|---|
| 443 | //      decoded       |    1:leave      |    2:leave     |    3:encode | 
|---|
| 444 | //      encoded       |    4:decode     |    5:leave     |    6:leave | 
|---|
| 445 | // cases 1 and 2 were handled before this section | 
|---|
| 446 |  | 
|---|
| 447 | if (c == '%' && action != DecodeCharacter) { | 
|---|
| 448 | // cases 5 and 6: it's encoded and we're leaving it as it is | 
|---|
| 449 | // except we're pedantic and we'll uppercase the hex | 
|---|
| 450 | if (output || !isUpperHex(input[1]) || !isUpperHex(input[2])) { | 
|---|
| 451 | ensureDetached(result, output, begin, input, end); | 
|---|
| 452 | *output++ = '%'; | 
|---|
| 453 | *output++ = toUpperHex(*++input); | 
|---|
| 454 | *output++ = toUpperHex(*++input); | 
|---|
| 455 | } | 
|---|
| 456 | } else if (c == '%' && action == DecodeCharacter) { | 
|---|
| 457 | // case 4: we need to decode | 
|---|
| 458 | ensureDetached(result, output, begin, input, end); | 
|---|
| 459 | *output++ = decoded; | 
|---|
| 460 | input += 2; | 
|---|
| 461 | } else { | 
|---|
| 462 | // must be case 3: we need to encode | 
|---|
| 463 | ensureDetached(result, output, begin, input, end); | 
|---|
| 464 | *output++ = '%'; | 
|---|
| 465 | *output++ = encodeNibble(c >> 4); | 
|---|
| 466 | *output++ = encodeNibble(c & 0xf); | 
|---|
| 467 | } | 
|---|
| 468 | } | 
|---|
| 469 |  | 
|---|
| 470 | if (output) { | 
|---|
| 471 | int len = output - reinterpret_cast<const ushort *>(result.constData()); | 
|---|
| 472 | result.truncate(len); | 
|---|
| 473 | return len - origSize; | 
|---|
| 474 | } | 
|---|
| 475 | return 0; | 
|---|
| 476 | } | 
|---|
| 477 |  | 
|---|
/*
 * Returns true if the input it checked (if it checked anything) is not
 * encoded. A return of false indicates there's a percent at \a input that
 * needs to be decoded.
 *
 * While searching for '%', the characters examined are copied from \a input
 * to \a output a whole vector at a time. On return both pointers have been
 * advanced by the same amount: up to the first '%' if one was found, or past
 * everything that was examined otherwise. The vector stores may also write
 * characters located at or after the '%'; those land beyond the returned
 * \a output position.
 *
 * If fewer than 4 characters remain, nothing is examined. After the 16-wide
 * loop, up to 15 characters may be left unexamined. In both cases the caller
 * has to process the tail itself.
 *
 * Without SSE2 this is a stub that examines nothing and returns true.
 */
#ifdef __SSE2__
static bool simdCheckNonEncoded(QChar *&output, const char16_t *&input, const char16_t *end)
{
#  ifdef __AVX2__
    const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('%'));
    const __m128i percents = _mm256_castsi256_si128(percents256);
#  else
    const __m128i percents = _mm_set1_epi16('%');
#  endif

    // idx: how many characters precede the first '%' in the last vector examined
    // mask: byte mask of the comparison, two bits per UTF-16 character
    uint idx = 0;
    quint32 mask = 0;
    if (input + 16 <= end) {
        qptrdiff offset = 0;
        for ( ; input + offset + 16 <= end; offset += 16) {
#  ifdef __AVX2__
            // do 32 bytes at a time using AVX2
            __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset));
            __m256i comparison = _mm256_cmpeq_epi16(data, percents256);
            mask = _mm256_movemask_epi8(comparison);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data);
#  else
            // do 32 bytes at a time using unrolled SSE2
            __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + offset));
            __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + offset + 8));
            __m128i comparison1 = _mm_cmpeq_epi16(data1, percents);
            __m128i comparison2 = _mm_cmpeq_epi16(data2, percents);
            uint mask1 = _mm_movemask_epi8(comparison1);
            uint mask2 = _mm_movemask_epi8(comparison2);

            // the second half is only stored if the first half had no '%'
            _mm_storeu_si128(reinterpret_cast<__m128i *>(output + offset), data1);
            if (!mask1)
                _mm_storeu_si128(reinterpret_cast<__m128i *>(output + offset + 8), data2);
            mask = mask1 | (mask2 << 16);
#  endif

            if (mask) {
                // two mask bits per character, hence the division
                idx = qCountTrailingZeroBits(mask) / 2;
                break;
            }
        }

        input += offset;
        // NOTE(review): output was already dereferenced by the stores above and
        // is advanced unconditionally below, so a null output is not supported.
        if (output)
            output += offset;
    } else if (input + 8 <= end) {
        // do 16 bytes at a time
        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input));
        __m128i comparison = _mm_cmpeq_epi16(data, percents);
        mask = _mm_movemask_epi8(comparison);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
        // with no '%' found the 16-bit mask is zero and idx becomes 8: skip them all
        idx = qCountTrailingZeroBits(quint16(mask)) / 2;
    } else if (input + 4 <= end) {
        // do 8 bytes only
        __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(input));
        __m128i comparison = _mm_cmpeq_epi16(data, percents);
        mask = _mm_movemask_epi8(comparison) & 0xffu;
        _mm_storel_epi64(reinterpret_cast<__m128i *>(output), data);
        // with no '%' found the 8-bit mask is zero and idx becomes 4: skip them all
        idx = qCountTrailingZeroBits(quint8(mask)) / 2;
    } else {
        // no percents found (because we didn't check)
        return true;
    }

    // advance to the '%' that was found, or past the vector examined above
    input += idx;
    output += idx;

    return !mask;
}
#else
static bool simdCheckNonEncoded(...)
{
    return true;
}
#endif
| 559 |  | 
|---|
| 560 | /*! | 
|---|
| 561 | \since 5.0 | 
|---|
| 562 | \internal | 
|---|
| 563 |  | 
|---|
| 564 | This function decodes a percent-encoded string located in \a in | 
|---|
| 565 | by appending each character to \a appendTo. It returns the number of | 
|---|
| 566 | characters appended. Each percent-encoded sequence is decoded as follows: | 
|---|
| 567 |  | 
|---|
| 568 | \list | 
|---|
| 569 | \li from %00 to %7F: the exact decoded value is appended; | 
|---|
| 570 | \li from %80 to %FF: QChar::ReplacementCharacter is appended; | 
|---|
| 571 | \li bad encoding: original input is copied to the output, undecoded. | 
|---|
| 572 | \endlist | 
|---|
| 573 |  | 
|---|
| 574 | Given the above, it's important for the input to already have all UTF-8 | 
|---|
| 575 | percent sequences decoded by qt_urlRecode (that is, the input should not | 
|---|
| 576 | have been processed with QUrl::EncodeUnicode). | 
|---|
| 577 |  | 
|---|
| 578 | The input should also be a valid percent-encoded sequence (the output of | 
|---|
| 579 | qt_urlRecode is always valid). | 
|---|
| 580 | */ | 
|---|
| 581 | static qsizetype decode(QString &appendTo, QStringView in) | 
|---|
| 582 | { | 
|---|
| 583 | const char16_t *begin = in.utf16(); | 
|---|
| 584 | const char16_t *end = begin + in.size(); | 
|---|
| 585 |  | 
|---|
| 586 | // fast check whether there's anything to be decoded in the first place | 
|---|
| 587 | const char16_t *input = QtPrivate::qustrchr(in, '%'); | 
|---|
| 588 |  | 
|---|
| 589 | if (Q_LIKELY(input == end)) | 
|---|
| 590 | return 0;           // nothing to do, it was already decoded! | 
|---|
| 591 |  | 
|---|
| 592 | // detach | 
|---|
| 593 | const int origSize = appendTo.size(); | 
|---|
| 594 | appendTo.resize(origSize + (end - begin)); | 
|---|
| 595 | QChar *output = appendTo.data() + origSize; | 
|---|
| 596 | memcpy(static_cast<void *>(output), static_cast<const void *>(begin), (input - begin) * sizeof(QChar)); | 
|---|
| 597 | output += input - begin; | 
|---|
| 598 |  | 
|---|
| 599 | while (input != end) { | 
|---|
| 600 | // something was encoded | 
|---|
| 601 | Q_ASSERT(*input == '%'); | 
|---|
| 602 |  | 
|---|
| 603 | if (Q_UNLIKELY(end - input < 3 || !isHex(input[1]) || !isHex(input[2]))) { | 
|---|
| 604 | // badly-encoded data | 
|---|
| 605 | appendTo.resize(origSize + (end - begin)); | 
|---|
| 606 | memcpy(static_cast<void *>(appendTo.begin() + origSize), static_cast<const void *>(begin), (end - begin) * sizeof(ushort)); | 
|---|
| 607 | return end - begin; | 
|---|
| 608 | } | 
|---|
| 609 |  | 
|---|
| 610 | ++input; | 
|---|
| 611 | *output++ = QChar::fromUcs2(decodeNibble(input[0]) << 4 | decodeNibble(input[1])); | 
|---|
| 612 | if (output[-1].unicode() >= 0x80) | 
|---|
| 613 | output[-1] = QChar::ReplacementCharacter; | 
|---|
| 614 | input += 2; | 
|---|
| 615 |  | 
|---|
| 616 | // search for the next percent, copying from input to output | 
|---|
| 617 | if (simdCheckNonEncoded(output, input, end)) { | 
|---|
| 618 | while (input != end) { | 
|---|
| 619 | const char16_t uc = *input; | 
|---|
| 620 | if (uc == '%') | 
|---|
| 621 | break; | 
|---|
| 622 | *output++ = uc; | 
|---|
| 623 | ++input; | 
|---|
| 624 | } | 
|---|
| 625 | } | 
|---|
| 626 | } | 
|---|
| 627 |  | 
|---|
| 628 | const qsizetype len = output - appendTo.begin(); | 
|---|
| 629 | appendTo.truncate(len); | 
|---|
| 630 | return len - origSize; | 
|---|
| 631 | } | 
|---|
| 632 |  | 
|---|
// ANDs every entry of \a table with the entry of \a mask at the same index.
// With the 0x00/0xff masks of this file, a 0x00 entry resets the action to 0
// and a 0xff entry keeps it.
template <size_t N>
static void maskTable(uchar (&table)[N], const uchar (&mask)[N])
{
    size_t index = 0;
    for (uchar &entry : table)
        entry &= mask[index++];
}
| 639 |  | 
|---|
| 640 | /*! | 
|---|
| 641 | \internal | 
|---|
| 642 |  | 
|---|
| 643 | Recodes the string from \a begin to \a end. If any transformations are | 
|---|
| 644 | done, append them to \a appendTo and return the number of characters added. | 
|---|
| 645 | If no transformations were required, return 0. | 
|---|
| 646 |  | 
|---|
| 647 | The \a encoding option modifies the default behaviour: | 
|---|
| 648 | \list | 
|---|
| 649 | \li QUrl::DecodeReserved: if set, reserved characters will be decoded; | 
|---|
| 650 | if unset, reserved characters will be encoded | 
|---|
| 651 | \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " " | 
|---|
| 652 | \li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8 | 
|---|
| 653 | percent-encoded form; if unset, they will be decoded to UTF-16 | 
|---|
| 654 | \li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences, | 
|---|
| 655 | including that of the percent character. The resulting string | 
|---|
| 656 | will not be percent-encoded anymore. Use with caution! | 
|---|
| 657 | In this mode, the behaviour is undefined if the input string | 
|---|
| 658 | contains any percent-encoding sequences above %80. | 
|---|
| 659 | Also, the function will not correct bad % sequences. | 
|---|
| 660 | \endlist | 
|---|
| 661 |  | 
|---|
| 662 | Other flags are ignored (including QUrl::EncodeReserved). | 
|---|
| 663 |  | 
|---|
| 664 | The \a tableModifications argument can be used to supply extra | 
|---|
| 665 | modifications to the tables, to be applied after the flags above are | 
|---|
| 666 | handled. It consists of a sequence of 16-bit values, where the low 8 bits | 
|---|
| 667 | indicate the character in question and the high 8 bits are either \c | 
|---|
| 668 | EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter. | 
|---|
| 669 |  | 
|---|
| 670 | This function corrects percent-encoded errors by interpreting every '%' as | 
|---|
| 671 | meaning "%25" (all percents in the same content). | 
|---|
| 672 | */ | 
|---|
| 673 |  | 
|---|
| 674 | Q_AUTOTEST_EXPORT qsizetype | 
|---|
| 675 | qt_urlRecode(QString &appendTo, QStringView in, | 
|---|
| 676 | QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications) | 
|---|
| 677 | { | 
|---|
| 678 | uchar actionTable[sizeof defaultActionTable]; | 
|---|
| 679 | if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) { | 
|---|
| 680 | return int(decode(appendTo, in)); | 
|---|
| 681 | } | 
|---|
| 682 |  | 
|---|
| 683 | memcpy(actionTable, defaultActionTable, sizeof actionTable); | 
|---|
| 684 | if (encoding & QUrl::DecodeReserved) | 
|---|
| 685 | maskTable(actionTable, reservedMask); | 
|---|
| 686 | if (!(encoding & QUrl::EncodeSpaces)) | 
|---|
| 687 | actionTable[0] = DecodeCharacter; // decode | 
|---|
| 688 |  | 
|---|
| 689 | if (tableModifications) { | 
|---|
| 690 | for (const ushort *p = tableModifications; *p; ++p) | 
|---|
| 691 | actionTable[uchar(*p) - ' '] = *p >> 8; | 
|---|
| 692 | } | 
|---|
| 693 |  | 
|---|
| 694 | return recode(appendTo, reinterpret_cast<const ushort *>(in.begin()), reinterpret_cast<const ushort *>(in.end()), | 
|---|
| 695 | encoding, actionTable, false); | 
|---|
| 696 | } | 
|---|
| 697 |  | 
|---|
| 698 | QT_END_NAMESPACE | 
|---|
| 699 |  | 
|---|