qurlrecode.cpp source code [Qt/src/corelib/io/qurlrecode.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2016 Intel Corporation.
4	** Contact: https://www.qt.io/licensing/
5	**
6	** This file is part of the QtCore module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial License Usage
10	** Licensees holding valid commercial Qt licenses may use this file in
11	** accordance with the commercial license agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and The Qt Company. For licensing terms
14	** and conditions see https://www.qt.io/terms-conditions. For further
15	** information use the contact form at https://www.qt.io/contact-us.
16	**
17	** GNU Lesser General Public License Usage
18	** Alternatively, this file may be used under the terms of the GNU Lesser
19	** General Public License version 3 as published by the Free Software
20	** Foundation and appearing in the file LICENSE.LGPL3 included in the
21	** packaging of this file. Please review the following information to
22	** ensure the GNU Lesser General Public License version 3 requirements
23	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24	**
25	** GNU General Public License Usage
26	** Alternatively, this file may be used under the terms of the GNU
27	** General Public License version 2.0 or (at your option) the GNU General
28	** Public license version 3 or any later version approved by the KDE Free
29	** Qt Foundation. The licenses are as published by the Free Software
30	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31	** included in the packaging of this file. Please review the following
32	** information to ensure the GNU General Public License requirements will
33	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34	** https://www.gnu.org/licenses/gpl-3.0.html.
35	**
36	** $QT_END_LICENSE$
37	**
38	****************************************************************************/
39
40	#include "qurl.h"
41	#include "private/qstringconverter_p.h"
42	#include "private/qtools_p.h"
43	#include "private/qsimd_p.h"
44
45	QT_BEGIN_NAMESPACE
46
47	// ### move to qurl_p.h
// What recode() should do with a character. The values are stored in the
// uchar action tables below, and DecodeCharacter must stay 0 so that AND-ing a
// table entry with a 0x00 mask turns it into "decode".
enum EncodingAction {
    DecodeCharacter = 0,    // emit the literal character
    LeaveCharacter = 1,     // keep whatever form (literal or %XX) the input has
    EncodeCharacter = 2     // emit the percent-encoded form
};
53
// From RFC 3986, Appendix A Collected ABNF for URI
//    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
//    reserved      = gen-delims / sub-delims
//    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
//    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
//                  / "*" / "+" / "," / ";" / "="
60	static const uchar defaultActionTable[`96`] = {
61	`2`, // space
62	`1`, // '!' (sub-delim)
63	`2`, // '"'
64	`1`, // '#' (gen-delim)
65	`1`, // '$' (gen-delim)
66	`2`, // '%' (percent)
67	`1`, // '&' (gen-delim)
68	`1`, // "'" (sub-delim)
69	`1`, // '(' (sub-delim)
70	`1`, // ')' (sub-delim)
71	`1`, // '' (sub-delim)*
72	`1`, // '+' (sub-delim)
73	`1`, // ',' (sub-delim)
74	`0`, // '-' (unreserved)
75	`0`, // '.' (unreserved)
76	`1`, // '/' (gen-delim)
77
78	`0`, `0`, `0`, `0`, `0`, // '0' to '4' (unreserved)
79	`0`, `0`, `0`, `0`, `0`, // '5' to '9' (unreserved)
80	`1`, // ':' (gen-delim)
81	`1`, // ';' (sub-delim)
82	`2`, // '<'
83	`1`, // '=' (sub-delim)
84	`2`, // '>'
85	`1`, // '?' (gen-delim)
86
87	`1`, // '@' (gen-delim)
88	`0`, `0`, `0`, `0`, `0`, // 'A' to 'E' (unreserved)
89	`0`, `0`, `0`, `0`, `0`, // 'F' to 'J' (unreserved)
90	`0`, `0`, `0`, `0`, `0`, // 'K' to 'O' (unreserved)
91	`0`, `0`, `0`, `0`, `0`, // 'P' to 'T' (unreserved)
92	`0`, `0`, `0`, `0`, `0`, `0`, // 'U' to 'Z' (unreserved)
93	`1`, // '[' (gen-delim)
94	`2`, // '\'
95	`1`, // ']' (gen-delim)
96	`2`, // '^'
97	`0`, // '_' (unreserved)
98
99	`2`, // '`'
100	`0`, `0`, `0`, `0`, `0`, // 'a' to 'e' (unreserved)
101	`0`, `0`, `0`, `0`, `0`, // 'f' to 'j' (unreserved)
102	`0`, `0`, `0`, `0`, `0`, // 'k' to 'o' (unreserved)
103	`0`, `0`, `0`, `0`, `0`, // 'p' to 't' (unreserved)
104	`0`, `0`, `0`, `0`, `0`, `0`, // 'u' to 'z' (unreserved)
105	`2`, // '{'
106	`2`, // '\|'
107	`2`, // '}'
108	`0`, // '~' (unreserved)
109
110	`2` // BSKP
111	};
112
113	// mask tables, in negative polarity
114	// 0x00 if it belongs to this category
115	// 0xff if it doesn't
116
117	static const uchar reservedMask[`96`] = {
118	`0xff`, // space
119	`0xff`, // '!' (sub-delim)
120	`0x00`, // '"'
121	`0xff`, // '#' (gen-delim)
122	`0xff`, // '$' (gen-delim)
123	`0xff`, // '%' (percent)
124	`0xff`, // '&' (gen-delim)
125	`0xff`, // "'" (sub-delim)
126	`0xff`, // '(' (sub-delim)
127	`0xff`, // ')' (sub-delim)
128	`0xff`, // '' (sub-delim)*
129	`0xff`, // '+' (sub-delim)
130	`0xff`, // ',' (sub-delim)
131	`0xff`, // '-' (unreserved)
132	`0xff`, // '.' (unreserved)
133	`0xff`, // '/' (gen-delim)
134
135	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // '0' to '4' (unreserved)
136	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // '5' to '9' (unreserved)
137	`0xff`, // ':' (gen-delim)
138	`0xff`, // ';' (sub-delim)
139	`0x00`, // '<'
140	`0xff`, // '=' (sub-delim)
141	`0x00`, // '>'
142	`0xff`, // '?' (gen-delim)
143
144	`0xff`, // '@' (gen-delim)
145	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'A' to 'E' (unreserved)
146	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'F' to 'J' (unreserved)
147	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'K' to 'O' (unreserved)
148	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'P' to 'T' (unreserved)
149	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'U' to 'Z' (unreserved)
150	`0xff`, // '[' (gen-delim)
151	`0x00`, // '\'
152	`0xff`, // ']' (gen-delim)
153	`0x00`, // '^'
154	`0xff`, // '_' (unreserved)
155
156	`0x00`, // '`'
157	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'a' to 'e' (unreserved)
158	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'f' to 'j' (unreserved)
159	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'k' to 'o' (unreserved)
160	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'p' to 't' (unreserved)
161	`0xff`, `0xff`, `0xff`, `0xff`, `0xff`, `0xff`, // 'u' to 'z' (unreserved)
162	`0x00`, // '{'
163	`0x00`, // '\|'
164	`0x00`, // '}'
165	`0xff`, // '~' (unreserved)
166
167	`0xff` // BSKP
168	};
169
// Returns true if \a c is an ASCII hexadecimal digit (0-9, a-f or A-F).
static inline bool isHex(ushort c)
{
    return (c >= 'a' && c <= 'f') ||
            (c >= 'A' && c <= 'F') ||
            (c >= '0' && c <= '9');
}
176
// Returns true if the hex digit \a c needs no uppercasing, i.e. it is a
// decimal digit or one of A-F.
// Precondition: \a c is a hex digit; for anything else the result is
// meaningless, since only the "below 0x60" test is performed.
static inline bool isUpperHex(ushort c)
{
    return c < 0x60;
}
182
183	static inline ushort toUpperHex(ushort c)
184	{
185	return isUpperHex(c) ? c : c - `0x20`;
186	}
187
// Returns the numeric value (0-15) of the hex digit \a c.
// Precondition: \a c is a hex digit; no validation is done here.
static inline ushort decodeNibble(ushort c)
{
    return c >= 'a' ? c - 'a' + 0xA :
           c >= 'A' ? c - 'A' + 0xA : c - '0';
}
193
194	// if the sequence at input is 2HEXDIG, returns its decoding*
195	// returns -1 if it isn't.
196	// assumes that the range has been checked already
197	static inline ushort decodePercentEncoding(const ushort *input)
198	{
199	ushort c1 = input[`1`];
200	ushort c2 = input[`2`];
201	if (!isHex(c1) \|\| !isHex(c2))
202	return ushort(-`1`);
203	return decodeNibble(c1) << `4` \| decodeNibble(c2);
204	}
205
206	static inline ushort encodeNibble(ushort c)
207	{
208	return ushort(QtMiscUtils::toHexUpper(c));
209	}
210
211	static void ensureDetached(QString &result, ushort &output, const* ushort begin, const* ushort input, const* ushort *end,
212	int add = `0`)
213	{
214	if (!output) {
215	// now detach
216	// create enough space if the rest of the string needed to be percent-encoded
217	int charsProcessed = input - begin;
218	int charsRemaining = end - input;
219	int spaceNeeded = end - begin + `2` * charsRemaining + add;
220	int origSize = result.size();
221	result.resize(origSize + spaceNeeded);
222
223	// we know that resize() above detached, so we bypass the reference count check
224	output = const_cast<ushort >(reinterpret_cast<const* ushort *>(result.constData()))
225	+ origSize;
226
227	// copy the chars we've already processed
228	int i;
229	for (i = `0`; i < charsProcessed; ++i)
230	output[i] = begin[i];
231	output += i;
232	}
233	}
234
235	namespace {
236	struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii
237	{
238	// From RFC 3987:
239	// iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
240	//
241	// ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
242	// / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
243	// / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
244	// / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
245	// / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
246	// / %xD0000-DFFFD / %xE1000-EFFFD
247	//
248	// iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
249	//
250	// That RFC allows iprivate only as part of iquery, but we don't know here
251	// whether we're looking at a query or another part of an URI, so we accept
252	// them too. The definition above excludes U+FFF0 to U+FFFD from appearing
253	// unencoded, but we see no reason for its exclusion, so we allow them to
254	// be decoded (and we need U+FFFD the replacement character to indicate
255	// failure to decode).
256	//
257	// That means we must disallow:
258	// unpaired surrogates (QUtf8Functions takes care of that for us)*
259	// non-characters*
260	static const bool allowNonCharacters = false;
261
262	// override: our "bytes" are three percent-encoded UTF-16 characters
263	static void appendByte(ushort *&ptr, uchar b)
264	{
265	// b >= 0x80, by construction, so percent-encode
266	*ptr++ = `'%'`;
267	*ptr++ = encodeNibble(b >> `4`);
268	*ptr++ = encodeNibble(b & `0xf`);
269	}
270
271	static uchar peekByte(const ushort *ptr, qsizetype n = `0`)
272	{
273	// decodePercentEncoding returns ushort(-1) if it can't decode,
274	// which means we return 0xff, which is not a valid continuation byte.
275	// If ptr[i 3] is not '%', we'll multiply by zero and return 0,*
276	// also not a valid continuation byte (if it's '%', we multiply by 1).
277	return uchar(decodePercentEncoding(ptr + n * `3`))
278	* uchar(ptr[n * `3`] == `'%'`);
279	}
280
281	static qptrdiff availableBytes(const ushort ptr, const* ushort *end)
282	{
283	return (end - ptr) / `3`;
284	}
285
286	static void advanceByte(const ushort &ptr, int* n = `1`)
287	{
288	ptr += n * `3`;
289	}
290	};
291	}
292
293	// returns true if we performed an UTF-8 decoding
294	static bool encodedUtf8ToUtf16(QString &result, ushort &output, const* ushort begin, const* ushort *&input,
295	const ushort *end, ushort decoded)
296	{
297	uint ucs4 = `0`, *dst = &ucs4;
298	const ushort src = input + `3`;// skip the %XX that yielded \a decoded*
299	int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(decoded, dst, src, end);
300	if (charsNeeded < `0`)
301	return false;
302
303	if (!QChar::requiresSurrogates(ucs4)) {
304	// UTF-8 decoded and no surrogates are required
305	// detach if necessary
306	// possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char
307	ensureDetached(result, output, begin, input, end, -`3` * charsNeeded + `1`);
308	*output++ = ucs4;
309	} else {
310	// UTF-8 decoded to something that requires a surrogate pair
311	// compressing from %XX%XX%XX%XX (12 chars) to two
312	ensureDetached(result, output, begin, input, end, -`10`);
313	*output++ = QChar::highSurrogate(ucs4);
314	*output++ = QChar::lowSurrogate(ucs4);
315	}
316
317	input = src - `1`;
318	return true;
319	}
320
321	static void unicodeToEncodedUtf8(QString &result, ushort &output, const* ushort *begin,
322	const ushort &input, const* ushort *end, ushort decoded)
323	{
324	// calculate the utf8 length and ensure enough space is available
325	int utf8len = QChar::isHighSurrogate(decoded) ? `4` : decoded >= `0x800` ? `3` : `2`;
326
327	// detach
328	if (!output) {
329	// we need 3 utf8len for the encoded UTF-8 sequence*
330	// but ensureDetached already adds 3 for the char we're processing
331	ensureDetached(result, output, begin, input, end, `3`*utf8len - `3`);
332	} else {
333	// verify that there's enough space or expand
334	int charsRemaining = end - input - `1`; // not including this one
335	int pos = output - reinterpret_cast<const ushort *>(result.constData());
336	int spaceRemaining = result.size() - pos;
337	if (spaceRemaining < `3`charsRemaining + `3`utf8len) {
338	// must resize
339	result.resize(result.size() + `3`*utf8len);
340
341	// we know that resize() above detached, so we bypass the reference count check
342	output = const_cast<ushort >(reinterpret_cast<const* ushort *>(result.constData()));
343	output += pos;
344	}
345	}
346
347	++input;
348	int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(decoded, output, input, end);
349	--input;
350	if (res < `0`) {
351	// bad surrogate pair sequence
352	// we will encode bad UTF-16 to UTF-8
353	// but they don't get decoded back
354
355	// first of three bytes
356	uchar c = `0xe0` \| uchar(decoded >> `12`);
357	*output++ = `'%'`;
358	*output++ = `'E'`;
359	*output++ = encodeNibble(c & `0xf`);
360
361	// second byte
362	c = `0x80` \| (uchar(decoded >> `6`) & `0x3f`);
363	*output++ = `'%'`;
364	*output++ = encodeNibble(c >> `4`);
365	*output++ = encodeNibble(c & `0xf`);
366
367	// third byte
368	c = `0x80` \| (decoded & `0x3f`);
369	*output++ = `'%'`;
370	*output++ = encodeNibble(c >> `4`);
371	*output++ = encodeNibble(c & `0xf`);
372	}
373	}
374
375	static int recode(QString &result, const ushort begin, const* ushort *end, QUrl::ComponentFormattingOptions encoding,
376	const uchar actionTable, bool* retryBadEncoding)
377	{
378	const int origSize = result.size();
379	const ushort *input = begin;
380	ushort output = nullptr*;
381
382	EncodingAction action = EncodeCharacter;
383	for ( ; input != end; ++input) {
384	ushort c;
385	// try a run where no change is necessary
386	for ( ; input != end; ++input) {
387	c = *input;
388	if (c < `0x20U`)
389	action = EncodeCharacter;
390	if (c < `0x20U` \|\| c >= `0x80U`) // also: (c - 0x20 < 0x60U)
391	goto non_trivial;
392	action = EncodingAction(actionTable[c - `' '`]);
393	if (action == EncodeCharacter)
394	goto non_trivial;
395	if (output)
396	*output++ = c;
397	}
398	break;
399
400	non_trivial:
401	uint decoded;
402	if (c == `'%'` && retryBadEncoding) {
403	// always write "%25"
404	ensureDetached(result, output, begin, input, end);
405	*output++ = `'%'`;
406	*output++ = `'2'`;
407	*output++ = `'5'`;
408	continue;
409	} else if (c == `'%'`) {
410	// check if the input is valid
411	if (input + `2` >= end \|\| (decoded = decodePercentEncoding(input)) == ushort(-`1`)) {
412	// not valid, retry
413	result.resize(origSize);
414	return recode(result, begin, end, encoding, actionTable, true);
415	}
416
417	if (decoded >= `0x80`) {
418	// decode the UTF-8 sequence
419	if (!(encoding & QUrl::EncodeUnicode) &&
420	encodedUtf8ToUtf16(result, output, begin, input, end, decoded))
421	continue;
422
423	// decoding the encoded UTF-8 failed
424	action = LeaveCharacter;
425	} else if (decoded >= `0x20`) {
426	action = EncodingAction(actionTable[decoded - `' '`]);
427	}
428	} else {
429	decoded = c;
430	if (decoded >= `0x80` && encoding & QUrl::EncodeUnicode) {
431	// encode the UTF-8 sequence
432	unicodeToEncodedUtf8(result, output, begin, input, end, decoded);
433	continue;
434	} else if (decoded >= `0x80`) {
435	if (output)
436	*output++ = c;
437	continue;
438	}
439	}
440
441	// there are six possibilities:
442	// current \ action \| DecodeCharacter \| LeaveCharacter \| EncodeCharacter
443	// decoded \| 1:leave \| 2:leave \| 3:encode
444	// encoded \| 4:decode \| 5:leave \| 6:leave
445	// cases 1 and 2 were handled before this section
446
447	if (c == `'%'` && action != DecodeCharacter) {
448	// cases 5 and 6: it's encoded and we're leaving it as it is
449	// except we're pedantic and we'll uppercase the hex
450	if (output \|\| !isUpperHex(input[`1`]) \|\| !isUpperHex(input[`2`])) {
451	ensureDetached(result, output, begin, input, end);
452	*output++ = `'%'`;
453	output++ = toUpperHex(++input);
454	output++ = toUpperHex(++input);
455	}
456	} else if (c == `'%'` && action == DecodeCharacter) {
457	// case 4: we need to decode
458	ensureDetached(result, output, begin, input, end);
459	*output++ = decoded;
460	input += `2`;
461	} else {
462	// must be case 3: we need to encode
463	ensureDetached(result, output, begin, input, end);
464	*output++ = `'%'`;
465	*output++ = encodeNibble(c >> `4`);
466	*output++ = encodeNibble(c & `0xf`);
467	}
468	}
469
470	if (output) {
471	int len = output - reinterpret_cast<const ushort *>(result.constData());
472	result.truncate(len);
473	return len - origSize;
474	}
475	return `0`;
476	}
477
478	/*
479	* Returns true if the input it checked (if it checked anything) is not
480	* encoded. A return of false indicates there's a percent at \a input that
481	* needs to be decoded.
482	*/
483	#ifdef __SSE2__
484	static bool simdCheckNonEncoded(QChar &output, const* char16_t &input, const* char16_t *end)
485	{
486	# ifdef __AVX2__
487	const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(`'%'`));
488	const __m128i percents = _mm256_castsi256_si128(percents256);
489	# else
490	const __m128i percents = _mm_set1_epi16(`'%'`);
491	# endif
492
493	uint idx = `0`;
494	quint32 mask = `0`;
495	if (input + `16` <= end) {
496	qptrdiff offset = `0`;
497	for ( ; input + offset + `16` <= end; offset += `16`) {
498	# ifdef __AVX2__
499	// do 32 bytes at a time using AVX2
500	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset));
501	__m256i comparison = _mm256_cmpeq_epi16(data, percents256);
502	mask = _mm256_movemask_epi8(comparison);
503	_mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data);
504	# else
505	// do 32 bytes at a time using unrolled SSE2
506	__m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + offset));
507	__m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + offset + `8`));
508	__m128i comparison1 = _mm_cmpeq_epi16(data1, percents);
509	__m128i comparison2 = _mm_cmpeq_epi16(data2, percents);
510	uint mask1 = _mm_movemask_epi8(comparison1);
511	uint mask2 = _mm_movemask_epi8(comparison2);
512
513	_mm_storeu_si128(reinterpret_cast<__m128i *>(output + offset), data1);
514	if (!mask1)
515	_mm_storeu_si128(reinterpret_cast<__m128i *>(output + offset + `8`), data2);
516	mask = mask1 \| (mask2 << `16`);
517	# endif
518
519	if (mask) {
520	idx = qCountTrailingZeroBits(mask) / `2`;
521	break;
522	}
523	}
524
525	input += offset;
526	if (output)
527	output += offset;
528	} else if (input + `8` <= end) {
529	// do 16 bytes at a time
530	__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input));
531	__m128i comparison = _mm_cmpeq_epi16(data, percents);
532	mask = _mm_movemask_epi8(comparison);
533	_mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
534	idx = qCountTrailingZeroBits(quint16(mask)) / `2`;
535	} else if (input + `4` <= end) {
536	// do 8 bytes only
537	__m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(input));
538	__m128i comparison = _mm_cmpeq_epi16(data, percents);
539	mask = _mm_movemask_epi8(comparison) & `0xffu`;
540	_mm_storel_epi64(reinterpret_cast<__m128i *>(output), data);
541	idx = qCountTrailingZeroBits(quint8(mask)) / `2`;
542	} else {
543	// no percents found (because we didn't check)
544	return true;
545	}
546
547	// advance to the next non-encoded
548	input += idx;
549	output += idx;
550
551	return !mask;
552	}
553	#else
554	static bool simdCheckNonEncoded(...)
555	{
556	return true;
557	}
558	#endif
559
560	/!*
561	\since 5.0
562	\internal
563
564	This function decodes a percent-encoded string located in \a in
565	by appending each character to \a appendTo. It returns the number of
566	characters appended. Each percent-encoded sequence is decoded as follows:
567
568	\list
569	\li from %00 to %7F: the exact decoded value is appended;
570	\li from %80 to %FF: QChar::ReplacementCharacter is appended;
571	\li bad encoding: original input is copied to the output, undecoded.
572	\endlist
573
574	Given the above, it's important for the input to already have all UTF-8
575	percent sequences decoded by qt_urlRecode (that is, the input should not
576	have been processed with QUrl::EncodeUnicode).
577
578	The input should also be a valid percent-encoded sequence (the output of
579	qt_urlRecode is always valid).
580	*/
581	static qsizetype decode(QString &appendTo, QStringView in)
582	{
583	const char16_t *begin = in.utf16();
584	const char16_t *end = begin + in.size();
585
586	// fast check whether there's anything to be decoded in the first place
587	const char16_t *input = QtPrivate::qustrchr(in, `'%'`);
588
589	if (Q_LIKELY(input == end))
590	return `0`; // nothing to do, it was already decoded!
591
592	// detach
593	const int origSize = appendTo.size();
594	appendTo.resize(origSize + (end - begin));
595	QChar *output = appendTo.data() + origSize;
596	memcpy(static_cast<void >(output), static_cast<const* void >(begin), (input - begin) sizeof(QChar));
597	output += input - begin;
598
599	while (input != end) {
600	// something was encoded
601	Q_ASSERT(*input == `'%'`);
602
603	if (Q_UNLIKELY(end - input < `3` \|\| !isHex(input[`1`]) \|\| !isHex(input[`2`]))) {
604	// badly-encoded data
605	appendTo.resize(origSize + (end - begin));
606	memcpy(static_cast<void >(appendTo.begin() + origSize), static_cast<const* void >(begin), (end - begin) sizeof(ushort));
607	return end - begin;
608	}
609
610	++input;
611	*output++ = QChar::fromUcs2(decodeNibble(input[`0`]) << `4` \| decodeNibble(input[`1`]));
612	if (output[-`1`].unicode() >= `0x80`)
613	output[-`1`] = QChar::ReplacementCharacter;
614	input += `2`;
615
616	// search for the next percent, copying from input to output
617	if (simdCheckNonEncoded(output, input, end)) {
618	while (input != end) {
619	const char16_t uc = *input;
620	if (uc == `'%'`)
621	break;
622	*output++ = uc;
623	++input;
624	}
625	}
626	}
627
628	const qsizetype len = output - appendTo.begin();
629	appendTo.truncate(len);
630	return len - origSize;
631	}
632
633	template <size_t N>
634	static void maskTable(uchar (&table)[N], const uchar (&mask)[N])
635	{
636	for (size_t i = `0`; i < N; ++i)
637	table[i] &= mask[i];
638	}
639
640	/!*
641	\internal
642
643	Recodes the string from \a begin to \a end. If any transformations are
644	done, append them to \a appendTo and return the number of characters added.
645	If no transformations were required, return 0.
646
647	The \a encoding option modifies the default behaviour:
648	\list
649	\li QUrl::DecodeReserved: if set, reserved characters will be decoded;
650	if unset, reserved characters will be encoded
651	\li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " "
652	\li QUrl::EncodeUnicode: if set, characters above U+0080 will be encoded to their UTF-8
653	percent-encoded form; if unset, they will be decoded to UTF-16
654	\li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences,
655	including that of the percent character. The resulting string
656	will not be percent-encoded anymore. Use with caution!
657	In this mode, the behaviour is undefined if the input string
658	contains any percent-encoding sequences above %80.
659	Also, the function will not correct bad % sequences.
660	\endlist
661
662	Other flags are ignored (including QUrl::EncodeReserved).
663
664	The \a tableModifications argument can be used to supply extra
665	modifications to the tables, to be applied after the flags above are
666	handled. It consists of a sequence of 16-bit values, where the low 8 bits
667	indicate the character in question and the high 8 bits are either \c
668	EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter.
669
670	This function corrects percent-encoded errors by interpreting every '%' as
671	meaning "%25" (all percents in the same content).
672	*/
673
674	Q_AUTOTEST_EXPORT qsizetype
675	qt_urlRecode(QString &appendTo, QStringView in,
676	QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications)
677	{
678	uchar actionTable[sizeof defaultActionTable];
679	if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) {
680	return int(decode(appendTo, in));
681	}
682
683	memcpy(actionTable, defaultActionTable, sizeof actionTable);
684	if (encoding & QUrl::DecodeReserved)
685	maskTable(actionTable, reservedMask);
686	if (!(encoding & QUrl::EncodeSpaces))
687	actionTable[`0`] = DecodeCharacter; // decode
688
689	if (tableModifications) {
690	for (const ushort p = tableModifications; p; ++p)
691	actionTable[uchar(p) - `' '`] = p >> `8`;
692	}
693
694	return recode(appendTo, reinterpret_cast<const ushort >(in.begin()), reinterpret_cast<const* ushort *>(in.end()),
695	encoding, actionTable, false);
696	}
697
698	QT_END_NAMESPACE
699

Browse the source code of Qt/src/corelib/io/qurlrecode.cpp