qstringconverter.cpp source code [Qt/src/corelib/text/qstringconverter.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2020 The Qt Company Ltd.
4	** Copyright (C) 2020 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#include <qstringconverter.h>
42	#include <private/qstringconverter_p.h>
43	#include "qendian.h"
44
45	#include "private/qsimd_p.h"
46	#include "private/qstringiterator_p.h"
47	#include "qbytearraymatcher.h"
48
49	#ifdef Q_OS_WIN
50	#include <qt_windows.h>
51	#endif
52
53	#if __has_include(<bit>) && __cplusplus > 201703L
54	#include <bit>
55	#endif
56
57	QT_BEGIN_NAMESPACE
58
59	enum { Endian = `0`, Data = `1` };
60
61	static const uchar utf8bom[] = { `0xef`, `0xbb`, `0xbf` };
62
#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
    || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
/*
    Returns the index, counted from the least significant bit, of the highest
    bit set in \a v (the same value as std::bit_width(v) - 1).
*/
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
{
#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
    return std::bit_width(v) - 1;
#else
    // The leading-zero count runs from the most significant bit downwards;
    // XOR-ing it with the top bit index turns it into an index that counts
    // up from the least significant bit.
    uint msbIndex = qCountLeadingZeroBits(v);
    msbIndex ^= sizeof(unsigned) * 8 - 1;
    return msbIndex;
#endif
}
#endif
79
80	#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
81	static inline bool simdEncodeAscii(uchar &dst, const* ushort &nextAscii, const* ushort &src, const* ushort *end)
82	{
83	// do sixteen characters at a time
84	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
85	# ifdef __AVX2__
86	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
87	__m128i data1 = _mm256_castsi256_si128(data);
88	__m128i data2 = _mm256_extracti128_si256(data, `1`);
89	# else
90	__m128i data1 = _mm_loadu_si128((const __m128i*)src);
91	__m128i data2 = _mm_loadu_si128(`1`+(const __m128i*)src);
92	# endif
93
94	// check if everything is ASCII
95	// the highest ASCII value is U+007F
96	// Do the packing directly:
97	// The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
98	// with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
99	// while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
100	// we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
101	// "non-ASCII", but it's an acceptable compromise.
102	__m128i packed = _mm_packus_epi16(data1, data2);
103	__m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
104
105	// store, even if there are non-ASCII characters here
106	_mm_storeu_si128((__m128i*)dst, packed);
107
108	// n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
109	ushort n = ~_mm_movemask_epi8(nonAscii);
110	if (n) {
111	// find the next probable ASCII character
112	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
113	// characters still coming
114	nextAscii = src + qBitScanReverse(n) + `1`;
115
116	n = qCountTrailingZeroBits(n);
117	dst += n;
118	src += n;
119	return false;
120	}
121	}
122
123	if (end - src >= `8`) {
124	// do eight characters at a time
125	__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
126	__m128i packed = _mm_packus_epi16(data, data);
127	__m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
128
129	// store even non-ASCII
130	_mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
131
132	uchar n = ~_mm_movemask_epi8(nonAscii);
133	if (n) {
134	nextAscii = src + qBitScanReverse(n) + `1`;
135	n = qCountTrailingZeroBits(n);
136	dst += n;
137	src += n;
138	return false;
139	}
140	}
141
142	return src == end;
143	}
144
145	static inline bool simdDecodeAscii(ushort &dst, const* uchar &nextAscii, const* uchar &src, const* uchar *end)
146	{
147	// do sixteen characters at a time
148	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
149	__m128i data = _mm_loadu_si128((const __m128i*)src);
150
151	#ifdef __AVX2__
152	const int BitSpacing = `2`;
153	// load and zero extend to an YMM register
154	const __m256i extended = _mm256_cvtepu8_epi16(data);
155
156	uint n = _mm256_movemask_epi8(extended);
157	if (!n) {
158	// store
159	_mm256_storeu_si256((__m256i*)dst, extended);
160	continue;
161	}
162	#else
163	const int BitSpacing = `1`;
164
165	// check if everything is ASCII
166	// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
167	uint n = _mm_movemask_epi8(data);
168	if (!n) {
169	// unpack
170	_mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
171	_mm_storeu_si128(`1`+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
172	continue;
173	}
174	#endif
175
176	// copy the front part that is still ASCII
177	while (!(n & `1`)) {
178	dst++ = src++;
179	n >>= BitSpacing;
180	}
181
182	// find the next probable ASCII character
183	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
184	// characters still coming
185	n = qBitScanReverse(n);
186	nextAscii = src + (n / BitSpacing) + `1`;
187	return false;
188
189	}
190
191	if (end - src >= `8`) {
192	__m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
193	uint n = _mm_movemask_epi8(data) & `0xff`;
194	if (!n) {
195	// unpack and store
196	_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
197	} else {
198	while (!(n & `1`)) {
199	dst++ = src++;
200	n >>= `1`;
201	}
202
203	n = qBitScanReverse(n);
204	nextAscii = src + n + `1`;
205	return false;
206	}
207	}
208
209	return src == end;
210	}
211
212	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
213	{
214	#ifdef __AVX2__
215	// do 32 characters at a time
216	// (this is similar to simdTestMask in qstring.cpp)
217	const __m256i mask = _mm256_set1_epi8(`0x80`);
218	for ( ; end - src >= `32`; src += `32`) {
219	__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
220	if (_mm256_testz_si256(mask, data))
221	continue;
222
223	uint n = _mm256_movemask_epi8(data);
224	Q_ASSUME(n);
225
226	// find the next probable ASCII character
227	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
228	// characters still coming
229	nextAscii = src + qBitScanReverse(n) + `1`;
230
231	// return the non-ASCII character
232	return src + qCountTrailingZeroBits(n);
233	}
234	#endif
235
236	// do sixteen characters at a time
237	for ( ; end - src >= `16`; src += `16`) {
238	__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
239
240	// check if everything is ASCII
241	// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
242	uint n = _mm_movemask_epi8(data);
243	if (!n)
244	continue;
245
246	// find the next probable ASCII character
247	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
248	// characters still coming
249	nextAscii = src + qBitScanReverse(n) + `1`;
250
251	// return the non-ASCII character
252	return src + qCountTrailingZeroBits(n);
253	}
254
255	// do four characters at a time
256	for ( ; end - src >= `4`; src += `4`) {
257	quint32 data = qFromUnaligned<quint32>(src);
258	data &= `0x80808080U`;
259	if (!data)
260	continue;
261
262	// We don't try to guess which of the three bytes is ASCII and which
263	// one isn't. The chance that at least two of them are non-ASCII is
264	// better than 75%.
265	nextAscii = src;
266	return src;
267	}
268	nextAscii = end;
269	return src;
270	}
271
272	// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
273	// and advance src8 and src16 to the first character that could not be compared
274	static void simdCompareAscii(const char8_t &src8, const* char8_t end8, const* char16_t &src16, const* char16_t *end16)
275	{
276	int bitSpacing = `1`;
277	qptrdiff len = qMin(end8 - src8, end16 - src16);
278	qptrdiff offset = `0`;
279	uint mask = `0`;
280
281	// do sixteen characters at a time
282	for ( ; offset + `16` < len; offset += `16`) {
283	__m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
284	#ifdef __AVX2__
285	// AVX2 version, use 256-bit registers and VPMOVXZBW
286	__m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
287
288	// expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
289	__m256i datax8 = _mm256_cvtepu8_epi16(data8);
290	mask = _mm256_movemask_epi8(datax8);
291	if (mask)
292	break;
293
294	// compare Latin1 to UTF-16
295	__m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
296	mask = ~_mm256_movemask_epi8(latin1cmp);
297	if (mask)
298	break;
299	#else
300	// non-AVX2 code
301	__m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
302	__m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + `1`);
303
304	// expand US-ASCII as if it were Latin1, we'll confirm later
305	__m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
306	__m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
307
308	// compare Latin1 to UTF-16
309	__m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
310	__m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
311	mask = _mm_movemask_epi8(latin1cmphi) << `16`;
312	mask \|= ushort(_mm_movemask_epi8(latin1cmplo));
313	mask = ~mask;
314	if (mask)
315	break;
316
317	// confirm it was US-ASCII
318	mask = _mm_movemask_epi8(data8);
319	if (mask) {
320	bitSpacing = `0`;
321	break;
322	}
323	#endif
324	}
325
326	// helper for comparing 4 or 8 characters
327	auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
328	// n = 4 -> sizemask = 0xff
329	// n = 8 -> sizemask = 0xffff
330	unsigned sizemask = (`1U` << (`2` * n)) - `1`;
331
332	// expand as if Latin1
333	data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
334
335	// compare and confirm it's US-ASCII
336	__m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
337	mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
338	mask \|= _mm_movemask_epi8(data8);
339	if (mask == `0`)
340	offset += n;
341	};
342
343	// do eight characters at a time
344	if (mask == `0` && offset + `8` < len) {
345	__m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
346	__m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
347	cmp_lt_16(`8`, data8, data16);
348	}
349
350	// do four characters
351	if (mask == `0` && offset + `4` < len) {
352	__m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
353	__m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
354	cmp_lt_16(`4`, data8, data16);
355	}
356
357	// correct the source pointers to point to the first character we couldn't deal with
358	if (mask)
359	offset += qCountTrailingZeroBits(mask) >> bitSpacing;
360	src8 += offset;
361	src16 += offset;
362	}
363	#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
364	static inline bool simdEncodeAscii(uchar &dst, const* ushort &nextAscii, const* ushort &src, const* ushort *end)
365	{
366	uint16x8_t maxAscii = vdupq_n_u16(`0x7f`);
367	uint16x8_t mask1 = { `1`, `1` << `2`, `1` << `4`, `1` << `6`, `1` << `8`, `1` << `10`, `1` << `12`, `1` << `14` };
368	uint16x8_t mask2 = vshlq_n_u16(mask1, `1`);
369
370	// do sixteen characters at a time
371	for ( ; end - src >= `16`; src += `16`, dst += `16`) {
372	// load 2 lanes (or: "load interleaved")
373	uint16x8x2_t in = vld2q_u16(src);
374
375	// check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
376	// add those together into a scalar, and merge the scalars.
377	uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[`0`], maxAscii), mask1))
378	\| vaddvq_u16(vandq_u16(vcgtq_u16(in.val[`1`], maxAscii), mask2));
379
380	// merge the two lanes by shifting the values of the second by 8 and inserting them
381	uint16x8_t out = vsliq_n_u16(in.val[`0`], in.val[`1`], `8`);
382
383	// store, even if there are non-ASCII characters here
384	vst1q_u8(dst, vreinterpretq_u8_u16(out));
385
386	if (nonAscii) {
387	// find the next probable ASCII character
388	// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
389	// characters still coming
390	nextAscii = src + qBitScanReverse(nonAscii) + `1`;
391
392	nonAscii = qCountTrailingZeroBits(nonAscii);
393	dst += nonAscii;
394	src += nonAscii;
395	return false;
396	}
397	}
398	return src == end;
399	}
400
401	static inline bool simdDecodeAscii(ushort &dst, const* uchar &nextAscii, const* uchar &src, const* uchar *end)
402	{
403	// do eight characters at a time
404	uint8x8_t msb_mask = vdup_n_u8(`0x80`);
405	uint8x8_t add_mask = { `1`, `1` << `1`, `1` << `2`, `1` << `3`, `1` << `4`, `1` << `5`, `1` << `6`, `1` << `7` };
406	for ( ; end - src >= `8`; src += `8`, dst += `8`) {
407	uint8x8_t c = vld1_u8(src);
408	uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
409	if (!n) {
410	// store
411	vst1q_u16(dst, vmovl_u8(c));
412	continue;
413	}
414
415	// copy the front part that is still ASCII
416	while (!(n & `1`)) {
417	dst++ = src++;
418	n >>= `1`;
419	}
420
421	// find the next probable ASCII character
422	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
423	// characters still coming
424	n = qBitScanReverse(n);
425	nextAscii = src + n + `1`;
426	return false;
427
428	}
429	return src == end;
430	}
431
432	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
433	{
434	// The SIMD code below is untested, so just force an early return until
435	// we've had the time to verify it works.
436	nextAscii = end;
437	return src;
438
439	// do eight characters at a time
440	uint8x8_t msb_mask = vdup_n_u8(`0x80`);
441	uint8x8_t add_mask = { `1`, `1` << `1`, `1` << `2`, `1` << `3`, `1` << `4`, `1` << `5`, `1` << `6`, `1` << `7` };
442	for ( ; end - src >= `8`; src += `8`) {
443	uint8x8_t c = vld1_u8(src);
444	uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
445	if (!n)
446	continue;
447
448	// find the next probable ASCII character
449	// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
450	// characters still coming
451	nextAscii = src + qBitScanReverse(n) + `1`;
452
453	// return the non-ASCII character
454	return src + qCountTrailingZeroBits(n);
455	}
456	nextAscii = end;
457	return src;
458	}
459
460	static void simdCompareAscii(const char8_t &, const* char8_t , const* char16_t &, const* char16_t *)
461	{
462	}
463	#else
464	static inline bool simdEncodeAscii(uchar , const* ushort , const* ushort , const* ushort *)
465	{
466	return false;
467	}
468
469	static inline bool simdDecodeAscii(ushort , const* uchar , const* uchar , const* uchar *)
470	{
471	return false;
472	}
473
474	static inline const uchar simdFindNonAscii(const* uchar src, const* uchar end, const* uchar *&nextAscii)
475	{
476	nextAscii = end;
477	return src;
478	}
479
480	static void simdCompareAscii(const char8_t &, const* char8_t , const* char16_t &, const* char16_t *)
481	{
482	}
483	#endif
484
// Bit in QStringConverter::State::internalState that the converters in this
// file set once their BOM/header handling has run.
enum {
    HeaderDone = 1
};
486
487	QByteArray QUtf8::convertFromUnicode(QStringView in)
488	{
489	qsizetype len = in.size();
490
491	// create a QByteArray with the worst case scenario size
492	QByteArray result(len * `3`, Qt::Uninitialized);
493	uchar dst = reinterpret_cast<uchar >(const_cast<char *>(result.constData()));
494	const ushort src = reinterpret_cast<const* ushort *>(in.data());
495	const ushort *const end = src + len;
496
497	while (src != end) {
498	const ushort *nextAscii = end;
499	if (simdEncodeAscii(dst, nextAscii, src, end))
500	break;
501
502	do {
503	ushort u = *src++;
504	int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
505	if (res < `0`) {
506	// encoding error - append '?'
507	*dst++ = `'?'`;
508	}
509	} while (src < nextAscii);
510	}
511
512	result.truncate(dst - reinterpret_cast<uchar >(const_cast<char* *>(result.constData())));
513	return result;
514	}
515
516	QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State *state)
517	{
518	QByteArray ba(`3`*in.size() +`3`, Qt::Uninitialized);
519	char *end = convertFromUnicode(ba.data(), in, state);
520	ba.truncate(end - ba.data());
521	return ba;
522	}
523
524	char QUtf8::convertFromUnicode(char* out, QStringView in, QStringConverter::State state)
525	{
526	Q_ASSERT(state);
527	const QChar *uc = in.data();
528	qsizetype len = in.length();
529	if (!len)
530	return out;
531
532	auto appendReplacementChar = [state](uchar cursor) -> uchar {
533	if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
534	*cursor++ = `0`;
535	} else {
536	// QChar::replacement encoded in utf8
537	*cursor++ = `0xef`;
538	*cursor++ = `0xbf`;
539	*cursor++ = `0xbd`;
540	}
541	return cursor;
542	};
543
544	uchar cursor = reinterpret_cast<uchar >(out);
545	const ushort src = reinterpret_cast<const* ushort *>(uc);
546	const ushort *const end = src + len;
547
548	if (!(state->flags & QStringDecoder::Flag::Stateless)) {
549	if (state->remainingChars) {
550	int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[`0`], cursor, src, end);
551	if (res < `0`)
552	cursor = appendReplacementChar (cursor);
553	state->state_data[`0`] = `0`;
554	state->remainingChars = `0`;
555	} else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
556	// append UTF-8 BOM
557	*cursor++ = utf8bom[`0`];
558	*cursor++ = utf8bom[`1`];
559	*cursor++ = utf8bom[`2`];
560	state->internalState \|= HeaderDone;
561	}
562	}
563
564	while (src != end) {
565	const ushort *nextAscii = end;
566	if (simdEncodeAscii(cursor, nextAscii, src, end))
567	break;
568
569	do {
570	ushort uc = *src++;
571	int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
572	if (Q_LIKELY(res >= `0`))
573	continue;
574
575	if (res == QUtf8BaseTraits::Error) {
576	// encoding error
577	++state->invalidChars;
578	cursor = appendReplacementChar (cursor);
579	} else if (res == QUtf8BaseTraits::EndOfString) {
580	if (state->flags & QStringConverter::Flag::Stateless) {
581	++state->invalidChars;
582	cursor = appendReplacementChar (cursor);
583	} else {
584	state->remainingChars = `1`;
585	state->state_data[`0`] = uc;
586	}
587	return reinterpret_cast<char *>(cursor);
588	}
589	} while (src < nextAscii);
590	}
591
592	return reinterpret_cast<char *>(cursor);
593	}
594
595	QString QUtf8::convertToUnicode(QByteArrayView in)
596	{
597	// UTF-8 to UTF-16 always needs the exact same number of words or less:
598	// UTF-8 UTF-16
599	// 1 byte 1 word
600	// 2 bytes 1 word
601	// 3 bytes 1 word
602	// 4 bytes 2 words (one surrogate pair)
603	// That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
604	// half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
605	// non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
606	//
607	// The table holds for invalid sequences too: we'll insert one replacement char
608	// per invalid byte.
609	QString result(in.size(), Qt::Uninitialized);
610	QChar data = const_cast<QChar>(result.constData()); // we know we're not shared
611	const QChar *end = convertToUnicode(data, in);
612	result.truncate(end - data);
613	return result;
614	}
615
616	/!*
617	\since 5.7
618	\overload
619
620	Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
621	QChar starting at \a buffer. The buffer is expected to be large enough
622	to hold the result. An upper bound for the size of the buffer is
623	\c in.size() QChars.
624
625	If, during decoding, an error occurs, a QChar::ReplacementCharacter is
626	written.
627
628	Returns a pointer to one past the last QChar written.
629
630	This function never throws.
631	*/
632
633	QChar QUtf8::convertToUnicode(QChar buffer, QByteArrayView in) noexcept
634	{
635	ushort dst = reinterpret_cast<ushort >(buffer);
636	const uchar *const start = reinterpret_cast<const uchar *>(in.data());
637	const uchar *src = start;
638	const uchar *end = src + in.size();
639
640	// attempt to do a full decoding in SIMD
641	const uchar *nextAscii = end;
642	if (!simdDecodeAscii(dst, nextAscii, src, end)) {
643	// at least one non-ASCII entry
644	// check if we failed to decode the UTF-8 BOM; if so, skip it
645	if (Q_UNLIKELY(src == start)
646	&& end - src >= `3`
647	&& Q_UNLIKELY(src[`0`] == utf8bom[`0`] && src[`1`] == utf8bom[`1`] && src[`2`] == utf8bom[`2`])) {
648	src += `3`;
649	}
650
651	while (src < end) {
652	nextAscii = end;
653	if (simdDecodeAscii(dst, nextAscii, src, end))
654	break;
655
656	do {
657	uchar b = *src++;
658	int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
659	if (res < `0`) {
660	// decoding error
661	*dst++ = QChar::ReplacementCharacter;
662	}
663	} while (src < nextAscii);
664	}
665	}
666
667	return reinterpret_cast<QChar *>(dst);
668	}
669
670	QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
671	{
672	// See above for buffer requirements for stateless decoding. However, that
673	// fails if the state is not empty. The following situations can add to the
674	// requirements:
675	// state contains chars starts with requirement
676	// 1 of 2 bytes valid continuation 0
677	// 2 of 3 bytes same 0
678	// 3 bytes of 4 same +1 (need to insert surrogate pair)
679	// 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
680	// 2 of 3 bytes same +1 (same)
681	// 3 of 4 bytes same +1 (same)
682	QString result(in.size() + `1`, Qt::Uninitialized);
683	QChar *end = convertToUnicode(result.data(), in, state);
684	result.truncate(end - result.constData());
685	return result;
686	}
687
688	QChar QUtf8::convertToUnicode(QChar out, QByteArrayView in, QStringConverter::State *state)
689	{
690	qsizetype len = in.size();
691
692	Q_ASSERT(state);
693	if (!len)
694	return out;
695
696
697	ushort replacement = QChar::ReplacementCharacter;
698	if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
699	replacement = QChar::Null;
700
701	int res;
702	uchar ch = `0`;
703
704	ushort dst = reinterpret_cast<ushort >(out);
705	const uchar src = reinterpret_cast<const* uchar *>(in.data());
706	const uchar *end = src + len;
707
708	if (!(state->flags & QStringConverter::Flag::Stateless)) {
709	bool headerdone = state->internalState & HeaderDone \|\| state->flags & QStringConverter::Flag::ConvertInitialBom;
710	if (state->remainingChars \|\| !headerdone) {
711	// handle incoming state first
712	uchar remainingCharsData[`4`]; // longest UTF-8 sequence possible
713	qsizetype remainingCharsCount = state->remainingChars;
714	qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
715
716	memset(remainingCharsData, `0`, sizeof(remainingCharsData));
717	memcpy(remainingCharsData, &state->state_data[`0`], remainingCharsCount);
718	memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
719
720	const uchar *begin = &remainingCharsData[`1`];
721	res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[`0`], dst, begin,
722	static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
723	if (res == QUtf8BaseTraits::Error) {
724	++state->invalidChars;
725	*dst++ = replacement;
726	++src;
727	} else if (res == QUtf8BaseTraits::EndOfString) {
728	// if we got EndOfString again, then there were too few bytes in src;
729	// copy to our state and return
730	state->remainingChars = remainingCharsCount + newCharsToCopy;
731	memcpy(&state->state_data[`0`], remainingCharsData, state->remainingChars);
732	return out;
733	} else if (!headerdone) {
734	// eat the UTF-8 BOM
735	if (dst[-`1`] == `0xfeff`)
736	--dst;
737	}
738	state->internalState \|= HeaderDone;
739
740	// adjust src now that we have maybe consumed a few chars
741	if (res >= `0`) {
742	Q_ASSERT(res > remainingCharsCount);
743	src += res - remainingCharsCount;
744	}
745	}
746	} else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
747	// stateless, remove initial BOM
748	if (len > `2` && src[`0`] == utf8bom[`0`] && src[`1`] == utf8bom[`1`] && src[`2`] == utf8bom[`2`])
749	// skip BOM
750	src += `3`;
751	}
752
753	// main body, stateless decoding
754	res = `0`;
755	const uchar *nextAscii = src;
756	while (res >= `0` && src < end) {
757	if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
758	break;
759
760	ch = *src++;
761	res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
762	if (res == QUtf8BaseTraits::Error) {
763	res = `0`;
764	++state->invalidChars;
765	*dst++ = replacement;
766	}
767	}
768
769	if (res == QUtf8BaseTraits::EndOfString) {
770	// unterminated UTF sequence
771	if (state->flags & QStringConverter::Flag::Stateless) {
772	*dst++ = QChar::ReplacementCharacter;
773	++state->invalidChars;
774	while (src++ < end) {
775	*dst++ = QChar::ReplacementCharacter;
776	++state->invalidChars;
777	}
778	state->remainingChars = `0`;
779	} else {
780	--src; // unread the byte in ch
781	state->remainingChars = end - src;
782	memcpy(&state->state_data[`0`], src, end - src);
783	}
784	} else {
785	state->remainingChars = `0`;
786	}
787
788	return reinterpret_cast<QChar *>(dst);
789	}
790
791	struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
792	{
793	struct NoOutput {};
794	static void appendUtf16(const NoOutput &, ushort) {}
795	static void appendUcs4(const NoOutput &, uint) {}
796	};
797
798	QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
799	{
800	const uchar src = reinterpret_cast<const* uchar *>(in.data());
801	const uchar *end = src + in.size();
802	const uchar *nextAscii = src;
803	bool isValidAscii = true;
804
805	while (src < end) {
806	if (src >= nextAscii)
807	src = simdFindNonAscii(src, end, nextAscii);
808	if (src == end)
809	break;
810
811	do {
812	uchar b = *src++;
813	if ((b & `0x80`) == `0`)
814	continue;
815
816	isValidAscii = false;
817	QUtf8NoOutputTraits::NoOutput output;
818	int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
819	if (res < `0`) {
820	// decoding error
821	return { false, false };
822	}
823	} while (src < nextAscii);
824	}
825
826	return { true, isValidAscii };
827	}
828
829	int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
830	{
831	auto src1 = reinterpret_cast<const char8_t *>(utf8.data());
832	auto end1 = src1 + utf8.size();
833	auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
834	auto end2 = src2 + utf16.size();
835
836	do {
837	simdCompareAscii(src1, end1, src2, end2);
838
839	if (src1 < end1 && src2 < end2) {
840	char32_t uc1 = *src1++;
841	char32_t uc2 = *src2++;
842
843	if (uc1 >= `0x80`) {
844	char32_t *output = &uc1;
845	int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
846	if (res < `0`) {
847	// decoding error
848	uc1 = QChar::ReplacementCharacter;
849	}
850
851	// Only decode the UTF-16 surrogate pair if the UTF-8 code point
852	// wasn't US-ASCII (a surrogate cannot match US-ASCII).
853	if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
854	uc2 = QChar::surrogateToUcs4(uc2, *src2++);
855	}
856
857	if (uc1 != uc2)
858	return int(uc1) - int(uc2);
859	}
860	} while (src1 < end1 && src2 < end2);
861
862	// the shorter string sorts first
863	return (end1 > src1) - int(end2 > src2);
864	}
865
866	int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1String s)
867	{
868	uint uc1 = QChar::Null;
869	auto src1 = reinterpret_cast<const uchar *>(utf8.data());
870	auto end1 = src1 + utf8.size();
871	auto src2 = reinterpret_cast<const uchar *>(s.latin1());
872	auto end2 = src2 + s.size();
873
874	while (src1 < end1 && src2 < end2) {
875	uchar b = *src1++;
876	uint *output = &uc1;
877	int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
878	if (res < `0`) {
879	// decoding error
880	uc1 = QChar::ReplacementCharacter;
881	}
882
883	uint uc2 = *src2++;
884	if (uc1 != uc2)
885	return int(uc1) - int(uc2);
886	}
887
888	// the shorter string sorts first
889	return (end1 > src1) - (end2 > src2);
890	}
891
892	QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
893	{
894	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
895	qsizetype length = `2` * in.size();
896	if (writeBom)
897	length += `2`;
898
899	QByteArray d(length, Qt::Uninitialized);
900	char *end = convertFromUnicode(d.data(), in, state, endian);
901	Q_ASSERT(end - d.constData() == d.length());
902	Q_UNUSED(end);
903	return d;
904	}
905
906	char QUtf16::convertFromUnicode(char* out, QStringView in, QStringConverter::State state, DataEndianness endian)
907	{
908	Q_ASSERT(state);
909	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
910
911	if (endian == DetectEndianness)
912	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
913
914	if (writeBom) {
915	QChar bom(QChar::ByteOrderMark);
916	if (endian == BigEndianness)
917	qToBigEndian(bom.unicode(), out);
918	else
919	qToLittleEndian(bom.unicode(), out);
920	out += `2`;
921	}
922	if (endian == BigEndianness)
923	qToBigEndian<ushort>(in.data(), in.length(), out);
924	else
925	qToLittleEndian<ushort>(in.data(), in.length(), out);
926
927	state->remainingChars = `0`;
928	state->internalState \|= HeaderDone;
929	return out + `2`*in.length();
930	}
931
932	QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
933	{
934	QString result((in.size() + `1`) >> `1`, Qt::Uninitialized); // worst case
935	QChar *qch = convertToUnicode(result.data(), in, state, endian);
936	result.truncate(qch - result.constData());
937	return result;
938	}
939
940	QChar QUtf16::convertToUnicode(QChar out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
941	{
942	qsizetype len = in.size();
943	const char *chars = in.data();
944
945	Q_ASSERT(state);
946
947	if (endian == DetectEndianness)
948	endian = (DataEndianness)state->state_data[Endian];
949
950	const char *end = chars + len;
951
952	// make sure we can decode at least one char
953	if (state->remainingChars + len < `2`) {
954	if (len) {
955	Q_ASSERT(state->remainingChars == `0` && len == `1`);
956	state->remainingChars = `1`;
957	state->state_data[Data] = *chars;
958	}
959	return out;
960	}
961
962	bool headerdone = state && state->internalState & HeaderDone;
963	if (state->flags & QStringConverter::Flag::ConvertInitialBom)
964	headerdone = true;
965
966	if (!headerdone \|\| state->remainingChars) {
967	uchar buf;
968	if (state->remainingChars)
969	buf = state->state_data[Data];
970	else
971	buf = *chars++;
972
973	// detect BOM, set endianness
974	state->internalState \|= HeaderDone;
975	QChar ch(buf, *chars++);
976	if (endian == DetectEndianness) {
977	if (ch == QChar::ByteOrderSwapped) {
978	endian = BigEndianness;
979	} else if (ch == QChar::ByteOrderMark) {
980	endian = LittleEndianness;
981	} else {
982	if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
983	endian = BigEndianness;
984	} else {
985	endian = LittleEndianness;
986	}
987	}
988	}
989	if (endian == BigEndianness)
990	ch = QChar::fromUcs2((ch.unicode() >> `8`) \| ((ch.unicode() & `0xff`) << `8`));
991	if (headerdone \|\| ch != QChar::ByteOrderMark)
992	*out++ = ch;
993	} else if (endian == DetectEndianness) {
994	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
995	}
996
997	int nPairs = (end - chars) >> `1`;
998	if (endian == BigEndianness)
999	qFromBigEndian<ushort>(chars, nPairs, out);
1000	else
1001	qFromLittleEndian<ushort>(chars, nPairs, out);
1002	out += nPairs;
1003
1004	state->state_data[Endian] = endian;
1005	state->remainingChars = `0`;
1006	if ((end - chars) & `1`) {
1007	if (state->flags & QStringConverter::Flag::Stateless) {
1008	*out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1009	} else {
1010	state->remainingChars = `1`;
1011	state->state_data[Data] = *(end - `1`);
1012	}
1013	} else {
1014	state->state_data[Data] = `0`;
1015	}
1016
1017	return out;
1018	}
1019
1020	QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1021	{
1022	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1023	int length = `4`*in.size();
1024	if (writeBom)
1025	length += `4`;
1026	QByteArray ba(length, Qt::Uninitialized);
1027	char *end = convertFromUnicode(ba.data(), in, state, endian);
1028	Q_ASSERT(end - ba.constData() == length);
1029	Q_UNUSED(end);
1030	return ba;
1031	}
1032
1033	char QUtf32::convertFromUnicode(char* out, QStringView in, QStringConverter::State state, DataEndianness endian)
1034	{
1035	Q_ASSERT(state);
1036
1037	bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1038	qsizetype length = `4`*in.length();
1039	if (writeBom)
1040	length += `4`;
1041
1042	if (endian == DetectEndianness)
1043	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1044
1045	if (writeBom) {
1046	if (endian == BigEndianness) {
1047	out[`0`] = `0`;
1048	out[`1`] = `0`;
1049	out[`2`] = (char)`0xfe`;
1050	out[`3`] = (char)`0xff`;
1051	} else {
1052	out[`0`] = (char)`0xff`;
1053	out[`1`] = (char)`0xfe`;
1054	out[`2`] = `0`;
1055	out[`3`] = `0`;
1056	}
1057	out += `4`;
1058	state->internalState \|= HeaderDone;
1059	}
1060
1061	const QChar *uc = in.data();
1062	const QChar *end = in.data() + in.length();
1063	QChar ch;
1064	uint ucs4;
1065	if (state->remainingChars == `1`) {
1066	ch = state->state_data[Data];
1067	// this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1068	state->remainingChars = `0`;
1069	goto decode_surrogate;
1070	}
1071
1072	while (uc < end) {
1073	ch = *uc++;
1074	if (Q_LIKELY(!ch.isSurrogate())) {
1075	ucs4 = ch.unicode();
1076	} else if (Q_LIKELY(ch.isHighSurrogate())) {
1077	decode_surrogate:
1078	if (uc == end) {
1079	if (state->flags & QStringConverter::Flag::Stateless) {
1080	ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? `0` : QChar::ReplacementCharacter;
1081	} else {
1082	state->remainingChars = `1`;
1083	state->state_data[Data] = ch.unicode();
1084	return out;
1085	}
1086	} else if (uc->isLowSurrogate()) {
1087	ucs4 = QChar::surrogateToUcs4(ch, *uc++);
1088	} else {
1089	ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? `0` : QChar::ReplacementCharacter;
1090	}
1091	} else {
1092	ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? `0` : QChar::ReplacementCharacter;
1093	}
1094	if (endian == BigEndianness)
1095	qToBigEndian(ucs4, out);
1096	else
1097	qToLittleEndian(ucs4, out);
1098	out += `4`;
1099	}
1100
1101	return out;
1102	}
1103
1104	QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1105	{
1106	QString result;
1107	result.resize((in.size() + `7`) >> `1`); // worst case
1108	QChar *end = convertToUnicode(result.data(), in, state, endian);
1109	result.truncate(end - result.constData());
1110	return result;
1111	}
1112
1113	QChar QUtf32::convertToUnicode(QChar out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1114	{
1115	qsizetype len = in.size();
1116	const char *chars = in.data();
1117
1118	Q_ASSERT(state);
1119	if (endian == DetectEndianness)
1120	endian = (DataEndianness)state->state_data[Endian];
1121
1122	const char *end = chars + len;
1123
1124	uchar tuple[`4`];
1125	memcpy(tuple, &state->state_data[Data], `4`);
1126
1127	// make sure we can decode at least one char
1128	if (state->remainingChars + len < `4`) {
1129	if (len) {
1130	while (chars < end) {
1131	tuple[state->remainingChars] = *chars;
1132	++state->remainingChars;
1133	++chars;
1134	}
1135	Q_ASSERT(state->remainingChars < `4`);
1136	memcpy(&state->state_data[Data], tuple, `4`);
1137	}
1138	return out;
1139	}
1140
1141	bool headerdone = state->internalState & HeaderDone;
1142	if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1143	headerdone = true;
1144
1145	int num = state->remainingChars;
1146	state->remainingChars = `0`;
1147
1148	if (!headerdone \|\| endian == DetectEndianness \|\| num) {
1149	while (num < `4`)
1150	tuple[num++] = *chars++;
1151	if (endian == DetectEndianness) {
1152	if (tuple[`0`] == `0xff` && tuple[`1`] == `0xfe` && tuple[`2`] == `0` && tuple[`3`] == `0`) {
1153	endian = LittleEndianness;
1154	} else if (tuple[`0`] == `0` && tuple[`1`] == `0` && tuple[`2`] == `0xfe` && tuple[`3`] == `0xff`) {
1155	endian = BigEndianness;
1156	} else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1157	endian = BigEndianness;
1158	} else {
1159	endian = LittleEndianness;
1160	}
1161	}
1162	uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
1163	if (headerdone \|\| code != QChar::ByteOrderMark) {
1164	if (QChar::requiresSurrogates(code)) {
1165	*out++ = QChar (QChar::highSurrogate(code));
1166	*out++ = QChar (QChar::lowSurrogate(code));
1167	} else {
1168	*out++ = QChar (code);
1169	}
1170	}
1171	num = `0`;
1172	} else if (endian == DetectEndianness) {
1173	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1174	}
1175	state->state_data[Endian] = endian;
1176	state->internalState \|= HeaderDone;
1177
1178	while (chars < end) {
1179	tuple[num++] = *chars++;
1180	if (num == `4`) {
1181	uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
1182	for (char16_t c : QChar::fromUcs4(code))
1183	*out++ = c;
1184	num = `0`;
1185	}
1186	}
1187
1188	if (num) {
1189	if (state->flags & QStringDecoder::Flag::Stateless) {
1190	*out++ = QChar::ReplacementCharacter;
1191	} else {
1192	state->state_data[Endian] = endian;
1193	state->remainingChars = num;
1194	memcpy(&state->state_data[Data], tuple, `4`);
1195	}
1196	}
1197
1198	return out;
1199	}
1200
1201	#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1202	static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
1203	{
1204	qsizetype length = in.size();
1205	const char *chars = in.data();
1206
1207	Q_ASSERT(state);
1208	if (state->flags & QStringConverter::Flag::Stateless) // temporary
1209	state = nullptr;
1210
1211	if (!chars \|\| !length)
1212	return QString();
1213
1214	int copyLocation = `0`;
1215	int extra = `2`;
1216	if (state && state->remainingChars) {
1217	copyLocation = state->remainingChars;
1218	extra += copyLocation;
1219	}
1220	int newLength = length + extra;
1221	char mbcs = new* char[newLength];
1222	//ensure that we have a NULL terminated string
1223	mbcs[newLength-`1`] = `0`;
1224	mbcs[newLength-`2`] = `0`;
1225	memcpy(&(mbcs[copyLocation]), chars, length);
1226	if (copyLocation) {
1227	//copy the last character from the state
1228	mbcs[`0`] = (char)state->state_data[`0`];
1229	state->remainingChars = `0`;
1230	}
1231	const char *mb = mbcs;
1232	const char *next = `0`;
1233	QString s;
1234	while ((next = CharNextExA(CP_ACP, mb, `0`)) != mb) {
1235	wchar_t wc[`2`] ={`0`};
1236	int charlength = next - mb;
1237	int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED\|MB_ERR_INVALID_CHARS, mb, charlength, wc, `2`);
1238	if (len>`0`) {
1239	s.append(QChar(wc[`0`]));
1240	} else {
1241	int r = GetLastError();
1242	//check if the character being dropped is the last character
1243	if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -`3`) && state) {
1244	state->remainingChars = `1`;
1245	state->state_data[`0`] = (char)*mb;
1246	}
1247	}
1248	mb = next;
1249	}
1250	delete [] mbcs;
1251	return s;
1252	}
1253
1254
1255	QString QLocal8Bit::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
1256	{
1257	qsizetype length = in.size();
1258
1259	Q_ASSERT(length < INT_MAX); // ### FIXME
1260	const char *mb = in.data();
1261	int mblen = length;
1262
1263	if (!mb \|\| !mblen)
1264	return QString();
1265
1266	QVarLengthArray<wchar_t, `4096`> wc(`4096`);
1267	int len;
1268	QString sp;
1269	bool prepend = false;
1270	char state_data = `0`;
1271	int remainingChars = `0`;
1272
1273	//save the current state information
1274	if (state) {
1275	state_data = (char)state->state_data[`0`];
1276	remainingChars = state->remainingChars;
1277	}
1278
1279	//convert the pending character (if available)
1280	if (state && remainingChars) {
1281	char prev[`3`] = {`0`};
1282	prev[`0`] = state_data;
1283	prev[`1`] = mb[`0`];
1284	remainingChars = `0`;
1285	len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1286	prev, `2`, wc.data(), wc.length());
1287	if (len) {
1288	sp.append(QChar(wc[`0`]));
1289	if (mblen == `1`) {
1290	state->remainingChars = `0`;
1291	return sp;
1292	}
1293	prepend = true;
1294	mb++;
1295	mblen--;
1296	wc[`0`] = `0`;
1297	}
1298	}
1299
1300	while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED\|MB_ERR_INVALID_CHARS,
1301	mb, mblen, wc.data(), wc.length()))) {
1302	int r = GetLastError();
1303	if (r == ERROR_INSUFFICIENT_BUFFER) {
1304	const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1305	mb, mblen, `0`, `0`);
1306	wc.resize(wclen);
1307	} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1308	//find the last non NULL character
1309	while (mblen > `1` && !(mb[mblen-`1`]))
1310	mblen--;
1311	//check whether, we hit an invalid character in the middle
1312	if ((mblen <= `1`) \|\| (remainingChars && state_data))
1313	return convertToUnicodeCharByChar(in, state);
1314	//Remove the last character and try again...
1315	state_data = mb[mblen-`1`];
1316	remainingChars = `1`;
1317	mblen--;
1318	} else {
1319	// Fail.
1320	qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1321	break;
1322	}
1323	}
1324
1325	if (len <= `0`)
1326	return QString();
1327
1328	if (wc[len-`1`] == `0`) // len - 1: we don't want terminator
1329	--len;
1330
1331	//save the new state information
1332	if (state) {
1333	state->state_data[`0`] = (char)state_data;
1334	state->remainingChars = remainingChars;
1335	}
1336	QString s((QChar*)wc.data(), len);
1337	if (prepend) {
1338	return sp+s;
1339	}
1340	return s;
1341	}
1342
1343	QByteArray QLocal8Bit::convertFromUnicode(QStringView in, QStringConverter::State *state)
1344	{
1345	const QChar *ch = in.data();
1346	qsizetype uclen = in.size();
1347
1348	Q_ASSERT(uclen < INT_MAX); // ### FIXME
1349	Q_ASSERT(state);
1350	Q_UNUSED(state); // ### Fixme
1351	if (state->flags & QStringConverter::Flag::Stateless) // temporary
1352	state = nullptr;
1353
1354	if (!ch)
1355	return QByteArray();
1356	if (uclen == `0`)
1357	return QByteArray("");
1358	BOOL used_def;
1359	QByteArray mb(`4096`, `0`);
1360	int len;
1361	while (!(len=WideCharToMultiByte(CP_ACP, `0`, (const wchar_t*)ch, uclen,
1362	mb.data(), mb.size()-`1`, `0`, &used_def)))
1363	{
1364	int r = GetLastError();
1365	if (r == ERROR_INSUFFICIENT_BUFFER) {
1366	mb.resize(`1`+WideCharToMultiByte(CP_ACP, `0`,
1367	(const wchar_t*)ch, uclen,
1368	`0`, `0`, `0`, &used_def));
1369	// and try again...
1370	} else {
1371	// Fail. Probably can't happen in fact (dwFlags is 0).
1372	#ifndef QT_NO_DEBUG
1373	// Can't use qWarning(), as it'll recurse to handle %ls
1374	fprintf(stderr,
1375	"WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1376	r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
1377	#endif
1378	break;
1379	}
1380	}
1381	mb.resize(len);
1382	return mb;
1383	}
1384	#endif
1385
1386	void QStringConverter::State::clear()
1387	{
1388	if (clearFn)
1389	clearFn(this);
1390	else
1391	state_data[`0`] = state_data[`1`] = state_data[`2`] = state_data[`3`] = `0`;
1392	remainingChars = `0`;
1393	invalidChars = `0`;
1394	internalState = `0`;
1395	}
1396
1397	static QChar fromUtf16(QChar out, QByteArrayView in, QStringConverter::State *state)
1398	{
1399	return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
1400	}
1401
1402	static char toUtf16(char* out, QStringView in, QStringConverter::State state)
1403	{
1404	return QUtf16::convertFromUnicode(out, in, state, DetectEndianness);
1405	}
1406
1407	static QChar fromUtf16BE(QChar out, QByteArrayView in, QStringConverter::State *state)
1408	{
1409	return QUtf16::convertToUnicode(out, in, state, BigEndianness);
1410	}
1411
1412	static char toUtf16BE(char* out, QStringView in, QStringConverter::State state)
1413	{
1414	return QUtf16::convertFromUnicode(out, in, state, BigEndianness);
1415	}
1416
1417	static QChar fromUtf16LE(QChar out, QByteArrayView in, QStringConverter::State *state)
1418	{
1419	return QUtf16::convertToUnicode(out, in, state, LittleEndianness);
1420	}
1421
1422	static char toUtf16LE(char* out, QStringView in, QStringConverter::State state)
1423	{
1424	return QUtf16::convertFromUnicode(out, in, state, LittleEndianness);
1425	}
1426
1427	static QChar fromUtf32(QChar out, QByteArrayView in, QStringConverter::State *state)
1428	{
1429	return QUtf32::convertToUnicode(out, in, state, DetectEndianness);
1430	}
1431
1432	static char toUtf32(char* out, QStringView in, QStringConverter::State state)
1433	{
1434	return QUtf32::convertFromUnicode(out, in, state, DetectEndianness);
1435	}
1436
1437	static QChar fromUtf32BE(QChar out, QByteArrayView in, QStringConverter::State *state)
1438	{
1439	return QUtf32::convertToUnicode(out, in, state, BigEndianness);
1440	}
1441
1442	static char toUtf32BE(char* out, QStringView in, QStringConverter::State state)
1443	{
1444	return QUtf32::convertFromUnicode(out, in, state, BigEndianness);
1445	}
1446
1447	static QChar fromUtf32LE(QChar out, QByteArrayView in, QStringConverter::State *state)
1448	{
1449	return QUtf32::convertToUnicode(out, in, state, LittleEndianness);
1450	}
1451
1452	static char toUtf32LE(char* out, QStringView in, QStringConverter::State state)
1453	{
1454	return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
1455	}
1456
1457	void qt_from_latin1(char16_t dst, const* char str, size_t size) noexcept*;
1458
1459	static QChar fromLatin1(QChar out, QByteArrayView in, QStringConverter::State *state)
1460	{
1461	Q_ASSERT(state);
1462	Q_UNUSED(state);
1463
1464	qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size()));
1465	return out + in.size();
1466	}
1467
1468
1469	static char toLatin1(char* out, QStringView in, QStringConverter::State state)
1470	{
1471	Q_ASSERT(state);
1472	if (state->flags & QStringConverter::Flag::Stateless) // temporary
1473	state = nullptr;
1474
1475	const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? `0` : `'?'`;
1476	int invalid = `0`;
1477	for (qsizetype i = `0`; i < in.length(); ++i) {
1478	if (in [i] > QChar (`0xff`)) {
1479	*out = replacement;
1480	++invalid;
1481	} else {
1482	out = (char*)in [i].cell();
1483	}
1484	++out;
1485	}
1486	if (state)
1487	state->invalidChars += invalid;
1488	return out;
1489	}
1490
1491	static QChar fromLocal8Bit(QChar out, QByteArrayView in, QStringConverter::State *state)
1492	{
1493	QString s = QLocal8Bit::convertToUnicode(in, state);
1494	memcpy(out, s.constData(), s.length()*sizeof(QChar));
1495	return out + s.length();
1496	}
1497
1498	static char toLocal8Bit(char* out, QStringView in, QStringConverter::State state)
1499	{
1500	QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1501	memcpy(out, s.constData(), s.length());
1502	return out + s.length();
1503	}
1504
1505
1506	static qsizetype fromUtf8Len(qsizetype l) { return l + `1`; }
1507	static qsizetype toUtf8Len(qsizetype l) { return `3`*(l + `1`); }
1508
1509	static qsizetype fromUtf16Len(qsizetype l) { return l/`2` + `2`; }
1510	static qsizetype toUtf16Len(qsizetype l) { return `2`*(l + `1`); }
1511
1512	static qsizetype fromUtf32Len(qsizetype l) { return l/`2` + `2`; }
1513	static qsizetype toUtf32Len(qsizetype l) { return `4`*(l + `1`); }
1514
1515	static qsizetype fromLatin1Len(qsizetype l) { return l + `1`; }
1516	static qsizetype toLatin1Len(qsizetype l) { return l + `1`; }
1517
1518
1519
/*!
    \class QStringConverterBase
    \internal

    Just a common base class for QStringConverter and QTextCodec
*/
1526
/*!
1528	\class QStringConverter
1529	\inmodule QtCore
1530	\brief The QStringConverter class provides a base class for encoding and decoding text.
1531	\reentrant
1532	\ingroup i18n
1533
1534	Qt uses UTF-16 to store, draw and manipulate strings. In many
1535	situations you may wish to deal with data that uses a different
1536	encoding. Most text data transferred over files and network connections is encoded
1537	in UTF-8.
1538
1539	The QStringConverter class is a base class for the \l {QStringEncoder} and
1540	\l {QStringDecoder} classes that help with converting between different
1541	text encodings. QStringDecoder can decode a string from an encoded representation
1542	into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1543	operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1544	the requested encoding.
1545
1546	The supported encodings are:
1547
1548	\list
1549	\li UTF-8
1550	\li UTF-16
1551	\li UTF-16BE
1552	\li UTF-16LE
1553	\li UTF-32
1554	\li UTF-32BE
1555	\li UTF-32LE
1556	\li ISO-8859-1 (Latin-1)
1557	\li The system encoding
1558	\endlist
1559
1560	\l {QStringConverter}s can be used as follows to convert some encoded
1561	string to and from UTF-16.
1562
1563	Suppose you have some string encoded in UTF-8, and
1564	want to convert it to a QString. The simple way
1565	to do it is to use a \l {QStringDecoder} like this:
1566
1567	\snippet code/src_corelib_text_qstringconverter.cpp 0
1568
1569	After this, \c string holds the text in decoded form.
1570	Converting a string from Unicode to the local encoding is just as
1571	easy using the \l {QStringEncoder} class:
1572
1573	\snippet code/src_corelib_text_qstringconverter.cpp 1
1574
1575	To read or write text files in various encodings, use QTextStream and
1576	its \l{QTextStream::setEncoding()}{setEncoding()} function.
1577
1578	Some care must be taken when trying to convert the data in chunks,
1579	for example, when receiving it over a network. In such cases it is
1580	possible that a multi-byte character will be split over two
1581	chunks. At best this might result in the loss of a character and
1582	at worst cause the entire conversion to fail.
1583
1584	Both QStringEncoder and QStringDecoder make this easy, by tracking
1585	this in an internal state. So simply calling the encoder or decoder
1586	again with the next chunk of data will automatically continue encoding
1587	or decoding the data correctly:
1588
1589	\snippet code/src_corelib_text_qstringconverter.cpp 2
1590
1591	The QStringDecoder object maintains state between chunks and therefore
1592	works correctly even if a multi-byte character is split between
1593	chunks.
1594
1595	QStringConverter objects can't be copied because of their internal state, but
1596	can be moved.
1597
1598	\sa QTextStream, QStringDecoder, QStringEncoder
1599	*/
1600
/*!
    \enum QStringConverter::Flag

    \value Default Default conversion rules apply.
    \value ConvertInvalidToNull If this flag is set, each invalid input
           character is output as a null character. If it is not set,
           invalid input characters are represented as QChar::ReplacementCharacter
           if the output encoding can represent that character, otherwise as a question mark.
    \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
           character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
           encodings.
    \value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips a
           leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
           skipped, but converted to UTF-16 and inserted at the start of the created QString.
    \value Stateless Ignore possible converter states between different function calls
           to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
           sequence of data is encountered.
*/
1619
/*!
    \enum QStringConverter::Encoding
    \value Utf8 Create a converter to or from UTF-8
    \value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
           detected by a leading byte order mark. If none exists or when encoding, the system byte order will
           be assumed.
    \value Utf16BE Create a converter to or from big endian UTF-16.
    \value Utf16LE Create a converter to or from little endian UTF-16.
    \value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
           detected by a leading byte order mark. If none exists or when encoding, the system byte order will
           be assumed.
    \value Utf32BE Create a converter to or from big endian UTF-32.
    \value Utf32LE Create a converter to or from little endian UTF-32.
    \value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
    \value System Create a converter to or from the underlying encoding of the
           operating system's locale. This is always assumed to be UTF-8 for Unix based
           systems. On Windows, this converts to and from the locale code page.
*/
1638
/*!
    \struct QStringConverter::Interface
    \internal
*/
1643
1644	const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + `1`] =
1645	{
1646	{ "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
1647	{ "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
1648	{ "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
1649	{ "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
1650	{ "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
1651	{ "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
1652	{ "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
1653	{ "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len },
1654	{ "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
1655	};
1656
// match names case insensitive and skipping '-' and '_'
//
// Both arguments must be non-null, NUL-terminated strings. Case folding is
// restricted to ASCII letters on purpose: encoding names are ASCII, and the
// locale-aware toupper() maps 'i' to a non-ASCII letter in some locales
// (e.g. Turkish), which would make "iso-8859-1" fail to match "ISO-8859-1".
static bool nameMatch(const char *a, const char *b)
{
    const auto asciiUpper = [](char c) -> char {
        return (c >= 'a' && c <= 'z') ? char(c - ('a' - 'A')) : c;
    };

    while (*a && *b) {
        if (*a == '-' || *a == '_') {
            ++a;
            continue;
        }
        if (*b == '-' || *b == '_') {
            ++b;
            continue;
        }
        if (asciiUpper(*a) != asciiUpper(*b))
            return false;
        ++a;
        ++b;
    }
    // Match only if both names were consumed completely.
    return !*a && !*b;
}
1676
1677
/*!
    \fn constexpr QStringConverter::QStringConverter()
    \internal
*/

/*!
    \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
    \internal
*/
1687
1688	/!*
1689	\internal
1690	*/
1691	QStringConverter::QStringConverter(const char *name, Flags f)
1692	: iface(nullptr), state (f)
1693	{
1694	auto e = encodingForName(name);
1695	if (e)
1696	iface = encodingInterfaces + int(e.value());
1697	}
1698
/*!
    \fn bool QStringConverter::isValid() const

    Returns true if this is a valid string converter that can be used for encoding or
    decoding text.

    Default constructed string converters or converters constructed with an unsupported
    name are not valid.
*/

/*!
    \fn void QStringConverter::resetState()

    Resets the internal state of the converter, clearing potential errors or partial
    conversions.
*/

/*!
    \fn bool QStringConverter::hasError() const

    Returns true if a conversion could not correctly convert a character. This could for example
    get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
    limitations in the target encoding.
*/

/*!
    \fn const char *QStringConverter::name() const

    Returns the canonical name of the encoding this QStringConverter can encode or decode.
    Returns \nullptr if the converter is not valid.

    \sa isValid()
*/
1732
1733	/!*
1734	Returns an optional encoding for \a name. The optional is empty if the name could
1735	not get converted to a valid encoding.
1736	*/
1737	std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name)
1738	{
1739	for (int i = `0`; i < LastEncoding + `1`; ++i) {
1740	if (nameMatch(encodingInterfaces[i].name, name))
1741	return QStringConverter::Encoding(i);
1742	}
1743	if (nameMatch(name, "latin1"))
1744	return QStringConverter::Latin1;
1745	return std::nullopt;
1746	}
1747
1748	/!*
1749	Returns the encoding for the content of \a data if it can be determined.
1750	\a expectedFirstCharacter can be passed as an additional hint to help determine
1751	the encoding.
1752
1753	The returned optional is empty, if the encoding is unclear.
1754	*/
1755	std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter)
1756	{
1757	qsizetype arraySize = data.size();
1758	if (arraySize > `3`) {
1759	uint uc = qFromUnaligned<uint>(data.data());
1760	if (uc == qToBigEndian(uint(QChar::ByteOrderMark)))
1761	return QStringConverter::Utf32BE;
1762	if (uc == qToLittleEndian(uint(QChar::ByteOrderMark)))
1763	return QStringConverter::Utf32LE;
1764	if (expectedFirstCharacter) {
1765	// catch also anything starting with the expected character
1766	if (qToLittleEndian(uc) == expectedFirstCharacter)
1767	return QStringConverter::Utf32LE;
1768	else if (qToBigEndian(uc) == expectedFirstCharacter)
1769	return QStringConverter::Utf32BE;
1770	}
1771	}
1772
1773	if (arraySize > `2`) {
1774	if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == `0`)
1775	return QStringConverter::Utf8;
1776	}
1777
1778	if (arraySize > `1`) {
1779	ushort uc = qFromUnaligned<ushort>(data.data());
1780	if (uc == qToBigEndian(ushort(QChar::ByteOrderMark)))
1781	return QStringConverter::Utf16BE;
1782	if (uc == qToLittleEndian(ushort(QChar::ByteOrderMark)))
1783	return QStringConverter::Utf16LE;
1784	if (expectedFirstCharacter) {
1785	// catch also anything starting with the expected character
1786	if (qToLittleEndian(uc) == expectedFirstCharacter)
1787	return QStringConverter::Utf16LE;
1788	else if (qToBigEndian(uc) == expectedFirstCharacter)
1789	return QStringConverter::Utf16BE;
1790	}
1791	}
1792	return std::nullopt;
1793	}
1794
1795	/!*
1796	Tries to determine the encoding of the HTML in \a data by looking at leading byte
1797	order marks or a charset specifier in the HTML meta tag. If the optional is empty,
1798	the encoding specified is not supported by QStringConverter. If no encoding is
1799	detected, the method returns Utf8.
1800	*/
1801	std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
1802	{
1803	// determine charset
1804	auto encoding = encodingForData(data);
1805	if (encoding)
1806	// trust the initial BOM
1807	return encoding;
1808
1809	QByteArray header = data.first(qMin(data.size(), qsizetype(`1024`))).toByteArray().toLower();
1810	int pos = header.indexOf("meta ");
1811	if (pos != -`1`) {
1812	pos = header.indexOf("charset=", pos);
1813	if (pos != -`1`) {
1814	pos += int(qstrlen("charset="));
1815	if (pos < header.size() && (header.at(pos) == `'\"'` \|\| header.at(pos) == `'\''`))
1816	++pos;
1817
1818	int pos2 = pos;
1819	// The attribute can be closed with either """, "'", ">" or "/",
1820	// none of which are valid charset characters.
1821	while (++pos2 < header.size()) {
1822	char ch = header.at(pos2);
1823	if (ch == `'\"'` \|\| ch == `'\''` \|\| ch == `'>'` \|\| ch == `'/'`) {
1824	QByteArray name = header.mid(pos, pos2 - pos);
1825	int colon = name.indexOf(`':'`);
1826	if (colon > `0`)
1827	name = name.left(colon);
1828	name = name.simplified();
1829	if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1830	name = QByteArrayLiteral("UTF-8");
1831	if (!name.isEmpty())
1832	return encodingForName(name);
1833	}
1834	}
1835	}
1836	}
1837	return Utf8;
1838	}
1839
1840	/!*
1841	Returns the canonical name for \a encoding.
1842	*/
1843	const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
1844	{
1845	return encodingInterfaces[int(e)].name;
1846	}
1847
/*!
1849	\class QStringEncoder
1850	\inmodule QtCore
1851	\brief The QStringEncoder class provides a state-based encoder for text.
1852	\reentrant
1853	\ingroup i18n
1854
1855	A text encoder converts text from Qt's internal representation into an encoded
1856	text format using a specific encoding.
1857
1858	Converting a string from Unicode to the local encoding can be achieved
1859	using the following code:
1860
1861	\snippet code/src_corelib_text_qstringconverter.cpp 1
1862
1863	The encoder remembers any state that is required between calls, so converting
1864	data received in chunks, for example, when receiving it over a network, is just as
1865	easy, by calling the encoder whenever new data is available:
1866
1867	\snippet code/src_corelib_text_qstringconverter.cpp 3
1868
1869	The QStringEncoder object maintains state between chunks and therefore
1870	works correctly even if a UTF-16 surrogate character is split between
1871	chunks.
1872
1873	QStringEncoder objects can't be copied because of their internal state, but
1874	can be moved.
1875
1876	\sa QStringConverter, QStringDecoder
1877	*/
1878
1879	/!*
1880	\fn constexpr QStringEncoder::QStringEncoder(const Interface i)*
1881	\internal
1882	*/
1883
1884	/!*
1885	\fn constexpr QStringEncoder::QStringEncoder()
1886
1887	Default constructs an encoder. The default encoder is not valid,
1888	and can't be used for converting text.
1889	*/
1890
1891	/!*
1892	\fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
1893
1894	Creates an encoder object using \a encoding and \a flags.
1895	*/
1896
1897	/*!
1898	\fn constexpr QStringEncoder::QStringEncoder(const char *name, Flags flags = Flag::Default)
1899
1900	Creates an encoder object using \a name and \a flags.
1901	If \a name is not the name of a known encoding, an invalid converter will get created.
1902
1903	\sa isValid()
1904	*/
1905
1906	/*!
1907	\fn QByteArray QStringEncoder::operator()(const QString &in)
1908	\fn QByteArray QStringEncoder::encode(const QString &in)
1909
1910	Converts \a in and returns the data as a byte array.
1911	*/
1912
1913	/*!
1914	\fn QByteArray QStringEncoder::operator()(QStringView in)
1915	\fn QByteArray QStringEncoder::encode(QStringView in)
1916	\overload
1917
1918	Converts \a in and returns the data as a byte array.
1919	*/
1920
1921	/*!
1922	\fn QByteArray QStringEncoder::operator()(const QChar *in, qsizetype length)
1923	\fn QByteArray QStringEncoder::encode(const QChar *in, qsizetype length)
1924	\overload
1925
1926	Converts \a length QChars from \a in and returns the data as a byte array.
1927	*/
1928
1929	/*!
1930	\fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
1931
1932	Returns the maximum amount of characters required to be able to process
1933	\a inputLength decoded data.
1934
1935	\sa appendToBuffer
1936	*/
1937
1938	/*!
1939	\fn char *QStringEncoder::appendToBuffer(char *out, const QChar *in, qsizetype length)
1940
1941	Encodes \a length QChars from \a in and writes the encoded result into the buffer
1942	starting at \a out. Returns a pointer to the end of data written.
1943
1944	\a out needs to be large enough to be able to hold all the encoded data. Use
1945	\l{requiredSpace} to determine the maximum size requirements to be able to encode
1946	a QChar buffer of \a length.
1947
1948	\sa requiredSpace
1949	*/
1950
1951	/*!
1952	\class QStringDecoder
1953	\inmodule QtCore
1954	\brief The QStringDecoder class provides a state-based decoder for text.
1955	\reentrant
1956	\ingroup i18n
1957
1958	A text decoder converts text from an encoded text format that uses a specific encoding
1959	into Qt's internal representation.
1960
1961	Converting encoded data into a QString can be achieved
1962	using the following code:
1963
1964	\snippet code/src_corelib_text_qstringconverter.cpp 0
1965
1966	The decoder remembers any state that is required between calls, so converting
1967	data received in chunks, for example, when receiving it over a network, is just as
1968	easy, by calling the decoder whenever new data is available:
1969
1970	\snippet code/src_corelib_text_qstringconverter.cpp 2
1971
1972	The QStringDecoder object maintains state between chunks and therefore
1973	works correctly even if chunks are split in the middle of a multi-byte character
1974	sequence.
1975
1976	QStringDecoder objects can't be copied because of their internal state, but
1977	can be moved.
1978
1979	\sa QStringConverter, QStringEncoder
1980	*/
1981
1982	/*!
1983	\fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
1984	\internal
1985	*/
1986
1987	/*!
1988	\fn constexpr QStringDecoder::QStringDecoder()
1989
1990	Default constructs a decoder. The default decoder is not valid,
1991	and can't be used for converting text.
1992	*/
1993
1994	/*!
1995	\fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
1996
1997	Creates a decoder object using \a encoding and \a flags.
1998	*/
1999
2000	/*!
2001	\fn constexpr QStringDecoder::QStringDecoder(const char *name, Flags flags = Flag::Default)
2002
2003	Creates a decoder object using \a name and \a flags.
2004	If \a name is not the name of a known encoding, an invalid converter will get created.
2005
2006	\sa isValid()
2007	*/
2008
2009	/*!
2010	\fn QString QStringDecoder::operator()(const QByteArray &ba)
2011	\fn QString QStringDecoder::decode(const QByteArray &ba)
2012
2013	Converts \a ba and returns the data as a QString.
2014	*/
2015
2016	/*!
2017	\fn QString QStringDecoder::operator()(const char *in, qsizetype size)
2018	\fn QString QStringDecoder::decode(const char *in, qsizetype size)
2019	\overload
2020
2021	Converts a byte array containing the first \a size bytes of the array \a in
2022	and returns the data as a QString.
2023	*/
2024
2025	/*!
2026	\fn QString QStringDecoder::operator()(const char *chars)
2027	\fn QString QStringDecoder::decode(const char *chars)
2028	\overload
2029
2030	Converts \a chars and returns the data as a QString. \a chars is assumed to
2031	point to a \c{\0}-terminated string and its length is determined dynamically.
2032	*/
2033
2034	/*!
2035	\fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
2036
2037	Returns the maximum amount of UTF-16 code units required to be able to process
2038	\a inputLength encoded data.
2039
2040	\sa appendToBuffer
2041	*/
2042
2043	/*!
2044	\fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
2045
2046	Decodes the sequence of bytes viewed by \a in and writes the decoded result into
2047	the buffer starting at \a out. Returns a pointer to the end of data written.
2048
2049	\a out needs to be large enough to be able to hold all the decoded data. Use
2050	\l{requiredSpace} to determine the maximum size requirements to decode an encoded
2051	data buffer of \c in.size() bytes.
2052
2053	\sa requiredSpace
2054	*/
2055
2056	QT_END_NAMESPACE
2057

Browse the source code of Qt/src/corelib/text/qstringconverter.cpp