1/****************************************************************************
2**
3** Copyright (C) 2020 The Qt Company Ltd.
4** Copyright (C) 2020 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#include <qstringconverter.h>
42#include <private/qstringconverter_p.h>
43#include "qendian.h"
44
45#include "private/qsimd_p.h"
46#include "private/qstringiterator_p.h"
47#include "qbytearraymatcher.h"
48
49#ifdef Q_OS_WIN
50#include <qt_windows.h>
51#endif
52
53#if __has_include(<bit>) && __cplusplus > 201703L
54#include <bit>
55#endif
56
57QT_BEGIN_NAMESPACE
58
// Slots of QStringConverter::State::state_data as used e.g. by the UTF-16
// decoder in this file: state_data[Endian] keeps the byte order in use and
// state_data[Data] keeps a pending byte that could not be decoded yet.
enum { Endian = 0, Data = 1 };

// The UTF-8 encoding of the byte order mark; written by the UTF-8 encoder when
// a BOM is requested and skipped by the UTF-8 decoders at the start of input.
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
62
63#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
64 || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
65static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
66{
67#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
68 return std::bit_width(v) - 1;
69#else
70 uint result = qCountLeadingZeroBits(v);
71 // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
72 // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
73 // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
74 result ^= sizeof(unsigned) * 8 - 1;
75 return result;
76#endif
77}
78#endif
79
80#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
81static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
82{
83 // do sixteen characters at a time
84 for ( ; end - src >= 16; src += 16, dst += 16) {
85# ifdef __AVX2__
86 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
87 __m128i data1 = _mm256_castsi256_si128(data);
88 __m128i data2 = _mm256_extracti128_si256(data, 1);
89# else
90 __m128i data1 = _mm_loadu_si128((const __m128i*)src);
91 __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
92# endif
93
94 // check if everything is ASCII
95 // the highest ASCII value is U+007F
96 // Do the packing directly:
97 // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
98 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
99 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
100 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
101 // "non-ASCII", but it's an acceptable compromise.
102 __m128i packed = _mm_packus_epi16(data1, data2);
103 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
104
105 // store, even if there are non-ASCII characters here
106 _mm_storeu_si128((__m128i*)dst, packed);
107
108 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
109 ushort n = ~_mm_movemask_epi8(nonAscii);
110 if (n) {
111 // find the next probable ASCII character
112 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
113 // characters still coming
114 nextAscii = src + qBitScanReverse(n) + 1;
115
116 n = qCountTrailingZeroBits(n);
117 dst += n;
118 src += n;
119 return false;
120 }
121 }
122
123 if (end - src >= 8) {
124 // do eight characters at a time
125 __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
126 __m128i packed = _mm_packus_epi16(data, data);
127 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
128
129 // store even non-ASCII
130 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
131
132 uchar n = ~_mm_movemask_epi8(nonAscii);
133 if (n) {
134 nextAscii = src + qBitScanReverse(n) + 1;
135 n = qCountTrailingZeroBits(n);
136 dst += n;
137 src += n;
138 return false;
139 }
140 }
141
142 return src == end;
143}
144
145static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
146{
147 // do sixteen characters at a time
148 for ( ; end - src >= 16; src += 16, dst += 16) {
149 __m128i data = _mm_loadu_si128((const __m128i*)src);
150
151#ifdef __AVX2__
152 const int BitSpacing = 2;
153 // load and zero extend to an YMM register
154 const __m256i extended = _mm256_cvtepu8_epi16(data);
155
156 uint n = _mm256_movemask_epi8(extended);
157 if (!n) {
158 // store
159 _mm256_storeu_si256((__m256i*)dst, extended);
160 continue;
161 }
162#else
163 const int BitSpacing = 1;
164
165 // check if everything is ASCII
166 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
167 uint n = _mm_movemask_epi8(data);
168 if (!n) {
169 // unpack
170 _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
171 _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
172 continue;
173 }
174#endif
175
176 // copy the front part that is still ASCII
177 while (!(n & 1)) {
178 *dst++ = *src++;
179 n >>= BitSpacing;
180 }
181
182 // find the next probable ASCII character
183 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
184 // characters still coming
185 n = qBitScanReverse(n);
186 nextAscii = src + (n / BitSpacing) + 1;
187 return false;
188
189 }
190
191 if (end - src >= 8) {
192 __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
193 uint n = _mm_movemask_epi8(data) & 0xff;
194 if (!n) {
195 // unpack and store
196 _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
197 } else {
198 while (!(n & 1)) {
199 *dst++ = *src++;
200 n >>= 1;
201 }
202
203 n = qBitScanReverse(n);
204 nextAscii = src + n + 1;
205 return false;
206 }
207 }
208
209 return src == end;
210}
211
// Scans [src, end) for the first byte with the high bit set.
//
// Returns a pointer to where the caller should continue with scalar code:
// either the first non-ASCII byte found by the vector loops, the start of a
// 4-byte group that contains a non-ASCII byte, or the start of the tail
// (fewer than four bytes, possibly empty) that was not examined at all.
// nextAscii is set to one past the last non-ASCII byte of the vector that was
// examined, to the returned pointer for the 4-byte case, or to end when the
// scan ran to the tail.
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
#ifdef __AVX2__
    // do 32 characters at a time
    // (this is similar to simdTestMask in qstring.cpp)
    const __m256i mask = _mm256_set1_epi8(0x80);
    for ( ; end - src >= 32; src += 32) {
        __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
        if (_mm256_testz_si256(mask, data))
            continue;

        // at least one high bit is set, so the mask below cannot be zero
        uint n = _mm256_movemask_epi8(data);
        Q_ASSUME(n);

        // find the next probable ASCII character
        // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        nextAscii = src + qBitScanReverse(n) + 1;

        // return the non-ASCII character
        return src + qCountTrailingZeroBits(n);
    }
#endif

    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16) {
        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));

        // check if everything is ASCII
        // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
        uint n = _mm_movemask_epi8(data);
        if (!n)
            continue;

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        nextAscii = src + qBitScanReverse(n) + 1;

        // return the non-ASCII character
        return src + qCountTrailingZeroBits(n);
    }

    // do four characters at a time
    for ( ; end - src >= 4; src += 4) {
        quint32 data = qFromUnaligned<quint32>(src);
        data &= 0x80808080U;
        if (!data)
            continue;

        // We don't try to guess which of the three bytes is ASCII and which
        // one isn't. The chance that at least two of them are non-ASCII is
        // better than 75%.
        // Setting nextAscii to src makes the caller process a single byte
        // with scalar code and then call back into this function.
        nextAscii = src;
        return src;
    }
    // fewer than four bytes left: hand the tail to the caller's scalar loop
    nextAscii = end;
    return src;
}
271
// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
// and advance src8 and src16 to the first character that could not be compared
//
// Both pointers are advanced by the same number of characters. The function
// stops at the first 8-bit character that is not US-ASCII, at the first
// position where the two inputs differ, or when too few characters are left
// for the vector widths used here; the caller handles whatever remains.
static void simdCompareAscii(const char8_t *&src8, const char8_t *end8, const char16_t *&src16, const char16_t *end16)
{
    // log2 of the number of bits that each character occupies in `mask`
    int bitSpacing = 1;
    qptrdiff len = qMin(end8 - src8, end16 - src16);
    qptrdiff offset = 0;
    // non-zero once a mismatch or a non-ASCII character has been found
    uint mask = 0;

    // do sixteen characters at a time
    for ( ; offset + 16 < len; offset += 16) {
        __m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
#ifdef __AVX2__
        // AVX2 version, use 256-bit registers and VPMOVXZBW
        __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));

        // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
        __m256i datax8 = _mm256_cvtepu8_epi16(data8);
        mask = _mm256_movemask_epi8(datax8);
        if (mask)
            break;

        // compare Latin1 to UTF-16
        __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
        mask = ~_mm256_movemask_epi8(latin1cmp);
        if (mask)
            break;
#else
        // non-AVX2 code
        __m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
        __m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + 1);

        // expand US-ASCII as if it were Latin1, we'll confirm later
        __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
        __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());

        // compare Latin1 to UTF-16
        __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
        __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
        // combine both halves into one 32-bit mask with two bits per character
        mask = _mm_movemask_epi8(latin1cmphi) << 16;
        mask |= ushort(_mm_movemask_epi8(latin1cmplo));
        mask = ~mask;
        if (mask)
            break;

        // confirm it was US-ASCII
        mask = _mm_movemask_epi8(data8);
        if (mask) {
            // this mask comes from the 8-bit data: one bit per character
            bitSpacing = 0;
            break;
        }
#endif
    }

    // helper for comparing 4 or 8 characters
    // (leaves a non-zero `mask` on mismatch/non-ASCII, otherwise advances `offset` by n)
    auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
        // n = 4  ->  sizemask = 0xff
        // n = 8  ->  sizemask = 0xffff
        unsigned sizemask = (1U << (2 * n)) - 1;

        // expand as if Latin1
        data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());

        // compare and confirm it's US-ASCII
        __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
        mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
        mask |= _mm_movemask_epi8(data8);
        if (mask == 0)
            offset += n;
    };

    // do eight characters at a time
    if (mask == 0 && offset + 8 < len) {
        __m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
        __m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
        cmp_lt_16(8, data8, data16);
    }

    // do four characters
    if (mask == 0 && offset + 4 < len) {
        __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
        __m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
        cmp_lt_16(4, data8, data16);
    }

    // correct the source pointers to point to the first character we couldn't deal with
    if (mask)
        offset += qCountTrailingZeroBits(mask) >> bitSpacing;
    src8 += offset;
    src16 += offset;
}
363#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
// NEON (AArch64) variant: narrows UTF-16 code units from [src, end) into bytes
// at dst, sixteen at a time, for as long as they are at most 0x7f.
//
// src and dst are advanced past everything that was converted. Returns true
// only if the whole input was consumed. When a code unit above 0x7f is found,
// src is left pointing at it, nextAscii is set to one past the last such code
// unit in the group of sixteen, and false is returned. If false is returned
// merely because fewer than sixteen characters are left, nextAscii is left
// untouched.
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
{
    uint16x8_t maxAscii = vdupq_n_u16(0x7f);
    // mask1 selects the even result bits (for in.val[0]), mask2 the odd ones (for in.val[1])
    uint16x8_t mask1 = { 1,      1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
    uint16x8_t mask2 = vshlq_n_u16(mask1, 1);

    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16, dst += 16) {
        // load 2 lanes (or: "load interleaved")
        uint16x8x2_t in = vld2q_u16(src);

        // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
        // add those together into a scalar, and merge the scalars.
        uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
                          | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));

        // merge the two lanes by shifting the values of the second by 8 and inserting them
        uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);

        // store, even if there are non-ASCII characters here
        vst1q_u8(dst, vreinterpretq_u8_u16(out));

        if (nonAscii) {
            // find the next probable ASCII character
            // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
            // characters still coming
            nextAscii = src + qBitScanReverse(nonAscii) + 1;

            // keep only the leading ASCII run of what was stored
            nonAscii = qCountTrailingZeroBits(nonAscii);
            dst += nonAscii;
            src += nonAscii;
            return false;
        }
    }
    return src == end;
}
400
// NEON (AArch64) variant: widens bytes from [src, end) into UTF-16 code units
// at dst, eight at a time, for as long as they are below 0x80.
//
// src and dst are advanced past everything that was converted. Returns true
// only if the whole input was consumed. When a byte >= 0x80 is found, the
// ASCII bytes in front of it are copied, src is left pointing at that byte,
// nextAscii is set to one past the last such byte in the group of eight, and
// false is returned. If false is returned merely because fewer than eight
// bytes are left, nextAscii is left untouched.
static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
    // one distinct bit per lane, so the horizontal add below builds a bit mask
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
    for ( ; end - src >= 8; src += 8, dst += 8) {
        uint8x8_t c = vld1_u8(src);
        // bit i of n is set if byte i is >= 0x80
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
        if (!n) {
            // store
            vst1q_u16(dst, vmovl_u8(c));
            continue;
        }

        // copy the front part that is still ASCII
        while (!(n & 1)) {
            *dst++ = *src++;
            n >>= 1;
        }

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        n = qBitScanReverse(n);
        nextAscii = src + n + 1;
        return false;

    }
    return src == end;
}
431
// NEON (AArch64) variant of the non-ASCII scanner. It is currently disabled:
// the function unconditionally sets nextAscii to end and returns src, which
// makes the caller examine every byte with scalar code. Everything after the
// first return statement is unreachable and kept for later verification.
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
    // The SIMD code below is untested, so just force an early return until
    // we've had the time to verify it works.
    nextAscii = end;
    return src;

    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
    // one distinct bit per lane, so the horizontal add below builds a bit mask
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
    for ( ; end - src >= 8; src += 8) {
        uint8x8_t c = vld1_u8(src);
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
        if (!n)
            continue;

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        nextAscii = src + qBitScanReverse(n) + 1;

        // return the non-ASCII character
        return src + qCountTrailingZeroBits(n);
    }
    nextAscii = end;
    return src;
}
459
// No NEON implementation: the pointers are left untouched, so the caller
// compares every character with its scalar code.
static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
{
}
463#else
// Fallback without SIMD support: converts nothing (the arguments are taken by
// value and ignored) and returns false so the caller uses its scalar loop.
static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
{
    return false;
}
468
// Fallback without SIMD support: converts nothing (the arguments are taken by
// value and ignored) and returns false so the caller uses its scalar loop.
static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
{
    return false;
}
473
// Fallback without SIMD support: skips nothing. Returns src unchanged and sets
// nextAscii to end, so the caller examines the whole range with scalar code.
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
    nextAscii = end;
    return src;
}
479
// Fallback without SIMD support: the pointers are left untouched, so the
// caller compares every character with its scalar code.
static void simdCompareAscii(const char8_t *&, const char8_t *, const char16_t *&, const char16_t *)
{
}
483#endif
484
// Bit in QStringConverter::State::internalState that the converters in this
// file set once the start of the stream (the place where a byte order mark
// would be written or consumed) has been dealt with.
enum { HeaderDone = 1 };
486
487QByteArray QUtf8::convertFromUnicode(QStringView in)
488{
489 qsizetype len = in.size();
490
491 // create a QByteArray with the worst case scenario size
492 QByteArray result(len * 3, Qt::Uninitialized);
493 uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
494 const ushort *src = reinterpret_cast<const ushort *>(in.data());
495 const ushort *const end = src + len;
496
497 while (src != end) {
498 const ushort *nextAscii = end;
499 if (simdEncodeAscii(dst, nextAscii, src, end))
500 break;
501
502 do {
503 ushort u = *src++;
504 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
505 if (res < 0) {
506 // encoding error - append '?'
507 *dst++ = '?';
508 }
509 } while (src < nextAscii);
510 }
511
512 result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
513 return result;
514}
515
516QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverterBase::State *state)
517{
518 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
519 char *end = convertFromUnicode(ba.data(), in, state);
520 ba.truncate(end - ba.data());
521 return ba;
522}
523
524char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
525{
526 Q_ASSERT(state);
527 const QChar *uc = in.data();
528 qsizetype len = in.length();
529 if (!len)
530 return out;
531
532 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
533 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
534 *cursor++ = 0;
535 } else {
536 // QChar::replacement encoded in utf8
537 *cursor++ = 0xef;
538 *cursor++ = 0xbf;
539 *cursor++ = 0xbd;
540 }
541 return cursor;
542 };
543
544 uchar *cursor = reinterpret_cast<uchar *>(out);
545 const ushort *src = reinterpret_cast<const ushort *>(uc);
546 const ushort *const end = src + len;
547
548 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
549 if (state->remainingChars) {
550 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
551 if (res < 0)
552 cursor = appendReplacementChar(cursor);
553 state->state_data[0] = 0;
554 state->remainingChars = 0;
555 } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
556 // append UTF-8 BOM
557 *cursor++ = utf8bom[0];
558 *cursor++ = utf8bom[1];
559 *cursor++ = utf8bom[2];
560 state->internalState |= HeaderDone;
561 }
562 }
563
564 while (src != end) {
565 const ushort *nextAscii = end;
566 if (simdEncodeAscii(cursor, nextAscii, src, end))
567 break;
568
569 do {
570 ushort uc = *src++;
571 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
572 if (Q_LIKELY(res >= 0))
573 continue;
574
575 if (res == QUtf8BaseTraits::Error) {
576 // encoding error
577 ++state->invalidChars;
578 cursor = appendReplacementChar(cursor);
579 } else if (res == QUtf8BaseTraits::EndOfString) {
580 if (state->flags & QStringConverter::Flag::Stateless) {
581 ++state->invalidChars;
582 cursor = appendReplacementChar(cursor);
583 } else {
584 state->remainingChars = 1;
585 state->state_data[0] = uc;
586 }
587 return reinterpret_cast<char *>(cursor);
588 }
589 } while (src < nextAscii);
590 }
591
592 return reinterpret_cast<char *>(cursor);
593}
594
595QString QUtf8::convertToUnicode(QByteArrayView in)
596{
597 // UTF-8 to UTF-16 always needs the exact same number of words or less:
598 // UTF-8 UTF-16
599 // 1 byte 1 word
600 // 2 bytes 1 word
601 // 3 bytes 1 word
602 // 4 bytes 2 words (one surrogate pair)
603 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
604 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
605 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
606 //
607 // The table holds for invalid sequences too: we'll insert one replacement char
608 // per invalid byte.
609 QString result(in.size(), Qt::Uninitialized);
610 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
611 const QChar *end = convertToUnicode(data, in);
612 result.truncate(end - data);
613 return result;
614}
615
616/*!
617 \since 5.7
618 \overload
619
620 Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
621 QChar starting at \a buffer. The buffer is expected to be large enough
622 to hold the result. An upper bound for the size of the buffer is
623 \c in.size() QChars.
624
625 If, during decoding, an error occurs, a QChar::ReplacementCharacter is
626 written.
627
628 Returns a pointer to one past the last QChar written.
629
630 This function never throws.
631*/
632
633QChar *QUtf8::convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
634{
635 ushort *dst = reinterpret_cast<ushort *>(buffer);
636 const uchar *const start = reinterpret_cast<const uchar *>(in.data());
637 const uchar *src = start;
638 const uchar *end = src + in.size();
639
640 // attempt to do a full decoding in SIMD
641 const uchar *nextAscii = end;
642 if (!simdDecodeAscii(dst, nextAscii, src, end)) {
643 // at least one non-ASCII entry
644 // check if we failed to decode the UTF-8 BOM; if so, skip it
645 if (Q_UNLIKELY(src == start)
646 && end - src >= 3
647 && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
648 src += 3;
649 }
650
651 while (src < end) {
652 nextAscii = end;
653 if (simdDecodeAscii(dst, nextAscii, src, end))
654 break;
655
656 do {
657 uchar b = *src++;
658 int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
659 if (res < 0) {
660 // decoding error
661 *dst++ = QChar::ReplacementCharacter;
662 }
663 } while (src < nextAscii);
664 }
665 }
666
667 return reinterpret_cast<QChar *>(dst);
668}
669
670QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
671{
672 // See above for buffer requirements for stateless decoding. However, that
673 // fails if the state is not empty. The following situations can add to the
674 // requirements:
675 // state contains chars starts with requirement
676 // 1 of 2 bytes valid continuation 0
677 // 2 of 3 bytes same 0
678 // 3 bytes of 4 same +1 (need to insert surrogate pair)
679 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
680 // 2 of 3 bytes same +1 (same)
681 // 3 of 4 bytes same +1 (same)
682 QString result(in.size() + 1, Qt::Uninitialized);
683 QChar *end = convertToUnicode(result.data(), in, state);
684 result.truncate(end - result.constData());
685 return result;
686}
687
688QChar *QUtf8::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state)
689{
690 qsizetype len = in.size();
691
692 Q_ASSERT(state);
693 if (!len)
694 return out;
695
696
697 ushort replacement = QChar::ReplacementCharacter;
698 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
699 replacement = QChar::Null;
700
701 int res;
702 uchar ch = 0;
703
704 ushort *dst = reinterpret_cast<ushort *>(out);
705 const uchar *src = reinterpret_cast<const uchar *>(in.data());
706 const uchar *end = src + len;
707
708 if (!(state->flags & QStringConverter::Flag::Stateless)) {
709 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
710 if (state->remainingChars || !headerdone) {
711 // handle incoming state first
712 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
713 qsizetype remainingCharsCount = state->remainingChars;
714 qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
715
716 memset(remainingCharsData, 0, sizeof(remainingCharsData));
717 memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
718 memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
719
720 const uchar *begin = &remainingCharsData[1];
721 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
722 static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
723 if (res == QUtf8BaseTraits::Error) {
724 ++state->invalidChars;
725 *dst++ = replacement;
726 ++src;
727 } else if (res == QUtf8BaseTraits::EndOfString) {
728 // if we got EndOfString again, then there were too few bytes in src;
729 // copy to our state and return
730 state->remainingChars = remainingCharsCount + newCharsToCopy;
731 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
732 return out;
733 } else if (!headerdone) {
734 // eat the UTF-8 BOM
735 if (dst[-1] == 0xfeff)
736 --dst;
737 }
738 state->internalState |= HeaderDone;
739
740 // adjust src now that we have maybe consumed a few chars
741 if (res >= 0) {
742 Q_ASSERT(res > remainingCharsCount);
743 src += res - remainingCharsCount;
744 }
745 }
746 } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
747 // stateless, remove initial BOM
748 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
749 // skip BOM
750 src += 3;
751 }
752
753 // main body, stateless decoding
754 res = 0;
755 const uchar *nextAscii = src;
756 while (res >= 0 && src < end) {
757 if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
758 break;
759
760 ch = *src++;
761 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
762 if (res == QUtf8BaseTraits::Error) {
763 res = 0;
764 ++state->invalidChars;
765 *dst++ = replacement;
766 }
767 }
768
769 if (res == QUtf8BaseTraits::EndOfString) {
770 // unterminated UTF sequence
771 if (state->flags & QStringConverter::Flag::Stateless) {
772 *dst++ = QChar::ReplacementCharacter;
773 ++state->invalidChars;
774 while (src++ < end) {
775 *dst++ = QChar::ReplacementCharacter;
776 ++state->invalidChars;
777 }
778 state->remainingChars = 0;
779 } else {
780 --src; // unread the byte in ch
781 state->remainingChars = end - src;
782 memcpy(&state->state_data[0], src, end - src);
783 }
784 } else {
785 state->remainingChars = 0;
786 }
787
788 return reinterpret_cast<QChar *>(dst);
789}
790
// Decoder traits whose output functions do nothing. Used by
// QUtf8::isValidUtf8() in this file, which only needs the decoder's return
// code and not the decoded text.
struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
{
    // placeholder output "pointer" type handed to the decoder
    struct NoOutput {};
    // both appenders discard what they are given
    static void appendUtf16(const NoOutput &, ushort) {}
    static void appendUcs4(const NoOutput &, uint) {}
};
797
798QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
799{
800 const uchar *src = reinterpret_cast<const uchar *>(in.data());
801 const uchar *end = src + in.size();
802 const uchar *nextAscii = src;
803 bool isValidAscii = true;
804
805 while (src < end) {
806 if (src >= nextAscii)
807 src = simdFindNonAscii(src, end, nextAscii);
808 if (src == end)
809 break;
810
811 do {
812 uchar b = *src++;
813 if ((b & 0x80) == 0)
814 continue;
815
816 isValidAscii = false;
817 QUtf8NoOutputTraits::NoOutput output;
818 int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
819 if (res < 0) {
820 // decoding error
821 return { false, false };
822 }
823 } while (src < nextAscii);
824 }
825
826 return { true, isValidAscii };
827}
828
829int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept
830{
831 auto src1 = reinterpret_cast<const char8_t *>(utf8.data());
832 auto end1 = src1 + utf8.size();
833 auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
834 auto end2 = src2 + utf16.size();
835
836 do {
837 simdCompareAscii(src1, end1, src2, end2);
838
839 if (src1 < end1 && src2 < end2) {
840 char32_t uc1 = *src1++;
841 char32_t uc2 = *src2++;
842
843 if (uc1 >= 0x80) {
844 char32_t *output = &uc1;
845 int res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
846 if (res < 0) {
847 // decoding error
848 uc1 = QChar::ReplacementCharacter;
849 }
850
851 // Only decode the UTF-16 surrogate pair if the UTF-8 code point
852 // wasn't US-ASCII (a surrogate cannot match US-ASCII).
853 if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
854 uc2 = QChar::surrogateToUcs4(uc2, *src2++);
855 }
856
857 if (uc1 != uc2)
858 return int(uc1) - int(uc2);
859 }
860 } while (src1 < end1 && src2 < end2);
861
862 // the shorter string sorts first
863 return (end1 > src1) - int(end2 > src2);
864}
865
866int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1String s)
867{
868 uint uc1 = QChar::Null;
869 auto src1 = reinterpret_cast<const uchar *>(utf8.data());
870 auto end1 = src1 + utf8.size();
871 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
872 auto end2 = src2 + s.size();
873
874 while (src1 < end1 && src2 < end2) {
875 uchar b = *src1++;
876 uint *output = &uc1;
877 int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
878 if (res < 0) {
879 // decoding error
880 uc1 = QChar::ReplacementCharacter;
881 }
882
883 uint uc2 = *src2++;
884 if (uc1 != uc2)
885 return int(uc1) - int(uc2);
886 }
887
888 // the shorter string sorts first
889 return (end1 > src1) - (end2 > src2);
890}
891
892QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
893{
894 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
895 qsizetype length = 2 * in.size();
896 if (writeBom)
897 length += 2;
898
899 QByteArray d(length, Qt::Uninitialized);
900 char *end = convertFromUnicode(d.data(), in, state, endian);
901 Q_ASSERT(end - d.constData() == d.length());
902 Q_UNUSED(end);
903 return d;
904}
905
906char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
907{
908 Q_ASSERT(state);
909 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
910
911 if (endian == DetectEndianness)
912 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
913
914 if (writeBom) {
915 QChar bom(QChar::ByteOrderMark);
916 if (endian == BigEndianness)
917 qToBigEndian(bom.unicode(), out);
918 else
919 qToLittleEndian(bom.unicode(), out);
920 out += 2;
921 }
922 if (endian == BigEndianness)
923 qToBigEndian<ushort>(in.data(), in.length(), out);
924 else
925 qToLittleEndian<ushort>(in.data(), in.length(), out);
926
927 state->remainingChars = 0;
928 state->internalState |= HeaderDone;
929 return out + 2*in.length();
930}
931
932QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
933{
934 QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
935 QChar *qch = convertToUnicode(result.data(), in, state, endian);
936 result.truncate(qch - result.constData());
937 return result;
938}
939
940QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
941{
942 qsizetype len = in.size();
943 const char *chars = in.data();
944
945 Q_ASSERT(state);
946
947 if (endian == DetectEndianness)
948 endian = (DataEndianness)state->state_data[Endian];
949
950 const char *end = chars + len;
951
952 // make sure we can decode at least one char
953 if (state->remainingChars + len < 2) {
954 if (len) {
955 Q_ASSERT(state->remainingChars == 0 && len == 1);
956 state->remainingChars = 1;
957 state->state_data[Data] = *chars;
958 }
959 return out;
960 }
961
962 bool headerdone = state && state->internalState & HeaderDone;
963 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
964 headerdone = true;
965
966 if (!headerdone || state->remainingChars) {
967 uchar buf;
968 if (state->remainingChars)
969 buf = state->state_data[Data];
970 else
971 buf = *chars++;
972
973 // detect BOM, set endianness
974 state->internalState |= HeaderDone;
975 QChar ch(buf, *chars++);
976 if (endian == DetectEndianness) {
977 if (ch == QChar::ByteOrderSwapped) {
978 endian = BigEndianness;
979 } else if (ch == QChar::ByteOrderMark) {
980 endian = LittleEndianness;
981 } else {
982 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
983 endian = BigEndianness;
984 } else {
985 endian = LittleEndianness;
986 }
987 }
988 }
989 if (endian == BigEndianness)
990 ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
991 if (headerdone || ch != QChar::ByteOrderMark)
992 *out++ = ch;
993 } else if (endian == DetectEndianness) {
994 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
995 }
996
997 int nPairs = (end - chars) >> 1;
998 if (endian == BigEndianness)
999 qFromBigEndian<ushort>(chars, nPairs, out);
1000 else
1001 qFromLittleEndian<ushort>(chars, nPairs, out);
1002 out += nPairs;
1003
1004 state->state_data[Endian] = endian;
1005 state->remainingChars = 0;
1006 if ((end - chars) & 1) {
1007 if (state->flags & QStringConverter::Flag::Stateless) {
1008 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1009 } else {
1010 state->remainingChars = 1;
1011 state->state_data[Data] = *(end - 1);
1012 }
1013 } else {
1014 state->state_data[Data] = 0;
1015 }
1016
1017 return out;
1018}
1019
1020QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1021{
1022 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1023 int length = 4*in.size();
1024 if (writeBom)
1025 length += 4;
1026 QByteArray ba(length, Qt::Uninitialized);
1027 char *end = convertFromUnicode(ba.data(), in, state, endian);
1028 Q_ASSERT(end - ba.constData() == length);
1029 Q_UNUSED(end);
1030 return ba;
1031}
1032
1033char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1034{
1035 Q_ASSERT(state);
1036
1037 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1038 qsizetype length = 4*in.length();
1039 if (writeBom)
1040 length += 4;
1041
1042 if (endian == DetectEndianness)
1043 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1044
1045 if (writeBom) {
1046 if (endian == BigEndianness) {
1047 out[0] = 0;
1048 out[1] = 0;
1049 out[2] = (char)0xfe;
1050 out[3] = (char)0xff;
1051 } else {
1052 out[0] = (char)0xff;
1053 out[1] = (char)0xfe;
1054 out[2] = 0;
1055 out[3] = 0;
1056 }
1057 out += 4;
1058 state->internalState |= HeaderDone;
1059 }
1060
1061 const QChar *uc = in.data();
1062 const QChar *end = in.data() + in.length();
1063 QChar ch;
1064 uint ucs4;
1065 if (state->remainingChars == 1) {
1066 ch = state->state_data[Data];
1067 // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1068 state->remainingChars = 0;
1069 goto decode_surrogate;
1070 }
1071
1072 while (uc < end) {
1073 ch = *uc++;
1074 if (Q_LIKELY(!ch.isSurrogate())) {
1075 ucs4 = ch.unicode();
1076 } else if (Q_LIKELY(ch.isHighSurrogate())) {
1077decode_surrogate:
1078 if (uc == end) {
1079 if (state->flags & QStringConverter::Flag::Stateless) {
1080 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1081 } else {
1082 state->remainingChars = 1;
1083 state->state_data[Data] = ch.unicode();
1084 return out;
1085 }
1086 } else if (uc->isLowSurrogate()) {
1087 ucs4 = QChar::surrogateToUcs4(ch, *uc++);
1088 } else {
1089 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1090 }
1091 } else {
1092 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1093 }
1094 if (endian == BigEndianness)
1095 qToBigEndian(ucs4, out);
1096 else
1097 qToLittleEndian(ucs4, out);
1098 out += 4;
1099 }
1100
1101 return out;
1102}
1103
1104QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1105{
1106 QString result;
1107 result.resize((in.size() + 7) >> 1); // worst case
1108 QChar *end = convertToUnicode(result.data(), in, state, endian);
1109 result.truncate(end - result.constData());
1110 return result;
1111}
1112
1113QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1114{
1115 qsizetype len = in.size();
1116 const char *chars = in.data();
1117
1118 Q_ASSERT(state);
1119 if (endian == DetectEndianness)
1120 endian = (DataEndianness)state->state_data[Endian];
1121
1122 const char *end = chars + len;
1123
1124 uchar tuple[4];
1125 memcpy(tuple, &state->state_data[Data], 4);
1126
1127 // make sure we can decode at least one char
1128 if (state->remainingChars + len < 4) {
1129 if (len) {
1130 while (chars < end) {
1131 tuple[state->remainingChars] = *chars;
1132 ++state->remainingChars;
1133 ++chars;
1134 }
1135 Q_ASSERT(state->remainingChars < 4);
1136 memcpy(&state->state_data[Data], tuple, 4);
1137 }
1138 return out;
1139 }
1140
1141 bool headerdone = state->internalState & HeaderDone;
1142 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1143 headerdone = true;
1144
1145 int num = state->remainingChars;
1146 state->remainingChars = 0;
1147
1148 if (!headerdone || endian == DetectEndianness || num) {
1149 while (num < 4)
1150 tuple[num++] = *chars++;
1151 if (endian == DetectEndianness) {
1152 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1153 endian = LittleEndianness;
1154 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1155 endian = BigEndianness;
1156 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1157 endian = BigEndianness;
1158 } else {
1159 endian = LittleEndianness;
1160 }
1161 }
1162 uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
1163 if (headerdone || code != QChar::ByteOrderMark) {
1164 if (QChar::requiresSurrogates(code)) {
1165 *out++ = QChar(QChar::highSurrogate(code));
1166 *out++ = QChar(QChar::lowSurrogate(code));
1167 } else {
1168 *out++ = QChar(code);
1169 }
1170 }
1171 num = 0;
1172 } else if (endian == DetectEndianness) {
1173 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1174 }
1175 state->state_data[Endian] = endian;
1176 state->internalState |= HeaderDone;
1177
1178 while (chars < end) {
1179 tuple[num++] = *chars++;
1180 if (num == 4) {
1181 uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
1182 for (char16_t c : QChar::fromUcs4(code))
1183 *out++ = c;
1184 num = 0;
1185 }
1186 }
1187
1188 if (num) {
1189 if (state->flags & QStringDecoder::Flag::Stateless) {
1190 *out++ = QChar::ReplacementCharacter;
1191 } else {
1192 state->state_data[Endian] = endian;
1193 state->remainingChars = num;
1194 memcpy(&state->state_data[Data], tuple, 4);
1195 }
1196 }
1197
1198 return out;
1199}
1200
1201#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1202static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state)
1203{
1204 qsizetype length = in.size();
1205 const char *chars = in.data();
1206
1207 Q_ASSERT(state);
1208 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1209 state = nullptr;
1210
1211 if (!chars || !length)
1212 return QString();
1213
1214 int copyLocation = 0;
1215 int extra = 2;
1216 if (state && state->remainingChars) {
1217 copyLocation = state->remainingChars;
1218 extra += copyLocation;
1219 }
1220 int newLength = length + extra;
1221 char *mbcs = new char[newLength];
1222 //ensure that we have a NULL terminated string
1223 mbcs[newLength-1] = 0;
1224 mbcs[newLength-2] = 0;
1225 memcpy(&(mbcs[copyLocation]), chars, length);
1226 if (copyLocation) {
1227 //copy the last character from the state
1228 mbcs[0] = (char)state->state_data[0];
1229 state->remainingChars = 0;
1230 }
1231 const char *mb = mbcs;
1232 const char *next = 0;
1233 QString s;
1234 while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
1235 wchar_t wc[2] ={0};
1236 int charlength = next - mb;
1237 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
1238 if (len>0) {
1239 s.append(QChar(wc[0]));
1240 } else {
1241 int r = GetLastError();
1242 //check if the character being dropped is the last character
1243 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
1244 state->remainingChars = 1;
1245 state->state_data[0] = (char)*mb;
1246 }
1247 }
1248 mb = next;
1249 }
1250 delete [] mbcs;
1251 return s;
1252}
1253
1254
1255QString QLocal8Bit::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
1256{
1257 qsizetype length = in.size();
1258
1259 Q_ASSERT(length < INT_MAX); // ### FIXME
1260 const char *mb = in.data();
1261 int mblen = length;
1262
1263 if (!mb || !mblen)
1264 return QString();
1265
1266 QVarLengthArray<wchar_t, 4096> wc(4096);
1267 int len;
1268 QString sp;
1269 bool prepend = false;
1270 char state_data = 0;
1271 int remainingChars = 0;
1272
1273 //save the current state information
1274 if (state) {
1275 state_data = (char)state->state_data[0];
1276 remainingChars = state->remainingChars;
1277 }
1278
1279 //convert the pending character (if available)
1280 if (state && remainingChars) {
1281 char prev[3] = {0};
1282 prev[0] = state_data;
1283 prev[1] = mb[0];
1284 remainingChars = 0;
1285 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1286 prev, 2, wc.data(), wc.length());
1287 if (len) {
1288 sp.append(QChar(wc[0]));
1289 if (mblen == 1) {
1290 state->remainingChars = 0;
1291 return sp;
1292 }
1293 prepend = true;
1294 mb++;
1295 mblen--;
1296 wc[0] = 0;
1297 }
1298 }
1299
1300 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
1301 mb, mblen, wc.data(), wc.length()))) {
1302 int r = GetLastError();
1303 if (r == ERROR_INSUFFICIENT_BUFFER) {
1304 const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
1305 mb, mblen, 0, 0);
1306 wc.resize(wclen);
1307 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1308 //find the last non NULL character
1309 while (mblen > 1 && !(mb[mblen-1]))
1310 mblen--;
1311 //check whether, we hit an invalid character in the middle
1312 if ((mblen <= 1) || (remainingChars && state_data))
1313 return convertToUnicodeCharByChar(in, state);
1314 //Remove the last character and try again...
1315 state_data = mb[mblen-1];
1316 remainingChars = 1;
1317 mblen--;
1318 } else {
1319 // Fail.
1320 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1321 break;
1322 }
1323 }
1324
1325 if (len <= 0)
1326 return QString();
1327
1328 if (wc[len-1] == 0) // len - 1: we don't want terminator
1329 --len;
1330
1331 //save the new state information
1332 if (state) {
1333 state->state_data[0] = (char)state_data;
1334 state->remainingChars = remainingChars;
1335 }
1336 QString s((QChar*)wc.data(), len);
1337 if (prepend) {
1338 return sp+s;
1339 }
1340 return s;
1341}
1342
1343QByteArray QLocal8Bit::convertFromUnicode(QStringView in, QStringConverter::State *state)
1344{
1345 const QChar *ch = in.data();
1346 qsizetype uclen = in.size();
1347
1348 Q_ASSERT(uclen < INT_MAX); // ### FIXME
1349 Q_ASSERT(state);
1350 Q_UNUSED(state); // ### Fixme
1351 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1352 state = nullptr;
1353
1354 if (!ch)
1355 return QByteArray();
1356 if (uclen == 0)
1357 return QByteArray("");
1358 BOOL used_def;
1359 QByteArray mb(4096, 0);
1360 int len;
1361 while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
1362 mb.data(), mb.size()-1, 0, &used_def)))
1363 {
1364 int r = GetLastError();
1365 if (r == ERROR_INSUFFICIENT_BUFFER) {
1366 mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
1367 (const wchar_t*)ch, uclen,
1368 0, 0, 0, &used_def));
1369 // and try again...
1370 } else {
1371 // Fail. Probably can't happen in fact (dwFlags is 0).
1372#ifndef QT_NO_DEBUG
1373 // Can't use qWarning(), as it'll recurse to handle %ls
1374 fprintf(stderr,
1375 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1376 r, reinterpret_cast<const wchar_t*>(QString(ch, uclen).utf16()));
1377#endif
1378 break;
1379 }
1380 }
1381 mb.resize(len);
1382 return mb;
1383}
1384#endif
1385
1386void QStringConverter::State::clear()
1387{
1388 if (clearFn)
1389 clearFn(this);
1390 else
1391 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1392 remainingChars = 0;
1393 invalidChars = 0;
1394 internalState = 0;
1395}
1396
1397static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1398{
1399 return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
1400}
1401
1402static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1403{
1404 return QUtf16::convertFromUnicode(out, in, state, DetectEndianness);
1405}
1406
1407static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1408{
1409 return QUtf16::convertToUnicode(out, in, state, BigEndianness);
1410}
1411
1412static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1413{
1414 return QUtf16::convertFromUnicode(out, in, state, BigEndianness);
1415}
1416
1417static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1418{
1419 return QUtf16::convertToUnicode(out, in, state, LittleEndianness);
1420}
1421
1422static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1423{
1424 return QUtf16::convertFromUnicode(out, in, state, LittleEndianness);
1425}
1426
1427static QChar *fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1428{
1429 return QUtf32::convertToUnicode(out, in, state, DetectEndianness);
1430}
1431
1432static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1433{
1434 return QUtf32::convertFromUnicode(out, in, state, DetectEndianness);
1435}
1436
1437static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1438{
1439 return QUtf32::convertToUnicode(out, in, state, BigEndianness);
1440}
1441
1442static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1443{
1444 return QUtf32::convertFromUnicode(out, in, state, BigEndianness);
1445}
1446
1447static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1448{
1449 return QUtf32::convertToUnicode(out, in, state, LittleEndianness);
1450}
1451
1452static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1453{
1454 return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
1455}
1456
1457void qt_from_latin1(char16_t *dst, const char *str, size_t size) noexcept;
1458
1459static QChar *fromLatin1(QChar *out, QByteArrayView in, QStringConverter::State *state)
1460{
1461 Q_ASSERT(state);
1462 Q_UNUSED(state);
1463
1464 qt_from_latin1(reinterpret_cast<char16_t *>(out), in.data(), size_t(in.size()));
1465 return out + in.size();
1466}
1467
1468
1469static char *toLatin1(char *out, QStringView in, QStringConverter::State *state)
1470{
1471 Q_ASSERT(state);
1472 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1473 state = nullptr;
1474
1475 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1476 int invalid = 0;
1477 for (qsizetype i = 0; i < in.length(); ++i) {
1478 if (in[i] > QChar(0xff)) {
1479 *out = replacement;
1480 ++invalid;
1481 } else {
1482 *out = (char)in[i].cell();
1483 }
1484 ++out;
1485 }
1486 if (state)
1487 state->invalidChars += invalid;
1488 return out;
1489}
1490
1491static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1492{
1493 QString s = QLocal8Bit::convertToUnicode(in, state);
1494 memcpy(out, s.constData(), s.length()*sizeof(QChar));
1495 return out + s.length();
1496}
1497
1498static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1499{
1500 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1501 memcpy(out, s.constData(), s.length());
1502 return out + s.length();
1503}
1504
1505
1506static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1507static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1508
1509static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1510static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1511
1512static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1513static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1514
1515static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1516static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1517
1518
1519
1520/*!
1521 \class QStringConverterBase
1522 \internal
1523
1524 Just a common base class for QStringConverter and QTextCodec
1525*/
1526
1527/*!
1528 \class QStringConverter
1529 \inmodule QtCore
1530 \brief The QStringConverter class provides a base class for encoding and decoding text.
1531 \reentrant
1532 \ingroup i18n
1533
1534 Qt uses UTF-16 to store, draw and manipulate strings. In many
1535 situations you may wish to deal with data that uses a different
1536 encoding. Most text data transferred over files and network connections is encoded
1537 in UTF-8.
1538
1539 The QStringConverter class is a base class for the \l {QStringEncoder} and
1540 \l {QStringDecoder} classes that help with converting between different
1541 text encodings. QStringDecoder can decode a string from an encoded representation
1542 into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1543 operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1544 the requested encoding.
1545
1546 The supported encodings are:
1547
1548 \list
1549 \li UTF-8
1550 \li UTF-16
1551 \li UTF-16BE
1552 \li UTF-16LE
1553 \li UTF-32
1554 \li UTF-32BE
1555 \li UTF-32LE
1556 \li ISO-8859-1 (Latin-1)
1557 \li The system encoding
1558 \endlist
1559
1560 \l {QStringConverter}s can be used as follows to convert some encoded
1561 string to and from UTF-16.
1562
1563 Suppose you have some string encoded in UTF-8, and
1564 want to convert it to a QString. The simple way
1565 to do it is to use a \l {QStringDecoder} like this:
1566
1567 \snippet code/src_corelib_text_qstringconverter.cpp 0
1568
1569 After this, \c string holds the text in decoded form.
1570 Converting a string from Unicode to the local encoding is just as
1571 easy using the \l {QStringEncoder} class:
1572
1573 \snippet code/src_corelib_text_qstringconverter.cpp 1
1574
1575 To read or write text files in various encodings, use QTextStream and
1576 its \l{QTextStream::setEncoding()}{setEncoding()} function.
1577
1578 Some care must be taken when trying to convert the data in chunks,
1579 for example, when receiving it over a network. In such cases it is
1580 possible that a multi-byte character will be split over two
1581 chunks. At best this might result in the loss of a character and
1582 at worst cause the entire conversion to fail.
1583
1584 Both QStringEncoder and QStringDecoder make this easy, by tracking
1585 this in an internal state. So simply calling the encoder or decoder
1586 again with the next chunk of data will automatically continue encoding
1587 or decoding the data correctly:
1588
1589 \snippet code/src_corelib_text_qstringconverter.cpp 2
1590
1591 The QStringDecoder object maintains state between chunks and therefore
1592 works correctly even if a multi-byte character is split between
1593 chunks.
1594
1595 QStringConverter objects can't be copied because of their internal state, but
1596 can be moved.
1597
1598 \sa QTextStream, QStringDecoder, QStringEncoder
1599*/
1600
1601/*!
1602 \enum QStringConverter::Flag
1603
1604 \value Default Default conversion rules apply.
1605 \value ConvertInvalidToNull If this flag is set, each invalid input
1606 character is output as a null character. If it is not set,
1607 invalid input characters are represented as QChar::ReplacementCharacter
1608 if the output encoding can represent that character, otherwise as a question mark.
1609 \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
1610 character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
1611 encodings.
    \value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips a
            leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
            skipped, but converted to UTF-16 and inserted at the start of the created QString.
1615 \value Stateless Ignore possible converter states between different function calls
1616 to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
1617 sequence of data is encountered.
1618*/
1619
1620/*!
1621 \enum QStringConverter::Encoding
1622 \value Utf8 Create a converter to or from UTF-8
1623 \value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
1624 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1625 be assumed.
1626 \value Utf16BE Create a converter to or from big endian UTF-16.
    \value Utf16LE Create a converter to or from little endian UTF-16.
1628 \value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
1629 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
1630 be assumed.
1631 \value Utf32BE Create a converter to or from big endian UTF-32.
    \value Utf32LE Create a converter to or from little endian UTF-32.
1633 \value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
    \value System Create a converter to or from the underlying encoding of the
           operating system's locale. This is always assumed to be UTF-8 for Unix-based
           systems. On Windows, this converts to and from the locale code page.
1637*/
1638
1639/*!
1640 \struct QStringConverter::Interface
1641 \internal
1642*/
1643
1644const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
1645{
1646 { "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
1647 { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
1648 { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
1649 { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
1650 { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
1651 { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
1652 { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
1653 { "ISO-8859-1", fromLatin1, fromLatin1Len, toLatin1, toLatin1Len },
1654 { "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
1655};
1656
// Matches two encoding names case-insensitively, ignoring every '-' and '_'
// (including leading and trailing ones).
//
// Only the ASCII letters are case-folded, so the result does not depend on
// the current C locale and bytes outside the ASCII range are never passed to
// <cctype> functions. A null pointer never matches anything.
static bool nameMatch(const char *a, const char *b)
{
    if (!a || !b)
        return false;

    const auto isSeparator = [](char c) { return c == '-' || c == '_'; };
    const auto toAsciiUpper = [](char c) {
        return (c >= 'a' && c <= 'z') ? char(c - 'a' + 'A') : c;
    };

    for (;;) {
        while (isSeparator(*a))
            ++a;
        while (isSeparator(*b))
            ++b;
        if (!*a || !*b)
            return !*a && !*b; // match only if both names are exhausted
        if (toAsciiUpper(*a) != toAsciiUpper(*b))
            return false;
        ++a;
        ++b;
    }
}
1676
1677
1678/*!
1679 \fn constexpr QStringConverter::QStringConverter()
1680 \internal
1681*/
1682
1683/*!
1684 \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
1685 \internal
1686*/
1687
1688/*!
1689 \internal
1690*/
1691QStringConverter::QStringConverter(const char *name, Flags f)
1692 : iface(nullptr), state(f)
1693{
1694 auto e = encodingForName(name);
1695 if (e)
1696 iface = encodingInterfaces + int(e.value());
1697}
1698
1699/*!
1700 \fn bool QStringConverter::isValid() const
1701
1702 Returns true if this is a valid string converter that can be used for encoding or
1703 decoding text.
1704
1705 Default constructed string converters or converters constructed with an unsupported
1706 name are not valid.
1707*/
1708
1709/*!
1710 \fn void QStringConverter::resetState()
1711
1712 Resets the internal state of the converter, clearing potential errors or partial
1713 conversions.
1714*/
1715
1716/*!
1717 \fn bool QStringConverter::hasError() const
1718
1719 Returns true if a conversion could not correctly convert a character. This could for example
1720 get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
1721 limitations in the target encoding.
1722*/
1723
1724/*!
1725 \fn const char *QStringConverter::name() const
1726
    Returns the canonical name of the encoding this QStringConverter can encode or decode.
    Returns \nullptr if the converter is not valid.
1729
1730 \sa isValid()
1731*/
1732
1733/*!
1734 Returns an optional encoding for \a name. The optional is empty if the name could
1735 not get converted to a valid encoding.
1736*/
1737std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name)
1738{
1739 for (int i = 0; i < LastEncoding + 1; ++i) {
1740 if (nameMatch(encodingInterfaces[i].name, name))
1741 return QStringConverter::Encoding(i);
1742 }
1743 if (nameMatch(name, "latin1"))
1744 return QStringConverter::Latin1;
1745 return std::nullopt;
1746}
1747
1748/*!
1749 Returns the encoding for the content of \a data if it can be determined.
1750 \a expectedFirstCharacter can be passed as an additional hint to help determine
1751 the encoding.
1752
1753 The returned optional is empty, if the encoding is unclear.
1754 */
1755std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter)
1756{
1757 qsizetype arraySize = data.size();
1758 if (arraySize > 3) {
1759 uint uc = qFromUnaligned<uint>(data.data());
1760 if (uc == qToBigEndian(uint(QChar::ByteOrderMark)))
1761 return QStringConverter::Utf32BE;
1762 if (uc == qToLittleEndian(uint(QChar::ByteOrderMark)))
1763 return QStringConverter::Utf32LE;
1764 if (expectedFirstCharacter) {
1765 // catch also anything starting with the expected character
1766 if (qToLittleEndian(uc) == expectedFirstCharacter)
1767 return QStringConverter::Utf32LE;
1768 else if (qToBigEndian(uc) == expectedFirstCharacter)
1769 return QStringConverter::Utf32BE;
1770 }
1771 }
1772
1773 if (arraySize > 2) {
1774 if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == 0)
1775 return QStringConverter::Utf8;
1776 }
1777
1778 if (arraySize > 1) {
1779 ushort uc = qFromUnaligned<ushort>(data.data());
1780 if (uc == qToBigEndian(ushort(QChar::ByteOrderMark)))
1781 return QStringConverter::Utf16BE;
1782 if (uc == qToLittleEndian(ushort(QChar::ByteOrderMark)))
1783 return QStringConverter::Utf16LE;
1784 if (expectedFirstCharacter) {
1785 // catch also anything starting with the expected character
1786 if (qToLittleEndian(uc) == expectedFirstCharacter)
1787 return QStringConverter::Utf16LE;
1788 else if (qToBigEndian(uc) == expectedFirstCharacter)
1789 return QStringConverter::Utf16BE;
1790 }
1791 }
1792 return std::nullopt;
1793}
1794
1795/*!
1796 Tries to determine the encoding of the HTML in \a data by looking at leading byte
1797 order marks or a charset specifier in the HTML meta tag. If the optional is empty,
1798 the encoding specified is not supported by QStringConverter. If no encoding is
1799 detected, the method returns Utf8.
1800*/
1801std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
1802{
1803 // determine charset
1804 auto encoding = encodingForData(data);
1805 if (encoding)
1806 // trust the initial BOM
1807 return encoding;
1808
1809 QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
1810 int pos = header.indexOf("meta ");
1811 if (pos != -1) {
1812 pos = header.indexOf("charset=", pos);
1813 if (pos != -1) {
1814 pos += int(qstrlen("charset="));
1815 if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
1816 ++pos;
1817
1818 int pos2 = pos;
1819 // The attribute can be closed with either """, "'", ">" or "/",
1820 // none of which are valid charset characters.
1821 while (++pos2 < header.size()) {
1822 char ch = header.at(pos2);
1823 if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
1824 QByteArray name = header.mid(pos, pos2 - pos);
1825 int colon = name.indexOf(':');
1826 if (colon > 0)
1827 name = name.left(colon);
1828 name = name.simplified();
1829 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1830 name = QByteArrayLiteral("UTF-8");
1831 if (!name.isEmpty())
1832 return encodingForName(name);
1833 }
1834 }
1835 }
1836 }
1837 return Utf8;
1838}
1839
1840/*!
    Returns the canonical name for encoding \a e.
1842*/
1843const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
1844{
1845 return encodingInterfaces[int(e)].name;
1846}
1847
1848/*!
1849 \class QStringEncoder
1850 \inmodule QtCore
1851 \brief The QStringEncoder class provides a state-based encoder for text.
1852 \reentrant
1853 \ingroup i18n
1854
1855 A text encoder converts text from Qt's internal representation into an encoded
1856 text format using a specific encoding.
1857
1858 Converting a string from Unicode to the local encoding can be achieved
1859 using the following code:
1860
1861 \snippet code/src_corelib_text_qstringconverter.cpp 1
1862
1863 The encoder remembers any state that is required between calls, so converting
1864 data received in chunks, for example, when receiving it over a network, is just as
1865 easy, by calling the encoder whenever new data is available:
1866
1867 \snippet code/src_corelib_text_qstringconverter.cpp 3
1868
1869 The QStringEncoder object maintains state between chunks and therefore
1870 works correctly even if a UTF-16 surrogate character is split between
1871 chunks.
1872
1873 QStringEncoder objects can't be copied because of their internal state, but
1874 can be moved.
1875
1876 \sa QStringConverter, QStringDecoder
1877*/
1878
1879/*!
1880 \fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
1881 \internal
1882*/
1883
1884/*!
1885 \fn constexpr QStringEncoder::QStringEncoder()
1886
1887 Default constructs an encoder. The default encoder is not valid,
1888 and can't be used for converting text.
1889*/
1890
1891/*!
1892 \fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
1893
1894 Creates an encoder object using \a encoding and \a flags.
1895*/
1896
1897/*!
1898 \fn constexpr QStringEncoder::QStringEncoder(const char *name, Flags flags = Flag::Default)
1899
1900 Creates an encoder object using \a name and \a flags.
1901 If \a name is not the name of a known encoding an invalid converter will get created.
1902
1903 \sa isValid()
1904*/
1905
1906/*!
1907 \fn QByteArray QStringEncoder::operator()(const QString &in)
1908 \fn QByteArray QStringEncoder::encode(const QString &in)
1909
1910 Converts \a in and returns the data as a byte array.
1911*/
1912
1913/*!
1914 \fn QByteArray QStringEncoder::operator()(QStringView in)
1915 \fn QByteArray QStringEncoder::encode(QStringView in)
1916 \overload
1917
1918 Converts \a in and returns the data as a byte array.
1919*/
1920
1921/*!
1922 \fn QByteArray QStringEncoder::operator()(const QChar *in, qsizetype length)
1923 \fn QByteArray QStringEncoder::encode(const QChar *in, qsizetype length)
1924 \overload
1925
1926 Converts \a length QChars from \a in and returns the data as a byte array.
1927*/
1928
1929/*!
1930 \fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
1931
1932 Returns the maximum number of characters required to be able to process
1933 \a inputLength QChars of decoded data.
1934
1935 \sa appendToBuffer
1936*/
1937
1938/*!
1939 \fn char *QStringEncoder::appendToBuffer(char *out, const QChar *in, qsizetype length)
1940
1941 Encodes \a length QChars from \a in and writes the encoded result into the buffer
1942 starting at \a out. Returns a pointer to the end of data written.
1943
1944 \a out needs to be large enough to be able to hold all the encoded data. Use
1945 \l{requiredSpace} to determine the maximum size requirements to be able to encode
1946 a QChar buffer of \a length.
1947
1948 \sa requiredSpace
1949*/
1950
1951/*!
1952 \class QStringDecoder
1953 \inmodule QtCore
1954 \brief The QStringDecoder class provides a state-based decoder for text.
1955 \reentrant
1956 \ingroup i18n
1957
1958 A text decoder converts text from an encoded text format that uses a specific encoding
1959 into Qt's internal representation.
1960
1961 Converting encoded data into a QString can be achieved
1962 using the following code:
1963
1964 \snippet code/src_corelib_text_qstringconverter.cpp 0
1965
1966 The decoder remembers any state that is required between calls, so converting
1967 data received in chunks, for example, when receiving it over a network, is just as
1968 easy, by calling the decoder whenever new data is available:
1969
1970 \snippet code/src_corelib_text_qstringconverter.cpp 2
1971
1972 The QStringDecoder object maintains state between chunks and therefore
1973 works correctly even if chunks are split in the middle of a multi-byte character
1974 sequence.
1975
1976 QStringDecoder objects can't be copied because of their internal state, but
1977 can be moved.
1978
1979 \sa QStringConverter, QStringEncoder
1980*/
1981
1982/*!
1983 \fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
1984 \internal
1985*/
1986
1987/*!
1988 \fn constexpr QStringDecoder::QStringDecoder()
1989
1990 Default constructs a decoder. The default decoder is not valid,
1991 and can't be used for converting text.
1992*/
1993
1994/*!
1995 \fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
1996
1997 Creates a decoder object using \a encoding and \a flags.
1998*/
1999
2000/*!
2001 \fn constexpr QStringDecoder::QStringDecoder(const char *name, Flags flags = Flag::Default)
2002
2003 Creates a decoder object using \a name and \a flags.
2004 If \a name is not the name of a known encoding, an invalid converter will get created.
2005
2006 \sa isValid()
2007*/
2008
2009/*!
2010 \fn QString QStringDecoder::operator()(const QByteArray &ba)
2011 \fn QString QStringDecoder::decode(const QByteArray &ba)
2012
2013 Converts \a ba and returns the data as a QString.
2014*/
2015
2016/*!
2017 \fn QString QStringDecoder::operator()(const char *in, qsizetype size)
2018 \fn QString QStringDecoder::decode(const char *in, qsizetype size)
2019 \overload
2020
2021 Converts a byte array containing the first \a size bytes of the array \a in
2022 and returns the data as a QString.
2023*/
2024
2025/*!
2026 \fn QString QStringDecoder::operator()(const char *chars)
2027 \fn QString QStringDecoder::decode(const char *chars)
2028 \overload
2029
2030 Converts \a chars and returns the data as a QString. \a chars is assumed to
2031 point to a \c{\0}-terminated string and its length is determined dynamically.
2032*/
2033
2034/*!
2035 \fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
2036
2037 Returns the maximum number of UTF-16 code units required to be able to process
2038 \a inputLength bytes of encoded data.
2039
2040 \sa appendToBuffer
2041*/
2042
2043/*!
2044 \fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
2045
2046 Decodes the sequence of bytes viewed by \a in and writes the decoded result into
2047 the buffer starting at \a out. Returns a pointer to the end of data written.
2048
2049 \a out needs to be large enough to be able to hold all the decoded data. Use
2050 \l{requiredSpace} to determine the maximum size requirements to decode an encoded
2051 data buffer of \c in.size() bytes.
2052
2053 \sa requiredSpace
2054*/
2055
2056QT_END_NAMESPACE
2057